
"""PyTorch UniSpeech model."""

import math
from dataclasses import dataclass

import torch
import torch.nn as nn

from ... import initialization as init
from ...modeling_outputs import ModelOutput, Wav2Vec2BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from ..wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2Encoder,
    Wav2Vec2EncoderStableLayerNorm,
    Wav2Vec2FeatureEncoder,
    Wav2Vec2FeatureProjection,
    Wav2Vec2ForCTC,
    Wav2Vec2ForSequenceClassification,
    Wav2Vec2GumbelVectorQuantizer,
    Wav2Vec2Model,
    Wav2Vec2PositionalConvEmbedding,
)
from .configuration_unispeech import UniSpeechConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.
    """
)
class UniSpeechForPreTrainingOutput(ModelOutput):
    r"""
    loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
        paper](https://huggingface.co/papers/2006.11477).
    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
        projected quantized states.
    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
        target vectors for contrastive loss.
    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
    """

    loss: torch.FloatTensor | None = None
    projected_states: torch.FloatTensor | None = None
    projected_quantized_states: torch.FloatTensor | None = None
    codevector_perplexity: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor] | None = None
    attentions: tuple[torch.FloatTensor] | None = None
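
# For reference: `codevector_perplexity` is computed by the quantizer defined below as
# sum_g exp(-sum_v p_bar[g, v] * log p_bar[g, v]), where p_bar is the codevector distribution of
# group g averaged over the batch. Its maximum is num_groups * num_vars (all codebook entries used
# equally often), so values far below that indicate codebook collapse.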


class UniSpeechPositionalConvEmbedding(Wav2Vec2PositionalConvEmbedding):
    pass


class UniSpeechFeatureEncoder(Wav2Vec2FeatureEncoder):
    pass


class UniSpeechFeatureProjection(Wav2Vec2FeatureProjection):
    pass


class UniSpeechEncoder(Wav2Vec2Encoder):
    pass


class UniSpeechEncoderStableLayerNorm(Wav2Vec2EncoderStableLayerNorm):
    pass


class UniSpeechGumbelVectorQuantizer(Wav2Vec2GumbelVectorQuantizer):
    @staticmethod
    def _compute_perplexity(probs):
        marginal_probs = probs.mean(dim=0)
        perplexity = torch.exp(-torch.sum(torch.xlogy(marginal_probs, marginal_probs), dim=-1)).sum()
        return perplexity

    def forward(self, hidden_states):
        batch_size, sequence_length, hidden_size = hidden_states.shape

        # project to codevector dim
        hidden_states = self.weight_proj(hidden_states)
        hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)

        if self.training:
            # sample code vector probs via gumbel in a differentiable way
            codevector_probs = nn.functional.gumbel_softmax(
                hidden_states.float(), tau=self.temperature, hard=True
            ).type_as(hidden_states)

            # compute perplexity
            codevector_soft_dist = torch.softmax(
                hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
            )
            perplexity = self._compute_perplexity(codevector_soft_dist)
        else:
            # take argmax in non-differentiable way and compute hard codevector distribution (one hot)
            codevector_idx = hidden_states.argmax(dim=-1)
            codevector_probs = hidden_states.new_zeros(*hidden_states.shape).scatter_(
                -1, codevector_idx.view(-1, 1), 1.0
            )
            codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)

            perplexity = self._compute_perplexity(codevector_probs)

        codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
        # use probs to retrieve codevectors
        codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
        codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
        codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)

        return codevectors, perplexity


@auto_docstring
class UniSpeechPreTrainedModel(PreTrainedModel):
    config: UniSpeechConfig
    base_model_prefix = "unispeech"
    main_input_name = "input_values"
    input_modalities = "audio"
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        # the Gumbel softmax quantizer requires a special init
        if isinstance(module, UniSpeechGumbelVectorQuantizer):
            init.normal_(module.weight_proj.weight, mean=0.0, std=1.0)
            init.zeros_(module.weight_proj.bias)
            init.uniform_(module.codevectors)
        elif isinstance(module, UniSpeechPositionalConvEmbedding):
            init.normal_(
                module.conv.weight,
                mean=0,
                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
            )
            init.constant_(module.conv.bias, 0)
        elif isinstance(module, UniSpeechFeatureProjection):
            k = math.sqrt(1 / module.projection.in_features)
            init.uniform_(module.projection.weight, a=-k, b=k)
            init.uniform_(module.projection.bias, a=-k, b=k)
        elif isinstance(module, nn.Linear):
            init.normal_(module.weight, mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                init.zeros_(module.bias)
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            init.zeros_(module.bias)
            init.ones_(module.weight)
        elif isinstance(module, nn.Conv1d):
            init.kaiming_normal_(module.weight)
            if module.bias is not None:
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                init.uniform_(module.bias, a=-k, b=k)

    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor | int):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        return input_lengths

    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
        # Effectively attention_mask.sum(-1), but not in-place to be able to run on inference mode.
        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
        batch_size = attention_mask.shape[0]

        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )
        # these two operations make sure that all values before the output lengths indices are attended to
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
        return attention_mask
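
# Worked example for `_get_feat_extract_output_lengths` (illustrative only; it assumes the default
# feature-encoder geometry conv_kernel=(10, 3, 3, 3, 3, 2, 2) and conv_stride=(5, 2, 2, 2, 2, 2, 2)):
# one second of 16 kHz audio, i.e. 16000 samples, is reduced layer by layer with
# floor((length - kernel) / stride) + 1:
#     16000 -> 3199 -> 1599 -> 799 -> 399 -> 199 -> 99 -> 49
# so the encoder emits roughly one feature vector every 20 ms, which is the resolution the reduced
# attention mask above is built at.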


UniSpeechBaseModelOutput = Wav2Vec2BaseModelOutput


class UniSpeechModel(UniSpeechPreTrainedModel, Wav2Vec2Model):
    def __init__(self, config: UniSpeechConfig):
        UniSpeechPreTrainedModel.__init__(self, config)
        self.config = config
        self.feature_extractor = UniSpeechFeatureEncoder(config)
        self.feature_projection = UniSpeechFeatureProjection(config)

        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())

        if config.do_stable_layer_norm:
            self.encoder = UniSpeechEncoderStableLayerNorm(config)
        else:
            self.encoder = UniSpeechEncoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def freeze_feature_encoder(self):
        raise AttributeError("Not needed for UniSpeech")

    @auto_docstring
    def forward(
        self,
        input_values: torch.Tensor | None,
        attention_mask: torch.Tensor | None = None,
        mask_time_indices: torch.FloatTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        **kwargs,
    ) -> tuple | UniSpeechBaseModelOutput:
        r"""
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        extract_features = self.feature_extractor(input_values)
        extract_features = extract_features.transpose(1, 2)

        if attention_mask is not None:
            # compute reduced attention_mask corresponding to feature vectors
            attention_mask = self._get_feature_vector_attention_mask(extract_features.shape[1], attention_mask)

        hidden_states, extract_features = self.feature_projection(extract_features)
        hidden_states = self._mask_hidden_states(
            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
        )

        encoder_outputs = self.encoder(
            hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = encoder_outputs[0]

        if not return_dict:
            return (hidden_states, extract_features) + encoder_outputs[1:]

        return UniSpeechBaseModelOutput(
            last_hidden_state=hidden_states,
            extract_features=extract_features,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    """
)
class UniSpeechForPreTraining(UniSpeechPreTrainedModel):
    def __init__(self, config: UniSpeechConfig):
        super().__init__(config)
        self.unispeech = UniSpeechModel(config)
        self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)

        self.quantizer = UniSpeechGumbelVectorQuantizer(config)
        self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim)
        self.project_hid = nn.Linear(config.proj_codevector_dim, config.hidden_size)

        self.ctc_proj = nn.Linear(config.hidden_size, config.num_ctc_classes)
        self.dropout = nn.Dropout(config.final_dropout)

        # Initialize weights and apply final processing
        self.post_init()

    def set_gumbel_temperature(self, temperature: int):
        """
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        """
        self.quantizer.temperature = temperature

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.unispeech.feature_extractor._freeze_parameters()

    @staticmethod
    def compute_contrastive_logits(
        target_features: torch.FloatTensor,
        negative_features: torch.FloatTensor,
        predicted_features: torch.FloatTensor,
        temperature: int = 1,
    ):
        """
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        """
        target_features = torch.cat([target_features, negative_features], dim=0)

        logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1)
        logits = logits.type_as(target_features)

        # apply temperature
        logits = logits / temperature
        return logits
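    # For reference: when called the way the Wav2Vec2 pre-training code uses it (the positive
    # `target_features` carry a leading singleton dimension and `negative_features` are stacked
    # along dim 0), the concatenation broadcasts against `predicted_features` to logits of shape
    # (num_negatives + 1, batch, seq_len), i.e. cos(c_t, q) / temperature with the true quantized
    # vector at index 0. The helper is kept for parity with Wav2Vec2 and is not called by the
    # `forward` below, which leaves `loss` as `None`.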

    @auto_docstring
    def forward(
        self,
        input_values: torch.Tensor | None,
        attention_mask: torch.Tensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> tuple | UniSpeechForPreTrainingOutput:
        r"""
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.unispeech(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        transformer_features = outputs[0]

        # quantize all (unmasked) extracted features and project to final vq dim
        extract_features = self.dropout_features(outputs[1])
        quantized_features, codevector_perplexity = self.quantizer(extract_features)

        # project quantized features twice
        quantized_features = self.project_q(quantized_features.to(self.project_q.weight.dtype))
        quantized_features = self.project_hid(quantized_features)

        # randomly replace transformer features by their quantized counterparts
        prob_replace_matrix = torch.empty(transformer_features.size(0), transformer_features.size(1)).fill_(
            self.config.replace_prob
        )
        prob_replace_matrix = prob_replace_matrix.transpose(0, 1)
        sampled_replace_matrix = torch.bernoulli(prob_replace_matrix).bool().to(transformer_features.device)
        sampled_replace_matrix = sampled_replace_matrix.transpose(0, 1)
        sampled_replace_matrix = sampled_replace_matrix.unsqueeze(-1)
        logits = transformer_features.masked_fill(sampled_replace_matrix, 0.0) + (
            quantized_features.masked_fill(~sampled_replace_matrix, 0.0)
        )

        # project to ctc units
        logits = self.dropout(logits)
        logits = self.ctc_proj(logits)

        loss = None
        if not return_dict:
            if loss is not None:
                return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
            return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:]

        return UniSpeechForPreTrainingOutput(
            loss=loss,
            projected_states=transformer_features,
            projected_quantized_states=quantized_features,
            codevector_perplexity=codevector_perplexity,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
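
# Illustrative usage of `UniSpeechForPreTraining` (a sketch only, mirroring the docstring example
# above; 16 kHz mono input is assumed and the pre-training loss itself is still marked as TODO):
#
#     inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")
#     with torch.no_grad():
#         outputs = model(**inputs)
#     # `outputs.projected_states` and `outputs.projected_quantized_states` hold the transformer
#     # and quantized features, and `outputs.codevector_perplexity` tracks codebook usage.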


class UniSpeechForCTC(Wav2Vec2ForCTC):
    pass


class UniSpeechForSequenceClassification(Wav2Vec2ForSequenceClassification):
    pass


__all__ = [
    "UniSpeechForCTC",
    "UniSpeechForPreTraining",
    "UniSpeechForSequenceClassification",
    "UniSpeechModel",
    "UniSpeechPreTrainedModel",
]