
    qi%                        d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ d
dlmZmZmZmZmZmZmZmZmZmZmZ ddlmZ  G d de
      Z G d de      Z G d dej>                        Z  G d dej>                        Z! G d de      Z" G d de      Z# G d de      Z$ G d de      Z% G d dee      Z&eZ' G d  d!e&e      Z( G d" d#e&e      Z) G d$ d%e      Z* G d& d'e      Z+ G d( d)e      Z,g d*Z-y)+zPyTorch Data2VecText model.    N)nn   )initialization)ACT2FN)GradientCheckpointingLayer)Wav2Vec2BaseModelOutput)PreTrainedModel   )Wav2Vec2AdapterWav2Vec2EncoderWav2Vec2FeatureEncoderWav2Vec2FeatureProjection#Wav2Vec2ForAudioFrameClassificationWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ForXVectorWav2Vec2ModelWav2Vec2PreTrainedModelWav2Vec2SamePadLayer   )Data2VecAudioConfigc                   &     e Zd Zd fd	Zd Z xZS )Data2VecAudioConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        j                  | j                  d      | _        t        |j                     | _        y )Nr   r   )kernel_sizestridebiasTelementwise_affine)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconv	LayerNorm
layer_normr   feat_extract_activation
activation)selfconfiglayer_id	__class__s      e/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/data2vec/modular_data2vec_audio.pyr!   zData2VecAudioConvLayer.__init__+   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@    c                     | j                  |      }|j                  dd      }| j                  |      }|j                  dd      }| j                  |      }|S )N)r)   	transposer+   r-   r.   hidden_statess     r2   forwardzData2VecAudioConvLayer.forward:   sV    		-0%//B76%//B76r3   )r   __name__
__module____qualname__r!   r:   __classcell__r1   s   @r2   r   r   *   s    Ar3   r   c                       e Zd Zy)Data2VecAudioPadLayerNr<   r=   r>    r3   r2   rB   rB   E       r3   rB   c                   $     e Zd Z fdZd Z xZS ) Data2VecAudioPositionalConvLayerc                 z   t         |           t        j                  |j                  |j                  |j
                  |j
                  dz  |j                        | _        t        |j
                        | _	        t        |j                     | _        t        j                  |j                  d      | _        y )Nr
   )r   paddinggroupsFr   )r    r!   r   r%   hidden_sizeconv_pos_kernel_sizenum_conv_pos_embedding_groupsr)   rB   rI   r   r,   r-   r*   r+   )r.   r/   r1   s     r2   r!   z)Data2VecAudioPositionalConvLayer.__init__J   s    II33//1477
	 -V-H-HI !?!?@,,v'9'9eTr3   c                     | j                  |      }| j                  |      }|j                  dd      }| j                  |      }|j                  dd      }| j	                  |      }|S Nr   r
   )r)   rI   r7   r+   r-   r8   s     r2   r:   z(Data2VecAudioPositionalConvLayer.forwardY   sd    		-0]3%//156%//156r3   r;   r@   s   @r2   rG   rG   I   s    Ur3   rG   c                   $     e Zd Z fdZd Z xZS )$Data2VecAudioPositionalConvEmbeddingc                     t         |           t        j                  t	        |j
                        D cg c]  }t        |       c}      | _        y c c}w )N)r    r!   r   
ModuleListrangenum_conv_pos_embeddingsrG   layers)r.   r/   _r1   s      r2   r!   z-Data2VecAudioPositionalConvEmbedding.__init__e   s@    mm?DVEcEc?de!-f5e
es   Ac                     |j                  dd      }| j                  D ]
  } ||      } |j                  dd      }|S rO   )r7   rV   )r.   r9   layers      r2   r:   z,Data2VecAudioPositionalConvEmbedding.forwardk   sI    %//15[[ 	1E!-0M	1%//15r3   r;   r@   s   @r2   rQ   rQ   d   s    
r3   rQ   c                       e Zd Zd Zy)Data2VecAudioFeatureEncoderc           	          t         j                  j                  |        t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        d| _        d| _	        y c c}w )N)r0   FT)
r   Moduler!   rS   rT   num_feat_extract_layersr   conv_layersgradient_checkpointing_requires_grad)r.   r/   is      r2   r!   z$Data2VecAudioFeatureEncoder.__init__t   s^    
		4 ==AFvGeGeAfgA#FQ7g
 ',#" hs   A5N)r<   r=   r>   r!   rD   r3   r2   r[   r[   s   s    #r3   r[   c                       e Zd Zy)Data2VecAudioFeatureProjectionNrC   rD   r3   r2   rd   rd   }   rE   r3   rd   c                       e Zd Zy)Data2VecAudioEncoderNrC   rD   r3   r2   rf   rf      rE   r3   rf   c                       e Zd Zy)Data2VecAudioAdapterNrC   rD   r3   r2   rh   rh      rE   r3   rh   c                   t    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZ ej                         d        Zd Zd Zd	 Zy
)Data2VecAudioPreTrainedModelr/   data2vec_audioinput_valuesaudioTc                    t        |t              rt        j                  d|j                  j
                  z        }t        j                  |j                  j                  | |       t        j                  |j                  j                  | |       yt        |t              r+t        j                  |j                  j                  d       yt        |t        j                        rct        j                  |j                  d| j                   j"                         |j                   t        j$                  |j                         yyt        |t        j&                  t        j(                  f      rX|j                  t        j$                  |j                         |j                   t        j*                  |j                         yyt        |t        j,                        rt        j.                  |j                         |j                  `t        j                  |j0                  |j2                  |j4                  d   z  z        }t        j                  |j                  | |       yyy)zInitialize the weightsr   )abr           )meanstdN)
isinstancerd   mathsqrt
projectionin_featuresinituniform_weightr   rG   	constant_r)   r   Linearnormal_r/   initializer_rangezeros_r*   	GroupNormones_r%   kaiming_normal_rJ   in_channelsr   )r.   moduleks      r2   _init_weightsz*Data2VecAudioPreTrainedModel._init_weights   s    f<=		!f//;;;<AMM&++22qbA>MM&++00QB!< @ANN6;;++Q/		*LLSdkk6S6ST{{&FKK( 'r|| <={{&FKK(}}(

6==) )		*  /{{&IIfmmv/A/AFDVDVWXDY/YZ[fkkaR15 ' +r3   c                     t        d      NzNot needed for Data2VecAudioAttributeErrorr.   s    r2   _get_adaptersz*Data2VecAudioPreTrainedModel._get_adapters       ;<<r3   c                     t        d      r   r   r   s    r2   init_adapter_layersz0Data2VecAudioPreTrainedModel.init_adapter_layers   r   r3   c                     t        d      r   r   r   s    r2   load_adapterz)Data2VecAudioPreTrainedModel.load_adapter   r   r3   N)r<   r=   r>   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attntorchno_gradr   r   r   r   rD   r3   r2   rj   rj      sY    ($O&*#NU]]_6 62===r3   rj   c                   0     e Zd ZdefdZd Z fdZ xZS )Data2VecAudioModelr/   c                    t         j                  | |       || _        t        |      | _        t        |      | _        |j                  dkD  s|j                  dkD  rEt        j                  t        j                  |j                        j                               | _        t!        |      | _        |j$                  rt'        |      nd | _        | j+                          y )Nrq   )rj   r!   r/   r[   feature_extractorrd   feature_projectionmask_time_probmask_feature_probr   	Parameterr   TensorrK   rz   masked_spec_embedrf   encoderadd_adapterrh   adapter	post_init)r.   r/   s     r2   r!   zData2VecAudioModel.__init__   s    $--dF;!<V!D"@"H   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"+F37=7I7I+F3t 	r3   c                 8    | j                   j                          y)z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r   _freeze_parametersr   s    r2   freeze_feature_encoderz)Data2VecAudioModel.freeze_feature_encoder   s    
 	113r3   c                 "    t        |   di |S NrD   r    r:   r.   super_kwargsr1   s     r2   r:   zData2VecAudioModel.forward       w...r3   )r<   r=   r>   r   r!   r   r:   r?   r@   s   @r2   r   r      s    2 "4/ /r3   r   c                   0     e Zd Zd Zd Zd Z fdZ xZS )Data2VecAudioForCTCc                    t         j                  | |       t        |      | _        t	        j
                  |j                        | _        |j                  t        d| j                   d      t        |d      r|j                  r|j                  n|j                  }t	        j                  ||j                        | _        | j#                          y)aZ  
        config ([`Data2VecAudioForCTC`]):
            Model configuration class with all the parameters of the model. Initializing with a config file does not
            load the weights associated with the model, only the configuration. Check out the
            [`~PreTrainedModel.from_pretrained`]  method to load the model weights.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `Data2VecAudioForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.r   )rj   r!   r   rk   r   Dropoutfinal_dropoutdropout
vocab_size
ValueErrorr1   hasattrr   output_hidden_sizerK   r}   lm_headr   )r.   r/   r   s      r2   r!   zData2VecAudioForCTC.__init__   s     	%--dF;08zz&"6"67$00@ AH H  *1)GFL^L^F%%djdvdv 	 yy!3V5F5FG 	r3   c                     t        d      r   r   r   s    r2   freeze_base_modelz%Data2VecAudioForCTC.freeze_base_model   r   r3   c                     t        d      r   r   r   s    r2   tie_weightszData2VecAudioForCTC.tie_weights   r   r3   c                 "    t        |   di |S r   r   r   s     r2   r:   zData2VecAudioForCTC.forward   r   r3   )r<   r=   r>   r!   r   r   r:   r?   r@   s   @r2   r   r      s    6==/ /r3   r   c                       e Zd Zy)&Data2VecAudioForSequenceClassificationNrC   rD   r3   r2   r   r      rE   r3   r   c                       e Zd Zy)(Data2VecAudioForAudioFrameClassificationNrC   rD   r3   r2   r   r     rE   r3   r   c                       e Zd Zy)Data2VecAudioForXVectorNrC   rD   r3   r2   r   r     rE   r3   r   )r   r   r   r   r   rj   ).__doc__ru   r   r    r   ry   activationsr   modeling_layersr   modeling_outputsr   modeling_utilsr	   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   r   r   configuration_data2vec_audior   r   rB   r]   rG   rQ   r[   rd   rf   rh   rj   Data2VecAudioBaseModelOutputr   r   r   r   r   __all__rD   r3   r2   <module>r      s   "    & ! 9 7 -    >7 6	0 	ryy 6299 #"8 #	%> 		? 		? 	+=?4K +=\  7 /5} /:#/6 #/L	-N 		/R 		0 	r3   