
    qiZ                        d dl Z d dlZd dlmZ d dlmc mZ ddlmZ	 ddl
mZ ddlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ d
dlmZmZmZmZmZmZmZmZm Z  ddl!m"Z"  ejF                  e$      Z% G d de      Z& G d de      Z' G d dejP                        Z) G d de      Z* G d de      Z+ G d de      Z, G d dejP                        Z- G d dejP                        Z. G d dejP                        Z/ G d  d!ee       Z0eZ1 G d" d#e      Z2 G d$ d%e      Z3 G d& d'e      Z4 G d( d)e      Z5 G d* d+e      Z6g d,Z7y)-    N   )initialization)is_deepspeed_zero3_enabled)is_fsdp_managed_module)GradientCheckpointingLayer)BaseModelOutputWav2Vec2BaseModelOutput)PreTrainedModel)logging   )	Wav2Vec2FeatureProjectionWav2Vec2FeedForward#Wav2Vec2ForAudioFrameClassificationWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ForXVectorWav2Vec2ModelWav2Vec2PositionalConvEmbeddingWav2Vec2PreTrainedModel   )WavLMConfigc                       e Zd Zy)WavLMPositionalConvEmbeddingN__name__
__module____qualname__     Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/wavlm/modular_wavlm.pyr   r          r   r   c                       e Zd Zy)WavLMFeatureProjectionNr   r   r   r    r#   r#   #   r!   r   r#   c                       e Zd ZdZ	 	 	 	 ddedededededef fdZ	 	 	 	 dd
ej                  dej                  d	z  dej                  d	z  dede
ej                  ej                  d	z  e
ej                     d	z  f   f
dZd
ej                  dej                  ej                  z  dej                  dede
ej                  ej                  f   f
dZdededej                  fdZdej                  dej                  fdZ xZS )WavLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsdropoutnum_bucketsmax_distancehas_relative_position_biasc                    t         |           || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        t        j                  ||      | _
        t        j                  ||      | _        t        j                  ||      | _        t        j                  ||      | _        || _        || _        t        j                   t#        j$                  d| j                  dd            | _        t        j                  | j
                  d      | _        |r0t        j*                  | j                  | j                        | _        y y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r      )super__init__r&   r'   r(   head_dim
ValueErrorscalingnnLineark_projv_projq_projout_projr)   r*   	Parametertorchonesgru_rel_pos_constgru_rel_pos_linear	Embeddingrel_attn_embed)selfr&   r'   r(   r)   r*   r+   	__class__s          r    r/   zWavLMAttention.__init__*   s7    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*ii	95ii	95ii	95		)Y7&(!#ejjDNNAq.Q!R"$))DMM1"=%"$,,t/?/?"PD &r   Nhidden_statesattention_maskposition_biasoutput_attentionsreturnc                     |j                         \  }}}|S| j                  ||      }|j                  d      j                  |ddd      j	                  || j
                  z  ||      }|j	                  |j                  dd | j
                  dfz         }	|	j                  dddd      }	| j                  |	      }
|
j	                  |	j                  dd dz         j                  d      }
t        j                  |
      j                  dd      \  }}||| j                  z  d	z
  z  d
z   }|j	                  || j
                  z  dd      |z  }|j	                  d||f      }| j                  ||||      \  }}|||fS )z'Attention layer with relative attentionNr   r   r   r   )r      dim      ?g       @)sizecompute_bias	unsqueezerepeatviewr'   shapepermuter=   sumr:   sigmoidchunkr<   torch_multi_head_self_attention)r@   rB   rC   rD   rE   indexbsztgt_len_gated_hidden_statesrelative_position_projgate_agate_bgate_outputgated_position_biasattn_outputattn_weightss                    r    forwardzWavLMAttention.forwardN   s    (,,.Wa   --gw?M''*11#q!Q?DDS4>>EY[bdkl  ,001D1DSb1IT^^]_L`1`a199!Q1E "&!8!89L!M!7!<!<=P=V=VWZXZ=[^d=d!e!i!ijl!m '=>DDQBDO)?)? ?# EFL *..sT^^/CRKm[166GW7MN$($H$H>+>@Q%
!\ L-77r   ra   c                 X   |j                  dd      x}x}}||j                  d      nd}dx}	}
d}t        j                  |||| j                  | j
                  t        j                  dg      t        j                  | j                  j                  | j                  j                  | j                  j                  f      |	|
|| j                  | j                  j                  | j                  j                  | j                   |||d| j                  j                  | j                  j                  | j                  j                        \  }}|j                  dd      }|C|dddf   j#                  |j$                  dd | j
                  fz   |j$                  dd z         }||fS )zCsimple wrapper around torch's multi_head_attention_forward functionr   r   NFT)use_separate_proj_weightq_proj_weightk_proj_weightv_proj_weight)	transposeneFmulti_head_attention_forwardr&   r'   r:   emptycatr7   biasr5   r6   r(   r8   weighttrainingbroadcast_torR   )r@   rB   rC   ra   rE   querykeyvaluekey_padding_maskbias_kbias_vadd_zero_attnrb   rc   s                 r    rW   z.WavLMAttention.torch_multi_head_self_attentionw   s    ,55a;;;e3A3M>,,Q/SW  %&$B$BNNNNKKIIt{{'')9)94;;;K;KLMLLMM  MMMM%)++,,++,,++,,+%
!\2 "++Aq1# (40==""2A&$..)::\=O=OPQPR=SSL L((r   query_length
key_lengthc                    t        j                  |t         j                        d d d f   }t        j                  |t         j                        d d d f   }||z
  }| j                  |      }|j	                  | j
                  j                  j                        }| j                  |      }|j                  g d      }|S )N)dtype)r   r   r   )	r:   arangelong_relative_positions_buckettor?   rq   devicerS   )r@   r{   r|   context_positionmemory_positionrelative_positionrelative_position_bucketvaluess           r    rN   zWavLMAttention.compute_bias   s     <<EJJG4P,,zDT1WM+.>>#'#B#BCT#U #;#>#>t?R?R?Y?Y?`?`#a $$%=>	*r   relative_positionsc                 $   | j                   dz  }|dkD  j                  t        j                        |z  }t        j                  |      }|dz  }||k  }t        j
                  |j                         |z        }|t        j
                  | j                  |z        z  }|||z
  z  }||z   j                  t        j                        }t        j                  |t        j                  ||dz
              }|t        j                  |||      z  }|S )Nr   r   r   )r)   r   r:   r   abslogfloatmathr*   min	full_likewhere)r@   r   r)   relative_buckets	max_exactis_smallrelative_positions_if_largerelative_position_if_larges           r    r   z)WavLMAttention._relative_positions_bucket   s   &&!+.266uzzB[P"YY'9:1$	%	1&+ii0B0H0H0JY0V&W#&ADHHTM^M^ajMjDk&k#&A[S\E\&]#&/2M&M%Q%QRWR\R\%]"%*YY&8RT_bcTc(d&
" 	EKK2DF`aar   )        i@  i   TNNFr   )r   r   r   __doc__intr   boolr/   r:   Tensortuplerd   FloatTensor
LongTensor
BoolTensorrW   rN   r   __classcell__rA   s   @r    r%   r%   '   s   G +/"Q"Q "Q 	"Q
 "Q "Q %)"QN /3-1"''8||'8 t+'8 ||d*	'8
  '8 
u||U\\D0%2E2LL	M'8R5)((5) ((5+;+;;5) #..	5)
  5) 
u  %"3"33	45)n # %BSBS  U=N=N  SXSdSd  r   r%   c                       e Zd Zy)WavLMFeedForwardNr   r   r   r    r   r      r!   r   r   c                   2     e Zd Zddedef fdZddZ xZS )WavLMEncoderLayerconfigr+   c                    t         |           t        |j                  |j                  |j
                  |j                  |j                  |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        t!        |      | _        t        j                  |j                  |j                        | _        y N)r&   r'   r(   r)   r*   r+   epsr.   r/   r%   hidden_sizenum_attention_headsattention_dropoutr)   max_bucket_distance	attentionr3   Dropouthidden_dropoutr(   	LayerNormlayer_norm_eps
layer_normr   feed_forwardfinal_layer_normr@   r   r+   rA   s      r    r/   zWavLMEncoderLayer.__init__       '((00,,**33'A
 zz&"7"78,,v'9'9v?T?TU,V4 "V-?-?VEZEZ [r   c                     |}| j                  |||||      \  }}}| j                  |      }||z   }| j                  |      }|| j                  |      z   }| j	                  |      }||f}|r||fz  }|S )NrC   rD   rE   rX   )r   r(   r   r   r   )	r@   rB   rC   rD   rE   rX   attn_residualrc   outputss	            r    rd   zWavLMEncoderLayer.forward   s    %59^^)'/ 6D 6
2|] ]3%56%(9(9-(HH--m< -0&Gr   Tr   r   r   r   r   r   r/   rd   r   r   s   @r    r   r      s    \{ \ \r   r   c                   2     e Zd Zddedef fdZddZ xZS ) WavLMEncoderLayerStableLayerNormr   r+   c                    t         |           t        |j                  |j                  |j
                  |j                  |j                  |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        t!        |      | _        t        j                  |j                  |j                        | _        y r   r   r   s      r    r/   z)WavLMEncoderLayerStableLayerNorm.__init__   r   r   c                     |}| j                  |      }| j                  ||||      \  }}}| j                  |      }||z   }|| j                  | j	                  |            z   }||f}|r||fz  }|S )N)rC   rD   rE   )r   r   r(   r   r   )r@   rB   rC   rD   rE   r   rc   r   s           r    rd   z(WavLMEncoderLayerStableLayerNorm.forward
  s    %659^^)'/	 6D 6
2|] ]3%5%(9(9$:O:OP]:^(__ -0&Gr   r   )NNFr   r   s   @r    r   r      s    \{ \ \r   r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )WavLMEncoderc           
         t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        ||dk(         c}      | _        d| _        y c c}w Nr   r   )r+   F)r.   r/   r   r   pos_conv_embedr3   r   r   r   r   r   r   r(   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointingr@   r   irA   s      r    r/   zWavLMEncoder.__init__   s    :6B,,v'9'9v?T?TUzz&"7"78mmUZ[a[s[sUtuPQv16Ku
 ',# v   !Cc                    |rdnd }|rdnd }|5|j                  d      j                  dd|j                  d         }d|| <   | j                  |      }	||	z   }| j	                  |      }| j                  |      }t               xs t        |       }
d }t        | j                        D ]y  \  }}|r||fz   }t        j                  g       }| j                  xr  |dkD  xr || j                  j                  k  }|r|
r ||||||      }|d d \  }}|rd}|sq|d   fz   }{ |r||fz   }|st        d |||fD              S t!        |||	      S )
Nr   rH   r   r   r   r   NNNc              3   &   K   | ]	  }||  y wNr   .0vs     r    	<genexpr>z'WavLMEncoder.forward.<locals>.<genexpr>a       mq_`_lm   last_hidden_staterB   
attentions)rO   rP   rR   r   r   r(   r   r   	enumerater   r:   randrr   r   	layerdropr   r   r@   rB   rC   rE   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusrD   r   layerdropout_probabilityskip_the_layerlayer_outputss                    r    rd   zWavLMEncoder.forward+  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M001"11-@%(;;6]302R6LT6R!$++. 	PHAu#$58H$H! #(**R.!]]fq1uf:MPTP[P[PePe:eN![ %!#1"/&7! 0=Ra/@,} 2 &9]1=M<O&O#1	P4   1]4D Dm]4EGZ$[mmm++*
 	
r   NFFTr   r   r   r/   rd   r   r   s   @r    r   r     s    	, ";
r   r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )WavLMEncoderStableLayerNormc           
         t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        ||dk(         c}      | _        d| _        y c c}w r   )r.   r/   r   r   r   r3   r   r   r   r   r   r   r(   r   r   r   r   r   r   r   s      r    r/   z$WavLMEncoderStableLayerNorm.__init__j  s    :6B,,v'9'9v?T?TUzz&"7"78mm v778 1UVZ[U[]
 ',#r   c                    |rdnd }|rdnd }|5|j                  d      j                  dd|j                  d         }d|| <   | j                  |      }	||	z   }| j	                  |      }t               xs t        |       }
d }t        | j                        D ]x  \  }}|r||fz   }t        j                  g       }| j                  xr  |dkD  xr || j                  j                  k  }|r|
r |||||      }|d d \  }}|rd}|sp|d   fz   }z | j                  |      }|r||fz   }|st        d |||fD              S t!        |||	      S )
Nr   rH   r   r   r   )rC   rE   rD   r   c              3   &   K   | ]	  }||  y wr   r   r   s     r    r   z6WavLMEncoderStableLayerNorm.forward.<locals>.<genexpr>  r   r   r   )rO   rP   rR   r   r(   r   r   r   r   r:   r   rr   r   r   r   r   r   r   s                    r    rd   z#WavLMEncoderStableLayerNorm.forwardx  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M001"11-@%(;;]302R6LT6R!$++. 	PHAu#$58H$H! #(**R.!]]fq1uf:MPTP[P[PePe:eN![ !&!#1&7"/	! 0=Ra/@,} 2 &9]1=M<O&O#/	P2 6 1]4D Dm]4EGZ$[mmm+;LYl
 	
r   r   r   r   s   @r    r   r   i  s    ," "9
r   r   c                   8     e Zd ZdZ fdZed        Zd Z xZS )WavLMGumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
    c                 0   t         |           |j                  | _        |j                  | _        |j                  | j                  z  dk7  r&t        d|j                   d| j                   d      t        j                  t        j                  d| j                  | j
                  z  |j                  | j                  z              | _        t        j                  |j                  d   | j                  | j
                  z        | _        d| _        y )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenation.r   rH   r   )r.   r/   num_codevector_groups
num_groupsnum_codevectors_per_groupnum_varscodevector_dimr1   r3   r9   r:   r   codevectorsr4   conv_dimweight_projtemperature)r@   r   rA   s     r    r/   z#WavLMGumbelVectorQuantizer.__init__  s     6688  4??2a7)&*?*?)@ A66:oo5F G%%  <<a4==!@&BWBW[_[j[jBjk
 99V__R%8$//DMM:YZ r   c                     | j                  d      }t        j                  t        j                  t        j                  ||      d             j                         }|S )Nr   rJ   rH   )meanr:   exprT   xlogy)probsmarginal_probs
perplexitys      r    _compute_perplexityz.WavLMGumbelVectorQuantizer._compute_perplexity  sI    *YY		%++nn*U[] ^^_cce
r   c                    |j                   \  }}}| j                  |      }|j                  ||z  | j                  z  d      }| j                  rt
        j                  j                  |j                         | j                  d      }|j                  |      }t        j                  |j                  ||z  | j                  d      j                         d      }| j                  |      }n}|j                  d      } |j                  |j                    j!                  d|j                  dd      d      }|j                  ||z  | j                  d      }| j                  |      }|j                  ||z  d      }|j#                  d      | j$                  z  }	|	j                  ||z  | j                  | j&                  d      }
|
j)                  d      j                  ||d      }
|
|fS )NrH   T)tauhardrJ   r   rL   )rR   r   rQ   r   rr   r3   
functionalgumbel_softmaxr   r   type_asr:   softmaxr  argmax	new_zerosscatter_rO   r   r   rT   )r@   rB   
batch_sizesequence_lengthr   codevector_probscodevector_soft_distr  codevector_idxcodevectors_per_groupr   s              r    rd   z"WavLMGumbelVectorQuantizer.forward  s   3@3F3F0
O[ ((7%**:+G$//+Y[]^==!}};;M<O<O<QW[WgWgnr;s/77F $)=="":#?RTU[[]ce$  112FGJ +11b19N6}668K8KLUUN''A.   044Z/5QSWSbSbdfg112BCJ+00o1MrR 0 : :2 >AQAQ Q+00o1Mt`d`m`moqr!oob)..z?BOJ&&r   )	r   r   r   r   r/   staticmethodr  rd   r   r   s   @r    r   r     s&    
*  
"'r   r   c                   t    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZ ej                         d        Zd Zd	 Zd
 Zy)WavLMPreTrainedModelr   wavlminput_valuesaudioTFc           
         t        |t              rut        j                  |j                  j
                  dd       t        j                  |j                  j                         t        j                  |j                         yt        |t              rt        j                  |j                  j
                  ddt        j                  d|j                  j                  d   |j                  j                  z  z        z         t        j                   |j                  j                  d       yt        |t"              rt        j                  d|j$                  j&                  z        }t        j                  |j$                  j
                  | |       t        j                  |j$                  j                  | |       yt        |t(        j*                        rct        j                  |j
                  d| j,                  j.                         |j                   t        j                  |j                         yyt        |t(        j0                  t(        j2                  f      r?t        j                  |j                         t        j4                  |j
                         yt        |t(        j6                        rt        j8                  |j
                         |j                  `t        j                  |j:                  |j                  |j                  d   z  z        }t        j                  |j                  | |       yyy)zInitialize the weightsr   r   )r  stdr   r   )abN)
isinstancer   initnormal_r   rq   zeros_rp   uniform_r   r   convr   sqrtkernel_sizein_channels	constant_r#   
projectionin_featuresr3   r4   r   initializer_ranger   	GroupNormones_Conv1dkaiming_normal_groups)r@   moduleks      r    _init_weightsz"WavLMPreTrainedModel._init_weights  s*    f89LL++22!DKK**//0MM&,,- <=LL""		!v{{'>'>q'AFKKD[D['["\]]
 NN6;;++Q/ 67		!f//;;;<AMM&++22qbA>MM&++00QB!<		*LLSdkk6S6ST{{&FKK( 'r|| <=KK$JJv}}%		*  /{{&IIfmmv/A/AFDVDVWXDY/YZ[fkkaR15 ' +r   c                     t        d      NzNot needed for WavLMAttributeErrorr@   s    r    _get_adaptersz"WavLMPreTrainedModel._get_adapters&      344r   c                     t        d      r:  r;  r=  s    r    init_adapter_layersz(WavLMPreTrainedModel.init_adapter_layers)  r?  r   c                     t        d      r:  r;  r=  s    r    load_adapterz!WavLMPreTrainedModel.load_adapter,  r?  r   N)r   r   r   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr:   no_gradr8  r>  rA  rC  r   r   r    r  r    sZ    $O&*# NU]]_6 6B555r   r  c                       e Zd Zy)
WavLMModelNr   r   r   r    rN  rN  3  r!   r   rN  c                       e Zd Zy)WavLMForCTCNr   r   r   r    rP  rP  7  r!   r   rP  c                       e Zd Zy)WavLMForSequenceClassificationNr   r   r   r    rR  rR  ;  r!   r   rR  c                       e Zd Zy) WavLMForAudioFrameClassificationNr   r   r   r    rT  rT  ?  r!   r   rT  c                       e Zd Zy)WavLMForXVectorNr   r   r   r    rV  rV  C  r!   r   rV  )rT  rP  rR  rV  rN  r  )8r   r:   torch.nnr3   torch.nn.functionalr  rl    r   r%  integrations.deepspeedr   integrations.fsdpr   modeling_layersr   modeling_outputsr   r	   modeling_utilsr
   utilsr   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   configuration_wavlmr   
get_loggerr   loggerr   r#   Moduler%   r   r   r   r   r   r   r  WavLMBaseModelOutputrN  rP  rR  rT  rV  __all__r   r   r    <module>rg     s>        & @ 7 9 H - 
 
 
 - 
		H	%	#B 		6 	c RYY c L	* 	&2 &R"'A "JG
299 G
TH
")) H
VC' C'L35?,C 35l / 	 		. 		%F 		'J 		( 	r   