
    qi                       d dl Z d dlZd dlZd dlZd dlmZ d dlmc mZ	 d dlm
Z
 ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZmZ ddl m!Z!m"Z"m#Z# ddl$m%Z%  e#jL                  e'      Z( G d dejR                        Z* G d dejR                        Z+ G d dejR                        Z, G d dejR                        Z- G d dejR                        Z. G d de      Z/ G d de      Z0 G d dejR                        Z1 G d dejR                        Z2 G d  d!ejR                        Z3e! G d" d#e             Z4 G d$ d%e      Z5 G d& d'e      Z6 G d( d)e      Z7 G d* d+ejR                        Z8 G d, d-ejR                        Z9 G d. d/ejR                        Z:	 	 dKd0e;e<e<f   d1e=d2e<d3ej|                  dz  d4e<d5ej~                  fd6Z@eZAe! G d7 d8e4             ZBd9ZC e!d:;       G d< d=e4             ZD e!d>;       G d? d@e4             ZEe! G dA dBe4             ZF G dC dDejR                        ZG G dE dFejR                        ZH e!dG;       G dH dIe4             ZIg dJZJy)L    N)CrossEntropyLoss   )initialization)ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputSequenceClassifierOutputTokenClassifierOutputWav2Vec2BaseModelOutputXVectorOutput)PreTrainedModel*get_torch_context_manager_or_global_device)auto_docstringis_peft_availablelogging   )WavLMConfigc                   $     e Zd Z fdZd Z xZS )WavLMSamePadLayerc                 P    t         |           |dz  dk(  rd| _        y d| _        y N   r   r   )super__init__num_pad_remove)selfnum_conv_pos_embeddings	__class__s     Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/wavlm/modeling_wavlm.pyr   zWavLMSamePadLayer.__init__&   s)    #:Q#>!#Ca    c                 V    | j                   dkD  r|d d d d d | j                    f   }|S Nr   )r   r   hidden_statess     r"   forwardzWavLMSamePadLayer.forward*   s6    ")!Q0F43F3F2F0F*FGMr#   __name__
__module____qualname__r   r(   __classcell__r!   s   @r"   r   r   %   s    Kr#   r   c                   $     e Zd Z fdZd Z xZS )WavLMPositionalConvEmbeddingc                    t         |           t        j                  |j                  |j                  |j
                  |j
                  dz  |j                        | _        t        j                  j                  }t        t        j                  j                  d      r$t        j                  j                  j                  }t               r(dd l}|j                  j                  | j                  j                   d      5   || j                  dd      | _        d d d        t        | j                  d      rU| j                  j                  j                   j"                  }| j                  j                  j                   j$                  }n,| j                  j&                  }| j                  j(                  }|j                  j+                  | |       |j                  j+                  | |       n || j                  dd      | _        t-        |j
                        | _        t0        |j2                     | _        y # 1 sw Y   'xY w)	Nr   )kernel_sizepaddinggroupsweight_normr   )modifier_rankweight)namedimparametrizations)r   r   nnConv1dhidden_sizer    num_conv_pos_embedding_groupsconvutilsr5   hasattrr:   r   	deepspeedzeroGatheredParametersr7   	original0	original1weight_gweight_vregister_external_parameterr   r3   r   feat_extract_activation
activation)r   configr5   rB   rG   rH   r!   s         r"   r   z%WavLMPositionalConvEmbedding.__init__1   s   II6622a777
	 hh**288,,m<((33??K%'224993C3CST2U I'		aH	Ityy"459955<<FF9955<<FF99--99--NN66tXFNN66tXF#DIIH!DDI()G)GH !?!?@I Is   IIc                     |j                  dd      }| j                  |      }| j                  |      }| j                  |      }|j                  dd      }|S Nr   r   )	transposer?   r3   rK   r&   s     r"   r(   z$WavLMPositionalConvEmbedding.forwardR   sV    %//15		-0]36%//15r#   r)   r.   s   @r"   r0   r0   0   s    ABr#   r0   c                   $     e Zd Z fdZd Z xZS )WavLMFeatureProjectionc                 4   t         |           t        j                  |j                  d   |j
                        | _        t        j                  |j                  d   |j                        | _	        t        j                  |j                        | _        y )Neps)r   r   r;   	LayerNormconv_dimlayer_norm_eps
layer_normLinearr=   
projectionDropoutfeat_proj_dropoutdropoutr   rL   r!   s     r"   r   zWavLMFeatureProjection.__init__^   sf    ,,vr':@U@UV))FOOB$79K9KLzz&":":;r#   c                 p    | j                  |      }| j                  |      }| j                  |      }||fS N)rY   r[   r^   )r   r'   norm_hidden_statess      r"   r(   zWavLMFeatureProjection.forwardd   s:    !__];(:;]3000r#   r)   r.   s   @r"   rQ   rQ   ]   s    <1r#   rQ   c                       e Zd ZdZ	 	 	 	 ddedededededef fdZ	 	 	 	 dd
ej                  dej                  d	z  dej                  d	z  dede
ej                  ej                  d	z  e
ej                     d	z  f   f
dZd
ej                  dej                  ej                  z  dej                  dede
ej                  ej                  f   f
dZdededej                  fdZdej                  dej                  fdZ xZS )WavLMAttentionz=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsr^   num_bucketsmax_distancehas_relative_position_biasc                    t         |           || _        || _        || _        ||z  | _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        t        j                  ||      | _
        t        j                  ||      | _        t        j                  ||      | _        t        j                  ||      | _        || _        || _        t        j                   t#        j$                  d| j                  dd            | _        t        j                  | j
                  d      | _        |r0t        j*                  | j                  | j                        | _        y y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r      )r   r   re   rf   r^   head_dim
ValueErrorscalingr;   rZ   k_projv_projq_projout_projrg   rh   	Parametertorchonesgru_rel_pos_constgru_rel_pos_linear	Embeddingrel_attn_embed)r   re   rf   r^   rg   rh   ri   r!   s          r"   r   zWavLMAttention.__init__o   s7    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*ii	95ii	95ii	95		)Y7&(!#ejjDNNAq.Q!R"$))DMM1"=%"$,,t/?/?"PD &r#   Nr'   attention_maskposition_biasoutput_attentionsreturnc                     |j                         \  }}}|S| j                  ||      }|j                  d      j                  |ddd      j	                  || j
                  z  ||      }|j	                  |j                  dd | j
                  dfz         }	|	j                  dddd      }	| j                  |	      }
|
j	                  |	j                  dd dz         j                  d      }
t        j                  |
      j                  dd      \  }}||| j                  z  d	z
  z  d
z   }|j	                  || j
                  z  dd      |z  }|j	                  d||f      }| j                  ||||      \  }}|||fS )z'Attention layer with relative attentionNr   r   rS   r   r   )r      r9         ?g       @)sizecompute_bias	unsqueezerepeatviewrf   shapepermuterw   sumrt   sigmoidchunkrv   torch_multi_head_self_attention)r   r'   rz   r{   r|   indexbsztgt_len_gated_hidden_statesrelative_position_projgate_agate_bgate_outputgated_position_biasattn_outputattn_weightss                    r"   r(   zWavLMAttention.forward   s    (,,.Wa   --gw?M''*11#q!Q?DDS4>>EY[bdkl  ,001D1DSb1IT^^]_L`1`a199!Q1E "&!8!89L!M!7!<!<=P=V=VWZXZ=[^d=d!e!i!ijl!m '=>DDQBDO)?)? ?# EFL *..sT^^/CRKm[166GW7MN$($H$H>+>@Q%
!\ L-77r#   r   c                 X   |j                  dd      x}x}}||j                  d      nd}dx}	}
d}t        j                  |||| j                  | j
                  t        j                  dg      t        j                  | j                  j                  | j                  j                  | j                  j                  f      |	|
|| j                  | j                  j                  | j                  j                  | j                   |||d| j                  j                  | j                  j                  | j                  j                        \  }}|j                  dd      }|C|dddf   j#                  |j$                  dd | j
                  fz   |j$                  dd z         }||fS )zCsimple wrapper around torch's multi_head_attention_forward functionr   r   NFT)use_separate_proj_weightq_proj_weightk_proj_weightv_proj_weight)rO   neFmulti_head_attention_forwardre   rf   rt   emptycatrq   biasro   rp   r^   rr   r7   trainingbroadcast_tor   )r   r'   rz   r   r|   querykeyvaluekey_padding_maskbias_kbias_vadd_zero_attnr   r   s                 r"   r   z.WavLMAttention.torch_multi_head_self_attention   s    ,55a;;;e3A3M>,,Q/SW  %&$B$BNNNNKKIIt{{'')9)94;;;K;KLMLLMM  MMMM%)++,,++,,++,,+%
!\2 "++Aq1# (40==""2A&$..)::\=O=OPQPR=SSL L((r#   query_length
key_lengthc                    t        j                  |t         j                        d d d f   }t        j                  |t         j                        d d d f   }||z
  }| j                  |      }|j	                  | j
                  j                  j                        }| j                  |      }|j                  g d      }|S )Ndtype)r   r   r   )	rt   arangelong_relative_positions_buckettory   r7   devicer   )r   r   r   context_positionmemory_positionrelative_positionrelative_position_bucketvaluess           r"   r   zWavLMAttention.compute_bias   s     <<EJJG4P,,zDT1WM+.>>#'#B#BCT#U #;#>#>t?R?R?Y?Y?`?`#a $$%=>	*r#   relative_positionsc                 $   | j                   dz  }|dkD  j                  t        j                        |z  }t        j                  |      }|dz  }||k  }t        j
                  |j                         |z        }|t        j
                  | j                  |z        z  }|||z
  z  }||z   j                  t        j                        }t        j                  |t        j                  ||dz
              }|t        j                  |||      z  }|S r   )rg   r   rt   r   abslogfloatmathrh   min	full_likewhere)r   r   rg   relative_buckets	max_exactis_smallrelative_positions_if_largerelative_position_if_larges           r"   r   z)WavLMAttention._relative_positions_bucket   s   &&!+.266uzzB[P"YY'9:1$	%	1&+ii0B0H0H0JY0V&W#&ADHHTM^M^ajMjDk&k#&A[S\E\&]#&/2M&M%Q%QRWR\R\%]"%*YY&8RT_bcTc(d&
" 	EKK2DF`aar#   )        i@  i   TNNFr   )r*   r+   r,   __doc__intr   boolr   rt   Tensortupler(   FloatTensor
LongTensor
BoolTensorr   r   r   r-   r.   s   @r"   rd   rd   l   s   G +/"Q"Q "Q 	"Q
 "Q "Q %)"QN /3-1"''8||'8 t+'8 ||d*	'8
  '8 
u||U\\D0%2E2LL	M'8R5)((5) ((5+;+;;5) #..	5)
  5) 
u  %"3"33	45)n # %BSBS  U=N=N  SXSdSd  r#   rd   c                   $     e Zd Z fdZd Z xZS )WavLMFeedForwardc                    t         |           t        j                  |j                        | _        t        j                  |j                  |j                        | _	        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  |j                  |j                        | _        t        j                  |j                         | _        y ra   )r   r   r;   r\   activation_dropoutintermediate_dropoutrZ   r=   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr_   s     r"   r   zWavLMFeedForward.__init__  s    $&JJv/H/H$I!"$))F,>,>@X@X"Yf''-'-f.?.?'@D$'-'8'8D$IIf&>&>@R@RS jj)>)>?r#   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S ra   )r   r   r   r   r   r&   s     r"   r(   zWavLMFeedForward.forward   sX    //>00?11-@))-8++M:r#   r)   r.   s   @r"   r   r     s    @r#   r   c                   2     e Zd Zddedef fdZddZ xZS )WavLMEncoderLayerrL   ri   c                    t         |           t        |j                  |j                  |j
                  |j                  |j                  |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        t!        |      | _        t        j                  |j                  |j                        | _        y N)re   rf   r^   rg   rh   ri   rT   r   r   rd   r=   num_attention_headsattention_dropoutrg   max_bucket_distance	attentionr;   r\   r   r^   rV   rX   rY   r   feed_forwardfinal_layer_normr   rL   ri   r!   s      r"   r   zWavLMEncoderLayer.__init__+      '((00,,**33'A
 zz&"7"78,,v'9'9v?T?TU,V4 "V-?-?VEZEZ [r#   c                     |}| j                  |||||      \  }}}| j                  |      }||z   }| j                  |      }|| j                  |      z   }| j	                  |      }||f}|r||fz  }|S )Nrz   r{   r|   r   )r   r^   rY   r   r   )	r   r'   rz   r{   r|   r   attn_residualr   outputss	            r"   r(   zWavLMEncoderLayer.forward:  s    %59^^)'/ 6D 6
2|] ]3%56%(9(9-(HH--m< -0&Gr#   Tr   r*   r+   r,   r   r   r   r(   r-   r.   s   @r"   r   r   *  s    \{ \ \r#   r   c                   2     e Zd Zddedef fdZddZ xZS ) WavLMEncoderLayerStableLayerNormrL   ri   c                    t         |           t        |j                  |j                  |j
                  |j                  |j                  |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        t!        |      | _        t        j                  |j                  |j                        | _        y r   r   r   s      r"   r   z)WavLMEncoderLayerStableLayerNorm.__init__T  r   r#   c                     |}| j                  |      }| j                  ||||      \  }}}| j                  |      }||z   }|| j                  | j	                  |            z   }||f}|r||fz  }|S )N)rz   r{   r|   )rY   r   r^   r   r   )r   r'   rz   r{   r|   r   r   r   s           r"   r(   z(WavLMEncoderLayerStableLayerNorm.forwardc  s    %659^^)'/	 6D 6
2|] ]3%5%(9(9$:O:OP]:^(__ -0&Gr#   r   )NNFr   r.   s   @r"   r   r   S  s    \{ \ \r#   r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )WavLMEncoderc           
         t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        ||dk(         c}      | _        d| _        y c c}w NrT   r   )ri   F)r   r   rL   r0   pos_conv_embedr;   rV   r=   rX   rY   r\   r   r^   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointingr   rL   ir!   s      r"   r   zWavLMEncoder.__init__y  s    :6B,,v'9'9v?T?TUzz&"7"78mmUZ[a[s[sUtuPQv16Ku
 ',# v   !Cc                    |rdnd }|rdnd }|5|j                  d      j                  dd|j                  d         }d|| <   | j                  |      }	||	z   }| j	                  |      }| j                  |      }t               xs t        |       }
d }t        | j                        D ]y  \  }}|r||fz   }t        j                  g       }| j                  xr  |dkD  xr || j                  j                  k  }|r|
r ||||||      }|d d \  }}|rd}|sq|d   fz   }{ |r||fz   }|st        d |||fD              S t!        |||	      S )
N rS   r   r   r   r   NNNc              3   &   K   | ]	  }||  y wra   r  .0vs     r"   	<genexpr>z'WavLMEncoder.forward.<locals>.<genexpr>       mq_`_lm   last_hidden_stater'   
attentions)r   r   r   r   rY   r^   r   r   	enumerater  rt   randr   rL   	layerdropr   r
   r   r'   rz   r|   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusr{   r  layerdropout_probabilityskip_the_layerlayer_outputss                    r"   r(   zWavLMEncoder.forward  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M001"11-@%(;;6]302R6LT6R!$++. 	PHAu#$58H$H! #(**R.!]]fq1uf:MPTP[P[PePe:eN![ %!#1"/&7! 0=Ra/@,} 2 &9]1=M<O&O#1	P4   1]4D Dm]4EGZ$[mmm++*
 	
r#   NFFTr)   r.   s   @r"   r   r   x  s    	, ";
r#   r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )WavLMEncoderStableLayerNormc           
         t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        ||dk(         c}      | _        d| _        y c c}w r   )r   r   rL   r0   r   r;   rV   r=   rX   rY   r\   r   r^   r   r   r  r   r  r  r  s      r"   r   z$WavLMEncoderStableLayerNorm.__init__  s    :6B,,v'9'9v?T?TUzz&"7"78mm v778 1UVZ[U[]
 ',#r  c                    |rdnd }|rdnd }|5|j                  d      j                  dd|j                  d         }d|| <   | j                  |      }	||	z   }| j	                  |      }t               xs t        |       }
d }t        | j                        D ]x  \  }}|r||fz   }t        j                  g       }| j                  xr  |dkD  xr || j                  j                  k  }|r|
r |||||      }|d d \  }}|rd}|sp|d   fz   }z | j                  |      }|r||fz   }|st        d |||fD              S t!        |||	      S )
Nr  rS   r   r   r   )rz   r|   r{   r	  c              3   &   K   | ]	  }||  y wra   r  r  s     r"   r  z6WavLMEncoderStableLayerNorm.forward.<locals>.<genexpr>  r  r  r  )r   r   r   r   r^   r   r   r  r  rt   r  r   rL   r  rY   r   r
   r  s                    r"   r(   z#WavLMEncoderStableLayerNorm.forward  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M001"11-@%(;;]302R6LT6R!$++. 	PHAu#$58H$H! #(**R.!]]fq1uf:MPTP[P[PePe:eN![ !&!#1&7"/	! 0=Ra/@,} 2 &9]1=M<O&O#/	P2 6 1]4D Dm]4EGZ$[mmm+;LYl
 	
r#   r#  r)   r.   s   @r"   r%  r%    s    ," "9
r#   r%  c                   8     e Zd ZdZ fdZed        Zd Z xZS )WavLMGumbelVectorQuantizerz
    Vector quantization using gumbel softmax. See [CATEGORICAL REPARAMETERIZATION WITH
    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
    c                 0   t         |           |j                  | _        |j                  | _        |j                  | j                  z  dk7  r&t        d|j                   d| j                   d      t        j                  t        j                  d| j                  | j
                  z  |j                  | j                  z              | _        t        j                  |j                  d   | j                  | j
                  z        | _        d| _        y )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenation.r   rS   r   )r   r   num_codevector_groups
num_groupsnum_codevectors_per_groupnum_varscodevector_dimrm   r;   rs   rt   r   codevectorsrZ   rW   weight_projtemperaturer_   s     r"   r   z#WavLMGumbelVectorQuantizer.__init__  s     6688  4??2a7)&*?*?)@ A66:oo5F G%%  <<a4==!@&BWBW[_[j[jBjk
 99V__R%8$//DMM:YZ r#   c                     | j                  d      }t        j                  t        j                  t        j                  ||      d             j                         }|S )Nr   r   rS   )meanrt   expr   xlogy)probsmarginal_probs
perplexitys      r"   _compute_perplexityz.WavLMGumbelVectorQuantizer._compute_perplexity(  sI    *YY		%++nn*U[] ^^_cce
r#   c                    |j                   \  }}}| j                  |      }|j                  ||z  | j                  z  d      }| j                  rt
        j                  j                  |j                         | j                  d      }|j                  |      }t        j                  |j                  ||z  | j                  d      j                         d      }| j                  |      }n}|j                  d      } |j                  |j                    j!                  d|j                  dd      d      }|j                  ||z  | j                  d      }| j                  |      }|j                  ||z  d      }|j#                  d      | j$                  z  }	|	j                  ||z  | j                  | j&                  d      }
|
j)                  d      j                  ||d      }
|
|fS )NrS   T)tauhardr   r   r   )r   r2  r   r-  r   r;   
functionalgumbel_softmaxr   r3  type_asrt   softmaxr;  argmax	new_zerosscatter_r   r1  r/  r   )r   r'   
batch_sizesequence_lengthr=   codevector_probscodevector_soft_distr:  codevector_idxcodevectors_per_groupr1  s              r"   r(   z"WavLMGumbelVectorQuantizer.forward.  s   3@3F3F0
O[ ((7%**:+G$//+Y[]^==!}};;M<O<O<QW[WgWgnr;s/77F $)=="":#?RTU[[]ce$  112FGJ +11b19N6}668K8KLUUN''A.   044Z/5QSWSbSbdfg112BCJ+00o1MrR 0 : :2 >AQAQ Q+00o1Mt`d`m`moqr!oob)..z?BOJ&&r#   )	r*   r+   r,   r   r   staticmethodr;  r(   r-   r.   s   @r"   r*  r*    s&    
*  
"'r#   r*  c                       e Zd ZU eed<   dZdZdZdZdZ	dZ
dZ ej                         d        Zdd	ej                  ez  d
edz  fdZ	 ddedej                  fdZy)WavLMPreTrainedModelrL   wavlminput_valuesaudioTFc           
         t        |t              rut        j                  |j                  j
                  dd       t        j                  |j                  j                         t        j                  |j                         yt        |t              rt        j                  |j                  j
                  ddt        j                  d|j                  j                  d   |j                  j                  z  z        z         t        j                   |j                  j                  d       yt        |t"              rt        j                  d|j$                  j&                  z        }t        j                  |j$                  j
                  | |       t        j                  |j$                  j                  | |       yt        |t(        j*                        rct        j                  |j
                  d| j,                  j.                         |j                   t        j                  |j                         yyt        |t(        j0                  t(        j2                  f      r?t        j                  |j                         t        j4                  |j
                         yt        |t(        j6                        rt        j8                  |j
                         |j                  `t        j                  |j:                  |j                  |j                  d   z  z        }t        j                  |j                  | |       yyy)zInitialize the weightsr   r   )r5  stdr   r   )abN)r   r*  initnormal_r2  r7   zeros_r   uniform_r1  r0   r?   r   sqrtr2   in_channels	constant_rQ   r[   in_featuresr;   rZ   rL   initializer_rangerV   	GroupNormones_r<   kaiming_normal_r4   )r   moduleks      r"   _init_weightsz"WavLMPreTrainedModel._init_weights^  s*    f89LL++22!DKK**//0MM&,,- <=LL""		!v{{'>'>q'AFKKD[D['["\]]
 NN6;;++Q/ 67		!f//;;;<AMM&++22qbA>MM&++00QB!<		*LLSdkk6S6ST{{&FKK( 'r|| <=KK$JJv}}%		*  /{{&IIfmmv/A/AFDVDVWXDY/YZ[fkkaR15 ' +r#   Ninput_lengthsadd_adapterc                 T   || j                   j                  n|}d }t        | j                   j                  | j                   j                        D ]  \  }} ||||      } |rBt        | j                   j                        D ]   } ||d| j                   j                        }" |S )zH
        Computes the output length of the convolutional layers
        c                 >    t        j                  | |z
  |d      dz   S )Nfloor)rounding_moder   )rt   divinput_lengthr2   strides      r"   _conv_out_lengthzOWavLMPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length  s"     99\K7wWZ[[[r#   r   )rL   rg  zipconv_kernelconv_strider   num_adapter_layersadapter_stride)r   rf  rg  rp  r2   ro  r   s          r"    _get_feat_extract_output_lengthsz5WavLMPreTrainedModel._get_feat_extract_output_lengths  s    
 2=1Ddkk--+	\
 $'t{{'>'>@W@W#X 	QK,]KPM	Q 4;;99: _ 04;;C]C] ^_ r#   feature_vector_lengthrz   c                     |j                  d      d d df   }| j                  ||      }|j                  t        j                        }|j
                  d   }t        j                  ||f|j                  |j                        }d|t        j                  |j
                  d   |j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )NrS   r   rg  r   )r   r   r   )r   )cumsumrv  r   rt   r   r   zerosr   r   r   flipr   )r   rw  rz   rg  non_padded_lengthsoutput_lengthsrG  s          r"   "_get_feature_vector_attention_maskz7WavLMPreTrainedModel._get_feature_vector_attention_mask  s    
 ,22r2:1b5A>>?Q_j>k'**5::6#))!,
./~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr#   ra   )r*   r+   r,   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnrt   no_gradre  r   r   r   rv  r  r  r#   r"   rO  rO  S  s    $O&*# NU]]_6 6Be>N>NQT>T cgjncn , Y]%(:?:J:Jr#   rO  c                   &     e Zd Zd fd	Zd Z xZS )WavLMNoLayerNormConvLayerc                 d   t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        y )Nr   r   r2   ro  r   )r   r   rW   in_conv_dimout_conv_dimr;   r<   rr  rs  	conv_biasr?   r   rJ   rK   r   rL   layer_idr!   s      r"   r   z"WavLMNoLayerNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@r#   c                 J    | j                  |      }| j                  |      }|S ra   )r?   rK   r&   s     r"   r(   z!WavLMNoLayerNormConvLayer.forward  s$    		-06r#   r   r)   r.   s   @r"   r  r    s    Ar#   r  c                   &     e Zd Zd fd	Zd Z xZS )WavLMLayerNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        j                  | j                  d      | _        t        |j                     | _        y )Nr   r   r  T)elementwise_affine)r   r   rW   r  r  r;   r<   rr  rs  r  r?   rV   rY   r   rJ   rK   r  s      r"   r   z WavLMLayerNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r#   c                     | j                  |      }|j                  dd      }| j                  |      }|j                  dd      }| j                  |      }|S )Nr?  rS   )r?   rO   rY   rK   r&   s     r"   r(   zWavLMLayerNormConvLayer.forward  sV    		-0%//B76%//B76r#   r  r)   r.   s   @r"   r  r    s    Ar#   r  c                   &     e Zd Zd fd	Zd Z xZS )WavLMGroupNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        t        j                  | j                  | j                  d      | _        y )Nr   r   r  T)r-  num_channelsaffine)r   r   rW   r  r  r;   r<   rr  rs  r  r?   r   rJ   rK   r`  rY   r  s      r"   r   z WavLMGroupNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr#   c                 l    | j                  |      }| j                  |      }| j                  |      }|S ra   )r?   rY   rK   r&   s     r"   r(   zWavLMGroupNormConvLayer.forward  s2    		-066r#   r  r)   r.   s   @r"   r  r    s    r r#   r  c                   .     e Zd ZdZ fdZd Zd Z xZS )WavLMFeatureEncoderz.Construct the features from raw audio waveformc           	         t         |           |j                  dk(  rDt        |d      gt	        |j
                  dz
        D cg c]  }t        ||dz          c}z   }nV|j                  dk(  r.t	        |j
                        D cg c]  }t        ||       }}nt        d|j                   d      t        j                  |      | _        d| _        d	| _        y c c}w c c}w )
Ngroupr   )r  r   r  z`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r   r   feat_extract_normr  r   num_feat_extract_layersr  r  rm   r;   r   conv_layersr  _requires_grad)r   rL   r  r  r!   s       r"   r   zWavLMFeatureEncoder.__init__  s    ##w.26AFGKPQWQoQorsQsKtKFG)&1q5AK K %%0PUV\VtVtPuv126AFvKv01I1I0JJst  ==5&+#"K ws   C"	C'c                 J    | j                         D ]	  }d|_         d| _        y )NF)
parametersrequires_gradr  r   params     r"   _freeze_parametersz&WavLMFeatureEncoder._freeze_parameters  s(    __& 	(E"'E	(#r#   c                     |d d d f   }| j                   r| j                  rd|_        | j                  D ]
  } ||      } |S )NT)r  r   r  r  )r   rQ  r'   
conv_layers       r"   r(   zWavLMFeatureEncoder.forward  sP    $QW- 4==*.M'** 	6J&}5M	6 r#   )r*   r+   r,   r   r   r  r(   r-   r.   s   @r"   r  r    s    8#"$

r#   r  c                   $     e Zd Z fdZd Z xZS )WavLMAdapterLayerc                     t         |           t        j                  |j                  d|j                  z  |j
                  |j                  d      | _        y )Nr   r   )ro  r3   )r   r   r;   r<   output_hidden_sizeadapter_kernel_sizeru  r?   r_   s     r"   r   zWavLMAdapterLayer.__init__  sJ    II%%)))&&((
	r#   c                 j    | j                  |      }t        j                  j                  |d      }|S )Nr   r   )r?   r;   r@  glur&   s     r"   r(   zWavLMAdapterLayer.forward#  s/    		-0))-Q)?r#   r)   r.   s   @r"   r  r    s    
r#   r  c                   $     e Zd Z fdZd Z xZS )WavLMAdapterc                    t         |           j                  j                  k7  rTt	        j
                  j                  j                        | _        t	        j                  j                        | _        nd x| _        | _        t	        j                  fdt        j                        D              | _        j                  | _        y )Nc              3   4   K   | ]  }t                y wra   )r  )r  r   rL   s     r"   r  z(WavLMAdapter.__init__.<locals>.<genexpr>5  s     #h!$5f$=#hs   )r   r   r  r=   r;   rZ   projrV   proj_layer_normr   r   rt  r  r  r_   s    `r"   r   zWavLMAdapter.__init__+  s     $$(:(::		&"4"4f6O6OPDI#%<<0I0I#JD /33DI,mm#huVMfMfGg#hh))r#   c                 h   | j                   .| j                  "| j                  |      }| j                  |      }|j                  dd      }| j                  D ]D  }t        j
                  j                         }| j                  r|| j                  kD  s= ||      }F |j                  dd      }|S rN   )r  r  rO   r  nprandomr   r  )r   r'   r  layerdrop_probs       r"   r(   zWavLMAdapter.forward8  s    99 T%9%9%E IIm4M 00?M%//15[[ 	5EYY--/N==^dnn%D %m 4	5
 &//15r#   r)   r.   s   @r"   r  r  *  s    *r#   r  r   	mask_probmask_lengthrz   	min_masksr}   c                    | \  }dk  rt        d      kD  rt        d d d      t        j                  j                  d      j	                         fd}|-|j                         j                  d      j                         nt        |      D cg c]  } c}}t        j                  |ft        	      }	g }
 |      }|d
k(  r|	S |D ]  } ||      }t        j                  j                  t        j                  |dz
  z
        |d      }t        |      d
k(  rdz
  }n|d
   }t        j                  |t        j                  ||z
  t        j                   	      |z  g      }|
j#                  |        t        j$                  |
      }
t        j&                  |
dddddf   ||f      }
|
j)                  ||z        }
t        j                        ddddf   }t        j&                  |||f      j)                  ||z        }|
|z   }
|
j+                         dz
  kD  rdz
  |
|
dz
  kD  <   t        j,                  |	|
dd       |	S c c}w )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t        | z  z  z         }t        |      }|z  kD  rz  }| dz
  z
  |k  rt        | dz
  z
  d      }|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)rn  num_masked_spanepsilonr  r  r  rH  s     r"   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_spano  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr#   NrS   r   r   F)replace)rm   r  r  r  itemdetachr   tolistr   r{  r   choicer   lenconcatenateru   int32appendarrayr   reshaper  put_along_axis)r   r  r  rz   r  rG  r  r   rf  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanrn  r  spec_aug_mask_idxdummy_mask_idxoffsetsr  rH  s    `` `            @@r"   _compute_mask_indicesr  I  s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89!o9  HHj/:$GM1/Ba% 51,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;o(MUWU]U] ^ao op
 	!!"34/52 "45 1a:&5H+(V ,33J@SVa@ab ii$T4]3Goog
4G'UV^^'+5G ,g5 /A"55GVYZGZ-!0CCD m%7B?w :s   $	I+c                   ,    e Zd Zdef fdZd Z	 	 ddej                  dej                  dz  dej                  dz  fdZ	e
	 	 	 	 	 dd	ej                  dz  dej                  dz  dej                  dz  d
edz  dedz  dedz  deez  fd       Z xZS )
WavLMModelrL   c                    t         |   |       || _        t        |      | _        t        |      | _        |j                  dkD  s|j                  dkD  rEt        j                  t        j                  |j                        j                               | _        |j                   rt#        |      | _        nt'        |      | _        |j(                  rt+        |      nd | _        | j/                          y )Nr   )r   r   rL   r  feature_extractorrQ   feature_projectionmask_time_probmask_feature_probr;   rs   rt   r   r=   rZ  masked_spec_embeddo_stable_layer_normr%  encoderr   rg  r  adapter	post_initr_   s     r"   r   zWavLMModel.__init__  s     !4V!<"8"@   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"&&6v>DL'/DL/5/A/A|F+t 	r#   c                 8    | j                   j                          yz
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)r  r  r   s    r"   freeze_feature_encoderz!WavLMModel.freeze_feature_encoder  s    
 	113r#   Nr'   mask_time_indicesrz   c                    t        | j                  dd      s|S |j                         \  }}}|)| j                  j	                  |j
                        ||<   n| j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  || j                  j                        }t        j                  ||j                  t        j                        }| j                  j	                  |j
                        ||<   | j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                   | j                  j"                        }t        j                  ||j                  t        j                        }|dddf   j%                  d|d      }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )r  r  rz   r  )r   r   )r  r  r  rS   )getattrrL   r   r  r   r   r  r   r  mask_time_lengthmask_time_min_masksrt   tensorr   r   r  mask_feature_lengthmask_feature_min_masksexpand)r   r'   r  rz   rG  rH  r=   mask_feature_indicess           r"   _mask_hidden_stateszWavLMModel._mask_hidden_states  s    t{{$8$?   4A3E3E3G0
O[(/3/E/E/H/HI\I\/]M+,[[''!+ 5_-++44 KK88-++99! !&->}G[G[chcmcm n/3/E/E/H/HI\I\/]M+,;;((1,#8[)++77 KK;;++<<	$  $)<<0D]MaMainisis#t #74#@#G#GO]_#` 23M./r#   rQ  r|   r  r  r}   c                 H   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }|j                  dd      }|!| j                  |j                  d   |d      }| j                  |      \  }	}| j                  |	||      }	| j                  |	||||      }
|
d   }	| j                  | j                  |	      }	|s
|	|f|
dd z   S t        |	||
j                  |
j                  	      S )
a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r   Fry  )r  rz   rz   r|   r  r  r   )r  extract_featuresr'   r  )rL   r|   r  use_return_dictr  rO   r  r   r  r  r  r  WavLMBaseModelOutputr'   r  )r   rQ  rz   r  r|   r  r  kwargsr  r'   encoder_outputss              r"   r(   zWavLMModel.forward  sb     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;%!DD &&q)>u E N +/*A*ABR*S''00->~ 1 
 ,,)/!5# ' 
 (*<<# LL7M!#34qr7JJJ#+-)77&11	
 	
r#   )NNNNNNN)r*   r+   r,   r   r   r  rt   r   r   r  r   r   r   r   r  r(   r-   r.   s   @r"   r  r    s    { (4 7;26	,((, !,,t3, ((4/	,\  /36:)-,0#'8
llT)8
 t+8
 !,,t3	8

  $;8
 #Tk8
 D[8
 
%	%8
 8
r#   r  r   zm
    WavLM Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                        e Zd Zddedz  f fdZd Zd Zd Ze	 	 	 	 	 dde	j                  dz  de	j                  dz  d	edz  d
edz  dedz  de	j                  dz  deez  fd       Z xZS )WavLMForCTCNtarget_langc                    t         |   |       t        |      | _        t	        j
                  |j                        | _        || _        |j                  t        d| j                   d      t        |d      r|j                  r|j                  n|j                  }t	        j                   ||j                        | _        | j%                          y)a/  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`WavLMForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `WavLMForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.rg  )r   r   r  rP  r;   r\   final_dropoutr^   r  
vocab_sizerm   r!   rA   rg  r  r=   rZ   lm_headr  )r   rL   r  r  r!   s       r"   r   zWavLMForCTC.__init__S  s     	 '
zz&"6"67&$00@ AH H  *1)GFL^L^F%%djdvdv 	 yy!3V5F5FG 	r#   c                 8   t               t        j                  d      k(  ry| j                  }|&t	        | j
                  dd      t        d| d      |-t	        | j
                  dd      t        j                  d       y|| j                  |d       yy)	a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        metaNadapter_attn_dimzCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)
r   rt   r   r  r  rL   rm   loggerinfoload_adapter)r   r  r  s      r"   tie_weightszWavLMForCTC.tie_weightsp  s     675<<;OO &&"wt{{<NPT'U']:;-Gtuvv WT[[:Ld%S%_KKCD$kd; %r#   c                 L    | j                   j                  j                          yr  rP  r  r  r  s    r"   r  z"WavLMForCTC.freeze_feature_encoder      
 	

$$779r#   c                 P    | j                   j                         D ]	  }d|_         yz
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNrP  r  r  r  s     r"   freeze_base_modelzWavLMForCTC.freeze_base_model  (    
 ZZ**, 	(E"'E	(r#   rQ  rz   r|   r  r  labelsr}   c           
         ||n| j                   j                  }|I|j                         | j                   j                  k\  r"t	        d| j                   j                         | j                  |||||      }|d   }	| j                  |	      }	| j                  |	      }
d}|b||n$t        j                  |t        j                        }| j                  |j                  d            j                  t        j                        }|dk\  }|j                  d      }|j                  |      }t        j                   j#                  |
dt        j$                        j'                  dd      }t        j(                  j*                  j-                  d	
      5  t        j                   j/                  ||||| j                   j0                  | j                   j2                  | j                   j4                        }ddd       |s|
f|t6        d z   }||f|z   S |S t9        ||
|j:                  |j<                        S # 1 sw Y   ExY w)a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r  r   r   rS   )r9   r   r   F)enabled)blank	reductionzero_infinitylosslogitsr'   r  )rL   r  r  r  rm   rP  r^   r  rt   	ones_liker   rv  r   r   masked_selectr;   r@  log_softmaxfloat32rO   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r'   r  )r   rQ  rz   r|   r  r  r  r  r   r'   r!  r   rf  labels_masktarget_lengthsflattened_targets	log_probsoutputs                     r"   r(   zWavLMForCTC.forward  s'   $ &1%<k$++B]B]&**,$++2H2H"HCDKKDZDZC[\]]**)/!5#  
  
]3m, #1"<%//R^fkfpfpBq  !AA.BTBTUWBXY\\]b]g]ghM !A+K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+: 	}}--%!"++22"kk<<"&++"?"? . 	 Y)F)G!HHF)-)9TGf$EvEfG4I4IV]VhVh
 	
	 	s   A#IIra   r   )r*   r+   r,   r   r   r  r  r  r   rt   r   r   r   r   r(   r-   r.   s   @r"   r  r  M  s    C$J :<0:(  /3)-,0#'&*E
llT)E
 t+E
  $;	E

 #TkE
 D[E
 t#E
 
	E
 E
r#   r  z
    WavLM Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                        e Zd Z fdZd Zd Ze	 	 	 	 	 ddej                  dz  dej                  dz  de	dz  de	dz  d	e	dz  d
ej                  dz  de
ez  fd       Z xZS )WavLMForSequenceClassificationc                    t         |   |       t        |d      r|j                  rt	        d      t        |      | _        |j                  dz   }|j                  r0t        j                  t        j                  |      |z        | _        t        j                  |j                  |j                         | _        t        j                  |j                   |j$                        | _        | j)                          y )Nrg  z\Sequence classification does not support the use of WavLM adapters (config.add_adapter=True)r   )r   r   rA   rg  rm   r  rP  r  use_weighted_layer_sumr;   rs   rt   ru   layer_weightsrZ   r=   classifier_proj_size	projector
num_labels
classifierr  r   rL   
num_layersr!   s      r"   r   z'WavLMForSequenceClassification.__init__  s     6=)f.@.@n   '
--1
((!#ejj.Dz.Q!RD6#5#5v7R7RS))F$?$?ARARS 	r#   c                 L    | j                   j                  j                          yr  r  r  s    r"   r  z5WavLMForSequenceClassification.freeze_feature_encoder  r  r#   c                 P    | j                   j                         D ]	  }d|_         yr  r  r  s     r"   r  z0WavLMForSequenceClassification.freeze_base_model  r  r#   NrQ  rz   r|   r  r  r  r}   c                 <   ||n| j                   j                  }| j                   j                  rdn|}| j                  |||||      }| j                   j                  rr|t           }	t        j                  |	d      }	t        j                  j                  | j                  d      }
|	|
j                  ddd      z  j                  d      }	n|d   }	| j                  |	      }	||	j                  d      }n| j                  |	j                   d   |      }|j#                  d      j%                  dd|	j                   d         }d	|	| <   |	j                  d      |j                  d      j                  dd      z  }| j'                  |      }d}|Ft)               } ||j                  d| j                   j*                        |j                  d            }|s|f|t        d z   }||f|z   S |S t-        |||j.                  |j0                  
      S )	  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`WavLMProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   r   rS   r   r   r   r  )rL   r  r6  rP  r-  rt   stackr;   r@  rC  r7  r   r   r9  r5  r  r   r   r   r;  r   r:  r   r'   r  )r   rQ  rz   r|   r  r  r  r  r   r'   norm_weightspooled_outputpadding_maskexpand_padding_maskr!  r   loss_fctr2  s                     r"   r(   z&WavLMForSequenceClassification.forward  s   0 &1%<k$++B]B]'+{{'I'ItOc**)/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*\->->r1a-HHMMRSMTM#AJM}5!)..1.5MBB=CVCVWXCY[ijL"."8"8"<"C"CAq-J]J]^_J`"a25M../)--!-4|7G7GA7G7N7S7STVXY7ZZM/')HFKKDKK,B,BCV[[QS_UDY)F)G!HHF)-)9TGf$EvE'!//))	
 	
r#   r   )r*   r+   r,   r   r  r  r   rt   r   r   r   r   r(   r-   r.   s   @r"   r4  r4    s    ":(  /3)-,0#'&*C
llT)C
 t+C
  $;	C

 #TkC
 D[C
 t#C
 
)	)C
 C
r#   r4  c                        e Zd Z fdZd Zd Ze	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de	dz  d	e	dz  d
e	dz  de
ez  fd       Z xZS ) WavLMForAudioFrameClassificationc                    t         |   |       t        |d      r|j                  rt	        d      t        |      | _        |j                  dz   }|j                  r0t        j                  t        j                  |      |z        | _        t        j                  |j                  |j                         | _        |j                   | _        | j%                          y )Nrg  z_Audio frame classification does not support the use of WavLM adapters (config.add_adapter=True)r   )r   r   rA   rg  rm   r  rP  r  r6  r;   rs   rt   ru   r7  rZ   r=   r:  r;  r  r<  s      r"   r   z)WavLMForAudioFrameClassification.__init__P  s     6=)f.@.@q   '
--1
((!#ejj.Dz.Q!RD))F$6$68I8IJ ++r#   c                 L    | j                   j                  j                          yr  r  r  s    r"   r  z7WavLMForAudioFrameClassification.freeze_feature_encoder`  r  r#   c                 P    | j                   j                         D ]	  }d|_         yr  r  r  s     r"   r  z2WavLMForAudioFrameClassification.freeze_base_modelg  r  r#   NrQ  rz   r  r|   r  r  r}   c           	         ||n| j                   j                  }| j                   j                  rdn|}| j                  |||||      }| j                   j                  rr|t           }	t        j                  |	d      }	t        j                  j                  | j                  d      }
|	|
j                  ddd      z  j                  d      }	n|d   }	| j                  |	      }d}|\t               } ||j                  d| j                        t        j                   |j                  d| j                        d            }|s|f|t        d z   }|S t#        |||j$                  |j&                  	      S )
rA  NTr  r   r   rS   r   )axisr  )rL   r  r6  rP  r-  rt   rB  r;   r@  rC  r7  r   r   r;  r   r:  rD  r   r'   r  )r   rQ  rz   r  r|   r  r  r  r   r'   rC  r!  r   rG  r2  s                  r"   r(   z(WavLMForAudioFrameClassification.forwardo  sh   0 &1%<k$++B]B]'+{{'I'ItOc**)/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*\->->r1a-HHMMRSMTM#AJM/')HFKKDOO<ell6;;WY[_[j[jKkrs>tuDY)F)G!HHFM$!//))	
 	
r#   r   )r*   r+   r,   r   r  r  r   rt   r   r   r   r   r(   r-   r.   s   @r"   rI  rI  N  s     :(  /3&*)-,0#':
llT):
 t+:
 t#	:

  $;:
 #Tk:
 D[:
 
&	&:
 :
r#   rI  c                   &     e Zd Zd fd	Zd Z xZS )AMSoftmaxLossc                     t         |           || _        || _        || _        t        j                  t        j                  ||      d      | _	        t        j                         | _        y )NT)r  )r   r   scalemarginr:  r;   rs   rt   randnr7   r   r   )r   	input_dimr:  rR  rS  r!   s        r"   r   zAMSoftmaxLoss.__init__  sQ    
$ll5;;y*#EUYZ'')	r#   c                    |j                         }t        j                  j                  | j                  d      }t        j                  j                  |d      }t        j                  ||      }|| j                  z
  }t        j                  j                  || j                        }| j                  t        j                  |j                         ||      z  }| j                  ||      }|S )Nr   r   r   )flattenr;   r@  	normalizer7   rt   mmrS  one_hotr:  rR  r   r   r   )	r   r'   r  r7   	cos_thetapsionehotr!  r   s	            r"   r(   zAMSoftmaxLoss.forward  s    !((!(<//1/EHH]F3	$++%&&vt?ekk&++-iHHyy(r#   )g      >@g?r)   r.   s   @r"   rP  rP    s    *r#   rP  c                   X     e Zd Zd fd	Zdej
                  dej
                  fdZ xZS )	TDNNLayerc                    t         |           |dkD  r|j                  |dz
     n|j                  |   | _        |j                  |   | _        |j
                  |   | _        |j                  |   | _        t        j                  | j                  | j                  z  | j                        | _        t        j                         | _        y )Nr   r   )r   r   tdnn_dimr  r  tdnn_kernelr2   tdnn_dilationdilationr;   rZ   kernelReLUrK   r  s      r"   r   zTDNNLayer.__init__  s    <DqL6??8a<8foo^fNg"OOH5!--h7,,X6ii 0 043C3C CTEVEVW'')r#   r'   r}   c                 &   t               rddlm} t               r+t        | j                        rt        j                  d       |j                  dd      }| j                  j                  j                  | j                  | j                  | j                        j                  dd      }t        j                  j                  ||| j                  j                   | j"                        }|j                  dd      }| j%                  |      }|S )Nr   )	LoraLayerzDetected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. You should exclude TDNNLayer from LoRA's target modules.r   r   )rd  )r   peft.tuners.lorarh  r   re  warningswarnrO   r7   r   r  r2   r  r;   r@  conv1dr   rd  rK   )r   r'   rh  r7   s       r"   r(   zTDNNLayer.forward  s    2$++y1O &//15##(():):D<L<LdN^N^_iijkmno,,]FDKKDTDT_c_l_l,m%//156r#   r  )r*   r+   r,   r   rt   r   r(   r-   r.   s   @r"   r_  r_    s#    $U\\ ell r#   r_  zi
    WavLM Model with an XVector feature extraction head on top for tasks like Speaker Verification.
    c                        e Zd Z fdZd Zd Zdej                  ez  fdZ	e
	 	 	 	 	 ddej                  dz  dej                  dz  d	edz  d
edz  dedz  dej                  dz  deez  fd       Z xZS )WavLMForXVectorc                    t         |   |       t        |      | _        |j                  dz   }|j
                  r0t        j                  t        j                  |      |z        | _
        t        j                  |j                  |j                  d         | _        t        t!        |j                              D cg c]  }t#        ||       }}t        j$                  |      | _        t        j                  |j                  d   dz  |j(                        | _        t        j                  |j(                  |j(                        | _        t/        |j(                  |j0                        | _        | j5                          y c c}w )Nr   r   rS   r   )r   r   r  rP  r  r6  r;   rs   rt   ru   r7  rZ   r=   ra  r9  r   r  r_  r   tdnnxvector_output_dimr  r;  rP  r:  	objectiver  )r   rL   r=  r  tdnn_layersr!   s        r"   r   zWavLMForXVector.__init__  s    '
--1
((!#ejj.Dz.Q!RD6#5#5vq7IJ5:3v;O5PQy+QQMM+.	!#6??2+>+BFD]D]!^))F$=$=v?X?XY&v'@'@&BSBST Rs   >Fc                 L    | j                   j                  j                          yr  r  r  s    r"   r  z&WavLMForXVector.freeze_feature_encoder  r  r#   c                 P    | j                   j                         D ]	  }d|_         yr  r  r  s     r"   r  z!WavLMForXVector.freeze_base_model  r  r#   rf  c                 V    d }| j                   j                  D ]  } |||d      } |S )z?
        Computes the output length of the TDNN layers
        c                     | |z
  |z  dz   S )Nr   r  rm  s      r"   rp  zBWavLMForXVector._get_tdnn_output_lengths.<locals>._conv_out_length  s     !;.69A==r#   r   )rL   rb  )r   rf  rp  r2   s       r"   _get_tdnn_output_lengthsz(WavLMForXVector._get_tdnn_output_lengths  s:    
	>
  ;;22 	LK,]KKM	L r#   NrQ  rz   r|   r  r  r  r}   c                    ||n| j                   j                  }| j                   j                  rdn|}| j                  |||||      }| j                   j                  rr|t           }	t        j                  |	d      }	t        j                  j                  | j                  d      }
|	|
j                  ddd      z  j                  d      }	n|d   }	| j                  |	      }	| j                  D ]
  } ||	      }	 |%|	j                  d      }|	j!                  d      }n| j#                  |j                  d            }| j%                  |      }g }g }t'        |      D ]U  \  }}|j)                  |	|d|f   j                  d             |j)                  |	|d|f   j!                  d             W t        j                  |      }t        j                  |      }t        j*                  ||gd      }| j-                  |      }| j/                  |      }d}|| j1                  ||      }|s||f|t        d z   }||f|z   S |S t3        ||||j4                  |j6                        S )	rA  NTr  r   r   rS   r   )r   r!  
embeddingsr'   r  )rL   r  r6  rP  r-  rt   rB  r;   r@  rC  r7  r   r   r9  rp  r5  rT  rv  rx  r  r  r   r  r;  rr  r   r'   r  )r   rQ  rz   r|   r  r  r  r  r   r'   rC  
tdnn_layermean_featuresstd_featuresfeat_extract_output_lengthstdnn_output_lengthsr  lengthstatistic_poolingoutput_embeddingsr!  r   r2  s                          r"   r(   zWavLMForXVector.forward  s   0 &1%<k$++B]B]'+{{'I'ItOc**)/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*\->->r1a-HHMMRSMTM#AJM}5)) 	6J&}5M	6 !)..1.5M(,,,3L*.*O*OP^PbPbghPbPi*j'"&"?"?@["\ML&':; J	6$$]1gvg:%>%C%C%C%JK##M!WfW*$=$A$Aa$A$HIJ "KK6M ;;|4L!II}l&CL 223DE!23>>&&1D/07;X;Y3ZZF)-)9TGf$EvE(!//))
 	
r#   r   )r*   r+   r,   r   r  r  rt   r   r   rx  r   r   r   r   r   r(   r-   r.   s   @r"   rn  rn    s    &:(e6F6F6L   /3)-,0#'&*P
llT)P
 t+P
  $;	P

 #TkP
 D[P
 t#P
 
	P
 P
r#   rn  )rI  r  r4  rn  r  rO  r%   )Kr   rj  numpyr  rt   torch.nnr;   torch.nn.functionalr@  r   r    r   rW  activationsr   integrations.deepspeedr   integrations.fsdpr   modeling_layersr	   modeling_outputsr
   r   r   r   r   r   modeling_utilsr   r   r@   r   r   r   configuration_wavlmr   
get_loggerr*   r  Moduler   r0   rQ   rd   r   r   r   r   r%  r*  rO  r  r  r  r  r  r  r   r   r   r   ndarrayr  r  r  r-  r  r4  rI  rP  r_  rn  __all__r  r#   r"   <module>r     s          % & ! @ 7 9  Z ? ? , 
		H	%		 *299 *Z1RYY 1c RYY c Lryy 0&2 &R"'A "JG
299 G
TH
")) H
VC' C'L S? S Sl : *8 68 0#")) #L		 $299 F /3tc?tt t $$t+	t
 t ZZtn /  C
% C
 C
L !"  
K
& K

K
\ e
%9 e
e
P [
'; [
 [
|BII .		 @ 
C
* C

C
Lr#   