
    qiG                     j   d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZm Z  ddl!m"Z" dZ# G d de      Z$ G d de      Z% G d de      Z& G d dejN                        Z( G d de      Z) G d dejN                        Z* G d de      Z+ G d d e      Z, G d! d"e      Z- G d# d$e      Z. G d% d&ejN                        Z/e G d' d(e             Z0e G d) d*e0             Z1 G d+ d,e      Z2 G d- d.e      Z3g d/Z4y)0zPyTorch SEW model.    N)nn   )initialization)ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)BaseModelOutput)PreTrainedModel)auto_docstring)is_flash_attention_requested   )Wav2Vec2AttentionWav2Vec2EncoderLayerWav2Vec2FeatureEncoderWav2Vec2FeedForwardWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2GroupNormConvLayerWav2Vec2LayerNormConvLayerWav2Vec2NoLayerNormConvLayerWav2Vec2SamePadLayer_compute_mask_indices   )	SEWConfigc                       e Zd Zy)SEWNoLayerNormConvLayerN__name__
__module____qualname__     U/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/sew/modular_sew.pyr   r   0       r"   r   c                       e Zd Zy)SEWLayerNormConvLayerNr   r!   r"   r#   r&   r&   4   r$   r"   r&   c                       e Zd Zy)SEWGroupNormConvLayerNr   r!   r"   r#   r(   r(   8   r$   r"   r(   c                   $     e Zd Z fdZd Z xZS )SEWPositionalConvEmbeddingc                    t         |           t        j                  |j                  |j                  |j
                  |j
                  dz  |j                  |j                        | _        t        j                  j                  }t        t        j                  j                  d      r$t        j                  j                  j                  }t               r(dd l}|j                  j!                  | j                  j"                  d      5   || j                  dd      | _        d d d        t        | j                  d      rU| j                  j                  j"                  j$                  }| j                  j                  j"                  j&                  }n,| j                  j(                  }| j                  j*                  }|j                  j-                  | |       |j                  j-                  | |       n || j                  dd      | _        t/        |j
                        | _        t2        |j4                     | _        y # 1 sw Y   'xY w)	Nr   )kernel_sizepaddinggroupsstrideweight_normr   modifier_rankweight)namedimparametrizations)super__init__r   Conv1dhidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupssqueeze_factorconvutilsr0   hasattrr6   r   	deepspeedzeroGatheredParametersr3   	original0	original1weight_gweight_vregister_external_parameterSEWSamePadLayerr-   r   feat_extract_activation
activation)selfconfigr0   rA   rF   rG   	__class__s         r#   r8   z#SEWPositionalConvEmbedding.__init__=   s   II6622a777((
	 hh**288,,m<((33??K%'224993C3CST2U I'		aH	Ityy"459955<<FF9955<<FF99--99--NN66tXFNN66tXF#DIIH!DDI&v'E'EF !?!?@I Is   IIc                 l    | j                  |      }| j                  |      }| j                  |      }|S N)r>   r-   rK   )rL   hidden_statess     r#   forwardz"SEWPositionalConvEmbedding.forward_   s2    		-0]36r"   r   r   r    r8   rR   __classcell__rN   s   @r#   r*   r*   <   s     ADr"   r*   c                       e Zd Zy)rI   Nr   r!   r"   r#   rI   rI   g   r$   r"   rI   c                   $     e Zd Z fdZd Z xZS )SEWUpsamplingc                     t         |           t        j                  |j                  |j                  |j
                  z        | _        t        |j                     | _	        |j
                  | _        y rP   )
r7   r8   r   Linearr:   r=   
projectionr   rJ   rK   rL   rM   rN   s     r#   r8   zSEWUpsampling.__init__l   sW    ))F$6$68J8JVMbMb8bc !?!?@$33r"   c                 .   | j                  |      }| j                  |      }| j                  dkD  rc|j                         \  }}}|| j                  z  }|| j                  z  }|j	                  ||| j                  |      }|j	                  |||      }|S )Nr   )r[   rK   r=   sizereshape)rL   rQ   bszsrc_lensrc_embed_dimtgt_lentgt_embed_dims          r#   rR   zSEWUpsampling.forwardr   s    66"*7*<*<*>'C- 3 33G)T-@-@@M)11#w@S@SUbcM)11#wNMr"   rS   rU   s   @r#   rX   rX   k   s    4r"   rX   c                       e Zd Zy)SEWFeatureEncoderNr   r!   r"   r#   rf   rf      r$   r"   rf   c                       e Zd Zy)SEWAttentionNr   r!   r"   r#   rh   rh      r$   r"   rh   c                       e Zd Zy)SEWFeedForwardNr   r!   r"   r#   rj   rj      r$   r"   rj   c                       e Zd Zy)SEWEncoderLayerNr   r!   r"   r#   rl   rl      r$   r"   rl   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )
SEWEncoderc                    t         |           || _        t        |      | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                   t#        |j$                        D cg c]  }t'        |       c}      | _        t+        |      | _        d| _        y c c}w )NepsF)r7   r8   rM   r*   pos_conv_embedr   	AvgPool1dr=   pool	LayerNormr:   layer_norm_eps
layer_normDropouthidden_dropoutdropout
ModuleListrangenum_hidden_layersrl   layersrX   upsamplegradient_checkpointing)rL   rM   _rN   s      r#   r8   zSEWEncoder.__init__   s    8@LL!6!68M8MN	,,v'9'9v?T?TUzz&"7"78mmeFLdLdFe$f_V%<$fg%f-&+# %gs   Dc           	      *   |rdnd }|rdnd }||j                  d      j                  dd|j                  d         }t        | j                        rd|| <   |d|v r|nd }ngd|| <   |j                         j                  d      }	|	| j                  j                  z  }
|j                  d   | j                  j                  z  }t        j                  d||
j                        j                  dd      j                  |
j                  d   d      }||
j                  dd      k  j                         }d|d d d d d d f   j                  |j                  	      z
  }|t        j                  |j                        j                   z  }|j                  |j                  d   d|j                  d   |j                  d         }|j                  d   }|j#                  dd      }| j%                  |      }| j'                  |      }t!        |j)                  d      |j)                  d            }|d
d |f   |d
d |f   z   }|j#                  dd      }| j+                  |      }| j-                  |      }t/               xs t1        |       }| j2                  D ]j  }|r||fz   }t        j4                  g       }| j6                  xr || j                  j8                  k  }|r|r ||||      }|d   }|rd}|sb|d   fz   }l |r||fz   }| j;                  |      }|j                  d   |k  r4t<        j>                  jA                  |ddd||j                  d   z
  f      }|stC        d |||fD              S tE        |||      S )Nr!   r   r           r   deviceg      ?)dtype.)attention_maskoutput_attentionsNNc              3   &   K   | ]	  }||  y wrP   r!   ).0vs     r#   	<genexpr>z%SEWEncoder.forward.<locals>.<genexpr>   s     mq_`_lms   last_hidden_staterQ   
attentions)#	unsqueezerepeatshaper   rM   longsumr=   torcharanger   viewexpandtor   finfomin	transposerr   rt   r^   rw   rz   r   r   r~   randtraining	layerdropr   r   
functionalpadtupler	   )rL   rQ   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskinput_lengthsoutput_lengthsmax_encoder_lengthattention_idsn_input_timestepsposition_embeddingspooled_hidden_states
min_lengthsynced_gpuslayerdropout_probabilityskip_the_layerlayer_outputss                         r#   rR   zSEWEncoder.forward   s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!+DKK88;4454B4NSTXfSfmq 9<445!/!4!4!6 ; ;B ?!.$++2L2L!L%2%8%8%;t{{?Y?Y%Y"LL$6~?T?TUT!R[VN003R8 
 #0.2E2Eb!2L"L!R!R!T "%~atQ6F'G'J'JQ^QdQd'J'e!e!/%++m>Q>Q2R2V2V!V!/!6!6"((+Q0D0DR0H.J^J^_aJb" *//2%//15"11-@#yy7,11"57K7P7PQS7TU
,S+:+-=>ATUXZe[eZeUeAff%//156]302R6LT6R[[ 	PE#$58H$H! #(**R.!]]Z/BT[[EZEZ/ZN![ %!.Te! !.a 0 , &9]1=M<O&O#'	P*   1]4D Dm4q!$55MM--maAGX[h[n[nop[qGq=rsMm]4EGZ$[mmm++*
 	
r"   )NFFTrS   rU   s   @r#   rn   rn      s    	, "W
r"   rn   c                       e Zd ZU eed<   dZdZdZdZdZ	dZ
dZ ej                         d        Zdej                  ez  fd	Zd
edej                  fdZy)SEWPreTrainedModelrM   sewinput_valuesaudioTFc           
         t        |t              rt        j                  |j                  j
                  ddt        j                  d|j                  j                  d   |j                  j                  z  z        z         t        j                  |j                  j                  d       nt        |t        j                        r8t        j                  |j
                  d| j                  j                         nut        |t        j                   t        j"                  f      r@t        j$                  |j                         t        j&                  |j
                         nt        |t        j(                        rt+               rddl}t/        |d      rht/        |d      r\|j0                  j3                  |j4                  |j6                  gd	      5  t        j8                  |j
                         ddd       no|j0                  j3                  |j
                  d	      5  t        j8                  |j
                         ddd       nt        j8                  |j
                         t        |t        j                  t        j(                  f      r-|j                   t        j$                  |j                         yyy# 1 sw Y   axY w# 1 sw Y   mxY w)
zInitialize the weightsr   r   r   )meanstdr   NrG   rF   r1   )
isinstancer*   initnormal_r>   r3   mathsqrtr,   in_channels	constant_biasr   rZ   rM   initializer_rangeru   	GroupNormzeros_ones_r9   r   rA   r@   rB   rC   rG   rF   kaiming_normal_)rL   modulerA   s      r#   _init_weightsz SEWPreTrainedModel._init_weights  s    f89LL""		!v{{'>'>q'AFKKD[D['["\]]
 NN6;;++Q/		*LLSdkk6S6STr|| <=KK$JJv}}%		*)+ 6:.76:3N"::FOOV__;]mn:o <,,V]];< < #::6==XY:Z <,,V]];< < $$V]]3fryy"))45&++:QKK$ ;R5< << <s    K/ KKK$r   c                     d }t        | j                  j                  | j                  j                        D ]  \  }} ||||      } |S )zH
        Computes the output length of the convolutional layers
        c                 >    t        j                  | |z
  |d      dz   S )Nfloor)rounding_moder   )r   div)input_lengthr,   r/   s      r#   _conv_out_lengthzMSEWPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length&  s"     99\K7wWZ[[[r"   )ziprM   conv_kernelconv_stride)rL   r   r   r,   r/   s        r#    _get_feat_extract_output_lengthsz3SEWPreTrainedModel._get_feat_extract_output_lengths!  sQ    
	\
 $'t{{'>'>@W@W#X 	QK,]KPM	Q r"   feature_vector_lengthr   c                    | j                  |j                  d            j                  t        j                        }|j
                  d   }t        j                  ||f|j                  |j                        }d|t        j                  |j
                  d   |j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )Nr   r   )r   r   r   r   )r   r   r   r   r   r   zerosr   r   r   flipcumsumbool)rL   r   r   r   
batch_sizes        r#   "_get_feature_vector_attention_maskz5SEWPreTrainedModel._get_feature_vector_attention_mask0  s    >>~?Q?QRT?UVYYZ_ZdZde#))!,
./~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr"   N)r   r   r    r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr   no_gradr   
LongTensorintr   r   r!   r"   r#   r   r      s|    $O&*#NU]]_% %<e>N>NQT>T 
 
]b]m]m 
r"   r   c                   &    e Zd Zdef fdZ	 	 ddej                  dej                  dz  dej                  dz  fdZe		 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  deez  fd       Z xZS )SEWModelrM   c                    t         |   |       || _        t        |      | _        t        j                  |j                  d   |j                        | _	        |j                  d   |j                  k7  | _        | j                  r2t        j                  |j                  d   |j                        | _        t        j                  |j                        | _        |j"                  dkD  s|j$                  dkD  rEt        j&                  t)        j*                  |j                        j-                               | _        t1        |      | _        | j5                          y )Nr   rp   r   )r7   r8   rM   rf   feature_extractorr   ru   conv_dimrv   rw   r:   project_featuresrZ   feature_projectionrx   feat_proj_dropoutfeature_dropoutmask_time_probmask_feature_prob	Parameterr   Tensoruniform_masked_spec_embedrn   encoder	post_initr\   s     r#   r8   zSEWModel.__init__?  s     !26!:,,vr':@U@UV & 3v7I7I I  &(ii0CVEWEW&XD#!zz&*B*BC  3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"!&) 	r"   NrQ   mask_time_indicesr   c                    t        | j                  dd      s|S |j                         \  }}}|)| j                  j	                  |j
                        ||<   n| j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  || j                  j                        }t        j                  ||j                  t        j                        }| j                  j	                  |j
                        ||<   | j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                   | j                  j"                        }t        j                  ||j                  t        j                        }|dddf   j%                  d|d      }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )	mask_probmask_lengthr   	min_masks)r   r   )r   r  r  r   )getattrrM   r^   r   r   r   r   r   r   mask_time_lengthmask_time_min_masksr   tensorr   r   r   mask_feature_lengthmask_feature_min_masksr   )rL   rQ   r   r   r   sequence_lengthr:   mask_feature_indicess           r#   _mask_hidden_stateszSEWModel._mask_hidden_statesS  s    t{{$8$?   4A3E3E3G0
O[(/3/E/E/H/HI\I\/]M+,[[''!+ 5_-++44 KK88-++99! !&->}G[G[chcmcm n/3/E/E/H/HI\I\/]M+,;;((1,#8[)++77 KK;;++<<	$  $)<<0D]MaMainisis#t #74#@#G#GO]_#` 23M./r"   r   r   r   r   returnc                 Z   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }|j                  dd      }| j                  |      }| j                  r| j                  |      }| j                  |      }	|| j                  |	j                  d   |      }| j                  |	|      }	| j                  |	||||      }
|
d   }	|s	|	f|
dd z   S t        |	|
j                  |
j                         S )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r   )r   )r   r   r   r   r   r   )rM   r   r   use_return_dictr   r   rw   r   r   r   r   r   r  r   r	   rQ   r   )rL   r   r   r   r   r   r   kwargsextract_featuresrQ   encoder_outputss              r#   rR   zSEWModel.forward  sU     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;??+;<  #667GH,,-=>%!DD]EXEXYZE[]klN00Rc0d,,)/!5# ' 
 (*!#oab&999+)77&11
 	
r"   r   )NNNNN)r   r   r    r   r8   r   FloatTensorr   r  r   r   r   r   r	   rR   rT   rU   s   @r#   r   r   =  s    y . 7;26	,((, !,,t3, ((4/	,\  /36:)-,0#'4
llT)4
 t+4
 !,,t3	4

  $;4
 #Tk4
 D[4
 
	 4
 4
r"   r   c                       e Zd Zy)	SEWForCTCNr   r!   r"   r#   r  r    r$   r"   r  c                       e Zd Zy)SEWForSequenceClassificationNr   r!   r"   r#   r  r    r$   r"   r  )r  r  r   r   )5__doc__r   r   r    r   r   activationsr   integrations.deepspeedr   integrations.fsdpr   modeling_outputsr	   modeling_utilsr
   r?   r   utils.genericr   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   r   r   configuration_sewr   _HIDDEN_STATES_START_POSITIONr   r&   r(   Moduler*   rI   rX   rf   rh   rj   rl   rn   r   r   r  r  __all__r!   r"   r#   <module>r$     sH       & ! @ 7 / - # 9    ) !" 	: 		6 		6 	( (V	* 	BII ,	. 		$ 		( 		* 	c
 c
L B B BJ x
! x
 x
v	 		#D 	 Zr"   