
    qi                     4   d dl Z d dlmZ d dlZd dlZd dlmZ d dlmZ ddl	m
Z ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZmZ ddlmZ ddl m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'  e#jP                  e)      Z* G d de      Z+ G d de      Z, G d de      Z- G d dej\                        Z/ G d dej\                        Z0 G d dej\                        Z1 G d d ej\                        Z2	 	 dDd!ej\                  d"ejf                  d#ejf                  d$ejf                  d%ejf                  dz  d&e4dz  d'e4d(ee!   fd)Z5 G d* d+ej\                        Z6 G d, d-ej\                        Z7 G d. d/e      Z8 G d0 d1ej\                        Z9e" G d2 d3e             Z:	 	 dEd4e;e<e<f   d5e4d6e<d%ejz                  dz  d7e<d8ej|                  fd9Z?e" G d: d;e:             Z@dZA e"d<=       G d> d?e:             ZB e"d@=       G dA dBe:             ZCg dCZDy)F    N)Callable)nn)CrossEntropyLoss   )initialization)ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel*get_torch_context_manager_or_global_device)Unpack)TransformersKwargsauto_docstringlogging)is_flash_attention_requested   )	SEWConfigc                   &     e Zd Zd fd	Zd Z xZS )SEWNoLayerNormConvLayerc                 d   t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        y )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr   feat_extract_activation
activationselfconfiglayer_id	__class__s      V/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/sew/modeling_sew.pyr"   z SEWNoLayerNormConvLayer.__init__/   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@    c                 J    | j                  |      }| j                  |      }|S N)r*   r,   r.   hidden_statess     r2   forwardzSEWNoLayerNormConvLayer.forward=   s$    		-06r3   r   __name__
__module____qualname__r"   r8   __classcell__r1   s   @r2   r   r   .   s    Ar3   r   c                   &     e Zd Zd fd	Zd Z xZS )SEWLayerNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        j                  | j                  d      | _        t        |j                     | _        y )Nr   r   r   T)elementwise_affine)r!   r"   r#   r$   r%   r   r&   r'   r(   r)   r*   	LayerNorm
layer_normr   r+   r,   r-   s      r2   r"   zSEWLayerNormConvLayer.__init__D   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r3   c                     | j                  |      }|j                  dd      }| j                  |      }|j                  dd      }| j                  |      }|S )N)r*   	transposerE   r,   r6   s     r2   r8   zSEWLayerNormConvLayer.forwardS   sV    		-0%//B76%//B76r3   r9   r:   r?   s   @r2   rA   rA   C   s    Ar3   rA   c                   &     e Zd Zd fd	Zd Z xZS )SEWGroupNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        t        j                  | j                  | j                  d      | _        y )Nr   r   r   T)
num_groupsnum_channelsaffine)r!   r"   r#   r$   r%   r   r&   r'   r(   r)   r*   r   r+   r,   	GroupNormrE   r-   s      r2   r"   zSEWGroupNormConvLayer.__init___   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr3   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r5   )r*   rE   r,   r6   s     r2   r8   zSEWGroupNormConvLayer.forwardo   s2    		-066r3   r9   r:   r?   s   @r2   rK   rK   ^   s    r r3   rK   c                   $     e Zd Z fdZd Z xZS )SEWPositionalConvEmbeddingc                    t         |           t        j                  |j                  |j                  |j
                  |j
                  dz  |j                  |j                        | _        t        j                  j                  }t        t        j                  j                  d      r$t        j                  j                  j                  }t               r(dd l}|j                  j!                  | j                  j"                  d      5   || j                  dd      | _        d d d        t        | j                  d      rU| j                  j                  j"                  j$                  }| j                  j                  j"                  j&                  }n,| j                  j(                  }| j                  j*                  }|j                  j-                  | |       |j                  j-                  | |       n || j                  dd      | _        t/        |j
                        | _        t2        |j4                     | _        y # 1 sw Y   'xY w)	N   )r   paddinggroupsr   weight_normr   modifier_rankweight)namedimparametrizations)r!   r"   r   r&   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupssqueeze_factorr*   utilsrX   hasattrr^   r	   	deepspeedzeroGatheredParametersr[   	original0	original1weight_gweight_vregister_external_parameterSEWSamePadLayerrV   r   r+   r,   )r.   r/   rX   re   rj   rk   r1   s         r2   r"   z#SEWPositionalConvEmbedding.__init__w   s   II6622a777((
	 hh**288,,m<((33??K%'224993C3CST2U I'		aH	Ityy"459955<<FF9955<<FF99--99--NN66tXFNN66tXF#DIIH!DDI&v'E'EF !?!?@I Is   IIc                 l    | j                  |      }| j                  |      }| j                  |      }|S r5   )r*   rV   r,   r6   s     r2   r8   z"SEWPositionalConvEmbedding.forward   s2    		-0]36r3   r:   r?   s   @r2   rS   rS   v   s     ADr3   rS   c                   $     e Zd Z fdZd Z xZS )rm   c                 P    t         |           |dz  dk(  rd| _        y d| _        y )NrU   r   r   )r!   r"   num_pad_remove)r.   r`   r1   s     r2   r"   zSEWSamePadLayer.__init__   s)    #:Q#>!#Car3   c                 V    | j                   dkD  r|d d d d d | j                    f   }|S Nr   )rq   r6   s     r2   r8   zSEWSamePadLayer.forward   s6    ")!Q0F43F3F2F0F*FGMr3   r:   r?   s   @r2   rm   rm      s    Kr3   rm   c                   $     e Zd Z fdZd Z xZS )SEWUpsamplingc                     t         |           t        j                  |j                  |j                  |j
                  z        | _        t        |j                     | _	        |j
                  | _        y r5   )
r!   r"   r   Linearr_   rb   
projectionr   r+   r,   r.   r/   r1   s     r2   r"   zSEWUpsampling.__init__   sW    ))F$6$68J8JVMbMb8bc !?!?@$33r3   c                 .   | j                  |      }| j                  |      }| j                  dkD  rc|j                         \  }}}|| j                  z  }|| j                  z  }|j	                  ||| j                  |      }|j	                  |||      }|S )Nr   )rx   r,   rb   sizereshape)r.   r7   bszsrc_lensrc_embed_dimtgt_lentgt_embed_dims          r2   r8   zSEWUpsampling.forward   s    66"*7*<*<*>'C- 3 33G)T-@-@@M)11#w@S@SUbcM)11#wNMr3   r:   r?   s   @r2   ru   ru      s    4r3   ru   c                   .     e Zd ZdZ fdZd Zd Z xZS )SEWFeatureEncoderz.Construct the features from raw audio waveformc           	         t         |           |j                  dk(  rDt        |d      gt	        |j
                  dz
        D cg c]  }t        ||dz          c}z   }nV|j                  dk(  r.t	        |j
                        D cg c]  }t        ||       }}nt        d|j                   d      t        j                  |      | _        d| _        d	| _        y c c}w c c}w )
Ngroupr   )r0   r   layerz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r!   r"   feat_extract_normrK   rangenum_feat_extract_layersr   rA   
ValueErrorr   
ModuleListconv_layersgradient_checkpointing_requires_grad)r.   r/   ir   r1   s       r2   r"   zSEWFeatureEncoder.__init__   s    ##w.0!DEINvOmOmpqOqIrIDE'Q?I K %%0NSTZTrTrNst0!DtKt01I1I0JJst  ==5&+#"I us   C"	C'c                 J    | j                         D ]	  }d|_         d| _        y NF)
parametersrequires_gradr   r.   params     r2   _freeze_parametersz$SEWFeatureEncoder._freeze_parameters   s(    __& 	(E"'E	(#r3   c                     |d d d f   }| j                   r| j                  rd|_        | j                  D ]
  } ||      } |S )NT)r   trainingr   r   )r.   input_valuesr7   
conv_layers       r2   r8   zSEWFeatureEncoder.forward   sP    $QW- 4==*.M'** 	6J&}5M	6 r3   )r;   r<   r=   __doc__r"   r   r8   r>   r?   s   @r2   r   r      s    8#"$

r3   r   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrH         rU   r   r]   )pr   r   )
r{   torchmatmulrI   r   
functionalsoftmaxr   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r2   eager_attention_forwardr      s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r3   c                   (    e Zd ZdZ	 	 	 	 	 ddedededededed	edz  f fd
Z	 	 	 dde	j                  de	j                  dz  de	j                  dz  dedz  dee   dee	j                  e	j                  dz  ee	j                     dz  f   fdZ xZS )SEWAttentionz=Multi-headed attention from 'Attention Is All You Need' paperN	embed_dim	num_headsr   
is_decoderr    	is_causalr/   c                 
   t         |           || _        || _        || _        ||z  | _        || _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _	        || _
        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )r    )r!   r"   r   r   r   head_dimr/   r   r   r   r   r   rw   k_projv_projq_projout_proj)	r.   r   r   r   r   r    r   r/   r1   s	           r2   r"   zSEWAttention.__init__  s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTBr3   r7   key_value_statesr   output_attentionsr   returnc                    |du}|j                   dd \  }}|r|j                   d   n|}	||d| j                  f}
||	d| j                  f} | j                  |      j                  |
 j	                  dd      }|r|n|} | j                  |      j                  | j	                  dd      } | j                  |      j                  | j	                  dd      }t        j                  | j                  j                  t              } || ||||f| j                  sdn| j                  | j                  |d|\  }}|j                  ||d      j!                         }| j#                  |      }||dfS )z#Input shape: Batch x Time x ChannelNrH   r   rU           )r   r   r   )shaper   r   viewrI   r   r   r   get_interfacer/   _attn_implementationr   r   r   r   r|   r   r   )r.   r7   r   r   r   r   is_cross_attentionr}   r   r~   q_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   s                      r2   r8   zSEWAttention.forward&  s    .T9 %**3B/W/A"((+wgr4==9wDMM: 7t{{=166FPPQRTUV-?)]5T[[055~FPPQRTUV
7t{{>277HRRSTVWX(?(M(MKK,,.E)
 %8
%
  $}}C$,,LL/
%
 
%
!\ "))#w;FFHmmK0L$..r3   )r   FTFN)NNF)r;   r<   r=   r   intfloatboolr   r"   r   Tensorr   r   tupler8   r>   r?   s   @r2   r   r     s    G  #'CC C 	C
 C C C D CD 15.2).1/||1/  ,,-1/ t+	1/
  $;1/ -.1/ 
u||U\\D0%2E2LL	M1/r3   r   c                   $     e Zd Z fdZd Z xZS )SEWFeedForwardc                    t         |           t        j                  |j                        | _        t        j                  |j                  |j                        | _	        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  |j                  |j                        | _        t        j                  |j                         | _        y r5   )r!   r"   r   Dropoutactivation_dropoutintermediate_dropoutrw   r_   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutry   s     r2   r"   zSEWFeedForward.__init__[  s    $&JJv/H/H$I!"$))F,>,>@X@X"Yf''-'-f.?.?'@D$'-'8'8D$IIf&>&>@R@RS jj)>)>?r3   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S r5   )r   r   r   r   r   r6   s     r2   r8   zSEWFeedForward.forwardh  sX    //>00?11-@))-8++M:r3   r:   r?   s   @r2   r   r   Z  s    @r3   r   c                   &     e Zd Z fdZddZ xZS )SEWEncoderLayerc                    t         |           t        |j                  |j                  |j
                  d|      | _        t        j                  |j                        | _
        t        j                  |j                  |j                        | _        t        |      | _        t        j                  |j                  |j                        | _        y )NF)r   r   r   r   r/   eps)r!   r"   r   r_   num_attention_headsattention_dropout	attentionr   r   r   r   rD   layer_norm_epsrE   r   feed_forwardfinal_layer_normry   s     r2   r"   zSEWEncoderLayer.__init__s  s    %((00,,
 zz&"7"78,,v'9'9v?T?TU*62 "V-?-?VEZEZ [r3   c                     |}| j                  |||      \  }}}| j                  |      }||z   }| j                  |      }|| j                  |      z   }| j	                  |      }|f}|r||fz  }|S )Nr   r   )r   r   rE   r   r   )r.   r7   r   r   attn_residualr   _outputss           r2   r8   zSEWEncoderLayer.forward  s    %)-.L] *8 *
&|Q ]3%56%(9(9-(HH--m< "&Gr3   r   r:   r?   s   @r2   r   r   r  s    \r3   r   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )
SEWEncoderc                    t         |           || _        t        |      | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                   t#        |j$                        D cg c]  }t'        |       c}      | _        t+        |      | _        d| _        y c c}w )Nr   F)r!   r"   r/   rS   pos_conv_embedr   	AvgPool1drb   poolrD   r_   r   rE   r   r   r   r   r   num_hidden_layersr   layersru   upsampler   )r.   r/   r   r1   s      r2   r"   zSEWEncoder.__init__  s    8@LL!6!68M8MN	,,v'9'9v?T?TUzz&"7"78mmeFLdLdFe$f_V%<$fg%f-&+# %gs   Dc           	      *   |rdnd }|rdnd }||j                  d      j                  dd|j                  d         }t        | j                        rd|| <   |d|v r|nd }ngd|| <   |j                         j                  d      }	|	| j                  j                  z  }
|j                  d   | j                  j                  z  }t        j                  d||
j                        j                  dd      j                  |
j                  d   d      }||
j                  dd      k  j                         }d|d d d d d d f   j                  |j                  	      z
  }|t        j                  |j                        j                   z  }|j                  |j                  d   d|j                  d   |j                  d         }|j                  d   }|j#                  dd      }| j%                  |      }| j'                  |      }t!        |j)                  d      |j)                  d            }|d
d |f   |d
d |f   z   }|j#                  dd      }| j+                  |      }| j-                  |      }t/               xs t1        |       }| j2                  D ]j  }|r||fz   }t        j4                  g       }| j6                  xr || j                  j8                  k  }|r|r ||||      }|d   }|rd}|sb|d   fz   }l |r||fz   }| j;                  |      }|j                  d   |k  r4t<        j>                  jA                  |ddd||j                  d   z
  f      }|stC        d |||fD              S tE        |||      S )N rH   r   rU   r   r   deviceg      ?dtype.r   NNc              3   &   K   | ]	  }||  y wr5   r   ).0vs     r2   	<genexpr>z%SEWEncoder.forward.<locals>.<genexpr>  s     mq_`_lms   last_hidden_stater7   
attentions)#	unsqueezerepeatr   r   r/   longsumrb   r   aranger   r   expandtor   finfominrI   r   r   r{   rE   r   r	   r
   r   randr   	layerdropr   r   r   padr   r   )r.   r7   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskinput_lengthsoutput_lengthsmax_encoder_lengthattention_idsn_input_timestepsposition_embeddingspooled_hidden_states
min_lengthsynced_gpusr   dropout_probabilityskip_the_layerlayer_outputss                         r2   r8   zSEWEncoder.forward  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!+DKK88;4454B4NSTXfSfmq 9<445!/!4!4!6 ; ;B ?!.$++2L2L!L%2%8%8%;t{{?Y?Y%Y"LL$6~?T?TUT!R[VN003R8 
 #0.2E2Eb!2L"L!R!R!T "%~atQ6F'G'J'JQ^QdQd'J'e!e!/%++m>Q>Q2R2V2V!V!/!6!6"((+Q0D0DR0H.J^J^_aJb" *//2%//15"11-@#yy7,11"57K7P7PQS7TU
,S+:+-=>ATUXZe[eZeUeAff%//156]302R6LT6R[[ 	PE#$58H$H! #(**R.!]]Z/BT[[EZEZ/ZN![ %!.Te! !.a 0 , &9]1=M<O&O#'	P*   1]4D Dm4q!$55MM--maAGX[h[n[nop[qGq=rsMm]4EGZ$[mmm++*
 	
r3   )NFFTr:   r?   s   @r2   r   r     s    	, "W
r3   r   c                       e Zd ZU eed<   dZdZdZdZdZ	dZ
dZ ej                         d        Zdej                  ez  fd	Zd
edej                  fdZy)SEWPreTrainedModelr/   sewr   audioTFc           
         t        |t              rt        j                  |j                  j
                  ddt        j                  d|j                  j                  d   |j                  j                  z  z        z         t        j                  |j                  j                  d       nt        |t        j                        r8t        j                  |j
                  d| j                  j                         nut        |t        j                   t        j"                  f      r@t        j$                  |j                         t        j&                  |j
                         nt        |t        j(                        rt+               rddl}t/        |d      rht/        |d      r\|j0                  j3                  |j4                  |j6                  gd	      5  t        j8                  |j
                         ddd       no|j0                  j3                  |j
                  d	      5  t        j8                  |j
                         ddd       nt        j8                  |j
                         t        |t        j                  t        j(                  f      r-|j                   t        j$                  |j                         yyy# 1 sw Y   axY w# 1 sw Y   mxY w)
zInitialize the weightsr   rU   r   )meanstdr   Nrk   rj   rY   )r   rS   initnormal_r*   r[   mathsqrtr   in_channels	constant_r    r   rw   r/   initializer_rangerD   rP   zeros_ones_r&   r	   re   rd   rf   rg   rk   rj   kaiming_normal_)r.   r   re   s      r2   _init_weightsz SEWPreTrainedModel._init_weights  s    f89LL""		!v{{'>'>q'AFKKD[D['["\]]
 NN6;;++Q/		*LLSdkk6S6STr|| <=KK$JJv}}%		*)+ 6:.76:3N"::FOOV__;]mn:o <,,V]];< < #::6==XY:Z <,,V]];< < $$V]]3fryy"))45&++:QKK$ ;R5< << <s    K/ KKK$r  c                     d }t        | j                  j                  | j                  j                        D ]  \  }} ||||      } |S )zH
        Computes the output length of the convolutional layers
        c                 >    t        j                  | |z
  |d      dz   S )Nfloor)rounding_moder   )r   div)input_lengthr   r   s      r2   _conv_out_lengthzMSEWPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length+  s"     99\K7wWZ[[[r3   )zipr/   r'   r(   )r.   r  r>  r   r   s        r2    _get_feat_extract_output_lengthsz3SEWPreTrainedModel._get_feat_extract_output_lengths&  sQ    
	\
 $'t{{'>'>@W@W#X 	QK,]KPM	Q r3   feature_vector_lengthr   c                    | j                  |j                  d            j                  t        j                        }|j
                  d   }t        j                  ||f|j                  |j                        }d|t        j                  |j
                  d   |j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )NrH   r   )r   r   r   r   )r@  r  r  r   r  r   zerosr   r   r  flipcumsumr   )r.   rA  r   r  
batch_sizes        r2   "_get_feature_vector_attention_maskz5SEWPreTrainedModel._get_feature_vector_attention_mask5  s    >>~?Q?QRT?UVYYZ_ZdZde#))!,
./~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr3   N)r;   r<   r=   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr   no_gradr7  
LongTensorr   r@  rG  r   r3   r2   r'  r'    s|    $O&*#NU]]_% %<e>N>NQT>T 
 
]b]m]m 
r3   r'  r   	mask_probmask_length	min_masksr   c                    | \  }dk  rt        d      kD  rt        d d d      t        j                  j                  d      j	                         fd}|-|j                         j                  d      j                         nt        |      D cg c]  } c}}t        j                  |ft        	      }	g }
 |      }|d
k(  r|	S |D ]  } ||      }t        j                  j                  t        j                  |dz
  z
        |d      }t        |      d
k(  rdz
  }n|d
   }t        j                  |t        j                  ||z
  t        j                   	      |z  g      }|
j#                  |        t        j$                  |
      }
t        j&                  |
dddddf   ||f      }
|
j)                  ||z        }
t        j                        ddddf   }t        j&                  |||f      j)                  ||z        }|
|z   }
|
j+                         dz
  kD  rdz
  |
|
dz
  kD  <   t        j,                  |	|
dd       |	S c c}w )an  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t        | z  z  z         }t        |      }|z  kD  rz  }| dz
  z
  |k  rt        | dz
  z
  d      }|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)r=  num_masked_spanepsilonrS  rR  rT  sequence_lengths     r2   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_spanh  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr3   NrH   r   r   F)replace)r   nprandomr  itemdetachr  tolistr   rC  r   choicer  lenconcatenateonesint32appendarraybroadcast_tor|   rX  put_along_axis)r   rR  rS  r   rT  rF  r\  r   r  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr=  rY  spec_aug_mask_idxdummy_mask_idxoffsetsrZ  r[  s    `` `            @@r2   _compute_mask_indicesrr  B  s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89!o9  HHj/:$GM1/Ba% 51,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;o(MUWU]U] ^ao op
 	!!"34/52 "45 1a:&5H+(V ,33J@SVa@ab ii$T4]3Goog
4G'UV^^'+5G ,g5 /A"55GVYZGZ-!0CCD m%7B?w :s   $	I+c                   &    e Zd Zdef fdZ	 	 ddej                  dej                  dz  dej                  dz  fdZe		 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  deez  fd       Z xZS )SEWModelr/   c                    t         |   |       || _        t        |      | _        t        j                  |j                  d   |j                        | _	        |j                  d   |j                  k7  | _        | j                  r2t        j                  |j                  d   |j                        | _        t        j                  |j                        | _        |j"                  dkD  s|j$                  dkD  rEt        j&                  t)        j*                  |j                        j-                               | _        t1        |      | _        | j5                          y )NrH   r   r   )r!   r"   r/   r   feature_extractorr   rD   r#   r   rE   r_   project_featuresrw   feature_projectionr   feat_proj_dropoutfeature_dropoutmask_time_probmask_feature_prob	Parameterr   r   uniform_masked_spec_embedr   encoder	post_initry   s     r2   r"   zSEWModel.__init__  s     !26!:,,vr':@U@UV & 3v7I7I I  &(ii0CVEWEW&XD#!zz&*B*BC  3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"!&) 	r3   Nr7   mask_time_indicesr   c                    t        | j                  dd      s|S |j                         \  }}}|)| j                  j	                  |j
                        ||<   n| j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  || j                  j                        }t        j                  ||j                  t        j                        }| j                  j	                  |j
                        ||<   | j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                   | j                  j"                        }t        j                  ||j                  t        j                        }|dddf   j%                  d|d      }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        apply_spec_augmentTNr   )rR  rS  r   rT  )r   r   )rR  rS  rT  rH   )getattrr/   r{   r  r  r   r{  r   rr  mask_time_lengthmask_time_min_masksr   tensorr   r   r|  mask_feature_lengthmask_feature_min_masksr  )r.   r7   r  r   rF  r[  r_   mask_feature_indicess           r2   _mask_hidden_stateszSEWModel._mask_hidden_states  s    t{{$8$?   4A3E3E3G0
O[(/3/E/E/H/HI\I\/]M+,[[''!+ 5_-++44 KK88-++99! !&->}G[G[chcmcm n/3/E/E/H/HI\I\/]M+,;;((1,#8[)++77 KK;;++<<	$  $)<<0D]MaMainisis#t #74#@#G#GO]_#` 23M./r3   r   r   r  r  r   c                 Z   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }|j                  dd      }| j                  |      }| j                  r| j                  |      }| j                  |      }	|| j                  |	j                  d   |      }| j                  |	|      }	| j                  |	||||      }
|
d   }	|s	|	f|
dd z   S t        |	|
j                  |
j                         S )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   rU   )r  r   r   r  r  r   r  )r/   r   r  use_return_dictrv  rI   rE   rw  rx  rz  rG  r   r  r  r   r7   r  )r.   r   r   r  r   r  r  r   extract_featuresr7   encoder_outputss              r2   r8   zSEWModel.forward  sU     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;??+;<  #667GH,,-=>%!DD]EXEXYZE[]klN00Rc0d,,)/!5# ' 
 (*!#oab&999+)77&11
 	
r3   r  NNNNN)r;   r<   r=   r   r"   r   FloatTensorrQ  r  r   r   r   r   r   r8   r>   r?   s   @r2   rt  rt    s    y . 7;26	,((, !,,t3, ((4/	,\  /36:)-,0#'4
llT)4
 t+4
 !,,t3	4

  $;4
 #Tk4
 D[4
 
	 4
 4
r3   rt  zk
    SEW Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                        e Zd Zddedz  f fdZd Zd Zd Ze	 	 	 	 	 dde	j                  dz  de	j                  dz  d	edz  d
edz  dedz  de	j                  dz  deez  fd       Z xZS )	SEWForCTCNtarget_langc                    t         |   |       t        |      | _        t	        j
                  |j                        | _        || _        |j                  t        d| j                   d      t        |d      r|j                  r|j                  n|j                  }t	        j                   ||j                        | _        | j%                          y)a-  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`SEWForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SEWForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r!   r"   rt  r(  r   r   final_dropoutr   r  
vocab_sizer   r1   rd   r  output_hidden_sizer_   rw   lm_headr  )r.   r/   r  r  r1   s       r2   r"   zSEWForCTC.__init__>  s     	 F#zz&"6"67&$00@ AH H  *1)GFL^L^F%%djdvdv 	 yy!3V5F5FG 	r3   c                 8   t               t        j                  d      k(  ry| j                  }|&t	        | j
                  dd      t        d| d      |-t	        | j
                  dd      t        j                  d       y|| j                  |d       yy)	a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        metaNadapter_attn_dimzCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)
r   r   r   r  r  r/   r   loggerinfoload_adapter)r.   r   r  s      r2   tie_weightszSEWForCTC.tie_weights[  s     675<<;OO &&"wt{{<NPT'U']:;-Gtuvv WT[[:Ld%S%_KKCD$kd; %r3   c                 L    | j                   j                  j                          yz
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        Nr(  rv  r   r.   s    r2   freeze_feature_encoderz SEWForCTC.freeze_feature_encoders      
 	""557r3   c                 P    | j                   j                         D ]	  }d|_         yz
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr(  r   r   r   s     r2   freeze_base_modelzSEWForCTC.freeze_base_modelz  (    
 XX((* 	(E"'E	(r3   r   r   r   r  r  labelsr   c           
         ||n| j                   j                  }|I|j                         | j                   j                  k\  r"t	        d| j                   j                         | j                  |||||      }|d   }	| j                  |	      }	| j                  |	      }
d}|b||n$t        j                  |t        j                        }| j                  |j                  d            j                  t        j                        }|dk\  }|j                  d      }|j                  |      }t        j                   j#                  |
dt        j$                        j'                  dd      }t        j(                  j*                  j-                  d	
      5  t        j                   j/                  ||||| j                   j0                  | j                   j2                  | j                   j4                        }ddd       |s|
f|t6        d z   }||f|z   S |S t9        ||
|j:                  |j<                        S # 1 sw Y   ExY w)a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r  r   r   rH   )r]   r   r   F)enabled)blank	reductionzero_infinitylosslogitsr7   r  )r/   r  rX  r  r   r(  r   r  r   	ones_liker  r@  r  r  masked_selectr   r   log_softmaxfloat32rI   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r7   r  )r.   r   r   r   r  r  r  r   r   r7   r  r  r  labels_masktarget_lengthsflattened_targets	log_probsoutputs                     r2   r8   zSEWForCTC.forward  s'   $ &1%<k$++B]B]&**,$++2H2H"HCDKKDZDZC[\]](()/!5#  
  
]3m, #1"<%//R^fkfpfpBq  !AA.BTBTUWBXY\\]b]g]ghM !A+K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+: 	}}--%!"++22"kk<<"&++"?"? . 	 Y)F)G!HHF)-)9TGf$EvEfG4I4IV]VhVh
 	
	 	s   A#IIr5   r  )r;   r<   r=   r   r"   r  r  r  r   r   r   r   r   r   r8   r>   r?   s   @r2   r  r  8  s    C$J :<08(  /3)-,0#'&*E
llT)E
 t+E
  $;	E

 #TkE
 D[E
 t#E
 
	E
 E
r3   r  z
    SEW Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                        e Zd Z fdZd Zd Ze	 	 	 	 	 ddej                  dz  dej                  dz  de	dz  de	dz  d	e	dz  d
ej                  dz  de
ez  fd       Z xZS )SEWForSequenceClassificationc                    t         |   |       t        |d      r|j                  rt	        d      t        |      | _        |j                  dz   }|j                  r0t        j                  t        j                  |      |z        | _        t        j                  |j                  |j                         | _        t        j                  |j                   |j$                        | _        | j)                          y )Nr  zZSequence classification does not support the use of SEW adapters (config.add_adapter=True)r   )r!   r"   rd   r  r   rt  r(  r   use_weighted_layer_sumr   r}  r   rf  layer_weightsrw   r_   classifier_proj_size	projector
num_labels
classifierr  )r.   r/   
num_layersr1   s      r2   r"   z%SEWForSequenceClassification.__init__  s     6=)f.@.@l  F#--1
((!#ejj.Dz.Q!RD6#5#5v7R7RS))F$?$?ARARS 	r3   c                 L    | j                   j                  j                          yr  r  r  s    r2   r  z3SEWForSequenceClassification.freeze_feature_encoder  r  r3   c                 P    | j                   j                         D ]	  }d|_         yr  r  r   s     r2   r  z.SEWForSequenceClassification.freeze_base_model  r  r3   Nr   r   r   r  r  r  r   c                 <   ||n| j                   j                  }| j                   j                  rdn|}| j                  |||||      }| j                   j                  rr|t           }	t        j                  |	d      }	t        j                  j                  | j                  d      }
|	|
j                  ddd      z  j                  d      }	n|d   }	| j                  |	      }	||	j                  d      }n| j                  |	j                   d   |      }|j#                  d      j%                  dd|	j                   d         }d	|	| <   |	j                  d      |j                  d      j                  dd      z  }| j'                  |      }d}|Ft)               } ||j                  d| j                   j*                        |j                  d            }|s|f|t        d z   }||f|z   S |S t-        |||j.                  |j0                  
      S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
            into a tensor of type `torch.FloatTensor`. See [`SEWProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   r   rH   r   rU   r   r  )r/   r  r  r(  r  r   stackr   r   r   r  r   r  r  r+  rG  r   r	  r
  r  r   r  r   r7   r  )r.   r   r   r   r  r  r  r   r   r7   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r  loss_fctr  s                     r2   r8   z$SEWForSequenceClassification.forward  s   0 &1%<k$++B]B]'+{{'I'ItOc(()/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*\->->r1a-HHMMRSMTM#AJM}5!)..1.5MBB=CVCVWXCY[ijL"."8"8"<"C"CAq-J]J]^_J`"a25M../)--!-4|7G7GA7G7N7S7STVXY7ZZM/')HFKKDKK,B,BCV[[QS_UDY)F)G!HHF)-)9TGf$EvE'!//))	
 	
r3   r  )r;   r<   r=   r"   r  r  r   r   r   r   r   r   r8   r>   r?   s   @r2   r  r    s    "8(  /3)-,0#'&*C
llT)C
 t+C
  $;	C

 #TkC
 D[C
 t#C
 
)	)C
 C
r3   r  )r  r  rt  r'  )Nr   rs   )Er/  collections.abcr   numpyr^  r   r   torch.nnr    r   r-  activationsr   integrations.deepspeedr	   integrations.fsdpr
   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   r   processing_utilsr   rc   r   r   r   utils.genericr   configuration_sewr   
get_loggerr;   r  r   rA   rK   ModulerS   rm   ru   r   r   r   r   r   r   r   r   r'  r   r   rQ  ndarrayrr  rt  r  r  r  __all__r   r3   r2   <module>r     s  *  $    % & ! @ 7 B 9 Y Y r r & @ @ 9 ( 
		H	%8 *6 66 0( (Vbii BII ,#		 #X !%II%<<% 
% <<	%
 LL4'% T\% % '(%8S/299 S/lRYY 0!0 !Hc
 c
L B B BR /3tc?tt t $$t+	t
 t ZZtn x
! x
 x
v !"  
K
" K

K
\ e
#5 e
e
P Zr3   