
    qi.                        d Z ddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ d
dlmZmZmZmZmZmZmZ ddlmZ dZ G d dej8                        Z G d de      Z G d de      Z G d dej8                        Z  G d de      Z! G d de      Z"e G d de             Z# G d dee#      Z$ G d de      Z% G d  d!e      Z&g d"Z'y)#zPyTorch Hubert model.    N   )initialization)ACT2FN)is_deepspeed_zero3_enabled)BaseModelOutput)PreTrainedModel)auto_docstring   )Wav2Vec2EncoderWav2Vec2EncoderStableLayerNormWav2Vec2FeatureEncoderWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2ModelWav2Vec2SamePadLayer   )HubertConfigc                   $     e Zd Z fdZd Z xZS )HubertPositionalConvEmbeddingc                    t         |           t        j                  |j                  |j                  |j
                  |j
                  dz  |j                        | _        d | _        |j                  r&t        j                  |j                        | _        nt        j                  j                  }t        t        j                  j                  d      r$t        j                  j                  j                  }t               r(dd l}|j"                  j%                  | j                  j&                  d      5   || j                  dd      | _        d d d        t        | j                  d      rU| j                  j                  j&                  j(                  }| j                  j                  j&                  j*                  }n,| j                  j,                  }| j                  j.                  }|j"                  j1                  | |       |j"                  j1                  | |       n || j                  dd      | _        t3        |j
                        | _        t6        |j8                     | _        y # 1 sw Y   'xY w)	Nr
   )kernel_sizepaddinggroupsweight_normr   modifier_rankweight)namedimparametrizations)super__init__nnConv1dhidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsconv
batch_normconv_pos_batch_normBatchNorm1dutilsr   hasattrr    r   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterHubertSamePadLayerr   r   feat_extract_activation
activation)selfconfigr   r.   r3   r4   	__class__s         [/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/hubert/modular_hubert.pyr"   z&HubertPositionalConvEmbedding.__init__)   s   II6622a777
	 %% nnV-?-?@DO((..Krxx00-@ hh77CC)+ ^^66tyy7G7GWX6Y M +DIIH! LDIM499&89#yy99@@JJH#yy99@@JJH#yy11H#yy11H::4J::4J'		aH	)&*H*HI !?!?@M Ms   ?I??J	c                     |j                  dd      }| j                  | j                  |      }| j                  |      }| j                  |      }| j	                  |      }|j                  dd      }|S )Nr   r
   )	transposer)   r(   r   r8   r9   hidden_statess     r<   forwardz%HubertPositionalConvEmbedding.forwardN   sn    %//15??& OOM:M		-0]36%//15    __name__
__module____qualname__r"   rA   __classcell__r;   s   @r<   r   r   (   s    #AJ	rB   r   c                       e Zd Zy)r6   NrD   rE   rF    rB   r<   r6   r6   Z       rB   r6   c                       e Zd Zy)HubertFeatureEncoderNrJ   rK   rB   r<   rN   rN   ^   rL   rB   rN   c                   $     e Zd Z fdZd Z xZS )HubertFeatureProjectionc                 n   t         |           |j                  | _        | j                  r3t        j                  |j
                  d   |j                        | _        t        j                  |j
                  d   |j                        | _
        t        j                  |j                        | _        y )N)eps)r!   r"   feat_proj_layer_normr#   	LayerNormconv_dimlayer_norm_eps
layer_normLinearr%   
projectionDropoutfeat_proj_dropoutdropoutr9   r:   r;   s     r<   r"   z HubertFeatureProjection.__init__c   s}    $*$?$?!$$ ll6??2+>FDYDYZDO))FOOB$79K9KLzz&":":;rB   c                     | j                   r| j                  |      }| j                  |      }| j                  |      }|S )N)rT   rX   rZ   r]   r?   s     r<   rA   zHubertFeatureProjection.forwardk   s;    $$ OOM:M6]3rB   rC   rH   s   @r<   rP   rP   b   s    <rB   rP   c                       e Zd Zy)HubertEncoderNrJ   rK   rB   r<   ra   ra   t   rL   rB   ra   c                       e Zd Zy)HubertEncoderStableLayerNormNrJ   rK   rB   r<   rc   rc   x   rL   rB   rc   c                       e Zd ZU eed<   dZdZdZddgZdZ	dZ
dZdZ ej                         d        Zd	ej                   ez  fd
Zdedej                   fdZy)HubertPreTrainedModelr:   hubertinput_valuesaudioHubertEncoderLayerParametrizedConv1dTc                    t        |t        j                        rct        j                  |j
                  d| j                  j                         |j                   t        j                  |j                         yyt        |t        j                  t        j                  t        j                  f      rt        j                  |j                         t        j                  |j
                         t        |dd      ^t        j                  |j                         t        j                  |j                          t        j                  |j"                         yyt        |t        j$                        rt'               rddl}t+        |d      rht+        |d      r\|j,                  j/                  |j0                  |j2                  gd      5  t        j4                  |j
                         ddd       no|j,                  j/                  |j
                  d      5  t        j4                  |j
                         ddd       nt        j4                  |j
                         |j                   t        j                  |j                         yyt        |t6              r-t+        |d	      r t        j8                  |j:                         yyt        |t<              rHt+        |d
      r;t        j>                  |j@                  d| j                  jB                  dz   z         yyy# 1 sw Y   xY w# 1 sw Y   xY w)zInitialize the weights        )meanstdNrunning_meanr   r4   r3   r   masked_spec_embedlayer_weightsg      ?r   )"
isinstancer#   rY   initnormal_r   r:   initializer_rangebiaszeros_rU   	GroupNormr+   ones_getattrro   running_varnum_batches_trackedr$   r   r.   r-   r/   r0   r4   r3   kaiming_normal_HubertModeluniform_rp   HubertForSequenceClassification	constant_rq   num_hidden_layers)r9   moduler.   s      r<   _init_weightsz#HubertPreTrainedModel._init_weights   sF    fbii(LLSdkk6S6ST{{&FKK( 'r||R^^ LMKK$JJv}}%v~t4@F//0

6--.F667 A 		*)+ 6:.76:3N"::FOOV__;]mn:o <,,V]];< < #::6==XY:Z <,,V]];< < $$V]]3{{&FKK( ',v23f667 4 ?@v/v33SDKK<Y<Y\]<]5^_ 0 A< << <s    L0% L<0L9<Minput_lengthsc                     d }t        | j                  j                  | j                  j                        D ]  \  }} ||||      } |S )zH
        Computes the output length of the convolutional layers
        c                 >    t        j                  | |z
  |d      dz   S )Nfloor)rounding_moder   )torchdiv)input_lengthr   strides      r<   _conv_out_lengthzPHubertPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length   s"     99\K7wWZ[[[rB   )zipr:   conv_kernelconv_stride)r9   r   r   r   r   s        r<    _get_feat_extract_output_lengthsz6HubertPreTrainedModel._get_feat_extract_output_lengths   sQ    
	\
 $'t{{'>'>@W@W#X 	QK,]KPM	Q rB   feature_vector_lengthattention_maskc                    | j                  |j                  d            j                  t        j                        }|j
                  d   }t        j                  ||f|j                  |j                        }d|t        j                  |j
                  d   |j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )NrR   r   )dtypedevicer   )r   )r   sumtor   longshapezerosr   r   arangeflipcumsumbool)r9   r   r   output_lengths
batch_sizes        r<   "_get_feature_vector_attention_maskz8HubertPreTrainedModel._get_feature_vector_attention_mask   s    >>~?Q?QRT?UVYYZ_ZdZde#))!,
./~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOrB   N)rD   rE   rF   r   __annotations__base_model_prefixmain_input_nameinput_modalities_no_split_modulessupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr   no_gradr   
LongTensorintr   r   rK   rB   r<   re   re   |   s     $O-/CD&*#NU]]_!` !`Fe>N>NQT>T 
 
]b]m]m 
rB   re   c                        e Zd Zdef fdZd Z	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de	dz  d	e	dz  d
e	dz  de
ez  fdZ xZS )r~   r:   c                    t         |   |       || _        t        |      | _        t        |      | _        |j                  dkD  s|j                  dkD  rEt        j                  t        j                  |j                        j                               | _        |j                   rt#        |      | _        nt'        |      | _        | j)                          | `y )Nrl   )r!   r"   r:   rN   feature_extractorrP   feature_projectionmask_time_probmask_feature_probr#   	Parameterr   Tensorr%   r   rp   do_stable_layer_normrc   encoderra   	post_initadapterr^   s     r<   r"   zHubertModel.__init__   s     !5f!="9&"A  3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"&&7?DL(0DL 	LrB   c                     t        d      )NzNot needed for Hubert)AttributeError)r9   s    r<   freeze_feature_encoderz"HubertModel.freeze_feature_encoder   s    455rB   Nrg   r   mask_time_indicesoutput_attentionsoutput_hidden_statesreturn_dictreturnc                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }|j                  dd      }|| j                  |j                  d   |      }| j                  |      }	| j                  |	|      }	| j                  |	||||      }
|
d   }	|s	|	f|
dd z   S t        |	|
j                  |
j                        S )a1  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.

        Example:

        ```python
        >>> from transformers import AutoProcessor, HubertModel
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(example):
        ...     example["speech"] = example["audio"]["array"]
        ...     return example


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```Nr   r
   )r   )r   r   r   r   r   )last_hidden_stater@   
attentions)r:   r   r   use_return_dictr   r>   r   r   r   _mask_hidden_statesr   r   r@   r   )r9   rg   r   r   r   r   r   kwargsextract_featuresr@   encoder_outputss              r<   rA   zHubertModel.forward   s,   H 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;%!DDEUE[E[\]E^`noN//0@A00Rc0d,,)/!5# ' 
 (*!#oab&999+)77&11
 	
rB   )NNNNN)rD   rE   rF   r   r"   r   r   r   FloatTensorr   tupler   rA   rG   rH   s   @r<   r~   r~      s    | &6 /36:)-,0#'E
llT)E
 t+E
 !,,t3	E

  $;E
 #TkE
 D[E
 
	 E
rB   r~   c                       e Zd Zy)HubertForCTCNrJ   rK   rB   r<   r   r   '  rL   rB   r   c                       e Zd Zy)r   NrJ   rK   rB   r<   r   r   +  rL   rB   r   )r   r   r~   re   )(__doc__r   torch.nnr#    r   rs   activationsr   integrations.deepspeedr   modeling_outputsr   modeling_utilsr   r,   r	   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   configuration_hubertr   _HIDDEN_STATES_START_POSITIONModuler   r6   rN   rP   ra   rc   re   r~   r   r   __all__rK   rB   r<   <module>r      s       & ! @ / - #   / !" /BII /d	- 		1 	bii $	O 		#A 	 HO H HV\
-!6 \
~	> 		&G 	 frB   