
    qi0                        d dl mZ d dlmZ d dlZd dlmZ ddlmZ ddl	m
Z
mZmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6 ddl7m8Z8m9Z9  e)jt                  e;      Z< G d de      Z=e e'd       G d de                    Z> G d dej~                        Z@ G d d ej~                        ZA G d! d"e1      ZB G d# d$e0      ZC G d% d&e4      ZD G d' d(e      ZEe' G d) d*e"             ZF G d+ d,eF      ZG G d- d.e5      ZH G d/ d0e8      ZI e'd1       G d2 d3eFe             ZJg d4ZKy)5    )Callable)	dataclassN   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)PreTrainedConfig)GenerationMixin)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)RopeParameters)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)OutputRecordercapture_outputs   )GlmAttentionGlmRotaryEmbeddingapply_rotary_pos_emb)LlamaDecoderLayer
LlamaModeleager_attention_forward)WhisperModelshift_tokens_rightc            2       n    e Zd ZdZdZdgZdddddZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd
ed	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  de	d	z  ded	z  de
d	z  deeeef   z  d	z  de
d	z  de
d	z  de	d	z  ded	z  ded	z  ded	z  de
d	z  f0 fdZ xZS ) MoonshineConfiga7  
    This is the configuration class to store the configuration of a [`MoonshineModel`]. It is used to instantiate a Moonshine
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Moonshine
    [UsefulSensors/moonshine-tiny](https://huggingface.co/UsefulSensors/moonshine-tiny).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 32768):
            Vocabulary size of the Moonshine model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MoonshineModel`].
        hidden_size (`int`, *optional*, defaults to 288):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        encoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        decoder_num_hidden_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer decoder.
        encoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        encoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `encoder_num_key_value_heads=encoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `encoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        decoder_num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `decoder_num_key_value_heads=decoder_num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `decoder_num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `decoder_num_attention_heads`.
        pad_head_dim_to_multiple_of (`int`, *optional*):
            Pad head dimension in encoder and decoder to the next multiple of this value. Necessary for using certain
            optimized attention implementations.
        encoder_hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder.
        decoder_hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 512):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        decoder_start_token_id (`int`, *optional*, defaults to 1):
            Corresponds to the "<|startoftranscript|>" token, which is automatically used when no `decoder_input_ids`
            are provided to the `generate` function. It is used to guide the model`s generation process depending on
            the task.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        bos_token_id (`int`, *optional*, defaults to 1):
            Denotes beginning of sequences token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            Denotes end of sequences token id.
        pad_token_id (`int`, *optional*):
            Padding token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    Example:

    ```python
    >>> from transformers import MoonshineModel, MoonshineConfig

    >>> # Initializing a Moonshine style configuration
    >>> configuration = MoonshineConfig().from_pretrained("UsefulSensors/moonshine-tiny")

    >>> # Initializing a model from the configuration
    >>> model = MoonshineModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```	moonshinepast_key_valuesdecoder_num_key_value_headsdecoder_num_attention_headsdecoder_num_hidden_layersdecoder_hidden_act)num_key_value_headsnum_attention_headsnum_hidden_layers
hidden_actN
vocab_sizehidden_sizeintermediate_sizeencoder_num_hidden_layersencoder_num_attention_headsencoder_num_key_value_headspad_head_dim_to_multiple_ofencoder_hidden_actmax_position_embeddingsinitializer_rangedecoder_start_token_id	use_cacherope_parametersis_encoder_decoderattention_biasattention_dropoutbos_token_ideos_token_idpad_token_idtie_word_embeddingsc                    || _         || _        || _        || _        || _        || _        || _        ||}|| _        |	|}	|	| _        |
| _	        || _
        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        |j1                  dd       t3        | h  dd|i| y )Npartial_rotary_factorg?rB    )r5   r6   r7   r8   r/   r9   r.   r:   r-   r;   r<   r0   r=   r>   r?   r@   rB   rC   rD   rE   rF   rG   rH   rA   
setdefaultsuper__init__)selfr5   r6   r7   r8   r/   r9   r.   r:   r-   r;   r<   r0   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   kwargs	__class__s                             a/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/moonshine/modular_moonshine.pyrN   zMoonshineConfig.__init__   s
   8 %&!2)B&)B&+F(+F(&.*E'+F(&.*E'+F(+F("4"4'>$!2&<#""4,!2(((&<##6 .137I,>I&I    )i   i   i     rT      rU   NNNgelusilui   g{Gz?   TNTF        rX   r    NT)__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapintstrfloatboolr   dictrN   __classcell__rQ   s   @rR   r*   r*   1   s   Zx J#4"5<<8*	M "'"%(,01012323262626)/)/.1*.-.!%MQ*.&+*-#$#$#'+/3?J$J?J 4Z?J :	?J
 $':?J $':?J &)4Z?J &)4Z?J &)4Z?J &)4Z?J &)4Z?J  $J?J  $J?J "%t?J !4<?J  !$d
!?J" $;#?J$ ($sN/B*CCdJ%?J& !4K'?J( t)?J* !4<+?J, Dj-?J. Dj/?J0 Dj1?J2 "D[3?J ?JrS   r*   z
    Extends [~modeling_outputs.BaseModelOutput] to include the output attention mask since sequence length is not preserved in the model's forward.
    )custom_introc                   6    e Zd ZU dZej
                  dz  ed<   y)MoonshineEncoderModelOutputNattention_mask)rZ   r[   r\   rk   torchTensor__annotations__rK   rS   rR   rj   rj      s     +/NELL4'.rS   rj   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineEncoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                        | _	        t        j                  |j                  |j                        | _
        y NrM   rN   configr   activation_fnnnLinearr6   r7   fc1fc2rO   rt   r4   rQ   s      rR   rN   zMoonshineEncoderMLP.__init__   s^    #J/99V//1I1IJ99V55v7I7IJrS   hidden_statesreturnc                 l    | j                  |      }| j                  |      }| j                  |      }|S rr   )rx   ru   ry   )rO   r{   s     rR   forwardzMoonshineEncoderMLP.forward   s4    /**=9/rS   rZ   r[   r\   rN   rl   rm   r~   rf   rg   s   @rR   rp   rp      s$    KU\\ ell rS   rp   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineDecoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                  dz        | _	        t        j                  |j                  |j                        | _
        y )Nr    rs   rz   s      rR   rN   zMoonshineDecoderMLP.__init__   sc    #J/99V//1I1IA1MN99V55v7I7IJrS   r{   r|   c                     | j                  |      }|j                  dd      \  }}| j                  |      |z  }| j                  |      }|S )Nr    )dim)rx   chunkru   ry   )rO   r{   gates      rR   r~   zMoonshineDecoderMLP.forward   sS    /+11!1<t**40=@/rS   r   rg   s   @rR   r   r      s$    KU\\ ell rS   r   c                       e Zd Zy)MoonshineRotaryEmbeddingN)rZ   r[   r\   rK   rS   rR   r   r     s    rS   r   c                   h    e Zd Zdededededef
 fdZ	 	 	 	 	 ddej                  d	e	ej                  ej                  f   dz  d
ej                  dz  de
dz  dej                  dz  dej                  dz  dee   de	ej                  ej                  dz  e	ej                     dz  f   fdZ xZS )MoonshineAttentionrt   	layer_idx	is_causalr2   r1   c                 n   |j                  ||d       t        | 	  ||       || _        t	        |d|j
                  |j                  z        | _        | j                  j                  C| j                  j                  }|| j                  |z   dz
  |z  z  }|| j                  z
  | _
        y d| _
        y )N)r2   r1   head_dimrX   r   )updaterM   rN   r   getattrr6   r2   r   rt   r;   head_dim_padding)	rO   rt   r   r   r2   r1   target_multipletarget_head_dimrQ   s	           rR   rN   zMoonshineAttention.__init__  s     	.AZmno+"
F4F4F&JdJd4de ;;22>"kkEEO-$--/2QTU2UZi1ijO$3dmm$CD!$%D!rS   Nr{   position_embeddingsrk   r,   cache_positionkey_value_statesrP   r|   c                 N   |j                   d d \  }}	| j                  |      j                  ||	| j                  j                  | j
                        j                  dd      }
|d u}|Y|j                  j                  | j                        }|r&d|j                  | j                  <   |j                  }n|j                  }||n|}|rK|rIrG|j                  | j                     j                  }|j                  | j                     j                  }n| j                  |      j                  |d| j                  j                  | j
                        j                  dd      }| j!                  |      j                  |d| j                  j                  | j
                        j                  dd      }|r%|#|j#                  ||| j                  d|i      \  }}|s?|\  }}t%        |
|||      \  }
}|'|||d}|j#                  ||| j                  |      \  }}t'        j(                  | j                  j*                  t,              }| j.                  xr |d u xr |	dkD  }| j0                  dkD  rt2        j4                  j6                  j9                  |
d| j0                  f      }
t2        j4                  j6                  j9                  |d| j0                  f      }t2        j4                  j6                  j9                  |d| j0                  f      } || |
|||f| j:                  sdn| j<                  | j>                  |d	|\  }}| j0                  dkD  r|d
d | j0                   f   }|jA                  ||	d      jC                         }| jE                  |      }||fS )Nr   rX   r    Tr   )sincosr   r   rY   )dropoutscalingr   .)#shapeq_projviewrt   r1   r   	transpose
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesk_projv_projr   r#   r   get_interface_attn_implementationr&   r   r   rl   rv   
functionalpadtrainingrD   r   reshape
contiguouso_proj)rO   r{   r   rk   r,   r   r   rP   bszq_lenquery_statesis_cross_attentionr   current_states
key_statesvalue_statesr   r   cache_kwargsattention_interfacer   attn_outputattn_weightss                          rR   r~   zMoonshineAttention.forward  sy    #(("-
U KK&++C8W8WY]YfYfgqqrsuvw 	 .T9&(3377GJ!=A**4>>:"1"G"G"1"F"F .>-I)}/j(//?DDJ*11$..AHHL N+c2t{{>>N1a  N+c2t{{>>N1a 
 "o&A+:+A+Adnn?OQ_>`,(
L "*HC';L*VY[^'_$L**'*3.Y+:+A+Adnnl,(
L )@(M(MKK,,.E)
 NNK~'=K%!)	  1$ 88..22<!TEZEZA[\L,,00aAVAV=WXJ 88..22<!TEZEZA[\L$7
%
  $}}C$2H2HLL
%
 
%
!\   1$%c+Cd.C.C-C+C&CDK!))#ub9DDFkk+.L((rS   )NNNNN)rZ   r[   r\   r*   ra   rd   rN   rl   rm   tupler   
LongTensorr   r   r~   rf   rg   s   @rR   r   r     s   && & 	&
 !& !&0 IM.2(,2604U)||U) #5<<#=>EU) t+	U)
 U) ((4/U)  ,,-U) -.U) 
u||U\\D0%2E2LL	MU)rS   r   c                   (     e Zd Zdedef fdZ xZS )MoonshineEncoderLayerrt   r   c                 F   t         |   ||       t        ||d|j                  |j                        | _        t        ||j                        | _        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NFrt   r   r   r2   r1   bias)rM   rN   r   r9   r:   	self_attnrp   r<   mlprv   	LayerNormr6   input_layernormpost_attention_layernormrO   rt   r   rQ   s      rR   rN   zMoonshineEncoderLayer.__init__u  s    ++ & B B & B B
 'vv/H/HI!||F,>,>UK(*V5G5Ge(T%rS   )rZ   r[   r\   r*   ra   rN   rf   rg   s   @rR   r   r   t  s    U U3 U UrS   r   c                   
    e Zd Zddededz  f fdZ	 	 	 	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  de	dz  de
dz  dej                  dz  deej                  ej                  f   dz  deej                  ej                  f   dz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )MoonshineDecoderLayerNrt   r   c                    t         |           |j                  | _        t        ||d|j                  |j
                        | _        t        ||d|j                  |j
                        | _        t        ||j                        | _
        t        j                  |j                  d      | _        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NTr   Fr   )rM   rN   r6   r   r2   r1   r   encoder_attnr   r4   r   rv   r   r   r   final_layernormr   s      rR   rN   zMoonshineDecoderLayer.__init__  s    !--+ & : : & : :
 / & : : & : :
 'vv/@/@A!||F,>,>UK(*V5G5Ge(T%!||F,>,>UKrS   r{   rk   encoder_hidden_statesencoder_attention_maskposition_idsencoder_position_idsr,   r@   r   r   encoder_position_embeddingsrP   r|   c                 (   |}| j                  |      } | j                  d||||||	|
d|\  }}||z   }|1|}| j                  |      }| j                  |||||      \  }}||z   }|}| j	                  |      }| j                  |      }||z   }|S )N)r{   rk   r   r,   r@   r   r   )r{   r   rk   r,   r@   rK   )r   r   r   r   r   r   )rO   r{   rk   r   r   r   r   r,   r@   r   r   r   rP   residual_s                  rR   r~   zMoonshineDecoderLayer.forward  s     !,,];)4>> 	
')%+) 3	
 	
q !=0 ,$H 99-HM#00+!65 /#  1  M1 %}4M ,,];/ =0rS   rr   )
NNNNNNFNNN)rZ   r[   r\   r*   ra   rN   rl   rm   r   r   rd   r   r   r   FloatTensorr~   rf   rg   s   @rR   r   r     sj   L L3: L6 /3596:048<(,!&26HLPT.||. t+.  %||d2	.
 !&t 3. &&-. $..5. . $;. ((4/. #5<<#=>E. &+5<<+E%F%M. +,. 
u  %(9(95;L;L(L"MPT"TT	U.rS   r   c                   \    e Zd ZU eed<   dZdZdZdZddgZ	dZ
dZdZdej                  fd	Zy
)MoonshinePreTrainedModelrt   modelinput_valuesaudioTr   r   input_lengthsc                 ~    t        |dz
  dz  dz         }t        |dz
  dz  dz         }t        |dz
  dz  dz         }|S )zH
        Computes the output length of the convolutional layers
           @   rX      r   r    )ra   )rO   r   output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengths        rR    _get_feat_extract_output_lengthsz9MoonshinePreTrainedModel._get_feat_extract_output_lengths  sZ     "=3#6""<q"@A!#6#:a"?!"CD!#6#:a"?!"CD""rS   N)rZ   r[   r\   r*   rn   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphrl   r   r   rK   rS   rR   r   r     sN    $O&*#02IJN!#e>N>N #rS   r   c                        e Zd ZdZdZeedZdef fdZ	de
j                  fdZde
j                  fd	Zee	 ddej"                  dej$                  d
z  dee   deez  fd              Z xZS )MoonshineEncoderz
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    r   )
attentionsr{   rt   c           	      b   t         |   |       || _        |j                  }t	        j
                  d|ddd      | _        t	        j
                  |d|z  dd	      | _        t	        j
                  d|z  |dd	      | _        t	        j                  d|d
      | _
        t	        j                  t        |j                        D cg c]  }t        ||       c}      | _        t	        j                   |d      | _        t%        |      | _        d| _        | j+                          y c c}w )NrX   r   r   F)kernel_sizestrider   r    r   r   )r   r   gh㈵>)
num_groupsnum_channelsepsr   rt   )rM   rN   rt   r6   rv   Conv1dconv1conv2conv3	GroupNorm	groupnorm
ModuleListranger8   r   r   r   
layer_normr   
rotary_embgradient_checkpointing	post_init)rO   rt   	embed_dimidxrQ   s       rR   rN   zMoonshineEncoder.__init__  s     &&	YYq)ReT
YYy!i-QqQ
YYq9}iQqQ
PTUmm;@AaAa;bcC"63/c
 ,,yu=2&A&+# ds   D,r|   c                     | j                   S rr   r   rO   s    rR   get_input_embeddingsz%MoonshineEncoder.get_input_embeddings  s    zzrS   valuec                     || _         y rr   r  )rO   r  s     rR   set_input_embeddingsz%MoonshineEncoder.set_input_embeddings  s	    
rS   Nrk   rP   c                 d   |j                  d      }t        j                  j                  | j	                  |            }| j                  |      }t        j                  j                  | j                  |            }t        j                  j                  | j                  |            }|j                  ddd      }|3| j                  |j                  d         }d}|ddd|f   dd|f   }|}t        | j                  |||      }t        j                  d|j                  d   |j                   	      j                  d      }| j#                  ||
      }	| j$                  D ]  }
 |
|f|||	d|} | j'                  |      }t)        ||j+                               S d      S )a.  
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        rX   r   r    Nr   i  .rt   inputs_embedsrk   r   devicer   )rk   r   r   )last_hidden_staterk   )	unsqueezerv   r   tanhr   r   rV   r   r   permuter   r   r   rt   rl   aranger  r  r   r  rj   ra   )rO   r   rk   rP   r{   mask_lendownsample_strideoutput_attention_maskr   r   encoder_layers              rR   r~   zMoonshineEncoder.forward  s   . $--a0**4::l+CD}5**4::m+DE**4::m+DE%--aA6 %<<^=Q=QRT=UVH *+C1D3D1D,DEc9H9nUN$2!2;;')"/	
 ||A}':':1'=mFZFZ[eefgh"oom,oW![[ 	M)-)$7	
 M	 6*+:H:T0446
 	
Z^
 	
rS   rr   )rZ   r[   r\   r]   r   r   r   _can_record_outputsr*   rN   rv   Moduler
  r  r   r   rl   r   rm   r   r   r   r   r~   rf   rg   s   @rR   r   r     s     %O(.
 $bii "))    /3;
'';
 t+;
 +,	;

 
(	(;
   ;
rS   r   c                       e Zd ZdZ eedd      e eedd      dZdef fdZ	e
e	 	 	 	 	 	 	 	 	 ddej                  d	z  d
ej                  d	z  dej                  d	z  ded	z  dej                   d	z  ded	z  dej                  d	z  dej                   d	z  dej                  d	z  dee   deez  fd              Z xZS )MoonshineDecoder	input_idsrX   r   )index
layer_namer   )r   r{   cross_attentionsrt   c           	         t         |   |       t        j                  |j                  d      | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _
        y c c}w NFr   )rM   rN   rv   r   r6   normr   r   r3   r   r   )rO   rt   r  rQ   s      rR   rN   zMoonshineDecoder.__init__V  s[     LL!3!3%@	mmSXY_YqYqSr$sC%:63%G$st$ss   A=Nrk   r   r,   r  r@   r   r   r   rP   r|   c
                    |du |duz  rt        d      || j                  |      }|r6|4t        t        | j                        t        | j                              }|F||j                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }t        | j                  ||	|      }	|}| j                  ||	      }| j                  D ]  } ||||f|	|||||d
|
} | j                  |      }t!        ||r|      S d      S )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        Nz:You must specify exactly one of input_ids or inputs_embedsr   r   rX   r  )rt   r  rk   r   r,   r   r  r  )r   r   r,   r@   r   r   )r  r,   )
ValueErrorembed_tokensr	   r   rt   get_seq_lengthrl   r  r   r  r  r   r   r  r   r'  r   )rO   r!  rk   r   r,   r  r@   r   r   r   rP   past_seen_tokenscausal_maskr{   r   decoder_layers                   rR   r~   zMoonshineDecoder.forward[  s   2 -t";<YZZ  --i8M01,dkk2RT`hlhshsTtuO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;'))+%
 ";;;'1"7	"
 &"oom,oW![[ 	M)% (>) /#-$7 M	 		-08+/8O
 	
>B
 	
rS   )	NNNNNNNNN)rZ   r[   r\   r   r   r   r   r  r*   rN   r   r   rl   r   rm   r   r   rd   r   r   r   r   r~   rf   rg   s   @rR   r   r   N  sK   !O$%7q[Y.*+=QSabu u
   .2.204(,26!%26:>6:N
##d*N
 t+N
 &&-	N

 N
 ((4/N
 $;N
 ((4/N
  %0047N
 !&t 3N
 +,N
 
(	(N
   N
rS   r   c                   n   e Zd Zd Zee	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  de	e	ej                        dz  de
dz  d	e	ej                     dz  d
e	ej                     dz  dedz  dej                  dz  dee   defd              Zy)MoonshineModelc                     t        d      )NzNot needed for Moonshine)AttributeErrorr	  s    rR   _mask_input_featuresz#MoonshineModel._mask_input_features  s    788rS   Nr   rk   decoder_input_idsdecoder_attention_maskencoder_outputsr,   decoder_inputs_embedsdecoder_position_idsr@   r   rP   r|   c                 V   | | j                   |fd|i|} | j                  d|||j                  |j                  ||||	|
d	|}t	        |j                  |j
                  |j                  |j                  |j                  |j                  |j                  |j                        S )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        rk   )	r!  rk   r   r   r,   r  r   r@   r   )r  r,   decoder_hidden_statesdecoder_attentionsr$  encoder_last_hidden_stater   encoder_attentionsrK   )	encoderdecoderr  rk   r   r,   r{   r   r$  )rO   r   rk   r4  r5  r6  r,   r7  r8  r@   r   rP   decoder_outputss                rR   r~   zMoonshineModel.forward  s    \ "/;t||L/rYg/rkq/rOEQT\\ F
'1"1"C"C#2#A#A+/-)F
 F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
rS   )
NNNNNNNNNN)rZ   r[   r\   r3  r   r   rl   r   r   r   r	   rd   r   r   r   r~   rK   rS   rR   r0  r0    sB   9  262659:>BF6:AE?C!%26E
''$.E
 ((4/E
 !++d2	E

 !& 0 04 7E
 uU%6%6784?E
 -t3E
  %U%6%67$>E
 $E$4$45<E
 $;E
 ((4/E
 +,E
 
E
  E
rS   r0  zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    c                       e Zd ZddiZdef fdZd Zd Zdej                  fdZ
ee	 	 	 	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  deeej                        d	z  ded	z  deej                     d	z  deej                     d	z  ded	z  dej                  d	z  dej                  d	z  dee   defd              Z xZS )!MoonshineForConditionalGenerationzproj_out.weightz!model.decoder.embed_tokens.weightrt   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y r&  )
rM   rN   r0  r   rv   rw   r6   r5   proj_outr  )rO   rt   rQ   s     rR   rN   z*MoonshineForConditionalGeneration.__init__  sH     #F+
		&"4"4f6G6GeT 	rS   c                     | j                   S rr   rD  r	  s    rR   get_output_embeddingsz7MoonshineForConditionalGeneration.get_output_embeddings  s    }}rS   c                     || _         y rr   rF  )rO   new_embeddingss     rR   set_output_embeddingsz7MoonshineForConditionalGeneration.set_output_embeddings  s	    &rS   r|   c                 6    | j                   j                         S rr   )r   r
  r	  s    rR   r
  z6MoonshineForConditionalGeneration.get_input_embeddings  s    zz..00rS   Nr   rk   r4  r5  r6  r,   r7  r8  r@   r   labelsrP   c                    |9|7|5t        || j                  j                  | j                  j                        } | j                  |f||||||||	|
d	|}| j                  |j                        }d}|(| j                  ||| j                  j                        }t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                   	      S )a0  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```N)	rk   r4  r6  r5  r,   r7  r8  r@   r   )logitsrL  r5   )	lossrN  r,   r:  r;  r$  r<  r   r=  )r(   rt   rG   r?   r   rD  r  loss_functionr5   r   r,   r:  r;  r$  r<  r   r=  )rO   r   rk   r4  r5  r6  r,   r7  r8  r@   r   rL  rP   outputsrN  rO  s                   rR   r~   z)MoonshineForConditionalGeneration.forward  s   f  (-B-J$6DKK44dkk6X6X%! '1djj'
)/+#9+"7!5)'
 '
 w889%%VFt{{OeOe%fD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
rS   )NNNNNNNNNNN)rZ   r[   r\   _tied_weights_keysr*   rN   rG  rJ  rv   r  r
  r   r   rl   r   r   r   r	   rd   r   r   r   r~   rf   rg   s   @rR   rB  rB    s    ,-PQ '1bii 1  262659:>BF6:AE?C!%26*.T
''$.T
 ((4/T
 !++d2	T

 !& 0 04 7T
 uU%6%6784?T
 -t3T
  %U%6%67$>T
 $E$4$45<T
 $;T
 ((4/T
   4'T
 +,T
 
T
  T
rS   rB  )r*   r0  r   rB  )Lcollections.abcr   dataclassesr   rl   torch.nnrv   activationsr   cache_utilsr   r   r	   configuration_utilsr
   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   r   glm.modeling_glmr!   r"   r#   llama.modeling_llamar$   r%   r&   whisper.modeling_whisperr'   r(   
get_loggerrZ   loggerr*   rj   r  rp   r   r   r   r   r   r   r   r   r0  rB  __all__rK   rS   rR   <module>rj     s   % !   ! C C 3 ) J B 9  2 F & R R 7 E U U Y Y G 
		H	%eJ& eJP 
// / /")) "))  	1 	k) k)\U- U"G6 GT # # #0c
/ c
L]
z ]
@K
\ K
\ 
j
(@/ j

j
ZrS   