
    qi                        d dl mZ d dlmZ d dlmZ d dlZd dlmZ ddl	m
Z
 ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZ ddl m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0m1Z1 ddl2m3Z3 e e*d       G d de                    Z4 G d dejj                        Z6 G d dejj                        Z7 G d dejj                        Z8d ejr                  d!e:d"ejr                  fd#Z;	 dEd$ejj                  d%ejr                  d&ejr                  d'ejr                  d(ejr                  dz  d)e<d*e<d+e'e)   fd,Z=d- Z>dFd.Z? ee?       G d/ d0ejj                               Z@ G d1 d2e      ZA G d3 d4e      ZBe* G d5 d6e%             ZC G d7 d8eC      ZDe* G d9 d:eC             ZEe* G d; d<eC             ZFd=ejr                  d>e:d?e:fd@ZG e*dA       G dB dCeCe             ZHg dDZIy)G    )Callable)	dataclass)OptionalN   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)use_kernelized_func)create_bidirectional_maskcreate_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )MoonshineConfigz
    Extends [~modeling_outputs.BaseModelOutput] to include the output attention mask since sequence length is not preserved in the model's forward.
    )custom_introc                   6    e Zd ZU dZej
                  dz  ed<   y)MoonshineEncoderModelOutputNattention_mask)__name__
__module____qualname__r'   torchTensor__annotations__     b/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/moonshine/modeling_moonshine.pyr&   r&   3   s     +/NELL4'.r/   r&   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineEncoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                        | _	        t        j                  |j                  |j                        | _
        y Nsuper__init__configr   activation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr8   
hidden_act	__class__s      r0   r7   zMoonshineEncoderMLP.__init__>   s^    #J/99V//1I1IJ99V55v7I7IJr/   hidden_statesreturnc                 l    | j                  |      }| j                  |      }| j                  |      }|S r4   )r>   r9   r?   )rA   rD   s     r0   forwardzMoonshineEncoderMLP.forwardE   s4    /**=9/r/   r(   r)   r*   r7   r+   r,   rG   __classcell__rC   s   @r0   r2   r2   =   s$    KU\\ ell r/   r2   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineDecoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                  dz        | _	        t        j                  |j                  |j                        | _
        y )N   r5   r@   s      r0   r7   zMoonshineDecoderMLP.__init__M   sc    #J/99V//1I1IA1MN99V55v7I7IJr/   rD   rE   c                     | j                  |      }|j                  dd      \  }}| j                  |      |z  }| j                  |      }|S )NrN   dim)r>   chunkr9   r?   )rA   rD   gates      r0   rG   zMoonshineDecoderMLP.forwardT   sS    /+11!1<t**40=@/r/   rH   rJ   s   @r0   rL   rL   L   s$    KU\\ ell r/   rL   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )MoonshineRotaryEmbeddinginv_freqNr8   c                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultrW   F)
persistentoriginal_inv_freq)r6   r7   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr8   rope_parametersrY   compute_default_rope_parametersr   attention_scalingregister_bufferclone)rA   r8   devicerope_init_fnrW   rC   s        r0   r7   z!MoonshineRotaryEmbedding.__init___   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuUr/   re   ztorch.deviceseq_lenrE   ztorch.Tensorc                 n   | j                   d   }| j                   j                  dd      }t        | dd      xs | j                  | j                  z  }t        ||z        }d}d|t        j                  d|dt        j                        j                  |t        j                  	      |z  z  z  }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorg      ?head_dimNr   rN   dtype)re   rm   )r`   getgetattrr<   num_attention_headsintr+   arangeint64tofloat)	r8   re   rg   baserj   rk   rR   attention_factorrW   s	            r0   ra   z8MoonshineRotaryEmbedding.compute_default_rope_parameterso   s    & %%l3 & 6 6 : :;RTW X6:t4h8J8JfNhNh8h(223 U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r/   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rP   r"   mpscpuF)device_typeenabledrN   rQ   rl   )rW   ru   expandshapert   re   
isinstancetypestrr   	transposer+   catcosrb   sinrm   )
rA   xposition_idsinv_freq_expandedposition_ids_expandedr{   freqsembr   r   s
             r0   rG   z MoonshineRotaryEmbedding.forward   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$r4   )NNN)r(   r)   r*   r+   r,   r-   r#   r7   staticmethodr   rq   tupleru   ra   no_gradr   rG   rI   rJ   s   @r0   rV   rV   \   s    llV V  )-+/"*$&*(* t* 
~u$	%	* *> U]]_<  <r/   rV   rD   n_reprE   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r"   N)r~   r}   reshape)rD   r   batchnum_key_value_headsslenrk   s         r0   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr/   modulequerykeyvaluer'   scalingdropoutkwargsc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
dt        j                        j                  |j                        }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )NrN   r   rP   )rR   rm   )ptrainingr"   )r   num_key_value_groupsr+   matmulr   r:   
functionalsoftmaxfloat32rt   rm   r   r   
contiguous)r   r   r   r   r'   r   r   r   
key_statesvalue_statesattn_weightsattn_outputs               r0   eager_attention_forwardr      s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r/   c                 |    | ddddf   }| ddddf   }t        j                  | |fd      j                  d      S )	z*Rotates half the hidden dims of the input..r   NrN   r"   rP   rQ   )r+   stackflatten)r   x1x2s      r0   rotate_halfr      sJ    	
319B	
319B;;Ryb)11"55r/   c                    |j                  |      }|j                  |      }|dd|j                  d   dz  f   j                  dd      }|dd|j                  d   dz  f   j                  dd      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }	}||z  t        |      |z  z   }
||z  t        |      |z  z   }t	        j
                  |
|gd      }
t	        j
                  ||	gd      }|
|fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .NrP   rN   rQ   )	unsqueezer~   repeat_interleaver   r+   r   )qkr   r   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds               r0   apply_rotary_pos_embr      sD   $ --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr/   c                   l    e Zd ZdZdededededef
 fdZ	 	 	 	 	 dd	ej                  d
e
ej                  ej                  f   dz  dej                  dz  dedz  dej                  dz  dej                  dz  dee   de
ej                  ej                  dz  e
ej                     dz  f   fdZ xZS )MoonshineAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr8   	layer_idx	is_causalrp   r   c                 8   t         |           |j                  ||d       || _        || _        t        |d|j                  |j                  z        | _        |j                  |j                  z  | _
        | j                  dz  | _        |j                  | _        || _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  | j                  z  |j                  d      | _        | j                  j*                  C| j                  j*                  }|| j                  |z   dz
  |z  z  }|| j                  z
  | _        y d| _        y )N)rp   r   rk   g      ࿩biasFr"   r   )r6   r7   updater8   r   ro   r<   rp   rk   r   r   r   attention_dropoutr   r:   r;   attention_biasq_projk_projv_projo_projpad_head_dim_to_multiple_ofhead_dim_padding)	rA   r8   r   r   rp   r   target_multipletarget_head_dimrC   s	           r0   r7   zMoonshineAttention.__init__   s    	.AZmno"
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9"ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JFL^L^ejk ;;22>"kkEEO-$--/2QTU2UZi1ijO$3dmm$CD!$%D!r/   NrD   position_embeddingsr'   past_key_valuescache_positionkey_value_statesr   rE   c                 N   |j                   d d \  }}	| j                  |      j                  ||	| j                  j                  | j
                        j                  dd      }
|d u}|Y|j                  j                  | j                        }|r&d|j                  | j                  <   |j                  }n|j                  }||n|}|rK|rIrG|j                  | j                     j                  }|j                  | j                     j                  }n| j                  |      j                  |d| j                  j                  | j
                        j                  dd      }| j!                  |      j                  |d| j                  j                  | j
                        j                  dd      }|r%|#|j#                  ||| j                  d|i      \  }}|s?|\  }}t%        |
|||      \  }
}|'|||d}|j#                  ||| j                  |      \  }}t'        j(                  | j                  j*                  t,              }| j.                  xr |d u xr |	dkD  }| j0                  dkD  rt2        j4                  j6                  j9                  |
d| j0                  f      }
t2        j4                  j6                  j9                  |d| j0                  f      }t2        j4                  j6                  j9                  |d| j0                  f      } || |
|||f| j:                  sdn| j<                  | j>                  |d	|\  }}| j0                  dkD  r|d
d | j0                   f   }|jA                  ||	d      jC                         }| jE                  |      }||fS )NrP   r"   rN   Tr   )r   r   r   r           )r   r   r   .)#r~   r   viewr8   r   rk   r   
is_updatedrn   r   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   r   r   r   get_interface_attn_implementationr   r   r   r+   r:   r   padr   r   r   r   r   r   )rA   rD   r   r'   r   r   r   r   bszq_lenquery_statesis_cross_attentionr   current_statesr   r   r   r   cache_kwargsattention_interfacer   r   r   s                          r0   rG   zMoonshineAttention.forward  sy    #(("-
U KK&++C8W8WY]YfYfgqqrsuvw 	 .T9&(3377GJ!=A**4>>:"1"G"G"1"F"F .>-I)}/j(//?DDJ*11$..AHHL N+c2t{{>>N1a  N+c2t{{>>N1a 
 "o&A+:+A+Adnn?OQ_>`,(
L "*HC';L*VY[^'_$L**'*3.Y+:+A+Adnnl,(
L )@(M(MKK,,.E)
 NNK~'=K%!)	  1$ 88..22<!TEZEZA[\L,,00aAVAV=WXJ 88..22<!TEZEZA[\L$7
%
  $}}C$2H2HLL
%
 
%
!\   1$%c+Cd.C.C-C+C&CDK!))#ub9DDFkk+.L((r/   )NNNNN)r(   r)   r*   __doc__r#   rq   boolr7   r+   r,   r   r   
LongTensorr   r   rG   rI   rJ   s   @r0   r   r      s   G#&#& #& 	#&
 !#& !#&P IM.2(,2604U)||U) #5<<#=>EU) t+	U)
 U) ((4/U)  ,,-U) -.U) 
u||U\\D0%2E2LL	MU)r/   r   c                   "    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
ej                  dz  deej                  ej                  f   dz  dee   dej                  fdZ xZS )MoonshineEncoderLayerr8   r   c                 d   t         |           |j                  | _        t        ||d|j                  |j
                        | _        t        ||j                        | _	        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NFr8   r   r   rp   r   r   )r6   r7   r<   r   encoder_num_attention_headsencoder_num_key_value_heads	self_attnr2   encoder_hidden_actmlpr:   	LayerNorminput_layernormpost_attention_layernormrA   r8   r   rC   s      r0   r7   zMoonshineEncoderLayer.__init__u  s    !--+ & B B & B B
 'vv/H/HI!||F,>,>UK(*V5G5Ge(T%r/   NrD   r'   r   r   	use_cacher   r   r   rE   c                     |}	| j                  |      } | j                  d|||||||d|\  }}
|	|z   }|}	| j                  |      }| j                  |      }|	|z   }|S )NrD   r'   r   r   r   r   r   r.   )r   r   r   r   )rA   rD   r'   r   r   r   r   r   r   residual_s              r0   rG   zMoonshineEncoderLayer.forward  s     !,,];)4>> 	
')%+) 3	
 	
q !=0 !55mD/ =0r/   )NNNFNN)r(   r)   r*   r#   rq   r7   r+   r,   r   r   r   r   r   r   rG   rI   rJ   s   @r0   r   r   t  s    U U3 U& /304(,!&26HL|| t+ &&-	
  $; ((4/ #5<<#=>E +, 
r/   r   c                   
    e Zd Zddededz  f fdZ	 	 	 	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  de	dz  de
dz  dej                  dz  deej                  ej                  f   dz  deej                  ej                  f   dz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )MoonshineDecoderLayerNr8   r   c                    t         |           |j                  | _        t        ||d|j                  |j
                        | _        t        ||d|j                  |j
                        | _        t        ||j                        | _
        t        j                  |j                  d      | _        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NTr   Fr   )r6   r7   r<   r   rp   r   r   encoder_attnrL   rB   r   r:   r   r   r   final_layernormr   s      r0   r7   zMoonshineDecoderLayer.__init__  s    !--+ & : : & : :
 / & : : & : :
 'vv/@/@A!||F,>,>UK(*V5G5Ge(T%!||F,>,>UKr/   rD   r'   encoder_hidden_statesencoder_attention_maskr   encoder_position_idsr   r   r   r   encoder_position_embeddingsr   rE   c                 (   |}| j                  |      } | j                  d||||||	|
d|\  }}||z   }|1|}| j                  |      }| j                  |||||      \  }}||z   }|}| j	                  |      }| j                  |      }||z   }|S )Nr   )rD   r   r'   r   r   r.   )r   r   r   r  r  r   )rA   rD   r'   r  r  r   r  r   r   r   r   r  r   r   r   s                  r0   rG   zMoonshineDecoderLayer.forward  s     !,,];)4>> 	
')%+) 3	
 	
q !=0 ,$H 99-HM#00+!65 /#  1  M1 %}4M ,,];/ =0r/   r4   )
NNNNNNFNNN)r(   r)   r*   r#   rq   r7   r+   r,   r   r   r   r   r   r   FloatTensorrG   rI   rJ   s   @r0   r   r     sj   L L3: L6 /3596:048<(,!&26HLPT.||. t+.  %||d2	.
 !&t 3. &&-. $..5. . $;. ((4/. #5<<#=>E. &+5<<+E%F%M. +,. 
u  %(9(95;L;L(L"MPT"TT	U.r/   r   c                   \    e Zd ZU eed<   dZdZdZdZddgZ	dZ
dZdZdej                  fd	Zy
)MoonshinePreTrainedModelr8   modelinput_valuesaudioTr   r   input_lengthsc                 ~    t        |dz
  dz  dz         }t        |dz
  dz  dz         }t        |dz
  dz  dz         }|S )zH
        Computes the output length of the convolutional layers
           @   r"      r   rN   )rq   )rA   r  output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengths        r0    _get_feat_extract_output_lengthsz9MoonshinePreTrainedModel._get_feat_extract_output_lengths  sZ     "=3#6""<q"@A!#6#:a"?!"CD!#6#:a"?!"CD""r/   N)r(   r)   r*   r#   r-   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphr+   r   r  r.   r/   r0   r
  r
    sN    $O&*#02IJN!#e>N>N #r/   r
  c                        e Zd ZdZdZeedZdef fdZ	de
j                  fdZde
j                  fd	Zee	 ddej"                  dej$                  d
z  dee   deez  fd              Z xZS )MoonshineEncoderz
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    r  )
attentionsrD   r8   c           	      b   t         |   |       || _        |j                  }t	        j
                  d|ddd      | _        t	        j
                  |d|z  dd	      | _        t	        j
                  d|z  |dd	      | _        t	        j                  d|d
      | _
        t	        j                  t        |j                        D cg c]  }t        ||       c}      | _        t	        j                   |d      | _        t%        |      | _        d| _        | j+                          y c c}w )Nr"   r  r  F)kernel_sizestrider   rN   r  r   )r#  r$  gh㈵>)
num_groupsnum_channelsepsr   r8   )r6   r7   r8   r<   r:   Conv1dconv1conv2conv3	GroupNorm	groupnorm
ModuleListrangeencoder_num_hidden_layersr   r   r   
layer_normrV   
rotary_embgradient_checkpointing	post_init)rA   r8   	embed_dimidxrC   s       r0   r7   zMoonshineEncoder.__init__  s     &&	YYq)ReT
YYy!i-QqQ
YYq9}iQqQ
PTUmm;@AaAa;bcC"63/c
 ,,yu=2&A&+# ds   D,rE   c                     | j                   S r4   r*  rA   s    r0   get_input_embeddingsz%MoonshineEncoder.get_input_embeddings*  s    zzr/   r   c                     || _         y r4   r9  rA   r   s     r0   set_input_embeddingsz%MoonshineEncoder.set_input_embeddings-  s	    
r/   Nr'   r   c                 d   |j                  d      }t        j                  j                  | j	                  |            }| j                  |      }t        j                  j                  | j                  |            }t        j                  j                  | j                  |            }|j                  ddd      }|3| j                  |j                  d         }d}|ddd|f   dd|f   }|}t        | j                  |||      }t        j                  d|j                  d   |j                   	      j                  d      }| j#                  ||
      }	| j$                  D ]  }
 |
|f|||	d|} | j'                  |      }t)        ||j+                               S d      S )a.  
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
                `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
                the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        r"   r   rN   NrP   i  .r8   inputs_embedsr'   r  re   r   )r'   r   r   )last_hidden_stater'   )r   r:   r   tanhr*  r.  gelur+  r,  permuter  r~   r   r8   r+   rr   re   r3  r   r2  r&   rq   )rA   r  r'   r   rD   mask_lendownsample_strideoutput_attention_maskr   r   encoder_layers              r0   rG   zMoonshineEncoder.forward0  s   . $--a0**4::l+CD}5**4::m+DE**4::m+DE%--aA6 %<<^=Q=QRT=UVH *+C1D3D1D,DEc9H9nUN$2!2;;')"/	
 ||A}':':1'=mFZFZ[eefgh"oom,oW![[ 	M)-)$7	
 M	 6*+:H:T0446
 	
Z^
 	
r/   r4   )r(   r)   r*   r   r  r   r   _can_record_outputsr#   r7   r:   Moduler;  r>  r   r!   r+   r  r,   r   r   r   r   rG   rI   rJ   s   @r0   r   r   
  s     %O(.
 $bii "))    /3;
'';
 t+;
 +,	;

 
(	(;
   ;
r/   r   c                       e Zd ZdZ eedd      e eedd      dZdef fdZ	e
e	 	 	 	 	 	 	 	 	 ddej                  d	z  d
ej                  d	z  dej                  d	z  ded	z  dej                   d	z  ded	z  dej                  d	z  dej                   d	z  dej                  d	z  dee   deez  fd              Z xZS )MoonshineDecoder	input_idsr"   r   )index
layer_namer  )r!  rD   cross_attentionsr8   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j                  d      | _        t!        |      | _        d| _        | j'                          y c c}w )NFr   r(  )r6   r7   pad_token_idpadding_idx
vocab_sizer:   	Embeddingr<   embed_tokensr/  r0  num_hidden_layersr   r   r   normrV   r3  r4  r5  )rA   r8   r7  rC   s      r0   r7   zMoonshineDecoder.__init__y  s     !.. ++LL):):F<N<NPTP`P`ammSXY_YqYqSr$sC%:63%G$stLL!3!3%@	2&A&+# 	 %ts   DNr'   r   r   rA  r   r   r  r  r   rE   c
                    |du |duz  rt        d      || j                  |      }|r6|4t        t        | j                        t        | j                              }|F||j                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }t        | j                  ||	|      }	|}| j                  ||	      }| j                  D ]  } ||||f|	|||||d
|
} | j                  |      }t!        ||r|      S d      S )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        Nz:You must specify exactly one of input_ids or inputs_embedsr(  r   r"   rB  )r8   rA  r'   r   r   r   r@  rC  )r  r   r   r   r   r   )rD  r   )
ValueErrorrY  r
   r	   r8   get_seq_lengthr+   rr   r~   re   r   r   r   r3  r   r[  r   )rA   rP  r'   r   r   rA  r   r   r  r  r   past_seen_tokenscausal_maskrD   r   decoder_layers                   r0   rG   zMoonshineDecoder.forward  s   2 -t";<YZZ  --i8M01,dkk2RT`hlhshsTtuO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;'))+%
 ";;;'1"7	"
 &"oom,oW![[ 	M)% (>) /#-$7 M	 		-08+/8O
 	
>B
 	
r/   )	NNNNNNNNN)r(   r)   r*   r  r    r   r   rL  r#   r7   r   r!   r+   r   r,   r   r  r   r   r   r   r   rG   rI   rJ   s   @r0   rO  rO  p  sI   !O$%7q[Y.*+=QSab    .2.204(,26!%26:>6:N
##d*N
 t+N
 &&-	N

 N
 ((4/N
 $;N
 ((4/N
  %0047N
 !&t 3N
 +,N
 
(	(N
   N
r/   rO  c                       e Zd Zdef fdZd Zd Zd Zd Ze	e
	 	 	 	 	 	 	 	 	 	 ddej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  deeej                        dz  dedz  deej                     dz  deej                     dz  dedz  dej                  dz  dee   defd              Z xZS )MoonshineModelr8   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r4   )r6   r7   r   encoderrO  decoderr5  rA   r8   rC   s     r0   r7   zMoonshineModel.__init__  s2     '/'/r/   c                 .    | j                   j                  S r4   rf  rY  r:  s    r0   r;  z#MoonshineModel.get_input_embeddings  s    ||(((r/   c                 &    || j                   _        y r4   ri  r=  s     r0   r>  z#MoonshineModel.set_input_embeddings  s    $)!r/   c                 8    | j                   j                          y)z
        Calling this function will disable the gradient computation for the Moonshine encoder so that its parameters will
        not be updated during training.
        N)re  _freeze_parametersr:  s    r0   freeze_encoderzMoonshineModel.freeze_encoder  s    
 	'')r/   c                     t        d      )z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://huggingface.co/papers/1904.08779).
        zNot needed for Moonshine)AttributeErrorr:  s    r0   _mask_input_featuresz#MoonshineModel._mask_input_features  s    
 788r/   Nr  r'   decoder_input_idsdecoder_attention_maskencoder_outputsr   decoder_inputs_embedsdecoder_position_idsr   r   r   rE   c                 V   | | j                   |fd|i|} | j                  d|||j                  |j                  ||||	|
d	|}t	        |j                  |j
                  |j                  |j                  |j                  |j                  |j                  |j                        S )a
  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        r'   )	rP  r'   r  r  r   rA  r   r   r   )rD  r   decoder_hidden_statesdecoder_attentionsrS  encoder_last_hidden_stater  encoder_attentionsr.   )	re  rf  rD  r'   r   r   rD   r!  rS  )rA   r  r'   rq  rr  rs  r   rt  ru  r   r   r   decoder_outputss                r0   rG   zMoonshineModel.forward  s    \ "/;t||L/rYg/rkq/rOEQT\\ F
'1"1"C"C#2#A#A+/-)F
 F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r/   )
NNNNNNNNNN)r(   r)   r*   r#   r7   r;  r>  rm  rp  r   r   r+   r  r   r   r
   r   r   r   r   rG   rI   rJ   s   @r0   rc  rc    s^    )**9  262659:>BF6:AE?C!%26E
''$.E
 ((4/E
 !++d2	E

 !& 0 04 7E
 uU%6%6784?E
 -t3E
  %U%6%67$>E
 $E$4$45<E
 $;E
 ((4/E
 +,E
 
E
  E
r/   rc  rP  rU  decoder_start_token_idc                     | j                  | j                        }| ddddf   j                         |ddddf<   ||dddf<   |t        d      |j	                  |dk(  |       |S )z1
    Shift input ids one token to the right.
    NrP   r"   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosr~   rd   r]  masked_fill_)rP  rU  r|  shifted_input_idss       r0   shift_tokens_rightr  B  s}     "++IOO<(CRC0668ae4adLMM""#4#<lKr/   zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    c                       e Zd ZddiZdef fdZd Zd Zdej                  fdZ
ee	 	 	 	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  dej                  d	z  deeej                        d	z  ded	z  deej                     d	z  deej                     d	z  ded	z  dej                  d	z  dej                  d	z  dee   defd              Z xZS )!MoonshineForConditionalGenerationzproj_out.weightz!model.decoder.embed_tokens.weightr8   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFr   )
r6   r7   rc  r  r:   r;   r<   rW  proj_outr5  rg  s     r0   r7   z*MoonshineForConditionalGeneration.__init__Z  sH     #F+
		&"4"4f6G6GeT 	r/   c                     | j                   S r4   r  r:  s    r0   get_output_embeddingsz7MoonshineForConditionalGeneration.get_output_embeddingsb  s    }}r/   c                     || _         y r4   r  )rA   new_embeddingss     r0   set_output_embeddingsz7MoonshineForConditionalGeneration.set_output_embeddingse  s	    &r/   rE   c                 6    | j                   j                         S r4   )r  r;  r:  s    r0   r;  z6MoonshineForConditionalGeneration.get_input_embeddingsh  s    zz..00r/   Nr  r'   rq  rr  rs  r   rt  ru  r   r   labelsr   c                    |9|7|5t        || j                  j                  | j                  j                        } | j                  |f||||||||	|
d	|}| j                  |j                        }d}|(| j                  ||| j                  j                        }t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                   	      S )a0  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
            `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`) or
            the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```N)	r'   rq  rs  rr  r   rt  ru  r   r   )logitsr  rW  )	lossr  r   rw  rx  rS  ry  r  rz  )r  r8   rU  r|  r  r  rD  loss_functionrW  r   r   rw  rx  rS  ry  r  rz  )rA   r  r'   rq  rr  rs  r   rt  ru  r   r   r  r   outputsr  r  s                   r0   rG   z)MoonshineForConditionalGeneration.forwardk  s   f  (-B-J$6DKK44dkk6X6X%! '1djj'
)/+#9+"7!5)'
 '
 w889%%VFt{{OeOe%fD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r/   )NNNNNNNNNNN)r(   r)   r*   _tied_weights_keysr#   r7   r  r  r:   rM  r;  r   r   r+   r  r   r   r
   r   r   r   r   rG   rI   rJ   s   @r0   r  r  R  s    ,-PQ '1bii 1  262659:>BF6:AE?C!%26*.T
''$.T
 ((4/T
 !++d2	T

 !& 0 04 7T
 uU%6%6784?T
 -t3T
  %U%6%67$>T
 $E$4$45<T
 $;T
 ((4/T
   4'T
 +,T
 
T
  T
r/   r  )rc  r
  r  )r   )r"   )Jcollections.abcr   dataclassesr   typingr   r+   torch.nnr:   activationsr   cache_utilsr   r	   r
   
generationr   integrationsr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr    r!   configuration_moonshiner#   r&   rM  r2   rL   rV   r,   rq   r   ru   r   r   r   r   r   r   r
  r   rO  rc  r  r  __all__r.   r/   r0   <module>r     s^  * % !    ! C C ) / J B 9  L F & I I G E 4 
// / /")) "))  @<ryy @<F	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%26%P )*}) }) +})@06 0fG6 GT # # #0c
/ c
L f
/ f
 f
R d
- d
 d
N%,, c [^   
j
(@/ j

j
Z ^r/   