
    qi                        d dl mZ d dlmZmZ d dlZd dlmc mZ	 d dlmZ ddl
mZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z.  e)       r	d dl/m0Z0m1Z1 nd\  Z0Z1 ed       G d dejd                               Z3 G d dejd                        Z4 G d dejd                        Z5 G d d      Z6d  Z7 ed!      d>d"       Z8d#ejr                  d$e:d%ejr                  fd&Z;	 d?d'ejd                  d(ejr                  d)ejr                  d*ejr                  d+ejr                  dz  d,e<d-e<d.e e"   fd/Z= ee8       G d0 d1ejd                               Z>d2 Z?e0e1fZ@ eAe@      ZB G d3 d4ejd                        ZC G d5 d6e      ZDe# G d7 d8e             ZEe# G d9 d:eE             ZFe# G d; d<eEe             ZGg d=ZHy)@    )Callable)AnyOptionalN)nn   )Cache)GenerationMixin)use_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)maybe_autocastmerge_with_config_defaults)is_causal_conv1d_availableis_torchdynamo_compiling)capture_outputs   )
Lfm2Config)causal_conv1d_fncausal_conv1d_updateNNRMSNormc                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	Lfm2RMSNormepsreturnNc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z:
        Lfm2RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer&   	__class__s      X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/lfm2/modeling_lfm2.pyr*   zLfm2RMSNorm.__init__3   s1     	ll5::k#:; #    hidden_statesc                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor,   float32powmeanrsqrtr/   r.   )r0   r5   input_dtypevariances       r3   forwardzLfm2RMSNorm.forward;   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r4   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler.   shaper/   r0   s    r3   
extra_reprzLfm2RMSNorm.extra_reprB   s*    ))*+6$2G2G1HIIr4   )gư>)
__name__
__module____qualname__floatr*   r,   TensorrB   rG   __classcell__r2   s   @r3   r%   r%   1   s7    $ $$ $;U\\ ;ell ;Jr4   r%   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )Lfm2RotaryEmbeddinginv_freqNconfigc                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultrQ   F)
persistentoriginal_inv_freq)r)   r*   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrR   rope_parametersrT   compute_default_rope_parametersr   attention_scalingregister_bufferclone)r0   rR   devicerope_init_fnrQ   r2   s        r3   r*   zLfm2RotaryEmbedding.__init__I   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuUr4   r`   ztorch.deviceseq_lenr'   ztorch.Tensorc                    | j                   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   r7   r:   r`   r:   )	r[   getattrr1   num_attention_headsr,   arangeint64r;   rK   )rR   r`   rb   basedimattention_factorrQ   s          r3   r\   z3Lfm2RotaryEmbedding.compute_default_rope_parametersY   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r4   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r8   r   mpscpuF)device_typeenabledr7   rm   rf   )rQ   rK   expandrE   r;   r`   
isinstancetypestrr   	transposer,   catcosr]   sinr:   )
r0   xposition_idsinv_freq_expandedposition_ids_expandedrr   freqsembr{   r|   s
             r3   rB   zLfm2RotaryEmbedding.forwardw   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$NNNN)rH   rI   rJ   r,   rL   __annotations__r   r*   staticmethodr   intrD   rK   r\   no_gradr   rB   rM   rN   s   @r3   rP   rP   F   s    llVz V  $(+/"*T!*(* t* 
~u$	%	* *: U]]_<  <r4   rP   c                   *     e Zd Zdef fdZd Z xZS )Lfm2MLPrR   c                    t         |           |j                  }|j                  rat	        d|z  dz        }|j
                  Dt	        |j
                  |z        }|j                  ||j                  z   dz
  |j                  z  z  }t        j                  |j                  |d      | _
        t        j                  |j                  |d      | _        t        j                  ||j                  d      | _        y )Nr7   r   r   Fbias)r)   r*   intermediate_sizeblock_auto_adjust_ff_dimr   block_ffn_dim_multiplierblock_multiple_ofr   Linearr1   w1w3w2)r0   rR   r   r2   s      r3   r*   zLfm2MLP.__init__   s    "44** #A(9$9A$= >..:$'(G(GJ[([$\!$*$<$<&)A)AAAE&JbJbb%! ))F..0AN))F..0AN))-v/A/ANr4   c                     | j                  t        j                  | j                  |            | j	                  |      z        S r   )r   Fsilur   r   )r0   r}   s     r3   rB   zLfm2MLP.forward   s/    wwqvvdggaj)DGGAJ677r4   )rH   rI   rJ   r   r*   rB   rM   rN   s   @r3   r   r      s    Oz O8r4   r   c                      e Zd ZdZdZdZdZdZej                  dfde
dedej                  dej                  ez  dz  fdZ	 dd	ej                   d
ej                   dedeeef   dz  deej                   ej                   f   f
dZdej*                  fdZddedz  defdZdej                   dedeeef   fdZdefdZdefdZd Zy)Lfm2HybridConvCachea  
    Attention and conv cache for Lfm2.

    It stores the Key and Value states as a list of tensors, one for each layer.
    Attention layer cache shape: `[batch_size, num_heads, seq_len, head_dim]`.
    Conv layer cache shape: `[batch_size, hidden_size, L_cache-1]`.
    NFrR   max_batch_sizer:   r`   c                    g | _         g | _        || _        |j                  | _        | j                  j	                  d      | _        |j                  | _        || _        g | _        |t        j                  |      nd }t        |j                        D ]  }t        j                  | j                  |j                  | j                  | j                  |      }| j                  j                  |       | j                   j                  t        j                   g              | j                  j                  t        j                   g               y )Nfull_attention)r:   r`   )	key_cachevalue_cacher   layer_typesindexfirst_attention_layerconv_L_cache_dtype
conv_cacher,   r`   rangenum_hidden_layerszerosr1   appendtensor)r0   rR   r   r:   r`   _
conv_states          r3   r*   zLfm2HybridConvCache.__init__   s    ,!--%)%5%5%;%;<L%M""//.0)/);f%v//0 
	6A##""!!kkJ OO"":.NN!!%,,r"23##ELL$45
	6r4   
key_statesvalue_states	layer_idxcache_kwargsr'   c                    | j                   |   j                         dk(  r|| j                   |<   || j                  |<   nft        j                  | j                   |   |gd      | j                   |<   t        j                  | j                  |   |gd      | j                  |<   | j                   |   | j                  |   fS )a  
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`torch.Tensor`):
                The new key states to cache.
            value_states (`torch.Tensor`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.

        Return:
            A tuple containing the updated key and value states.
        r   rt   )r   numelr   r,   rz   )r0   r   r   r   r   s        r3   updatezLfm2HybridConvCache.update   s    0 >>)$**,1(2DNN9%*6DY'(-		4>>)3Lj2Y_a(bDNN9%*/))T5E5Ei5PR^4_eg*hDY'~~i($*:*:9*EEEr4   beam_idxc                    t        t        | j                              D ]:  }| j                  |   j                         r| j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                         s| j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   = y)zDReorders the cache for beam search, given the selected beam indices.r   N)	r   lenr   r   r`   index_selectr;   r   r   )r0   r   r   r`   s       r3   reorder_cachez!Lfm2HybridConvCache.reorder_cache   s#   s4>>23 		mI~~i(..0	299,0NN9,E,R,RSTV^VaVabhVi,jy))))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +y)//13::-1__Y-G-T-TUVX`XcXcdjXk-l	*		mr4   c                     | j                   |   dk7  r| j                  n|}t        | j                        |k  s | j                  |   j	                         dk(  ry| j                  |   j
                  d   S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   r   r   )r   r   r   r   r   rE   r0   r   s     r3   get_seq_lengthz"Lfm2HybridConvCache.get_seq_length   sm     372B2B92MQa2aD..gp	t~~)+t~~i/H/N/N/PTU/U~~i(..r22r4   cache_positionc                 V    d}|j                   d   }| j                         }||z   }||fS )aB  
        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
        the given layer at `layer_idx`.
        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
        for each layer.
        r   )rE   r   )r0   r   r   full_mask_kv_offsetquery_lengthpast_seen_tokens	kv_lengths          r3   get_mask_sizesz"Lfm2HybridConvCache.get_mask_sizes   s@      %++A...0 #33	---r4   
max_lengthc                    |dk  r| j                         t        |      z
  }| j                         |k  ryt        t        | j                              D ]l  }| j                  |   j                         s!| j                  |   dd|ddf   | j                  |<   | j                  |   dd|ddf   | j                  |<   n y)z"Crop the cache to the given lengthr   N.)r   absr   r   r   r   r   )r0   r   idxs      r3   cropzLfm2HybridConvCache.crop  s    >,,.Z@J J.T^^,- 	SC~~c"((*&*nnS&9#{
{A:M&Ns#(,(8(8(=c;J;PQ>Q(R  %	Sr4   c                 ,    t        | j                        S r   )r   r   rF   s    r3   __len__zLfm2HybridConvCache.__len__  s    4>>""r4   c                     t        t        | j                              D ]  }| j                  |   j                          ! y r   )r   r   r   zero_r   s     r3   resetzLfm2HybridConvCache.reset  s4    s4??34 	/IOOI&,,.	/r4   r   )r   )rH   rI   rJ   __doc__r   is_compileabler   r   r,   r<   r   r   r:   r`   rx   r*   rL   dictr   rD   r   
LongTensorr   r   r   r   r   r    r4   r3   r   r      sB    NNIK #]],066 6 {{	6
 s"T)6F /3FLLF llF 	F
 38nt+F 
u||U\\)	*FBme&6&6 m3d
 33 3.U\\ .c .eTWY\T\o .Ss S# #/r4   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr8   r7   rt   )rE   r,   rz   )r}   x1x2s      r3   rotate_halfr   !  sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r4   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr{   r|   unsqueeze_dimq_embedk_embeds          r3   apply_rotary_pos_embr   (  sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGr4   r5   n_repr'   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rE   ru   reshape)r5   r   batchnum_key_value_headsslenre   s         r3   	repeat_kvr   B  so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr4   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
dt        j                        j                  |j                        }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr7   r   r8   )rm   r:   )ptrainingr   )r   num_key_value_groupsr,   matmulry   r   
functionalsoftmaxr<   r;   r:   r   r   
contiguous)r   r   r   r   r   r   r   r   r   r   attn_weightsattn_outputs               r3   eager_attention_forwardr   N  s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r4   c                   
    e Zd ZdZdedef fdZ	 	 ddej                  de	ej                  ej                  f   dej                  dz  d	e
dz  d
ej                  dz  de	ej                  ej                  dz  f   fdZ xZS )Lfm2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrR   r   c                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        d| _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j
                  d      | _        t%        | j                  |j&                        | _        t%        | j                  |j&                        | _        y )Nre   g      TFr   r&   )r)   r*   rR   r   rh   r1   ri   re   r   r   r   	is_causalr   r   q_projk_projv_projout_projr%   norm_epsq_layernormk_layernormr0   rR   r   r2   s      r3   r*   zLfm2Attention.__init__k  sL   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejk		&"<"<t}}"LfN`N`glm&t}}&//J&t}}&//Jr4   Nr5   position_embeddingsr   past_key_valuesr   r'   c                    |j                   d d }g |d| j                  }| j                   | j                  |      j                  |       j                  dd      }	| j                   | j                  |      j                  |       j                  dd      }
 | j                  |      j                  | j                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t               } || |	|
||fd| j"                  d|\  }} |j$                  g |d j'                         }| j)                  |      }||fS )Nr8   r   r7   )r|   r{   r           )r   r   )rE   re   r  r   viewry   r  r   r  r   r   r   r   get_interfacerR   _attn_implementationr   r   r   r   r  )r0   r5   r  r   r  r   r   input_shapehidden_shapequery_statesr   r   r{   r|   r   attention_interfacer   r   outputs                      r3   rB   zLfm2Attention.forwardz  s    $))#2.88b8$--8''(GM(B(G(G(VWaabcefg%%&Edkk-&@&E&E|&TU__`acde
6t{{=166EOOPQSTU&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
 LL	%
 	%
!\ *k));;;;FFH{+|##r4   r"   )rH   rI   rJ   r   r   r   r*   r,   rL   rD   r   r   rB   rM   rN   s   @r3   r   r   g  s    GKz Kc K( 7;26'$||'$ #5<<#=>'$ t+	'$
 -t3'$ ((4/'$ 
u||U\\D00	1'$r4   r   c                     |N|j                   d   dkD  r<|j                   d   dkD  r*| j                  }| |dddddf   z  j                  |      } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )rE   r:   r;   )r5   r   r:   s      r3   apply_mask_to_padding_statesr    sa    
 !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr4   c            
       r    e Zd Zdedef fdZ	 	 	 ddej                  dedz  dej                  dz  dej                  dz  fd	Z
	 	 	 ddej                  dedz  dej                  dz  dej                  dz  fd
Z	 	 	 ddej                  dedz  dej                  dz  dej                  dz  fdZ xZS )Lfm2ShortConvrR   r   c           	      2   t         |           || _        || _        |j                  | _        |j                  | _        t        j                  |j                  |j                  | j
                  |j                  | j                  | j
                  dz
        | _        t        j                  |j                  d|j                  z  | j                        | _        t        j                  |j                  |j                  | j                        | _        y )Nr   )in_channelsout_channelskernel_sizegroupsr   paddingr   r   )r)   r*   rR   r   r   L_cache	conv_biasr   r   Conv1dr1   convr   in_projr  r  s      r3   r*   zLfm2ShortConv.__init__  s    
 	"**$$	II**++%%LL1$
	 yy!3!3Q9K9K5KRVR[R[\		&"4"4f6H6HtyyYr4   Nr}   r  r   r   c                    t        ||      }| j                  |      j                  dd      }|j                  dd      \  }}}||z  }| j                  j
                  j                  | j                  j
                  j                  d      | j                  j
                  j                  d            }	|c|d   dkD  r[t        |j                  d      |j                  | j                     |	| j                  j                  d       }
|
j                  d      }
n|dt        j                  j!                  || j"                  |j$                  d   z
  df      }|j                  | j                     j'                  |       t)        ||	| j                  j                  d       }
||
z  }| j+                  |j                  dd      j-                               }|S )Nr8   r   r   rt   r   r7   )
activation)r  r!  ry   chunkr   r.   r  sizer!   squeezer   r   r   r   r   r   padr  rE   copy_r    r  r   )r0   r}   r  r   r   BCxBCBxconv_weightsconv_outr   ys                r3   cuda_kernels_forwardz"Lfm2ShortConv.cuda_kernels_forward  s    )N;ll1o''B/))A2)&1aUyy'',,TYY-=-=-B-B1-EtyyGWGWG\G\]^G_`&>!+<q+@+

2**4>>:		H  ))"-H*]]..rDLL288B<4OQR3ST
**4>>:@@L'L$))..UYZHLMM!++b"-88:;r4   c                    |j                   d   }t        ||      }| j                  |      j                  dd      }|j	                  dd      \  }}}||z  }	|5|d   dkD  r,|j
                  | j                     }
|j                  d| j                  dz
        }|
j                  dd      }
|	j                  |
j                  |
j                        |
d d d d |f<   |j
                  | j                     j                  |
       t        j                  |
j                  |	j                        | j                   j"                  d d dd d f   z  d      }| j$                  r|| j                   j$                  z  }|j'                  d      }n~|dt(        j*                  j-                  |	| j                  |	j                   d   z
  df      }
|j
                  | j                     j                  |
       | j!                  |	      d	d |f   }||z  }|j                  dd      j/                         }| j1                  |      }|S )
Nr   r8   r   r   rt   r   )shiftsdimsrg   .)rE   r  r!  ry   r$  r   r   clampr  rollr;   r`   r:   r(  r,   sumr   r.   r   r   r   r   r'  r   r  )r0   r}   r  r   r   seqlenr)  r*  r+  r,  r   r.  r/  s                r3   slow_forwardzLfm2ShortConv.slow_forward  s    (N;ll1o''B/))A2)&1aU&>!+<q+@(33DNNCJ+11!T\\A5EFN#<J/1uuJ<M<MU_UeUeu/fJq!^+,&&t~~6<<ZHyyryy!9DII<L<LQPQSTW<U!U[]^HyyDIINN*))"-H*]]..rDLL288B<4OQR3ST
**4>>:@@Lyy}S'6'\2HLKKB**,MM!r4   r5   c                     t         r6d|j                  j                  v rt               s| j	                  ||||      S | j                  ||||      S )Ncuda)is_fast_path_availabler`   rw   r   r0  r8  )r0   r5   r  r   r   s        r3   rB   zLfm2ShortConv.forward  sP     "f0D0D0I0I&IRjRl,,]O^]kll  Q_``r4   r   )rH   rI   rJ   r   r   r*   r,   rL   r   r   r0  r8  rB   rM   rN   s   @r3   r  r    s   ZZ Z2 7;26.2 <<  -t3  ((4/	 
 t+ J 7;26.2$<<$ -t3$ ((4/	$
 t+$R 7;26.2	a||	a -t3	a ((4/		a
 t+	ar4   r  c                       e Zd Zdedef fdZ	 	 	 	 	 ddej                  deej                  ej                  f   dz  dej                  dz  dej                  dz  d	e
dz  d
ej                  dz  dej                  fdZ xZS )Lfm2DecoderLayerrR   r   c                 f   t         |           |j                  |   dk(  | _        | j                  rt	        ||      | _        nt        ||      | _        t        |      | _	        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr   r   )r)   r*   r   is_attention_layerr   	self_attnr  r   r   feed_forwardr%   r1   r  operator_normffn_normr  s      r3   r*   zLfm2DecoderLayer.__init__   s    "("4"4Y"?CS"S""*69=DN%fi8DI#FO(););Q#F$6$6FOOLr4   Nr5   r  r   r~   r  r   r'   c           
         |}| j                   r, | j                  d| j                  |      |||||d|\  }}	n$| j                  | j                  |      |||      }||z   }|| j	                  | j                  |            z   }|S )N)r5   r  r   r~   r  r   )r5   r  r   r   r   )r?  r@  rB  r   rA  rC  )
r0   r5   r  r   r~   r  r   r   residualr   s
             r3   rB   zLfm2DecoderLayer.forward,  s     !""-t~~  "00?$7-) /-   M1 !II"00? /--	 & M &0%(9(9$--:V(WWr4   )NNNNN)rH   rI   rJ   r   r   r*   r,   rL   rD   r   r   rB   rM   rN   s   @r3   r=  r=    s    
Mz 
Mc 
M IM.2046:26|| #5<<#=>E t+	
 &&- -t3 ((4/ 
r4   r=  c                   J    e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZy)	Lfm2PreTrainedModelrR   modelTr=  r  F)r5   
attentionsN)rH   rI   rJ   r   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr=  r   _can_record_outputsr   r4   r3   rG  rG  N  sQ    &*#+,#4"5N""&)#r4   rG  c                       e Zd Zdef fdZeee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	edz  d
ej                  dz  dee   defd                     Z xZS )	Lfm2ModelrR   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |      | _        d| _        t#        |j                  |j$                        | _        | j)                          y c c}w )N)rR   Fr   )r)   r*   pad_token_idpadding_idx
vocab_sizer   	Embeddingr1   embed_tokens
ModuleListr   r   r=  layersrP   
rotary_embgradient_checkpointingr%   r  embedding_norm	post_initr  s      r3   r*   zLfm2Model.__init__b  s     !.. ++LL):):F<N<NPTP`P`ammBGH`H`BabYfi0b
 .V<&+#)&*<*<&//R 	 cs   DN	input_idsr   r~   r  inputs_embeds	use_cacher   r   r'   c           
         |d u |d uz  rt        d      || j                  |      }|r>|<|j                  d   }	t        | j                  |	| j
                  | j                        }|F||j                         nd}
t        j                  |
|
|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }|j                  d   dk7  r|nd }|}| j                  ||      }| j                  d | j                  j                   D ]!  }|j                  r|n|} ||f|||||d|}# | j!                  |      }t#        ||	      S )
Nz:You must specify exactly one of input_ids or inputs_embedsr   )rR   r   r:   r`   r   )r`   )rR   rc  r   r   r  r~   )r~   )r   r  r~   r  r   )last_hidden_stater  )
ValueErrorr[  rE   r   rR   r:   r`   r   r,   rj   r   r   r^  r]  r   r?  r`  r   )r0   rb  r   r~   r  rc  rd  r   r   
batch_sizer   causal_masklinear_attentionr5   r  decoder_layer
layer_masks                    r3   rB   zLfm2Model.forwardr  s    -t";<YZZ  --i8M0&,,Q/J1{{:TZZX\XcXcO !CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;'))+%
 .;-@-@-Cq-H>d%"oom,oW "[[)H4;;+H+HI 
	M(5(H(HN^J))$7) /- M
	 ++M:&++
 	
r4   )NNNNNNN)rH   rI   rJ   r   r*   r   r   r   r,   r   rL   r   FloatTensorboolr   r   r   rB   rM   rN   s   @r3   rU  rU  `  s    z     .2.2046:26!%26@
##d*@
 t+@
 &&-	@

 -t3@
 ((4/@
 $;@
 ((4/@
 +,@
 
!@
    @
r4   rU  c                   b    e Zd ZddiZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 dd	e	j                  dz  d
e	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  dee	j                  z  dee   defd              Z xZS )Lfm2ForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr5   logitsc                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y )NFr   )
r)   r*   rU  rH  rY  r   r   r1   rq  ra  )r0   rR   r2   s     r3   r*   zLfm2ForCausalLM.__init__  sU     v&
 ++yy!3!3V5F5FUS 	r4   Nrb  r   r~   r  rc  labelsrd  r   logits_to_keepr   r'   c
                 z    | j                   d|||||||d|
}|j                  }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }d}|* | j                  d||| j                  j                  d|
}t        |||j                  |j                  |j                        S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Lfm2ForCausalLM

        >>> model = Lfm2ForCausalLM.from_pretrained("meta-lfm2/Lfm2-2-7b-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("meta-lfm2/Lfm2-2-7b-hf")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)rb  r   r~   r  rc  rd  r   N)rs  ru  rY  )lossrs  r  r5   rI  r   )rH  rf  rv   r   slicerq  loss_functionrR   rY  r   r  r5   rI  )r0   rb  r   r~   r  rc  ru  rd  r   rv  r   outputsr5   slice_indicesrs  rx  s                   r3   rB   zLfm2ForCausalLM.forward  s    @ ,64:: 	,
)%+')	,
 	,
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD%#33!//))
 	
r4   )	NNNNNNNNr   )rH   rI   rJ   _tied_weights_keys_tp_plan_pp_planr*   r   r   r,   r   rL   r   rm  rn  r   r   r   r   rB   rM   rN   s   @r3   rp  rp    s/   *,GH23H_-z:;H  .2.204(,26*.!%26-.8
##d*8
 t+8
 &&-	8

 8
 ((4/8
   4'8
 $;8
 ((4/8
 ell*8
 +,8
 
 8
  8
r4   rp  )rp  rU  rG  )r   )r
  )Icollections.abcr   typingr   r   r,   torch.nn.functionalr   r   r   cache_utilsr   
generationr	   integrationsr
   r   r   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.import_utilsr   r   utils.output_capturingr   configuration_lfm2r   causal_conv1dr    r!   Moduler%   rP   r   r   r   r   rL   r   r   rK   r   r   r  kernel_modulesallr;  r  r=  rG  rU  rp  __all__r   r4   r3   <module>r     s=  ( %         ) f f / 9 O K F & I I G V 5 * DD-7** Y'J")) J (J(><")) ><B8bii 8(C/ C/L( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2 )*9$BII 9$ +9$x	 #$89^, haBII haV,1 ,^ /  " T
# T
 T
n H
)? H
 H
V Br4   