
    qi                     B   d dl mZ d dlmZmZmZ d dlZd dlmZ d dlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZ ddlmZmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0  e*jb                  e2      Z3 G d ded      Z4 G d d      Z5 G d dejl                        Z7d Z8d ejr                  d!e:d"ejr                  fd#Z;	 dHd$ejl                  d%ejr                  d&ejr                  d'ejr                  d(ejr                  dz  d)e<d*e<d+e$e&   fd,Z=dId-Z> ee>       G d. d/ejl                               Z? G d0 d1ej                  jl                        Z@d2ejr                  d3e:fd4ZAd5 ZBd6 ZCd7 ZD G d8 d9ejl                        ZE G d: d;ejl                        ZF ed<       G d= d>ejl                               ZG G d? d@e      ZHe' G dA dBe"             ZIe' G dC dDeI             ZJe' G dE dFeIe             ZKg dGZLy)J    )Callable)AnyOptional	TypedDictN)nn)ACT2FN   )initialization)Cache)GenerationMixin)use_kernel_forward_from_hubuse_kernelized_func)lazy_load_kernel)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging)maybe_autocast)resolve_internal_import   )BambaConfigc                       e Zd ZU dZej
                  ed<   ej
                  ed<   eed<   eed<   ej                  ed<   y)BambaFlashAttentionKwargsaU  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    cu_seq_lens_q (`torch.LongTensor`):
        Gets cumulative sequence length for query state.
    cu_seq_lens_k (`torch.LongTensor`):
        Gets cumulative sequence length for key state.
    max_length_q (`int`):
        Maximum sequence length for query state.
    max_length_k (`int`):
        Maximum sequence length for key state.
    seq_idx (`torch.IntTensor`):
        Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor     Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/bamba/modeling_bamba.pyr#   r#   6   s7      ######__r3   r#   F)totalc                   :   e Zd ZdZdZej                  dfdefdZd Z	d Z
	 ddej                  d	ej                  d
edeeef   dz  deej                  ej                  f   f
dZdej$                  fdZdej                  d
edeeef   fdZdd
edz  defdZy) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNconfigc                 ,   |j                   | _         d| _        |j                  }|j                  }g | _        g | _        g | _        t        |j                        D ]*  }| j                   |   dk(  r| xj                  t        j                  ||j                  |j                  z  d|j                  z  |z  z   |||      gz  c_        | xj
                  t        j                  ||j                  |j                  |||      gz  c_        | xj                  t        j                   g g|z  |      gz  c_        | xj
                  t        j                   g g|z  |      gz  c_        | j                  j#                  |       - t        |j                        D cg c]  }t        j                   g g|z  |       c}| _        t        |j                        D cg c]  }t        j                   g g|z  |       c}| _        y c c}w c c}w )NFmamba   devicedtyper=   )layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr-   zerosmamba_expandhidden_sizemamba_n_groupsmamba_n_headsmamba_d_headtensorappend	key_cachevalue_cache)	selfr8   
batch_sizer>   r=   conv_kernel_sizessm_state_sizei_s	            r4   __init__z)HybridMambaAttentionDynamicCache.__init__^   s   !'!9!9"'!..--"$v//0 	2A%%a(G3  KK",,v/A/AAAH]H]D]`nDnn(%#%   KK",,++&%#	$ 	   U\\2$2CF%S$TT ELL"
1B6$R#SS''..q11	24 SXX^XpXpRqrQ%,,tj'8HrTYZ`ZrZrTstqELL"
):6Jt sts    "H!"Hc                 ,    t        | j                        S N)lenrQ   rS   s    r4   __len__z(HybridMambaAttentionDynamicCache.__len__   s    4>>""r3   c                 >    | j                   |   | j                  |   fS r[   )rQ   rR   rS   	layer_idxs     r4   __getitem__z,HybridMambaAttentionDynamicCache.__getitem__   s!    ~~i($*:*:9*EEEr3   
key_statesvalue_statesra   cache_kwargsreturnc                    | j                   |   j                  d   dk(  r|| j                   |<   || j                  |<   nft        j                  | j                   |   |gd      | j                   |<   t        j                  | j                  |   |gd      | j                  |<   | j                   |   | j                  |   fS )Nr   r;   dim)rQ   shaperR   r-   cat)rS   rc   rd   ra   re   s        r4   updatez'HybridMambaAttentionDynamicCache.update   s     >>)$**2.!3(2DNN9%*6DY'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEEr3   beam_idxc                    | j                         dkD  rvt        t        | j                              D ]S  }| j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   V yy)zDReorders the cache for beam search, given the selected beam indices.r   N)
get_seq_lengthrG   r\   rQ   r=   index_selecttorR   rD   rE   )rS   rn   ra   r=   s       r4   reorder_cachez.HybridMambaAttentionDynamicCache.reorder_cache   s[    1$"3t~~#67 	m		299,0NN9,E,R,RSTV^VaVabhVi,jy))))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +)))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +3::-1__Y-G-T-TUVX`XcXcdjXk-l	*	m %r3   cache_positionc                 T    d}|j                   d   }| j                  |      |z   }||fS )zDReturn the length and offset of the cache, used to generate the maskr   )rk   rp   )rS   rt   ra   	kv_offsetquery_length	kv_lengths         r4   get_mask_sizesz/HybridMambaAttentionDynamicCache.get_mask_sizes   s7    	%++A.''	2\A	)##r3   c                     || j                   vr| j                   d   n|}t        | j                        |k  s| j                  |   j                  d   dk(  ry| j                  |   j                  d   S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   rh   )rF   r\   rQ   rk   r`   s     r4   rp   z/HybridMambaAttentionDynamicCache.get_seq_length   sn     3<4CZCZ2ZD++A.`i	t~~)+t~~i/H/N/Nr/RVW/W~~i(..r22r3   r[   )r   )r)   r*   r+   r,   is_compileabler-   float16r!   rY   r^   rb   Tensorr0   dictstrr   tuplerm   r.   rs   ry   rp   r2   r3   r4   r7   r7   N   s     N>CmmTX $u{ $uL#F /3FLLF llF 	F
 38nt+F 
u||U\\)	*F"me&6&6 m$U\\ $c $eTWY\T\o $3d
 33 3r3   r7   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Z xZS )BambaRotaryEmbeddinginv_freqNr8   c                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       y )N	rope_typedefaultr   F)
persistentoriginal_inv_freq)superrY   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr8   rope_parametersr   compute_default_rope_parametersr   attention_scalingregister_bufferclone)rS   r8   r=   rope_init_fnr   	__class__s        r4   rY   zBambaRotaryEmbedding.__init__   s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuUr3   r=   ztorch.deviceseq_lenrf   ztorch.Tensorc                    | j                   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   r;   r>   r<   )	r   getattrrK   num_attention_headsr-   arangeint64rr   float)r8   r=   r   baserj   attention_factorr   s          r4   r   z4BambaRotaryEmbedding.compute_default_rope_parameters   s    & %%l3fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r3   c                 N   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rh   r    mpscpuF)device_typeenabledr;   ri   r   )r   r   expandrk   rr   r=   
isinstancetyper   r   	transposer-   rl   cosr   sinr>   )
rS   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r4   forwardzBambaRotaryEmbedding.forward   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s   BFF$r[   NNN)r)   r*   r+   r-   r~   r/   r!   rY   staticmethodr   r0   r   r   r   no_gradr   r   __classcell__r   s   @r4   r   r      s    llV{ V  %)+/"*d"*(* t* 
~u$	%	* *: U]]_<  <r3   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nrh   r;   ri   )rk   r-   rl   )r   x1x2s      r4   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r3   hidden_statesn_reprf   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r    N)rk   r   reshape)r   r   batchnum_key_value_headsslenr   s         r4   	repeat_kvr     so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr3   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
dt        j                        j                  |j                        }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr;   r	   rh   )rj   r>   )ptrainingr    )r   num_key_value_groupsr-   matmulr   r   
functionalsoftmaxfloat32rr   r>   r   r   
contiguous)r   r   r   r   r   r   r   r   rc   rd   attn_weightsattn_outputs               r4   eager_attention_forwardr     s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r3   c                 h   |j                  |      }|j                  |      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }	}||z  t        |      |z  z   }
||z  t        |      |z  z   }t        j                  |
|gd      }
t        j                  ||	gd      }|
|fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Removes the interleaving of cos and sin from GLM

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    rh   .Nri   )	unsqueezerk   r   r-   rl   )qkr   r   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds               r4   apply_rotary_pos_embr   '  s    ( --
&C
--
&C 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr3   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  de	ej                  ej                  f   dz  dej                  dz  d	e
dz  d
ej                  dz  dee   de	ej                  ej                  f   fdZ xZS )BambaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr8   ra   c                 d   t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j
                  |j                  | j                  z  |j                        | _        t        j                  |j                  | j                  z  |j
                  |j                        | _        y )Nr   g      Tbias)r   rY   r8   ra   r   rK   r   r   r   r   r   attention_dropout	is_causalr   Linearattention_biasq_projk_projv_projo_proj)rS   r8   ra   r   s      r4   rY   zBambaAttention.__init__Q  sM   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r3   Nr   position_embeddingsr   past_key_valuesrt   r   rf   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t              } || |	|
||f| j                  sdn| j                   | j"                  d|\  }} |j$                  g |d j'                         }| j)                  |      }||fS )Nrh   r    r;   )r   r   rt           )r   r   )rk   r   r   viewr   r   r   r   rm   ra   r   get_interfacer8   _attn_implementationr   r   r   r   r   r   r   )rS   r   r   r   r   rt   r   input_shapehidden_shapequery_statesrc   rd   r   r   re   attention_interfacer   r   s                     r4   r   zBambaAttention.forwardh  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r3   NNNN)r)   r*   r+   r,   r!   r0   rY   r-   r~   r   r   r.   r   r   r   r   r   s   @r4   r   r   M  s    G
{ 
s 
4 IM.2(,26))||)) #5<<#=>E)) t+	))
 )) ((4/)) +,)) 
u||U\\)	*))r3   r   c                   (     e Zd Zd fd	ZddZ xZS )BambaRMSNormGatedc                     t         |           t        j                  t	        j
                  |            | _        || _        y r[   r   rY   r   	Parameterr-   onesweightvariance_epsilonrS   rK   epsr   s      r4   rY   zBambaRMSNormGated.__init__  s/    ll5::k#:; #r3   c                    |j                   }|j                  t        j                        }|?|t        j
                  j                  |j                  t        j                              z  }|j                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S Nr;   rh   T)keepdim)r>   rr   r-   r   r   r   silupowmeanrsqrtr  r  )rS   r   gateinput_dtypevariances        r4   r   zBambaRMSNormGated.forward  s    #))%((7)BMM,>,>twwu}}?U,VVM $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r3   gư>r[   r)   r*   r+   rY   r   r   r   s   @r4   r   r     s    $
	;r3   r   input_tensorpad_sizec                     t        | j                        dk(  r
ddddd|ddfnddd|ddf}t        j                  j                  j                  | |dd      S )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)moder   )r\   rk   r-   r   r   pad)r  r  	pad_shapes      r4   pad_tensor_by_sizer    sf     47|7I7I3Ja3OAq!Q!Q/VWYZ\]_gijlmUnI88""<ST"UUr3   c                    t        | |      } t        | j                        dk(  r.| j                  | j                  d   d|| j                  d         S | j                  | j                  d   d|| j                  d   | j                  d         S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r	   r   rh   r;   )r  r\   rk   r   )r  r  
chunk_sizes      r4   reshape_into_chunksr    s     &lH=L
<!###L$6$6q$92z<K]K]^_K`aa ##q!2z<3E3Ea3H,J\J\]^J_
 	
r3   c                 "   | j                  d      } | d   j                  g | j                         | } t        j                  t        j                  ||| j
                  t        j                        d      }| j                  | d      } t        j                  | d      }t        j                  t        j                  ||| j
                  t        j                        d      }|j                  | t        j                         }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    rh   .Nr<   )diagonalr   r{   ri   )
sizer   r-   trilr  r=   boolmasked_fillcumsuminf)r  r  masktensor_segsums       r4   segment_sumr+    s     ""2&J 2<	*11S<3D3D3FS
SL::ejjZ@S@S[`[e[efqstD++TE15LLL26M ::ejjZ@S@S[`[e[efqrsD!--teeiiZ@Mr3   c                     |N|j                   d   dkD  r<|j                   d   dkD  r*| j                  }| |dddddf   z  j                  |      } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr    r   )rk   r>   rr   )r   r   r>   s      r4   apply_mask_to_padding_statesr-    sa    
 !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr3   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  de	dz  dej                  dz  d	ej                  dz  d
ej                  dz  f
dZ	 	 	 dde	dz  dej                  dz  d	ej                  dz  fdZ	 	 	 	 dde	dz  dej                  dz  d	ej                  dz  d
ej                  dz  fdZ xZS )
BambaMixeruP  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    r8   ra   c           	         t         |           |j                  | _        |j                  | _        |j
                  | _        |j                  | _        t        |j                  | j                  z        | _        || _        |j                  | _        |j                  | _        t"        |j                     | _        |j&                  | _        |j*                  | _        |j.                  | _        |j2                  | _        |j6                  | _        |j:                  | _        |j<                  | _        |j>                  | _        | j                  d| j0                  z  | j                  z  z   | _         tC        jD                  | j@                  | j@                  |j                  | j                  | j@                  | j                  dz
        | _#        | j                  | j@                  z   | j                  z   }tC        jH                  | j                  || j(                        | _%        tC        jL                  tO        jP                  | j                              | _)        tO        jT                  d| j                  dz         }tC        jL                  tO        jV                  |            | _,        t[        | j                  | j,                        | _.        tC        jL                  tO        jP                  | j                              | _/        tC        jH                  | j                  | j                  | j(                        | _0        tc        d      }te        |dd       a3te        |dd       a4tc        d	      }tk        |d
      a6tk        |d      a7tk        |d      a8ts        tl        tn        tp        th        tf        f      a:tt        stv        jy                  d       y tv        jy                  d       y )Nr;   r    )in_channelsout_channelsr   kernel_sizegroupspaddingr   r  zcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathz1ops.triton.ssd_combined.mamba_chunk_scan_combinedz8ops.triton.ssd_combined.mamba_split_conv1d_scan_combineda  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzDThe fast path for Bamba will be used when running the model on a GPU)=r   rY   rM   	num_headsrK   rC   rV   rB   rU   r0   rJ   intermediate_sizera   mamba_conv_biasuse_conv_bias
hidden_act
activationr   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonrL   n_groupsrN   r   mamba_chunk_sizer  time_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dr   in_projr  r-   r  dt_biasr   logA_logr   normDout_projr   r   r7  r8  r   selective_state_updatemamba_chunk_scan_combined mamba_split_conv1d_scan_combinedallis_fast_path_availableloggerwarning_once)rS   r8   ra   projection_sizeAcausal_conv1d	mamba_ssmr   s          r4   rY   zBambaMixer.__init__  s   --!--$22 & 3 3!$V%8%84;K;K%K!L"#33 ++&++,.."("5"5--++ 11%55#11#11..T]]1BTEXEX1XXii''--==))A-
 004==@4>>Qyy
 ||EJJt~~$>? LLDNNQ./\\%))A,/
%d&<&<$BYBYZ	ejj89		$"8"8$:J:JQUQ^Q^_ )9&}6LdS"=2DdK %[1	!8$^"
 %<$W%
! ,C$^,
(
 "%&)0 $"
 &>  fgr3   Nr   cache_paramsrt   r   r(   c                 P   t        ||      }| j                  |      }|j                  \  }}}	| j                  | j                  z  }
|d uxr} |j
                  xro |dk(  xrh |j                  | j                     j                  d   |j                  | j                     j                  d   cxk(  xr |k(  nc xr |d uxr |d   dkD  }|r|j                  d      j                  | j                  | j                  | j                  gd      \  }}}t        ||j                  | j                     | j                  j                   j                  d      | j                  j"                  | j$                        }t'        j                  || j                  |
|
gd      \  }}}t'        j(                  | j*                  j-                                }|d d d df   d d d d d f   j/                  d| j0                  | j                        j3                  t&        j4                        }|d d d d d f   j/                  dd| j0                        }| j6                  d d d df   j/                  d| j0                        }| j8                  d d d df   j/                  d| j0                        }|j;                  || j                  |j                  d   | j                  z        }|j;                  || j                  |j                  d   | j                  z        }|j;                  || j                  | j0                        }t=        |j                  | j                     ||||||d |d
      }|j;                  || j                  | j0                  z        }| j?                  ||      }| jA                  |      d d d df   }|S t'        j(                  | j*                  j-                                }| jB                  d	t-        d
      fk(  ri nd| jB                  i}| jD                  r|tG        || j                  j                   j                  d      | j                  j"                  | j6                  |f| j8                  | jH                  || j$                  | j>                  j                   | j>                  jJ                  | j@                  j                   | j@                  j"                  | j0                  | j                  ddd|}|S |j                  | j                  | j                  | j                  gd      \  }}}|v|jM                  dd      }tN        jP                  jS                  || jT                  |j                  d   z
  df      }|j                  | j                     jW                  |       | j$                  dvrH| jY                  | j                  |jM                  dd            dd |f   jM                  dd            }nqt[        |jM                  dd      | j                  j                   j                  d      | j                  j"                  | j$                  |      jM                  dd      }t        ||      }t'        j                  || j                  |
|
gd      \  }}}t]        |j;                  ||d| j0                        |||j;                  ||| j                  d      |j;                  ||| j                  d      f| jH                  | j8                  d |d| j6                  dd|\  }}|*|(|j                  | j                     jW                  |       |j;                  ||d      }| j?                  ||      }| jA                  |      }|S )Nr    r   rh   ri   .r   T)zrN  dt_softplusr   r(  dt_limitF)rR  r  r(   r?  rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr;   )r  swish)r   r  r   r?  r(   )r  rR  ra  r(   rk  rN  rb  )/r-  rM  rk   rE  rV   rA   rD   ra   rE   squeezesplitr;  rJ  r:  r7  rL  r  r   r?  r-   exprP  r   r   r   rr   r   rN  rR  r   rT  rQ  rS  rG  r   rV  r  r  r   r   r   r  rU   copy_r@  r8  rU  )rS   r   r_  rt   r   r(   projected_statesrT   r   rX   groups_time_state_sizeuse_precomputed_statesr  hidden_states_B_CdtBCr\  rN  rR  hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrD   scan_output	ssm_states                              r4   cuda_kernels_forwardzBambaMixer.cuda_kernels_forwardS  s    5]NS<<6 "/!4!4
GQ!%1D1D!D $ &//&1& ((8>>qA&&t~~6<<Q?& d*& q!A% 	 "*:*B*B1*E*K*K''GR +L +'D#R
 !5!((8""**1-  ! #(++!'')?AWX#M1a 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az4==!''!*2MNAz4==!''!*2MNA%2%7%7
DNNTXTaTa%b"2''7& M *..z4>>DMM;YZM IImT:M --.q$|<C| 
w 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff####'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%l 
A /?.D.D++T]]DNNKQS /E /+'  + 4E3N3NqRS3T0"$--"3"34..1M1S1STV1WWYZ[#K !,,T^^<BB;O??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'?? ')  i1o & %AARTb$c!&+kk%++-CE[\'#q! *C!&&z7BNFF:wrBFF:wrB*  $ff#(, LL $* &*&Y" (\-E ++DNN;AA)L)..z7BG"iiT: mmK0
r3   c                    |j                   \  }}}|j                  }t        ||      }| j                  |      }	|	j	                  | j
                  | j                  | j                  gd      \  }
}}|d uxr} |j                  xro |dk(  xrh |j                  | j                     j                   d   |j                  | j                     j                   d   cxk(  xr |k(  nc xr |d uxr |d   dkD  }|rY|j                  | j                     j                  dd      |j                  | j                  <   |d d dd d f   j                  |j                  | j                     j                        |j                  | j                     d d d d df<   |j                  | j                     j                  | j                  j                   j                        }t#        j$                  || j                  j                   j'                  d      z  d      }| j(                  r|| j                  j*                  z   }| j-                  |      }n|v|j/                  dd      }t0        j2                  j5                  || j6                  |j                   d   z
  df      }|j                  | j                     j9                  |       | j-                  | j                  |j/                  dd            dd |f   j/                  dd            }t        ||      }t#        j                  || j
                  | j:                  | j<                  z  | j:                  | j<                  z  gd      \  }}}t#        j>                  | j@                  jC                                }|r|j                  | j                     j                  }|d d dd d f   d d d df   }|j/                  dd      jE                  ||j                   d   | jF                        }| jH                  d	   jE                  | jH                  j                   d   | jF                        }t"        j0                  j2                  jK                  ||j                  |j                        z         }t#        jL                  || jN                  d   | jN                  d         }|d
   jE                  | j                  | jF                  | j<                        j                  t"        jP                        }t#        j>                  |d	   |z        j                  |      }|jS                  || j:                  d      dd d d f   }|jE                  || j:                  | j                  | j:                  z  |j                   d         jU                         }|jS                  |d|j                   d         }|d	   |dd d d f   z  }|jS                  |d| jF                        }||d	   z  j                  |      }|j                  | j                     j9                  |j                  | j                     |z  |z          |jS                  || j:                  d      dd d d f   }|jE                  || j:                  | j                  | j:                  z  |j                   d         jU                         }|jS                  |d|j                   d         }|j                  | j                     j                  |j                  |j                        }|jW                  || j                  z  | jF                  | j<                        }|jW                  || j                  z  | j<                  d      }t#        jX                  ||      }|jW                  || j                  | jF                        }| jZ                  d	   jE                  | jZ                  j                   d   | jF                        }|||z  z   j                  |j                        }|jS                  |d      d d d df   }nt0        j2                  jK                  || jH                  z         }t#        jL                  || jN                  d   | jN                  d         }|jS                  ||d| jF                        jC                         }|jS                  ||d| j<                        jC                         }|jS                  ||d| j<                        jC                         }|j]                  | j                  | j:                  z  d| j                        }|j]                  | j                  | j:                  z  d| j                        }| j^                  || j^                  z  z
  | j^                  z  }| jZ                  d	   ta        ||      z  }||d	   z  }|j                  |j                        |z  }||||fD  cg c]  } tc        | || j^                         c} \  }}}}|je                  dddd      }t#        jf                  |d      }!t#        j>                  ti        |            }"|d d d d d d d d d d d f   |d d d d d d d d d d d f   z  }#|#j%                  d      }$|$d	   |"je                  ddddd      d	   z  }%|%j%                  d      }&|&d	   |d d d d d f   z  j%                  d      }'t#        j>                  |!d d d d d d dd f   |!z
        }(||(je                  dddd      d	   z  })|)dd d d f   |d	   z  j%                  d      }*|r<|j                  | j                     d d d df   j                  |*j                        }+nt#        jj                  |*d d d df         }+t#        jl                  |+|*gd      }*t#        j>                  ti        t0        j2                  j5                  |!d d d d d d df   d                  },|,j/                  dd      },|,d
   |*d d d d d df   z  j%                  d      }-|-d d d df   |-d d df   }.}*t#        j>                  |!      }/|dd d d f   |*d d d d d df   z  }0|/je                  dddd      }1|0j%                  d      |1d	   z  }2|'|2z   }|jS                  |d| j                  | jF                        }||z   }|dkD  r|d d d |d d d d f   }|jS                  ||d      }|.*|(|j                  | j                     j9                  |.       | jo                  ||
      }3| jq                  |3j                  |            }4|4S c c} w )Nrh   ri   r    r   )shiftsdimsr?   r;   .r!  ).NNr   r<   )rj   output_sizer	   r  r{   )r    r   )9rk   r>   r-  rM  rn  r;  rJ  r:  rA   rD   ra   rE   rollrr   r=   rL  r  r-   sumrm  r=  r   r@  r   r   r   r  rU   rp  rE  rV   ro  rP  r   r   r   rN  softplusclamprG  r   r   r   r   bmmrR  repeat_interleaver  r  r  permuter'  r+  
zeros_likerl   rQ  rS  )5rS   input_statesr_  rt   r   rT   r   rX   r>   rq  r  rt  ru  rs  rD   r{  r   rv  rw  r\  cache_devicerN  dAdBdBxrE   ssm_states_reshaped
C_reshapedyrR  r  
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr}  state_decay_outC_times_statesstate_decay_out_permutedY_offr|  contextualized_statess5                                                        r4   torch_forwardzBambaMixer.torch_forward  sU    ".!3!3
GQ"" 4L.Q<<5&6&<&<''GR '= '
#
 $ &//&1& ((8>>qA&&t~~6<<Q?& d*& q!A% 	 "7C7O7OPTP^P^7_7d7dlnuw7d7xL$$T^^4ARSTVWYZSZA[A^A^_k_w_wx|  yG  yG  `H  `O  `O  BPL$$T^^4Q2X> '224>>BEET[[M_M_MfMfEgK %		dkk0088;;! !!$58H8H$H! $): ; '/@/J/J1a/P, mm//043H3HKgKmKmnpKq3qst2u ((8>>{K $5F5P5PQRTU5V)WX[]e^e]eXe)f)p)pqrtu)v w89JN[#kk##T]]T5H5H%H$--Z^ZmZmJmn
q! YYtzz'')**!'224>>BIIL Aq!GQc\*Ba#**:rxx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC ##DNN399''7"<sB 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CC188[\[b[bCcJ",//*t~~2Mt}}^b^q^q"r
T^^ ;T=P=PRSTJ		-z:Az4>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''T\\(9:BR!5!5a!8$:N:Nq:QRB)11*gr4==Y__aM		*gr43F3FGMMOA		*gr43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'DOO*CCtVH	*-?x-XXJ *ByM9M](()B.A cpqrtuwxay%z\]&9!Xt&W%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99XaArsl%;h%FGL,..q"b!<YGGGc4l+mI.FFKKPQKRF &"."9"9$.."I!TSV,"W"Z"Zbhbobo"Z"p"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*gr2A $)A''7==iHii4(
 !%knnU.C D$$G &{s   v	c                    t         rKd| j                  j                  j                  j                  v rt               s| j                  |||||      S |t        d      |j                  }|B|j                  d   dkD  r0|j                  d   dkD  r||d d d d d f   z  j                  |      }| j                  ||||      S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r    r   )rX  rM  r  r=   r   r   r~  NotImplementedErrorr>   rk   rr   r  )rS   r   r_  rt   r   r(   r   r>   s           r4   r   zBambaMixer.forward  s     "f0C0C0J0J0O0O&OXpXr,,]L.Zhjqrr%n  ##%.*>*>q*AA*E.J^J^_`JadeJe*^Aq$J-GGKKERM!!-~~^^r3   r   r   )r)   r*   r+   r,   r!   r0   rY   r-   r~   r7   r.   r1   r~  r  r   r   r   s   @r4   r/  r/    sF   Zh{ Zhs Zh~ AE26.2*.g||g 7=g ((4/	g
 t+g 4'gZ AE26.2L% 7=L% ((4/	L%
 t+L%d AE26.2*._ 7=_ ((4/	_
 t+_ 4'_r3   r/  c                   $     e Zd Z fdZd Z xZS )BambaMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y )Nr   )r   rY   r8   rK   r;  r   r   mlp_bias	gate_projup_proj	down_projr   r>  act_fnrS   r8   r   s     r4   rY   zBambaMLP.__init__  s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r3   c                     | j                  | j                  | j                  |            | j                  |      z        }|S r[   )r  r  r  r  )rS   r   r  s      r4   r   zBambaMLP.forward  s6    NN4;;t~~a/@#ADLLQRO#ST	r3   r  r   s   @r4   r  r    s    0r3   r  RMSNormc                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	BambaRMSNormr  rf   Nc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        BambaRMSNorm is equivalent to T5LayerNorm
        Nr  r  s      r4   rY   zBambaRMSNorm.__init__  s1     	ll5::k#:; #r3   r   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S r	  )	r>   rr   r-   r   r  r  r  r  r  )rS   r   r  r  s       r4   r   zBambaRMSNorm.forward  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r3   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)r   r  rk   r  r]   s    r4   
extra_reprzBambaRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr3   r  )
r)   r*   r+   r   rY   r-   r~   r   r  r   r   s   @r4   r  r    s7    $ $$ $;U\\ ;ell ;Jr3   r  c                   v    e Zd Zddededef fdZ	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  d	e
dz  d
edz  dedz  dej                  dz  deej                  ej                  f   dz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )BambaDecoderLayerr8   ra   
layer_typec                 r   t         |           d}|dk(  rt        nd } ||      | _        t	        |j
                  |j                        | _        t	        |j
                  |j                        | _        || _	        |dk(  rt        ||      | _        y |dk(  rt        ||      | _        y t        d      )Nr    r6  r:   )r8   ra   	attentionzInvalid layer_type)r   rY   r  feed_forwardr  rK   rC  input_layernormpre_ff_layernormr  r/  r:   r   	self_attn
ValueError)rS   r8   ra   r  num_expertsffn_layer_classr   s         r4   rY   zBambaDecoderLayer.__init__	  s    &1Q&6(D+F3+F,>,>FDWDWX ,V-?-?VEXEX Y$ #6YGDJ;&+FI>DN122r3   Nr   r   r   r   output_attentions	use_cachert   r   r   rf   c	                 J   |}
| j                  |      }| j                  dk(  r | j                  d||||d|	}d}n-| j                  dk(  r | j                  d||||||||d|	\  }}|
|z   }|}
| j	                  |      }| j                  |      }|
|z   }|f}|r|fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        r:   )r   r_  rt   r   Nr  )r   r   r   r   r  r  rt   r   r2   )r  r  r:   r  r  r  )rS   r   r   r   r   r  r  rt   r   r   residualself_attn_weightsoutputss                r4   r   zBambaDecoderLayer.forward  s    D !,,]; ??g%&DJJ +,--	
 M !%__+/=t~~ 
0+-) /"3#-$7
0 
0,M, !=0 !--m<))-8 =0 ")++Gr3   )r:   )NNNFFNN)r)   r*   r+   r!   r0   r   rY   r-   r~   r.   r7   r%  r   r   r#   FloatTensorr   r   r   s   @r4   r  r    s   3{ 3s 3 3( /304CG).!&26HLK||K t+K &&-	K
 :D@K  $;K $;K ((4/K #5<<#=>EK 23K 
u  %(9(95;L;L(L"MPT"TT	UKr3   r  c                   p     e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZ ej                          fd       Z xZS )BambaPreTrainedModelr8   modelTr  r   c           
      j   t         |   |       t        |t              rt	        j
                  |j                         t	        j                  |j                  t        j                  t        j                  d|j                  dz                      t	        j
                  |j                         y y )Nr    )r   _init_weightsr   r/  initones_rN  rp  rP  r-   rO  r   r:  rR  )rS   r   r   s     r4   r  z"BambaPreTrainedModel._init_weightst  sq    f%fj)JJv~~&JJv||UYYu||Av?O?ORS?S/T%UVJJvxx  *r3   )r)   r*   r+   r!   r/   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr-   r   r  r   r   s   @r4   r  r  h  sN    &*#,-"3NLU]]_! !r3   r  c                   &    e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  dej                  dz  d	edz  d
edz  dedz  dej                  dz  dee   defd              Zd Z xZS )
BambaModelr8   c           	      Z   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        g }t        |j                        D ],  }|j                  t        |||j                  |                . t        j                  |      | _        |j                   | _        t#        |j                  |j$                        | _        t)        |      | _        d| _        | j/                          y )N)ra   r  r6  )r8   F)r   rY   pad_token_idpadding_idx
vocab_sizer   	EmbeddingrK   embed_tokensrG   rH   rP   r  r@   
ModuleListlayersr   r  rC  final_layernormr   
rotary_embgradient_checkpointing	post_init)rS   r8   decoder_layersrW   r   s       r4   rY   zBambaModel.__init__  s     !.. ++LL):):F<N<NPTP`P`av//0 	rA!!"3FaTZTlTlmnTo"pq	rmmN3$*$?$?!+F,>,>FDWDWX.f=&+#r3   N	input_idsr   r   r   inputs_embedsr  r  output_hidden_statesrt   r   rf   c
                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|}|r|t        j                  d       |	.t        j                  |j                  d   |j                        }	||	j                  d      }t        | j                   |||	||      }| j!                  ||	      }| j#                  ||	      }|rd
nd }|rd
nd }| j$                  D ]E  }|j&                  dk(  r|n|}|r||fz  } ||f||||||	|d|
}|d   }|s7|d   =||d   fz  }G | j)                  |      }|r||fz  }|r|j*                  sd|_        |sd n|}t-        ||||      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was provided, so no cache will be returned.r    r?   r   )r8   r  r   rt   r   r   )r   r2   r:   )r   r   r   r  r  rt   r   T)last_hidden_stater   r   
attentions)r8   r  r  r  r  r  r   rY  rZ  r  r-   r   rk   r=   r   r   _update_mamba_maskr  r  r  r  rA   r   )rS   r  r   r   r   r  r  r  r  rt   r   r   causal_mask
mamba_maskr   all_hidden_statesall_self_attnsdecoder_layer
layer_masklayer_outputs
next_caches                        r4   r   zBambaModel.forward  sX    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%0:
 !"\\-*=*=a*@I]I]^N)33A6L(;;'))+%
 ,,^^L
"oom,oW"6BD0d![[ 	:M'4'?'?7'JP[J#!m%55!)
)) /"3#-$7
 
M *!,M  #/"}Q'7&99N1	:4 ,,];  -!11?#E#E15O.!*T
&+&+%	
 	
r3   c                 R    |}|d   dkD  s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr    )r-   rW  )rS   r   rt   r  s       r4   r  zBambaModel._update_mamba_mask  s7     $
!q ^%?EIIn`aNaDbJr3   )	NNNNNNNNN)r)   r*   r+   r!   rY   r   r   r-   r.   r~   r7   r  r%  r   r#   r   r   r  r   r   s   @r4   r  r  }  s   { &  .2.204CG26!%)-,026c
##d*c
 t+c
 &&-	c

 :D@c
 ((4/c
 $;c
  $;c
 #Tkc
 ((4/c
 23c
 
!c
  c
J	r3   r  c                       e Zd ZddiZddiZddgdgfiZ fdZee	 	 	 	 	 	 	 	 	 	 	 dd	e	j                  dz  d
e	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  dedz  dedz  de	j                  dz  dee	j                  z  defd              Z	 	 	 	 	 	 	 d fd	Z xZS )BambaForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr   logitsc                 
   t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        |j                  | _	        | j                          y )NFr   )r   rY   r  r  r  r   r   rK   r  z_loss_coefficientr  r  s     r4   rY   zBambaForCausalLM.__init__  sc     '
 ++yy!3!3V5F5FUS"(";"; 	r3   Nr  r   r   r   r  labelsr  r  r  rt   logits_to_keeprf   c                    ||n| j                   j                  }|	|	n| j                   j                  }	 | j                  d
||||||||	|
d	|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}| | j                  d
||| j                   j                  d|}| j                  dkD  r[|j                  d      j                  |j                        j                  d      j!                         }|| j                  |z  z   }t#        |||j$                  |j&                  |j(                  	      S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BambaForCausalLM

        >>> model = BambaForCausalLM.from_pretrained("...")
        >>> tokenizer = AutoTokenizer.from_pretrained("...")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r  r   r   r   r  r  r  r  rt   )r   r  r  r   rh   ri   r   r;   )lossr   r   r   r  r2   )r8   r  r  r  r  r   r0   slicer  loss_functionr  r  	logsumexprr   r>   r  r  r   r   r   r  )rS   r  r   r   r   r  r  r  r  r  rt   r  r   r  r   slice_indicesr   r  z_losss                      r4   r   zBambaForCausalLM.forward  sw   N 2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,64:: ,
)%+'/!5),
 ,
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD&&*))b)1444::4FJJ1MRRTd55>>%#33!//))
 	
r3   c	                     |:t        | j                  |j                  d   | j                  | j                        }| j                  j
                  |	d<   t        |   |f|||||||d|	}
|
S )Nr   r?   r  )r   r   r  rt   r   r  is_first_iteration)r7   r8   rk   r>   r=   num_logits_to_keepr   prepare_inputs_for_generation)rS   r  r   r   r  rt   r   r  r  r   model_inputsr   s              r4   r  z.BambaForCausalLM.prepare_inputs_for_generationd  s     ">Y__Q/DKKO $(;;#A#A w<

+)')%1

 

 r3   )NNNNNNNNNNr   )NNNNNTF)r)   r*   r+   _tied_weights_keys_tp_plan_pp_planrY   r   r   r-   r.   r~   r7   r  r%  r0   r   r   r  r   r   s   @r4   r  r    sr   *,GH23H_-z:;H  .2.204CG26*.!%)-,026-.K
##d*K
 t+K
 &&-	K

 :D@K
 ((4/K
   4'K
 $;K
  $;K
 #TkK
 ((4/K
 ell*K
 
 K
  K
`     r3   r  )r  r  r  )r   )r    )Mcollections.abcr   typingr   r   r   r-   r   transformers.activationsr    r
   r  cache_utilsr   
generationr   integrationsr   r   integrations.hub_kernelsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.import_utilsr   configuration_bambar!   
get_loggerr)   rY  r#   r7   Moduler   r   r~   r0   r   r   r   r   r   r   r  r  r+  r-  r/  r  r  r  r  r  r  __all__r2   r3   r4   <module>r)     sV  4 % + +   + &   ) L 8 / 9 O K F & l l + 9 , 
		H	%	 0h3 h3V><299 ><B(	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%4#L )*C)RYY C) +C)L; ;*VU\\ VS V
((	w_ w_tryy   Y'J299 J (J(]2 ]@ !? ! !( D% D DN ~+_ ~ ~B Er3   