
    qi                     d   d Z ddlmZ ddlZddlmZ ddlmZ ddlmZm	Z	 ddl
mZmZmZmZmZmZ ddlmZmZmZmZmZ d	d
lmZ d	dlmZ d	dlmZ d	dlmZm Z  d	dl!m"Z" d	dl#m$Z$ d	dl%m&Z&m'Z'm(Z(m)Z) d	dl*m+Z+ ddl,m-Z-  e)j\                  e/      Z0 G d ded      Z1 G d de      Z G d de      Z2d0dZ3 G d de      Z4 G d d e      Z5 G d! d"ejl                        Z7 G d# d$e      Z8 G d% d&e      Z9 G d' d(e	      Z:e& G d) d*e"             Z;e& G d+ d,e;             Z< G d- d.e      Z=g d/Z>y)1zPyTorch Bamba model.    )	TypedDictN)nn)ACT2FN) HybridMambaAttentionDynamicCacheJambaAttentionDecoderLayer)LlamaAttentionLlamaForCausalLMLlamaMLPLlamaRMSNormLlamaRotaryEmbeddingrotate_half)MambaRMSNormGatedapply_mask_to_padding_statespad_tensor_by_sizereshape_into_chunkssegment_sum   )initialization)lazy_load_kernel)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)PreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torchdynamo_compilinglogging)resolve_internal_import   )BambaConfigc                       e Zd ZU dZej
                  ed<   ej
                  ed<   eed<   eed<   ej                  ed<   y)BambaFlashAttentionKwargsaU  
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    cu_seq_lens_q (`torch.LongTensor`):
        Gets cumulative sequence length for query state.
    cu_seq_lens_k (`torch.LongTensor`):
        Gets cumulative sequence length for key state.
    max_length_q (`int`):
        Maximum sequence length for query state.
    max_length_k (`int`):
        Maximum sequence length for key state.
    seq_idx (`torch.IntTensor`):
        Index of each packed sequence.
    cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kseq_idxN)	__name__
__module____qualname____doc__torch
LongTensor__annotations__int	IntTensor     Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/bamba/modular_bamba.pyr#   r#   :   s7      ######__r3   r#   F)totalc                   6    e Zd ZdZej
                  dfdefdZy)r   a  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    Nconfigc                 ,   |j                   | _         d| _        |j                  }|j                  }g | _        g | _        g | _        t        |j                        D ]*  }| j                   |   dk(  r| xj                  t        j                  ||j                  |j                  z  d|j                  z  |z  z   |||      gz  c_        | xj
                  t        j                  ||j                  |j                  |||      gz  c_        | xj                  t        j                   g g|z  |      gz  c_        | xj
                  t        j                   g g|z  |      gz  c_        | j                  j#                  |       - t        |j                        D cg c]  }t        j                   g g|z  |       c}| _        t        |j                        D cg c]  }t        j                   g g|z  |       c}| _        y c c}w c c}w )NFmamba   devicedtyper<   )layers_block_typehas_previous_statemamba_d_convmamba_d_stateconv_states
ssm_statestransformer_layersrangenum_hidden_layersr-   zerosmamba_expandhidden_sizemamba_n_groupsmamba_n_headsmamba_d_headtensorappend	key_cachevalue_cache)	selfr7   
batch_sizer=   r<   conv_kernel_sizessm_state_sizei_s	            r4   __init__z)HybridMambaAttentionDynamicCache.__init__a   s   !'!9!9"'!..--"$v//0 	2A%%a(G3  KK",,v/A/AAAH]H]D]`nDnn(%#%   KK",,++&%#	$ 	   U\\2$2CF%S$TT ELL"
1B6$R#SS''..q11	24 SXX^XpXpRqrQ%,,tj'8HrTYZ`ZrZrTstqELL"
):6Jt sts    "H!"H)r)   r*   r+   r,   r-   float16r!   rX   r2   r3   r4   r   r   S   s"     ?DmmTX $u{ $ur3   r   c                       e Zd Zy)BambaRotaryEmbeddingNr)   r*   r+   r2   r3   r4   r[   r[          r3   r[   c                 h   |j                  |      }|j                  |      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }	}||z  t        |      |z  z   }
||z  t        |      |z  z   }t        j                  |
|gd      }
t        j                  ||	gd      }|
|fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Removes the interleaving of cos and sin from GLM

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Ndim)	unsqueezeshaper   r-   cat)qkcossinunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds               r4   apply_rotary_pos_embrq      s    ( --
&C
--
&C 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr3   c                       e Zd Zy)BambaAttentionNr\   r2   r3   r4   rs   rs      r]   r3   rs   c                       e Zd Zy)BambaRMSNormGatedNr\   r2   r3   r4   ru   ru      r]   r3   ru   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  de	dz  dej                  dz  d	ej                  dz  d
ej                  dz  f
dZ	 	 	 dde	dz  dej                  dz  d	ej                  dz  fdZ	 	 	 	 dde	dz  dej                  dz  d	ej                  dz  d
ej                  dz  fdZ xZS )
BambaMixeruP  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    The are a few differences between this and Mamba2Mixer:
    - The variable use_precomputed_states is slightly different due to the hybrid cache structure
    - There's a few non-obvious bugs fixed with batching in the slow path that exist in main
    - Some extra variables that our layer doesn't need have been removed
    - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged
    r7   	layer_idxc           	         t         |           |j                  | _        |j                  | _        |j
                  | _        |j                  | _        t        |j                  | j                  z        | _        || _        |j                  | _        |j                  | _        t"        |j                     | _        |j&                  | _        |j*                  | _        |j.                  | _        |j2                  | _        |j6                  | _        |j:                  | _        |j<                  | _        |j>                  | _        | j                  d| j0                  z  | j                  z  z   | _         tC        jD                  | j@                  | j@                  |j                  | j                  | j@                  | j                  dz
        | _#        | j                  | j@                  z   | j                  z   }tC        jH                  | j                  || j(                        | _%        tC        jL                  tO        jP                  | j                              | _)        tO        jT                  d| j                  dz         }tC        jL                  tO        jV                  |            | _,        t[        | j                  | j,                        | _.        tC        jL                  tO        jP                  | j                              | _/        tC        jH                  | j                  | j                  | j(                        | _0        tc        d      }te        |dd       a3te        |dd       a4tc        d	      }tk        |d
      a6tk        |d      a7tk        |d      a8ts        tl        tn        tp        th        tf        f      a:tt        stv        jy                  d       y tv        jy                  d       y )Nr:   r    )in_channelsout_channelsbiaskernel_sizegroupspadding)r|   epszcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathz1ops.triton.ssd_combined.mamba_chunk_scan_combinedz8ops.triton.ssd_combined.mamba_split_conv1d_scan_combineda  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzDThe fast path for Bamba will be used when running the model on a GPU)=superrX   rL   	num_headsrJ   rB   rU   rA   rT   r0   rI   intermediate_sizerx   mamba_conv_biasuse_conv_bias
hidden_act
activationr   actmamba_proj_biasuse_biasrms_norm_epslayer_norm_epsilonrK   n_groupsrM   head_dimmamba_chunk_size
chunk_sizetime_step_limittime_step_mintime_step_maxconv_dimr   Conv1dconv1dLinearin_proj	Parameterr-   onesdt_biasarangelogA_logru   normDout_projr   getattrr   r   r   selective_state_updatemamba_chunk_scan_combined mamba_split_conv1d_scan_combinedallis_fast_path_availableloggerwarning_once)rR   r7   rx   projection_sizeAcausal_conv1d	mamba_ssm	__class__s          r4   rX   zBambaMixer.__init__   s   --!--$22 & 3 3!$V%8%84;K;K%K!L"#33 ++&++,.."("5"5--++ 11%55#11#11..T]]1BTEXEX1XXii''--==))A-
 004==@4>>Qyy
 ||EJJt~~$>? LLDNNQ./\\%))A,/
%d&<&<$BYBYZ	ejj89		$"8"8$:J:JQUQ^Q^_ )9&}6LdS"=2DdK %[1	!8$^"
 %<$W%
! ,C$^,
(
 "%&)0 $"
 &>  fgr3   Nhidden_statescache_paramscache_positionattention_maskr(   c                 P   t        ||      }| j                  |      }|j                  \  }}}	| j                  | j                  z  }
|d uxr} |j
                  xro |dk(  xrh |j                  | j                     j                  d   |j                  | j                     j                  d   cxk(  xr |k(  nc xr |d uxr |d   dkD  }|r|j                  d      j                  | j                  | j                  | j                  gd      \  }}}t        ||j                  | j                     | j                  j                   j                  d      | j                  j"                  | j$                        }t'        j                  || j                  |
|
gd      \  }}}t'        j(                  | j*                  j-                                }|d d d df   d d d d d f   j/                  d| j0                  | j                        j3                  t&        j4                        }|d d d d d f   j/                  dd| j0                        }| j6                  d d d df   j/                  d| j0                        }| j8                  d d d df   j/                  d| j0                        }|j;                  || j                  |j                  d   | j                  z        }|j;                  || j                  |j                  d   | j                  z        }|j;                  || j                  | j0                        }t=        |j                  | j                     ||||||d |d
      }|j;                  || j                  | j0                  z        }| j?                  ||      }| jA                  |      d d d df   }|S t'        j(                  | j*                  j-                                }| jB                  d	t-        d
      fk(  ri nd| jB                  i}| jD                  r|tG        || j                  j                   j                  d      | j                  j"                  | j6                  |f| j8                  | jH                  || j$                  | j>                  j                   | j>                  jJ                  | j@                  j                   | j@                  j"                  | j0                  | j                  ddd|}|S |j                  | j                  | j                  | j                  gd      \  }}}|v|jM                  dd      }tN        jP                  jS                  || jT                  |j                  d   z
  df      }|j                  | j                     jW                  |       | j$                  dvrH| jY                  | j                  |jM                  dd            dd |f   jM                  dd            }nqt[        |jM                  dd      | j                  j                   j                  d      | j                  j"                  | j$                  |      jM                  dd      }t        ||      }t'        j                  || j                  |
|
gd      \  }}}t]        |j;                  ||d| j0                        |||j;                  ||| j                  d      |j;                  ||| j                  d      f| jH                  | j8                  d |d| j6                  dd|\  }}|*|(|j                  | j                     jW                  |       |j;                  ||d      }| j?                  ||      }| jA                  |      }|S )Nr    r   r_   r`   .r=   T)zr   dt_softplusg        infdt_limitF)r   r   r(   r   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr:   )siluswish)xweightr|   r   r(   )r   r   r   r(   r   r   r   )/r   r   rc   r   rU   r@   rC   rx   rD   squeezesplitr   r   r   r   r   r   r|   r   r-   expr   floatexpandr   tofloat32r   r   viewr   r   r   r   trainingr   r   variance_epsilon	transposer   
functionalpadrT   copy_r   r   r   )rR   r   r   r   r   r(   projected_statesrS   seq_lenrW   groups_time_state_sizeuse_precomputed_statesgatehidden_states_B_CdtBCr   r   r   hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrC   scan_output	ssm_states                              r4   cuda_kernels_forwardzBambaMixer.cuda_kernels_forward&  s    5]NS<<6 "/!4!4
GQ!%1D1D!D $ &//&1& ((8>>qA&&t~~6<<Q?& d*& q!A% 	 "*:*B*B1*E*K*K''GR +L +'D#R
 !5!((8""**1-  ! #(++!'')?AWX#M1a 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az4==!''!*2MNAz4==!''!*2MNA%2%7%7
DNNTXTaTa%b"2''7& M *..z4>>DMM;YZM IImT:M --.q$|<C| 
w 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff####'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%l 
A /?.D.D++T]]DNNKQS /E /+'  + 4E3N3NqRS3T0"$--"3"34..1M1S1STV1WWYZ[#K !,,T^^<BB;O??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'?? ')  i1o & %AARTb$c!&+kk%++-CE[\'#q! *C!&&z7BNFF:wrBFF:wrB*  $ff#(, LL $* &*&Y" (\-E ++DNN;AA)L)..z7BG"iiT: mmK0
r3   c                    |j                   \  }}}|j                  }t        ||      }| j                  |      }	|	j	                  | j
                  | j                  | j                  gd      \  }
}}|d uxr} |j                  xro |dk(  xrh |j                  | j                     j                   d   |j                  | j                     j                   d   cxk(  xr |k(  nc xr |d uxr |d   dkD  }|rY|j                  | j                     j                  dd      |j                  | j                  <   |d d dd d f   j                  |j                  | j                     j                        |j                  | j                     d d d d df<   |j                  | j                     j                  | j                  j                   j                        }t#        j$                  || j                  j                   j'                  d      z  d      }| j(                  r|| j                  j*                  z   }| j-                  |      }n|v|j/                  dd      }t0        j2                  j5                  || j6                  |j                   d   z
  df      }|j                  | j                     j9                  |       | j-                  | j                  |j/                  dd            dd |f   j/                  dd            }t        ||      }t#        j                  || j
                  | j:                  | j<                  z  | j:                  | j<                  z  gd      \  }}}t#        j>                  | j@                  jC                                }|r|j                  | j                     j                  }|d d dd d f   d d d df   }|j/                  dd      jE                  ||j                   d   | jF                        }| jH                  d	   jE                  | jH                  j                   d   | jF                        }t"        j0                  j2                  jK                  ||j                  |j                        z         }t#        jL                  || jN                  d   | jN                  d         }|d
   jE                  | j                  | jF                  | j<                        j                  t"        jP                        }t#        j>                  |d	   |z        j                  |      }|jS                  || j:                  d      dd d d f   }|jE                  || j:                  | j                  | j:                  z  |j                   d         jU                         }|jS                  |d|j                   d         }|d	   |dd d d f   z  }|jS                  |d| jF                        }||d	   z  j                  |      }|j                  | j                     j9                  |j                  | j                     |z  |z          |jS                  || j:                  d      dd d d f   }|jE                  || j:                  | j                  | j:                  z  |j                   d         jU                         }|jS                  |d|j                   d         }|j                  | j                     j                  |j                  |j                        }|jW                  || j                  z  | jF                  | j<                        }|jW                  || j                  z  | j<                  d      }t#        jX                  ||      }|jW                  || j                  | jF                        }| jZ                  d	   jE                  | jZ                  j                   d   | jF                        }|||z  z   j                  |j                        }|jS                  |d      d d d df   }nt0        j2                  jK                  || jH                  z         }t#        jL                  || jN                  d   | jN                  d         }|jS                  ||d| jF                        jC                         }|jS                  ||d| j<                        jC                         }|jS                  ||d| j<                        jC                         }|j]                  | j                  | j:                  z  d| j                        }|j]                  | j                  | j:                  z  d| j                        }| j^                  || j^                  z  z
  | j^                  z  }| jZ                  d	   ta        ||      z  }||d	   z  }|j                  |j                        |z  }||||fD  cg c]  } tc        | || j^                         c} \  }}}}|je                  dddd      }t#        jf                  |d      }!t#        j>                  ti        |            }"|d d d d d d d d d d d f   |d d d d d d d d d d d f   z  }#|#j%                  d      }$|$d	   |"je                  ddddd      d	   z  }%|%j%                  d      }&|&d	   |d d d d d f   z  j%                  d      }'t#        j>                  |!d d d d d d dd f   |!z
        }(||(je                  dddd      d	   z  })|)dd d d f   |d	   z  j%                  d      }*|r<|j                  | j                     d d d df   j                  |*j                        }+nt#        jj                  |*d d d df         }+t#        jl                  |+|*gd      }*t#        j>                  ti        t0        j2                  j5                  |!d d d d d d df   d                  },|,j/                  dd      },|,d
   |*d d d d d df   z  j%                  d      }-|-d d d df   |-d d df   }.}*t#        j>                  |!      }/|dd d d f   |*d d d d d df   z  }0|/je                  dddd      }1|0j%                  d      |1d	   z  }2|'|2z   }|jS                  |d| j                  | jF                        }||z   }|dkD  r|d d d |d d d d f   }|jS                  ||d      }|.*|(|j                  | j                     j9                  |.       | jo                  ||
      }3| jq                  |3j                  |            }4|4S c c} w )Nr_   r`   r    r   )shiftsdimsr>   r:   .).N).NNr   r;   )ra   output_sizer      )r    r   )9rc   r=   r   r   r   r   r   r   r@   rC   rx   rD   rollr   r<   r   r   r-   sumr   r   r|   r   r   r   r   r   rT   r   r   rU   r   r   r   r   r   r   softplusclampr   r   reshape
contiguousr   bmmr   repeat_interleaver   r   r   permutecumsumr   
zeros_likerd   r   r   )5rR   input_statesr   r   r   rS   r   rW   r=   r   r   r   r   r   rC   r   r   r   r   r   cache_devicer   dAdBdBxrD   ssm_states_reshaped
C_reshapedyr   pad_size
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr   state_decay_outC_times_statesstate_decay_out_permutedY_offr   contextualized_statess5                                                        r4   torch_forwardzBambaMixer.torch_forward  sU    ".!3!3
GQ"" 4L.Q<<5&6&<&<''GR '= '
#
 $ &//&1& ((8>>qA&&t~~6<<Q?& d*& q!A% 	 "7C7O7OPTP^P^7_7d7dlnuw7d7xL$$T^^4ARSTVWYZSZA[A^A^_k_w_wx|  yG  yG  `H  `O  `O  BPL$$T^^4Q2X> '224>>BEET[[M_M_MfMfEgK %		dkk0088;;! !!$58H8H$H! $): ; '/@/J/J1a/P, mm//043H3HKgKmKmnpKq3qst2u ((8>>{K $5F5P5PQRTU5V)WX[]e^e]eXe)f)p)pqrtu)v w89JN[#kk##T]]T5H5H%H$--Z^ZmZmJmn
q! YYtzz'')**!'224>>BIIL Aq!GQc\*Ba#**:rxx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC ##DNN399''7"<sB 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CC188[\[b[bCcJ",//*t~~2Mt}}^b^q^q"r
T^^ ;T=P=PRSTJ		-z:Az4>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''T\\(9:BR!5!5a!8$:N:Nq:QRB)11*gr4==Y__aM		*gr43F3FGMMOA		*gr43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'DOO*CCtVH	*-?x-XXJ *ByM9M](()B.A cpqrtuwxay%z\]&9!Xt&W%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99XaArsl%;h%FGL,..q"b!<YGGGc4l+mI.FFKKPQKRF &"."9"9$.."I!TSV,"W"Z"Zbhbobo"Z"p"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*gr2A $)A''7==iHii4(
 !%knnU.C D$$G &{s   v	c                    t         rKd| j                  j                  j                  j                  v rt               s| j                  |||||      S |t        d      |j                  }|B|j                  d   dkD  r0|j                  d   dkD  r||d d d d d f   z  j                  |      }| j                  ||||      S )Ncudaz\`seq_idx` support requires fast path support. Please install `mamba_ssm` and `causal_conv1d`r    r   )r   r   r   r<   typer   r   NotImplementedErrorr=   rc   r   r  )rR   r   r   r   r   r(   kwargsr=   s           r4   forwardzBambaMixer.forward  s     "f0C0C0J0J0O0O&OXpXr,,]L.Zhjqrr%n  ##%.*>*>q*AA*E.J^J^_`JadeJe*^Aq$J-GGKKERM!!-~~^^r3   )NNNN)NNN)r)   r*   r+   r,   r!   r0   rX   r-   Tensorr   r.   r1   r   r  r  __classcell__r   s   @r4   rw   rw      sF   Zh{ Zhs Zh~ AE26.2*.g||g 7=g ((4/	g
 t+g 4'gZ AE26.2L% 7=L% ((4/	L%
 t+L%d AE26.2*._ 7=_ ((4/	_
 t+_ 4'_r3   rw   c                       e Zd Zy)BambaMLPNr\   r2   r3   r4   r   r     r]   r3   r   c                       e Zd Zy)BambaRMSNormNr\   r2   r3   r4   r"  r"    r]   r3   r"  c                   v    e Zd Zddededef fdZ	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  d	e
dz  d
edz  dedz  dej                  dz  deej                  ej                  f   dz  dee   deej                  eej                  ej                  f   dz  f   fdZ xZS )BambaDecoderLayerr7   rx   
layer_typec                     t         |   ||       | `d}|dk(  rt        nd } ||      | _        || _        |dk(  rt        ||      | _        y |dk(  rt        ||      | _        y t        d      )Nr    r9   )r7   rx   	attentionzInvalid layer_type)
r   rX   	self_attnr   feed_forwardr%  rw   r9   rs   
ValueError)rR   r7   rx   r%  num_expertsffn_layer_classr   s         r4   rX   zBambaDecoderLayer.__init__  sv    +N&1Q&6(D+F3$ #6YGDJ;&+FI>DN122r3   Nr   r   position_idspast_key_valuesoutput_attentions	use_cacher   position_embeddingsr  returnc	                 J   |}
| j                  |      }| j                  dk(  r | j                  d||||d|	}d}n-| j                  dk(  r | j                  d||||||||d|	\  }}|
|z   }|}
| j	                  |      }| j                  |      }|
|z   }|f}|r|fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `BambaFlashAttentionKwargs` for
                padding-free training and/or improve torch.compile performance.
        r9   )r   r   r   r   Nr'  )r   r   r-  r.  r/  r0  r   r1  r2   )input_layernormr%  r9   r(  pre_ff_layernormr)  )rR   r   r   r-  r.  r/  r0  r   r1  r  residualself_attn_weightsoutputss                r4   r  zBambaDecoderLayer.forward  s    D !,,]; ??g%&DJJ +,--	
 M !%__+/=t~~ 
0+-) /"3#-$7
0 
0,M, !=0 !--m<))-8 =0 ")++Gr3   )r9   )NNNFFNN)r)   r*   r+   r!   r0   strrX   r-   r  r.   r   booltupler   r#   FloatTensorr  r  r  s   @r4   r$  r$    s   3{ 3s 3 3( /304CG).!&26HLK||K t+K &&-	K
 :D@K  $;K $;K ((4/K #5<<#=>EK 23K 
u  %(9(95;L;L(L"MPT"TT	UKr3   r$  c                   p     e Zd ZU eed<   dZdZdgZdZdZ	dZ
dZ ej                          fd       Z xZS )BambaPreTrainedModelr7   modelTr$  r.  c           
      j   t         |   |       t        |t              rt	        j
                  |j                         t	        j                  |j                  t        j                  t        j                  d|j                  dz                      t	        j
                  |j                         y y )Nr    )r   _init_weights
isinstancerw   initones_r   r   r   r-   r   r   r   r   )rR   moduler   s     r4   rA  z"BambaPreTrainedModel._init_weights*  sq    f%fj)JJv~~&JJv||UYYu||Av?O?ORS?S/T%UVJJvxx  *r3   )r)   r*   r+   r!   r/   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr-   no_gradrA  r  r  s   @r4   r>  r>    sN    &*#,-"3NLU]]_! !r3   r>  c                   &    e Zd Zdef fdZee	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  dej                  dz  d	edz  d
edz  dedz  dej                  dz  dee   defd              Zd Z xZS )
BambaModelr7   c           	      Z   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        g }t        |j                        D ],  }|j                  t        |||j                  |                . t        j                  |      | _        |j                   | _        t#        |j                  |j$                        | _        t)        |      | _        d| _        | j/                          y )N)rx   r%  r   )r7   F)r   rX   pad_token_idpadding_idx
vocab_sizer   	EmbeddingrJ   embed_tokensrF   rG   rO   r$  r?   
ModuleListlayers_attn_implementationr"  r   final_layernormr[   
rotary_embgradient_checkpointing	post_init)rR   r7   decoder_layersrV   r   s       r4   rX   zBambaModel.__init__5  s     !.. ++LL):):F<N<NPTP`P`av//0 	rA!!"3FaTZTlTlmnTo"pq	rmmN3$*$?$?!+F,>,>FDWDWX.f=&+#r3   N	input_idsr   r-  r.  inputs_embedsr0  r/  output_hidden_statesr   r  r2  c
                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|}|r|t        j                  d       |	.t        j                  |j                  d   |j                        }	||	j                  d      }t        | j                   |||	||      }| j!                  ||	      }| j#                  ||	      }|rd
nd }|rd
nd }| j$                  D ]E  }|j&                  dk(  r|n|}|r||fz  } ||f||||||	|d|
}|d   }|s7|d   =||d   fz  }G | j)                  |      }|r||fz  }|r|j*                  sd|_        |sd n|}t-        ||||      S )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzBamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was provided, so no cache will be returned.r    r>   r   )r7   r_  r   r   r.  r-  )r-  r2   r9   )r   r-  r.  r/  r0  r   r1  T)last_hidden_stater.  r   
attentions)r7   r/  r`  r0  r*  r[  r   r   r   rU  r-   r   rc   r<   rb   r   _update_mamba_maskrZ  rW  r%  rY  r@   r   )rR   r^  r   r-  r.  r_  r0  r/  r`  r   r  r   causal_mask
mamba_maskr1  all_hidden_statesall_self_attnsdecoder_layer
layer_masklayer_outputs
next_caches                        r4   r  zBambaModel.forwardH  sX    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%0:
 !"\\-*=*=a*@I]I]^N)33A6L(;;'))+%
 ,,^^L
"oom,oW"6BD0d![[ 	:M'4'?'?7'JP[J#!m%55!)
)) /"3#-$7
 
M *!,M  #/"}Q'7&99N1	:4 ,,];  -!11?#E#E15O.!*T
&+&+%	
 	
r3   c                 R    |}|d   dkD  s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        r   Nr    )r-   r   )rR   r   r   rf  s       r4   rd  zBambaModel._update_mamba_mask  s7     $
!q ^%?EIIn`aNaDbJr3   )	NNNNNNNNN)r)   r*   r+   r!   rX   r   r   r-   r.   r  r   r<  r:  r   r#   r   r  rd  r  r  s   @r4   rO  rO  3  s   { &  .2.204CG26!%)-,026c
##d*c
 t+c
 &&-	c

 :D@c
 ((4/c
 $;c
  $;c
 #Tkc
 ((4/c
 23c
 
!c
  c
J	r3   rO  c                   V    e Zd Z fdZ	 	 	 	 	 	 	 	 	 	 	 ddej
                  dz  dej                  dz  dej
                  dz  dedz  dej                  dz  dej
                  dz  d	e	dz  d
e	dz  de	dz  dej
                  dz  de
ej                  z  defdZ	 	 	 	 	 	 	 d fd	Z xZS )BambaForCausalLMc                 f    t         |   |       |j                  | _        | j                          y )N)r   rX   z_loss_coefficientr\  )rR   r7   r   s     r4   rX   zBambaForCausalLM.__init__  s*     "(";"; 	r3   Nr^  r   r-  r.  r_  labelsr0  r/  r`  r   logits_to_keepr2  c                    ||n| j                   j                  }|	|	n| j                   j                  }	 | j                  d
||||||||	|
d	|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}| | j                  d
||| j                   j                  d|}| j                  dkD  r[|j                  d      j                  |j                        j                  d      j!                         }|| j                  |z  z   }t#        |||j$                  |j&                  |j(                  	      S )aJ  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, BambaForCausalLM

        >>> model = BambaForCausalLM.from_pretrained("...")
        >>> tokenizer = AutoTokenizer.from_pretrained("...")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)	r^  r   r-  r.  r_  r0  r/  r`  r   )logitsrr  rS  r   r_   r`   r   r:   )lossru  r.  r   rc  r2   )r7   r/  r`  r?  rb  rB  r0   slicelm_headloss_functionrS  rq  	logsumexpr   r=   powmeanr   r.  r   rc  )rR   r^  r   r-  r.  r_  rr  r0  r/  r`  r   rs  r  r8  r   slice_indicesru  rv  z_losss                      r4   r  zBambaForCausalLM.forward  sw   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,64:: ,
)%+'/!5),
 ,
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD&&*))b)1444::4FJJ1MRRTd55>>%#33!//))
 	
r3   c	                     |:t        | j                  |j                  d   | j                  | j                        }| j                  j
                  |	d<   t        |   |f|||||||d|	}
|
S )Nr   r>   rs  )r.  r   r_  r   r-  r0  is_first_iteration)r   r7   rc   r=   r<   num_logits_to_keepr   prepare_inputs_for_generation)rR   r^  r.  r   r_  r   r-  r0  r  r  model_inputsr   s              r4   r  z.BambaForCausalLM.prepare_inputs_for_generation  s     ">Y__Q/DKKO $(;;#A#A w<

+)')%1

 

 r3   )NNNNNNNNNNr   )NNNNNTF)r)   r*   r+   rX   r-   r.   r  r   r<  r:  r0   r   r  r  r  r  s   @r4   ro  ro    s/    .2.204CG26*.!%)-,026-.K
##d*K
 t+K
 &&-	K

 :D@K
 ((4/K
   4'K
 $;K
  $;K
 #TkK
 ((4/K
 ell*K
 
 K
`     r3   ro  )rO  ro  r>  )r    )?r,   typingr   r-   r   transformers.activationsr   (transformers.models.jamba.modeling_jambar   r   (transformers.models.llama.modeling_llamar   r	   r
   r   r   r   *transformers.models.mamba2.modeling_mamba2r   r   r   r   r    r   rC  integrations.hub_kernelsr   masking_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.import_utilsr   configuration_bambar!   
get_loggerr)   r   r#   r[   rq   rs   ru   Modulerw   r   r"  r$  r>  rO  ro  __all__r2   r3   r4   <module>r     s;  &     + q   ' 8 / O - & X X 9 , 
		H	%	 22u'G 2uj	/ 	
#L	^ 		) 	
w_ w_t	x 		< 	]2 ]@ !? ! !( D% D DNu' up Er3   