
    qi                        d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZmZm Z m!Z! ddl"m#Z#  ejH                  e%      Z& ed      rddl'm(Z( ndZ( e       rddl)m*Z* ndZ* G d d      Z+ G d dejX                        Z- G d dejX                        Z. G d de      Z/e G d de             Z0e ed !       G d" d#e                    Z1e ed$!       G d% d&e                    Z2e G d' d(e0             Z3 ed)!       G d* d+e0e             Z4g d,Z5y)-zPyTorch MAMBA model.    N)	dataclass)Any)nn)CrossEntropyLoss   )initialization)ACT2FN)PreTrainedConfig)GenerationMixin)lazy_load_kernel)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging)is_mambapy_availableis_torch_greater_or_equal
is_tracingresolve_internal_import   )MambaConfigz2.9.0)associative_scan)pscanc            
           e Zd ZdZdZej                  dfdededej                  dej                  ez  dz  fdZd	ed
ej                  dej                  dej                  fdZd	edej                  fdZd Zy)
MambaCachea.  
    Cache for mamba model which does not have attention mechanism and key value states.

    Arguments:
        config (`PreTrainedConfig):
            The configuration file defining the shape-related attributes required to initialize the static cache.
        max_batch_size (`int`):
            The maximum batch size with which the model will be used. Note that a new instance must be instantiated if
            a smaller batch size is used.
        dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
            The default `dtype` to use when initializing the layer.
        device (`torch.device` or `str`, *optional*):
            The device on which the cache should be initialized. Should be the same as the layer.

    Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MambaForCausalLM, MambaCache

        >>> model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")

        >>> inputs = tokenizer(text="My name is Mamba", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
        >>> cache_params = MambaCache(config=model.config, max_batch_size=1, device=model.device, dtype=model.dtype)
        >>> cache_position = torch.arange(len(inputs["input_ids"][0]), device=model.device)  # sequence length
        >>> outputs = model(**inputs, cache_params=cache_params, cache_position=cache_position, use_cache=True)
        >>> outputs.cache_params
        ```
    TNconfigmax_batch_sizedtypedevicec                    || _         || _        |j                  | _        |j                  | _        |j
                  | _        g | _        g | _        |t        j                  |      nd }t        |j                        D ]  }t        j                  | j                   | j                  | j                  || j                        }t        j                  | j                   | j                  | j                  || j                        }t        j                  j                  |       t        j                  j                  |       | j                  j!                  |       | j                  j!                  |        y )Nr   r   )r   _dtypeintermediate_size
state_sizessm_state_sizeconv_kernelconv_kernel_sizeconv_states
ssm_statestorchr   rangenum_hidden_layerszeros_dynamomark_static_addressappend)selfr   r   r   r   _
conv_state	ssm_states           Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/mamba/modeling_mamba.py__init__zMambaCache.__init__`   s/    -!'!9!9$// & 2 2/1.0)/);f%v//0 	.A',{{##&&%%kk(J ',kk##&&##kk'I MM--j9MM--i8##J/OO""9-'	.    	layer_idxnew_conv_statecache_positionreturnc                 "   | j                   |   j                  |j                  k7  r5| j                   |   j                  |j                        | j                   |<   | j                   |   }|j                  d| j                  dz
        }|j                  dd      }|j                  |j                  |j                        |d d d d |f<   | j                   |   j                          | j                   |xx   |z  cc<   | j                   |   S )Nr   r   )shiftsdimsr!   )r(   r   toclampr'   rollr   zero_)r1   r8   r9   r:   r3   s        r5   update_conv_statezMambaCache.update_conv_state   s    
 I&--1F1FF*.*:*:9*E*H*HI^I^*_DY'%%i0
'--a1F1F1JK__BR_8
+9+<+<JDUDU]g]m]m+<+n
1a'(#))+#z1#	**r7   new_ssm_statec                     | j                   |   j                          | j                   |xx   |j                  | j                   |   j                        z  cc<   | j                   |   S N)r)   rC   r@   r   )r1   r8   rE   s      r5   update_ssm_statezMambaCache.update_ssm_state   sT    	"((*	"m&6&6ty7Q7X7X&YY"y))r7   c                     t        t        | j                              D ]<  }| j                  |   j                          | j                  |   j                          > y rG   )r+   lenr(   rC   r)   )r1   r8   s     r5   resetzMambaCache.reset   sM    s4#3#345 	/IY'--/OOI&,,.	/r7   )__name__
__module____qualname____doc__is_compileabler*   float16r
   intr   r   strr6   Tensor
LongTensorrD   rH   rK    r7   r5   r   r   ;   s    B N #]],0#. #. #. {{	#.
 s"T)#.J++.3ll+LQL\L\+	+"*# *ell *
/r7   r   c            
       z    e Zd ZdZddededef fdZ ej                         d        Z
d Z	 	 	 dd	ej                  d
edz  dej                  dz  dej                  dz  fdZdd
edz  dej                  dz  dej                  dz  fdZ	 	 	 dd
edz  dej                  dz  dej                  dz  fdZ xZS )
MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    r   r8   initialize_mixer_weightsc           	         t         |           || _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        |j                        | _
        || _        |j                  | _        t        j                  | j                  | j                  |j                  |j                  | j                  |j                  dz
        | _        |j                   | _        t$        |j                      | _        |j(                  | _        |j*                  | _        t        j,                  | j                  | j                  dz  |j.                        | _        t        j,                  | j                  | j                  | j
                  dz  z   d      | _        t        j,                  | j                  | j                  d      | _        t        j6                  t9        j:                  | j                  | j
                              | _        t        j6                  t9        j:                  | j                              | _        |r=| j4                  j@                  jB                  jD                  dk7  r| jG                          t        j,                  | j                  | j                  |j.                        | _$        |j.                  | _        tK        d      a&tO        tL        d	d       a(tO        tL        d
d       a)tK        d      a*tW        tT        d      a,tO        tT        dd       a-tO        tT        dd       a.| j_                          y )Nr   )in_channelsout_channelsbiaskernel_sizegroupspadding   r]   FTmetazcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathselective_scan_fnmamba_inner_fn)0superr6   r   hidden_sizer$   r%   r&   r'   r#   rR   time_step_rankr8   use_conv_biasr   Conv1dconv1d
hidden_act
activationr	   actuse_mambapyuse_associative_scanLinearuse_biasin_projx_projdt_proj	Parameterr*   emptyA_logDweightr   typeinit_mamba_weightsout_projr   causal_conv1dgetattrrd   re   	mamba_ssmr   selective_state_updaterg   rh   warn_slow_implementation)r1   r   r8   rY   	__class__s       r5   r6   zMambaMixer.__init__   s   !--$// & 2 2!'!9!9!&"7"78"#11ii..//%%**))&&*
 !++&++,!--$*$?$?! yy!1!143I3IA3MTZTcTcdii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX \\%++d.D.DdFYFY"Z[
ekk$*@*@AB#(;(;(B(B(G(G6(Q##%		$"8"8$:J:JQWQ`Q`a )9&}6LdS"=2DdK %[1	!8$^"
 $I/BDI ,<dC%%'r7   c                    t        j                  d| j                  dz   t         j                  | j                  j
                        d d d f   }|j                  | j                  d      j                         }t        j                  | j                  t        j                  |             t        j                  | j                         | j                  j                  dz  | j                  j                   z  }| j                  j"                  dk(  r+t        j$                  | j&                  j(                  |       nE| j                  j"                  dk(  r,t        j*                  | j&                  j(                  | |       t        j,                  t        j.                  | j                  | j&                  j0                  j
                  t         j                        t3        j                  | j                  j4                        t3        j                  | j                  j6                        z
  z  t3        j                  | j                  j6                        z         j9                  | j                  j:                        }|t        j                  t        j<                  |              z   }t        j                  | j&                  j0                  |       y )	Nr   )r   r   r=   g      constantrandomr!   )min)r*   aranger%   float32r{   r   expandr#   
contiguousinitcopy_logones_r|   r   rk   time_step_scaletime_step_init_scheme	constant_rx   r}   uniform_exprandr]   mathtime_step_maxtime_step_minrA   time_step_floorexpm1)r1   Adt_init_stddtinv_dts        r5   r   zMambaMixer.init_mamba_weights   s   LLD//!35==QUQ[Q[QbQbcdhjkdklHHT++R0;;=

4::uyy|,

466kk00$69T9TT;;,,
:NN4<<..<[[..(:MM$,,--|[IYYJJt--dll6G6G6N6NV[VcVcdxx112TXXdkk>W>W5XXZhht{{0012
 %DKK//%
0	 	 eiibS!1 122

4<<$$f-r7   c                     t        t        t        t        t        t
        f      }|sM| j                  r+t               rt        j                  d       y t        d      t        j                  d       y y )Na  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1dzuse_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py.a  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.)allr   rg   re   rd   rh   rr   r   loggerwarning_onceImportError)r1   is_fast_path_availables     r5   r   z#MambaMixer.warn_slow_implementation   sw    !$#%68HJ^`no"
 &')''S & Z  ##W &r7   Nhidden_statescache_paramsr:   attention_maskc                 	   | j                  |      j                  dd      }| j                  r%|"t        || j                  j
                  | j                  r| j                  j                  nd | j                  j
                  | j                  j
                  | j                  j
                  | j                  r$| j                  j                  j                         nd t        j                  | j                  j                                d d | j                   j                         | j                  j                  j                         d      }|S |j#                  dd      \  }}|||j%                  d      z  }| j                  j
                  j'                  | j                  j
                  j)                  d      | j                  j
                  j)                  d            }|m|d   dkD  ret+        |j-                  d      |j.                  | j0                     || j                  j                  | j2                        }|j%                  d      }n|Yt4        j6                  j9                  || j:                  |j<                  d   z
  df      }	|j?                  | j0                  |	|       tA        ||| j                  j                  | j2                        }|||j%                  d      z  }| j                  |j                  dd            }
t        jB                  |
| jD                  | jF                  | jF                  gd      \  }}}| j                  j
                  |j                  dd      z  }t        j                  | j                  j                                }tI        | j                  d	      r$| j                  j                  j                         nd }|e|d   dkD  r]tK        |jL                  | j0                     |d
   |d
   ||d d df   |d d df   | j                   |d
   |d
      j%                  d      }nptO        ||||j                  dd      |j                  dd      | j                   j                         ||dd
      \  }}|||jQ                  | j0                  |       | j                  |j                  dd            }|S )Nr   ra   T)
delta_biasdelta_softplusdimr   r=   )rp   r]   ).r   )dt_softplus)r   return_last_state))rv   	transposetrainingrh   rn   r}   rl   r]   rw   rx   r   ru   floatr*   r   r{   r|   chunk	unsqueezeviewsizerd   squeezer(   r8   rp   r   
functionalpadr'   shaperD   re   splitrk   r%   hasattrr   r)   rg   rH   )r1   r   r   r:   r   projected_statescontextualized_statesgateconv_weightsr(   ssm_parameters	time_stepBCdiscrete_time_stepr   time_proj_biasscan_outputsr4   s                      r5   cuda_kernels_forwardzMambaMixer.cuda_kernels_forward  s7     <<6@@AF==\1$2 ""$($6$6  D""##$$.2mm""((*4::++-..<<,,224#%!p %$O #3"8"8"8"BM4) -0H0H0K K  ;;--224;;3E3E3J3J13Mt{{OaOaOfOfghOijL'N1,=,A 4!))"- ,,T^^< KK$$OO! !. 7 7 ;+"$--"3"3%(=(=@S@STV@W(WYZ'[#K !224>>;P^_ 0!<1A1Adoo! ) -0H0H0K K "[[)@)@A)FGN#kk!4!4d6I6I4K^K^ _egOIq! "&!4!4y7J7J1a7P!P4::++-..A:A$,,PV:WT\\..446]aN'N1,=,A5 ++DNN;!&)&v.adGadGFFL" $  )B-  +<!&KK1%KK1%FFLLN"#'&*+'i (\-E 11$..)L %)MM,2H2HA2N$O!$$r7   c           	         |j                   \  }}}|j                  }| j                  |      j                  dd      }	|	j	                  dd      \  }
}||
|j                  d      z  }
||j                  | j                     j                         }|j                  |
j                        }|j                   d   | j                  k(  rt        j                  j                  |
| j                  |
j                   d   z
  df      }|j                  | j                  ||       | j!                  | j#                  |
      dd |f         }
n9|j                  | j                  |
|      }|j                  | j"                  j$                  j                        }t'        j(                  || j"                  j$                  d d dd d f   z  d      }
| j*                  r|
| j"                  j,                  z  }
| j!                  |
      j                  |      j                  d      }
n`t'        j.                  || j0                  | j2                  f|
j                  |      }| j!                  | j#                  |
      dd |f         }
||
|j                  d      z  }
| j5                  |
j                  dd            }t'        j6                  || j8                  | j2                  | j2                  gd      \  }}}| j;                  |      }t        j                  j=                  |      j                  dd      }t'        j>                  | j@                  jC                                }t'        j>                  |d d d d d d f   |d d d d d d d f   z        }|d d d d d d d f   |d d d d d d d f   jC                         z  }||
d d d d d d d f   jC                         z  }| jD                  r| jF                  r|tI        |j                  dd      |j                  dd            }||j                  d      z  jK                  d      j                  dd      }||
| jL                  d d d d f   z  z   }|| j!                  |      z  }n| jN                  rtP        tS        |
      r|d	 }|j                  jT                  d
v rdnd}tQ        |||fd|      \  }}t'        jV                  |jY                  dddd      j                  |      |j                  d            jK                  d      jY                  ddd      }|d d d d dd d f   }ng }t[        |      D ]}  }|d d d d |d d f   |z  |d d d d |d d f   z   }t'        jV                  |j                  |      |d d |d d f   j                  d            }|j]                  |d d d d df           t'        j^                  |d      }||
| jL                  d d d d f   z  z   }|| j!                  |      z  }|(|j                  | j                     ja                  |       | jc                  |j                  dd            }|S )Nr   ra   r   r   r=   .r!   r   c                 0    | \  }}|\  }}||z  ||z  |z   fS rG   rV   )leftrighta_leftb_lefta_rightb_rights         r5   
combine_fnz+MambaMixer.slow_forward.<locals>.combine_fn  s/    %)NFF',$GW"W,g.>.HIIr7   )cudaxpu	pointwisegeneric)r   combine_mode)2r   r   rv   r   r   r   r)   r8   cloner@   r   r'   r   r   r   rD   rq   rn   r}   r*   sumrl   r]   r-   r#   r%   rw   r   rk   rx   softplusr   r{   r   rr   r   r   r   r|   rs   r   r   r~   matmulpermuter+   r0   stackr   r   )r1   input_statesr   r:   r   
batch_sizeseq_lenr2   r   r   r   r   r4   r3   r   r   r   r   r   r   
discrete_A
discrete_BdeltaB_uhsscan_outputr   r   all_hr   ir   s                                  r5   slow_forwardzMambaMixer.slow_forwardu  s   !-!3!3
GQ""<<5??1E.44QA4>t%)N,D,DQ,GGM #$//?EEGI!]%9%9:I ##A&$*?*??]]..!**]-@-@-DDaH

 ..t~~z>Z $])CC'M)R S);;DNNM[ij
']]4;;+=+=+D+DE
 %		*t{{7I7I!QPQ'7R*RXZ [%%!T[[%5%55M $ 7 : :5 A K KB OT33T5H5HI$++5I !HHT[[%?XgX%NOM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a "\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DD ,2Fz++Aq183E3Ea3KLBB/88;EEaKK%tQ}8M(MMK%6K ((-=-IjYfNglx  mAJ
 /9.?.?.D.D.W{]f+JX8NTUdpq5#ll5==Aq!+D+G+G+NPQP[P[\^P_`hhiklttuvxy{|}!!QA+.	  "w >A *1aA: 6 BXaQRTUWXjEY YI"',,y||E/BAaAgJDXDXY[D\"]K ''Aq!G(<=> $kk,B?%a9N)NOK&$7K'''7==iH !%k.C.CAq.I J$$r7   c                    t        t        t        t        t        t
        f      }|rKd| j                  j                  j                  j                  v rt        |      s| j                  ||||      S | j                  ||||      S )Nr   )r   r   rg   re   rd   rh   rw   r}   r   r~   r   r   r   )r1   r   r   r:   r   r   s         r5   forwardzMambaMixer.forward  s{     "%#%68HJ^`no"
 "f0B0B0I0I0N0N&NWaboWp,,]L.Zhii  nn]]r7   )TNNN)rL   rM   rN   rO   r   rR   boolr6   r*   no_gradr   r   rT   r   rU   r   r   r   __classcell__r   s   @r5   rX   rX      s?   6({ 6(s 6(VZ 6(p U]]_. .*4 +/2626c%||c% !4'c% ((4/	c%
 ((4/c%L^%zD7H ^%^c^n^nqu^u ^%  MR  M]  M]  `d  Md ^%H +/2626^ !4'^ ((4/	^
 ((4/^r7   rX   c                   ,     e Zd Zd fd	Zd Zd Z xZS )MambaRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zL
        MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        N)ri   r6   r   ry   r*   onesr}   variance_epsilon)r1   rj   epsr   s      r5   r6   zMambaRMSNorm.__init__  s1     	ll5::k#:; #r7   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nra   r=   T)keepdim)	r   r@   r*   r   powmeanrsqrtr   r}   )r1   r   input_dtypevariances       r5   r   zMambaRMSNorm.forward  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r7   c                 R    | j                   j                  d    d| j                   S )Nr   z, eps=)r}   r   r   r1   s    r5   
extra_reprzMambaRMSNorm.extra_repr  s*    ++##A&'vd.C.C-DEEr7   )gư>)rL   rM   rN   r6   r   r   r   r   s   @r5   r   r     s    $;Fr7   r   c                   t     e Zd Z fdZ	 	 	 ddedz  dej                  dz  dej                  dz  fdZ xZS )
MambaBlockc                     t         |           || _        || _        |j                  | _        t        |j                  |j                        | _        t        ||d      | _
        y )Nr   F)r8   rY   )ri   r6   r   r8   residual_in_fp32r   rj   layer_norm_epsilonnormrX   mixer)r1   r   r8   r   s      r5   r6   zMambaBlock.__init__  sU    " & 7 7 !3!39R9RS	)V[\
r7   Nr   r:   r   c                    |}| j                  |j                  | j                   j                  j                              }| j                  r|j                  t
        j                        }| j                  ||||      }||z   }|S )N)r   r   r:   r   )r  r@   r}   r   r  r*   r   r  )r1   r   r   r:   r   residuals         r5   r   zMambaBlock.forward  s     !		-"2"29I9I9O9O"2"PQ  {{5==1H

^dr # 
 !=0r7   r   )	rL   rM   rN   r6   r   r*   rU   r   r   r   s   @r5   r  r    sR    ] +/2626 !4' ((4/	
 ((4/r7   r  c                   Z    e Zd ZU eed<   dZddgZdZdZ e	j                         d        Zy)MambaPreTrainedModelr   backboner  rX   Tc                 @   | j                   j                  }t        |t              r#|j	                          t        j                  |j                  j                  t        j                  d             |j                  j                  )t        j                  |j                  j                         t        j                  |j                  j                  t        j                  d             | j                   j                  rB|j                  j                  }|t        j                  | j                   j                        z  }t        |t         j"                        rNt        j$                  |j                  |       |j                   t        j                  |j                         yyt        |t&              r t        j(                  |j                         yt        |t         j*                        r"t        j$                  |j                  |       yy)zInitialize the weights.   )aN)std)r   initializer_range
isinstancerX   r   r   kaiming_uniform_rn   r}   r   sqrtr]   zeros_r   rescale_prenorm_residualr,   r   rt   normal_r   r   	Embedding)r1   moduler  ps       r5   _init_weightsz"MambaPreTrainedModel._init_weights  sN    kk++fj) %%'!!&--"6"6$))A,G}}!!-FMM../!!&//"8"8DIIaLI{{33 OO**TYYt{{<<==fbii(LLC0{{&FKK( '-JJv}}%-LLC0 .r7   N)rL   rM   rN   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr*   r   r  rV   r7   r5   r  r    s>    "%|4&*#LU]]_"1 "1r7   r  z,
    Class for the MAMBA model outputs.
    )custom_introc                   |    e Zd ZU dZdZej                  dz  ed<   dZe	dz  ed<   dZ
eej                     dz  ed<   y)MambaOutputa9  
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlast_hidden_stater   r   )rL   rM   rN   rO   r&  r*   FloatTensorr  r   r   r   tuplerV   r7   r5   r%  r%  C  sH     37u((4/6&*L*t#*59M5**+d29r7   r%  zK
    Base class for causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   y)MambaCausalLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlosslogitsr   r   )rL   rM   rN   rO   r+  r*   r'  r  r,  r   r   r   r(  rV   r7   r5   r*  r*  W  s\    
 &*D%

d
")'+FE$+&*L*t#*59M5**+d29r7   r*  c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  de
dz  d	edz  d
edz  dedz  dej                  dz  dej                  dz  deez  fd       Z xZS )
MambaModelc           	         t         |   |       t        j                  |j                  |j
                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _        t        |j
                  |j                        | _        | j!                  | j"                         | j%                          y c c}w )N)r8   Fr  )ri   r6   r   r  
vocab_sizerj   
embeddings
ModuleListr+   r,   r  layersgradient_checkpointingr   r  norm_f"_register_load_state_dict_pre_hook	load_hook	post_init)r1   r   idxr   s      r5   r6   zMambaModel.__init__r  s     ,,v'8'8&:L:LMmmRWX^XpXpRq$r3Z#%F$rs&+#"6#5#56;T;TU//? %ss   &Cc                 f    |D ],  }d|v s|j                  |      ||j                  dd      <    y  y )Nz
embedding.zembeddings.)popreplace)r1   
state_dictprefixargsks        r5   r7  zMambaModel.load_hook~  s;     	Aq EO^^TUEV
199\=AB	r7   c                     | j                   S rG   r1  r   s    r5   get_input_embeddingszMambaModel.get_input_embeddings  s    r7   c                     || _         y rG   rB  r1   new_embeddingss     r5   set_input_embeddingszMambaModel.set_input_embeddings  s	    (r7   N	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictr:   r   r;   c	                 8   ||n| j                   j                  }||n#| j                  s| j                   j                  nd}||n| j                   j                  }|du |duz  rt        d      || j                  |      }| j                  r| j                  r|rd}|r|st        | j                   |j                  d      |j                  |j                        }t        j                  d| j                   j                  |j                        }n|t        d      d}|}
|rdnd}| j                  D ]  } ||
|||	      }
|s||
fz   } | j!                  |
      }
|r||
fz   }|st#        d
 |
||fD              S t%        |
|r||      S d|      S )a  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   r!   r   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyrV   r
  c              3   &   K   | ]	  }||  y wrG   rV   ).0vs     r5   	<genexpr>z%MambaModel.forward.<locals>.<genexpr>  s     fqXYXefs   )r&  r   r   )r   rK  r   rJ  use_return_dict
ValueErrorr1  r4  r   r   r   r   r*   r   r&   r3  r5  r(  r%  )r1   rH  rI  r   rJ  rK  rL  r:   r   kwargsr   all_hidden_statesmixer_blocks                r5   r   zMambaModel.forward  s   * %9$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]-t";<YZZ  OOI6M&&4==YI#)KK!3!3A!6}?S?S[h[n[n  "'a1H1HQ^QeQe!f' !;   L%"6BD;; 		IK')--	M $$58H$H!		I M2 1]4D Df]LBS$Tfff+)2+
 	
8<+
 	
r7   )NNNNNNNN)rL   rM   rN   r6   r7  rC  rG  r   r*   rU   r   r   r(  r%  r   r   r   s   @r5   r.  r.  p  s    
)  .215*.!%,0#'2626M
##d*M
 ''$.M
 !4'	M

 $;M
 #TkM
 D[M
 ((4/M
 ((4/M
 
	M
 M
r7   r.  z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                       e Zd ZddiZ fdZd Zd Z	 ddedee	e
f   ded	ee	e
f   fd
Z	 	 	 	 	 	 ddedz  dej                  dz  dej                  dz  dedz  f fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej&                  dz  dedz  dej                  dz  dedz  dedz  dedz  dej(                  dz  deej(                  z  d	eez  fd       Z xZS )MambaForCausalLMzlm_head.weightzbackbone.embeddings.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFrb   )
ri   r6   r.  r  r   rt   rj   r0  lm_headr8  )r1   r   r   s     r5   r6   zMambaForCausalLM.__init__  sF     "6*yy!3!3V5F5FUSr7   c                 6    | j                   j                         S rG   )r  rC  r   s    r5   rC  z%MambaForCausalLM.get_input_embeddings  s    }}1133r7   c                 8    | j                   j                  |      S rG   )r  rG  rE  s     r5   rG  z%MambaForCausalLM.set_input_embeddings  s    }}11.AAr7   outputsmodel_kwargsnum_new_tokensr;   c                    |j                  dd       |d<   |j                  dd      rd|v r|d   |d   dd  |z   |d<   d|v r?|d   }t        j                  ||j                  |j                  d   df      gd	      |d<   |S )
Nr   rJ  Tr:   r=   r   r   r   r   )getr*   catnew_onesr   )r1   r^  r_  r`  rU  r   s         r5   #_update_model_kwargs_for_generationz4MambaForCausalLM._update_model_kwargs_for_generation  s     (/{{>4'H^$[$/ L0-.:-9:J-KBC-PSa-aL)*|+)*:;N-2YY!8!8.:N:Nq:QST9U!VW]_.L)* r7   Nr   r:   r   is_first_iterationc           
         t        |   |f||||||d|}	|r|t        j                  d| j                  j
                  j                  |j                        |	d<   ||j                  d      }
n|j                  d      }
t        | j                  j
                  |
| j                  | j                        |	d<   |	S |r|d   dkD  rd |	d<   |	S )N)rI  rJ  r   r:   r   rf  r   rN  r:   r!   r   r   )ri   prepare_inputs_for_generationr*   r   r  r   r&   r   r   r   r   )r1   rH  rI  rJ  r   r:   r   rf  rU  model_inputsr   r   s              r5   rh  z.MambaForCausalLM.prepare_inputs_for_generation  s     w<	
'%))1	
 	
 -
 .3\\!T]]=Q=Q=]=]fofvfv-wL)*(!.!3!3A!6!*!2+5$$nT[[PTPZPZ,L(  >!,q0-1L)*r7   rH  rI  labelsrK  rL  rJ  logits_to_keepc           
         ||n| j                   j                  }| j                  |||||||	|      }|d   }t        |
t              rt        |
 d      n|
}| j                  |dd|ddf   j                  | j                  j                  j                              j                         }d}||j                  |j                        }|dddddf   j                         }|dddf   j                         }t               } ||j                  d|j                  d            |j                  d            }|s|f|dd z   }||f|z   S |S t!        |||j"                  |j$                        S )aS  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        N)r   rI  rK  rL  rJ  r:   r   r   .r=   r   )r+  r,  r   r   )r   rS  r  r  rR   slicer[  r@   r}   r   r   r   r   r   r   r   r*  r   r   )r1   rH  r   rI  r   rj  rK  rL  rJ  r:   rk  rU  mamba_outputsr   slice_indicesr,  r+  shift_logitsshift_labelsloss_fctoutputs                        r5   r   zMambaForCausalLM.forward-  s   4 &1%<k$++B]B]%'!5#)) & 	
 &a(8B>SV8W~ot4]kmA}a,?@CCDLLDWDWD]D]^_eegYYv}}-F!#ssA+.99;L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`DYqr!22F)-)9TGf$EvE"&33'55	
 	
r7   )r   )NNNNNF)
NNNNNNNNNr   )rL   rM   rN   _tied_weights_keysr6   rC  rG  r   dictrS   r   rR   re  r   r*   rU   r   rh  r   r'  rT   r(  r*  r   r   r   s   @r5   rY  rY    s    +,HI4B YZ"26sCx.RU	c3h, *.2626*/'
 !4'' ((4/' ((4/' !4K'R  .22626*.*.,0#'!%.2-.?
##d*?
 ((4/?
 ((4/	?

 !4'?
   4'?
 #Tk?
 D[?
 $;?
 t+?
 ell*?
 
$	$?
 ?
r7   rY  )rY  r.  r  r   )6rO   r   dataclassesr   typingr   r*   r   torch.nnr    r   r   activationsr	   configuration_utilsr
   
generationr   integrationsr   modeling_layersr   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   r   r   configuration_mambar   
get_loggerrL   r   (torch._higher_order_ops.associative_scanr   mambapy.pscanr   r   ModulerX   r   r  r  r%  r*  r.  rY  __all__rV   r7   r5   <module>r     s     !    % & ! 3 ) , 9 - 
  - 
		H	%W%I #Ed/ d/N@^ @^F
F299 F(+ 8 *1? *1 *1Z 
:+ : : 
:+ : :& g
% g
 g
T L
+_ L
L
^ Sr7   