
    qi)                        d Z ddlZddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZmZ ddlmZ ddlmZ  ej8                  e      Zdej>                  de fdZ!d Z"d Z#d Z$ G d d      Z% G d dej
                  jL                        Z' G d dejL                        Z( G d dejL                        Z) G d de      Z*e G d  d!e             Z+e ed"#       G d$ d%e                    Z,e ed&#       G d' d(e                    Z-e G d) d*e+             Z. ed+#       G d, d-e+e             Z/g d.Z0y)/zPyTorch MAMBA2 model.    N)	dataclass)nn   )initialization)ACT2FN)GenerationMixin)lazy_load_kernel)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringis_torchdynamo_compilinglogging)resolve_internal_import   )Mamba2Configinput_tensorpad_sizec                     t        | j                        dk(  r
ddddd|ddfnddd|ddf}t        j                  j                  j                  | |dd      S )z
    Padding x tensor with `pad_size` on the seq_len dim (dim=1)

    Assumes that we only have tensors of either size 4 or 3
       r   constant)modevalue)lenshapetorchr   
functionalpad)r   r   	pad_shapes      \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/mamba2/modeling_mamba2.pypad_tensor_by_sizer!   '   sf     47|7I7I3Ja3OAq!Q!Q/VWYZ\]_gijlmUnI88""<ST"UU    c                    t        | |      } t        | j                        dk(  r.| j                  | j                  d   d|| j                  d         S | j                  | j                  d   d|| j                  d   | j                  d         S )z
    Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
    simultaneously splitting it into chunk sequences.

    Assumes that we only have tensors of either size 4 or 3
    r   r      )r!   r   r   reshape)r   r   
chunk_sizes      r    reshape_into_chunksr(   2   s     &lH=L
<!###L$6$6q$92z<K]K]^_K`aa ##q!2z<3E3Ea3H,J\J\]^J_
 	
r"   c                 "   | j                  d      } | d   j                  g | j                         | } t        j                  t        j                  ||| j
                  t        j                        d      }| j                  | d      } t        j                  | d      }t        j                  t        j                  ||| j
                  t        j                        d      }|j                  | t        j                         }|S )zo
    More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
    r$   .Ndevicedtype)diagonalr   dim)
sizeexpandr   trilonesr,   boolmasked_fillcumsuminf)r   r'   masktensor_segsums       r    segment_sumr<   F   s     ""2&J 2<	*11S<3D3D3FS
SL::ejjZ@S@S[`[e[efqstD++TE15LLL26M ::ejjZ@S@S[`[e[efqrsD!--teeiiZ@Mr"   c                     |N|j                   d   dkD  r<|j                   d   dkD  r*| j                  }| |dddddf   z  j                  |      } | S )zm
    Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
    Nr   r   )r   r-   to)hidden_statesattention_maskr-   s      r    apply_mask_to_padding_statesrA   Z   sa    
 !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr"   c            
           e Zd ZdZej
                  dfdededej                  de	dz  fdZ
	 dded	ej                  d
edej                  fdZdedej                  fdZd Zy)Mamba2Cachea  
    Arguments:
        config: Mamba2Config
        batch_size: int
        dtype: torch.dtype
        device: torch.device

    Attributes:
        dtype: (`torch.dtype`):
            The default `dtype` used to initializing the cache.
        conv_kernel_size: (`int`):
            Model's convolution kernel size taken from config.
        n_groups: (`int`):
            Model's number of groups taken from the config - similar to tensor parallel in Transformer.
        state_size: (`int`):
            Model's SSM state size taken from config.
        num_heads: (`int`):
            The number of heads used in the linear attention / SSM.
        head_dim: (`int`):
            The respective dimension of the heads used in the linear attention / SSM.
        intermediate_size: (`int`):
            Model's intermediate_size based on (expand * hidden_dim) from config.
        conv_states: (`torch.Tensor`):
            A tensor of shape `[num_layers, batch_size, conv_kernel_size, intermediate_size + 2 * n_groups * state_size]` that holds convolutional states.
        ssm_states: (`torch.Tensor`):
            A tensor of shape `[num_layers, batch_size, num_heads, head_dim, state_size]` that holds ssm states.
    Nconfig
batch_sizer-   r,   c           	      R   || _         |j                  | _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        t        |j                  |j                  z        | _
        t        j                  |j                  || j                  d| j                  z  | j                  z  z   | j                  ||      | _        t        j                  |j                  || j
                  | j                  | j                  ||      | _        y )Nr%   r+   )r-   conv_kernelconv_kernel_sizen_groups
state_size	num_headshead_dimintr3   hidden_sizeintermediate_sizer   zerosnum_hidden_layersconv_states
ssm_states)selfrD   rE   r-   r,   s        r    __init__zMamba2Cache.__init__   s     
 & 2 2 ++))!$V]]V5G5G%G!H ;;$$""Q%6%HH!!
  ++$$NNMMOO
r"   	layer_idxnew_conv_state
cache_initreturnc                 p   |r3|j                  | j                  j                        | j                  |<   ns| j                  |   j                  dd      | j                  |<   |d d dd d f   j                  | j                  j                        | j                  |   d d d d df<   | j                  |   S )Nr$   )shiftsdimsr   )r>   rR   r,   roll)rT   rV   rW   rX   s       r    update_conv_statezMamba2Cache.update_conv_state   s     *8*;*;D<L<L<S<S*TDY'*.*:*:9*E*J*JRT[]*J*^DY'4B1a74K4N4NtO_O_OfOf4gDY'1b1	**r"   new_ssm_statec                     |j                  | j                  j                        | j                  |<   | j                  |   S N)r>   rS   r,   )rT   rV   r_   s      r    update_ssm_statezMamba2Cache.update_ssm_state   s4    %2%5%5doo6L6L%M	"y))r"   c                 l    | j                   j                          | j                  j                          y ra   )rR   zero_rS   rT   s    r    resetzMamba2Cache.reset   s$     r"   )F)__name__
__module____qualname____doc__r   float16r   rM   r-   strrU   Tensorr6   r^   rb   rf    r"   r    rC   rC   f   s    : KP--nr
"
03
<AKK
adgkak
< PU++.3ll+HL+	+*# *ell * r"   rC   c                   (     e Zd Zd fd	ZddZ xZS )MambaRMSNormGatedc                     t         |           t        j                  t	        j
                  |            | _        || _        y ra   superrU   r   	Parameterr   r5   weightvariance_epsilonrT   rN   eps	__class__s      r    rU   zMambaRMSNormGated.__init__   s/    ll5::k#:; #r"   c                    |j                   }|j                  t        j                        }|?|t        j
                  j                  |j                  t        j                              z  }|j                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S Nr%   r$   T)keepdim)r-   r>   r   float32r   r   silupowmeanrsqrtrv   ru   )rT   r?   gateinput_dtypevariances        r    forwardzMambaRMSNormGated.forward   s    #))%((7)BMM,>,>twwu}}?U,VVM $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r"   gư>ra   rg   rh   ri   rU   r   __classcell__ry   s   @r    rp   rp      s    $
	;r"   rp   c            
           e Zd ZdZddededef fdZ ej                         d        Z
	 	 	 ddej                  d	edz  d
ej                  dz  dej                  dz  fdZ	 	 	 ddej                  d	edz  d
ej                  dz  dej                  dz  fdZ	 	 	 dd	edz  d
ej                  dz  dej                  dz  fdZ xZS )Mamba2Mixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    rD   rV   initialize_mixer_weightsc           	      n   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        |j                  | j                  z        | _
        t        |j                        | _        || _        |j                  | _        |j                  | _        t         |j                     | _        |j$                  | _        |j&                  | _        |j(                  | _        |j*                  | _        |j,                  | _        |j.                  | _        |j0                  | _        |j2                  | _        |j4                  | _        | j                  d| j(                  z  | j
                  z  z   | _        t9        j:                  | j6                  | j6                  |j                  |j                  | j6                  |j                  dz
        | _        | j                  | j6                  z   | j                  z   }t9        j>                  | j                  ||j@                        | _!        t9        jD                  tG        jH                  | j                              | _%        t9        jD                  tG        jH                  | j                              | _&        tO        | j                  | j$                        | _(        t9        jD                  tG        jH                  | j                              | _)        |r3| jJ                  jT                  jV                  dk7  r| jY                          t9        j>                  | j                  | j                  |j@                        | _-        |j@                  | _         t]        d      }t_        |dd       a0t_        |d	d       a1t]        d
      }te        |d      a3te        |d      a4te        |d      a5tm        tf        th        tj        tb        t`        f      a7tn        stp        js                  d       y y )Nr%   r   )in_channelsout_channelsbiaskernel_sizegroupspaddingr   rx   metazcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathz1ops.triton.ssd_combined.mamba_chunk_scan_combinedz8ops.triton.ssd_combined.mamba_split_conv1d_scan_combineda  The fast path is not available because one of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d):rs   rU   rK   rN   rJ   ssm_state_sizerG   rH   rM   r3   rO   time_step_rankrV   use_conv_bias
hidden_act
activationr   actlayer_norm_epsilonrms_normrI   rL   r'   time_step_limittime_step_mintime_step_maxtime_step_floorconv_dimr   Conv1dconv1dLinearuse_biasin_projrt   r   emptydt_biasA_logrp   normDr,   typeinit_mamba2_weightsout_projr	   getattrr   r   r   selective_state_updatemamba_chunk_scan_combined mamba_split_conv1d_scan_combinedallis_fast_path_availableloggerwarning_once)rT   rD   rV   r   projection_sizecausal_conv1d	mamba_ssmry   s          r    rU   zMamba2Mixer.__init__   sD   ))!--$// & 2 2!$V]]T5E5E%E!F!&"7"78"#11 ++&++,"(";"; ++%55#11#11%55..T]]1BTEXEX1XXii%%**==&&*
 004==@4>>Qyy
 ||EKK$?@ \\%++dnn"=>
%d&<&<$BYBYZ	ekk$..9:#(;(;(@(@F(J$$&		$"8"8$:J:JQWQ`Q`a )9&}6LdS"=2DdK %[1	!8$^"
 %<$W%
! ,C$^,
(
 "%&)0 $"
 &> &r"   c                 t   t        j                  d| j                  dz   | j                  j                  t         j
                        }t        j                  | j                  t        j                  |             t        j                  | j                         t        j                  t        j                  | j                  | j                  j                  t         j
                        t        j                  | j                        t        j                  | j                         z
  z  t        j                  | j                         z         j#                  | j$                        }|t        j                  t        j&                  |              z   }t        j                  | j                  |       y )Nr   r+   )min)r   arangerK   r   r,   r}   initcopy_logones_r   exprandr   mathr   r   clampr   expm1)rT   Adtinv_dts       r    r   zMamba2Mixer.init_mamba2_weights*  s   LLDNNQ.tzz7H7HPUP]P]^

4::uyy|,

466YYJJt~~dll.A.AWxx**+dhht7I7I.JJLhht))*+
 %D((%
)	 	 eiibS!1 122

4<<(r"   Nr?   cache_paramscache_positionr@   c                    t        ||      }| j                  |      }|j                  \  }}}| j                  | j                  z  }	|j                  d   d| j
                  z  z
  d| j                  z  | j                  z  z
  | j                  z
  dz  }
|||d   dkD  r|j                  d      j                  |
|
| j
                  | j                  | j                  gd      \  }}}}}t        ||j                  | j                     | j                  j                  j                  d      | j                  j                  | j                         }t#        j                  || j
                  |	|	gd      \  }}}t#        j$                  | j&                  j)                                }|d d d df   d d d d d f   j+                  d| j,                  | j                        j/                  t"        j0                        }|d d d d d f   j+                  dd| j,                        }| j2                  d d d df   j+                  d| j,                        }| j4                  d d d df   j+                  d| j,                        }|j7                  || j                  |j                  d   | j                  z        }|j7                  || j                  |j                  d   | j                  z        }|j7                  || j                  | j,                        }t9        |j:                  | j                     ||||||d |d	
      }|j7                  || j                  | j,                  z        }| j=                  ||      }| j?                  |      d d d df   }|S t#        j$                  | j&                  j)                                }| j@                  d
t)        d      fk(  ri nd| j@                  i}| jB                  r|tE        || j                  j                  j                  d      | j                  j                  | j2                  |f| j4                  | jF                  d | j                   | j<                  j                  | j<                  jH                  | j>                  j                  | j>                  j                  | j,                  | j                  ddd|}|S |j                  |
|
| j
                  | j                  | j                  gd      \  }}}}}|l|jK                  dd      }tL        jN                  jQ                  ||jR                  |j                  d   z
  df      }|jU                  | j                  |d       | j                   dvrH| jW                  | j                  |jK                  dd            dd |f   jK                  dd            }nptY        |jK                  dd      | j                  j                  j                  d      | j                  j                  | j                         jK                  dd      }t        ||      }t#        j                  || j
                  |	|	gd      \  }}}t[        |j7                  ||d| j,                        |||j7                  ||| j                  d      |j7                  ||| j                  d      f| jF                  | j4                  d d d| j2                  dd|\  }}|||j]                  | j                  |       |j7                  ||d      }| j=                  ||      }| j?                  |      }|S )Nr$   r%   r   r   r0   .r-   T)zr   dt_softplusg        r9   dt_limitF)r   r'   seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesrV   rW   rX   )r~   swish)xru   r   r   )r'   r   r   r   r   r   r   rV   r_   )/rA   r   r   rI   r   rO   rK   squeezesplitr   r   rR   rV   r   ru   r   r   r   r   r   floatr3   rL   r>   r}   r   r   viewr   rS   r   r   r   trainingr   r'   rv   	transposer   r   r   rH   r^   r   r   r   rb   )rT   r?   r   r   r@   projected_statesrE   seq_len_groups_time_state_sized_mlpr   hidden_states_B_Cr   BCr   r   r   hidden_states_reshapedoutdt_limit_kwargshidden_states_B_C_transposedrR   scan_output	ssm_states                             r    cuda_kernels_forwardz Mamba2Mixer.cuda_kernels_forward:  s    5]NS<<6 "/!4!4
GQ!%1D1D!D""2&$((()$--$"5"556 nn  #(B~VWGX[\G\0@0H0H0K0Q0Qt55t}}dnnU[] 1R 1-Aq$)2
 !5!((8""**1-  ! #(++!'')?AWX#M1a 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az4==!''!*2MNAz4==!''!*2MNA%2%7%7
DNNTXTaTa%b"2''7& M *..z4>>DMM;YZM IImT:M --.q$|<Cz 
s 4::++-..A$($8$8S%,<O$ObV`bfbvbvUwO }}!56$KK&&..q1KK$$LL ff# ##'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(-#$ &%h 
} 5E4J4JE4#9#94==$..Y_a 5K 511d-r  +3D3N3NqRS3T0"$--"3"34%669U9[9[\^9__abc#K !22"&..Y] 3  ??*;;(,$5$?$?1$EFsHWH}U__`acde)% )9+55a;#{{1199!<![[--#'??	)
  i1o & %AARTb$c!&+kk%++-CE[\'#q! *C!&&z7BNFF:wrBFF:wrB*  $ff (, LL $* &*&Y" (\-E 11DNNZc1d)..z7BG"iiT: mmK0
r"   c                    |j                   \  }}}|j                  }t        ||      }| j                  |      }	|	j                   d   d| j                  z  z
  d| j
                  z  | j                  z  z
  | j                  z
  dz  }
|	j                  |
|
| j                  | j                  | j                  gd      \  }}}}}|||d   dkD  r|j                  | j                  |d       |j                  | j                     j                  | j                  j                  j                         }t#        j$                  || j                  j                  j'                  d      z  d      }| j(                  r|| j                  j*                  z   }| j-                  |      }n|l|j/                  dd      }t0        j2                  j5                  ||j6                  |j                   d   z
  df      }|j                  | j                  |d	       | j-                  | j                  |j/                  dd            d
d |f   j/                  dd            }t        ||      }t#        j                  || j                  | j
                  | j                  z  | j
                  | j                  z  gd      \  }}}t#        j8                  | j:                  j=                                }|||d   dkD  r|j>                  j                   }|d d dd d f   d d d d
f   }|j/                  dd      jA                  ||j                   d   | jB                        }| jD                  d   jA                  | jD                  j                   d   | jB                        }t"        j0                  j2                  jG                  ||j                  |j                        z         }t#        jH                  || jJ                  d   | jJ                  d         }|d   jA                  | j                  | jB                  | j                        j                  t"        jL                        }t#        j8                  |d   |z        j                  |      }|jO                  || j
                  d      d
d d d f   }|jA                  || j
                  | j                  | j
                  z  |j                   d         jQ                         }|jO                  |d|j                   d         }|d   |d
d d d f   z  }|jO                  |d| jB                        }||d   z  j                  |      }|jS                  | j                  |j>                  | j                     |z  |z          |jO                  || j
                  d      d
d d d f   }|jA                  || j
                  | j                  | j
                  z  |j                   d         jQ                         }|jO                  |d|j                   d         }|j>                  | j                     j                  |j                   |j                        }|jU                  || j                  z  | jB                  | j                        }|jU                  || j                  z  | j                  d      }t#        jV                  ||      }|jU                  || j                  | jB                        }| jX                  d   jA                  | jX                  j                   d   | jB                        }|||z  z   j                  |j                        }|jO                  |d      d d d d
f   }nt0        j2                  jG                  || jD                  z         }t#        jH                  || jJ                  d   | jJ                  d         }|jO                  ||d| jB                        j=                         }|jO                  ||d| j                        j=                         }|jO                  ||d| j                        j=                         }|j[                  | j                  | j
                  z  d| j                        }|j[                  | j                  | j
                  z  d| j                        }| j\                  || j\                  z  z
  | j\                  z  }| jX                  d   t_        ||      z  }||d   z  }|j                  |j                        |z  }||||fD cg c]  }ta        ||| j\                         c}\  }}}}|jc                  dddd      }t#        jd                  |d      } t#        j8                  tg        |            }!|d d d d d d d d d d d f   |d d d d d d d d d d d f   z  }"|"j%                  d      }#|#d   |!jc                  ddddd      d   z  }$|$j%                  d      }%|%d   |d d d d d f   z  j%                  d      }&t#        j8                  | d d d d d d dd f   | z
        }'||'jc                  dddd      d   z  }(|(d
d d d f   |d   z  j%                  d      })|F|D|d   dkD  r<|j>                  | j                     d d d d
f   j                  |)j                         }*nt#        jh                  |)d d d df         }*t#        jj                  |*|)gd      })t#        j8                  tg        t0        j2                  j5                  | d d d d d d df   d                  }+|+j/                  dd      }+|+d   |)d d d d d d
f   z  j%                  d      },|,d d d df   |,d d df   }-})t#        j8                  |       }.|d
d d d f   |)d d d d d d
f   z  }/|.jc                  dddd      }0|/j%                  d      |0d   z  }1|&|1z   }|jO                  |d| j                  | jB                        }||z   }|dkD  r|d d d |d d d d f   }|jO                  ||d      }|-||jS                  | j                  |-       | jm                  ||      }2| jo                  |2j                  |            }3|3S c c}w )Nr$   r%   r0   r   Fr   r,   r   T.r*   ).NNr   r   r+   )r1   output_sizer   r   r/   )r   r   )8r   r-   rA   r   rO   rI   r   rK   r   r   r^   rV   rR   r>   r   ru   r,   r   sumr   r   r   r   r   r   r   r   rH   r   r   r   rS   r3   rL   r   softplusr   r   r}   r&   
contiguousrb   r   bmmr   repeat_interleaver'   r!   r(   permuter8   r<   
zeros_likecatr   r   )4rT   r?   r   r   r@   rE   r   r   r-   r   r   r   r   r   rR   r   r   r   r   cache_devicer   dAdBdBxrS   ssm_states_reshaped
C_reshapedyr   r   
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decaystatesprevious_statesdecay_chunk
new_statesr   state_decay_outC_times_statesstate_decay_out_permutedY_offr   contextualized_statess4                                                       r    torch_forwardzMamba2Mixer.torch_forward  s    "/!4!4
GQ## 5]NS<<6!''+a$2H2H.HH1t}}K\_c_r_rKrrsw  tB  tB  B  GH  H,<,B,Bt55t~~V\^ -C -
)1d%r
 #(B~VWGX[\G\**T^^Terw*x '224>>BEET[[M_M_MfMfEgK %		dkk0088;;! !!$58H8H$H! $): ; '/@/J/J1a/P, mm//0<3P3PSoSuSuvxSy3y{|2} ..Xcpt.u $5F5P5PQRTU5V)WX[]e^e]eXe)f)p)pqrtu)v w89JN[#kk##T]]T5H5H%H$--Z^ZmZmJmn
q! YYtzz'')**#(B~VWGX[\G\'2299L Aq!GQc\*Ba#**:rxx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!5!5a!8$:N:Nq:QRB/"))$..$--I\I\]``glgtgt`uA))ByMA-.22,2GB
 		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PMi0044L4IC ))..*55dnnEJSP *  		*dmmR8dAFAT]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CC188[\[b[bCcJ",//*t~~2Mt}}^b^q^q"r
T^^ ;T=P=PRSTJ		-z:Az4>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''T\\(9:BR!5!5a!8$:N:Nq:QRB)11*gr4==Y__aM		*gr43F3FGMMOA		*gr43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'DOO*CCtVH	*-?x-XXJ *ByM9M](()B.A cpqrtuwxay%z\]&9!Xt&W%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCCJF !99XaArsl%;h%FGL,..q"b!<YGGGc4l+mI.FFKKPQKRF 'N,F>Z[K\_`K`"."9"9$.."I!TSV,"W"Z"Zbhbobo"Z"p"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK%//15K%o61dC9PPUUZ[U\J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*gr2A $)A--V_-`ii4(
 !%knnU.C D$$G &{s   sc                     t         rJd| j                  j                  j                  j                  v rt               s| j                  ||||      S | j                  ||||      S )Ncuda)r   r   ru   r,   r   r   r   r  )rT   r?   r   r   r@   s        r    r   zMamba2Mixer.forward  sY     "f0C0C0J0J0O0O&OXpXr,,]L.Zhii!!-~~^^r"   )TNNN)rg   rh   ri   rj   r   rM   r6   rU   r   no_gradr   rm   rC   
LongTensorr   r  r   r   r   s   @r    r   r      sA   [| [ [W[ [z U]]_) )$ ,026.2`||` "D(` ((4/	`
 t+`L *./3,0B%||B% "D(B% ''$.	B%
 t+B%P ,026.2	_ "D(	_ ((4/		_
 t+	_r"   r   c                   &     e Zd Zd fd	Zd Z xZS )Mamba2RMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zM
        Mamba2RMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        Nrr   rw   s      r    rU   zMamba2RMSNorm.__init__  s1     	ll5::k#:; #r"   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S r{   )	r-   r>   r   r}   r   r   r   rv   ru   )rT   r?   r   r   s       r    r   zMamba2RMSNorm.forward  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r"   r   r   r   s   @r    r$  r$    s    $;r"   r$  c                   t     e Zd Z fdZ	 	 	 ddedz  dej                  dz  dej                  dz  fdZ xZ	S )Mamba2Blockc                     t         |           || _        || _        |j                  | _        t        |j                  |j                        | _        t        ||d      | _
        y )Nr   F)rV   r   )rs   rU   rD   rV   residual_in_fp32r$  rN   r   r   r   mixer)rT   rD   rV   ry   s      r    rU   zMamba2Block.__init__  sU    " & 7 7!&"4"4&:S:ST	 9W\]
r"   Nr   r   r@   c                    |}| j                  |j                  | j                   j                  j                              }| j                  r|j                  t
        j                        }| j                  ||||      }||z   }|S )Nr   r   r   r@   )r   r>   ru   r-   r*  r   r}   r+  )rT   r?   r   r   r@   residuals         r    r   zMamba2Block.forward  s     !		-"2"29I9I9O9O"2"PQ  {{5==1H

^dr # 
 !=0r"   r   )
rg   rh   ri   rU   rC   r   r"  rm   r   r   r   s   @r    r(  r(    sP    ^ ,026.2 "D( ((4/	
 t+r"   r(  c                   X    e Zd ZU eed<   dZdgZdZdZ e	j                         d        Zy)Mamba2PreTrainedModelrD   backboner(  Tc                 L   | j                   j                  }t        |t              r#|j	                          t        j                  |j                  j                  t        j                  d             |j                  j                  )t        j                  |j                  j                         t        j                  |j                  j                  t        j                  d             | j                   j                  rB|j                  j                  }|t        j                  | j                   j                        z  }t        |t         j"                        rNt        j$                  |j                  |       |j                   t        j                  |j                         yyt        |t&        t(        f      r t        j*                  |j                         yt        |t         j,                        r"t        j$                  |j                  |       yy)zInitialize the weights.   )aN)std)rD   initializer_range
isinstancer   r   r   kaiming_uniform_r   ru   r   sqrtr   zeros_r   rescale_prenorm_residualrQ   r   r   normal_r$  rp   r   	Embedding)rT   moduler5  ps       r    _init_weightsz#Mamba2PreTrainedModel._init_weights  sT    kk++fk* &&(!!&--"6"6$))A,G}}!!-FMM../!!&//"8"8DIIaLI{{33 OO**TYYt{{<<==fbii(LLC0{{&FKK( '0A BCJJv}}%-LLC0 .r"   N)rg   rh   ri   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr   r!  r@  rn   r"   r    r0  r0    s;    "&&*#LU]]_"1 "1r"   r0  z-
    Class for the MAMBA2 model outputs.
    )custom_introc                   |    e Zd ZU dZdZej                  dz  ed<   dZe	dz  ed<   dZ
eej                     dz  ed<   y)Mamba2Outputa:  
    cache_params (`Mamba2Cache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlast_hidden_stater   r?   )rg   rh   ri   rj   rI  r   FloatTensorrA  r   rC   r?   tuplern   r"   r    rH  rH  	  sH     37u((4/6'+L+$+59M5**+d29r"   rH  zK
    Base class for causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   y)Mamba2CausalLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`Mamba2Cache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlosslogitsr   r?   )rg   rh   ri   rj   rN  r   rJ  rA  rO  r   rC   r?   rK  rn   r"   r    rM  rM    s\    
 &*D%

d
")'+FE$+'+L+$+59M5**+d29r"   rM  c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  de
dz  d	edz  d
edz  dedz  dej                  dz  dej                  dz  deez  fd       Z xZS )Mamba2Modelc           	         t         |   |       t        j                  |j                  |j
                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _        t        |j
                  |j                        | _        | j!                  | j"                         | j%                          y c c}w )N)rV   Fr   )rs   rU   r   r=  
vocab_sizerN   
embeddings
ModuleListrangerQ   r(  layersgradient_checkpointingr$  r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_init)rT   rD   idxry   s      r    rU   zMamba2Model.__init__:  s     ,,v'8'8&:L:LMmmSXY_YqYqSr$sC[3%G$st&+##F$6$6F<U<UV//? %ts   &Cc                 f    |D ],  }d|v s|j                  |      ||j                  dd      <    y  y )Nz
embedding.zembeddings.)popreplace)rT   
state_dictprefixargsks        r    r[  zMamba2Model.load_hookF  s;     	Aq EO^^TUEV
199\=AB	r"   c                     | j                   S ra   rT  re   s    r    get_input_embeddingsz Mamba2Model.get_input_embeddingsL  s    r"   c                     || _         y ra   rf  rT   new_embeddingss     r    set_input_embeddingsz Mamba2Model.set_input_embeddingsO  s	    (r"   N	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictr   r@   rY   c	                 8   ||n| j                   j                  }||n#| j                  s| j                   j                  nd}||n| j                   j                  }|du |duz  rt        d      || j                  |      }| j                  r| j                  r|rd}|r|st        | j                   |j                  d      |j                  |j                        }t        j                  d| j                   j                  |j                        }n|t        d      d}|}
|rdnd}| j                  D ]  } ||
|||	      }
|s||
fz   } | j!                  |
      }
|r||
fz   }|st#        d
 |
||fD              S t%        |
|r||      S d|      S )a  
        cache_params (`Mamba2Cache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
            If `cache_params` is passed, `cache_position` should also be passed.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   r+   r   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyrn   r-  c              3   &   K   | ]	  }||  y wra   rn   ).0vs     r    	<genexpr>z&Mamba2Model.forward.<locals>.<genexpr>  s     fqXYXefs   )rI  r   r?   )rD   ro  r   rn  use_return_dict
ValueErrorrT  rX  rC   r2   r,   r-   r   r   rG   rW  rY  rK  rH  )rT   rl  rm  r   rn  ro  rp  r   r@   kwargsr?   all_hidden_statesmixer_blocks                r    r   zMamba2Model.forwardR  s   0 %9$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]-t";<YZZ  OOI6M&&4==YI#*KK!3!3A!6}?S?S[h[n[n  "'a1H1HQ^QeQe!f' !;   L%"6BD;; 		IK')--	M $$58H$H!		I M2 1]4D Df]LBS$Tfff+)2+
 	
8<+
 	
r"   )NNNNNNNN)rg   rh   ri   rU   r[  rg  rk  r   r   r"  rC   r6   rm   rK  rH  r   r   r   s   @r    rQ  rQ  8  s    
)  .215+/!%,0#'26.2P
##d*P
 ''$.P
 "D(	P

 $;P
 #TkP
 D[P
 ((4/P
 t+P
 
	P
 P
r"   rQ  z
    The MAMBA2 Model transformer with a language modeling head on top (linear layer with weights not tied to the input
    embeddings).
    c                       e Zd ZddiZ fdZd Zd Z	 	 	 	 	 	 ddedz  dej                  dz  d	ej                  dz  d
edz  f fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dedz  dej                  dz  dedz  dedz  dedz  dej                  dz  d	ej                  dz  deej                  z  deez  fd       Z xZS )Mamba2ForCausalLMzlm_head.weightzbackbone.embeddings.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFr   )
rs   rU   rQ  r1  r   r   rN   rS  lm_headr\  )rT   rD   ry   s     r    rU   zMamba2ForCausalLM.__init__  sF     #F+yy!3!3V5F5FUSr"   c                 6    | j                   j                         S ra   )r1  rg  re   s    r    rg  z&Mamba2ForCausalLM.get_input_embeddings  s    }}1133r"   c                 8    | j                   j                  |      S ra   )r1  rk  ri  s     r    rk  z&Mamba2ForCausalLM.set_input_embeddings  s    }}11.AAr"   Nr   r   r@   is_first_iterationc           
         t        |   |f||||||d|}	|r|t        j                  d| j                  j
                  j                  |j                        |	d<   ||j                  d      }
n|j                  d      }
t        | j                  j
                  |
| j                  | j                        |	d<   |	S |r|	d   d   dkD  rd |	d<   |	S )N)rm  rn  r   r   r@   r  r   r   r   r+   r   r@   )rs   prepare_inputs_for_generationr   r   r1  rD   rG   r,   r2   rC   r-   )rT   rl  rm  rn  r   r   r@   r  rx  model_inputsmax_batch_sizery   s              r    r  z/Mamba2ForCausalLM.prepare_inputs_for_generation  s     w<	
'%))1	
 	
 -
 .3\\!T]]=Q=Q=]=]fofvfv-wL)*(!.!3!3A!6!*!2+6$$nT[[PTPZPZ,L(  <(89!<q@-1L)*r"   rl  rm  labelsro  rp  rn  logits_to_keeprY   c           
      (   ||n| j                   j                  }| j                  ||||||||	      }|d   }t        |
t              rt        |
 d      n|
}| j                  |dd|ddf   j                  | j                  j                  j                              j                         }d}|* | j                  d||| j                   j                  d|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )ao  
        cache_params (`Mamba2Cache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        cache_position (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            The position of the current input in the cache. This is used to ensure that the cache is correctly updated.
            If `cache_params` is passed, `cache_position` should also be passed.
        N)r   rm  ro  rp  rn  r   r@   r   )rO  r  rS  r   )rN  rO  r   r?   rn   )rD   rv  r1  r7  rM   slicer~  r>   ru   r-   r   loss_functionrS  rM  r   r?   )rT   rl  rm  r   r  ro  rp  rn  r   r@   r  rx  mamba2_outputsr?   slice_indicesrO  rN  outputs                     r    r   zMamba2ForCausalLM.forward  s0   : &1%<k$++B]B]%'!5#)) ' 	
 'q)8B>SV8W~ot4]kmA}a,?@CCDLLDWDWD]D]^_eeg%4%%pVFt{{OeOepiopDY!33F)-)9TGf$EvE#'44(66	
 	
r"   )NNNNNF)
NNNNNNNNNr   )rg   rh   ri   _tied_weights_keysrU   rg  rk  rC   r   r"  rm   r6   r  r   rJ  rM   rK  rM  r   r   r   s   @r    r|  r|    ss    +,HI4B +/26.2*/(
 "D(( ((4/( t+( !4K(T  .226+/*.,0#'!%.2.2-.;
##d*;
 ((4/;
 "D(	;

   4';
 #Tk;
 D[;
 $;;
 t+;
 t+;
 ell*;
 
%	%;
 ;
r"   r|  )r|  rQ  r0  )1rj   r   dataclassesr   r   r    r   r   activationsr   
generationr   integrationsr	   modeling_layersr
   modeling_utilsr   utilsr   r   r   r   utils.import_utilsr   configuration_mamba2r   
get_loggerrg   r   rm   rM   r!   r(   r<   rA   rC   Modulerp   r   r$  r(  r0  rH  rM  rQ  r|  __all__rn   r"   r    <module>r     s     !   & ! ) , 9 - S S 9 . 
		H	%VU\\ VS V
((	J  J Z; ;$f_")) f_R;BII ;", 8 *1O *1 *1Z :; : : :; : :& j
' j
 j
Z v
- v
v
r Hr"   