
    qi                        d dl mZ d dlmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z- ddl.m/Z/  ej`                  e1      Z2 G d de'      Z3 G d d      Z4 G d de&      Z5 G d dejl                        Z7 G d  d!e*      Z8 G d" d#e,      Z9 G d$ d%ejl                        Z: G d& d'e      Z; G d( d)e      Z<e;e<d*Z= G d+ d,e      Z>e G d- d.e>             Z? G d/ d0e-      Z@ G d1 d2ee>      ZAg d3ZBy)4    )Callable)AnyN)nn   )initialization)ACT2FN)lazy_load_kernel)create_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)merge_with_config_defaults)resolve_internal_import)OutputRecordercapture_outputs   )LlamaAttentionLlamaRMSNormeager_attention_forward)
MistralMLP)MixtralExpertsMixtralForCausalLM   )JambaConfigc                       e Zd Zy)JambaRMSNormN__name__
__module____qualname__     Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/jamba/modular_jamba.pyr#   r#   .       r)   r#   c                   4   e Zd ZdZdZej                  dfdZd Zd Z		 ddej                  dej                  d	ed
eeef   dz  deej                  ej                  f   f
dZdej"                  fdZdej                  d	edeeef   fdZdd	edz  defdZy) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNc           
         || _         |j                  | _        d| _        |j                  |j                  z  }|j
                  }|j                  }g | _        g | _        g | _	        t        |j                        D ]  }| j                  |   dk(  r]| xj                  t        j                  |||||      gz  c_        | xj                  t        j                  |||||      gz  c_        r| xj                  t        j                  g g|z  |      gz  c_        | xj                  t        j                  g g|z  |      gz  c_        | j                  j                  |        t        |j                        D 	cg c]  }	t        j                  g g|z  |       c}	| _        t        |j                        D 	cg c]  }	t        j                  g g|z  |       c}	| _        y c c}	w c c}	w )NFmambadevicedtyper1   )r2   layers_block_typehas_previous_statemamba_expandhidden_sizemamba_d_statemamba_d_convconv_states
ssm_statestransformer_layersrangenum_hidden_layerstorchzerostensorappend	key_cachevalue_cache)
selfconfig
batch_sizer2   r1   intermediate_sizessm_state_sizeconv_kernel_sizei_s
             r*   __init__z)HybridMambaAttentionDynamicCache.__init__B   s   
!'!9!9"'"//&2D2DD--!.."$v//0 	2A%%a(G3  KK
,=?OX^fkl%   KK
,=~V\dij$    U\\2$2CF%S$TT ELL"
1B6$R#SS''..q1	2 SXX^XpXpRqrQ%,,tj'8HrTYZ`ZrZrTstqELL"
):6Jt sts   ?"G+ "G0c                 ,    t        | j                        S N)lenrC   )rE   s    r*   __len__z(HybridMambaAttentionDynamicCache.__len__\   s    4>>""r)   c                 >    | j                   |   | j                  |   fS rO   )rC   rD   rE   	layer_idxs     r*   __getitem__z,HybridMambaAttentionDynamicCache.__getitem___   s!    ~~i($*:*:9*EEEr)   
key_statesvalue_statesrT   cache_kwargsreturnc                    | j                   |   j                  d   dk(  r|| j                   |<   || j                  |<   nft        j                  | j                   |   |gd      | j                   |<   t        j                  | j                  |   |gd      | j                  |<   | j                   |   | j                  |   fS )Nr   r   dim)rC   shaperD   r?   cat)rE   rV   rW   rT   rX   s        r*   updatez'HybridMambaAttentionDynamicCache.updateb   s     >>)$**2.!3(2DNN9%*6DY'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEEr)   beam_idxc                    | j                         dkD  rvt        t        | j                              D ]S  }| j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   V yy)zDReorders the cache for beam search, given the selected beam indices.r   N)
get_seq_lengthr=   rP   rC   r1   index_selecttorD   r:   r;   )rE   ra   rT   r1   s       r*   reorder_cachez.HybridMambaAttentionDynamicCache.reorder_caches   s[    1$"3t~~#67 	m		299,0NN9,E,R,RSTV^VaVabhVi,jy))))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +)))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +3::-1__Y-G-T-TUVX`XcXcdjXk-l	*	m %r)   cache_positionc                 T    d}|j                   d   }| j                  |      |z   }||fS )zDReturn the length and offset of the cache, used to generate the maskr   )r^   rc   )rE   rg   rT   	kv_offsetquery_length	kv_lengths         r*   get_mask_sizesz/HybridMambaAttentionDynamicCache.get_mask_sizes   s7    	%++A.''	2\A	)##r)   c                     || j                   vr| j                   d   n|}t        | j                        |k  s| j                  |   j                  d   dk(  ry| j                  |   j                  d   S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   r[   )r<   rP   rC   r^   rS   s     r*   rc   z/HybridMambaAttentionDynamicCache.get_seq_length   sn     3<4CZCZ2ZD++A.`i	t~~)+t~~i/H/N/Nr/RVW/W~~i(..r22r)   rO   )r   )r%   r&   r'   __doc__is_compileabler?   float16rM   rQ   rU   Tensorintdictstrr   tupler`   
LongTensorrf   rl   rc   r(   r)   r*   r-   r-   2   s     N16t u4#F /3FLLF llF 	F
 38nt+F 
u||U\\)	*F"me&6&6 m$U\\ $c $eTWY\T\o $3d
 33 3r)   r-   c                        e Zd Zdedef fdZ	 	 	 ddej                  dej                  dz  dedz  dej                  dz  d	e
e   d
eej                  ej                  dz  f   fdZ xZS )JambaAttentionrF   rT   c                    t         |   ||       t        j                  |j                  |j
                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _	        t        j                  |j                  |j                  | j                  z  d      | _
        t        j                  |j
                  | j                  z  |j                  d      | _        y NFbias)superrM   r   Linearr7   num_attention_headshead_dimq_projnum_key_value_headsk_projv_projo_proj)rE   rF   rT   	__class__s      r*   rM   zJambaAttention.__init__   s    +ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii : :T]] JFL^L^ejkr)   Nhidden_statesattention_maskpast_key_valuesrg   kwargsrY   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
|#|j                  |	|
| j                  d|i      \  }	}
t        j                  | j                  j                  t              } || ||	|
|f| j                  sdn| j                  | j                   d|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )Nr[   r    r   rg           )dropoutscaling)r^   r   r   view	transposer   r   r`   rT   r   get_interfacerF   _attn_implementationr   trainingattention_dropoutr   reshape
contiguousr   )rE   r   r   r   rg   r   input_shapehidden_shapequery_statesrV   rW   attention_interfaceattn_outputattn_weightss                 r*   forwardzJambaAttention.forward   s|    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&'6'='=L$..;K^:\($J )@(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r)   NNN)r%   r&   r'   r!   rs   rM   r?   rr   r-   rw   r   r   rv   r   __classcell__r   s   @r*   ry   ry      s    l{ ls l /3CG26%)||%) t+%) :D@	%)
 ((4/%) +,%) 
u||U\\D00	1%)r)   ry   c                        e Zd ZdZdef fdZ	 	 ddej                  dedz  dej                  dz  fdZ
ddedz  dej                  dz  fd	Z	 	 ddedz  dej                  dz  fd
Z xZS )JambaMambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    rF   c           	         t         |           || _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  |j                  z  | _
        |j                  | _        |j                  | _        |j                  | _        t#        j$                  | j                  | j                  | j                  | j                  | j                  | j                  dz
        | _        |j(                  | _        t,        |j(                     | _        t#        j0                  | j                  | j                  dz  | j                         | _        t#        j0                  | j                  | j                  | j                  dz  z   d      | _        t#        j0                  | j                  | j                  d      | _        t9        j:                  d| j                  dz         d d d f   }|j=                  | j                  d      j?                         }t#        j@                  t9        jB                  |            | _"        t#        j@                  t9        jF                  | j                              | _$        t#        j0                  | j                  | j                  | j                         | _%        tM        | j                  |jN                        | _(        tM        | j                  |jN                        | _)        tM        | j                  |jN                        | _*        tW        d	      }tY        |d
d       a-tY        |dd       a.tW        d      }t_        |d      a0tY        |dd       a1tY        |dd       a2tg        t`        tb        t\        tZ        td        f      a4th        stj        jm                  d       y y )Nr    )in_channelsout_channelsr}   kernel_sizegroupspaddingr   r|   FTr[   epszcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathselective_scan_fnmamba_inner_fna  The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d.)7r~   rM   rF   rT   r7   r8   rI   r9   rJ   r6   rH   mamba_dt_ranktime_step_rankmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1d
hidden_act
activationr   actr   in_projx_projdt_projr?   arangeexpandr   	ParameterlogA_logonesDout_projr#   rms_norm_epsdt_layernormb_layernormc_layernormr	   getattrr   r   r   selective_state_updater   r   allis_fast_path_availableloggerwarning_once)rE   rF   rT   Acausal_conv1d	mamba_ssmr   s         r*   rM   zJambaMambaMixer.__init__   s   "!--$22 & 3 3!'!4!4v7I7I!I$22#33..ii..//##--))))A-
 !++&++, yy!1!143I3IA3MTXTaTabii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX LLD//!34T1W=HHT++R0;;=\\%))A,/
ejj)?)?@A		$"8"8$:J:JQUQ^Q^_()<)<&BUBUV'(;(;ATATU'(;(;ATATU )9&}6LdS"=2DdK %[1	!8$^"
 $I/BDI ,<dC "%#%68HJ^`no"
 &R &r)   Nr   cache_paramsr   c                 4
   |j                   \  }}}|d uxrm |j                  xr_ |dk(  xrX |j                  | j                     j                   d   |j                  | j                     j                   d   cxk(  xr |k(  nc }| j                  |      j                  dd      }|j                  dd      \  }}	|||j                  d      z  }| j                  j                  j                  | j                  j                  j                  d      | j                  j                  j                  d            }
|ret        |j                  d      |j                  | j                     |
| j                  j                  | j                         }|j                  d      }n|dt"        j$                  j'                  || j(                  |j                   d   z
  df      }|j                  | j                     j+                  |       t-        ||
| j                  j                  | j                         }|||j                  d      z  }| j/                  |j                  dd            }t1        j2                  || j4                  | j6                  | j6                  gd      \  }}}| j9                  |      }| j;                  |      }| j=                  |      }| j>                  j                  j@                  }t1        jB                         5  t1        jD                  | j>                  j                  j@                        | j>                  j                  _         d d d        | j?                  |      j                  dd      }t1        jB                         5  || j>                  j                  _         d d d        t1        jF                  | jH                  jK                                }||jK                         nd }|r]tM        |j                  | j                     |d   |d   ||d d df   |d d df   | jN                  |	d   |d	
      j                  d      }n|tQ        ||||j                  dd      |j                  dd      | jN                  jK                         |	|dd

      \  }}|*|(|j                  | j                     j+                  |       | jS                  |j                  dd            }|S # 1 sw Y   xY w# 1 sw Y   WxY w)Nr    r   r   r\   r[   )r   ).r   T)dt_softplus)delta_softplusreturn_last_state)*r^   r5   r:   rT   r;   r   r   chunk	unsqueezer   weightr   sizer   squeezer}   r   r   
functionalpadrJ   copy_r   r   r?   splitr   rI   r   r   r   r   datano_grad
zeros_likeexpr   floatr   r   r   r   )rE   r   r   r   rG   seq_lenrL   use_precomputed_statesprojected_statesgateconv_weightsr:   ssm_parameters	time_stepBCtime_proj_biasdiscrete_time_stepr   scan_outputs	ssm_statecontextualized_statess                         r*   cuda_kernels_forwardz$JambaMambaMixer.cuda_kernels_forward  s    "/!4!4
GQ$ //1 ((8>>qA&&t~~6<<Q?	 	  <<6@@AF /44QA4>t%)N,D,DQ,GGM {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)((8  M *33B7M' mm//@U@UXeXkXklnXo@oqr?st((8>>{K,]L$++JZJZgkgvgvwM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a %%i0	QQ **//]]_ 	N%*%5%5dll6G6G6L6L%MDLL"	N!\\)4>>q!D]]_ 	4%3DLL"	4 YYtzz'')**3A3M--/SW!1''7f%"6*!Q$!Q$V  im  '8"Aq!Aq!#"&'#L) $)A''7==iH !%l.D.DQ.J K$$S	N 	N	4 	4s   AT T T
Tc           	      n   |j                   \  }}}|j                  }| j                  |      j                  dd      }|j	                  dd      \  }	}
||	|j                  d      z  }	t        |t              }|r8|j                  | j                     j                   d   |k(  r| j                  r(|j                  | j                     j                         }n|j                  | j                     }|j                  |	j                        }|j                  r|dk(  r|j                  | j                     j                   d   |k(  r|j                  | j                     }t!        j"                  |dd      }|	d d d d df   |d d d d df<   ||j                  | j                  <   t!        j$                  || j&                  j(                  d d dd d f   z  d      }	| j*                  r|	| j&                  j,                  z  }	| j/                  |	      j                  |      j                  d      }	nt0        j2                  j5                  |	| j6                  |	j                   d   z
  df      }||j                  | j                  <   | j/                  | j'                  |	      dd |f         }	n`t!        j8                  || j:                  | j<                  f|	j                  |      }| j/                  | j'                  |	      dd |f         }	||	|j                  d      z  }	| j?                  |	j                  dd            }t!        j@                  || jB                  | j<                  | j<                  gd      \  }}}| jE                  |      }| jG                  |      }| jI                  |      }| jK                  |      }t0        j2                  jM                  |      j                  dd      }t!        jN                  | jP                  jS                                }t!        jN                  |d d d d d d f   |d d d d d d d f   z        }|d d d d d d d f   |d d d d d d d f   jS                         z  }||	d d d d d d d f   jS                         z  }g }tU        |      D ]}  }|d d d d |d d f   |z  |d d d d |d d f   z   }t!        jV                  |j                  |      |d d |d d f   j                  d            }|jY                  |d d d d df           t!        jZ                  |d      }||	| j\                  d d d d f   z  z   }|| j/                  |
      z  }|r||j                  | j                  <   | j_                  |j                  dd            }|S )	Nr    r   r\   r   r[   )shiftsdims.r0   )0r^   r2   r   r   r   r   
isinstancer-   r;   rT   r   clonere   r1   r5   r:   r?   rollsumr   r   r   r}   r   r   r   r   rJ   r@   rH   rI   r   r   r   r   r   r   r   softplusr   r   r   r=   matmulrB   stackr   r   )rE   input_statesr   r   rG   r   rL   r2   r   r   r   	use_cacher   
conv_stater   r   r   r   r   r   
discrete_A
discrete_BdeltaB_ur   rK   scan_outputr   s                              r*   slow_forwardzJambaMambaMixer.slow_forwardw  s   !-!3!3
GQ""<<5??1E.44QA4>t%)N,D,DQ,GGM|-MN	00@FFqIZW}}(33DNNCIIK	(33DNNC	!]%9%9:I..7a< ,,T^^<BB1ES)55dnnE
"ZZ
2BG
'4Q1W'=
1a8$;E((8 %		*t{{7I7I!QPQ'7R*RXZ [%%!T[[%5%55M $ 7 : :5 A K KB O]]..!**]-@-@-DDaH
 <F((8 $])CC'M)R ST33T5H5HI$++5I !HHT[[%?XgX%NOM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a %%i0	QQ!\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DDw 	6A"1aA:.:XaAqj=QQI,,y||E':AaAgJ<P<PQS<TUKAq!G 45	6 kk,B7!]TVVD!TM5J%JK"TXXd^36?L##DNN3 !%k.C.CAq.I J$$r)   c                 V   | j                   j                  rXt        r,d| j                  j                  j
                  j                  vr&t        j                  d       d| j                   _        | j                   j                  r| j                  |||      S | j                  |||      S )NcudazFast Mamba kernels are not available. Make sure that they are installed and that the mamba module is on a CUDA device. Turning off the fast path `config.use_mamba_kernels=False` and falling back to the slow path.F)rF   use_mamba_kernelsr   r   r   r1   typer   r   r   r  )rE   r   r   r   s       r*   r   zJambaMambaMixer.forward  s     ;;((&&8J8J8Q8Q8V8V*VV
 -2DKK);;((,,]L.YY  nMMr)   )NN)r%   r&   r'   ro   r!   rM   r?   rr   r-   rw   r   r  r   r   r   s   @r*   r   r      s    A{ AL AE26	h%||h% 7=h% ((4/	h%VR%7WZ^7^ R%w|  xH  xH  KO  xO R%p AE26	N 7=N ((4/	Nr)   r   c                       e Zd Zy)JambaMLPNr$   r(   r)   r*   r  r    r+   r)   r  c                       e Zd Zy)JambaExpertsNr$   r(   r)   r*   r  r    r+   r)   r  c                   f     e Zd ZdZdef fdZd Zdej                  dej                  fdZ	 xZ
S )JambaSparseMoeBlocka  
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accommodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    rF   c                 ,   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        |      | _        y r{   )r~   rM   r7   
hidden_dimrH   ffn_dimnum_expertsnum_experts_per_toktop_kr   r   routerr  expertsrE   rF   r   s     r*   rM   zJambaSparseMoeBlock.__init__  sm     ,,//!--//
ii1A1AN#F+r)   c                     t         j                  j                  j                  |dt         j                        }t        j
                  || j                  d      \  }}||j                  |j                        fS )Nr[   )r]   r2   r\   )	r?   r   r   softmaxr   topkr  re   r2   )rE   r   router_logitsrouting_weightstop_k_weightstop_k_indexs         r*   route_tokens_to_expertsz+JambaSparseMoeBlock.route_tokens_to_experts  sb    ((--55mSXS^S^5_%*ZZQS%T"{M,,]-@-@AAAr)   r   rY   c                     |j                   \  }}}|j                  d|      }| j                  |      }| j                  ||      \  }}| j	                  |||      }|j                  |||      }|S )Nr[   )r^   r   r  r"  r  r   )rE   r   rG   sequence_lengthr  r  r!  r   s           r*   r   zJambaSparseMoeBlock.forward  sx    2?2E2E/
OZ%**2z:M2%)%A%A-Q^%_"]]KO%--j/:Vr)   )r%   r&   r'   ro   r!   rM   r"  r?   rr   r   r   r   s   @r*   r  r    s5    	,{ ,B
U\\ ell r)   r  c                        e Zd Zdedef fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
ej                  dz  dee   dej                  fdZ xZS )JambaAttentionDecoderLayerrF   rT   c                 R   t         |           |j                  r|j                  |   nd}t        ||      | _        |dkD  rt
        nt        } ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr    r   )r~   rM   layers_num_expertsry   	self_attnr  r  feed_forwardr#   r7   r   input_layernormpre_ff_layernormrE   rF   rT   r  ffn_layer_classr   s        r*   rM   z#JambaAttentionDecoderLayer.__init__  s    >D>W>Wf//	:]^'	:1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yr)   Nr   r   position_idsr   r  rg   r   rY   c           
          |}| j                  |      } | j                  d||||||d|\  }}	||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r   r   r/  r   r  rg   r(   )r+  r)  r,  r*  )
rE   r   r   r/  r   r  rg   r   residualrL   s
             r*   r   z"JambaAttentionDecoderLayer.forward  s     !,,];)4>> 
')%+)
 
q !=0 --m<))-8 =0r)   )NNNFN)r%   r&   r'   r!   rs   rM   r?   rr   rw   r-   boolr   r   FloatTensorr   r   r   s   @r*   r&  r&    s    Z{ Zs Z /304CG!&26|| t+ &&-	
 :D@ $; ((4/ +, 
		r)   r&  c                        e Zd Zdedef fdZ	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
e   d
ej                  fdZ xZS )JambaMambaDecoderLayerrF   rT   c                 T   t         |           |j                  r|j                  |   nd}t        ||      | _        |dkD  rt
        nt        } ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr    )rF   rT   r   )r~   rM   r(  r   r/   r  r  r*  r#   r7   r   r+  r,  r-  s        r*   rM   zJambaMambaDecoderLayer.__init__7  s    >D>W>Wf//	:]^$FiH
1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yr)   Nr   r   r/  r   r   rY   c                     |}| j                  |      }| j                  |||      }||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r   r   r   )r+  r/   r,  r*  )rE   r   r   r/  r   r   r1  s          r*   r   zJambaMambaDecoderLayer.forward@  sv     !,,];

'() # 

 !=0 --m<))-8 =0r)   r   )r%   r&   r'   r!   rs   rM   r?   rr   rw   r-   r   r   r3  r   r   r   s   @r*   r5  r5  6  s    Z{ Zs Z /304CG|| t+ &&-	
 :D@ +, 
		r)   r5  )	attentionr/   c                        e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZeege eej"                  d      d	Z ej(                          fd
       Z xZS )JambaPreTrainedModelrF   modelTr&  r5  r   r  )
layer_name)r   
attentionsr  c                    t         |   |       t        |t              rt	        j
                  d|j                  dz         d d d f   }|j                  |j                  d      j                         }t        j                  |j                  t	        j                  |             t        j                  |j                         y t        |t               rmt        j"                  |j$                  d| j&                  j(                         t        j"                  |j*                  d| j&                  j(                         y y )Nr    r[   r   )meanstd)r~   _init_weightsr   r   r?   r   rI   r   rH   r   initr   r   r   ones_r   r  normal_gate_up_projrF   initializer_range	down_proj)rE   moduler   r   s      r*   rA  z"JambaPreTrainedModel._init_weightsi  s    f%fo.Q 5 5 9:47CA1126AACAJJv||UYYq\2JJvxx -LL,,3DKK<Y<YZLL))9V9VW .r)   )r%   r&   r'   r!   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr&  r5  ry   r   r   r   _can_record_outputsr?   r   rA  r   r   s   @r*   r:  r:  Z  sw    &*#57OP"3NL46LM$'		hG U]]_	X 	Xr)   r:  c                       e Zd Zdef fdZeee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	edz  d
ej                  dz  dee   defd                     Zd Z xZS )
JambaModelrF   c                     t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        g }t        |j                        D ]1  }t        |j                  |      }|j                   |||             3 t        j                  |      | _        t!        |j                  |j"                        | _        d| _        | j)                          y )N)rT   r   F)r~   rM   pad_token_idpadding_idx
vocab_sizer   	Embeddingr7   embed_tokensr=   r>   ALL_DECODER_LAYER_TYPESr4   rB   
ModuleListlayersr#   r   final_layernormgradient_checkpointing	post_init)rE   rF   decoder_layersrK   layer_classr   s        r*   rM   zJambaModel.__init__x  s     !.. ++LL):):F<N<NPTP`P`av//0 	DA1&2J2J12MNK!!+f"BC	D mmN3+F,>,>FDWDWX&+#r)   N	input_idsr   r/  r   inputs_embedsr  rg   r   rY   c           
         |d u |d uz  rt        d      || j                  |      }|r<|:t        | j                  |j                  d   |j
                  |j                        }|F||j                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }
| j                  ||      }|}| j                  D ]%  }t        |t              r|n|
} ||f|||||d|}' | j!                  |      }|r|j"                  sd|_        t%        ||	      S )
Nz:You must specify exactly one of input_ids or inputs_embedsr   )rF   rG   r2   r1   r    r3   )rF   rc  r   rg   r   r/  )r   r/  r   r  rg   T)last_hidden_stater   )
ValueErrorrY  r-   rF   r^   r2   r1   rc   r?   r   r   r
   _update_mamba_maskr\  r   r5  r]  r5   r   )rE   rb  r   r/  r   rc  r  rg   r   past_seen_tokenscausal_mask
mamba_maskr   decoder_layer
layer_masks                  r*   r   zJambaModel.forward  s    -t";<YZZ  --i8M0>{{(..q1#))$++	O !CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;'))+%
 ,,^^L
%![[ 	M'1-AW'X^iJ))) /#- M	 ,,];?#E#E15O.%++
 	
r)   c                 V    |}||d   dkD  s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        Nr   r    )r?   r   )rE   r   rg   rj  s       r*   rg  zJambaModel._update_mamba_mask  s<     $
&>!+<q+@&599^q5H+IJr)   )NNNNNNN)r%   r&   r'   r!   rM   r   r   r   r?   rw   rr   r-   r3  r2  r   r   r   r   rg  r   r   s   @r*   rS  rS  v  s    { $   .2.204CG26!%26A
##d*A
 t+A
 &&-	A

 :D@A
 ((4/A
 $;A
 ((4/A
 +,A
 
 A
    A
Fr)   rS  c                   D    e Zd Zdef fdZ	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	ej                  dz  d
e
dz  de
dz  dej                  dz  deej                  z  dee   def fdZ xZS )JambaForCausalLMrF   c                 F    t         |   |       |j                  | _        y rO   )r~   rM   r  r  s     r*   rM   zJambaForCausalLM.__init__  s     !--r)   Nrb  r   r/  r   rc  labelsr  output_router_logitsrg   logits_to_keepr   rY   c                 4    t        |   ||||||||	|
f	i |S )aj  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, JambaForCausalLM

        >>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
        >>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```)r~   r   )rE   rb  r   r/  r   rc  rq  r  rr  rg   rs  r   r   s               r*   r   zJambaForCausalLM.forward  s<    H w
 
 	
r)   )
NNNNNNNNNr   )r%   r&   r'   r!   rM   r?   rw   rr   r-   r3  r2  rs   r   r   r   r   r   r   s   @r*   ro  ro    s   .{ . .2.204CG26*.!%,026-./
##d*/
 t+/
 &&-	/

 :D@/
 ((4//
   4'/
 $;/
 #Tk/
 ((4//
 ell*/
 +,/
 
#/
 /
r)   ro  c                       e Zd Zy)JambaForSequenceClassificationNr$   r(   r)   r*   rv  rv    r+   r)   rv  )ro  rv  rS  r:  )Ccollections.abcr   typingr   r?   r    r   rB  activationsr   integrationsr	   masking_utilsr
   modeling_layersr   r   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.import_utilsr   utils.output_capturingr   r   llama.modeling_llamar   r   r   mistral.modeling_mistralr   mixtral.modeling_mixtralr   r   configuration_jambar!   
get_loggerr%   r   r#   r-   ry   Moduler   r  r  r  r&  r5  rZ  r:  rS  ro  rv  __all__r(   r)   r*   <module>r     s@  & %    & ! , / [ Q F & @ @ 7 9 E X X 1 I , 
		H	%	< 	\3 \3~-)^ -)`]Nbii ]N@		z 		> 	"")) "J%!; %P7 B )CMcd X? X8 d% d dN4
) 4
n	%EG[ 	 gr)   