
    qin                        d dl mZ d dlmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-  e$j\                  e/      Z0 ed       G d dejb                               Z2 G d d      Z3d Z4 ed      dEd       Z5dejl                  de7dejl                  fd Z8	 dFd!ejb                  d"ejl                  d#ejl                  d$ejl                  d%ejl                  dz  d&e9d'e9d(ee!   fd)Z: ee5       G d* d+ejb                               Z; G d, d-ejb                        Z< G d. d/ejb                        Z=e G d0 d1ejb                               Z> G d2 d3ejb                        Z? G d4 d5e      Z@ G d6 d7e      ZA G d8 d9e      ZBe@eAd:ZCe" G d; d<eB             ZD	 	 	 dGd=ejl                  eEejl                     z  dz  d>e7dz  d%ejl                  dz  dejl                  e7z  fd?ZFe" G d@ dAeBe             ZG G dB dCeeB      ZHg dDZIy)H    )Callable)AnyN)nn   )initialization)ACT2FN)GenerationMixin)lazy_load_kerneluse_experts_implementationuse_kernel_forward_from_hubuse_kernel_func_from_hubuse_kernelized_func)create_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)merge_with_config_defaults)resolve_internal_import)OutputRecordercapture_outputs   )JambaConfigRMSNormc                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	JambaRMSNormepsreturnNc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        JambaRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer$   	__class__s      Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/jamba/modeling_jamba.pyr(   zJambaRMSNorm.__init__:   s1     	ll5::k#:; #    hidden_statesc                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor*   float32powmeanrsqrtr-   r,   )r.   r3   input_dtypevariances       r1   forwardzJambaRMSNorm.forwardB   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r2   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler,   shaper-   r.   s    r1   
extra_reprzJambaRMSNorm.extra_reprI   s*    ))*+6$2G2G1HIIr2   )gư>)
__name__
__module____qualname__floatr(   r*   Tensorr@   rE   __classcell__r0   s   @r1   r#   r#   8   s7    $ $$ $;U\\ ;ell ;Jr2   r#   c                   4   e Zd ZdZdZej                  dfdZd Zd Z		 ddej                  dej                  d	ed
eeef   dz  deej                  ej                  f   f
dZdej"                  fdZdej                  d	edeeef   fdZdd	edz  defdZy) HybridMambaAttentionDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNc           
         || _         |j                  | _        d| _        |j                  |j                  z  }|j
                  }|j                  }g | _        g | _        g | _	        t        |j                        D ]  }| j                  |   dk(  r]| xj                  t        j                  |||||      gz  c_        | xj                  t        j                  |||||      gz  c_        r| xj                  t        j                  g g|z  |      gz  c_        | xj                  t        j                  g g|z  |      gz  c_        | j                  j                  |        t        |j                        D 	cg c]  }	t        j                  g g|z  |       c}	| _        t        |j                        D 	cg c]  }	t        j                  g g|z  |       c}	| _        y c c}	w c c}	w )NFmambadevicer8   rR   )r8   layers_block_typehas_previous_statemamba_expandr/   mamba_d_statemamba_d_convconv_states
ssm_statestransformer_layersrangenum_hidden_layersr*   zerostensorappend	key_cachevalue_cache)
r.   config
batch_sizer8   rR   intermediate_sizessm_state_sizeconv_kernel_sizei_s
             r1   r(   z)HybridMambaAttentionDynamicCache.__init__]   s   
!'!9!9"'"//&2D2DD--!.."$v//0 	2A%%a(G3  KK
,=?OX^fkl%   KK
,=~V\dij$    U\\2$2CF%S$TT ELL"
1B6$R#SS''..q1	2 SXX^XpXpRqrQ%,,tj'8HrTYZ`ZrZrTstqELL"
):6Jt sts   ?"G+ "G0c                 ,    t        | j                        S N)lenra   rD   s    r1   __len__z(HybridMambaAttentionDynamicCache.__len__w   s    4>>""r2   c                 >    | j                   |   | j                  |   fS rk   )ra   rb   r.   	layer_idxs     r1   __getitem__z,HybridMambaAttentionDynamicCache.__getitem__z   s!    ~~i($*:*:9*EEEr2   
key_statesvalue_statesrp   cache_kwargsr%   c                    | j                   |   j                  d   dk(  r|| j                   |<   || j                  |<   nft        j                  | j                   |   |gd      | j                   |<   t        j                  | j                  |   |gd      | j                  |<   | j                   |   | j                  |   fS )Nr6   r   r5   dim)ra   rC   rb   r*   cat)r.   rr   rs   rp   rt   s        r1   updatez'HybridMambaAttentionDynamicCache.update}   s     >>)$**2.!3(2DNN9%*6DY'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEEr2   beam_idxc                    | j                         dkD  rvt        t        | j                              D ]S  }| j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   V yy)zDReorders the cache for beam search, given the selected beam indices.r   N)
get_seq_lengthr\   rl   ra   rR   index_selectr9   rb   rY   rZ   )r.   rz   rp   rR   s       r1   reorder_cachez.HybridMambaAttentionDynamicCache.reorder_cache   s[    1$"3t~~#67 	m		299,0NN9,E,R,RSTV^VaVabhVi,jy))))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +)))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +3::-1__Y-G-T-TUVX`XcXcdjXk-l	*	m %r2   cache_positionc                 T    d}|j                   d   }| j                  |      |z   }||fS )zDReturn the length and offset of the cache, used to generate the maskr   )rC   r|   )r.   r   rp   	kv_offsetquery_length	kv_lengths         r1   get_mask_sizesz/HybridMambaAttentionDynamicCache.get_mask_sizes   s7    	%++A.''	2\A	)##r2   c                     || j                   vr| j                   d   n|}t        | j                        |k  s| j                  |   j                  d   dk(  ry| j                  |   j                  d   S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   r6   )r[   rl   ra   rC   ro   s     r1   r|   z/HybridMambaAttentionDynamicCache.get_seq_length   sn     3<4CZCZ2ZD++A.`i	t~~)+t~~i/H/N/Nr/RVW/W~~i(..r22r2   rk   )r   )rF   rG   rH   __doc__is_compileabler*   float16r(   rm   rq   rJ   intdictstrr   rB   ry   
LongTensorr~   r   r|    r2   r1   rN   rN   M   s     N16t u4#F /3FLLF llF 	F
 38nt+F 
u||U\\)	*F"me&6&6 m$U\\ $c $eTWY\T\o $3d
 33 3r2   rN   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr6   r5   rv   )rC   r*   rx   )xx1x2s      r1   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r2   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkcossinunsqueeze_dimq_embedk_embeds          r1   apply_rotary_pos_embr      sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGr2   r3   n_repr%   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)rC   expandreshape)r3   r   batchnum_key_value_headsslenhead_dims         r1   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr2   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
dt        j                        j                  |j                        }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr5   r   r6   rw   r8   )ptrainingr   )r   num_key_value_groupsr*   matmul	transposer   
functionalsoftmaxr:   r9   r8   r   r   
contiguous)r   r   r   r   r   r   r   r   rr   rs   attn_weightsattn_outputs               r1   eager_attention_forwardr      s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r2   c                        e Zd ZdZdedef fdZ	 	 	 ddej                  dej                  dz  de	dz  d	ej                  dz  d
ee   deej                  ej                  dz  f   fdZ xZS )JambaAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrc   rp   c                    t         |           || _        || _        t	        |d|j
                  |j                  z        | _        |j                  |j                  z  | _	        | j                  dz  | _
        |j                  | _        d| _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j
                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j
                  d      | _        y )Nr   g      TFbias)r'   r(   rc   rp   getattrr/   num_attention_headsr   r   r   r   attention_dropout	is_causalr   Linearq_projk_projv_projo_proj)r.   rc   rp   r0   s      r1   r(   zJambaAttention.__init__   s,   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii : :T]] JFL^L^ejkr2   Nr3   r   past_key_valuesr   r   r%   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
|#|j                  |	|
| j                  d|i      \  }	}
t        j                  | j                  j                  t              } || ||	|
|f| j                  sdn| j                  | j                   d|\  }} |j"                  g |d j%                         }| j'                  |      }||fS )Nr6   r   r5   r           )r   r   )rC   r   r   viewr   r   r   ry   rp   r   get_interfacerc   _attn_implementationr   r   r   r   r   r   r   )r.   r3   r   r   r   r   input_shapehidden_shapequery_statesrr   rs   attention_interfacer   r   s                 r1   r@   zJambaAttention.forward  s|    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&'6'='=L$..;K^:\($J )@(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r2   NNN)rF   rG   rH   r   r    r   r(   r*   rJ   rN   r   r   r   rB   r@   rK   rL   s   @r1   r   r      s    Gl{ ls l" /3CG26%)||%) t+%) :D@	%)
 ((4/%) +,%) 
u||U\\D00	1%)r2   r   c                        e Zd ZdZdef fdZ	 	 ddej                  dedz  dej                  dz  fdZ
ddedz  dej                  dz  fd	Z	 	 ddedz  dej                  dz  fd
Z xZS )JambaMambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    rc   c           	         t         |           || _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  |j                  z  | _
        |j                  | _        |j                  | _        |j                  | _        t#        j$                  | j                  | j                  | j                  | j                  | j                  | j                  dz
        | _        |j(                  | _        t,        |j(                     | _        t#        j0                  | j                  | j                  dz  | j                         | _        t#        j0                  | j                  | j                  | j                  dz  z   d      | _        t#        j0                  | j                  | j                  d      | _        t9        j:                  d| j                  dz         d d d f   }|j=                  | j                  d      j?                         }t#        j@                  t9        jB                  |            | _"        t#        j@                  t9        jF                  | j                              | _$        t#        j0                  | j                  | j                  | j                         | _%        tM        | j                  |jN                        | _(        tM        | j                  |jN                        | _)        tM        | j                  |jN                        | _*        tW        d	      }tY        |d
d       a-tY        |dd       a.tW        d      }t_        |d      a0tY        |dd       a1tY        |dd       a2tg        t`        tb        t\        tZ        td        f      a4th        stj        jm                  d       y y )Nr   )in_channelsout_channelsr   kernel_sizegroupspaddingr5   r   FTr6   r$   zcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathselective_scan_fnmamba_inner_fna  The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d.)7r'   r(   rc   rp   r/   rW   rf   rX   rg   rV   re   mamba_dt_ranktime_step_rankmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1d
hidden_act
activationr   actr   in_projx_projdt_projr*   aranger   r   r)   logA_logr+   Dout_projr#   rms_norm_epsdt_layernormb_layernormc_layernormr
   r   r   r   r   selective_state_updater   r   allis_fast_path_availableloggerwarning_once)r.   rc   rp   Acausal_conv1d	mamba_ssmr0   s         r1   r(   zJambaMambaMixer.__init__4  s   "!--$22 & 3 3!'!4!4v7I7I!I$22#33..ii..//##--))))A-
 !++&++, yy!1!143I3IA3MTXTaTabii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX LLD//!34T1W=HHT++R0;;=\\%))A,/
ejj)?)?@A		$"8"8$:J:JQUQ^Q^_()<)<&BUBUV'(;(;ATATU'(;(;ATATU )9&}6LdS"=2DdK %[1	!8$^"
 $I/BDI ,<dC "%#%68HJ^`no"
 &R &r2   Nr3   cache_paramsr   c                 4
   |j                   \  }}}|d uxrm |j                  xr_ |dk(  xrX |j                  | j                     j                   d   |j                  | j                     j                   d   cxk(  xr |k(  nc }| j                  |      j                  dd      }|j                  dd      \  }}	|||j                  d      z  }| j                  j                  j                  | j                  j                  j                  d      | j                  j                  j                  d            }
|ret        |j                  d      |j                  | j                     |
| j                  j                  | j                         }|j                  d      }n|dt"        j$                  j'                  || j(                  |j                   d   z
  df      }|j                  | j                     j+                  |       t-        ||
| j                  j                  | j                         }|||j                  d      z  }| j/                  |j                  dd            }t1        j2                  || j4                  | j6                  | j6                  gd      \  }}}| j9                  |      }| j;                  |      }| j=                  |      }| j>                  j                  j@                  }t1        jB                         5  t1        jD                  | j>                  j                  j@                        | j>                  j                  _         d d d        | j?                  |      j                  dd      }t1        jB                         5  || j>                  j                  _         d d d        t1        jF                  | jH                  jK                                }||jK                         nd }|r]tM        |j                  | j                     |d   |d   ||d d df   |d d df   | jN                  |	d   |d	
      j                  d      }n|tQ        ||||j                  dd      |j                  dd      | jN                  jK                         |	|dd

      \  }}|*|(|j                  | j                     j+                  |       | jS                  |j                  dd            }|S # 1 sw Y   xY w# 1 sw Y   WxY w)Nr   r   r5   rv   r6   )r   ).r   T)dt_softplus)delta_softplusreturn_last_state)*rC   rU   rY   rp   rZ   r   r   chunkr   r   r,   r   sizer   squeezer   r   r   r   padrg   copy_r   r   r*   splitr   rf   r   r   r   r   datano_grad
zeros_likeexpr   rI   r   r   r   r   )r.   r3   r  r   rd   seq_lenri   use_precomputed_statesprojected_statesgateconv_weightsrY   ssm_parameters	time_stepBCtime_proj_biasdiscrete_time_stepr   scan_outputs	ssm_statecontextualized_statess                         r1   cuda_kernels_forwardz$JambaMambaMixer.cuda_kernels_forwardw  s    "/!4!4
GQ$ //1 ((8>>qA&&t~~6<<Q?	 	  <<6@@AF /44QA4>t%)N,D,DQ,GGM {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)((8  M *33B7M' mm//@U@UXeXkXklnXo@oqr?st((8>>{K,]L$++JZJZgkgvgvwM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a %%i0	QQ **//]]_ 	N%*%5%5dll6G6G6L6L%MDLL"	N!\\)4>>q!D]]_ 	4%3DLL"	4 YYtzz'')**3A3M--/SW!1''7f%"6*!Q$!Q$V  im  '8"Aq!Aq!#"&'#L) $)A''7==iH !%l.D.DQ.J K$$S	N 	N	4 	4s   AT T T
Tc           	      n   |j                   \  }}}|j                  }| j                  |      j                  dd      }|j	                  dd      \  }	}
||	|j                  d      z  }	t        |t              }|r8|j                  | j                     j                   d   |k(  r| j                  r(|j                  | j                     j                         }n|j                  | j                     }|j                  |	j                        }|j                  r|dk(  r|j                  | j                     j                   d   |k(  r|j                  | j                     }t!        j"                  |dd      }|	d d d d df   |d d d d df<   ||j                  | j                  <   t!        j$                  || j&                  j(                  d d dd d f   z  d      }	| j*                  r|	| j&                  j,                  z  }	| j/                  |	      j                  |      j                  d      }	nt0        j2                  j5                  |	| j6                  |	j                   d   z
  df      }||j                  | j                  <   | j/                  | j'                  |	      dd |f         }	n`t!        j8                  || j:                  | j<                  f|	j                  |      }| j/                  | j'                  |	      dd |f         }	||	|j                  d      z  }	| j?                  |	j                  dd            }t!        j@                  || jB                  | j<                  | j<                  gd      \  }}}| jE                  |      }| jG                  |      }| jI                  |      }| jK                  |      }t0        j2                  jM                  |      j                  dd      }t!        jN                  | jP                  jS                                }t!        jN                  |d d d d d d f   |d d d d d d d f   z        }|d d d d d d d f   |d d d d d d d f   jS                         z  }||	d d d d d d d f   jS                         z  }g }tU        |      D ]}  }|d d d d |d d f   |z  |d d d d |d d f   z   }t!        jV                  |j                  |      |d d |d d f   j                  d            }|jY                  |d d d d df           t!        jZ                  |d      }||	| j\                  d d d d f   z  z   }|| j/                  |
      z  }|r||j                  | j                  <   | j_                  |j                  dd            }|S )	Nr   r5   rv   r   r6   )shiftsdims.rQ   )0rC   r8   r   r   r  r   
isinstancerN   rZ   rp   r   cloner9   rR   rU   rY   r*   rollsumr   r,   r   r   r   r   r   r  rg   r^   re   rf   r   r  r   r   r   r   r   softplusr  r   rI   r\   r   r`   stackr   r   )r.   input_statesr  r   rd   r  ri   r8   r  r3   r  	use_cacher  
conv_stater  r  r  r  r  r   
discrete_A
discrete_BdeltaB_ur  rh   scan_outputr  s                              r1   slow_forwardzJambaMambaMixer.slow_forward  s   !-!3!3
GQ""<<5??1E.44QA4>t%)N,D,DQ,GGM|-MN	00@FFqIZW}}(33DNNCIIK	(33DNNC	!]%9%9:I..7a< ,,T^^<BB1ES)55dnnE
"ZZ
2BG
'4Q1W'=
1a8$;E((8 %		*t{{7I7I!QPQ'7R*RXZ [%%!T[[%5%55M $ 7 : :5 A K KB O]]..!**]-@-@-DDaH
 <F((8 $])CC'M)R ST33T5H5HI$++5I !HHT[[%?XgX%NOM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a %%i0	QQ!\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DDw 	6A"1aA:.:XaAqj=QQI,,y||E':AaAgJ<P<PQS<TUKAq!G 45	6 kk,B7!]TVVD!TM5J%JK"TXXd^36?L##DNN3 !%k.C.CAq.I J$$r2   c                 V   | j                   j                  rXt        r,d| j                  j                  j
                  j                  vr&t        j                  d       d| j                   _        | j                   j                  r| j                  |||      S | j                  |||      S )NcudazFast Mamba kernels are not available. Make sure that they are installed and that the mamba module is on a CUDA device. Turning off the fast path `config.use_mamba_kernels=False` and falling back to the slow path.F)rc   use_mamba_kernelsr   r   r,   rR   typer   r   r   r1  )r.   r3   r  r   s       r1   r@   zJambaMambaMixer.forward7  s     ;;((&&8J8J8Q8Q8V8V*VV
 -2DKK);;((,,]L.YY  nMMr2   )NN)rF   rG   rH   r   r    r(   r*   rJ   rN   r   r   r1  r@   rK   rL   s   @r1   r   r   ,  s    A{ AL AE26	h%||h% 7=h% ((4/	h%VR%7WZ^7^ R%w|  xH  xH  KO  xO R%p AE26	N 7=N ((4/	Nr2   r   c                   $     e Zd Z fdZd Z xZS )JambaMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        y NFr   )r'   r(   rc   r/   re   r   r   	gate_projup_proj	down_projr   r   act_fnr.   rc   r0   s     r1   r(   zJambaMLP.__init__M  s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r2   c                     | j                  | j                  | j                  |            | j                  |      z        }|S rk   )r<  r=  r:  r;  )r.   r   r<  s      r1   r@   zJambaMLP.forwardW  s6    NN4;;t~~a/@#ADLLQRO#ST	r2   )rF   rG   rH   r(   r@   rK   rL   s   @r1   r7  r7  L  s    0r2   r7  c                        e Zd ZdZdef fdZdej                  dej                  dej                  dej                  fdZ xZ	S )	JambaExpertsz2Collection of expert weights stored as 3D tensors.rc   c                    t         |           |j                  | _        |j                  | _        |j                  | _        t        j                  t        j                  | j                  d| j                  z  | j
                              | _        t        j                  t        j                  | j                  | j
                  | j                              | _        t        |j                     | _        y )Nr5   )r'   r(   num_local_expertsnum_expertsr/   
hidden_dimre   intermediate_dimr   r)   r*   emptygate_up_projr<  r   r   r=  r>  s     r1   r(   zJambaExperts.__init__`  s    !33 ,, & 8 8LLT5E5Eq4K`K`G`bfbqbq)rsekk$2B2BDOOUYUjUj&klV../r2   r3   top_k_indextop_k_weightsr%   c                 f   t        j                  |      }t        j                         5  t         j                  j                  j                  || j                        }|j                  ddd      }t        j                  |j                  d      d      j                         }d d d        D ]  }|d   }|| j                  k(  rt        j                  |         \  }}	||	   }
t        j                  j                  |
| j                  |         j                  dd      \  }}| j                  |      |z  }t        j                  j                  || j                   |         }|||	|d f   z  }|j#                  d|	|j%                  |j&                                |S # 1 sw Y   xY w)N)num_classesr5   r   r   )r6   r   rv   r6   )r*   r  r  r   r   one_hotrD  permutegreaterr'  nonzerowherelinearrH  r  r=  r<  
index_add_r9   r8   )r.   r3   rI  rJ  final_hidden_statesexpert_mask
expert_hit
expert_idx	top_k_pos	token_idxcurrent_stater  upcurrent_hidden_statess                 r1   r@   zJambaExperts.forwardi  s    $..}=]]_ 	S((--55ktO_O_5`K%--aA6K{8'DaHPPRJ	S
 % 
	nJ#AJT---#(;;{:/F#G Iy))4M}}++M4;L;LZ;XY__`agi_jHD"$(KK$5$:!$&MM$8$89NPTP^P^_iPj$k!$9M)U^`dJd<e$e!**1i9N9Q9QReRkRk9lm
	n #"#	S 	Ss   A=F&&F0)
rF   rG   rH   r   r    r(   r*   rJ   r@   rK   rL   s   @r1   rA  rA  \  sM    <0{ 0#||# \\# ||	#
 
#r2   rA  c                   f     e Zd ZdZdef fdZd Zdej                  dej                  fdZ	 xZ
S )JambaSparseMoeBlocka  
    This implementation is
    strictly equivalent to standard MoE with full capacity (no
    dropped tokens). It's faster since it formulates MoE operations
    in terms of block-sparse operations to accommodate imbalanced
    assignments of tokens to experts, whereas standard MoE either
    (1) drop tokens at the cost of reduced performance or (2) set
    capacity factor to number of experts and thus waste computation
    and memory on padding.
    rc   c                 ,   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        |      | _        y r9  )r'   r(   r/   rE  re   ffn_dimrD  num_experts_per_toktop_kr   r   routerrA  expertsr>  s     r1   r(   zJambaSparseMoeBlock.__init__  sm     ,,//!--//
ii1A1AN#F+r2   c                     t         j                  j                  j                  |dt         j                        }t        j
                  || j                  d      \  }}||j                  |j                        fS )Nr6   r   rv   )	r*   r   r   r   rI   topkrb  r9   r8   )r.   r3   router_logitsrouting_weightsrJ  rI  s         r1   route_tokens_to_expertsz+JambaSparseMoeBlock.route_tokens_to_experts  sb    ((--55mSXS^S^5_%*ZZQS%T"{M,,]-@-@AAAr2   r3   r%   c                     |j                   \  }}}|j                  d|      }| j                  |      }| j                  ||      \  }}| j	                  |||      }|j                  |||      }|S )Nr6   )rC   r   rc  ri  rd  r   )r.   r3   rd   sequence_lengthrE  rg  rI  rJ  s           r1   r@   zJambaSparseMoeBlock.forward  sx    2?2E2E/
OZ%**2z:M2%)%A%A-Q^%_"]]KO%--j/:Vr2   )rF   rG   rH   r   r    r(   ri  r*   rJ   r@   rK   rL   s   @r1   r^  r^    s5    	,{ ,B
U\\ ell r2   r^  c                        e Zd Zdedef fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
ej                  dz  dee   dej                  fdZ xZS )JambaAttentionDecoderLayerrc   rp   c                 R   t         |           |j                  r|j                  |   nd}t        ||      | _        |dkD  rt
        nt        } ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr   r   )r'   r(   layers_num_expertsr   	self_attnr^  r7  feed_forwardr#   r/   r   input_layernormpre_ff_layernormr.   rc   rp   rD  ffn_layer_classr0   s        r1   r(   z#JambaAttentionDecoderLayer.__init__  s    >D>W>Wf//	:]^'	:1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yr2   Nr3   r   position_idsr   r+  r   r   r%   c           
          |}| j                  |      } | j                  d||||||d|\  }}	||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r3   r   rv  r   r+  r   r   )rr  rp  rs  rq  )
r.   r3   r   rv  r   r+  r   r   residualri   s
             r1   r@   z"JambaAttentionDecoderLayer.forward  s     !,,];)4>> 
')%+)
 
q !=0 --m<))-8 =0r2   )NNNFN)rF   rG   rH   r    r   r(   r*   rJ   r   rN   boolr   r   FloatTensorr@   rK   rL   s   @r1   rm  rm    s    Z{ Zs Z /304CG!&26|| t+ &&-	
 :D@ $; ((4/ +, 
		r2   rm  c                        e Zd Zdedef fdZ	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
e   d
ej                  fdZ xZS )JambaMambaDecoderLayerrc   rp   c                 T   t         |           |j                  r|j                  |   nd}t        ||      | _        |dkD  rt
        nt        } ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr   )rc   rp   r   )r'   r(   ro  r   rP   r^  r7  rq  r#   r/   r   rr  rs  rt  s        r1   r(   zJambaMambaDecoderLayer.__init__  s    >D>W>Wf//	:]^$FiH
1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yr2   Nr3   r   rv  r   r   r%   c                     |}| j                  |      }| j                  |||      }||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r3   r  r   )rr  rP   rs  rq  )r.   r3   r   rv  r   r   rx  s          r1   r@   zJambaMambaDecoderLayer.forward  sv     !,,];

'() # 

 !=0 --m<))-8 =0r2   r   )rF   rG   rH   r    r   r(   r*   rJ   r   rN   r   r   rz  r@   rK   rL   s   @r1   r|  r|    s    Z{ Zs Z /304CG|| t+ &&-	
 :D@ +, 
		r2   r|  c                        e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZeege eej"                  d      d	Z ej(                          fd
       Z xZS )JambaPreTrainedModelrc   modelTrm  r|  r   rc  )
layer_name)r3   
attentionsrg  c                    t         |   |       t        |t              rt	        j
                  d|j                  dz         d d d f   }|j                  |j                  d      j                         }t        j                  |j                  t	        j                  |             t        j                  |j                         y t        |t               rmt        j"                  |j$                  d| j&                  j(                         t        j"                  |j*                  d| j&                  j(                         y y )Nr   r6   r   )r<   std)r'   _init_weightsr$  r   r*   r   rf   r   re   r   initr  r   r   ones_r   rA  normal_rH  rc   initializer_ranger<  )r.   r   r   r0   s      r1   r  z"JambaPreTrainedModel._init_weights  s    f%fo.Q 5 5 9:47CA1126AACAJJv||UYYq\2JJvxx -LL,,3DKK<Y<YZLL))9V9VW .r2   )rF   rG   rH   r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulrm  r|  r   r   r   r   _can_record_outputsr*   r  r  rK   rL   s   @r1   r  r    sw    &*#57OP"3NL46LM$'		hG U]]_	X 	Xr2   r  )	attentionrP   c                       e Zd Zdef fdZeee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	edz  d
ej                  dz  dee   defd                     Zd Z xZS )
JambaModelrc   c                     t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        g }t        |j                        D ]1  }t        |j                  |      }|j                   |||             3 t        j                  |      | _        t!        |j                  |j"                        | _        d| _        | j)                          y )N)rp   r   F)r'   r(   pad_token_idpadding_idx
vocab_sizer   	Embeddingr/   embed_tokensr\   r]   ALL_DECODER_LAYER_TYPESrT   r`   
ModuleListlayersr#   r   final_layernormgradient_checkpointing	post_init)r.   rc   decoder_layersrh   layer_classr0   s        r1   r(   zJambaModel.__init__  s     !.. ++LL):):F<N<NPTP`P`av//0 	DA1&2J2J12MNK!!+f"BC	D mmN3+F,>,>FDWDWX&+#r2   N	input_idsr   rv  r   inputs_embedsr+  r   r   r%   c           
         |d u |d uz  rt        d      || j                  |      }|r<|:t        | j                  |j                  d   |j
                  |j                        }|F||j                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        | j                  |||||      }
| j                  ||      }|}| j                  D ]%  }t        |t              r|n|
} ||f|||||d|}' | j!                  |      }|r|j"                  sd|_        t%        ||	      S )
Nz:You must specify exactly one of input_ids or inputs_embedsr   )rc   rd   r8   rR   r   rS   )rc   r  r   r   r   rv  )r   rv  r   r+  r   T)last_hidden_stater   )
ValueErrorr  rN   rc   rC   r8   rR   r|   r*   r   r   r   _update_mamba_maskr  r$  r|  r  rU   r   )r.   r  r   rv  r   r  r+  r   r   past_seen_tokenscausal_mask
mamba_maskr3   decoder_layer
layer_masks                  r1   r@   zJambaModel.forward%  s    -t";<YZZ  --i8M0>{{(..q1#))$++	O !CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L(;;'))+%
 ,,^^L
%![[ 	M'1-AW'X^iJ))) /#- M	 ,,];?#E#E15O.%++
 	
r2   c                 V    |}||d   dkD  s|t        j                  |dk(        rd}|S )zv
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        Nr   r   )r*   r   )r.   r   r   r  s       r1   r  zJambaModel._update_mamba_maskk  s<     $
&>!+<q+@&599^q5H+IJr2   )NNNNNNN)rF   rG   rH   r    r(   r   r   r   r*   r   rJ   rN   rz  ry  r   r   r   r@   r  rK   rL   s   @r1   r  r    s    { $   .2.204CG26!%26A
##d*A
 t+A
 &&-	A

 :D@A
 ((4/A
 $;A
 ((4/A
 +,A
 
 A
    A
Fr2   r  gate_logitsrD  c                    | t        | t              syt        | t              rC| d   j                  }t        j                  | D cg c]  }|j                  |       c}d      }t        j                  j                  j                  d      }t        j                  ||d      \  }}	t        j                  j                  j                  |	|      }
|>t        j                  |
j                         d      }t        j                  |d      }n|j                  \  }}|j                  d   ||z  z  }|dddddddf   j                  |||||f      j                  d||      j                        }t        j                   |
j                         |z  d      t        j                   |d      z  }|ddddddf   j                  ||||f      j                  d|      j                  |      }t        j                   ||z  d      t        j                   |d      z  }t        j                   ||j#                  d      z        }||z  S c c}w )a  
    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
    experts is too unbalanced.

    Args:
        gate_logits:
            Logits from the `gate`, should be a tuple of model.config.num_hidden_layers tensors of
            shape [batch_size X sequence_length, num_experts].
        num_experts:
            Number of experts
        top_k:
            The number of experts to route per-token, can be also interpreted as the `top-k` routing
            parameter.
        attention_mask (`torch.Tensor`, *optional*):
            The attention_mask used in forward function
            shape [batch_size X sequence_length] if not None.

    Returns:
        The auxiliary loss.
    Nr   rv   r6   )r$  rB   rR   r*   rx   r9   r   r   r   rf  rM  r<   rI   rC   r   r   r'  r   )r  rD  rb  r   compute_device
layer_gateconcatenated_gate_logitsrh  ri   selected_expertsrU  tokens_per_expertrouter_prob_per_expertrd   rk  r]   expert_attention_mask router_per_expert_attention_maskoverall_losss                      r1   load_balancing_loss_funcr  y  s9   : *[%"@+u%$Q..#(99^i-jPZjmmN.K-jpq#r hh))112JPR1SO**_eDA((%%--.>LK!JJ{'8'8':B "'O!C&4&:&:#
O4::1=*B^_ 4AtT12V&
OUKXYWR,R	 	 "IIk&7&7&9<Q&QWXY\a\e\e!q]
 
 4At+,V&
O[QRWR%R	 	) "'?=]+]cd!ehmhqhq,!i
 "
 99.1G1Q1QRS1TTUL+%%[ .ks   Ic                   t    e Zd ZddiZddiZddgdgfiZdef fdZee		 	 	 	 	 	 	 	 	 	 dd
e
j                  d	z  de
j                  d	z  de
j                  d	z  ded	z  de
j                  d	z  de
j                  d	z  ded	z  ded	z  de
j                  d	z  dee
j                  z  dee   defd              Z xZS )JambaForCausalLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr3   logitsrc   c                 N   t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        |j                  | _	        |j                  | _
        |j                  | _        | j                          y r9  )r'   r(   r  r  r  r   r   r/   r  router_aux_loss_coefrD  ra  r  r>  s     r1   r(   zJambaForCausalLM.__init__  s     '
 ++yy!3!3V5F5FUS$*$?$?!!--#)#=#=  	r2   Nr  r   rv  r   r  labelsr+  output_router_logitsr   logits_to_keepr   r%   c                 l   ||n| j                   j                  } | j                  d||||||||	d|}|j                  }t	        |
t
              rt        |
 d      n|
}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |}d}|rYt        |j                  | j                  | j                  |      }|+|| j                  |j                  |j                         z  z  }t#        ||||j$                  |j&                  |j(                  |j                        S )aj  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, JambaForCausalLM

        >>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
        >>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)r  r   rv  r   r  r+  r  r   )lossaux_lossr  r   r3   r  rg  r   )rc   r  r  r  r$  r   slicer  loss_functionr  r  rg  rD  ra  r  r9   rR   r   r   r3   r  )r.   r  r   rv  r   r  r  r+  r  r   r  r   outputsr3   slice_indicesr  r  r  s                     r1   r@   zJambaForCausalLM.forward  sX   P %9$D $++JjJj 	
 +5$** 
+
)%+'!5)
+
 
+
  118B>SV8W~ot4]kmA}a,?@A%4%%ffdooPPD/%%  ((	H !11HKK4LLL(#33!//))!//
 	
r2   )
NNNNNNNNNr   )rF   rG   rH   _tied_weights_keys_tp_plan_pp_planr    r(   r   r   r*   r   rJ   rN   rz  ry  r   r   r   r   r@   rK   rL   s   @r1   r  r    sV   *,GH23H_-z:;H
{ 
  .2.204CG26*.!%,026-.R
##d*R
 t+R
 &&-	R

 :D@R
 ((4/R
   4'R
 $;R
 #TkR
 ((4/R
 ell*R
 +,R
 
#R
  R
r2   r  c                       e Zd Zy)JambaForSequenceClassificationN)rF   rG   rH   r   r2   r1   r  r  4  s    r2   r  )r  r  r  r  )r   )r   )Nr5   N)Jcollections.abcr   typingr   r*   r    r   r  activationsr   
generationr	   integrationsr
   r   r   r   r   masking_utilsr   modeling_layersr   r   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.import_utilsr   utils.output_capturingr   r   configuration_jambar    
get_loggerrF   r   Moduler#   rN   r   r   rJ   r   r   rI   r   r   r   r7  rA  r^  rm  r|  r  r  r  rB   r  r  r  __all__r   r2   r1   <module>r     s  2 %    & ! )  0 [ Q F & R R 7 9 E , 
		H	% Y'J299 J (J(\3 \3~( *+ ,2	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2 )*6)RYY 6) +6)r]Nbii ]N@	ryy   $#299 $# $#N"")) "J%!; %P7 BX? X8 )CMcd  d% d dR #
*.	O&ell 33d:O&tO& LL4'	O&
 \\CO&d e
+_ e
 e
P	%EG[ 	 gr2   