
    qiR                        d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	m
Z
mZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZ ddlmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,  e(jZ                  e.      Z/ G d dej`                        Z1dejd                  de3dejd                  fdZ4 G d d      Z5	 d>dej`                  d ejd                  d!ejd                  d"ejd                  d#ejd                  dz  d$e6d%e6fd&Z7 G d' d(ej`                        Z8 G d) d*ej`                        Z9 G d+ d,ej`                        Z: G d- d.ej`                        Z; G d/ d0e      Z< G d1 d2e      Z=e' G d3 d4e#             Z>e' G d5 d6e>             Z? G d7 d8e>e      Z@ e'd9:       G d; d<e>             ZAg d=ZBy)?zPyTorch Zamba model.    N)Callable)Any)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)Cache)GenerationMixin)lazy_load_kernel)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringlogging)resolve_internal_import   )ZambaConfigc                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	ZambaRMSNormepsreturnNc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        ZambaRMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizer   	__class__s      Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/zamba/modeling_zamba.pyr#   zZambaRMSNorm.__init__2   s1     	ll5::k#:; #    hidden_statesc                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor%   float32powmeanrsqrtr(   r'   )r)   r.   input_dtypevariances       r,   forwardzZambaRMSNorm.forward:   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r-   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler'   shaper(   r)   s    r,   
extra_reprzZambaRMSNorm.extra_reprA   s*    ))*+6$2G2G1HIIr-   )gư>)
__name__
__module____qualname__floatr#   r%   Tensorr;   r@   __classcell__r+   s   @r,   r   r   1   s7    $ $$ $;U\\ ;ell ;Jr-   r   r.   n_repr    c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r   N)r>   expandreshape)r.   rH   batchnum_key_value_headsslenhead_dims         r,   	repeat_kvrP   F   so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr-   c                   .   e Zd ZdZdZej                  dfdZd Z	 ddej                  dej                  de
d	eeef   dz  d
eej                  ej                  f   f
dZdej                   fdZdde
dz  d
e
fdZdej                  de
d
ee
e
f   fdZy)ZambaHybridDynamicCachea  
    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
    (which has a constant shape regardless of seq_len).

    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
    FNc           
      ,   || _         d| _        |j                  | _        d| _        |j                  |j
                  z  | _        |j                  | _        |j                  | _
        |j                  | _        g | _        g | _        g | _        i | _        i | _        i | _        t%        |j&                        D ]  }| xj                  t)        j*                  || j                  | j                  ||      gz  c_        || j                  | j                  | j                  z  | j                  f}| xj                  t)        j*                  |||      gz  c_        | j                  |   dk(  s| j                  j-                  |        t%        |j&                        D cg c]  }t)        j.                  g g|z  |       c}| _        t%        |j&                        D cg c]  }t)        j.                  g g|z  |       c}| _        y c c}w c c}w )NFdevicer3   hybridrU   )r3   is_compileablelayers_block_typehas_previous_statemamba_expandr*   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headsconv_states
ssm_statestransformer_layers_modules_parameters_buffersrangenum_hidden_layersr%   zerosappendtensor	key_cachevalue_cache)r)   config
batch_sizer3   rU   icache_shape_s           r,   r#   z ZambaHybridDynamicCache.__init__b   s   
#!'!9!9"'!'!4!4v7I7I!I$22 & 3 3#11"$v//0 	2AJ(>(>@U@U^dlqr!  ""&&$*<*<<##	K OOKe TUUO%%a(H4''..q1	2 SXX^XpXpRqrQ%,,tj'8HrTYZ`ZrZrTstqELL"
):6Jt sts    "H!"Hc                 ,    t        | j                        S N)lenrm   r?   s    r,   __len__zZambaHybridDynamicCache.__len__   s    4>>""r-   
key_statesvalue_states	layer_idxcache_kwargsr    c                    | j                   |   j                  d   dk(  r|| j                   |<   || j                  |<   nft        j                  | j                   |   |gd      | j                   |<   t        j                  | j                  |   |gd      | j                  |<   | j                   |   | j                  |   fS )Nr1   r   r0   dim)rm   r>   rn   r%   cat)r)   rx   ry   rz   r{   s        r,   updatezZambaHybridDynamicCache.update   s     >>)$**2.!3(2DNN9%*6DY'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEEr-   beam_idxc                    | j                         dkD  rvt        t        | j                              D ]S  }| j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   | j                  |   j                  }| j                  |   j                  d|j                  |            | j                  |<   V yy)zDReorders the cache for beam search, given the selected beam indices.r   N)
get_seq_lengthrh   rv   rm   rU   index_selectr4   rn   rb   rc   )r)   r   rz   rU   s       r,   reorder_cachez%ZambaHybridDynamicCache.reorder_cache   s[    1$"3t~~#67 	m		299,0NN9,E,R,RSTV^VaVabhVi,jy))))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +)))4;;.2.>.>y.I.V.VWXZbZeZeflZm.n  +3::-1__Y-G-T-TUVX`XcXcdjXk-l	*	m %r-   c                     || j                   vr| j                   d   n|}t        | j                        |k  s| j                  |   j                  d   dk(  ry| j                  |   j                  d   S )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   r1   )rd   rv   rm   r>   )r)   rz   s     r,   r   z&ZambaHybridDynamicCache.get_seq_length   sn     3<4CZCZ2ZD++A.`i	t~~)+t~~i/H/N/Nr/RVW/W~~i(..r22r-   cache_positionc                 T    d}|j                   d   }| j                  |      |z   }||fS )zDReturn the length and offset of the cache, used to generate the maskr   )r>   r   )r)   r   rz   	kv_offsetquery_length	kv_lengths         r,   get_mask_sizesz&ZambaHybridDynamicCache.get_mask_sizes   s7    	%++A.''	2\A	)##r-   ru   )r   )rA   rB   rC   __doc__rX   r%   float16r#   rw   rE   intdictstrr   r=   r   
LongTensorr   r   r    r-   r,   rR   rR   R   s     N16t u@# /3FLLF llF 	F
 38nt+F 
u||U\\)	*F$me&6&6 m3d
 33 3$U\\ $c $eTWY\T\o $r-   rR   modulequerykeyvalueattention_maskscalingdropoutc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
dt        j                        j                  |j                        }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr0   r	   r1   )r~   r3   )ptrainingr   )rP   num_key_value_groupsr%   matmul	transposer   
functionalsoftmaxr5   r4   r3   r   r   
contiguous)r   r   r   r   r   r   r   kwargsrx   ry   attn_weightsattn_outputs               r,   eager_attention_forwardr      s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r-   c                        e Zd ZdZdedef fdZ	 ddej                  dedej                  dz  de	dz  d	e
e   d
eej                  ej                  dz  eej                     dz  f   fdZ xZS )ZambaAttentionaA  
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    ro   rz   c                 .   t         |           || _        || _        |j                  | _        |j
                  | _        |j                  |j                  z  | _	        |j                  | _
        | j                  dz  dz  | _        d| _        |j                  | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  |j                  | j                  z  d      | _        t        j                  |j                  | j                  z  |j&                  d      | _        y )Nr0         TFbias)r"   r#   ro   rz   attention_hidden_sizeattention_head_dimrO   num_attention_headsrM   r   max_position_embeddingsr   	is_causalattention_dropoutr   Linearq_projk_projv_projr*   o_projr)   ro   rz   r+   s      r,   r#   zZambaAttention.__init__   s9   "%+%A%A"11$*$>$>&B\B\$\!'-'E'E$)d2!'!9!9ii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii : :T]] JFL^L^ejkr-   Nr.   r   past_key_valuesr   r    c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
||j                  |	|
|      \  }	}
t        j                  | j                  j                  t              } || ||	|
|f| j                  sdn| j                  | j                  d|\  }} |j                   g |d j#                         }| j%                  |      }||fS )Nr1   r   r0           )r   r   )r>   rO   r   viewr   r   r   r   r   get_interfacero   _attn_implementationr   r   r   r   rK   r   r   )r)   r.   rz   r   r   r   input_shapehidden_shapequery_statesrx   ry   attention_interfacer   r   s                 r,   r;   zZambaAttention.forward   sk    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&'6'='=j,Xa'b$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ *k));;;;FFHkk+.L((r-   ru   )rA   rB   rC   r   r   r   r#   r%   rE   rR   r   r   r=   r;   rF   rG   s   @r,   r   r      s    l{ ls l. ;?#)||#) #) t+	#)
 147#) -.#) 
u||U\\D0%2E2LL	M#)r-   r   c                   l     e Zd ZdZdef fdZ	 d	dej                  defdZ	d	defdZ
d	defdZ xZS )
ZambaMambaMixeruE  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)

    This module differs from `transformers.models.mamba.modeling_mamba.MambaMixer` in two ways:
    - Added multi-head: the output of `self.in_proj` is split into `self.n_mamba_heads` heads, and each head
    undergoes an independent forward pass, identical to the original `MambaMixer`, up until the pre-activations of
    `self.out_proj`. The pre-activations, coming from different mamba heads, are then concatenated and fed into `self.out_proj`.
    ro   c           	      	   t         |           || _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  |j                  z  | _
        |j                  | _        |j                  | _        | j                  | j                  z  | _        |j                  | _        |j"                  | _        t'        j(                  | j                  | j                  | j                   | j                  | j                  | j                  dz
        | _        |j,                  | _        t0        |j,                     | _        |j4                  | _        t'        j8                  | j                  | j                  dz  | j$                        | _        t'        j<                  t?        j@                  | j                  | j                  | j                  dz  z   | j                              | _!        t'        j<                  t?        j@                  | j                  | j                  | j                        dz
  dz  | j                  dz  z        | _"        t'        j<                  t?        j@                  | j                  | j                              | _#        t?        jH                  d| j                  dz   t>        jJ                        d d d f   }|jM                  | j                  d      jO                         }t'        j<                  t?        jP                  |      jS                  | j                  | j                  d            | _*        t'        j<                  t?        jV                  | j                  | j                              | _,        t'        j8                  | j                  | j                  | j$                        | _-        t]        d      a/ta        t^        d	d       a1ta        t^        d
d       a2t]        d      a3ti        tf        d      a5ta        tf        dd       a6ta        tf        dd       a7tq        tj        tl        td        tb        tn        f      a9tr        stt        jw                  d       y y )Nr   )in_channelsout_channelsr   kernel_sizegroupspaddingr0   r   g      ?r3   r1   zcausal-conv1dcausal_conv1d_updatecausal_conv1d_fnz	mamba-ssmz8ops.triton.selective_state_update.selective_state_update)chained_pathselective_scan_fnmamba_inner_fnaq  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config)<r"   r#   ro   rz   r*   r]   r^   r_   r`   r[   r\   mamba_dt_ranktime_step_rankra   mamba_head_dimmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1dhidden_mamba_act
activationr   actuse_mamba_kernelsuse_fast_kernelsr   in_projr$   r%   rj   x_proj_weightdt_proj_weightdt_proj_biasaranger5   rJ   r   logrK   A_logr&   Dout_projr   causal_conv1dgetattrr   r   	mamba_ssmr   selective_state_updater   r   allis_fast_path_availableloggerwarning_once)r)   ro   rz   Ar+   s       r,   r#   zZambaMambaMixer.__init__$  s   "!--$22 & 3 3!'!4!4v7I7I!I$22#11"448J8JJ#33..ii..//##--))))A-
 !11&112 & 8 8 yy!1!143I3IA3MTXTaTab  \\KK""##d&9&9A&==##
 !ll[[++T-@-@$BUBUVY\\!!3&'

 LLT5G5GI\I\)]^ LLD//!35==I$PQ'RHHT++R0;;=\\%))A,"6"6t7I7I4K^K^`b"cd
ejj););T=P=PQR		$"8"8$:J:JQUQ^Q^_ )9&}6LdS"=2DdK %[1	!8$^"
 $I/BDI ,<dC "%#%68HJ^`no"
 &^ &r-   r.   cache_paramsc                    |j                   \  }}}|d uxr |j                  xr |dk(  }| j                  |      j                  dd      }|j	                  |dd|      j                  dd      \  }}	|j                  d      j                         }|	j                  d      }	|	j                  || j                  d|      j                  dd      }	| j                  j                  j	                  | j                  j                  j                  d      | j                  j                  j                  d            }
|ret        |j                  d      |j                  | j                     |
| j                  j                   | j"                        }|j%                  d      }n|,t'        j(                  |dk(        s||j%                  d      z  }|dt*        j,                  j/                  || j0                  |j                   d   z
  df      }|j                  | j                     j3                  |       t5        ||
| j                  j                   | j"                        }|,t'        j(                  |dk(        s||j%                  d      z  }|j                  d| j                  | j6                  |      j                  dd      }| j8                  d d d d d d d f   |z  j                  dd      }t'        j:                  || j<                  | j>                  | j>                  gd      \  }}}| j@                  d d d f   |j                  dd      z  }t'        jB                  | jD                  jG                                }| jH                  | jH                  jG                         nd }t'        jJ                  |d|f|jL                  |jN                        }|rtQ        | j                        D ]  }tS        |jT                  | j                     d d |f   ||d	df   ||d	df   ||   ||d d df   ||d d df   | jV                  |   |	|d	df   ||   d

      j%                  d      }t'        jX                  ||fd      } nAt'        jJ                  |d| j6                  | j>                  f|jL                  |jN                        }tQ        | j                        D ]  }t[        ||   ||   ||   ||   j                  dd      ||   j                  dd      | jV                  |   jG                         |	|   ||   d
d

      \  }}t'        jX                  ||fd      j                         }t'        jX                  ||j%                  d      fd      } |*|(|jT                  | j                     j3                  |       | j]                  |j                  dd            }|S )Nr   r0   r1   r}   r   )r   r   rT   .T)dt_softplus)delta_softplusreturn_last_state)/r>   rZ   r   r   r   chunksqueezer   rK   ra   r   r'   sizer   rb   rz   r   r   	unsqueezer%   r   r   r   padr`   copy_r   r   r   splitr   r^   r   expr   rD   r   emptyrU   r3   rh   r   rc   r   r   r   r   )r)   r.   r   r   rp   seq_lenrs   use_precomputed_statesprojected_statesgateconv_weightsrb   ssm_parameters	time_stepBCdiscrete_time_stepr   time_proj_biasscan_outputsnscan_outputs_	ssm_state
ssm_state_contextualized_statess                            r,   cuda_kernels_forwardz$ZambaMambaMixer.cuda_kernels_forwards  s    "/!4!4
GQ!-T!9!nl>]>]!nbimnbn  <<6@@AF.33JAwOUUVW]^U_t%--a0;;=||A||J(:(:BHRRSTVWX {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)((8  M *33B7M)%))Na<O2P -0H0H0K K' mm//@U@UXeXkXklnXo@oqr?st((8>>{K,]L$++JZJZgkgvgvwM)%))Na<O2P -0H0H0K K
 &--b$2D2DdFYFY[bcmmnoqrs,,Qa];mKVVWY[]^++T00$2E2EtGZGZ[ac
	1a "00D9I<O<OPRTV<WWYYtzz'')** 7;6G6G6S**002Y]{{J7#;MDXDX`m`s`st!4--. O 6 ++DNN;AqDA!!S!),&q#qy1aDaAgJaAgJFF1ICO"1% $! )B-   %yy,)FANO  Q 3 3T5H5HI$++#))I
 4--. S,=!!$&q)aDaDNN1a(aDNN1a(FF1IOO%G"1%#'&*-)z  %yy,)FANYY[!IIy*2F2Fq2I&JPQR	S $)A''7==iH !%l.D.DQ.J K$$r-   c           
      R   |j                   \  }}}|j                  }| j                  |      j                  dd      }|j	                  |dd|      j                  dd      \  }	}
|	j                  d      j                         }	|
j                  d      }
|
j                  || j                  d|      j                  dd      }
t        |t              }|r|j                  | j                     j                   d   |k(  ri| j                  r(|j                  | j                     j                         }n|j                  | j                     }|j!                  |	j"                        }|j$                  r|dk(  r|j&                  | j                     j                   d   |k(  r|j&                  | j                     }t)        j*                  |dd      }|	d d d d df   |d d d d df<   ||j&                  | j                  <   t)        j,                  || j.                  j0                  d d dd d f   z  d      }	| j2                  r|	| j.                  j4                  z  }	| j7                  |	      j!                  |      j9                  d      }	nn|+|	|d d |	j                   d    d f   j9                  d      z  }	t:        j<                  j?                  |	| j@                  |	j                   d   z
  df      }||j&                  | j                  <   | j7                  | j/                  |	      dd |f         }	||	|d d |	j                   d    d f   j9                  d      z  }	nt)        jB                  || j                  | jD                  | jF                  f|	j"                  |      }||	|j9                  d      z  }	| j7                  | j/                  |	      dd |f         }	||	|j9                  d      z  }	|	j                  d| j                  | jD                  |      j                  dd      }	| jH                  d d d d d d d f   |	z  j                  dd	      }t)        jJ                  || jL                  | jF                  | jF                  gd      \  }}}| jN                  d d d f   |j                  dd	      z  | jP                  d d d d d d f   z   }t:        j<                  jS                  |      }t)        jT                  | jV                  jY                                }t)        jT                  |d d d d d d d d f   |d d d d d d d d d f   z        }|d d d d d d d d d f   |d d d d d d d d d f   jY                         z  }||	d d d d d d d d d f   jY                         z  }g }t[        |      D ]  }|d d d d d d |d d f   j                  dd      |z  |d d d d d d |d d f   j                  dd      z   }t)        j\                  |j                  dd      j!                  |      |d d d d |d d f   j9                  d            }|j_                  |d d d d d d df           t)        j`                  |d      }||	| jb                  d d d d d d f   z  z   }|| j7                  |
      z  }|r||j                  | j                  <   | je                  |j                  dd      j                  |d|      j                  dd            }|S )
Nr   r0   r1   r}   r   )shiftsdims.rT   r   )3r>   r3   r   r   r   r   r   r   rK   ra   
isinstancerR   rc   rz   r   cloner4   rU   rZ   rb   r%   rollsumr   r'   r   r   r   r   r   r   r   r`   rj   r   r^   r   r   r   r   r   softplusr   r   rD   rh   r   rk   stackr   r   )r)   input_statesr   r   rp   r  rs   r3   r  r.   r  	use_cacher  
conv_stater  r  r  r	  r
  r   
discrete_A
discrete_BdeltaB_ur  rq   scan_outputr  s                              r,   slow_forwardzZambaMambaMixer.slow_forward  s   !-!3!3
GQ""<<5??1E.33JAwOUUVW]^U_t%--a0;;=||A||J(:(:BHRRSTVWX|-DE	00@FFqIZW}}(33DNNCIIK	(33DNNC	!]%9%9:I //qL ,,T^^<BB1ES)55dnnE
"ZZ
2BG
'4Q1W'=
1a8$;E((8 %		*t{{7I7I!QPQ'7R*RXZ [%%!T[[%5%55M $ 7 : :5 A K KB O!-$1N1}GZGZ[]G^F^F`C`4a4k4klm4n$nM]]..}t?T?TWdWjWjkmWn?npq>rs
;E((8 $])CC'M)R S!-$1N1}GZGZ[]G^F^F`C`4a4k4klm4n$nMT//1D1DdFYFYZ$++I
 ) -0H0H0K K HHT[[%?XgX%NOM) -0H0H0K K &--b$2D2DdFYFY[bcmmnoqrs,,Qa];mKVVWY[]^++T00$2E2EtGZGZ[ac
	1a #11!T':Y=P=PQSUW=XX\`\m\mtQ]
 
  ]]334FG YYtzz'')**YYqD!T1!458J1aQRTUW[K[8\\]
'1aD(89AaD!Q>N<O<U<U<WW
aAq$.> ? E E GGw 	9A"1aAq=1;;AqAIMPXYZ\]_`bcefYfPgPqPqrsuvPwwI,,y':':1a'@'C'CE'JAaQRTUWXjMLcLcdfLghKAq!QJ 78	9 kk,B7!]TVVAtQ<L5M%MN!DHHTN26?L##DNN3 !%!!!Q'//
BHRRSTVWX!
 %$r-   c                    t        t        t        t        t        t
        f      }| j                  rC|r"d| j                  j                  j                  vrt        d      | j                  |||      S | j                  |||      S )NcudazFast Mamba kernels are not available. Make sure to they are installed and that the mamba module is on a CUDA device. lease run 'pip install causal-conv1d>=1.2.0' and 'pip install mamba-ssm', or set use_mamba_kernels=False in the model's config.)r   )r   r   r   r   r   r   r   r   rU   type
ValueErrorr  r#  )r)   r.   r   r   r   s        r,   r;   zZambaMambaMixer.forward1  s    !$#%68HJ^`no"
   )V4;M;M;T;T;Y;Y-Y i 
 ,,]LYg,hh  ^ \\r-   )NN)rA   rB   rC   r   r   r#   r%   rE   rR   r  r#  r;   rF   rG   s   @r,   r   r     sZ    
M{ M` im_%"\\_%9P_%B[%7N [%z]3J ]r-   r   c                   $     e Zd Z fdZd Z xZS )ZambaMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        y NFr   )r"   r#   ro   r*   r\   r   r   	gate_projup_proj	down_projr   
hidden_actact_fnr)   ro   r+   s     r,   r#   zZambaMLP.__init__C  s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r-   c                     | j                  | j                  | j                  |            | j                  |      z        }|S ru   )r.  r0  r,  r-  )r)   xr.  s      r,   r;   zZambaMLP.forwardM  s6    NN4;;t~~a/@#ADLLQRO#ST	r-   )rA   rB   rC   r#   r;   rF   rG   s   @r,   r)  r)  B  s    0r-   r)  c                       e Zd Zddededz  f fdZ	 	 	 	 ddej                  dej                  dedej                  dz  dedz  d	e	dz  d
e	dz  de
e   deej                  eej                  ej                  f   dz  f   fdZ xZS )ZambaAttentionDecoderLayerNro   rz   c                     t         |           t        ||      | _        t	        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _        y )Nr   )r"   r#   r   	self_attnr)  feed_forwardr   r   rms_norm_epsinput_layernormr*   pre_ff_layernormr   s      r,   r#   z#ZambaAttentionDecoderLayer.__init__S  s_    '	:$V,+F,H,HfNaNab ,V-?-?VEXEX Yr-   r.   original_hidden_statesr   r   output_attentionsr  r   r    c           
          t        j                  ||gd      }| j                  |      } | j                  d||||||d|\  }}	| j	                  |      }| j                  |      }|f}
|r|
|	fz  }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            layer_idx (`int`): layer_idx in the forward pass. Used to distinguish Zamba's tied transformer layers.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        r1   r}   )r.   rz   r   r   r>  r  r   )r%   concatenater;  r8  r<  r9  )r)   r.   r=  rz   r   r   r>  r  r   self_attn_weightsoutputss              r,   r;   z"ZambaAttentionDecoderLayer.forward[  s    > ))=:P*QWYZ,,];+94>> ,
')+/,
 ,
(( --m<))-8 ")++Gr-   ru   )NNFF)rA   rB   rC   r   r   r#   r%   rE   rR   boolr   r   r=   FloatTensorr;   rF   rG   s   @r,   r5  r5  R  s    Z{ ZsTz Z /3:>).!&3||3 !&3 	3
 t+3 1473  $;3 $;3 -.3 
u  %(9(95;L;L(L"MPT"TT	U3r-   r5  c                       e Zd Zdedef fdZ	 	 	 	 	 	 	 	 	 	 ddej                  dej                  dz  dedz  dej                  dz  dej                  dz  d	edz  d
e	dz  de	dz  dej                  dz  dej                  dz  dej                  dz  deej                  eej                  ej                  f   dz  f   fdZ xZS )ZambaMambaDecoderLayerro   rz   c                     t         |           t        ||      | _        t	        |j
                  |j                        | _        || _        y )N)ro   rz   r7  )	r"   r#   r   mambar   r*   r:  r;  rz   r   s      r,   r#   zZambaMambaDecoderLayer.__init__  s>    $FiH
+F,>,>FDWDWX"r-   Nr.   r=  r   causal_maskr   r>  r  r   position_idstransformer_hidden_statesr    c                     |}|||z   n|}| j                  |      }| j                  |||      }d}||z   }|f}|r||fz  }|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        N)r.   r   r   )r;  rH  )r)   r.   r=  rz   r   rI  r   r>  r  r   rJ  rK  r   residualrA  rB  s                   r,   r;   zZambaMambaDecoderLayer.forward  s    > !
 :S9^M55dq 	 ,,];

'() # 
 ! !=0 ")++G))Gr-   )
NNNNNFFNNN)rA   rB   rC   r   r   r#   r%   rE   rR   rC  r   r=   rD  r;   rF   rG   s   @r,   rF  rF    s,   #{ #s # 7; $.2+/:>).!&26049=;||; !&t 3; :	;
 t+; \\D(; 147;  $;; $;; ((4/; &&-; $)<<$#6; 
u  %(9(95;L;L(L"MPT"TT	U;r-   rF  c                   l    e Zd Zdedej
                  def fdZ	 	 	 	 	 	 	 	 ddej                  dej                  dz  de
dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dedz  dej                  dz  deej                  eej                  ej                  f   dz  f   fdZ xZS )ZambaHybridLayershared_transflinearrH  c                 L    t         |           || _        || _        || _        y ru   )r"   r#   rP  rQ  mamba_decoder)r)   rP  rQ  rH  r+   s       r,   r#   zZambaHybridLayer.__init__  s%    *"r-   Nr.   r=  rz   r   rI  r   r>  r  r   r    c
           
          | j                  ||||||||	      }
|
d   }|r|
d   }| j                  |      }| j                  |||||||	      }
|r|
d   f|
dd z   }
|
S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
            hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        )r=  rz   r   r   r>  r  r   r   r   )rK  r   r   r>  r  r   r0   N)rP  rQ  rS  )r)   r.   r=  rz   r   rI  r   r>  r  r   layer_outputsrK  rA  s                r,   r;   zZambaHybridLayer.forward  s    > **#9&+/) + 	
 %2!$4! -a 0$(KK0I$J!**&?)+/) + 
 *1-/@AMRSRTDUUMr-   )NNNNNFFN)rA   rB   rC   r5  r   r   rF  r#   r%   rE   r   rR   rC  r   r=   rD  r;   rF   rG   s   @r,   rO  rO    s   #&@ #")) #\r # 7; $.2+/:>).!&26>||> !&t 3> :	>
 t+> \\D(> 147>  $;> $;> ((4/> 
u  %(9(95;L;L(L"MPT"TT	U>r-   rO  c                   r     e Zd ZU eed<   dZdZddgZdZdZ	dZ
dZ ej                          fd       Z xZS )	ZambaPreTrainedModelro   modelTr5  rF  r   Fc                    | j                   j                  }t        |   |       t	        |t
              rt        j                  |j                  d|       | j                   j                  dz  }t        j                  |j                  | |       | j                   j                  | j                   j                  z  | j                   j                  z  }t        j                   t        j"                  | j                   j                  |      t%        j&                  | j                   j(                        t%        j&                  | j                   j*                        z
  z  t%        j&                  | j                   j*                        z         j-                  | j                   j.                        }|t        j&                  t        j0                  |              z   }t        j2                  |j4                  |       t        j6                  d|j8                  dz   t        j:                        d d d f   }|j=                  |j>                  d      jA                         }t        j2                  |jB                  t        j&                  |      jE                  |j                  |jF                  d             t        jH                  |jJ                         y y )Nr   )r7   stdr   )minr   r   r1   )&ro   initializer_ranger"   _init_weightsr  r   initnormal_r   r   uniform_r   r[   r*   ra   r%   r   randmathr   time_step_maxtime_step_minclamptime_step_floorexpm1r   r   r   r^   r5   rJ   r\   r   r   rK   r   ones_r   )	r)   r   rZ  dt_init_stdr   dtinv_dtr   r+   s	           r,   r]  z"ZambaPreTrainedModel._init_weights*  s   kk++f%fo.LL--CSA++33T9KMM&//+{K![[558O8OOSWS^S^SlSllN

4;;44nE88DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FJJv**F3Q 5 5 9OPTVWPWXA1126AACAJJv||UYYq\%9%9&:N:NPVPePegi%jkJJvxx % /r-   )rA   rB   rC   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr%   no_gradr]  rF   rG   s   @r,   rW  rW    sQ    &*#57OP"3 NLU]]_! !r-   rW  c                   "    e Zd ZdZdef fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  d	ej                  dz  d
edz  dedz  dedz  dedz  dej                  dz  deez  fd       Z xZS )
ZambaModelz
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ZambaDecoderLayer`]

    Args:
        config: ZambaConfig
    ro   c                 h   t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        |j                  | _	        g }d | _
        t        | j                        D ]  \  }}t        ||      }|dk(  rt        j                  | j                  j                  | j                  j                  d      }|j                  t!        t#        |      ||             | j                  d| dd| di| _
        |j                  |        t        j$                  |      | _        |j(                  | _        t+        |j                  |j,                  	      | _        d| _        | j3                          y )
N)rz   rV   Fr   z
layers.(?!z\.)\d+.shared_transfzlayers.z.shared_transfr7  )r"   r#   pad_token_idpadding_idx
vocab_sizer   	Embeddingr*   embed_tokensrY   _tied_weights_keys	enumeraterF  r   ro   rk   rO  r5  
ModuleListlayersr   r   r:  final_layernormgradient_checkpointing	post_init)r)   ro   r  layer_id
layer_typerH  rQ  r+   s          r,   r#   zZambaModel.__init__L  sq    !.. ++LL):):F<N<NPTP`P`a!'!9!9"&$-d.D.D$E 
	% Hj*6XFEX%4;;#:#:DKK<S<SZ_`./I&/QSY[`ab**2%hZ/CDPXzYgFh/D+ e$
	% mmF+$*$?$?!+F,>,>FDWDWX&+#r-   N	input_idsr   rJ  r   inputs_embedsr  r>  output_hidden_statesreturn_dictr   r    c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|d u |d uz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }|}t        j                  |      }|r|t        j                  d       |
.t        j                  |j                  d   |j                        }
||
j!                  d      }t#        | j                   |||
||      }|rd	nd }|rd	nd }t%        | j&                        D ]5  \  }}|r||fz  } ||||||||||

	      }|d   }|s'|d   -||d   fz  }7 | j)                  |      }|r||fz  }|r|j*                  sd|_        t-        ||r|nd ||      }|	r|S |j/                         S )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fz{Zamba requires an initialized `ZambaHybridDynamicCache` to return a cache. None was provided, so no cache will be returned.r   rW   r   )ro   r  r   r   r   rJ  r   )r   r>  r  r   T)last_hidden_stater   r.   
attentions)ro   r>  r  r  use_return_dictr'  r  r   r   r   r|  r%   r  r   r>   rU   r   r   r~  r  r  rZ   r   to_tuple)r)   r  r   rJ  r   r  r  r>  r  r  r   r   r.   r=  rI  all_hidden_statesall_self_attnsrz   layerrU  outputs                        r,   r;   zZambaModel.forwardi  sW    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<s  &&4==Yj I  --i8M%!&]!; 0:
 !"\\-*=*=a*@I]I]^N)33A6L(;;'))+%
 #7BD0d )$++ 6 	:Iu#!m%55!!& /"3#-
M *!,M  #/"}Q'7&99N+	:. ,,];  -!11?#E#E15O.(+/8Od+%	
 %v;&//*;;r-   
NNNNNNNNNN)rA   rB   rC   r   r   r#   r   r%   r   rE   rR   rD  rC  r=   r   r;   rF   rG   s   @r,   rv  rv  C  s   { :  .2.204:>26!%)-,0#'26f<##d*f< t+f< &&-	f<
 147f< ((4/f< $;f<  $;f< #Tkf< D[f< ((4/f< 
(	(f< f<r-   rv  c                       e Zd ZddiZdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	e
dz  d
ej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                  z  deez  fd       Z	 	 	 	 	 	 	 d fd	Z xZS )ZambaForCausalLMzlm_head.weightzmodel.embed_tokens.weightro   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y r+  )
r"   r#   rv  rX  rz  r   r   r*   lm_headr  r1  s     r,   r#   zZambaForCausalLM.__init__  sU     '
 ++yy!3!3V5F5FUS 	r-   Nr  r   rJ  r   r  labelsr  r>  r  r  r   logits_to_keepr    c                    ||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
| j	                  ||||||||	||

      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}| | j                  ||| j                  fi |}|
s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )ah  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ZambaForCausalLM

        >>> model = ZambaForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1")
        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```N)
r  r   rJ  r   r  r  r>  r  r   r  r   r   losslogitsr   r.   r  )ro   r>  r  r  rX  r  r   slicer  loss_functionrz  r   r   r.   r  )r)   r  r   rJ  r   r  r  r  r>  r  r  r   r  r   rB  r.   slice_indicesr  r  r  s                       r,   r;   zZambaForCausalLM.forward  sL   P 2C1N-TXT_T_TqTq %9$D $++JjJj 	 &1%<k$++B]B] **)%+'/!5)#  
  
8B>SV8W~ot4]kmA}a,?@A%4%%ffdooPPDY,F'+'7D7V#CVC%#33!//))
 	
r-   c	                     |:t        | j                  |j                  d   | j                  | j                        }| j                  j
                  |	d<   t        |   |f|||||||d|	}
|
S )Nr   )r3   rU   r  )r   r   r  r   rJ  r  is_first_iteration)rR   ro   r>   r3   rU   num_logits_to_keepr"   prepare_inputs_for_generation)r)   r  r   r   r  r   rJ  r  r  r   model_inputsr+   s              r,   r  z.ZambaForCausalLM.prepare_inputs_for_generation2  s     "5Y__Q/tzz$++O $(;;#A#A w<

+)')%1

 

 r-   )NNNNNNNNNNNr   )NNNNNTF)rA   rB   rC   r}  r   r#   r   r%   r   rE   rR   rD  rC  r   r=   r   r;   r  rF   rG   s   @r,   r  r    sj   *,GH{   .2.204:>26*.!%)-,0#'26-.O
##d*O
 t+O
 &&-	O

 147O
 ((4/O
   4'O
 $;O
  $;O
 #TkO
 D[O
 ((4/O
 ell*O
 
'	'O
 O
h     r-   r  a  
    The Zamba Model with a sequence classification head on top (linear layer).

    [`ZambaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  d	e
dz  d
e
dz  de
dz  de
dz  deez  fd       Z xZS )ZambaForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y r+  )
r"   r#   
num_labelsrv  rX  r   r   r*   scorer  r1  s     r,   r#   z'ZambaForSequenceClassification.__init__d  sS      ++'
YYv114??O
 	r-   Nr  r   rJ  r   r  r  r  r>  r  r  r    c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }||j                  d   }n|j                  d   }| j                   j
                  |dk7  rt        d      | j                   j
                  d}n||| j                   j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d       |t        j                  ||j                  	      |f   }d}||j                  |j                        }| j                   j"                  | j$                  dk(  rd
| j                   _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                   _        nd| j                   _        | j                   j"                  d
k(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }n |||      }n| j                   j"                  dk(  r=t1               } ||j3                  d| j$                        |j3                  d            }n,| j                   j"                  dk(  rt5               } |||      }|
s|f|dd z   }||f|z   S |S t7        |||j8                  |j:                  |j<                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   rJ  r   r  r  r>  r  r  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r1   rT   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rW   
regressionsingle_label_classificationmulti_label_classificationr  )ro   r  rX  r  r>   rx  r'  r4   rU   r%   int32r   argmaxr   r   r+   rA   problem_typer  r3   longr   r   r   r   r   r   r   r   r.   r  )r)   r  r   rJ  r   r  r  r  r>  r  r  r   transformer_outputsr.   r  rp   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fctr  s                          r,   r;   z&ZambaForSequenceClassification.forwardm  s   * &1%<k$++B]B]"jj)%+'/!5# ) 

 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaabYYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r-   r  )rA   rB   rC   r#   r   r%   r   rE   r   rD  rC  r=   r   r;   rF   rG   s   @r,   r  r  U  s     .2.204(,26*.!%)-,0#'\
##d*\
 t+\
 &&-	\

 \
 ((4/\
   4'\
 $;\
  $;\
 #Tk\
 D[\
 
1	1\
 \
r-   r  )r  r  rv  rW  )r   )Cr   rb  collections.abcr   typingr   r%   r   torch.nnr   r   r    r
   r^  activationsr   cache_utilsr   
generationr   integrations.hub_kernelsr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.import_utilsr   configuration_zambar   
get_loggerrA   r   Moduler   rE   r   rP   rR   rD   r   r   r   r)  r5  rF  rO  rW  rv  r  r  __all__r   r-   r,   <module>r     s  &   $    A A & !   ) 8 / B 9 q q F & , 9 , 
		H	%J299 J*	UU\\ 	U# 	U%,, 	Uc$ c$Z %II%<<% 
% <<	%
 LL4'% % %2C)RYY C)Lg]bii g]V	ryy  < <~B7 BJE1 EP !!? !! !!H L<% L< L<`~+_ ~B g
%9 g
g
T gr-   