
    qiؾ                    P   d dl Z d dlmZmZ d dlmZ d dlmZ d dlZd dl	m
Z
 d dlm
c mZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2m3Z3 ddl4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z:m;Z;m<Z< ee. G d de!                    Z=e e.d       G d de                     Z>e e.d       G d d e,                    Z? G d! d"e
j                        ZA G d# d$e
j                        ZB G d% d&e
j                        ZC G d' d(e
j                        ZD G d) d*e
j                        ZE G d+ d,e
j                        ZF G d- d.e
j                        ZG G d/ d0e
j                        ZH G d1 d2e
j                        ZI G d3 d4e
j                        ZJ G d5 d6e
j                        ZL G d7 d8e
j                        ZM G d9 d:e
j                        ZN G d; d<e
j                        ZOd= ZPd>ej                  d?eRd@ej                  fdAZS	 	 	 didBe
j                  dCej                  dDej                  dEej                  dFej                  dz  dGeTdHeTdz  dIeTdz  d@eUej                  ej                  f   fdJZVdjdKej                  dLej                  dMej                  dNeRfdOZW eeW       G dP dQe
j                               ZX G dR dSe      ZYe. G dT dUe(             ZZ G dV dWeZ      Z[ G dX dYe
j                        Z\ e.dZ       G d[ d\eZ             Z] e.d]       G d^ d_eZe             Z^ G d` dae
j                        Z_ e.db       G dc ddeZ             Z` e.de       G df dgeZe             Zag dhZby)k    N)CallableSequence)	dataclass)Optional   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernelized_func)create_causal_mask!create_sliding_window_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs   )	AutoModel   )Gemma3nAudioConfigGemma3nConfigGemma3nTextConfigGemma3nVisionConfigc                   :    e Zd ZU dZdZej                  dz  ed<   y)Gemma3nAudioEncoderModelOutputzy
    audio_mel_mask (`torch.BoolTensor`, *optional*):
        A torch.BoolTensor of shape `(batch_size, num_frames)`
    Naudio_mel_mask)__name__
__module____qualname____doc__r*   torch
BoolTensor__annotations__     ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/gemma3n/modeling_gemma3n.pyr)   r)   6   s    
 /3NE$$t+2r3   r)   zL
    Base class for Gemma3n outputs, with hidden states and attentions.
    custom_introc                   b    e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   y)Gemma3nModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nimage_hidden_statesaudio_hidden_states)	r+   r,   r-   r.   r9   r/   FloatTensorr1   r:   r2   r3   r4   r8   r8   A   s5     59**T1848**T18r3   r8   zS
    Base class for Gemma3n causal language model (or autoregressive) outputs.
    c                   "   e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   dZej                  dz  ed	<   y)
Gemma3nCausalLMOutputWithPastaF  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    audio_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr9   r:   )r+   r,   r-   r.   r>   r/   r;   r1   r?   r@   r
   rA   tuplerB   r9   r:   r2   r3   r4   r=   r=   [   s    $ &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T1848**T18r3   r=   c                   r     e Zd Zd
dededef fdZd Zdej                  dej                  fdZ
d	 Z xZS )Gemma3nRMSNormdimeps
with_scalec                     t         |           || _        || _        | j                  r.t	        j
                  t        j                  |            | _        y | j                  dt        j                  d      d       y )Nweight      ?F
persistent)super__init__rG   rH   nn	Parameterr/   onesrJ   register_buffertensor)selfrF   rG   rH   	__class__s       r4   rO   zGemma3nRMSNorm.__init__   sY    $??,,uzz#7DK  5<<+< Or3   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S )Nr!   T)keepdim)r/   sqrtpowmeanrG   )rU   xs     r4   _normzGemma3nRMSNorm._norm   s4    5::aeeAhmmBm=HIIIr3   r]   returnc                     | j                  |j                               | j                  j                         z  }|j                  |      S N)r^   floatrJ   type_as)rU   r]   outputs      r4   forwardzGemma3nRMSNorm.forward   s9     AGGI&):):)<<~~a  r3   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)rC   rJ   shaperG   rU   s    r4   
extra_reprzGemma3nRMSNorm.extra_repr   s'    ))*+6$((<<r3   )gư>T)r+   r,   r-   intrb   boolrO   r^   r/   Tensorre   ri   __classcell__rV   s   @r4   rE   rE   ~   sG    PC Pe P PJ! !%,, !=r3   rE   c                       e Zd Zdef fdZdej                  dej                  dej                  fdZdej                  de	d	e	d
e	de	de	de	dej                  fdZ
dej                  dej                  dej                  fdZ xZS )%Gemma3nAudioRelativePositionEmbeddingconfigc                 R   t         |           || _        | j                  j                  | _        | j                  j
                  | _        | j                  | j                  z  | _        t        d| j                  j                  dz
        | _
        | j                  j                  | _        t        j                  | j                  | j                  | j                  z  d      | _        d}d}| j                  dz  }t!        j"                  t%        |      t%        |      z        t        |dz
  d      z  }|t'        j(                  t'        j*                  |      | z        z  }| j-                  d|j%                         j/                  d      j/                  d      d	       y )
Nr   r#   FbiasrK        @r!   inv_timescalesrL   )rN   rO   rq   conf_num_attention_heads	num_headshidden_sizechannelshead_dimmaxconf_attention_context_leftmax_backwardconf_attention_context_rightmax_forwardrP   Linearpos_projmathlogrb   r/   exparangerS   	unsqueeze)rU   rq   min_timescalemax_timescalenum_timescaleslog_timescale_incrementrv   rV   s          r4   rO   z.Gemma3nAudioRelativePositionEmbedding.__init__   sL   ==//74;;#J#JQ#NO;;CC		$--$--1OV[\!+"&((5+?%BV+V"WZ]^lop^prsZt"t&5<<3OSjRj3j)kk  ",,Q/99!< 	 	
r3   positiondtyper_   c                 P   |j                         j                  d      }|| j                  j                  |j                  t
        j                        z  }t        j                  t        j                  |      t        j                  |      gd      }|j                  |      S )NrX   devicer   rF   )rb   r   rv   tor   r/   float32catsincostype)rU   r   r   scaled_timetiming_signals        r4   _get_timing_signal_1d_posz?Gemma3nAudioRelativePositionEmbedding._get_timing_signal_1d_pos   s}    >>#--b1!4!4!7!7xV[VcVc!7!dd		599[#9599[;Q"RXZ[!!%((r3   term_bd_before_shift
batch_sizerx   num_query_blocksquery_block_sizekey_context_sizemax_span_plus_1c                     |dz   |z
  }d|f}	t         j                  j                  ||	      }
|
j                  |||||dz   z  f      }|ddddddd||z  f   }|j                  |||||f      }|S )aZ  Performs the relative shift.

        Args:
          term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
            (B), num_heads (N), num_query_blocks (U), query_block_size (W),
            key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1).

        Returns:
          Tensor of shape [B, N, U, W, C].
        r#   r   N)rP   
functionalpadreshape)rU   r   r   rx   r   r   r   r   pad_amount_last_dimpadding_tupleterm_bd_paddedterm_bd_reshapedterm_bd_slicedterm_bd_shifteds                 r4   _relative_shiftz5Gemma3nAudioRelativePositionEmbedding._relative_shift   s    4  0!3F /0**+?O
 *11  $4q$89	
 *!Q3X5EHX5X3X*XY )00   
 r3   querieskeysc           	      R   |j                   \  }}}}}|j                   \  }}}	}}t        j                  | j                  | j                   dz
  d|j
                        j                  d      }
|
j                   d   }| j                  |
|j                        }| j                  |      }|j                  d|| j                  | j                        j                  d      }|j                  ddddd      }|j                  ddddd      }t        j                  ||      }|j                  ddddd      }|j                  ddd      }|j                  ||||z  |      }t        j                  ||      }|j                  |||||      }| j!                  ||||||	|      }||z   S )	Nr#   rX   r   r   r   r   r!      )rg   r/   r   r~   r   r   r   r   r   r   r   rx   r{   squeezepermutematmulr   )rU   r   r   r   r   r   rx   r{   _r   pos_indicesr   sin_emb_timing_signalprojected_sin_embsin_emb	queries_pkeys_p_tterm_ac
q_permuted
s_permuted
q_reshapedterm_bd_unshifed_matmulterm_bd_unshifedr   s                           r4   re   z-Gemma3nAudioRelativePositionEmbedding.forward   s    OVmmK
$&6	8'+zz$11 ll4#4#4t7G7G6G!6KRX_XfXfgqq
 &++A. $ > >w}} !? !

 !MM*?@#++APTP]P]^ff
 OOAq!Q2	<<1aA.,,y(3 __Q1a3
 __Q1-
  ''
I?ORb?bdlm

 #(,,z:"F 3::
 ..
 ((r3   )r+   r,   r-   r$   rO   r/   rl   r   r   rj   r   re   rm   rn   s   @r4   rp   rp      s    
1 
.)%,, )u{{ )W\WcWc );#ll; ; 	;
 ; ; ; ; 
;zL)u|| L)5<< L)ELL L)r3   rp   c                   *    e Zd Zdef fdZd Zdej                  dededej                  fdZ	d	ej                  dej                  fd
Z
d	ej                  dej                  fdZd	ej                  dej                  dej                  fdZ xZS )Gemma3nAudioAttentionrq   c                    t         |           || _        | j                  j                  | _        | j                  j
                  | _        | j
                  | j                  z  | _        | j                  j                  | _        | j                  j                  | _
        t        d| j                  j                  dz
        | _        | j                  j                  | _        | j                  | j                  z   | j                  z   | _        t#        |      | _        t'        j(                  t+        j,                  | j                  f            | _        t'        j0                  | j
                  | j                  | j                  z  d      | _        t'        j0                  | j
                  | j                  | j                  z  d      | _        t'        j0                  | j
                  | j                  | j                  z  d      | _        | j                  dz  }dt*        j&                  j8                  j;                  t+        j<                  d            z  }| j?                  d||z  jA                         jC                         d	       | jE                         }| j?                  d
|d	       | j?                  dt+        j<                  | j                        jG                         d	       y )Nr   r#   Frs         rK           q_scalerL   local_causal_valid_masksoftcap)$rN   rO   rq   rw   rx   ry   r{   conf_attention_chunk_size
chunk_sizer   max_future_horizonr|   r}   max_past_horizonconf_attention_logit_capattention_logits_soft_capcontext_sizerp   relative_position_embeddingrP   rQ   r/   zerosper_dim_scaler   q_projk_projv_projr   softplusrT   rS   clonedetachcreate_local_causal_valid_maskrb   )rU   rq   r   r_softplus_0r   rV   s        r4   rO   zGemma3nAudioAttention.__init__D  s)   ==;;22((DNN:++??"&++"J"J #At{{'N'NQR'R S)-)M)M& OOd.C.CCdF]F]]+PQW+X(\\%++t}}6F*GHii 0 0$..4==2PW\]ii 0 0$..4==2PW\]ii 0 0$..4==2PW\]--%UXX0099%,,s:KLLY<)?(F(F(H(O(O(Q^cd"&"E"E"G68O\abLL778>>@ 	 	
r3   c                    t        j                  t        j                  | j                  | j                  ft         j
                        d      j                  }t        j                  t        j                  | j                  | j                  ft         j
                        | j                  | j                  z         }t        j                  | j                  | j                  ft         j
                        }||z  |z  }|S )Nr   r   )diagonal)	r/   trilrR   r   r   rk   Tr   r   )rU   lower_causal_maskupper_causal_maskr   s       r4   r   z4Gemma3nAudioAttention.create_local_causal_valid_maskf  s    !JJJJ))4??;5::N
 ! 	 "JJJJ):):;5::N**T-D-DD
 #(**doot?P?P-QY^YcYc"d"9<M"MPa"a&&r3   r]   pad_left	pad_rightr_   c                     |j                   ^}}}|j                  ||g|      }|j                  ||g|      }t        j                  |||gd      }|S )Nr#   r   )rg   	new_zerosr/   r   )	rU   r]   r   r   batchr   
tail_shapeleftrights	            r4   	_pad_dim1zGemma3nAudioAttention._pad_dim1s  s^     !q:{{E89j9:UI;
;<IItQ&A.r3   rA   c                 (   |j                   }|dd \  }}|| j                  z   dz
  | j                  z  }|| j                  z  |z
  x}dkD  r| j                  |d|      }||| j                  f|dd z   }|j                  |      j	                         }|S )aE  Turns a sequence to non overlapping blocks.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, block_size, ...], with necessary
            paddings,
            where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
        Nr!   r#   r   )rg   r   r   r   
contiguous)rU   rA   rg   bt
num_blockspadding_lenpermute_dimss           r4   _convert_to_blockz'Gemma3nAudioAttention._convert_to_blockz  s     ##Ray1$//)A-$//A
%7!;;Kq@ NN=![IM:t7%)C%--l;FFHr3   c                 \   | j                   }| j                  | j                  z   dz
  }| j                  |||      }| j                  }| j                  }|j                  d||      }|j                  dkD  r'|j                  dkD  rt        j                  |dd      }|j                         S )a  Extracts temporal context for every block.

        Args:
            hidden_states: a tensor of [batch, time, ...].

        Returns:
            A tensor of [batch, num_blocks, context_size, ...], with necessary
            paddings,
            where context_size = block_size + left_context + right_context,
            and output[:, i, ...] are x[:, start-left_context:end+right_context,
            ...],
            start = i * block_size, end = (i + 1) * block_size.
        r#   )	dimensionsizestepr!   r   rX   )sourcedestination)
r   r   r   r   r   unfoldndimr/   movedimr   )rU   rA   r   r   	frame_len
frame_step
x_unfoldeds          r4   _extract_block_contextz,Gemma3nAudioAttention._extract_block_context  s     (( ++doo=A	}h	J%%	__
 #))AIJ)W
 !joo&9 z"!LJ$$&&r3   maskc                 	   g |j                   d d | j                  | j                  }| j                  |      j	                  |      j                         }| j                  |      j	                  |      j                         }| j                  |      j	                  |      j                         }t        j                  j                  j                  | j                        }ddd| j                  f}|j                  |      }	|| j                  z  |	z  }|j                   d d \  }
}| j                  |      }| j!                  |      }| j!                  |      }|j                   d   }| }| j!                  |      }|j"                  dk(  rI|j                   d   |j                   d   z  | j$                  k(  r|j	                  |
|| j$                        }|j                   |
|| j$                  fk7  r,t'        d|j                    d|
 d| d| j$                   d		      |j)                  d      j)                  d
      }| j*                  j)                  d      j)                  d      j)                  d      }t        j,                  ||j/                  |j0                              }| j3                  ||      }| j4                  j/                  |j0                        }||z  }t        j6                  |      }||z  }t        j8                  ||t        j:                  |j<                        j>                        }t        j                  j                  jA                  |dt        jB                        j/                  |j<                        }|j                   \  }}}}}|j                   d   }|jE                  ddddd      j	                  d||      }|jE                  ddddd      j	                  d||      }t        jF                  ||      } | j	                  |||||      jE                  ddddd      }!|!j	                  |
|| jH                  z  | j                  | j                  f      }!|!d d d |f   }!|!S )NrX   r#   r!   r   r   z%Shape of extracted_valid_mask_blocks z	 is not (z, z) after potential reshape.r   rF   r   r   )%rg   rx   r{   r   r   r   r   r   r/   rP   r   r   r   viewr   r   r   r   r   
ValueErrorr   r   logical_andr   r   r   r   tanhwherefinfor   minsoftmaxr   r   bmmr   )"rU   rA   r   	qkv_shapequery_states
key_statesvalue_statesper_dim_scale_spbroadcast_shapeper_dim_scale_sp_broadcastr   q_timequery_blocks
key_blocksvalue_blocksr   original_valid_maskextracted_valid_mask_blockscondition_from_input_validitycondition_from_causalityfinal_condition_for_wherer?   softcap_valprobabilitiesb_dimn_dimu_dimw_dimc_dimh_dimprob_bunv_bun
result_bmmcontext_vectorss"                                     r4   re   zGemma3nAudioAttention.forward  sT   Nm))#2.NNN	{{=199)DOOQ[[/77	BMMO
{{=199)DOOQ 88..778J8JKaDMM2%5%:%:?%K"#dll25OO)//3
F--l;00<
22<@'--a0  $e '+&A&ABU&V# (,,1+11!47R7X7XYZ7[[_c_p_pp*E*M*M,d.?.?+' ',,1
 

 /556i
| L$%R(9(9'::TV  )D(M(Ma(P(Z(Z[](^% $(#?#?#I#I!#L#V#VWX#Y#c#cde#f 
 %*$5$5)$''(E(L(LM%
! 11,
K lloofmm4+%F#+% 6FLL@Y@]@]^++33F%--3X[[bnbtbt[u -:,?,?)ueUE""2& ((Aq!Q7??E5Q$$Q1a3;;BuMYYx/
$,,UE5%OWWXY[\^_abdef)11 4??2	
 *!WfW*5r3   )r+   r,   r-   r$   rO   r   r/   rl   rj   r   r   r   r0   re   rm   rn   s   @r4   r   r   C  s     
1  
D'5<< 3 3 5<< u||  ,.'ELL .'U\\ .'`dU\\ d9I9I dell dr3   r   c                   r     e Zd ZdZ	 d	dedee   def fdZdej                  dej                  fdZ
 xZS )
Gemma3nAudioCumulativeGroupNorma  Applies Group Normalization cumulatively over the time dimension.

    This layer normalizes the input by calculating the mean and variance
    cumulatively over the time dimension (dim 1). The statistics are computed
    over all feature dimensions (specified by `feature_dims` and `num_channels`)
    for elements marked as valid by the optional `mask`.

    If a `mask` is provided (True for valid, False for invalid/padded),
    invalid time steps do not contribute to the statistics calculation, and
    their corresponding output values are zeroed out.

    Scale and bias, if enabled, are applied per-channel (last dimension).
    This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
    and `cumulative=True`.
    num_channelsfeature_dimsrG   c           	         t         |           || _        t        |      | _        || _        t        j                  t        j                  |            | _
        t        t        ddt        | j                        z   dz               | _        y )Nr!   r#   )rN   rO   r+  rC   r,  rG   rP   rQ   r/   rR   rJ   rangelenreduction_axes)rU   r+  r,  rG   rV   s       r4   rO   z(Gemma3nAudioCumulativeGroupNorm.__init__8  sr     	(!,/ ll5::l#;< $E!QT5F5F1G-G!-K$LMr3   rA   r_   c                    | j                   | j                  fz   }|j                  dd |k7  rt        d|j                  dd  d|       |j                  }t
        j                  }|j                  |      }t        j                  ||      }t        j                  || j                  d      }t        j                  |d	      }t        j                  || j                  d      }	t        j                  |	d	      }
t        j                  |
d
      }||z  }||z
  j                  d      }t        j                  || j                  d      }t        j                  |d	      }||z  }||z
  t        j                  || j                  z         z  }| j                   j                  |      }dg|j#                         dz
  z  | j                  gz   }||j%                  |      z  }||z  }|j                  |      S )zApplies cumulative group norm, optionally using a mask.

        Args:
          hidden_states: Input tensor, shape [B, T, *feature_dims, C].

        Returns:
          Normalized tensor with the same shape as x.
        r!   NzInput tensor shape suffix z> does not match expected suffix (feature_dims + num_channels) r   TrF   rY   r#   r   rK   )r
  )r,  r+  rg   r  r   r/   r   r   	ones_likesumr0  cumsumclampr[   rsqrtrG   rJ   rF   r  )rU   rA   expected_input_suffixinput_dtype
calc_dtypex_calc	mask_calcsum_values_at_tcum_sum_valueselements_in_group_at_tcum_count_elementssafe_cum_count_elementscum_meansquared_diff_from_meansum_sq_diff_at_tcum_sum_sq_diffcum_variancenormalized_xscalescale_view_shapefinal_outputs                        r4   re   z'Gemma3nAudioCumulativeGroupNorm.forwardJ  s    !% 1 1T5F5F4H Hqr"&;;,]-@-@-D,E F99N8OQ 
 $))]]
!!*- OOF*=	  ))F0C0CTRo1= "'9$:M:MW[!\"\\*@aH"'++.@c"J "$;;
 #)8"3!8!8!; 99%;ATAT^bc  ,,'7Q? ')@@ )U[[9P-QQ z*3-"3"3"5"9:d>O>O=PP#ejj1A&BB $i/{++r3   )gMbP?)r+   r,   r-   r.   rj   r   rb   rO   r/   rl   re   rm   rn   s   @r4   r*  r*  '  sT    ( 	NN smN 	N$G,U\\ G,ell G,r3   r*  c                   ~     e Zd ZdZ	 d
dedededeeeeef   f fdZdej                  dej                  fd	Z
 xZS )Gemma3nAudioSSCPConvBlockzA single convolution block for the SubSampleConvProjection.

    This block consists of a 2D convolution, followed by CumulativeGroupNorm,
    and a ReLU activation. It handles manual padding for the convolution.
    rq   idxinput_freq_dimmanual_paddingc                 J   t         |           || _        || _        |dk(  rdn| j                  j                  |dz
     }| j                  j                  |   }| j                  j
                  |   \  }}| j                  j                  |   \  }	}
t        j                  ||||f|	|
fdd      | _	        || j                  d   z   | j                  d   z   }||z
  |
z  dz   }t        ||f| j                  j                        | _        t        j                         | _        y )Nr   r#   )r   r   F)in_channelsout_channelskernel_sizestridepaddingrt   )r+  r,  rG   )rN   rO   rq   rO  sscp_conv_channel_sizesscp_conv_kernel_sizesscp_conv_stride_sizerP   Conv2dconvr*  sscp_conv_group_norm_epsnormReLU
activation)rU   rq   rM  rN  rO  rQ  rR  kernel_hkernel_wstride_hstride_wf_in_padded
f_out_convrV   s                r4   rO   z"Gemma3nAudioSSCPConvBlock.__init__  s%    	, !8a)K)KCRSG)T{{99#>![[>>sC(![[>>sC(II#% h'

	 %t':':1'==@S@STU@VV!H,9A=
3%$44
	 '')r3   audio_encodingsr_   c                    t        j                  || j                  dd      j                  | j                  j
                  j                        }| j	                  |      }|j                  dddd      j                         }| j                  |      }|j                  dddd      j                         }| j                  |      S )Nconstantr   )modevaluer   r!   r   r#   )Fr   rO  r   rZ  rJ   r   r   r   r\  r^  )rU   re  audio_encodings_paddedaudio_encodings_conv
x_for_normx_normedaudio_encodings_normeds          r4   re   z!Gemma3nAudioSSCPConvBlock.forward  s     "#8K8KR\dg!h!k!kII"""

  $yy)?@ *11!Q1=HHJ
99Z(!)!1!1!Q1!=!H!H!J566r3   ))r   r   r   r   )r+   r,   r-   r.   r$   rj   rC   rO   r/   rl   re   rm   rn   s   @r4   rL  rL    sc     5A)$")$ )$ 	)$
 c3S01)$V7u|| 7 7r3   rL  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )#Gemma3nAudioSubSampleConvProjectionrq   c                 p   t         |           || _        |j                  }g }g }t	        d      D ]n  }|j
                  |   \  }}|j                  |   \  }}	d}
|dz
  }d}d}|||
|f}|j                  |       ||z   |z   }||z
  |	z  dz   }|j                  |       |}p t        d|j                  ||d         | _	        t        d|d   ||d         | _
        |j                  d   }|d   }||z  | _        t        j                  | j                  | j                  j                  d      | _        y )Nr!   r   r#   )rM  rN  rq   rO  rX   Frs   )rN   rO   rq   input_feat_sizer.  rW  rX  appendrL  conv_0conv_1rV  input_proj_in_featuresrP   r   ry   input_proj_linear)rU   rq   current_f_for_block_inputcalculated_block_paddingcalculated_f_out_dimsir_  r`  ra  rb  	pad_t_toppad_t_bottom
pad_f_leftpad_f_rightmanual_padding_tuplerc  f_out_after_convfinal_c_outfinal_f_outrV   s                      r4   rO   z,Gemma3nAudioSubSampleConvProjection.__init__  s   $*$:$:!#%  "q 	9A!'!=!=a!@Hh!'!=!=a!@Hh I#a<L JK 	$  %++,@A 4j@;NK +h 68CaG!(()9:(8%=	9@ 0!113A6	
 0033A6	
 33B7+B/&1K&?#!#4+F+FH_H_fk!lr3   re  r_   c                     |j                  d      }| j                  |      }| j                  |      }|j                  \  }}}}|j	                  dddd      j                         }|j                  ||||z        }	| j                  |	      }
|
S )Nr#   r   r!   r   )r   ru  rv  rg   r   r   r  rx  )rU   re  audio_encodings_reshapedr]   r   c_outt_outf_out
x_permutedoutput_flattenedrd   s              r4   re   z+Gemma3nAudioSubSampleConvProjection.forward  s     $3#<#<Q#? KK01KKN!"5%YYq!Q*557
%??1eUU]C''(89r3   	r+   r,   r-   r$   rO   r/   rl   re   rm   rn   s   @r4   rq  rq    s.    7m1 7mru||  r3   rq  c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZ	S )Gemma3nAudioConformerAttentionrq   c                    t         |           || _        | j                  j                  | _        | j                  dt        j                  | j                  j                        d       t        | j                  j                        | _
        t        |      | _        t        j                  | j                  | j                  j                  d      | _        t        | j                  j                        | _        y )Ngradient_clippingFrL   rs   )rN   rO   rq   ry   post_in_featuresrS   r/   rT   r  rE   pre_attn_normr   attnrP   r   post	post_normrU   rq   rV   s     r4   rO   z'Gemma3nAudioConformerAttention.__init__#  s     $ 7 70%,,t{{?\?\2]jop+DKK,C,CD)&1	IId33T[[5L5LSXY	'(?(?@r3   re  r*   r_   c                    |}t        j                  || j                   | j                        }| j                  |      }| j	                  ||      }|j
                  \  }}}}	|j                  ||||	z        }
| j                  |
      }t        j                  || j                   | j                        }|| j                  |      z   S ra   )	r/   r6  r  r  r  rg   r   r  r  )rU   re  r*   audio_encodings_input_to_attnaudio_encodings_normaudio_encodings_attn_outr   r   rx   r{   r  s              r4   re   z&Gemma3nAudioConformerAttention.forward-  s    (7%++o8N8N7NPTPfPfg#11/B#'99-A>#R  %=$B$B!1i#;#C#CAq)V^J^#_ ))$<=++o8N8N7NPTPfPfg,t~~o/NNNr3   
r+   r,   r-   r$   rO   r/   rl   r0   re   rm   rn   s   @r4   r  r  "  sA    A1 AOu|| OUEUEU OZ_ZfZf Or3   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS ) Gemma3nAudioConformerFeedForwardrq   c                    t         |           || _        | j                  dt	        j
                  | j                  j                        d       t        | j                  j                        | _	        t        j                  | j                  j                  | j                  j                  dz  d      | _        t        j                  | j                  j                  dz  | j                  j                  d      | _        t        | j                  j                        | _        | j                  j                  | _        y )Nr  FrL   r   rs   )rN   rO   rq   rS   r/   rT   r  rE   ry   pre_layer_normrP   r   ffw_layer_1ffw_layer_2post_layer_normconf_residual_weightpost_layer_scaler  s     r4   rO   z)Gemma3nAudioConformerFeedForward.__init__?  s    0%,,t{{?\?\2]jop,T[[-D-DE99T[[%<%<dkk>U>UXY>Y`ef99T[[%<%<q%@$++BYBY`ef-dkk.E.EF $ @ @r3   re  r_   c                    |}t        j                  || j                   | j                        }| j                  |      }| j	                  |      }t
        j                  j                  |      }| j                  |      }t        j                  || j                   | j                        }| j                  |      }||| j                  z  z   S ra   )r/   r6  r  r  r  rP   r   silur  r  r  )rU   re  residuals      r4   re   z(Gemma3nAudioConformerFeedForward.forwardK  s    "++o8N8N7NPTPfPfg--o>(,(8(8(I--,,_=(,(8(8(I++o8N8N7NPTPfPfg..??T-B-BBCCr3   r  rn   s   @r4   r  r  >  s0    
A1 
A	Du|| 	D 	Dr3   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS ) Gemma3nAudioConformerLightConv1drq   c           	         t         |           || _        t        | j                  j                  | j                  j
                        | _        t        j                  | j                  j                  | j                  j                  dz  d      | _	        t        j                  | j                  j                  | j                  j                  | j                  j                  dd| j                  j                  d      | _        | j                  dt        j                  | j                  j                         d	       t        | j                  j                  | j                  j
                        | _        t        j                  | j                  j                  | j                  j                  d      | _        | j                  j                  dz
  | _        y )
NrG   r!   Frs   r#   r   )rQ  rR  rS  rT  rU  groupsrt   r  rL   )rN   rO   rq   rE   ry   rms_norm_epsr  rP   r   linear_startConv1dconf_conv_kernel_sizedepthwise_conv1drS   r/   rT   r  	conv_norm
linear_endcausal_paddingr  s     r4   rO   z)Gemma3nAudioConformerLightConv1d.__init__X  sD   ,T[[-D-D$++JbJbcIIdkk&=&=t{{?V?VYZ?Zafg "		//0099;;**!
 	0%,,t{{?\?\2]jop'(?(?T[[E]E]^))DKK$;$;T[[=T=T[`a"kk??!Cr3   re  r_   c                 :   |}| j                  |      }| j                  |      }t        j                  j                  j                  |d      }|j                  ddd      }t        j                  || j                  df      }| j                  |      }|j                  ddd      }t        j                  || j                   | j                        }| j                  |      }t        j                  j                  |      }| j                  |      }||z   }|S )NrX   r   r   r!   r#   )r  r  r/   rP   r   glur   rj  r   r  r  r6  r  r  r  r  )rU   re  audio_encodings_residualaudio_encodings_permutedaudio_encodings_permuted_paddedrd   s         r4   re   z(Gemma3nAudioConformerLightConv1d.forwardm  s   #2 --o>++O<((--11/r1J#2#:#:1a#C *+%%0H4K^K^`aJb*c'//0OP)11!Q:++o8N8N7NPTPfPfg..9--,,_=///: #;;r3   r  rn   s   @r4   r  r  W  s-    D1 D*u||  r3   r  c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZ	S )Gemma3nAudioConformerBlockrq   c                    t         |           || _        t        | j                        | _        t        | j                        | _        t        | j                        | _        t        | j                        | _	        | j                  dt        j                  | j                  j                        d       t        | j                  j                        | _        y )Nr  FrL   )rN   rO   rq   r  ffw_layer_startr  	attentionr  lconv1dffw_layer_endrS   r/   rT   r  rE   ry   r\  r  s     r4   rO   z#Gemma3nAudioConformerBlock.__init__  s    ?L7D7D=dkkJ0%,,t{{?\?\2]jop"4;;#:#:;	r3   re  r*   r_   c                 j   | j                  |      }| j                  ||      }| }||j                  d      j                  |j                        z  }| j                  |      }| j                  |      }t        j                  || j                   | j                        }| j                  |      }|S )NrX   )r  r  r   r   r   r  r  r/   r6  r  r\  )rU   re  r*   validity_mask_for_lconvaudio_encodings_for_lconv_inputrd   s         r4   re   z"Gemma3nAudioConformerBlock.forward  s    ..?...I#1/*9<S<]<]^`<a<d<d!!=
 +
' ,,'FG,,_=++o8N8N7NPTPfPfg?+r3   r  rn   s   @r4   r  r    s;    	<1 	<u|| UEUEU Z_ZfZf r3   r  c            	       Z     e Zd ZdZd	dedededef fdZdej                  f fdZ	 xZ
S )
Gemma3nTextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scalec                     t         |   |||       || _        | j                  dt	        j
                  |      d       y )Nr  FrL   )rN   rO   scalar_embed_scalerS   r/   rT   )rU   r  r  r  r  rV   s        r4   rO   z'Gemma3nTextScaledWordEmbedding.__init__  s;    D"-]ELL,ERWXr3   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  S ra   )rN   re   r  r   rJ   r   )rU   r  rV   s     r4   re   z&Gemma3nTextScaledWordEmbedding.forward  s2    wy)D,<,<,?,?@Q@Q,RRRr3   )rK   )r+   r,   r-   r.   rj   rb   rO   r/   rl   re   rm   rn   s   @r4   r  r    sG    Ys Y3 YS Y_d Y
S S Sr3   r  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )Gemma3nTextLaurelBlockz Learned Augmented Residual Layerrq   c                    t         |           || _        t        j                  | j                  j
                  | j                  j                  d      | _        t        j                  | j                  j                  | j                  j
                  d      | _        t        | j                  j
                  | j                  j                        | _        y )NFrs   r  )rN   rO   rq   rP   r   ry   laurel_ranklinear_leftlinear_rightrE   r  post_laurel_normr  s     r4   rO   zGemma3nTextLaurelBlock.__init__  s    99T[[%<%<dkk>U>U\abIIdkk&=&=t{{?V?V]bc .t{{/F/FDKKLdLd er3   rA   r_   c                 r    | j                  |      }| j                  |      }| j                  |      }||z   S ra   )r  r  r  )rU   rA   laurel_hidden_statesnormed_laurel_hidden_statess       r4   re   zGemma3nTextLaurelBlock.forward  sC    -1-=-=m-L-1->->?S-T&*&;&;<P&Q#:::r3   )
r+   r,   r-   r.   r&   rO   r/   rl   re   rm   rn   s   @r4   r  r    s0    *f0 f;U\\ ;ell ;r3   r  c                        e Zd Zd	dedef fdZdej                  dej                  fdZdej                  dej                  fdZ	 xZ
S )
Gemma3nTextMLPrq   	layer_idxc                    t         |           || _        |j                  | _        |j                  |   | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        |j                  |   | _        y NFrs   )rN   rO   rq   ry   intermediate_sizerP   r   	gate_projup_proj	down_projr	   hidden_activationact_fnactivation_sparsity_patternactivation_sparsityrU   rq   r  rV   s      r4   rO   zGemma3nTextMLP.__init__  s    !--!'!9!9)!D4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556#)#E#Ei#P r3   rA   r_   c                     | j                  |      }| j                  dkD  r| j                  |      }| j                  |      }| j	                  |      }| j                  ||z        }|S )Nr   )r  r  _gaussian_topkr  r  r  )rU   rA   r  activationsr  r  s         r4   re   zGemma3nTextMLP.forward  sc    NN=1	##c)++I6Ikk),,,}-NN;#89	r3   inputsc                    t        j                  | j                  t         j                  |j                        }t         j
                  j                  j                  dd      }|j                  |      }|j                  |j                        }t        j                  |dd      }t        j                  |ddd      }|||z  z   }t        j                  j                  ||z
        S )	Nr   r   r   r#   rX   Tr2  F)rF   rY   unbiased)r/   rT   r  r   r   distributionsnormalNormalicdfr   r   r\   stdrP   r   relu)rU   r  target_sparsity_tensornormal_diststd_multiplierinputs_mean
inputs_stdcutoff_xs           r4   r  zGemma3nTextMLP._gaussian_topk  s    !&d.F.Femmdjdqdq!r ))00771='2'7'78N'O',,V\\:jjR>YYv2teL
n!<<}}!!&8"344r3   )r   )r+   r,   r-   r&   rj   rO   r/   rl   re   r  rm   rn   s   @r4   r  r    sP    	Q0 	QS 	QU\\ ell 5U\\ 5ell 5r3   r  c                   X    e Zd ZdZdef fdZdej                  dej                  fdZdej                  dej                  fdZ	d	ej                  d
ej                  dej                  fdZ
dej                  dej                  fdZdej                  dej                  fdZ xZS )Gemma3nTextAltUpa  Alternating Updates (AltUp)

    The AltUp module wraps transformer layers. The `predict` step modifies the
    input to the transformer layer, and the `correct` step propagates the output
    of the transformer layer to the sparsely updated dimensions.

    See more in the research paper:

    https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
    rq   c                 F   t         |           || _        t        j                  t        j                  | j                  j                              | _        t        j                  | j                  j                  | j                  j                  d      | _        t        j                  | j                  j                  | j                  j                  dz  d      | _        t        j                  | j                  j                  | j                  j                  d      | _        t        | j                  j                  | j                  j                        | _        | j#                  dt        j$                  | j                  j                  dz        d       y )NFrs   r!   r  router_input_scale      rL   )rN   rO   rq   rP   rQ   r/   r   ry   correct_output_scaler   altup_num_inputscorrection_coefsprediction_coefsmodality_routerrE   r  router_normrS   rT   r  s     r4   rO   zGemma3nTextAltUp.__init__  s   $&LLT[[=T=T1U$V! "		$++*F*FHdHdkp q "		$++*F*FHdHdfgHgns t!yy)@)@$++B^B^ejk)$++*A*At{{G_G_`15<<@W@WY]@]3^kpqr3   r]   r_   c                     | j                  |      | j                  z  }| j                  |      }t        j                  |j                               j                  |      S ra   )r  r  r  r/   r  rb   rc   )rU   r]   router_inputsrouteds       r4   compute_router_modalitiesz*Gemma3nTextAltUp.compute_router_modalities  sM    ((+d.E.EE%%m4zz&,,.)11!44r3   rA   c                    | j                  || j                  j                           }| j                  ro| j                  j                  Y| j
                  j                  j                  j                  | j                  j                   | j                  j                          | j                  |      j                  g |j                  dd | j                  j                  | j                  j                   j                  dddd      }t        j                  |j                  dddd      |      }|j                  dddd      }||z  }|j                         j!                  |      S )a  Predicts the output of a layer using a trainable map.

        Args:
            hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
        NrX   r   r#   r   r!   )r  rq   altup_active_idxtrainingaltup_coef_clipr  rJ   dataclamp_r   rg   r  r   r/   r   r   rc   )rU   rA   
modalities	all_coefspredictionss        r4   predictzGemma3nTextAltUp.predict  s@    33M$++B^B^4_`
==T[[88D!!((--44dkk6Q6Q5QSWS^S^SnSnoD!!*-Wi &&s+i-1[[-I-IiKO;;KgKgiWQ1a  	 ll=#8#8Aq!#DiP!))!Q15}$%%'//>>r3   r  	activatedc                    | j                  |      }||| j                  j                     z
  }|j                  | j                  j                  ddd      }| j
                  r| j                  j                  | j                  j                  j                  | j                  j                   | j                  j                        }t        j                  j                  j                  ||d      dz   }n| j                  |      dz   }|j                  ddd      j                  d      }t        j                   ||      }||z  }|j#                         j%                  |      S )a_  Corrects the predictions relative to the

        Args:
            predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
                stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
            activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.

        Returns:
            A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
                predictions relative to the activated input embeddings.
        r#   Nrs   rK   r!   r   rX   )r  rq   r	  repeatr  r
  r  r   rJ   r6  r/   rP   r   linearr   r   mulr   rc   )rU   r  r  r  
innovationrJ   r  	correcteds           r4   correctzGemma3nTextAltUp.correct  s,    33I>
T[[-I-I!JJ
&&t{{'C'CQ1M
==T[[88D**11779T9T8TVZVaVaVqVqrF++22:vD2QTWWI--j9C?I
 %%aA.88<	IIj)4	[ 	##%--i88r3   r  c                 p    |j                  | j                        | j                  z  j                  |      S )a	  
        This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
        (which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
        `scale_corrected_output`
        )rc   r  rU   r  s     r4   re   zGemma3nTextAltUp.forward9  s2     !!$";";<t?X?XXaabkllr3   c                 $    | j                  |      S )zMScales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size].)re   r  s     r4   scale_corrected_outputz'Gemma3nTextAltUp.scale_corrected_outputA  s    ||I&&r3   )r+   r,   r-   r.   r&   rO   r/   rl   r  r  r  re   r  rm   rn   s   @r4   r  r    s    	r0 r55<< 5ELL 5
?U\\ ?ell ?895<< 9ELL 9U\\ 9>m m%,, m' ' 'r3   r  c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrX   r!   r   )rg   r/   r   )r]   x1x2s      r4   rotate_halfr!  F  sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r3   rA   n_repr_   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r#   N)rg   expandr   )rA   r"  r   num_key_value_headsslenr{   s         r4   	repeat_kvr'  M  so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr3   modulequerykeyri  attention_maskdropoutscalingr   c                 |   || j                   dz  }t        || j                        }	t        || j                        }
t        j                  ||	j                  dd            |z  }|||z  }t        j                  |      }||z  }|||z   }t        j                  j                  |dt        j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||
      }|j                  dd      j                         }||fS )Nr   r!   r   rX   r  )pr
  r#   )r{   r'  num_key_value_groupsr/   r   	transposer  rP   r   r  r   r   r   r,  r
  r   )r(  r)  r*  ri  r+  r,  r-  r   kwargsr  r  attn_weightsattn_outputs                r4   eager_attention_forwardr5  Y  s    //4'3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL#g-zz,/#g-!#n4 ==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r3   r]   r   r   unsqueeze_dimc                 n    |j                  |      }|j                  |      }| |z  t        |       |z  z   S )a\  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        x (`torch.Tensor`): The tensor to embed.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )r   r!  )r]   r   r   r6  s       r4   apply_rotary_pos_embr8  {  s8    " --
&C
--
&CGA,--r3   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  dej                  dej                  dz  d	e	dz  d
ej                  dz  dee   deej                  ej                  dz  eej                     dz  f   fdZ xZS )Gemma3nTextAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrq   r  c                    t         |           t        |d      r|j                  |   nd | _        || _        || _        t        |d|j                  |j                  z        | _
        |j                  |j                  z  | _        d| _        | j
                  j                  | _        d| _        t!        j"                  |j                  |j                  | j                  z  |j$                        | _        t!        j"                  |j                  |j                  | j                  z  |j$                        | _        t!        j"                  |j                  |j                  | j                  z  |j$                        | _        t!        j"                  |j                  | j                  z  |j                  |j$                        | _        | j                  dk(  r|j.                  nd | _        | j                  dk(  | _        t3        |j                  |j4                        | _        t3        |j                  |j4                        | _        t3        |j                  |j4                  d	      | _        | j
                  j<                  | j
                  j>                  z
  }||cxk\  xr d
kD  nc | _         |j                  d | }| j@                  r@tC        |      dz
  |d d d   jE                  |j                  |         z
  | _#        d| _$        y d | _#        |tC        |      dz
  |d d d   jE                  |j                  |         z
  k(  | _$        y )Nlayer_typesr{   rK   Trs   sliding_attention)rF   rG   F)rF   rG   rH   r   r#   rX   )%rN   rO   hasattrr<  
layer_typerq   r  getattrry   num_attention_headsr{   r%  r0  r-  attention_dropout	is_causalrP   r   attention_biasr   r   r   o_projsliding_window
is_slidingrE   r  q_normk_normv_normnum_hidden_layersnum_kv_shared_layersis_kv_shared_layerr/  indexkv_shared_layer_indexstore_full_length_kv)rU   rq   r  first_kv_shared_layer_idxprev_layersrV   s        r4   rO   zGemma3nTextAttention.__init__  s   ;B6=;Y&,,Y7_c"
F4F4F&JdJd4de$*$>$>&B\B\$\!!%!>!>ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 8<J]7]f33cg//-@@$f>Q>QR$f>Q>QR$f>Q>Q^cd$(KK$A$ADKKDdDd$d!"+/H"L1"L(()C*CD""),[)9A)=DbD@Q@W@WX^XjXjktXu@v)vD&(-D%)-D&(1S5E5IKX\Z\X\L]LcLc""9-M 6 )D%r3   NrA   position_embeddingsr+  r@   cache_positionr2  r_   c                 2   |j                   d d }g |d| j                  j                  }|\  }	}
| j                  |      j	                  |      }| j                  |      }t        ||	|
d      }|j                  dd      }| j                  rU|S|j                  | j                     \  }}|j                  |j                        }|j                  |j                        }n| j                  |      j	                  |      }| j                  |      }t        ||	|
d      }|j                  dd      }| j                  |      j	                  |      }| j!                  |      }|j                  dd      }|x|
|	|| j"                  d}| j                  s!|j%                  ||| j&                  |      \  }}| j(                  r.t+        |d      si |_	        ||f|j                  | j&                  <   t-        j.                  | j                  j0                  t2              } || ||||f| j4                  r| j6                  nd| j8                  | j"                  d|\  }} |j:                  g |d j=                         }| j?                  |      }||fS )	NrX   r!   )r6  r#   )r   r   rT  rF  shared_layersr   )r,  r-  rF  ) rg   rq   r{   r   r  rH  r8  r1  rM  rV  rO  r   r   r   rI  r   rJ  rF  updater  rP  r>  r   get_interface_attn_implementationr5  r
  rB  r-  r   r   rE  )rU   rA   rS  r+  r@   rT  r2  input_shapehidden_shaper   r   r  r  r  cache_kwargsattention_interfacer4  r3  s                     r4   re   zGemma3nTextAttention.forward  s    $))#2.??b?$++*>*>?&S{{=166|D{{<0+L#sRST#--a3 ""'B'6'D'DTE_E_'`$J#|':':;J'??<+>+>?L]388FJZ0J-j#sRSTJ#--a3J;;}5::<HL;;|4L'11!Q7L& "0"&"5"5	L **+:+A+Adnnl,(
L ((@46O1@JL@X--dnn=(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((r3   NNNN)r+   r,   r-   r.   r&   rj   rO   r/   rl   r
   
LongTensorr   r   rC   re   rm   rn   s   @r4   r:  r:    s    G*0 *S *^ -1.2(,26E)||E) #\\E) t+	E)
 E) ((4/E) +,E) 
u||U\\D0%2E2LL	ME)r3   r:  c                   N    e Zd Zdedef fdZ	 	 	 	 	 	 ddej                  dej                  dej                  dej                  dz  d	ej                  dz  d
e	dz  dej                  dz  de
e   deej                  eej                  ej                  f   dz  f   fdZ xZS )Gemma3nTextDecoderLayerrq   r  c                    t         |           || _        |j                  | _        || _        |j
                  |   | _        t        ||      | _        t        ||      | _
        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        |j"                  | _        t$        |j&                     | _        t+        |      | _        t/        |      | _        t3        j4                  | j                  | j"                  d      | _        t3        j4                  | j"                  | j                  d      | _        t        | j                  |j                        | _        y )N)r  r  Frs   )rN   rO   rq   ry   r  r<  attention_typer:  	self_attnr  mlprE   r  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormhidden_size_per_layer_inputr	   r  r  r  altupr  laurelrP   r   per_layer_input_gateper_layer_projectionpost_per_layer_input_normr  s      r4   rO   z Gemma3nTextDecoderLayer.__init__
  s]   !--"$00;-fi@!&I>-d.>.>FDWDWX(6t7G7GVM`M`(a%)78H8HfNaNa)b&*89I9IvObOb*c'+1+M+M(V556%f-
,V4$&IId.>.>@`@`gl$m!$&IId.N.NPTP`P`gl$m!)78H8HfNaNa)b&r3   NrA   rS  per_layer_inputr+  position_idsr@   rT  r2  r_   c           
      t   | j                   j                  |      }	|	| j                  j                     }
| j	                  |
      }| j                  |      } | j                  d||||||d|\  }}| j                  |      }|
|z   }||z   t        j                  d      z  }| j                  |      }| j                  |      }| j                  |      }||z   }| j                   j                  |	|      }|| j                  j                     j                         }| j                  j                  r| j                   j!                  |      }| j#                  |      }| j%                  |      }t'        j(                  ||      }| j+                  |      }| j-                  |      }|dd xxx |z  ccc |S )N)rA   r+  rq  rS  r@   rT  r!   r#   r2   )rk  r  rq   r	  rf  rl  rd  rg  r   rZ   rh  re  ri  r  r   altup_correct_scaler  rm  r  r/   multiplyrn  ro  )rU   rA   rS  rp  r+  rq  r@   rT  r2  r  active_predictionactive_prediction_normedlaurel_outputr  r   
attn_gatedattn_laurel	attn_normattn_ffwattn_ffw_normattn_ffw_laurel_gatedcorrected_predictionsfirst_predictions                          r4   re   zGemma3nTextDecoderLayer.forward   s    jj((7'(D(DE#'#7#78I#J $<= $.. 
2)% 3+)
 
a ,,T2&-
!M1TYYq\A22;?	88I&77A +m ; $

 2 2;@U V01M1MNTTV;;**#zz@@AQR  445EF;;'78 >>*:OL  445EF99:JKab!%55!$$r3   )NNNNNN)r+   r,   r-   r&   rj   rO   r/   rl   r_  r
   r   r   rC   r;   re   rm   rn   s   @r4   ra  ra  	  s    c0 cS c2 -1(,.204(,263%||3% #\\3% 	3%
 t+3% &&-3% 3% ((4/3% +,3% 
u||U5#4#4e6G6G#GH4OO	P3%r3   ra  c                        e Zd ZU eed<   dZdZdgZdgZdZ	dZ
dZdZdZeedZdZ ej&                          fd       Z xZS )	Gemma3nPreTrainedModelrq   modelTra  r@   )rA   rB   )imagetextaudioc                 F   t         |   |       t        |t              r!t	        j
                  |j                         nt        |t              rt	        j                  |j                         |j                  dz  }dt        j                  j                  j                  t        j                  d            z  }t	        j                   |j"                  ||z         t	        j$                  |j&                  |j(                         t	        j                   |j*                  |j-                                nt        |t.              r,t	        j$                  |j0                  |j2                         nit        |t4              rXt	        j                  |j6                         t	        j$                  |j8                  | j:                  j<                  dz         nt        |t>              rd\  }}|j@                  dz  }tC        jD                  tG        |      tG        |      z        tI        |dz
  d      z  }|t        jJ                  t        jL                  |      | z        z  }t	        j                   |jN                  |jG                         jQ                  d      jQ                  d             n&t        |tR              rdt	        j$                  |jT                  | j<                  dz         t	        j$                  |jV                  dtC        jX                  d	      z         nt        |tZ              r|j\                  D ]  }	|j^                  }
|j`                  |	   d
k7  rtb        |j`                  |	      }
 |
|j:                  |	      \  }}t	        j                   te        ||	 d      |       t	        j                   te        ||	 d      |        tg        |d      r5t	        j$                  |jh                  | j:                  jh                         y y )Nr   rK   r   r  )rK   ru   r!   r#   r          @defaultr?  	_inv_freq_original_inv_freqr  )5rN   _init_weights
isinstancer*  initones_rJ   r   zeros_r   r{   r/   rP   r   r   rT   copy_r   	constant_r   r   r   r   r  r  r  r  r  r  rq   ry   rp   rz   r   r   rb   r|   r   r   rv   r   Gemma3nTextModelper_layer_projection_scaleper_layer_input_scalerZ   Gemma3nRotaryEmbeddingr<  compute_default_rope_parameters	rope_typer   r@  r>  r  )rU   r(  r   r   r   r   r   r   rv   r?  rope_init_fncurr_inv_freqr   rV   s                r4   r  z$Gemma3nPreTrainedModel._init_weightsi  s   f%f=>JJv}}% 56KK,,-oot+G!4!4!=!=ell3>O!PPLJJv~~w'=>NN6>>6+K+KLJJv55v7\7\7^_ >?NN6--v/H/HI 01KK334NN644dkk6M6Mt6ST EF+5(M=#__1N&*hhu]/CeMFZ/Z&[^a"A_ '# +UYYu||N7SWnVn7n-ooNJJv,,n.B.B.D.N.Nq.Q.[.[\].^_ 01NN6<<d>N>NPT>TUNN677TYYs^9KL 67$00 ^
%EE##J/9<#6v7G7G
7S#TL#/*#U q

76j\+CDmT

76j\9K+LM}]^ 6./NN633T[[5R5RS 0r3   )r+   r,   r-   r%   r1   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendra  r:  _can_record_outputsinput_modalitiesr/   no_gradr  rm   rn   s   @r4   r  r  V  sv    &*#23#4"5N!"&0* 2U]]_%T %Tr3   r  c                        e Zd ZU dZeed<   dZdZdef fdZe	e
dej                  dej                  dee   deez  fd	              Z xZS )
Gemma3nAudioEncoderzx
    An audio encoder based on the [Universal Speech Model](https://huggingface.co/papers/2303.01037) architecture.
    rq   	audio_melr  c                    t         |   |       || _        t        |      | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _
        | j                          y c c}w ra   )rN   rO   rq   rq  subsample_conv_projectionrP   
ModuleListr.  conf_num_hidden_layersr  	conformer	post_init)rU   rq   r   rV   s      r4   rO   zGemma3nAudioEncoder.__init__  se     )LV)T&9>v?\?\9]^A'/^
 	 _s   A=r*   r2  r_   c                 >   | j                  |      }|j                  d   }d}t        t        | j                  j
                              D ]!  }|| j                  j
                  |   d   z  }# t        j                  ||j                        |z  }t        j                  ||j                  d   dz
        }|j                  dkD  r>|j                  dk(  r/|j                  d      j                  |j                  d   d      }n`|j                  |j                  k(  rG|j                  d   dk(  r5|j                  d   dk7  r#||j                  d   k(  r|j                  d      }t        j                  |d|      }	| j                  D ]  }
 |
||	      } | j                  j                  dkD  r@|dddd| j                  j                  f   }|	dddd| j                  j                  f   }	|j!                  |	j                  d      d      }t#        ||	      S )	a  Encodes a batch of MELs.

        Args:
            audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
              mel_bins].

        Returns:
            audio_encodings: a torch.Tensor of shape
                `[batch_size, self.config.audio_soft_tokens_per_image,
                self.config.audio_config.hidden_size]`
            audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
        r#   r   r   )r|   rX   Nr   )last_hidden_stater*   )r  rg   r.  r/  rq   rX  r/   r   r   r6  r   r   r$  gatherr  conf_reduction_factormasked_fillr)   )rU   r  r*   r2  re  t_subtime_stride_productstride_pair_idxindicescurrent_maskblocks              r4   re   zGemma3nAudioEncoder.forward  s   " 88C  %%a($S)J)J%KL 	YO4;;#D#D_#UVW#XX	Y ,,u^-B-BCFYY++g>+?+?+BQ+FG "w||q'8''*11.2F2Fq2I2NG7<</$$Q'1,a A%q)) ''*G||NAw?^^ 	CE#O\BO	C ;;,,q0-a1UDKK4U4U1U.UVO'+Odkk.O.O+O(OPL)55l6L6LR6PRUV--'
 	
r3   )r+   r,   r-   r.   r$   r1   main_input_namer  rO   r   r    r/   rl   r0   r   r   rC   r)   re   rm   rn   s   @r4   r  r    sz     !O1   8
8
7<7G7G8
SYZlSm8
	/	/8
   8
r3   r  c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 	 ddedz  de	d   de
dz  dedz  d	ed
ef   f
d       Z ej                         edd              Z xZS )r  inv_freqNrq   c                 v   t         |           |j                  | _        |j                  | _        || _        t        t        |j                              | _        i | _	        | j                  D ]  }| j
                  j                  |   }||d   | j                  |<   | j                  }| j                  |   dk7  rt        | j                  |      } || j
                  ||      \  }}| j                  | d|d       | j                  | d|j                         d       t        | | d|        y )	Nr  r  r  r  FrL   r  _attention_scaling)rN   rO   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrq   listsetr<  r  rope_parametersr  r   rS   r   setattr)	rU   rq   r   r?  rope_paramsr  r  curr_attention_scalingrV   s	           r4   rO   zGemma3nRotaryEmbedding.__init__  s8   "("@"@$*$B$B!F$6$6 78** 	UJ++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@fak4l1M1  J<y!9=UZ [  J</A!BMDWDWDYfk lDZL(:;=ST	Ur3   r   ztorch.deviceseq_lenr?  r_   ztorch.Tensorc                     | j                   |   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetar{   NrK   r   r!   r   r   )	r  r@  ry   rA  r/   r   int64r   rb   )rq   r   r  r?  baserF   attention_factorr  s           r4   r  z6Gemma3nRotaryEmbedding.compute_default_rope_parameters  s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r3   c                 N   t        | | d      }t        | | d      }|d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d	      5  |j                         |j                         z  j                  dd
      }	t        j                  |	|	fd      }
|
j                         |z  }|
j                         |z  }d d d        j	                  |j                        j	                  |j                        fS # 1 sw Y   AxY w)Nr  r  r   rX   r#   mpscpuF)device_typeenabledr!   r   r   )r@  rb   r$  rg   r   r   r  r   strr   r1  r/   r   r   r   r   )rU   r]   rq  r?  r  attention_scalinginv_freq_expandedposition_ids_expandedr  freqsembr   r   s                r4   re   zGemma3nRotaryEmbedding.forward!  sl    4J<y!9:#DZL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	0&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')//C'')//C		0 vvAGGv$cff177f&;;;	0 	0s   *A1FF$NNr^  ra   )r+   r,   r-   r/   rl   r1   r&   rO   staticmethodr   rj   r  rC   rb   r  r  r   re   rm   rn   s   @r4   r  r    s    llU0 U. +/+/"!%	!*!D(!*(!* t!* $J	!*
 
~u$	%!* !*F U]]_<  <r3   r  zBThe base Gemma 3n language model without a language modeling head.c                       e Zd ZU eed<   dZdef fdZe ed      e		 	 	 	 	 	 	 	 dde
j                  dz  de
j                  dz  d	e
j                  dz  d
e
j                  dz  dedz  de
j                  dz  dedz  de
j                  dz  dee   defd                     Zde
j                  de
j                  fdZ	 dde
j                  de
j                  dz  de
j                  fdZ xZS )r  rq   )r  c           
      |   t         |   |       |j                  | _        |j                  | _        t        |j                  |j                  | j                  | j                  j                  dz        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                         | _        t%        |      | _        d| _        |j                  | _        |j*                  | _        t        |j,                  |j                  |j*                  z  | j                  |j*                  dz        | _        t        j0                  | j                  |j                  |j*                  z  d      | _        t        |j*                  |j                         | _        t        j                  t        d| j                  j6                        D cg c].  }t        j0                  | j                  | j                  d      0 c}      | _        t        j                  t        d| j                  j6                        D cg c].  }t        j0                  | j                  | j                  d      0 c}      | _        | j=                  dt?        j@                  | j                  dz        d	       | j=                  d
t?        jB                  t?        j@                  d            d	       | jE                          y c c}w c c}w c c}w )N      ?)r  r  Frs   r#   r  r   rL   r  r  )#rN   rO   pad_token_idr  
vocab_sizer  ry   rq   embed_tokensrP   r  r.  rK  ra  layersrE   r  r\  r  
rotary_embgradient_checkpointingrj  vocab_size_per_layer_inputembed_tokens_per_layerr   per_layer_model_projectionper_layer_projection_normr  altup_projectionsaltup_unembed_projectionsrS   r/   rT   r7  r  )rU   rq   r  r   rV   s       r4   rO   zGemma3nTextModel.__init__9  s    !.. ++ ;v1143C3CQUQ\Q\QhQhjmQm
 mmINvOgOgIhiI$VY7i
 #6#5#56;N;NO	08&+#!--+1+M+M(&D--$$v'I'II::C?	'
# +-))$$v'I'II+
' *88Z8Z`f`s`s)t&!#PUVWY]YdYdYuYuPvw1RYYt'')9)9Fw"
 *,PUVWY]YdYdYuYuPvw1RYYt'')9)9Fw*
& 	95<<HXHXZ^H^;_lqr4ekk%,,sBS6Tafg 	K j4 x xs   "L/3L413L9F)tie_last_hidden_statesNr  per_layer_inputsr+  rq  r@   inputs_embeds	use_cacherT  r2  r_   c	           	      $   |du |duz  rt        d      |"| j                  |      }| j                  |      }| j                  ||      }|r|t	        | j
                        }|E||j                         nd}
t        j                  |j                  d   |j                        |
z   }||j                  d      }t        |x}t              s*| j
                  |||||d}t        di |t        di |d}|}t        j                   |d	z  d
d      dz  }t        j"                  d      }|g}t%        d| j
                  j&                        D ]  } | j(                  |dz
     |      }|j+                  |j,                  |j                        }t        j                   |d	z  d
d      }t        j.                  t        j0                  ||j+                  |j                                    }||z  |z  }|j3                  |        t        j4                  |d      }i }| j
                  j6                  D ]  }| j9                  |||      ||<    | j:                  d| j
                  j<                   D ]G  }||j>                     }|dddd|j@                  ddf   } ||||j>                     |f||||d|	}I t        j                   |d   d	z  d
d      dz  }|d   g}t%        d| j
                  j&                        D ]  } | jB                  |dz
     ||         }|j+                  |j,                  |j                        }t        j                   |d	z  d
d      }t        j.                  t        j0                  ||j+                  |j                                    }||z  |z  }|j3                  |        t        j4                  |      }t        j                   |d      }| jE                  |      }tG        ||      S )z
        per_layer_inputs (torch.Tensor, *optional*, defaults to None):
            Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
        N:You must specify exactly one of input_ids or inputs_embedsrq   r   r#   r   )rq   r  r+  rT  r@   rq  )full_attentionr=  r!   rX   Tr2  r  gh㈵>r  r   )r+  rq  r@   rT  )r  r@   r2   )$r  r  get_per_layer_inputsproject_per_layer_inputsr   rq   get_seq_lengthr/   r   rg   r   r   r  dictr   r   r\   rT   r.  r  r  r   r   rZ   maximumrt  stackr<  r  r  rK  rc  r  r  r\  r   )rU   r  r  r+  rq  r@   r  r  rT  r2  past_seen_tokenscausal_mask_mappingmask_kwargshidden_states_0target_magnitudeepsilon_tensortemp_hidden_statesr|  
altup_projcurrent_hidden_statenew_magnituderA   rS  r?  decoder_layercausal_maskrp  altup_unemb_projs                               r4   re   zGemma3nTextModel.forwardj  s3   & -t";<YZZ  --i8M#88C88HXY0*$++>O!CRC^==?de"\\-*=*=a*@I]I]^aqqN)33A6L ?-F ++!."0"0#2 ,K #5"C{"C%F%U%U# ( !::oq&8b$OSVVd+-.q$++667 	<A6//A6GJ#-==7L7LUeUlUl=#m !JJ';Q'>BPTUM!JJu}}]NDUDUVfVmVmDn'opM#7:J#J]#Z %%&:;	< $6A> ++11 	gJ.2oom\[e.f
+	g "[[)H4;;+H+HI 	M-m.J.JKK.q!]5L5La/OPO)#M$@$@A	  +) /-	 	M		  !::mA&6!&;TRVYY+A./q$++667 	<A-RT-K-KAPQE-RS`abSc-d#3#6#6_=R=R[k[r[r#6#s !JJ';Q'>BPTUM!JJu}}]NDUDUVfVmVmDn'opM#7:J#J]#Z %%&:;	< $67

=a8		-0&++
 	
r3   c                      | j                  |      j                  g |j                  | j                  j                  | j
                   S ra   )r  r   rg   rq   rK  rj  )rU   r  s     r4   r  z%Gemma3nTextModel.get_per_layer_inputs  sP    =t**95== 
__
KK))
 ,,
 	
r3   c                    | j                  |      }|| j                  j                  |j                  |j                        z  } |j
                  g |j                  d d | j                  j                  | j                   }| j                  |      }||S |j                  |j                  k7  r |dd | j                  j                  d d f   }||z   | j                  j                  |j                  |j                        z  S )Nr  rX   .)r  r  r   r   r   r   rg   rq   rK  rj  r  r  )rU   r  r  rn  s       r4   r  z)Gemma3nTextModel.project_per_layer_inputs  s.   
 .2-L-L]-[ ? ? B B%%.B.I.I !C !
 	
  <3;;  
  "% 
KK)) 
 ,, 

  $==>RS#''%%)9)?)??/5Tt{{7T7T5TVW0WX$'774;U;U;X;X%%.B.I.I <Y <
 
 	
r3   )NNNNNNNNra   )r+   r,   r-   r&   r1   r  rO   r   r    r   r/   r_  rl   r
   r;   rk   r   r   r   re   r  r  rm   rn   s   @r4   r  r  4  so    /0 /b  E2 .204.204(,26!%26l
##d*l
  ,,-l
 t+	l

 &&-l
 l
 ((4/l
 $;l
 ((4/l
 +,l
 
!l
  3  l
\
e.>.> 
5<< 
 15
||
  ,,-
 
	
r3   r  z?The base Gemma 3n language model with a language modeling head.c                   |    e Zd ZU ddiZddiZddgdgfiZeed<   dd	iZdef fd
Z	e
e	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                   dz  dej                  dz  dedz  dej                  dz  deej                  z  dee   defd              Z xZS )Gemma3nForCausalLMlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrA   r?   rq   zmodel.language_modelr  c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y r  )
rN   rO   r  r  r  rP   r   ry   r  r  r  s     r4   rO   zGemma3nForCausalLM.__init__  sU     %f-
 ++yy!3!3V5F5FUS 	r3   Nr  r+  rq  r@   r  labelsr  rT  logits_to_keepr2  r_   c
                     | j                   d|||||||d|
}|j                  }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }| j                  j                  G|| j                  j                  z  }t        j                  |      }|| j                  j                  z  }d}| | j                  ||| j                  fi |
}t        |||j                  |j                  |j                        S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3nForCausalLM

        >>> model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```)r  r+  rq  r@   r  r  rT  N)r>   r?   r@   rA   rB   r2   )r  r  r  rj   slicer  rq   final_logit_softcappingr/   r  loss_functionr  r   r@   rA   rB   )rU   r  r+  rq  r@   r  r  r  rT  r  r2  outputsrA   slice_indicesr?   r>   s                   r4   re   zGemma3nForCausalLM.forward  s   B ,64:: 	,
)%+')	,
 	,
  118B>SV8W~ot4]kmA}a,?@A;;..:dkkAAAFZZ'FdkkAAAF%4%%ffdooPPD%#33!//))
 	
r3   )	NNNNNNNNr   )r+   r,   r-   _tied_weights_keys_tp_plan_pp_planr&   r1   _checkpoint_conversion_mappingrO   r   r   r/   r_  rl   r
   r;   rk   rj   r   r   r   re   rm   rn   s   @r4   r	  r	    sH   *,GH23H_-z:;H&<g%F"0   .2.204(,26*.!%26-.=
##d*=
 t+=
 &&-	=

 =
 ((4/=
   4'=
 $;=
 ((4/=
 ell*=
 +,=
 
 =
  =
r3   r	  c                        e Zd ZdZdeez  def fdZ	 	 d
dej                  dz  dej                  dz  dej                  fd	Z xZS )Gemma3nMultimodalEmbedderzQEmbeds token ids or soft tokens for multimodal content into language model space.multimodal_configtext_configc                 r   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        j                  | j                  | j                        | _        t        | j                  | j
                        | _        t        | j                  | j
                        | _        t        j                  | j                  | j                  d      | _        t        | j                  | j
                  d      | _        y )Nr  Frs   )rG   rH   )rN   rO   ry   multimodal_hidden_sizer  rG   vocab_offsetr  text_hidden_sizerP   	Embedding	embeddingrE   hard_embedding_normsoft_embedding_normr   embedding_projectionembedding_post_projection_norm)rU   r  r  rV   s      r4   rO   z"Gemma3nMultimodalEmbedder.__init__T  s    
 	&7&C&C#$11-::+66 + 7 7doot7R7RS#1$2M2MSWS[S[#\ #1$2M2MSWS[S[#\ $&IId.I.I4K`K`gl$m!.<T=R=RX\X`X`mr.s+r3   Nr  r  r_   c                     |du |duz  rt        d      || j                  |      }n/| j                  || j                  z
        }| j	                  |      }| j                  |      }| j                  |      S )a  Embeds token ids or soft tokens for multimodal content into language model space.

        Args:
            input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
                `[vocab_offset, vocab_offset + vocab_size)`.
            inputs_embeds: A torch.Tensor containing the soft tokens to embed.

        Returns:
            A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
        Nr  )r  r%  r#  r   r$  r&  r'  )rU   r  r  emb_normhard_embemb_norm_projs         r4   re   z!Gemma3nMultimodalEmbedder.forwardg  s     -t";<YZZ$//>H~~i$2C2C&CDH//9H11(;22=AAr3   r  )r+   r,   r-   r.   r$   r'   r&   rO   r/   r_  rl   re   rm   rn   s   @r4   r  r  Q  sk    [t-0CCt 't* .2-1B##d*B ||d*B 
	Br3   r  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c            "           e Zd Zi ZdZdef fdZd Zd Ze	 e
d      dej                  d	ee   d
eez  fd              Z	 	 	 	 d!dej$                  dz  dej                  dz  dej                  dz  dej                  dz  fdZe		 	 	 	 	 	 	 	 	 	 	 	 	 	 d"dej$                  dz  dej                  dz  dej                  dz  dej(                  dz  dej(                  dz  dej$                  dz  dedz  dej$                  dz  dej$                  dz  dej                  dz  dej$                  dz  dedz  dedz  dedz  dee   d
ef d       Ze	 e
d      dej(                  dej(                  d	ee   d
eez  fd               Z xZS )#Gemma3nModelFrq   c                 $   t         |   |       t        j                  |j                        | _        |j                  j                  | _        t        j                  |j                        }|| _        |j                  j                  | _	        t        j                  |j                        | _        t        |j                  |j                        | _        t        |j                  |j                        | _        | j                          y )Nr  )rN   rO   r"   from_configvision_configvision_towerr  r  language_modelr  audio_configaudio_towerr  embed_visionembed_audior  )rU   rq   r2  rV   s      r4   rO   zGemma3nModel.__init__  s     %119M9MN ,,77"..f6H6HI,*0*<*<*W*W'$001D1DE5f6J6JFL^L^_4V5H5H&J\J\]r3   c                 6    | j                   j                         S ra   )r2  get_input_embeddingsrh   s    r4   r8  z!Gemma3nModel.get_input_embeddings  s    ""7799r3   c                 :    | j                   j                  |       y ra   )r2  set_input_embeddingsrU   ri  s     r4   r:  z!Gemma3nModel.set_input_embeddings  s    007r3   zOProjects the last hidden state from the vision model into language model space.r5   pixel_valuesr2  r_   c                     | j                   d	|ddd|}|j                  }|j                  |j                  d   | j                  j
                  j                  | j                  j                        j                  ddd      }|| j                  j
                  j                  dz  z  }| j                  |      |_
        |S )
NFT)r<  
do_poolingreturn_dictr   r!   r#   r  r  r2   )r1  r  r   rg   rq   r0  ry   vision_soft_tokens_per_imager   r5  pooler_output)rU   r<  r2  vision_outputsr  s        r4   get_image_featureszGemma3nModel.get_image_features  s     +**sQVdhslrs*<< .55##A&KK%%11KK44
 '!Q
	 	 	T[[66BBCGG'+'8'8GX'8'Y$r3   Nr  r  image_featuresaudio_featuresc           	         || | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  j                  d      }n2|| j                  j                  k(  }|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }|Qt        ||   j                         |j                         k(  d| d|j                  d   |j                  d   z          |j                         }|j                  d      j                  |      j                  |j                        }|Qt        ||   j                         |j                         k(  d| d|j                  d   |j                  d   z          ||fS )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r  rX   z6Image features and image tokens do not match, tokens: z, features: r   r#   z6Audio features and audio tokens do not match, tokens: )r8  r/   rT   rq   image_token_idlongr   allaudio_token_idr4  r   	expand_asr   r   numelrg   )	rU   r  r  rE  rF  special_image_maskspecial_audio_maskn_image_tokensn_audio_tokenss	            r4   get_placeholder_maskz!Gemma3nModel.get_placeholder_mask  sb    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;.4,,.LL!;!;5::VcVjVjk c"g  "+dkk.H.H!H!*dkk.H.H!H+//1/99"=GGVYYZgZnZno%"01779^=Q=Q=SSHHXXdeseyeyz{e|  @N  @T  @T  UV  @W  fW  eX  Y
 ,//1/99"=GGVYYZgZnZno%"01779^=Q=Q=SSHHXXdeseyeyz{e|  @N  @T  @T  UV  @W  fW  eX  Y
 "#555r3   input_featuresr+  input_features_maskrq  r@   token_type_idsrT  r  r  output_attentionsoutput_hidden_states	lm_kwargsc                 	   |du |
duz  rt        d      ||n| j                  j                  }||n| j                  j                  }|} | j	                         |      }
t        j                  |dk\  || j                  k        }t        j                  ||t        j                  |            }| j                  j                  |      }t        j                  || j                  j                  k\  || j                  j                  k        }| j                  j                  | j                  j                  z   dz
  }t        j                  |||      j!                  |
j"                        }| j                  |      }|j!                  |
j"                  |
j$                        }|j'                  d      j)                  |
      }t        j                  |||
      }
|| j                  j                  k\  }| j                  j                  | j                  j                  z   dz
  }t        j                  |||      j!                  |
j"                        }| j                  |      }|j!                  |
j"                  |
j$                        }|j'                  d      j)                  |
      }t        j                  |||
      }
nd}|l| j+                  |d      j,                  }|j!                  |
j"                  |
j$                        }| j/                  ||
|	      \  }}|
j1                  ||      }
|K|H| j3                  || d      } | j,                  }!| j4                  }t        j6                  | j                  dz
  ggt
        j8                  |!j"                  
      }"| j                  |"      }#t        j                  |j'                  d      |#|!      }!|!j:                  \  }$}%}&| j                  j<                  |%z
  }'|#j?                  |$|'|&      }(t        j@                  |!|(fd      }!|!j!                  |
j"                  |
j$                        }!| j/                  ||
|!      \  }})|
j1                  |)|!      }
 | j                  dd|||||
|||d|	d|}*tC        |*jD                  |r|*jF                  nd|*jH                  |*jJ                  |nd|!      S d      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration

        >>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```
        Nr  r   r#   )r  rX   T)r?  )r  rE  r  r   )r  rF  )r  r  r+  rq  r@   r  r  rV  rW  r?  rT  )r  r@   rA   rB   r9   r:   r2   )&r  rq   rV  rW  r8  r/   r  r  r  
zeros_liker2  r  r5  r   r6  r  r   r   r   r   rL  rD  rB  rR  masked_scatterget_audio_featuresr*   rT   rI  rg   audio_soft_tokens_per_imager$  r   r8   r  r@   rA   rB   )+rU   r  r<  rS  r+  rT  rq  r@   rU  rT  r  r  r  rV  rW  rX  per_layer_inputs_maskper_layer_inputs_tokensr  vision_maskdummy_vision_token_idvision_input_idsvision_embedsexpanded_vision_mask
audio_maskdummy_audio_token_idaudio_input_idsaudio_embedsexpanded_audio_maskrE  rN  r   audio_outputsrF  audio_padding_toksaudio_padding_embsaudio_batch_sizeaudio_seq_lenaudio_embed_dimextra_padding_tokensextra_padding_featuresrO  r  s+                                              r4   re   zGemma3nModel.forward  s   b -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	  7D557	BM %*$5$5i1niRVRqRqFq$r!&+kk2GTYTdTdenTo&p##22GGH_`  ++T..;;;YIYIYIfIf=fK %)$5$5$B$BTEVEVEaEa$ade$e!${{;	CXY\\]j]q]qr --8H-IM),,]-A-A=CVCVWM#.#8#8#<#F#F}#U !KK(<m][M #d&6&6&C&CCJ#'#3#3#@#@4CSCSC^C^#^ab#b #kk*iAUVYYZgZnZnoO++o+FL'??=+?+?ATATUL","6"6r":"D"D]"S!KK(;\=YM# #!44\t4TbbN+..}/C/C]EXEXYN$($=$=~ %> %! *889K^\M %*=*I 33NEXDXfj3kM*88N&55J "'!0C/D.EUZZ`n`u`u!v!%!1!1<N!1!O"[[)=)=b)ACUWefN?M?S?S<m_#';;#J#J]#Z %7%>%>?OQegv%w""YY8N'OUVWN+..}/C/C]EXEXYN$($=$=~ %> %!A! *889K^\M%$%% 
-)%+'/!5)
 
 *%777@G33d!//))2>2JPT2@2L
 	
 SW
 	
r3   zPProjects the last hidden state from the audio encoder into language model space.c                 x     | j                   ||fddi|}| j                  |j                        }||_        |S )a0  
        input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
            The tensors corresponding to the input audio.
        input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
            The attention mask for the input audio.
        r?  Tr@  )r4  r6  r  rB  )rU   rS  rT  r2  rj  rh  s         r4   r\  zGemma3nModel.get_audio_featuresu  sV     9I8H8H/9
=A9
EK9
 ''m6U6U'V&2#r3   r^  )NNNNNNNNNNNNNN)r+   r,   r-   r  accepts_loss_kwargsr%   rO   r8  r:  r   r   r/   r;   r   r   rC   r   rD  r_  rR  rl   r
   rk   r8   re   r)   r\  rm   rn   s   @r4   r-  r-    s    &("} :8 !rs'' +, 
+	+	 t , .2263737*6##d**6 ((4/*6 ))D0	*6
 ))D0*6X  .21537.23704(,262626*.!%)-,0O
##d*O
 ''$.O
 ))D0	O

 t+O
 #\\D0O
 &&-O
 O
 ((4/O
 ((4/O
 ((4/O
   4'O
 $;O
  $;O
 #TkO
  ./!O
" 
$#O
 O
b !st #\\ +,	
 
/	/ u r3   r-  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c            %       R    e Zd Zi ZddiZdef fdZd Zd Ze	de
j                  dee   fd	       Zee		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde
j                   d
z  de
j                  d
z  de
j                  d
z  de
j"                  d
z  de
j"                  d
z  de
j                   d
z  ded
z  de
j                   d
z  de
j                   d
z  de
j                  d
z  de
j                   d
z  ded
z  ded
z  ded
z  dee
j"                  z  dee   def"d              Z	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Gemma3nForConditionalGenerationr
  z(model.language_model.embed_tokens.weightrq   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y r  )rN   rO   r-  r  rP   r   r  ry   r  r  r  r  s     r4   rO   z(Gemma3nForConditionalGeneration.__init__  sS     !&)
yy!3!3!?!?ASASA^A^ejkr3   c                 6    | j                   j                         S ra   )r  r8  rh   s    r4   r8  z4Gemma3nForConditionalGeneration.get_input_embeddings  s    zz..00r3   c                 :    | j                   j                  |       y ra   )r  r:  r;  s     r4   r:  z4Gemma3nForConditionalGeneration.set_input_embeddings  s    

''.r3   r<  r2  c                 <     | j                   j                  |fi |S ra   )r  rD  )rU   r<  r2  s      r4   rD  z2Gemma3nForConditionalGeneration.get_image_features  s    ,tzz,,\DVDDr3   Nr  rS  r+  rT  rq  r@   rU  rT  r  r  r  rV  rW  r  rX  r_   c                    ||n| j                   j                  }||n| j                   j                  } | j                  d	|||||||||	|
||||dd|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }| j                   j                         j                  x}||z  }t        j                  |      }||z  }d}|O|j                         }|dddddf   }|dddf   }||dd|j                  d    df   j                  |j                         }||j                  |j                         dk7     j#                         }||j                  |j                         dk7     j#                         }n |j#                         }|j#                         }t%        j&                         }|j)                  d| j                   j*                  j,                        }|j)                  d      j                  |j                         } |||      }t/        |||j0                  |j2                  |j4                  |j6                  |j8                        S )
a  
        input_features_mask (torch.Tensor, *optional*, defaults to None):
            The attention mask for the input audio.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in
            `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenizer=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        NT)r  r<  rS  r+  rT  rq  r@   rU  rT  r  r  r  rV  rW  r?  .rX   r#   r   )r>   r?   r@   rA   rB   r9   r:   r2   )rq   rV  rW  r  r  r  rj   r  r  get_text_configr  r/   r  rb   rg   r   r   r   rP   CrossEntropyLossr  r  r  r=   r@   rA   rB   r9   r:   )rU   r  r<  rS  r+  rT  rq  r@   rU  rT  r  r  r  rV  rW  r  rX  r  rA   r  r?   r  r>   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                                r4   re   z'Gemma3nForConditionalGeneration.forward  s   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	 $** 
%)) 3%+))'/!5
  !
&  118B>SV8W~ot4]kmA}a,?@A'+{{'B'B'D'\'\\#i55FZZ'F55F\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5D,#33!//)) ' ; ; ' ; ;
 	
r3   c                 b    t        |   |f||||||||
|d	|}|s|s||d<   ||d<   |	|d<   |S )N)	r@   r  r+  rq  rT  r  r  rU  is_first_iterationr<  rS  rT  )rN   prepare_inputs_for_generation)rU   r  r@   r  rT  rq  r<  rS  r+  rT  rU  r  r  r  r  r2  model_inputsrV   s                    r4   r  z=Gemma3nForConditionalGeneration.prepare_inputs_for_generation,	  sn    & w<
+')%)))1
 
" Y+7L(-;L)*2EL./r3   )NNNNNNNNNNNNNNr   )NNNNNNNNNTNNF)r+   r,   r-   r  r  r%   rO   r8  r:  r   r/   r;   r   r   rD  r   r_  rl   r
   rk   rj   r=   re   r  rm   rn   s   @r4   ru  ru    s"    &("*,VW} 1/ Eu/@/@ EFSeLf E E  .21537.23704(,262626*.!%)-,0-.!B
##d*B
 ''$.B
 ))D0	B

 t+B
 #\\D0B
 &&-B
 B
 ((4/B
 ((4/B
 ((4/B
   4'B
 $;B
  $;B
 #TkB
  ell*!B
" ./#B
$ 
'%B
  B
N   ) )r3   ru  )r  r	  ru  r-  r  r  )r   NN)r#   )cr   collections.abcr   r   dataclassesr   typingr   r/   torch.nnrP   torch.nn.functionalr   rj   r   r  r  r	   cache_utilsr
   r   
generationr   integrationsr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr    autor"   configuration_gemma3nr$   r%   r&   r'   r)   r8   r=   ModulerE   rp   r   r*  rL  rq  r  r  r  r  r"  r  r  r  r  r!  rl   rj   r'  rb   rC   r5  r8  r:  ra  r  r  r  r  r	  r  r-  ru  __all__r2   r3   r4   <module>r     s  *  . !      & ! . ) / R 9 k k K F &  H 5  l l 3%? 3  3 
9!8 9 9( 
9K 9 9:=RYY =6g)BII g)TaBII aHj,bii j,ZB7		 B7JF")) FRORYY O8Dryy D2(ryy (V 6SR\\ S;RYY ;$#5RYY #5L`'ryy `'F(	UU\\ 	U# 	U%,, 	U$   %II%<<% 
% <<	%
 LL4'% % T\% T\% 5<<%&%D.ELL .u|| .%,, ._b ., )*t)299 t) +t)nJ%8 J%Z 8T_ 8T 8TvN
0 N
bN<RYY N<b abF
- F
 cF
R ^_O
/ O
 `O
d/B		 /Bd @) @@F C&<o CCLr3   