
    qi                        d dl mZ d dlmZ d dlmZ d dlZd dlmZ ddl	m
Z ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZmZ ddlmZmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z(m)Z) ddl*m+Z+ ddl,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4 ddl5m6Z6m7Z7 ddl8m9Z9 ddl:m;Z; ddl<m=Z=m>Z>  e1j~                  e@      ZAe e/d       G d de                     ZBe e/d       G d d e-                    ZC G d! d"ej                        ZE G d# d$ej                        ZG G d% d&ej                        ZH G d' d(ej                        ZId) ZJ ed*      d`d+       ZKd,ej                  d-eMd.ej                  fd/ZN	 	 	 dad0ej                  d1ej                  d2ej                  d3ej                  d4ej                  dz  d5eOd6eOdz  d7eOdz  d.ePej                  ej                  f   fd8ZQ eeK       G d9 d:ej                               ZR G d; d<e      ZSe/ G d= d>e)             ZTd?eMd.eeMeMeMeMgeUf   fd@ZVe/ G dA dBeT             ZWe/ G dC dDeTe             ZX G dE dFej                        ZYdGej                  dz  dHej                  dz  d.edz  fdIZZ e4dJdKdLM      	 	 	 	 dbdNedLej                  d4ej                  dz  dOej                  dPedz  dQej                  dz  dGej                  dz  dRej                  dz  dSeUdTeUdz  d.e\fdU       Z] e/dV       G dW dXeT             Z^ e/dV       G dY dZeTe             Z_ G d[ d\eT      Z` G d] d^eeT      Zag d_Zby)c    )Callable)	dataclass)OptionalN   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)GenerationMixin)use_kernel_func_from_hubuse_kernelized_func)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)deprecate_kwarg)maybe_autocastmerge_with_config_defaults)capture_outputs   )	AutoModel   )Gemma3ConfigGemma3TextConfigzK
    Base class for Gemma3 outputs, with hidden states and attentions.
    custom_introc                   :    e Zd ZU dZdZej                  dz  ed<   y)Gemma3ModelOutputWithPasta  
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)__name__
__module____qualname____doc__r0   torchFloatTensor__annotations__     \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/gemma3/modeling_gemma3.pyr/   r/   8   s     59**T18r9   r/   zR
    Base class for Gemma3 causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	Gemma3CausalLMOutputWithPasta8  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr0   )r1   r2   r3   r4   r=   r5   r6   r7   r>   r?   r	   r@   tuplerA   r0   r8   r9   r:   r<   r<   H   s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r9   r<   c            	       Z     e Zd ZdZd	dedededef fdZdej                  f fdZ	 xZ
S )
Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scalec                     t         |   |||       || _        | j                  dt	        j
                  |      d       y )NrH   F
persistent)super__init__scalar_embed_scaleregister_bufferr5   tensor)selfrE   rF   rG   rH   	__class__s        r:   rM   z&Gemma3TextScaledWordEmbedding.__init__k   s;    D"-]ELL,ERWXr9   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  S N)rL   forwardrH   toweightdtype)rQ   rS   rR   s     r:   rV   z%Gemma3TextScaledWordEmbedding.forwardp   s2    wy)D,<,<,?,?@Q@Q,RRRr9   )      ?)r1   r2   r3   r4   intfloatrM   r5   TensorrV   __classcell__rR   s   @r:   rD   rD   f   sG    Ys Y3 YS Y_d Y
S S Sr9   rD   c                   *     e Zd Zdef fdZd Z xZS )	Gemma3MLPconfigc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        y NFbias)rL   rM   rb   hidden_sizeintermediate_sizennLinear	gate_projup_proj	down_projr   hidden_activationact_fnrQ   rb   rR   s     r:   rM   zGemma3MLP.__init__u   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556r9   c                     | j                  | j                  | j                  |            | j                  |      z        }|S rU   )rm   ro   rk   rl   )rQ   xrm   s      r:   rV   zGemma3MLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r9   )r1   r2   r3   r+   rM   rV   r^   r_   s   @r:   ra   ra   t   s    7/ 7r9   ra   c                   <     e Zd Zddedef fdZd Zd Zd Z xZ	S )Gemma3RMSNormdimepsc                     t         |           || _        t        j                  t        j                  |            | _        y rU   )rL   rM   rv   ri   	Parameterr5   zerosrX   )rQ   ru   rv   rR   s      r:   rM   zGemma3RMSNorm.__init__   s.    ll5;;s#34r9   c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S )Nr'   T)keepdim)r5   rsqrtpowmeanrv   )rQ   rr   s     r:   _normzGemma3RMSNorm._norm   s4    5;;quuQx}}R}>IJJJr9   c                     | j                  |j                               }|d| j                  j                         z   z  }|j                  |      S )NrZ   )r   r\   rX   type_as)rQ   rr   outputs      r:   rV   zGemma3RMSNorm.forward   sC    AGGI& 3!2!2!445~~a  r9   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)rB   rX   shaperv   rQ   s    r:   
extra_reprzGemma3RMSNorm.extra_repr   s'    ))*+6$((<<r9   )gư>)
r1   r2   r3   r[   r\   rM   r   rV   r   r^   r_   s   @r:   rt   rt      s&    5C 5e 5
K!=r9   rt   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 	 ddedz  de	d   de
dz  dedz  d	ed
ef   f
d       Z ej                         edd              Z xZS )Gemma3RotaryEmbeddinginv_freqNrb   c                 v   t         |           |j                  | _        |j                  | _        || _        t        t        |j                              | _        i | _	        | j                  D ]  }| j
                  j                  |   }||d   | j                  |<   | j                  }| j                  |   dk7  rt        | j                  |      } || j
                  ||      \  }}| j                  | d|d       | j                  | d|j                         d       t        | | d|        y )	N	rope_typedefault
layer_type	_inv_freqFrJ   _original_inv_freq_attention_scaling)rL   rM   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrb   listsetlayer_typesr   rope_parameterscompute_default_rope_parametersr   rO   clonesetattr)	rQ   rb   devicer   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalingrR   s	           r:   rM   zGemma3RotaryEmbedding.__init__   s8   "("@"@$*$B$B!F$6$6 78** 	UJ++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@fak4l1M1  J<y!9=UZ [  J</A!BMDWDWDYfk lDZL(:;=ST	Ur9   r   ztorch.deviceseq_lenr   returnztorch.Tensorc                     | j                   |   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNrZ   r   r'   rY   r   rY   )	r   getattrrg   num_attention_headsr5   arangeint64rW   r\   )rb   r   r   r   baseru   attention_factorr   s           r:   r   z5Gemma3RotaryEmbedding.compute_default_rope_parameters   s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r9   c                 N   t        | | d      }t        | | d      }|d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d	      5  |j                         |j                         z  j                  dd
      }	t        j                  |	|	fd      }
|
j                         |z  }|
j                         |z  }d d d        j	                  |j                        j	                  |j                        fS # 1 sw Y   AxY w)Nr   r   r   r{   r)   mpscpuF)device_typeenabledr'   ru   r   )r   r\   expandr   rW   r   
isinstancetypestrr$   	transposer5   catcossinrY   )rQ   rr   position_idsr   r   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   s                r:   rV   zGemma3RotaryEmbedding.forward   sl    4J<y!9:#DZL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	0&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')//C'')//C		0 vvAGGv$cff177f&;;;	0 	0s   *A1FF$)NNNNNNrU   )r1   r2   r3   r5   r]   r7   r+   rM   staticmethodr   r[   r   rB   r\   r   no_gradr   rV   r^   r_   s   @r:   r   r      s    llU/ U. *.+/"!%	!* 4'!*(!* t!* $J	!*
 
~u$	%!* !*F U]]_<  <r9   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr{   r'   r   )r   r5   r   )rr   x1x2s      r:   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r9   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embeds          r:   apply_rotary_pos_embr      sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGr9   r@   n_repr   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r)   N)r   r   reshape)r@   r   batchnum_key_value_headsslenr   s         r:   	repeat_kvr   
  so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr9   modulequerykeyvalueattention_maskdropoutscalingsoftcapc                 |   || j                   dz  }t        || j                        }	t        || j                        }
t        j                  ||	j                  dd            |z  }|||z  }t        j                  |      }||z  }|||z   }t        j                  j                  |dt        j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||
      }|j                  dd      j                         }||fS )N      r'   r   r{   )ru   rY   )ptrainingr)   )r   r   num_key_value_groupsr5   matmulr   tanhri   
functionalsoftmaxfloat32rW   rY   r   r   
contiguous)r   r   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightsattn_outputs                r:   eager_attention_forwardr     s    //4'3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL#g-zz,/#g-!#n4 ==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r9   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  dej                  dej                  dz  d	e	dz  d
ej                  dz  dee   deej                  ej                  dz  eej                     dz  f   fdZ xZS )Gemma3Attentionz=Multi-headed attention from 'Attention Is All You Need' paperrb   	layer_idxc                     t         |           t        |d      r|j                  |   nd | _        || _        || _        t        |d|j                  |j                  z        | _
        |j                  |j                  z  | _        |j                  dz  | _        | j
                  j                  | _        | j
                  j                    | _        t%        j&                  |j                  |j                  | j                  z  |j(                        | _        t%        j&                  |j                  |j                  | j                  z  |j(                        | _        t%        j&                  |j                  |j                  | j                  z  |j(                        | _        t%        j&                  |j                  | j                  z  |j                  |j(                        | _        | j
                  j2                  | _        | j                  dk(  r|j4                  nd | _        | j                  dk(  | _        t9        |j                  |j:                        | _        t9        |j                  |j:                        | _        y )Nr   r   r   re   sliding_attention)ru   rv   ) rL   rM   hasattrr   r   rb   r   r   rg   r   r   r   r   query_pre_attn_scalarr   attention_dropoutuse_bidirectional_attention	is_causalri   rj   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_window
is_slidingrt   rms_norm_epsq_normk_normrQ   rb   r   rR   s      r:   rM   zGemma3Attention.__init__<  s   ;B6=;Y&,,Y7_c"
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>![[DDDii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+kk&H&H#7;J]7]f33cg//-@@#V=P=PQ#V=P=PQr9   Nr@   position_embeddingsr   r?   cache_positionr   r   c                 r   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }| j                  |	      }	| j                  |
      }
|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t               } || |	|
||f| j"                  r| j$                  nd| j&                  | j(                  d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )Nr{   r)   r'   )r   r   r
          )r   r   r  )r   r   r   viewr   r   r   r  r  r   updater   r   get_interfacerb   _attn_implementationr   r   r   r   r  r   r   r  )rQ   r@   r	  r   r?   r
  r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r:   rV   zGemma3Attention.forwardZ  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((r9   r   )r1   r2   r3   r4   r+   r[   rM   r5   r]   r	   
LongTensorr   r   rB   rV   r^   r_   s   @r:   r   r   8  s    GR/ RC RB -1.2(,26-)||-) #\\-) t+	-)
 -) ((4/-) +,-) 
u||U\\D0%2E2LL	M-)r9   r   c                   4    e Zd Zdedef fdZ	 	 	 	 	 ddej                  dej                  dej                  dz  dej                  dz  d	e	dz  d
ej                  dz  de
e   deej                  eej                  ej                  f   dz  f   fdZ xZS )Gemma3DecoderLayerrb   r   c                    t         |           || _        |j                  | _        || _        |j
                  |   | _        t        ||      | _        t        |      | _
        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        y )N)rb   r   rv   )rL   rM   rb   rg   r   r   attention_typer   	self_attnra   mlprt   r  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr  s      r:   rM   zGemma3DecoderLayer.__init__  s    !--"$00;()LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'r9   Nr@   r	  r   r   r?   r
  r   r   c           
         |}| j                  |      } | j                  d||||||d|\  }}	| j                  |      }||z   }|}| j                  |      }| j	                  |      }| j                  |      }||z   }|S )N)r@   r	  r   r   r?   r
  r8   )r  r  r  r   r  r!  )
rQ   r@   r	  r   r   r?   r
  r   residual_s
             r:   rV   zGemma3DecoderLayer.forward  s     !,,];)4>> 
' 3)%+)
 
q 55mD =0 66}E/77F =0r9   )NNNNN)r1   r2   r3   r+   r[   rM   r5   r]   r  r	   r   r   rB   r6   rV   r^   r_   s   @r:   r  r    s    c/ cC c  -1.204(,26 ||  #\\  t+	 
 &&-    ((4/  +,  
u  %(9(95;L;L(L"MPT"TT	U r9   r  c                        e Zd ZU eed<   dZdZg dZdgZdZ	dZ
dZdZdZeedZdZ ej&                          fd       Z xZS )	Gemma3PreTrainedModelrb   modelT)r  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadr?   )r@   rA   )imagetextc                    t         |   |       t        |t              r t	        j
                  |j                         y d|j                  j                  v r t	        j
                  |j                         y t        |t              r+t	        j                  |j                  |j                         y t        |t              r|j                  D ]  }|j                   }|j"                  |   dk7  rt$        |j"                  |      } ||j&                  |      \  }}t	        j(                  t+        || d      |       t	        j(                  t+        || d      |        y y )NRMSNormr   r   r   r   )rL   _init_weightsr   Gemma3MultiModalProjectorinitzeros_mm_input_projection_weightrR   r1   rX   rD   	constant_rH   rN   r   r   r   r   r   rb   copy_r   )rQ   r   r   r   r   r$  rR   s         r:   r/  z#Gemma3PreTrainedModel._init_weights  s   f%f78KK99:&**333KK& =>NN6--v/H/HI 56$00 ^
%EE##J/9<#6v7G7G
7S#TL#/*#U q

76j\+CDmT

76j\9K+LM}]^ 7r9   )r1   r2   r3   r*   r7   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr  r   _can_record_outputsinput_modalitiesr5   r   r/  r^   r_   s   @r:   r&  r&    sw    &*# $5"5N!"&+% )U]]_^ ^r9   r&  r  c           
      P     dt         dt         dt         dt         dt        f
 fd}|S )zA
    Enables a bidirectional mask within the sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c                 &    t        ||z
        k  S )zA token can attend to any other token if their absolute distance is within
        the (exclusive) sliding window size (distance < sliding_window).)abs)rB  rC  rD  rE  r  s       r:   
inner_maskz1_bidirectional_window_overlay.<locals>.inner_mask  s     56>"^33r9   r[   bool)r  rH  s   ` r:   _bidirectional_window_overlayrK    s3    
4c 4S 4 4c 4d 4
 r9   c                   "    e Zd ZU eed<   dZdef fdZeee		 	 	 	 	 	 	 dde
j                  dz  de
j                  dz  de
j                  dz  dedz  d	e
j                  dz  d
edz  de
j                  dz  dee   defd                     Z xZS )Gemma3TextModelrb   r,  c           	      (   t         |   |       |j                  | _        |j                  | _        t        |j                  |j                  | j                  | j                  j                  dz        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                         | _        t%        |      | _        d| _        | j+                          y c c}w )N      ?)rH   r  F)rL   rM   pad_token_idrG   
vocab_sizerD   rg   rb   embed_tokensri   
ModuleListrangenum_hidden_layersr  layersrt   r  normr   
rotary_embgradient_checkpointing	post_initr  s      r:   rM   zGemma3TextModel.__init__  s     !.. ++ :v1143C3CQUQ\Q\QhQhjmQm
 mmDI&JbJbDcdy	2d
 "&"4"4&:M:MN	/7&+# 	 es   "DNrS   r   r   r?   inputs_embeds	use_cacher
  r   r   c           
         |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        |x}
t              sx| j                  |||||d}|j                         }| j                  j                  r(d |d<   t        | j                  j                        |d<   t!        di |t#        di |d	}
|}i }| j                  j$                  D ]  }| j'                  |||      ||<    | j(                  d | j                  j*                   D ]+  } ||f|
|j,                     ||j,                     |||d
|}- | j/                  |      }t1        ||      S )N:You must specify exactly one of input_ids or inputs_embedsrb   r   r)   r   rb   r\  r   r
  r?   r   c                  L    t        j                  dt         j                        S )NTr   )r5   rP   rJ  )argss    r:   <lambda>z)Gemma3TextModel.forward.<locals>.<lambda>:  s    TY^YcYc@d r9   or_mask_function)full_attentionr   )r   r	  r   r?   r
  )last_hidden_stater?   r8   )
ValueErrorrS  r
   rb   get_seq_lengthr5   r   r   r   r   r   dictcopyr   rK  r  r   r   r   rY  rW  rV  r  rX  r   )rQ   rS   r   r   r?   r\  r]  r
  r   past_seen_tokenscausal_mask_mappingmask_kwargssliding_mask_kwargsr@   r	  r   decoder_layers                    r:   rV   zGemma3TextModel.forward  s    -t";<YZZ  --i8M0*$++>O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L ?-F ++!."0"0#2 ,K #."2"2"4{{662d./:WX\XcXcXrXr:s#$67 #5"C{"C%F%]I\%]# & ++11 	gJ.2oom\[e.f
+	g "[[)H4;;+H+HI 		M)2=3O3OP$78T8T$U) /- M		 		-0&++
 	
r9   )NNNNNNN)r1   r2   r3   r+   r7   r@  rM   r%   r&   r   r5   r  r]   r	   r6   rJ  r   r   r   rV   r^   r_   s   @r:   rM  rM    s     / &   .2.204(,26!%26J
##d*J
 t+J
 &&-	J

 J
 ((4/J
 $;J
 ((4/J
 +,J
 
!J
    J
r9   rM  c                   t    e Zd ZU ddiZddiZddgdgfiZeed<   def fdZe	e
	 	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  ded	z  dej                  d	z  dej                  d	z  ded	z  dej                  d	z  deej                  z  dee   defd              Z xZS )Gemma3ForCausalLMlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputr@   r>   rb   c                     t         |   |       t        |      | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y rd   )
rL   rM   rM  r'  rR  ri   rj   rg   ru  r[  rp   s     r:   rM   zGemma3ForCausalLM.__init__c  sU     $V,
 ++yy!3!3V5F5FUS 	r9   NrS   r   r   r?   r\  labelsr]  r
  logits_to_keepr   r   c
                     | j                   d|||||||d|
}|j                  }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }| j                  j                  G|| j                  j                  z  }t        j                  |      }|| j                  j                  z  }d}| | j                  ||| j                  fi |
}t        |||j                  |j                  |j                        S )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3ForCausalLM

        >>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```)rS   r   r   r?   r\  r]  r
  Nr=   r>   r?   r@   rA   r8   )r'  rh  r   r[   sliceru  rb   final_logit_softcappingr5   r   loss_functionrR  r   r?   r@   rA   )rQ   rS   r   r   r?   r\  rx  r]  r
  ry  r   outputsr@   slice_indicesr>   r=   s                   r:   rV   zGemma3ForCausalLM.forwardl  s   B ,64:: 	,
)%+')	,
 	,
  118B>SV8W~ot4]kmA}a,?@A;;..:dkkAAAFZZ'FdkkAAAF%4%%ffdooPPD%#33!//))
 	
r9   )	NNNNNNNNr   )r1   r2   r3   _tied_weights_keys_tp_plan_pp_planr+   r7   rM   r    r   r5   r  r]   r	   r6   rJ  r[   r   r   r   rV   r^   r_   s   @r:   rs  rs  \  s=   *,GH23H_-z:;H/   .2.204(,26*.!%26-.=
##d*=
 t+=
 &&-	=

 =
 ((4/=
   4'=
 $;=
 ((4/=
 ell*=
 +,=
 
 =
  =
r9   rs  c                   D     e Zd Zdef fdZdej                  fdZ xZS )r0  rb   c                    t         |           t        j                  t	        j
                  |j                  j                  |j                  j                              | _	        t        |j                  j                  |j                  j                        | _        t        |j                  j                  |j                  j                  z        | _        t        |j"                  dz        | _        | j                   | j$                  z  | _        t        j(                  | j&                  | j&                        | _        y )Nr  rP  )kernel_sizestride)rL   rM   ri   rx   r5   ry   vision_configrg   text_configr3  rt   layer_norm_epsmm_soft_emb_normr[   
image_size
patch_sizepatches_per_imagemm_tokens_per_imagetokens_per_sider  	AvgPool2davg_poolrp   s     r:   rM   z"Gemma3MultiModalProjector.__init__  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[r9   vision_outputsc                    |j                   \  }}}|j                  dd      }|j                  ||| j                  | j                        }|j	                         }| j                  |      }|j                  d      }|j                  dd      }| j                  |      }t        j                  || j                        }|j                  |      S )Nr)   r'   )r   r   r   r  r   r  flattenr  r5   r   r3  r   )	rQ   r  
batch_sizer$  rg   reshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            r:   rV   z!Gemma3MultiModalProjector.forward  s    %3%9%9"
A{"0":":1a"@"9"A"AT%;%;T=S=S#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EtGfGf#g '//??r9   )	r1   r2   r3   r*   rM   r5   r]   rV   r^   r_   s   @r:   r0  r0    s#    \| \ @ell @r9   r0  token_type_idsimage_group_idsc           
      Z      ydt         dt         dt         dt         dt        f
 fd}|S )z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    NrB  rC  rD  rE  r   c                 :   t        j                  |j                  d   k  |d      }t        j                  |j                  d   k  |d      }| |f   }t        j                  |j                  d   k  |d      }| |f   }t        j                  |j                  d   k  |d      }| |f   }t        j                  |j                  d   k  |d      }| |f   }	t        j                  |j                  d   k  |	d      }	|dk(  |dk(  z  }
||	k(  }|
|z  S )Nr)   r   r{   )r5   wherer   )rB  rC  rD  rE  
safe_q_idxsafe_kv_idxtoken_type_ids_at_q_idxtoken_type_ids_at_kv_idximage_group_ids_at_q_idximage_group_ids_at_kv_idxis_image_blocksame_image_blockr  r  s               r:   rH  z0token_type_ids_mask_function.<locals>.inner_mask  sM    [[)=)=a)@!@%K
kk&>+?+?+B"BFAN"0J1F"G"'++en6J6J16M.MOfhi"j#1)[2H#I #(;;v8L8LQ8O/OQikl#m #29j3H#I #(;;u7L7LQ7O/OQikm#n $3I{4J$K!$)KK9N9Nq9Q0QSlnp$q!1Q6;SWX;XY37PP  000r9   rI  )r  r  rH  s   `` r:   token_type_ids_mask_functionr    s>     1c 1S 1 1c 1d 12 r9   input_embeds5.6.0r\  versionnew_namerb   r
  r?   r   pixel_valuesis_trainingis_first_iterationc
                    |r|t        d      | j                         |||||d}|	|	n|du xs |j                   xs |du}	||	r|dk(  j                  |j                        }t
        j                  j                  |dd      ddddf   }|| z  }t        j                  |j                         d	      dz
  }t        j                  ||d      }t        |j                  |j                        |      |d
<   t        di |S )a  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Gemma3 uses a bidirectional mask for images.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz;`token_type_ids` is required as a model input when trainingrb  r)   )r)   r   r   )r   r{   r   rf  r8   )ri  get_text_configis_initializedrW   r   ri   r   padr5   cumsumr[   r  r  r   )rb   r\  r   r
  r?   r   r  r  r  r  r   ro  is_imageis_previous_imagenew_image_startr  s                   r:   create_causal_mask_mappingr    s4   ( ~-VWW ((*&((*$K ) 	%g_-K-K)Kg|cgOg 
 !&8 #a'++N,A,ABMM--ha-HCRCP"&7%77,,':':'<!DqH++hD*Fn334o+
&' %3{33r9   zy
    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.,
    c                   ,    e Zd ZddiZdZdef fdZd Zd Ze	 e
d	      d
ej                  dee   deez  fd              Zdej$                  dej                  dej                  fdZe	e
	 	 	 	 	 	 	 	 	 	 ddej$                  dz  d
ej                  dz  dej(                  dz  dej$                  dz  dedz  dej$                  dz  dej$                  dz  dej                  dz  dej$                  dz  dedz  dee   deez  fd              Z xZS )Gemma3Modelzlanguage_model.modellanguage_modelFrb   c                 2   t         |   |       t        j                  |j                        | _        t        |      | _        |j                  j                  | _	        t        j                  |j                        }|| _
        | j                          y )Nr`  )rL   rM   r(   from_configr  vision_towerr0  multi_modal_projectorr  rR  r  r[  )rQ   rb   r  rR   s      r:   rM   zGemma3Model.__init__=  sq     %119M9MN%>v%F" ,,77"..f6H6HI,r9   c                 6    | j                   j                         S rU   )r  get_input_embeddingsr   s    r:   r  z Gemma3Model.get_input_embeddingsG  s    ""7799r9   c                 :    | j                   j                  |       y rU   )r  set_input_embeddingsrQ   r   s     r:   r  z Gemma3Model.set_input_embeddingsJ  s    007r9   zOProjects the last hidden state from the vision model into language model space.r,   r  r   r   c                 t     | j                   d|dd|}|j                  }| j                  |      |_        |S )NT)r  return_dictr8   )r  rh  r  pooler_output)rQ   r  r   r  rh  s        r:   get_image_featureszGemma3Model.get_image_featuresM  sH    
 +**aRVaZ`a*<<'+'A'ABS'T$r9   rS   r\  image_featuresc                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )rY   r   r{   r   r)   z6Image features and image tokens do not match, tokens: z, features: )r  r5   rP   rb   image_token_idlongr   allsumr   r   	expand_asrW   r"   numel)rQ   rS   r\  r  special_image_maskn_image_tokensn_image_featuress          r:   get_placeholder_maskz Gemma3Model.get_placeholder_maskX  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r9   Nr   r   r?   r  r
  rx  r]  	lm_kwargsc                    |du |duz  rt        d      |R| j                  j                  | j                  k\  r/|| j                  j                  k(  }|j	                         }d||<   n|}| | j                         |      }|F||j                         nd}t        j                  |||j                  d   z   |j                        }|i| j                  |d      j                  }|j                  |j                  |j                        }| j                  |||      }|j!                  ||      }t#        |x}t$              s(t'        | j                  |||||||| j(                  		      } | j*                  d|||||
d|d
|}t-        |j.                  |j0                  |j2                  |j4                  |      S d      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma32-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma32-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nr_  r   r)   ra  T)r  )r\  r  )r  )r   r   r?   r\  r]  r  r
  )rh  r?   r@   rA   r0   r8   )ri  rb   r  rR  r   r  rj  r5   r   r   r   r  r  rW   rY   r  masked_scatterr   rk  r  r   r  r/   rh  r?   r@   rA   )rQ   rS   r  r   r   r?   r  r
  r\  rx  r]  r  r  llm_input_idsrm  r  rn  r  s                     r:   rV   zGemma3Model.forwardp  s   Z -t";<YZZ  T[[%?%?4??%R!*dkk.H.H!H%OO-M01M,-%M 7D557FM!CRC^==?de"\\ "2]5H5H5K"KTaThThN
 #!44\t4TbbN+..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M ?-F"< MM
# &$%% 	
.%+')	
 	
 )%77#33!//))2>2J
 	

 QU
 	
r9   )
NNNNNNNNNN)r1   r2   r3   _checkpoint_conversion_mappingaccepts_loss_kwargsr*   rM   r  r  r    r   r5   r6   r   r   rB   r   r  r  r  r]   r	   rJ  r/   rV   r^   r_   s   @r:   r  r  3  s    '=>N%O"| :8 !rs!--9?@R9S	+	+ t "))":?:K:K"]b]n]n"0  .215.204(,262626*.!%g
##d*g
 ''$.g
 t+	g

 &&-g
 g
 ((4/g
 ((4/g
 ((4/g
   4'g
 $;g
 ./g
 
*	*g
  g
r9   r  c                       e Zd ZdddddZddiZdZd	ef fd
Zd Zd Z	e
dej                  dee   fd       Zee
	 	 	 	 	 	 	 	 	 	 	 d$dej"                  dz  dej                  dz  dej$                  dz  dej"                  dz  dedz  dej"                  dz  dej"                  dz  dej                  dz  dej"                  dz  dedz  deej$                  z  dee   deez  fd              Z	 	 	 	 	 	 	 	 	 	 	 d% fd	Ze edd d!      	 	 d&d	edej$                  dej$                  dz  dej$                  dedz  dej$                  dz  dej$                  dz  d"edz  defd#              Z xZS )'Gemma3ForConditionalGenerationmodel.language_modelmodel.vision_towermodel.multi_modal_projectorru  )^language_model.model^vision_tower^multi_modal_projectorz^language_model.lm_headrt  z(model.language_model.embed_tokens.weightFrb   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y rd   )rL   rM   r  r'  ri   rj   r  rg   rR  ru  r[  rp   s     r:   rM   z'Gemma3ForConditionalGeneration.__init__  sS      (
yy!3!3!?!?ASASA^A^ejkr9   c                 6    | j                   j                         S rU   r'  r  r   s    r:   r  z3Gemma3ForConditionalGeneration.get_input_embeddings      zz..00r9   c                 :    | j                   j                  |       y rU   r'  r  r  s     r:   r  z3Gemma3ForConditionalGeneration.set_input_embeddings      

''.r9   r  r   c                 <     | j                   j                  |fi |S rU   )r'  r  )rQ   r  r   s      r:   r  z1Gemma3ForConditionalGeneration.get_image_features  s    ,tzz,,\DVDDr9   NrS   r   r   r?   r  r
  r\  rx  r]  ry  r  r   c                     | j                   d||||||||
|	|d
|}|d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }d}|	O|j                         }|dddddf   }|	dddf   }||dd|j                  d    df   j                  |j                        }||j                  |j                        dk7     j                         }||j                  |j                        dk7     j                         }n |j                         }|j                         }t        j                         }|j                  d| j                  j                  j                        }|j                  d      j                  |j                        } |||      }t!        |||j"                  |j$                  |j&                  |j(                        S )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        )
rS   r  r  r   r   r?   r\  r]  rx  r
  r   N.r{   r)   )r=   r>   r?   r@   rA   r0   r8   )r'  r   r[   r|  ru  r\   r   rW   r   r   ri   CrossEntropyLossr  rb   r  rR  r<   r?   r@   rA   r0   )rQ   rS   r  r   r   r?   r  r
  r\  rx  r]  ry  r  r  r@   r  r>   r=   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                           r:   rV   z&Gemma3ForConditionalGeneration.forward  s   | $** 
%))%+')
 
  
8B>SV8W~ot4]kmA}a,?@A\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5D+#33!//)) ' ; ;
 	
r9   c                 N    t        |   |f||||||	|
||d	|}|s|	s||d<   |S )N)	r?   r\  r   r   r
  r]  ry  r  r  r  )rL   prepare_inputs_for_generation)rQ   rS   r?   r\  r
  r   r  r   r  r]  ry  rx  r  r   model_inputsrR   s                  r:   r  z<Gemma3ForConditionalGeneration.prepare_inputs_for_generationm  sX    " w<
+')%)))1
 
$ Y+7L(r9   r  r  r  r  c           
          t        | ||||||fd|i|j                         D 	
ci c]  \  }	}
|	dk7  s|	|
 c}
}	S c c}
}	w )Nr  r  )r  items)rb   r\  r   r
  r?   r   r  r  r   r   vs              r:   r   z8Gemma3ForConditionalGeneration.create_masks_for_generate  s`     *

  2

 !'F1!~2Eq!tF

 
	
 Gs   ==)NNNNNNNNNNr   )NNNNNNNTNNF)NF) r1   r2   r3   r  r  r  r*   rM   r  r  r   r5   r6   r   r   r  r    r  r]   r	   rJ  r[   rB   r<   rV   r  r   r#   r   rk  r   r^   r_   s   @r:   r  r    s    "8-"?#,	&" +,VW  | 1/ Eu/@/@ EFSeLf E E  .215.204(,262626*.!%-.l
##d*l
 ''$.l
 t+	l

 &&-l
 l
 ((4/l
 ((4/l
 ((4/l
   4'l
 $;l
 ell*l
 ./l
 
-	-l
  l
b  &P ^WO /3*/
 
||
 t+
 	

 
 llT)
 t+
 !4K
 

 P 
r9   r  c                   Z    e Zd ZddddZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 dd	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  de	j                  dz  dedz  dee   defd              Z xZS )Gemma3ForSequenceClassificationr  r  r  )r  r  r  c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  j                  | j                  d      | _	        | j                          y rd   )rL   rM   
num_labelsr  r'  ri   rj   r  rg   scorer[  rp   s     r:   rM   z(Gemma3ForSequenceClassification.__init__  sZ      ++ (
YYv11==tUZ[
 	r9   c                 6    | j                   j                         S rU   r  r   s    r:   r  z4Gemma3ForSequenceClassification.get_input_embeddings  r  r9   c                 :    | j                   j                  |       y rU   r  r  s     r:   r  z4Gemma3ForSequenceClassification.set_input_embeddings  r  r9   NrS   r  r   r   r?   r\  r  rx  r]  r   r   c
                     | j                   |f|||||||	d|
}|j                  }| j                  |      }||j                  d   }n|j                  d   }| j                  j
                  j                  |dk7  rt        d      | j                  j
                  j                  d}n||| j                  j
                  j                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                   j"                   d       |t        j                  ||j                  	      |f   }d}|| j%                  |||| j                  
      }t'        |||j(                  |j*                  |j,                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r   r  r   r?   r\  r  r]  Nr   r)   z=Cannot handle batch sizes > 1 if no padding token is defined.r{   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`ra  )r>   rx  pooled_logitsrb   r{  )r'  rh  r  r   rb   r  rQ  ri  rW   r   r5   int32r   argmaxloggerwarning_oncerR   r1   r~  r   r?   r@   rA   )rQ   rS   r  r   r   r?   r\  r  rx  r]  r   transformer_outputsr@   r>   r  last_non_pad_tokennon_pad_masktoken_indicesr  r=   s                       r:   rV   z'Gemma3ForSequenceClassification.forward  s   , )djj

)%%+')

 

 ,==M* "+J&,,Q/J;;""//7J!O\]];;""//7!#"%)@)@)M)MMQQRXR_R_afalalmL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab%%VFR_hlhshs%tD/ /??-;;*55
 	
r9   )	NNNNNNNNN)r1   r2   r3   r  rM   r  r  r    r   r5   r  r6   r]   r	   rJ  r   r   r   rV   r^   r_   s   @r:   r  r    s.   !7-"?&"1/  .215.204(,2626*.!%C
##d*C
 ''$.C
 t+	C

 &&-C
 C
 ((4/C
 ((4/C
   4'C
 $;C
 +,C
 
*C
  C
r9   r  c                        e Zd ZU dZeed<   dZy)#Gemma3TextForSequenceClassificationz
    Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
    It uses the generic sequence classification implementation for efficiency and consistency.
    rb   rN  N)r1   r2   r3   r4   r+   r7   r@  r8   r9   r:   r  r    s    
  r9   r  )r&  rM  rs  r  r  r  r  )r)   )r  NN)NNFN)ccollections.abcr   dataclassesr   typingr   r5   torch.nnri    r   r1  activationsr   cache_utilsr	   r
   configuration_utilsr   
generationr   integrationsr   r   masking_utilsr   r   r   modeling_layersr   r   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r    r!   r"   utils.deprecationr#   utils.genericr$   r%   utils.output_capturingr&   autor(   configuration_gemma3r*   r+   
get_loggerr1   r  r/   r<   	EmbeddingrD   Modulera   rt   r   r   r   r]   r[   r   r\   rB   r   r   r  r&  rJ  rK  rM  rs  r0  r  r6   rk  r  r  r  r  r  __all__r8   r9   r:   <module>r'     sB  * % !    & ! . 3 ) I m m [  L F & w w 0 G 5  @ 
		H	% 
9 7 9 9 
9; 9 90SBLL S		  =BII =(N<BII N<b( *+ ,2	UU\\ 	U# 	U%,, 	U$   %II%<<% 
% <<	%
 LL4'% % T\% T\% 5<<%&%D )*N)bii N) +N)b.3 .b (^O (^ (^V
# 
(CcSVCWY]C]:^ 
 d
+ d
 d
N N
- N
 N
b!@		 !@H%LL4'%\\D(% _%P ?K +/-1&*5454<<54 LL4'54 LL	54
 T\54 ,,%54 LL4'54 ##d*54 54 t54 
54 L54p 
a
' a

a
H 
L
%:O L

L
^[
&; [
|!*JLa !r9   