
    qii                     \   d dl mZ d dlmZmZmZ d dlZd dlmZ ddl	m
Z ddlmZmZ ddlmZmZ ddlmZmZmZ dd	lmZmZ dd
lmZmZmZ ddlmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9m:Z: ddl;m<Z<m=Z=m>Z>m?Z?m@Z@ ddlAmBZB  e*j                  eD      ZE G d de0e      ZF G d de      ZG G d de?      ZH G d de<      ZI G d dej                        ZK G d  d!e4      ZL G d" d#e7      ZM G d$ d%e8      ZN G d& d'e2      ZO G d( d)e      ZPdZQ G d* d+e6      ZRd,eSd-eeSeSeSeSgeTf   fd.ZU G d/ d0e5      ZV G d1 d2e3      ZW G d3 d4ej                        ZY e,d5d6d78      	 	 	 	 dLd9ed7ej                  d:ej                  dz  d;ej                  d<edz  d=ej                  dz  d>ej                  dz  d?ej                  dz  d@eTdAeTdz  d-e\fdB       Z] G dC dDe>      Z^ G dE dFe=      Z_ G dG dHeR      Z` G dI dJeeR      Zag dKZby)M    )Callable)AnyLiteralOptionalN   )initialization)CacheDynamicCache)PreTrainedConfiglayer_type_validation)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPooling SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSRopeParametersdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)deprecate_kwarg)maybe_autocast   )Gemma2Config)	Gemma2AttentionGemma2ForCausalLM	Gemma2MLPGemma2ModelGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward)PaliGemmaCausalLMOutputWithPast!PaliGemmaForConditionalGenerationPaliGemmaModelPaligemmaModelOutputWithPasttoken_type_ids_mask_function)SiglipVisionConfigc            4          e Zd ZdZdZdddddddddd	Zddd	Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d'ded
z  ded
z  ded
z  ded
z  ded
z  ded
z  ded
z  ded
z  ded
z  de	d
z  ded
z  de
d
z  ded
z  ded
z  ded
z  de
d
z  de	d
z  ded
z  ded
z  dee   d
z  de	d
z  d e	d
z  d!eed"   ef   d
z  d#e
d
z  d$e
d
z  f2d%Zd(d&Zy
))Gemma3TextConfigay  
    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma3Text-7B.
    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma3TextModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            In Gemma3Text, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        rope_parameters (`dict`, *optional*):
            Dictionary mapping attention patterns (`"full_attention"`, `"sliding_attention"`) to `RopeParameters`.
            Each value should be a dictionary containing `rope_type` and optional scaling parameters.
        use_bidirectional_attention (`bool`, *optional*, defaults to `False`):
            If True, the model will attend to all text tokens instead of using a causal mask. This does not change
            behavior for vision tokens.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    ```python
    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
    >>> configuration = Gemma3TextConfig()
    >>> # Initializing a model from the gemma3_text-7b style configuration
    >>> model = Gemma3TextModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    gemma3_textcolwisereplicated_with_grad_allreducerowwise)	zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.q_normzlayers.*.self_attn.k_normzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projg    .Ag     @)globallocalN
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dimhidden_activationmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_ideos_token_idbos_token_idattention_biasattention_dropoutquery_pre_attn_scalarsliding_windowlayer_typesfinal_logit_softcappingattn_logit_softcappingrope_parametersfull_attentionsliding_attentionuse_bidirectional_attentiontie_word_embeddingsc                    || _         || _        || _        || _        || _        |	| _        || _        || _        || _        || _	        || _
        || _        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        |r| j&                  dz  dz   | _        |j1                  dd      | _        | j,                  Et5        | j                        D cg c]!  }t7        |dz   | j2                  z        rdnd# c}| _        t9        | j,                  | j                         || _        t=        j>                  di | y c c}w )Nr!      sliding_window_pattern   rS   rR    ) rF   rH   rG   rU   r:   rB   r;   r<   r=   r>   r@   r?   rC   rD   rE   rI   rJ   rA   rK   rL   rN   rO   rM   rT   get_sliding_window_patternrangeboolr   rP   r   __init__)selfr:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rT   rU   kwargsis                               [/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/gemma3/modular_gemma3.pyr_   zGemma3TextConfig.__init__   sy   : )((#6 $'>$&!2!2#6  #6 !2(",!2!2%:",'>$&<#&+F(&#'#6#6!#;q"@D (.zz2JA'N$# t556  (,QUd6R6R,R'S#Yii D 	d..0F0FG.!!+F+ s   <&E'c                    |j                  dd       }ddiddid}| j                  | j                  n|| _        || j                  d   j                  |       | j                  j                  d      ddi| j                  d<   | j                  d   j	                  d|j                  d| j
                  d                | j                  j                  d      ddi| j                  d<   | j                  d   j	                  d|j                  d	| j
                  d
                | j                          | j                  |       |S )Nrope_scaling	rope_typedefault)rS   rR   rR   
rope_thetar8   rS   rope_local_base_freqr9   )ignore_keys)poprP   updater[   
setdefaultdefault_thetastandardize_rope_paramsvalidate_rope)r`   ignore_keys_at_rope_validationra   re   default_rope_paramss        rc   convert_rope_params_to_dictz,Gemma3TextConfig.convert_rope_params_to_dict   s[   zz.$7
 #.y!9*I6
 8<7K7K7Wt33]p#  !1299,G ##$45=6A95MD  !12-.99&**\43E3Eh3OP	
 ##$78@9Di8PD  !4501<<&**%;T=O=OPW=XY	

 	$$&'EF    )i@  i 	  i $              gelu_pytorch_tanhi   {Gz?ư>Tr   rW   r!   F        rx   i   NNNNFTN)__name__
__module____qualname____doc__
model_typebase_model_tp_planrn   intstrfloatr^   listdictr   r   r_   rs   rZ   rt   rc   r3   r3   >   s$   Pd J%.%.%.%E%E%."+ )"+
  +X>M ")"&(,(**+*+"(;.5*.#'!%#$#$#$&+*-,/%)(,04/3gk38+/5D,$JD, 4ZD, :	D,
 :D, !4ZD, !4ZD, *D, :D, "%tD, !4<D, DjD, $;D, DjD, DjD,  Dj!D," t#D,$ !4<%D,&  #Tz'D,( d
)D,* #Y%+D,, "'-D,. !&/D,0 g&KLn\]`dd1D,2 &*D[3D,4 "D[5D,Lrt   r3   c                        e Zd ZdZdZddddZeedZ	 	 	 	 	 	 	 	 dd	ee	e
ef   z  dz  d
ee	e
ef   z  dz  dedz  dedz  dedz  dedz  dedz  dedz  f fdZ xZS )Gemma3Configa	  
    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the PaliGemma-2B.

    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
            The config object of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom vision config or dict.
        mm_tokens_per_image (`int`, *optional*, defaults to 256):
            The number of tokens per image embedding.
        boi_token_index (`int`, *optional*, defaults to 255999):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_index (`int`, *optional*, defaults to 256000):
            The end-of-image token index to wrap the image prompt.
        image_token_index (`int`, *optional*, defaults to 262144):
            The image token index to encode the image prompt.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    Example:

    ```python
    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Gemma3 Text config
    >>> text_config = Gemma3TextConfig()

    >>> # Initializing a Gemma3 gemma-3-4b style configuration
    >>> configuration = Gemma3Config(vision_config, text_config)

    >>> # Initializing a model from the gemma-3-4b style configuration
    >>> model = Gemma3TextConfig(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```gemma3image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configvision_configNr   r   mm_tokens_per_imagerC   rU   c	                    | t               }t        j                  d       nt        |t              rt        di |}t        |t              rt        di |}n!|t               }t        j                  d       || _        || _        || _        || _	        || _
        || _        || _        || _        t        
| <  di |	 y )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config.rZ   )r3   loggerinfo
isinstancer   r1   r   r   r   r   r   r   rC   rU   superr_   )r`   r   r   r   r   r   r   rC   rU   ra   	__class__s             rc   r_   zGemma3Config.__init__A  s     *,KKKZ[T**9[9KmT*.??M".0MKK`a&*#6 ..!2!2#6 "6"rt   )NNrx   i i  i   rz   T)r~   r   r   r   r   attribute_mapr3   r1   sub_configsr   r   r   r   r   r^   r_   __classcell__r   s   @rc   r   r     s    /b J-))M (+K AEDH*-&-&-(/*.+/!#%S#X6=!# *DcN:TA!# !4Z	!#
 t!# t!# :!# !4<!# "D[!# !#rt   r   c                       e Zd Zy)Gemma3ModelOutputWithPastNr~   r   r   rZ   rt   rc   r   r   e      rt   r   c                       e Zd Zy)Gemma3CausalLMOutputWithPastNr   rZ   rt   rc   r   r   i  r   rt   r   c            	       Z     e Zd ZdZd	dedededef fdZdej                  f fdZ	 xZ
S )
Gemma3TextScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    num_embeddingsembedding_dimpadding_idxembed_scalec                     t         |   |||       || _        | j                  dt	        j
                  |      d       y )Nr   F
persistent)r   r_   scalar_embed_scaleregister_buffertorchtensor)r`   r   r   r   r   r   s        rc   r_   z&Gemma3TextScaledWordEmbedding.__init__r  s;    D"-]ELL,ERWXrt   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  S r}   )r   forwardr   toweightdtype)r`   r   r   s     rc   r   z%Gemma3TextScaledWordEmbedding.forwardw  s2    wy)D,<,<,?,?@Q@Q,RRRrt   )      ?)r~   r   r   r   r   r   r_   r   Tensorr   r   r   s   @rc   r   r   m  sG    Ys Y3 YS Y_d Y
S S Srt   r   c                   $     e Zd Zdef fdZ xZS )	Gemma3MLPconfigc                 $    t         |   |       y r}   r   r_   r`   r   r   s     rc   r_   zGemma3MLP.__init__|  s     rt   )r~   r   r   r3   r_   r   r   s   @rc   r   r   {  s    !/ ! !rt   r   c                   *     e Zd Zddedef fdZ xZS )Gemma3RMSNormdimepsc                 (    t         |   ||       y )Nr   r   r   )r`   r   r   r   s      rc   r_   zGemma3RMSNorm.__init__  s    Sc*rt   )r{   )r~   r   r   r   r   r_   r   r   s   @rc   r   r     s    +C +e + +rt   r   c                       e Zd ZddefdZe	 	 	 	 ddedz  ded   dedz  dedz  de	d	e
f   f
d
       Z ej                         edd              Zy)Gemma3RotaryEmbeddingNr   c                    t         j                  j                          |j                  | _        |j                  | _        || _        t        t        |j                              | _	        i | _
        | j                  D ]  }| j                  j                  |   }||d   | j                  |<   | j                  }| j                  |   dk7  rt        | j                  |      } || j                  ||      \  }}| j                  | d|d       | j                  | d|j                         d       t!        | | d|        y )	Nrf   rg   
layer_type	_inv_freqFr   _original_inv_freq_attention_scaling)nnModuler_   rB   max_seq_len_cachedoriginal_max_seq_lenr   r   setrM   rf   rP   compute_default_rope_parametersr   r   clonesetattr)r`   r   devicer   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalings           rc   r_   zGemma3RotaryEmbedding.__init__  s<   
		"("@"@$*$B$B!F$6$6 78** 	UJ++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@fak4l1M1  J<y!9=UZ [  J</A!BMDWDWDYfk lDZL(:;=ST	Urt   r   ztorch.deviceseq_lenr   returnztorch.Tensorc                     | j                   |   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        rh   r@   Nr   r   r!   r   r   r   )	rP   getattrr;   r>   r   arangeint64r   r   )r   r   r   r   baser   attention_factorinv_freqs           rc   r   z5Gemma3RotaryEmbedding.compute_default_rope_parameters  s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))rt   c                 N   t        | | d      }t        | | d      }|d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d	      5  |j                         |j                         z  j                  dd
      }	t        j                  |	|	fd      }
|
j                         |z  }|
j                         |z  }d d d        j	                  |j                        j	                  |j                        fS # 1 sw Y   AxY w)Nr   r   r   rW   mpscpuF)device_typeenabledr!   r   r   )r   r   expandshaper   r   r   typer   r    	transposer   catcossinr   )r`   xposition_idsr   r   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   s                rc   r   zGemma3RotaryEmbedding.forward  sl    4J<y!9:#DZL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	0&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')//C'')//C		0 vvAGGv$cff177f&;;;	0 	0s   *A1FF$)NNNNNNr}   )r~   r   r   r3   r_   staticmethodr   r   r   tupler   r   r   no_gradr   r   rZ   rt   rc   r   r     s    U/ U. *.+/"!%	!* 4'!*(!* t!* $J	!*
 
~u$	%!* !*F U]]_<  <rt   r   c                       e Zd Zdedef fdZ	 	 	 	 ddej                  dej                  dej                  dz  dedz  d	ej                  dz  d
e
e   deej                  ej                  dz  eej                     dz  f   fdZ xZS )Gemma3Attentionr   	layer_idxc                 b   t         |   ||       | j                  dk(  r|j                  nd | _        | j                  dk(  | _        | j
                  j                   | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )NrS   r   )r   r_   r   rL   
is_slidingr   rT   	is_causalr   r@   rD   q_normk_normr`   r   r  r   s      rc   r_   zGemma3Attention.__init__  s    +7;J]7]f33cg//-@@![[DDD#V=P=PQ#V=P=PQrt   Nhidden_statesposition_embeddingsattention_maskpast_key_valuescache_positionra   r   c                 r   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }| j                  |	      }	| j                  |
      }
|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t               } || |	|
||f| j"                  r| j$                  nd| j&                  | j(                  d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )Nr   rW   r!   )r   r   r  r|   )dropoutscalingrL   )r   r@   q_projviewr   k_projv_projr  r  r*   rl   r  r   get_interfacer   _attn_implementationr+   trainingrJ   r  rL   reshape
contiguouso_proj)r`   r  r	  r
  r  r  ra   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsattention_interfaceattn_outputattn_weightss                     rc   r   zGemma3Attention.forward  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((rt   r   )r~   r   r   r3   r   r_   r   r   r	   
LongTensorr   r   r   r   r   r   s   @rc   r   r     s    R/ RC R -1.2(,26-)||-) #\\-) t+	-)
 -) ((4/-) +,-) 
u||U\\D0%2E2LL	M-)rt   r   c                   4    e Zd Zdedef fdZ	 	 	 	 	 ddej                  dej                  dej                  dz  dej                  dz  d	e	dz  d
ej                  dz  de
e   deej                  eej                  ej                  f   dz  f   fdZ xZS )Gemma3DecoderLayerr   r  c                    t         |           || _        |j                  | _        || _        |j
                  |   | _        t        ||      | _        t        |      | _
        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        t        | j                  |j                        | _        y )N)r   r  r   )r   r_   r   r;   r  rM   attention_typer   	self_attnr   mlpr   rD   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr  s      rc   r_   zGemma3DecoderLayer.__init__  s    !--"$00;()LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'rt   Nr  r	  r
  r   r  r  ra   r   c           
         |}| j                  |      } | j                  d||||||d|\  }}	| j                  |      }||z   }|}| j                  |      }| j	                  |      }| j                  |      }||z   }|S )N)r  r	  r
  r   r  r  rZ   )r+  r)  r,  r-  r*  r.  )
r`   r  r	  r
  r   r  r  ra   residual_s
             rc   r   zGemma3DecoderLayer.forward  s     !,,];)4>> 
' 3)%+)
 
q 55mD =0 66}E/77F =0rt   )NNNNN)r~   r   r   r3   r   r_   r   r   r#  r	   r   r   r   FloatTensorr   r   r   s   @rc   r%  r%    s    c/ cC c  -1.204(,26 ||  #\\  t+	 
 &&-    ((4/  +,  
u  %(9(95;L;L(L"MPT"TT	U rt   r%  c                   J    e Zd ZdZdZg dZ ej                         d        Zy)Gemma3PreTrainedModelmodel)imagetext)r%  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadc                    t        j                  | |       t        |t              r t	        j
                  |j                         y d|j                  j                  v r t	        j
                  |j                         y t        |t              r+t	        j                  |j                  |j                         y t        |t              r|j                  D ]  }|j                   }|j"                  |   dk7  rt$        |j"                  |      } ||j&                  |      \  }}t	        j(                  t+        || d      |       t	        j(                  t+        || d      |        y y )NRMSNormrg   r   r   r   )r   _init_weightsr   Gemma3MultiModalProjectorinitzeros_mm_input_projection_weightr   r~   r   r   	constant_r   r   r   rM   r   rf   r   r   copy_r   )r`   moduler   r   r   r1  s         rc   r=  z#Gemma3PreTrainedModel._init_weightsM  s    %%dF3f78KK99:&**333KK& =>NN6--v/H/HI 56$00 ^
%EE##J/9<#6v7G7G
7S#TL#/*#U q

76j\+CDmT

76j\9K+LM}]^ 7rt   N)	r~   r   r   base_model_prefixinput_modalities_no_split_modulesr   r   r=  rZ   rt   rc   r4  r4  C  s4    ( U]]_^ ^rt   r4  rL   r   c           
      P     dt         dt         dt         dt         dt        f
 fd}|S )zA
    Enables a bidirectional mask within the sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c                 &    t        ||z
        k  S )zA token can attend to any other token if their absolute distance is within
        the (exclusive) sliding window size (distance < sliding_window).)abs)rI  rJ  rK  rL  rL   s       rc   
inner_maskz1_bidirectional_window_overlay.<locals>.inner_maskf  s     56>"^33rt   )r   r^   )rL   rO  s   ` rc   _bidirectional_window_overlayrP  a  s3    
4c 4S 4 4c 4d 4
 rt   c                       e Zd ZU eed<   dZdef fdZ	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
dz  d	ej                  dz  d
edz  dej                  dz  dee   defdZ xZS )Gemma3TextModelr   r7  c                     t         |   |       t        |j                  |j                  | j
                  | j                  j                  dz        | _        y )N      ?)r   )r   r_   r   r:   r;   r   r   embed_tokensr   s     rc   r_   zGemma3TextModel.__init__r  sM      :v1143C3CQUQ\Q\QhQhjmQm
rt   Nr   r
  r   r  inputs_embedsrE   r  ra   r   c           
         |d u |d uz  rt        d      || j                  |      }|r|t        | j                        }|F||j	                         nd}	t        j                  |	|	|j                  d   z   |j                        }||j                  d      }t        |x}
t              sx| j                  |||||d}|j                         }| j                  j                  r(d |d<   t        | j                  j                        |d<   t!        di |t#        di |d	}
|}i }| j                  j$                  D ]  }| j'                  |||      ||<    | j(                  d | j                  j*                   D ]+  } ||f|
|j,                     ||j,                     |||d
|}- | j/                  |      }t1        ||      S )N:You must specify exactly one of input_ids or inputs_embeds)r   r   rW   r   r   rW  r
  r  r  r   c                  L    t        j                  dt         j                        S )NTr   )r   r   r^   )argss    rc   <lambda>z)Gemma3TextModel.forward.<locals>.<lambda>  s    TY^YcYc@d rt   or_mask_functionrQ   )r
  r	  r   r  r  )last_hidden_stater  rZ   )
ValueErrorrV  r
   r   get_seq_lengthr   r   r   r   	unsqueezer   r   copyrT   rP  rL   r   r   rM   
rotary_emblayersr=   r(  normr   )r`   r   r
  r   r  rW  rE   r  ra   past_seen_tokenscausal_mask_mappingmask_kwargssliding_mask_kwargsr  r	  r   decoder_layers                    rc   r   zGemma3TextModel.forwardz  s    -t";<YZZ  --i8M0*$++>O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L ?-F ++!."0"0#2 ,K #."2"2"4{{662d./:WX\XcXcXrXr:s#$67 #5"C{"C%F%]I\%]# & ++11 	gJ.2oom\[e.f
+	g "[[)H4;;+H+HI 		M)2=3O3OP$78T8T$U) /- M		 		-0&++
 	
rt   )NNNNNNN)r~   r   r   r3   __annotations__rF  r_   r   r#  r   r	   r2  r^   r   r   r   r   r   r   s   @rc   rR  rR  n  s     
/ 
 .2.204(,26!%26J
##d*J
 t+J
 &&-	J

 J
 ((4/J
 $;J
 ((4/J
 +,J
 
!J
rt   rR  c                   0     e Zd ZU eed<   def fdZ xZS )Gemma3ForCausalLMr   c                 D    t         |   |       t        |      | _        y r}   )r   r_   rR  r5  r   s     rc   r_   zGemma3ForCausalLM.__init__  s     $V,
rt   )r~   r   r   r3   rm  r_   r   r   s   @rc   ro  ro    s    -/ - -rt   ro  c                   D     e Zd Zdef fdZdej                  fdZ xZS )r>  r   c                    t         |           t        j                  t	        j
                  |j                  j                  |j                  j                              | _	        t        |j                  j                  |j                  j                        | _        t        |j                  j                  |j                  j                  z        | _        t        |j"                  dz        | _        | j                   | j$                  z  | _        t        j(                  | j&                  | j&                        | _        y )Nr'  rU  )kernel_sizestride)r   r_   r   	Parameterr   zerosr   r;   r   rA  r   layer_norm_epsmm_soft_emb_normr   
image_size
patch_sizepatches_per_imager   tokens_per_siders  	AvgPool2davg_poolr   s     rc   r_   z"Gemma3MultiModalProjector.__init__  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[rt   vision_outputsc                    |j                   \  }}}|j                  dd      }|j                  ||| j                  | j                        }|j	                         }| j                  |      }|j                  d      }|j                  dd      }| j                  |      }t        j                  || j                        }|j                  |      S )NrW   r!   )r   r   r  r{  r  r~  flattenrx  r   matmulrA  type_as)	r`   r  
batch_sizer1  r;   reshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            rc   r   z!Gemma3MultiModalProjector.forward  s    %3%9%9"
A{"0":":1a"@"9"A"AT%;%;T=S=S#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EtGfGf#g '//??rt   )	r~   r   r   r   r_   r   r   r   r   r   s   @rc   r>  r>    s#    \| \ @ell @rt   r>  input_embedsz5.6.0rW  )versionnew_namer   r
  r  r  r   token_type_idspixel_valuesis_trainingis_first_iterationc
                    |r|t        d      | j                         |||||d}|	|	n|du xs |j                   xs |du}	||	r|dk(  j                  |j                        }t
        j                  j                  |dd      ddddf   }|| z  }t        j                  |j                         d	      dz
  }t        j                  ||d      }t        |j                  |j                        |      |d
<   t        di |S )a  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Gemma3 uses a bidirectional mask for images.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz;`token_type_ids` is required as a model input when trainingr[  rW   )rW   r   r   )valuer   r   r_  rZ   )ra  get_text_configis_initializedr   r   r   
functionalpadr   cumsumr   wherer0   r   )r   rW  r
  r  r  r   r  r  r  r  ra   rj  is_imageis_previous_imagenew_image_startimage_group_idss                   rc   create_causal_mask_mappingr    s4   ( ~-VWW ((*&((*$K ) 	%g_-K-K)Kg|cgOg 
 !&8 #a'++N,A,ABMM--ha-HCRCP"&7%77,,':':'<!DqH++hD*Fn334o+
&' %3{33rt   c                       e Zd ZdZdef fdZe ed      dej                  de
e   deez  fd	              Zee	 	 	 	 	 	 	 	 	 	 ddej                  d
z  dej                  d
z  dej                   d
z  dej                  d
z  ded
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  ded
z  de
e   deez  fd              Z xZS )Gemma3ModelFr   c                 (    t         |   |       | `y r}   )r   r_   text_config_dtyper   s     rc   r_   zGemma3Model.__init__0  s     "rt   zOProjects the last hidden state from the vision model into language model space.)custom_intror  ra   r   c                 t     | j                   d|dd|}|j                  }| j                  |      |_        |S )NT)r  return_dictrZ   )vision_towerr`  multi_modal_projectorpooler_output)r`   r  ra   r  r`  s        rc   get_image_featureszGemma3Model.get_image_features4  sH    
 +**aRVaZ`a*<<'+'A'ABS'T$rt   Nr   r
  r   r  r  r  rW  labelsrE   	lm_kwargsc                    |d u |d uz  rt        d      |R| j                  j                  | j                  k\  r/|| j                  j                  k(  }|j	                         }d||<   n|}| | j                         |      }|F||j                         nd}t        j                  |||j                  d   z   |j                        }|i| j                  |d      j                  }|j                  |j                  |j                        }| j                  |||      }|j!                  ||      }t#        |x}t$              s(t'        | j                  |||||||| j(                  	      } | j*                  d|||||
d|d	|}t-        |j.                  |j0                  |j2                  |j4                  |
      S d 
      S )NrY  r   rW   rZ  T)r  )rW  image_features)r  )r
  r   r  rW  rE   r  r  )r`  r  r  
attentionsimage_hidden_statesrZ   )ra  r   r   r:   r   get_input_embeddingsrb  r   r   r   r   r  r  r   r   get_placeholder_maskmasked_scatterr   r   r  r  language_modelr   r`  r  r  r  )r`   r   r  r
  r   r  r  r  rW  r  rE   r  special_image_maskllm_input_idsrh  r  ri  outputss                     rc   r   zGemma3Model.forward?  s     -t";<YZZ  T[[%?%?4??%R!*dkk.H.H!H%OO-M01M,-%M 7D557FM!CRC^==?de"\\ "2]5H5H5K"KTaThThN
 #!44\t4TbbN+..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M ?-F"< MM
# &$%% 	
.%+')	
 	
 )%77#33!//))2>2J
 	

 QU
 	
rt   )
NNNNNNNNNN)r~   r   r   accepts_loss_kwargsr   r_   r   r   r   r2  r   r   r   r   r  r#  r   r	   r^   r   r   r   r   s   @rc   r  r  ,  s   #| # !rs!--9?@R9S	+	+ t   .215.204(,262626*.!%J
##d*J
 ''$.J
 t+	J

 &&-J
 J
 ((4/J
 ((4/J
 ((4/J
   4'J
 $;J
 ./J
 
*	*J
  J
rt   r  c                       e Zd ZdZee	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  de
dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dedz  deej                  z  dee   deez  fd              Z	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Gemma3ForConditionalGenerationFNr   r  r
  r   r  r  r  rW  r  rE   logits_to_keepr  r   c                     | j                   d||||||||
|	|d
|}|d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }d}|	O|j                         }|dddddf   }|	dddf   }||dd|j                  d    df   j                  |j                        }||j                  |j                        dk7     j                         }||j                  |j                        dk7     j                         }n |j                         }|j                         }t        j                         }|j                  d| j                  j                  j                        }|j                  d      j                  |j                        } |||      }t!        |||j"                  |j$                  |j&                  |j(                        S )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        )
r   r  r  r
  r   r  rW  rE   r  r  r   N.r   rW   )losslogitsr  r  r  r  rZ   )r5  r   r   slicelm_headr   r   r   r   r  r   CrossEntropyLossr  r   r   r:   r   r  r  r  r  )r`   r   r  r
  r   r  r  r  rW  r  rE   r  r  r  r  slice_indicesr  r  shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                           rc   r   z&Gemma3ForConditionalGeneration.forward  s   | $** 
%))%+')
 
  
8B>SV8W~ot4]kmA}a,?@A\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5D+#33!//)) ' ; ;
 	
rt   c                 N    t        |   |f||||||	|
||d	|}|s|	s||d<   |S )N)	r  rW  r
  r   r  rE   r  r  r  r  )r   prepare_inputs_for_generation)r`   r   r  rW  r  r   r  r
  r  rE   r  r  r  ra   model_inputsr   s                  rc   r  z<Gemma3ForConditionalGeneration.prepare_inputs_for_generation  sX    " w<
+')%)))1
 
$ Y+7L(rt   )NNNNNNNNNNr   )NNNNNNNTNNF)r~   r   r   r  r   r   r   r#  r2  r   r	   r^   r   r   r   r   r   r   r  r   r   s   @rc   r  r    sx      .215.204(,262626*.!%-.l
##d*l
 ''$.l
 t+	l

 &&-l
 l
 ((4/l
 ((4/l
 ((4/l
   4'l
 $;l
 ell*l
 ./l
 
-	-l
  l
b  & &rt   r  c                   Z    e Zd ZddddZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 dd	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  de	j                  dz  dedz  dee   defd              Z xZS )Gemma3ForSequenceClassificationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projector)z^language_model.modelz^vision_towerz^multi_modal_projectorc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  j                  | j                  d      | _	        | j                          y )NF)bias)r   r_   
num_labelsr  r5  r   Linearr   r;   score	post_initr   s     rc   r_   z(Gemma3ForSequenceClassification.__init__3  sZ      ++ (
YYv11==tUZ[
 	rt   c                 6    | j                   j                         S r}   )r5  r  )r`   s    rc   r  z4Gemma3ForSequenceClassification.get_input_embeddings<  s    zz..00rt   c                 :    | j                   j                  |       y r}   )r5  set_input_embeddings)r`   r  s     rc   r  z4Gemma3ForSequenceClassification.set_input_embeddings?  s    

''.rt   Nr   r  r
  r   r  rW  r  r  rE   ra   r   c
                     | j                   |f|||||||	d|
}|j                  }| j                  |      }||j                  d   }n|j                  d   }| j                  j
                  j                  |dk7  rt        d      | j                  j
                  j                  d}n||| j                  j
                  j                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                   j"                   d       |t        j                  ||j                  	      |f   }d}|| j%                  |||| j                  
      }t'        |||j(                  |j*                  |j,                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        )r
  r  r   r  rW  r  rE   Nr   rW   z=Cannot handle batch sizes > 1 if no padding token is defined.r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rZ  )r  r  pooled_logitsr   )r  r  r  r  r  )r5  r`  r  r   r   r   rF   ra  r   r   r   int32r   argmaxr   warning_oncer   r~   loss_functionr   r  r  r  )r`   r   r  r
  r   r  rW  r  r  rE   ra   transformer_outputsr  r  r  last_non_pad_tokennon_pad_masktoken_indicesr  r  s                       rc   r   z'Gemma3ForSequenceClassification.forwardB  s   , )djj

)%%+')

 

 ,==M* "+J&,,Q/J;;""//7J!O\]];;""//7!#"%)@)@)M)MMQQRXR_R_afalalmL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab%%VFR_hlhshs%tD/ /??-;;*55
 	
rt   )	NNNNNNNNN)r~   r   r   _checkpoint_conversion_mappingr_   r  r  r   r   r   r#  r2  r   r	   r^   r   r   r   r   r   r   s   @rc   r  r  ,  s.   !7-"?&"1/  .215.204(,2626*.!%C
##d*C
 ''$.C
 t+	C

 &&-C
 C
 ((4/C
 ((4/C
   4'C
 $;C
 +,C
 
*C
  C
rt   r  c                        e Zd ZU dZeed<   dZy)#Gemma3TextForSequenceClassificationz
    Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
    It uses the generic sequence classification implementation for efficiency and consistency.
    r   rS  N)r~   r   r   r   r3   rm  rF  rZ   rt   rc   r  r    s    
  rt   r  )	r   r3   r4  rR  ro  r  r  r  r  )NNFN)ccollections.abcr   typingr   r   r   r   torch.nnr    r   r?  cache_utilsr	   r
   configuration_utilsr   r   masking_utilsr   r   r   modeling_layersr   r   modeling_outputsr   r   r   modeling_rope_utilsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.deprecationr   utils.genericr    gemma2.configuration_gemma2r"   gemma2.modeling_gemma2r#   r$   r%   r&   r'   r(   r)   r*   r+   paligemma.modeling_paligemmar,   r-   r.   r/   r0   siglipr1   
get_loggerr~   r   r3   r   r   r   	Embeddingr   r   r   r   r   r%  GEMMA3_START_DOCSTRINGr4  r   r^   rP  rR  ro  r   r>  r   r2  r   r  r  r  r  r  __all__rZ   rt   rc   <module>r     s   % ) )   & . J m m [ u u 
 G & R R 0 + 6
 
 
  ( 
		H	%C|%5 CL^## ^#B	 < 		#B 	SBLL S!	 !
+M +
L<1 L<`7)o 7)t.3 .b  ^1 ^<
# 
(CcSVCWY]C]:^ 
V
k V
r-) -!@		 !@H ?K +/-1&*5454<<54 LL4'54 LL	54
 T\54 ,,%54 LL4'54 ##d*54 54 t54 
54 L54p_
. _
D[%F [|[
&; [
|!*JLa !
rt   