
    qiB                         d dl mZmZ ddlmZmZ ddlmZ ddlm	Z	 ddl
mZ  e	j                  e      Z G d d	e      Z G d
 de      Zdd	gZy)    )AnyLiteral   )PreTrainedConfiglayer_type_validation)RopeParameters)logging   )SiglipVisionConfigc            4           e Zd ZdZdZdgZdddddddddd	Zdgd	gfd
dgd
gfd
gd
gfdZdddZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d-de	dz  de	dz  de	dz  de	dz  de	dz  de	dz  de	dz  de
dz  de	dz  dedz  de	dz  dedz  de	dz  de	dz  de	dz  d edz  d!edz  d"e	dz  d#e	dz  d$ee
   dz  d%edz  d&edz  d'eed(   ef   dz  d)edz  d*edz  f2 fd+Zd.d,Z xZS )/Gemma3TextConfigay  
    This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma3Text-7B.
    e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Gemma3TextModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            In Gemma3Text, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        rope_parameters (`dict`, *optional*):
            Dictionary mapping attention patterns (`"full_attention"`, `"sliding_attention"`) to `RopeParameters`.
            Each value should be a dictionary containing `rope_type` and optional scaling parameters.
        use_bidirectional_attention (`bool`, *optional*, defaults to `False`):
            If True, the model will attend to all text tokens instead of using a causal mask. This does not change
            behavior for vision tokens.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    ```python
    >>> from transformers import Gemma3TextModel, Gemma3TextConfig
    >>> # Initializing a Gemma3Text gemma3_text-7b style configuration
    >>> configuration = Gemma3TextConfig()
    >>> # Initializing a model from the gemma3_text-7b style configuration
    >>> model = Gemma3TextModel(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    gemma3_textpast_key_valuescolwisereplicated_with_grad_allreducerowwise)	zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.q_normzlayers.*.self_attn.k_normzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormg    .Ag     @)globallocalN
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dimhidden_activationmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_ideos_token_idbos_token_idattention_biasattention_dropoutquery_pre_attn_scalarsliding_windowlayer_typesfinal_logit_softcappingattn_logit_softcappingrope_parameters)full_attentionsliding_attentionuse_bidirectional_attentiontie_word_embeddingsc                    || _         || _        || _        || _        || _        |	| _        || _        || _        || _        || _	        || _
        || _        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        |r| j&                  dz  dz   | _        |j1                  dd      | _        | j,                  Et5        | j                        D cg c]!  }t7        |dz   | j2                  z        rdnd# c}| _        t9        | j,                  | j                         || _        t=        | |  di | y c c}w )Nr
      sliding_window_pattern   r4   r3    ) r(   r*   r)   r6   r   r$   r   r   r   r    r"   r!   r%   r&   r'   r+   r,   r#   r-   r.   r0   r1   r/   r5   get_sliding_window_patternrangeboolr   r2   super__init__)selfr   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r2   r5   r6   kwargsi	__class__s                               a/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/gemma3/configuration_gemma3.pyrA   zGemma3TextConfig.__init__   sy   : )((#6 $'>$&!2!2#6  #6 !2(",!2!2%:",'>$&<#&+F(&#'#6#6!#;q"@D (.zz2JA'N$# t556  (,QUd6R6R,R'S#Yii D 	d..0F0FG."6" s   =&E"c                    |j                  dd       }ddiddid}| j                  | j                  n|| _        || j                  d   j                  |       | j                  j                  d      ddi| j                  d<   | j                  d   j	                  d|j                  d| j
                  d                | j                  j                  d      ddi| j                  d<   | j                  d   j	                  d|j                  d	| j
                  d
                | j                          | j                  |       |S )Nrope_scaling	rope_typedefault)r4   r3   r3   
rope_thetar   r4   rope_local_base_freqr   )ignore_keys)popr2   updater<   
setdefaultdefault_thetastandardize_rope_paramsvalidate_rope)rB   ignore_keys_at_rope_validationrC   rH   default_rope_paramss        rF   convert_rope_params_to_dictz,Gemma3TextConfig.convert_rope_params_to_dict   s[   zz.$7
 #.y!9*I6
 8<7K7K7Wt33]p#  !1299,G ##$45=6A95MD  !12-.99&**\43E3Eh3OP	
 ##$78@9Di8PD  !4501<<&**%;T=O=OPW=XY	

 	$$&'EF    )i@  i 	  i $              gelu_pytorch_tanhi   {Gz?gư>Tr   r8   r
   Fg        r[   i   NNNNFT)N)__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planrQ   intstrfloatr?   listdictr   r   rA   rV   __classcell__rE   s   @rF   r   r       sd   Pd J#4"5%.%.%.%E%E%."+ )"+
 &(9:#%568IJ!"_$56
  +X>M ")"&(,(**+*+"(;.5*.#'!%#$#$#$&+*-,/%)(,04/3gk38+/5D#$JD# 4ZD# :	D#
 :D# !4ZD# !4ZD# *D# :D# "%tD# !4<D# DjD# $;D# DjD# DjD#  Dj!D#" t#D#$ !4<%D#&  #Tz'D#( d
)D#* #Y%+D#, "'-D#. !&/D#0 g&KLn\]`dd1D#2 &*D[3D#4 "D[5D#LrW   r   c                        e Zd ZdZdZddddZeedZ	 	 	 	 	 	 	 	 dd	ee	e
ef   z  dz  d
ee	e
ef   z  dz  dedz  dedz  dedz  dedz  dedz  dedz  f fdZ xZS )Gemma3Configa	  
    This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
    Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the PaliGemma-2B.

    e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
            The config object of the text backbone.
        vision_config (`Union[AutoConfig, dict]`,  *optional*):
            Custom vision config or dict.
        mm_tokens_per_image (`int`, *optional*, defaults to 256):
            The number of tokens per image embedding.
        boi_token_index (`int`, *optional*, defaults to 255999):
            The begin-of-image token index to wrap the image prompt.
        eoi_token_index (`int`, *optional*, defaults to 256000):
            The end-of-image token index to wrap the image prompt.
        image_token_index (`int`, *optional*, defaults to 262144):
            The image token index to encode the image prompt.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    Example:

    ```python
    >>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

    >>> # Initializing a Siglip-like vision config
    >>> vision_config = SiglipVisionConfig()

    >>> # Initializing a Gemma3 Text config
    >>> text_config = Gemma3TextConfig()

    >>> # Initializing a Gemma3 gemma-3-4b style configuration
    >>> configuration = Gemma3Config(vision_config, text_config)

    >>> # Initializing a model from the gemma-3-4b style configuration
    >>> model = Gemma3TextConfig(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```gemma3image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configvision_configNrv   rw   mm_tokens_per_imager%   r6   c	                    | t               }t        j                  d       nt        |t              rt        di |}t        |t              rt        di |}n!|t               }t        j                  d       || _        || _        || _        || _	        || _
        || _        || _        || _        t        
| <  di |	 y )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config.r;   )r   loggerinfo
isinstancerj   r   rv   rw   rx   rq   rr   rp   r%   r6   r@   rA   )rB   rv   rw   rx   rq   rr   rp   r%   r6   rC   rE   s             rF   rA   zGemma3Config.__init__)  s     *,KKKZ[T**9[9KmT*.??M".0MKK`a&*#6 ..!2!2#6 "6"rW   )NNr[   i i  i   r]   T)r^   r_   r`   ra   rb   attribute_mapr   r   sub_configsrj   rg   r   rf   rh   r?   rA   rk   rl   s   @rF   rn   rn      s    /b J-))M (+K AEDH*-&-&-(/*.+/!#%S#X6=!# *DcN:TA!# !4Z	!#
 t!# t!# :!# !4<!# "D[!# !#rW   rn   N)typingr   r   configuration_utilsr   r   modeling_rope_utilsr   utilsr	   siglipr   
get_loggerr^   rz   r   rn   __all__r;   rW   rF   <module>r      sW   *   J 1  ' 
		H	%I' IX^## ^#B -
.rW   