
    qi                       d dl Z d dlmZ d dlmZmZ d dlZd dlmZ ddl	m
Z ddlmZmZmZ ddlmZmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZm Z m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z: ddl;m<Z<m=Z=m>Z>m?Z?m@Z@mAZAmBZBmCZCmDZDmEZEmFZF ddlGmHZH ddlImJZJmKZKmLZLmMZM  e/j                  eO      ZP G d de:e      ZQ G d de9      ZR G d de:e      ZS G d de      ZT G d  d!e@      ZU G d" d#e=      ZV G d$ d%eA      ZW G d& d'e<      ZX G d( d)e<      ZYdJd*eZd+efd,Z[ G d- d.eK      Z\ G d/ d0eK      Z] G d1 d2eL      Z^ G d3 d4eJ      Z_ G d5 d6e>      Z` G d7 d8eB      Zae- G d9 d:e?             Zb G d; d<eb      Zc G d= d>eb      Zd G d? d@eb      Zee- G dA dBeb             Zf G dC dDebe      Zge- G dE dFeb             Zhe- G dG dHeb             Zig dIZjy)K    N)Callable)AnyOptional   )initialization)DynamicCacheEncoderDecoderCacheStaticCache)PreTrainedConfiglayer_type_validation)GenerationConfigGenerationMixinGenerationMode)create_bidirectional_mask)FlashAttentionKwargs)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPoolingSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSRopeParameters)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)merge_with_config_defaults)OutputRecordercapture_outputs   )	AutoModel)Gemma3ConfigGemma3TextConfig)Gemma3Attention	Gemma3MLPGemma3MultiModalProjectorGemma3PreTrainedModelGemma3RMSNormGemma3RotaryEmbeddingGemma3TextScaledWordEmbeddingapply_rotary_pos_embcreate_causal_mask!create_sliding_window_causal_maskeager_attention_forward)SiglipVisionConfig)T5GemmaClassificationHeadT5GemmaEncoderLayerT5GemmaLMHeadbidirectional_mask_functionc            2       H   e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedz  dedz  dedz  dedz  dedz  d	edz  d
edz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  de	e   dz  dedz  dedz  de
eee
f   z  dz  f.dZy)T5Gemma2TextConfiga  
    This is the configuration class to store the configuration of a [`T5Gemma2TextModel`]. It is used to instantiate the encoder's
    text model portion of the T5Gemma2 Model according to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to that of the T5Gemma2Text-7B.
    e.g. [google/t5gemma2_text-7b](https://huggingface.co/google/t5gemma2_text-7b)
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the T5Gemma2Text model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`T5Gemma2TextModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            In T5Gemma2Text, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
    t5gemma2_textN
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_headshead_dimhidden_activationmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_ideos_token_idbos_token_idattention_biasattention_dropoutquery_pre_attn_scalarsliding_windowlayer_typesfinal_logit_softcappingattn_logit_softcappingrope_parametersc                    || _         || _        || _        || _        |	| _        || _        || _        || _        || _        || _	        || _
        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        |j-                  dd      | _        | j*                  Et1        | j                        D cg c]!  }t3        |dz   | j.                  z        rdnd# c}| _        t5        | j*                  | j                         || _        t9        j:                  di | y c c}w Nsliding_window_pattern      sliding_attentionfull_attention rI   rK   rJ   r=   rE   r>   r?   r@   rA   rC   rB   rF   rG   rH   rL   rM   rD   rN   rO   rQ   rR   rP   get_sliding_window_patternrangeboolr   rS   r   __init__selfr=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   kwargsis                             _/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/t5gemma2/modular_t5gemma2.pyra   zT5Gemma2TextConfig.__init__   N   6 )(($'>$&!2!2#6  #6 !2(",!2!2%:",'>$&<#& (.zz2JA'N$# t556  (,QUd6R6R,R'S#Yii D 	d..0F0FG.!!+F+    &E i@  i 	  i $              gelu_pytorch_tanhi   {Gz?gư>Tr   rX   r&   F        rm   i   NNNN__name__
__module____qualname____doc__
model_typeintstrfloatr`   listr   dictra   r[       rf   r;   r;   L   s   BH !J ")"&(,(**+*+"(;.5*.#'!%#$#$#$&+*-,/%)(,04/3MQ1=,$J=, 4Z=, :	=,
 :=, !4Z=, !4Z=, *=, :=, "%t=, !4<=, Dj=, $;=, Dj=, Dj=,  Dj!=," t#=,$ !4<%=,&  #Tz'=,( d
)=,* #Y%+=,, "'-=,. !&/=,0 ($sN/B*CCdJ1=,r|   r;   c                       e Zd ZdZeedZy)T5Gemma2EncoderConfigt5gemma2_encoder)text_configvision_configN)rr   rs   rt   rv   r;   r5   sub_configsr[   r|   rf   r~   r~      s    #J *+Kr|   r~   c            2       H   e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedz  dedz  dedz  dedz  dedz  d	edz  d
edz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  de	e   dz  dedz  dedz  de
eee
f   z  dz  f.dZy)T5Gemma2DecoderConfiga
  
    This is the configuration class to store the configuration of a [`T5Gemma2DecoderModel`]. It is used to instantiate the decoder
    text model portion of the T5Gemma2 Model according to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a similar configuration to that of the T5Gemma2Decoder-7B.
    e.g. [google/t5gemma2_text-7b](https://huggingface.co/google/t5gemma2_text-7b)
    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 262208):
            Vocabulary size of the T5Gemma2Decoder model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`T5Gemma2DecoderModel`]
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 131072):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores
        sliding_window (`int`, *optional*, defaults to 4096):
            In T5Gemma2Decoder, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the logits.
        attn_logit_softcapping (`float`, *optional*):
            Scaling factor when applying tanh softcapping on the attention scores.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
    t5gemma2_decoderNr=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   c                    || _         || _        || _        || _        |	| _        || _        || _        || _        || _        || _	        || _
        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        |j-                  dd      | _        | j*                  Et1        | j                        D cg c]!  }t3        |dz   | j.                  z        rdnd# c}| _        t5        | j*                  | j                         || _        t9        j:                  di | y c c}w rU   r\   rb   s                             rf   ra   zT5Gemma2DecoderConfig.__init__#  rg   rh   ri   rq   r[   r|   rf   r   r      s   BH $J ")"&(,(**+*+"(;.5*.#'!%#$#$#$&+*-,/%)(,04/3MQ1=,$J=, 4Z=, :	=,
 :=, !4Z=, !4Z=, *=, :=, "%t=, !4<=, Dj=, $;=, Dj=, Dj=,  Dj!=," t#=,$ !4<%=,&  #Tz'=,( d
)=,* #Y%+=,, "'-=,. !&/=,0 ($sN/B*CCdJ1=,r|   r   c                        e Zd ZdZdZdgZeedZdddZ		 	 	 	 	 	 	 	 	 dd	ee
eef   z  dz  d
ee
eef   z  dz  dedededededededz  f fdZ xZS )T5Gemma2ConfigaV  
    This is the configuration class to store the configuration of a [`T5Gemma2Model`]. It is used to instantiate an T5Gemma2
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to a hypothetical balanced Gemma3 encoder-decoder model.
    e.g. [google/t5gemma-2-270m-270m](https://huggingface.co/google/t5gemma-2-270m-270m)
    Configuration objects inherit from [PreTrainedConfig] and can be used to control the model outputs. Read the
    documentation from [PreTrainedConfig] for more information.

    Args:
        encoder (`Union[T5Gemma2EncoderConfig, dict]`, optional, *optional*):
            Configuration for the encoder.
        decoder (`Union[T5Gemma2DecoderConfig, dict]`, optional, *optional*):
            Configuration for the decoder.
        is_encoder_decoder (bool, optional, *optional*, defaults to `True`):
            Whether the model is used as an encoder/decoder or not.
        dropout_rate (`float`, *optional*, defaults to 0.0):
            The ratio for all dropout layers (following T5).
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for attention.
        classifier_dropout_rate (`float`, *optional*, defaults to 0.0):
            The dropout ratio for classifier (following T5).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        image_token_index (`int`, *optional*, defaults to 256001):
            The image token index to encode the image prompt. Defaults to 256001, which is right after the eoi_token_index.
            Note this is different from Gemma 3.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    ```python
    >>> from transformers import T5Gemma2Config, T5Gemma2Model
    >>> t5gemma2_config = T5Gemma2Config.from_pretrained("google/t5gemma-270m-270m")
    >>> model = T5Gemma2Model(t5gemma2_config)
    ```
    t5gemma2past_key_values)encoderdecoderimage_token_indexeoi_token_index)image_token_ideoi_token_idNr   r   is_encoder_decoderdropout_raterM   classifier_dropout_raterF   tie_word_embeddingsc
                 ^   t        |t              rt        di |}nI| t               }t        j	                  d       n't        |t              st        t        |       d      t        |t              rt        di |}nI| t               }t        j	                  d       n't        |t              st        t        |       d      |j                  j                  |j                  k7  r0t        d|j                  j                   d|j                   d      |st        d      |j                  j                  |j                  k7  r0t        d|j                  j                   d|j                   d      ||j                  _        ||j                  _        ||j                  _        ||_        || _        ||_        ||_        || _        d	D ]  }||
vst#        ||      |
|<    || _        || _        |j(                  | _        || _        |	| _        t-        | \  dd
|i|
 y )NzDencoder is None, using default T5Gemma2EncoderConfig encoder config.z is not supported.zDdecoder is None, using default T5Gemma2DecoderConfig decoder config.zBImbalanced encoder-decoder is not supported in T5Gemma2: encoder (z) vs decoder (z).z4T5Gemma2Model only support encoder-decoder modeling.zRImbalanced encoder-decoder vocabulary size is not supported in T5Gemma2: encoder ()rK   rI   rJ   r=   r   r[   )
isinstancer{   r~   loggerinfo
ValueErrortyper   r   r>   r=   r   rM   r   r   r   r   getattrr   rF   r   r   superra   )rc   r   r   r   r   rM   r   rF   r   r   rd   special_token_key	__class__s               rf   ra   zT5Gemma2Config.__init__  s+    gt$+6g6G_+-GKK^_g'<= DM?2D!EFFgt$+6g6G_+-GKK^_g'<= DM?2D!EFF**g.A.AA#//;;<N7K^K^J__ac 
 "STT))W-?-??#//::;>'J\J\I]]_a  ,8(0A-2C/$5!  ,$5!!_ 	P .,3G=N,O()	P (?$!2&66!2#6 I,>I&Ir|   )	NNTrp   rp   rp   ro   i T)rr   rs   rt   ru   rv   keys_to_ignore_at_inferencer~   r   r   attribute_mapr{   rx   r   r`   ry   rw   ra   __classcell__r   s   @rf   r   r   c  s    "H J#4"5 )(K .)M BFAE#'!#&),#'!(+/DJ&c3h7$>DJ 'c3h7$>DJ !	DJ
 DJ !DJ "'DJ !DJ DJ "D[DJ DJr|   r   c                       e Zd Zy)T5Gemma2RMSNormNrr   rs   rt   r[   r|   rf   r   r         r|   r   c                   *     e Zd Zdef fdZd Z xZS )T5Gemma2MLPconfigc                 l    t         |   |       t        j                  |j                        | _        y N)r   ra   nnDropoutr   dropoutrc   r   r   s     rf   ra   zT5Gemma2MLP.__init__  s&     zz&"5"56r|   c                     | j                  | j                  |            | j                  |      z  }| j                  |      }| j	                  |      }|S r   )act_fn	gate_projup_projr   	down_proj)rc   xhidden_statesr   s       rf   forwardzT5Gemma2MLP.forward  sH    DNN1$56aH]3NN=1	r|   )rr   rs   rt   r;   ra   r   r   r   s   @rf   r   r     s    71 7r|   r   c                   |     e Zd Zddef fdZe	 	 	 	 ddedz  ded   dedz  dedz  de	d	e
f   f
 fd
       Z xZS )T5Gemma2RotaryEmbeddingNr   c                 &    t         |   ||       y r   r   ra   )rc   r   devicer   s      rf   ra   z T5Gemma2RotaryEmbedding.__init__  s    (r|   r   ztorch.deviceseq_len
layer_typereturnztorch.Tensorc                 (    t         |   | |||      S r   )r   compute_default_rope_parameters)r   r   r   r   r   s       rf   r   z7T5Gemma2RotaryEmbedding.compute_default_rope_parameters  s     w6vvwPZ[[r|   r   )NNNN)rr   rs   rt   r;   ra   staticmethodr   rw   rx   tuplery   r   r   r   s   @rf   r   r     s    )1 ) ,0+/"!%	\"T)\(\ t\ $J	\
 
~u$	%\ \r|   r   c                   (     e Zd Zdedef fdZ xZS )T5Gemma2SelfAttentionr   	layer_idxc                 4    t         |   ||       d| _        y NFr   ra   	is_causalrc   r   r   r   s      rf   ra   zT5Gemma2SelfAttention.__init__      +r|   )rr   rs   rt   r;   rw   ra   r   r   s   @rf   r   r     s    1 c  r|   r   c                   N    e Zd ZdZdedef fdZ	 	 ddej                  de	ej                  ej                  f   dej                  dz  d	ej                  d
e
dz  dej                  dz  dee   de	ej                  ej                  dz  e	ej                     dz  f   fdZ xZS )T5Gemma2MergedAttentionz6Merged self-attention and cross-attention for decoder.r   r   c                 4    t         |   ||       d| _        y r   r   r   s      rf   ra   z T5Gemma2MergedAttention.__init__  r   r|   Nr   position_embeddingsmerged_attention_maskencoder_hidden_statesr   cache_positionrd   r   c                    |j                   d d }g |d| j                  }	|j                   d d }
g |
d| j                  }| j                  |      j                  |	      j	                  dd      }| j                  |      j                  |	      j	                  dd      }| j                  |      j                  |	      j	                  dd      }| j                  |      }| j                  |      }|\  }}t        ||||      \  }}|d|||d}|j                  }|j                  ||| j                  |      \  }}|j                  j                  | j                        }|j                  }|s| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      }|j                  ||| j                        \  }}d|j                  | j                  <   nFj                   | j                     j"                  }|j                   | j                     j$                  }|}|
d   }t'        j(                  ||gd      }t'        j(                  ||gd      }t+        j,                  | j.                  j0                  t2              } || ||||f| j4                  r| j6                  nd| j8                  d|\  }} |j:                  g |d j=                         }| j?                  |      }||d	d | f   }|d	| d f   }nd
\  }}|||fS )NrX   r&   )sincosr   Tdimrp   )r   scaling.NN) shaperC   q_projview	transposek_projv_projq_normk_normr1   self_attention_cacheupdater   
is_updatedr]   cross_attention_cachelayerskeysvaluestorchcatr   get_interfacer   _attn_implementationr4   trainingrM   r   reshape
contiguouso_proj)rc   r   r   r   r   r   r   rd   input_shapehidden_shapecross_input_shapecross_hidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsr   r   r   cross_key_statescross_value_statescross_key_sizeattention_interfaceattn_outputattn_weightsself_attn_weightscross_attn_weightss                                rf   r   zT5Gemma2MergedAttention.forward  s    $))#2.88b8$--8177<D0D"DdmmD {{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j& $'snUL#2#G#G ';'B'BL$..,($J
 )3377GJ$3$I$I!"*#{{+@AFFGYZddefhij!%-B!C!H!HI[!\!f!fghjk!l#{{+;<*7L7S7S$&8$..84 "4 >B**4>>:4;;DNNKPP!6!=!=dnn!M!T!T $*1-YY
,<=1E
yy,0B!CK(?(M(MKK,,.E)
 %8!	%
 /3mmD**LL	%
 	%
!\ *k));;;;FFHkk+. # ,S2BN?2B-B C!-cN?3C.C!D4>11-/AAAr|   r   )rr   rs   rt   ru   r;   rw   ra   r   Tensorr   r	   
LongTensorr   r   r   r   r   s   @rf   r   r      s    @1 c  7;26YB ||YB #5<<#=>	YB
  %||d2YB  %||YB -t3YB ((4/YB -.YB 
u||U\\D0%2E2LL	MYBr|   r   rO   r   c           
      T     dt         dt         dt         dt         dt        f
 fd}|S )zL
    This creates uni/bidirectional attention mask with sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c                 t    	r
d}}n
dz   dz  
dz  dz   }}||z
  }|dk\  ||k  z  }|dk  | |k  z  }||z  S )Nr   rX   r&   r[   )r  r  r  r	  left_window_sizeright_window_sizedist	left_mask
right_maskr   rO   s            rf   
inner_maskz0sliding_window_mask_function.<locals>.inner_maskh  sp    2@!/4BQ4F13L~bcNcfgNg/v~QY4*:#:;	QhD5+<#<=
:%%r|   )rw   r`   )rO   r   r  s   `` rf   sliding_window_mask_functionr  c  s3    
	&c 	&S 	& 	&c 	&d 	& r|   c                       e Zd Zy)T5Gemma2EncoderLayerNr   r[   r|   rf   r  r  v  r   r|   r  c                   0    e Zd ZdZdef fdZ	 	 	 	 	 	 ddej                  deej                  ej                  f   dej                  dz  dej                  dz  d	e
dz  d
edz  dej                  dz  dej                  dz  dej                  fdZ xZS )T5Gemma2DecoderLayerzFDecoder sub-layer: merged attention instead of vanilla self-attention.r   c                 J    t         |   ||       t        ||      | _        y )N)r   r   )r   ra   r   	self_attnr   s      rf   ra   zT5Gemma2DecoderLayer.__init__}  s&    + 1
r|   Nr   r   r   position_idsr   rH   r   r   r   c	                 F   |}
| j                  |      } | j                  d||||||||d|	\  }}}| j                  |      }|
| j                  |      z   }|}
| j	                  |      }| j                  |      }| j                  |      }|
| j                  |      z   }|S )N)r   r   r   r  r   rH   r   r   r[   )pre_self_attn_layernormr  post_self_attn_layernormr   pre_feedforward_layernormmlppost_feedforward_layernorm)rc   r   r   r   r  r   rH   r   r   rd   residual_s               rf   r   zT5Gemma2DecoderLayer.forward  s     !44]C,dnn 

' 3"7%+)"7

 

q! 55mD 4<<#>> 66}E/77F 4<<#>>r|   )NNNFNN)rr   rs   rt   ru   rw   ra   r   r  r   r  r	   r`   FloatTensorr   r   r   s   @rf   r  r  z  s    P
# 
 6:046:!&2659"||" #5<<#=>"  %||d2	"
 &&-" -t3" $;" ((4/"  %||d2" 
		"r|   r  c                       e Zd Zy)T5Gemma2LMHeadNr   r[   r|   rf   r#  r#    r   r|   r#  c                       e Zd Zy)T5Gemma2ClassificationHeadNr   r[   r|   rf   r%  r%    r   r|   r%  c                   $     e Zd Zdef fdZ xZS )T5Gemma2MultiModalProjectorr   c                 $    t         |   |       y r   r   r   s     rf   ra   z$T5Gemma2MultiModalProjector.__init__  s     r|   )rr   rs   rt   r~   ra   r   r   s   @rf   r'  r'    s    !4 ! !r|   r'  c                   b     e Zd ZdZ	 	 d
dededededef
 fdZdej                  f fd	Z	 xZ
S )T5Gemma2TextScaledWordEmbeddingzCT5Gemma2 Embedding: override to add eoi token embedding separately.num_embeddingsembedding_dimpadding_idxembed_scaler   c                     t         |   ||||       || _        t        j                  t        j                  | j                              | _        y r   )	r   ra   r   r   	Parameterr   zerosr,  eoi_embedding)rc   r+  r,  r-  r.  r   r   s         rf   ra   z(T5Gemma2TextScaledWordEmbedding.__init__  s@     	[Q.\\%++d6H6H*IJr|   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  }| j                  j                  |j
                        ||| j                  k(  <   |S r   )r   r   r.  toweightdtyper2  r   )rc   r3  input_embeddingsr   s      rf   r   z'T5Gemma2TextScaledWordEmbedding.forward  sf     7?958H8H8K8KDKKL]L]8^^>B>P>P>S>STdTjTj>kd&:&::;r|   )g      ?  )rr   rs   rt   ru   rw   ry   ra   r   r  r   r   r   s   @rf   r*  r*    s^    M !&
K
K 
K 	
K
 
K 
K     r|   r*  c                       e Zd ZU eed<   dZdZdZdZg dZ	e
eg eedd       eedd       eed	d
      gdZd Zd Zy)T5Gemma2PreTrainedModelr   modelTF)r  r  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadrX   r  )index
layer_namer&   
cross_attn)r   
attentionsc                    t        j                  | |       t        |t              r t	        j
                  |j                         y t        |t              rJt	        j
                  |j                         t	        j                  |j                  |j                         y t        |t              r|j                  j                  j                  d   dz  }t	        j                   |j                  j                  d| j"                  j$                  |z         t'        |j                  d      rA|j                  j(                  *t	        j
                  |j                  j(                         y y y d|j*                  j,                  v r t	        j
                  |j                         y t        |t.              r|j0                  D ]  }|j2                  }|j4                  |   dk7  rt6        |j4                  |      } ||j"                  |      \  }}t	        j8                  t;        || d	      |       t	        j8                  t;        || d
      |        y y )Nr   g      rp   )meanstdbiasRMSNormdefault)r   	_inv_freq_original_inv_freq)r   _init_weightsr   r'  initzeros_mm_input_projection_weightr*  r2  	constant_r.  scalar_embed_scaler%  out_projr6  r   normal_r   rF   hasattrrG  r   rr   r   rP   r   	rope_typer   copy_r   )rc   modulescaler   rope_init_fncurr_inv_freqr   s          rf   rL  z%T5Gemma2PreTrainedModel._init_weights  s   %%dF3f9:KK99: ?@KK,,-NN6--v/H/HI :;OO**003t;ELL//ct{{?\?\_d?dev/FOO4H4H4TFOO001 5U/ &**333KK& 78$00 ^
%EE##J/9<#6v7G7G
7S#TL#/*#U q

76j\+CDmT

76j\9K+LM}]^ 9r|   c                 <   | j                   j                  }|j                  }|j                  }|t	        d      |j                  |j                        }|dddf   j                         |dddf<   ||d<   |t	        d      |j                  |dk(  |       |S )	z
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        Nz:self.model.config.decoder.bos_token_id has to be defined. .r   rX   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	r   r   rK   rI   r   	new_zerosr   clonemasked_fill_)rc   r3  decoder_configdecoder_start_token_idrI   shifted_input_idss         rf   %prepare_decoder_input_ids_from_labelsz=T5Gemma2PreTrainedModel.prepare_decoder_input_ids_from_labels  s     ,,!/!<!<%22!)YZZ &//	@%.sCRCx%8%>%>%@#qr'"$:&!XYY 	&&'8D'@,O  r|   N)rr   rs   rt   r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_flex_attn_no_split_modulesr  r  r$   r   r   _can_record_outputsrL  rb  r[   r|   rf   r;  r;    st    &*# ! /0DE0kR2!T2!U
^0!r|   r;  c                       e Zd ZU eed<   eedZ	 ddedef fdZ	e
ee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	ej                   dz  d
ej                  dz  dee   defd                     Z xZS )T5Gemma2TextEncoderr   )rC  r   r   c           	      ^   t         |   |       |j                  | _        |j                  | _        t        |j                  |j                  | j                  |j                  dz  |      | _        t        |j                  |j                        | _
        d| _        t        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t        j$                  |j&                        | _        t+        |      | _        | j/                          y c c}w Ng      ?)r.  r   )epsF)r   ra   rI   r-  r=   r*  r>   embed_tokensr   rG   normgradient_checkpointingr   
ModuleListr_   r@   r  r   r   r   r   r   
rotary_emb	post_initrc   r   r   r   r   s       rf   ra   zT5Gemma2TextEncoder.__init__$  s    
 	 !.. ++;**C/+
 $F$6$6F<O<OP	&+#mmFKFLdLdFef!&)4f
 zz&"5"561&9 	 g   D*Nr3  attention_maskr  inputs_embedstoken_type_idsrd   r   c           
      0   |d u |d uz  rt        d      |j                  dd        || j                  |      }|>t        j                  d|j
                  d   |j                        j                  d      }t        |x}t              sJ| j                  ||d}t        di |t        di |dt        | j                  j                  d	      id
}|}	i }
| j                  j                  D ]  }| j                  |	||      |
|<    | j!                  |	      }	| j"                  d | j                  j$                   D ](  } ||	|
|j&                     ||j&                     |fi |}	* | j)                  |	      }	| j!                  |	      }	t+        |	      S )N:You must specify exactly one of input_ids or inputs_embedsr   r   rX   r   )r   rx  rw  and_mask_functionF)r   rZ   rY   )last_hidden_stater[   )r   popro  r   aranger   r   	unsqueezer   r{   r   r   r  rO   rP   rs  r   r   r@   attention_typerp  r   )rc   r3  rw  r  rx  ry  rd   self_attn_mask_mappingmask_kwargsr   r   r   layer_modules                rf   r   zT5Gemma2TextEncoder.forward@  s    -t";<YZZ 	

$d+  --i8M <<=+>+>q+A-J^J^_iijklLNB0DI++!."0K #<"Jk"J%> &!&&B4;;C]C]in&o&&" & !++11 	gJ.2oom\[e.f
+	g ]3 KK(G$++*G*GH 	L(#L$?$?@&|'B'BC	
 M	 		-0]3+
 	
r|   r9  )NNNNN)rr   rs   rt   r;   rc  r   r  ri  rw   ra   r#   r%   r   r   r  r  r!  r   r   r   r   r   r   s   @rf   rk  rk    s    +-  '" 8   .2.20426.2<
##d*<
 t+<
 &&-	<

 ((4/<
 t+<
 +,<
 
<
    <
r|   rk  c                       e Zd ZU eed<   	 ddedef fdZd Zd Ze	e
dej                  dee   deez  fd	              Zd
ej$                  dz  dej&                  dz  dej&                  fdZe
	 	 	 	 	 	 dd
ej$                  dz  dej                  dz  dej$                  dz  dej&                  dz  dej&                  dz  dej                  dz  dee   defd       Z xZS )T5Gemma2Encoderr   r   c                     t         |   |       t        j                  |j                  |      | _        t        j                  |j                        | _	        t        |      | _        | j                          y )N)r   r   )r   ra   rk  _from_configr   
text_modelr'   from_configr   vision_towerr'  multi_modal_projectorrt  )rc   r   r   r   s      rf   ra   zT5Gemma2Encoder.__init__  sb    
 	 -::6;M;M_n:o%119M9MN%@%H" 	r|   c                 6    | j                   j                         S r   )r  get_input_embeddingsrc   s    rf   r  z$T5Gemma2Encoder.get_input_embeddings  s    3355r|   c                 8    | j                   j                  |      S r   )r  set_input_embeddingsrc   new_embeddingss     rf   r  z$T5Gemma2Encoder.set_input_embeddings  s    33NCCr|   pixel_valuesrd   r   c                 x     | j                   d|dd|}|j                  }| j                  |      }||_        |S )NT)r  return_dictr[   )r  r  r  pooler_output)rc   r  rd   vision_outputsr  image_featuress         rf   get_image_featuresz"T5Gemma2Encoder.get_image_features  sM     +**aRVaZ`a*<<334EF'5$r|   r3  Nrx  r  c                 D   | j                   j                  }|f|t        d      | | j                         t	        j
                  |t        j                  |j                              k(  }|j                  d      }n||k(  }|j                         }|j                  d      j                  |      j                  |j                        }|j                  d   |j                  d   z  }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        z9Either `input_ids` or `inputs_embeds` has to be provided.)r7  r   r   r   rX   z6Image features and image tokens do not match: tokens: z, features )r   r   r   r  r   tensorlongr   allsumr  	expand_asr5  r   r"   numel)rc   r3  rx  r  r   special_image_maskn_image_tokensn_image_featuress           rf   get_image_placeholder_maskz*T5Gemma2Encoder.get_image_placeholder_mask  s"    33$ !\]]!.2M$2K2K2M^5::mFZFZ[3 " "4!7!7!;!*n!<+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL,-3359M9M9OOD^DTT_`p_qr	
 "!r|   rw  r  ry  c                 j   |d u |d uz  rt        d      || j                  j                  |      }|i| j                  |d      j                  }|j                  |j                  |j                        }| j                  |||      }	|j                  |	|      } | j                  d|||d|}
|
S )Nr{  T)r  )rx  r  )rx  rw  r  r[   )
r   r  ro  r  r  r5  r   r7  r  masked_scatter)rc   r3  rw  r  rx  r  ry  rd   r  
image_maskoutputss              rf   r   zT5Gemma2Encoder.forward  s     -t";<YZZ  OO88CM#!44\t4TbbN+..}/C/C]EXEXYN88~ 9 J *88^TM!$// 
')%
 	
 r|   r  )NNNNNN)rr   rs   rt   r~   rc  rw   ra   r  r  r    r   r   r  r   r   r   r   r  r  r!  r  r   r   r   r   s   @rf   r  r    sk   !!
  '% 6D 
!LL
4:;M4N
	+	+
  
"##d*" ((4/" ))	"<  .2.2042615.2!##d*! t+! &&-	!
 ((4/! ''$.! t+! +,! 
! !r|   r  c                       e Zd ZU eed<    eed       eed      edZddede	f fdZ
eee	 	 	 	 	 	 	 	 	 dd	ej                  dz  d
ej                   dz  dej                  dz  dedz  dej$                  dz  dedz  dej                  dz  dej                   dz  dej                   dz  dee   defd                     Z xZS )T5Gemma2Decoderr   rX   )r@  r&   )rC  cross_attentionsr   r   c           	      ^   t         |   |       |j                  | _        |j                  | _        t        |j                  |j                  |j                  |j                  dz  |      | _        t        |j                  |j                        | _
        d| _        t        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t        j$                  |j&                        | _        t+        |      | _        | j/                          y c c}w rm  )r   ra   rI   r-  r=   r*  r>   ro  r   rG   rp  rq  r   rr  r_   r@   r  r   r   r   r   r   rs  rt  ru  s       rf   ra   zT5Gemma2Decoder.__init__  s     !.. ++;**C/+
 $F$6$6F<O<OP	&+#mmFKFLdLdFef!&)4f
 zz&"5"561&9	 grv  Nr3  rw  r  r   rx  rH   r   r   encoder_attention_maskrd   r   c
                    |d u |d uz  rt        d      |t        d      || j                  |      }| j                  s,|r*|(t        t	        | j
                        t	                     }|F||j                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      }t        |x}t              s>| j
                  |||||j                  nd |d}d |d	<   t        di |t!        di |d
}t        |	x}t              s-| j
                  ||	|d d d}dt        di |dt#        |	      ii}t        j$                  |d   |d   gd      t        j$                  |d   |d   gd      d
}|}i }| j
                  j&                  D ]  }| j)                  |||      ||<    | j+                  |      }| j,                  d | j
                  j.                   D ],  } ||||j0                     ||j0                     |||||fi |
}. | j3                  |      }| j+                  |      }t5        ||      S )Nr{  z0`encoder_hidden_states` must be given in decoderr  r   rX   r|  )r   rx  rw  r   r   r  c                  L    t        j                  dt         j                        S )NT)r7  )r   r  r`   )argss    rf   <lambda>z)T5Gemma2Decoder.forward.<locals>.<lambda>7  s    U\\$V[V`V`=a r|   r}  r~  rZ   or_mask_functionr   r   rY   )r  r   r[   )r   ro  r   r	   r   r   get_seq_lengthr   r  r   r   r  r   r{   r   r2   r3   r9   r   rP   rs  r   r   r@   r  rp  r   )rc   r3  rw  r  r   rx  rH   r   r   r  rd   past_seen_tokensr  r  cross_attn_mask_mappingmerged_attn_mask_mappingr   r   r   r  s                       rf   r   zT5Gemma2Decoder.forward  s     -t";<YZZ (OPP  --i8M}}/F1,dkk2RT`TbcO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6LNB0DI++!."0"0KZKf?#G#Glp ,K 0bK+,"4"C{"C%F%U%U&"
 5KK1TR++!6"8"0#' $K !"4 #!#%@AW%X#'# $ii'(89;RSc;dekm "''(;<>UVf>ghnp"	$
  & !++11 	gJ.2oom\[e.f
+	g ]3 KK(G$++*G*GH 	L(#L$?$?@()D)DE%
 
M	 		-0]38++
 	
r|   r  )	NNNNNNNNN)rr   rs   rt   r   rc  r$   r   r  ri  rw   ra   r#   r%   r   r   r  r  r	   r!  r`   r   r   r   r   r   r   s   @rf   r  r    sN   !!$%<AF*+B!L-4 s ,   .2.2046:26!%26596:h
##d*h
 t+h
 &&-	h

 -t3h
 ((4/h
 $;h
 ((4/h
  %||d2h
 !&t 3h
 +,h
 
3h
    h
r|   r  c            !           e Zd ZdddZdef fdZd Zd Zd Zd	 Z	e
e	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  ded
z  ded
z  dej$                  d
z  dej$                  d
z  ded
z  dej                  d
z  dee   defd              Z xZS )T5Gemma2Modelz&encoder.text_model.embed_tokens.weightz-encoder.text_model.embed_tokens.eoi_embedding)zdecoder.embed_tokens.weightz"decoder.embed_tokens.eoi_embeddingr   c                     t         |   |       t        |j                  |j                        | _        t        |j                  |j                        | _        | j                          y r   )r   ra   r  r   r   r  r   rt  r   s     rf   ra   zT5Gemma2Model.__init__}  sL      'v~~v7M7MN&v~~v7M7MNr|   c                     | j                   S r   )r   r  s    rf   get_encoderzT5Gemma2Model.get_encoder      ||r|   c                     | j                   S r   r   r  s    rf   get_decoderzT5Gemma2Model.get_decoder  r  r|   c                 6    | j                   j                         S r   )r   r  r  s    rf   r  z"T5Gemma2Model.get_input_embeddings  s    ||0022r|   c                 8    | j                   j                  |      S r   )r   r  r  s     rf   r  z"T5Gemma2Model.set_input_embeddings  s    ||00@@r|   Nr3  r  rw  r  decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr   rx  decoder_inputs_embedsrH   r   rd   r   c                 P   | | j                   d||||
|dd|}|j                  } | j                  d|||||	||||dd
|}t        |j                  |j                  |j
                  |j                  |j                  |j                  |j
                  |j                        S )aX  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        T)r3  rw  r  rx  r  r  )
r3  rw  r  rx  r   r   r  rH   r   r  )r  r   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentionsr[   )r   r  r   r   r   r   rC  r  )rc   r3  r  rw  r  r  r  r  r  r   rx  r  rH   r   rd   r   decoder_outputss                    rf   r   zT5Gemma2Model.forward  s    8 "*dll #-)+)  O !0 A A '$,, 
'1-/+"7#1)
 
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r|   )NNNNNNNNNNNNN)rr   rs   rt   _tied_weights_keysr   ra   r  r  r  r  r    r   r   r  r!  
BoolTensorr   r	   r  r`   r   r   r   r   r   r   s   @rf   r  r  v  s    (P.]
~ 3A  .215370459:>8<266:-159!%26#?
 ##d*?
 ''$.	?

 ))D0?
 &&-?
 !++d2?
 !& 0 04 7?
 $..5?
 )4/?
 -t3?
 ||d*?
  %||d2?
  $;!?
" ((4/#?
$ +,%?
& 
'?
  ?
r|   r  c            &           e Zd ZddiZddiZddgdgfiZdef fdZd	 Zd
 Z	d Z
d Zd Zd Zeedej"                  dee   deez  fd              Zed        Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d*dej2                  dz  dej4                  dz  dej4                  dz  dej2                  dz  dej2                  dz  dej6                  dz  dej2                  dz  dedz  dedz  dej4                  dz  dej4                  dz  dej2                  dz  d edz  d!ej2                  dz  d"eej"                  z  dee   deej4                     e z  f"d#              Z!d$e"d%e#d&e$d'ed(edef fd)Z% xZ&S )+ T5Gemma2ForConditionalGenerationzlm_head.out_proj.weightz,model.encoder.text_model.embed_tokens.weightzlm_head.out_projcolwise_gather_outputr   logitsr   c                    t         |   |       t        |      | _        |j                  j
                  | _        t        |j                  j                  | j
                        | _        d| _	        | j                          y )NForMaskedLM)r   ra   r  r<  r   r=   r#  r>   lm_head	loss_typert  r   s     rf   ra   z)T5Gemma2ForConditionalGeneration.__init__  sZ     "6*
 ..33%fnn&@&@$//R&r|   c                 &    || j                   _        y r   r  rR  r  s     rf   set_output_embeddingsz6T5Gemma2ForConditionalGeneration.set_output_embeddings  s     .r|   c                 .    | j                   j                  S r   r  r  s    rf   get_output_embeddingsz6T5Gemma2ForConditionalGeneration.get_output_embeddings  s    ||$$$r|   c                 6    | j                   j                         S r   r<  r  r  s    rf   r  z5T5Gemma2ForConditionalGeneration.get_input_embeddings      zz..00r|   c                 :    | j                   j                  |       y r   r<  r  rc   values     rf   r  z5T5Gemma2ForConditionalGeneration.set_input_embeddings      

''.r|   c                 6    | j                   j                         S r   )r<  r  r  s    rf   r  z,T5Gemma2ForConditionalGeneration.get_encoder      zz%%''r|   c                 6    | j                   j                         S r   )r<  r  r  s    rf   r  z,T5Gemma2ForConditionalGeneration.get_decoder  r  r|   r  rd   r   c                 D     | j                         j                  |fi |S r   )r  r  )rc   r  rd   s      rf   r  z3T5Gemma2ForConditionalGeneration.get_image_features  s%    
 5t!44\LVLLr|   c                 6    | j                         j                  S r   )r  r  r  s    rf   r  z-T5Gemma2ForConditionalGeneration.vision_tower   s    !...r|   Nr3  rw  r  r  r  r  r  r   rx  r  labelsrH   r   logits_to_keepc                    |||| j                  |      } | j                  d|||||||||	|
|||d|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }| j                  j                  }|j                  3||j                  z  }t        j                  |      }||j                  z  }d}| | j                  ||| j                  fi |}t        |||j                  |j                   |j"                  |j$                  |j&                  |j(                  |j*                  	      S )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)r3  r  rw  r  r  r  r  r  r   rx  r  rH   r   )	lossr  r   r  r  r  r  r   r  r[   )rb  r<  r  r   rw   slicer  r   r   rQ   r   tanhloss_functionr=   r   r   r  r  r  r  r   r  )rc   r3  r  rw  r  r  r  r  r  r   rx  r  r  rH   r   r  rd   r  r   slice_indicesr  r_  r  s                          rf   r   z(T5Gemma2ForConditionalGeneration.forward  su   D "3";@U@] $ J J6 R.8djj /
%)%/#9!5++'"7)/
 /
" (998B>SV8W~ot4]kmA}a,?@A,,11=nDDDFZZ'FnDDDF%4%%ffdooPPD+;;"1"G"G.AA,==&5&O&O"1"G"G.AA

 
	
r|   generation_configmodel_kwargsgeneration_mode
batch_sizemax_cache_lengthc           	      P   t         |   |||||       |j                  du ry|j                  }|d}nd|j                  v }t	        j
                  | j                  j                  d            }|`|`	||d}	|j                  d      }
|
t        |
t              st        d      t        |
j                        d	kD  r|
j                  j                  d	      ryt!        |
j"                        }|t$        k(  r|d
   d	   j&                  d   |	d<    |di |	|
_        n=t        t)        di | j                  j                  d      |dt)                     |d<   t+        | d      r=| j,                  0t        | j,                  t              st        d      |d   | _        yyy)zMOverride cache preparation to support T5Gemma2-specific EncoderDecoder Cache.FN	offloadedTr  )r   
offloadingr   zaThe `past_key_values` in `model_kwargs` must be of type `EncoderDecoderCache` for T5Gemma2 model.r   r  rX   max_cache_len_cachezLThe internal cache must be of type `EncoderDecoderCache` for T5Gemma2 model.r[   )r   _prepare_cache_for_generationrH   cache_implementationcopydeepcopyr   get_text_configrO   rP   r]   r   r	   r   lenr   r   r   r
   r   r   rT  r  )rc   r  r  r  r  r  r  offload_cachecross_attn_configcross_attn_cache_kwargsr   cross_attn_clsr   s               rf   r   z>T5Gemma2ForConditionalGeneration._prepare_cache_for_generationW  s    	-	
 &&%/0EE'!M'+<+Q+QQM !MM$++*E*Ed*E*ST ,) ('#

 '**+<=&o/BC w 
 ?--.27Q7Q7U7UVW7X!/"G"GHN,;GHY;Z[\;];c;cde;f'84B4]E\4]O1 /B "&++"="=d"="K&3 /L*+ 4"t{{'>dkk+>? !opp&'89DK	 (?"r|   )NNNNNNNNNNNNNNr   )'rr   rs   rt   r  _tp_plan_pp_planr   ra   r  r  r  r  r  r  r    r   r   r  r   r   r   r   r  propertyr  r  r!  r  r   r	   r`   rw   r   r   r   r{   r   r   r   r   s   @rf   r  r    s   !#Q #$;<H"o%6
$CDH~ /%1/(( M!LLM4:;M4NM	+	+M  M
 / /  .215370459:>8<266:26:>*.!%26-.'O
 ##d*O
 ''$.	O

 ))D0O
 &&-O
 !++d2O
 !& 0 04 7O
 $..5O
 )4/O
 -t3O
 ((4/O
  %0047O
    4'!O
" $;#O
$ ((4/%O
& ell*'O
( +,)O
* 
u  	!O	3+O
  O
bI:+I: I: (	I:
 I: I: 
I: I:r|   r  c                       e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  de	j                  dz  dee   defd              Z xZS )!T5Gemma2ForSequenceClassificationr   c                 "   t         |   |       |j                  | _        |j                  j                  | _        t        |      | _        t        |dd      }t        | j                  | j                  |      | _	        | j                          y Nr   g?r   ra   
num_labelsr   r>   r  r<  r   r%  scorert  rc   r   classifier_dropoutr   s      rf   ra   z*T5Gemma2ForSequenceClassification.__init__  sp      ++!>>55"6*
$V-FL/0@0@$//Sef
r|   c                 6    | j                   j                         S r   r  r  s    rf   r  z6T5Gemma2ForSequenceClassification.get_input_embeddings  r  r|   c                 :    | j                   j                  |       y r   r  r  s     rf   r  z6T5Gemma2ForSequenceClassification.set_input_embeddings  r  r|   Nr3  r  rw  r  r  r  r  r  rx  r  r  rd   r   c                 v   |	|
#t        d| j                  j                   d      |t        d      || j	                  |      } | j
                  |f||||||||	|
dd
|}|j                  }|j                  }|j                  }| j                  |      }|j                  d   }|| j                  j                  k7  j                  |j                  t        j                         }t        j"                  |j                  d   |j                  t        j                   	      }||z  j%                  d      }t        j&                  ||j                  d   d
z
        }|t        j"                  ||j                        |f   }d}|| j)                  |||| j                        }t+        ||||      S )  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N8Passing input embeddings is currently not supported for .You have to specify input_idsF
r  rw  r  r  r  r  r  rx  r  rH   r   r   )r   r7  rX   )maxr|  )r  r  pooled_logitsr   r  r  r   rC  )NotImplementedErrorr   rr   r   rb  r<  r  r  r  r  r   r   rI   r5  r   r   int32r  argmaxclampr  r   )rc   r3  r  rw  r  r  r  r  r  rx  r  r  rd   r  r  r   rC  r  r  non_pad_masktoken_indiceslast_non_pad_tokenr  r  s                           rf   r   z)T5Gemma2ForSequenceClassification.forward  s   4 $(=(I%J4>>KbKbJccde  <==$ $ J J9 U&0djj'
%)%/#9!5+'"7'
 '
 $5555//
-.__Q'
)T[[-E-EEII&--Y^YdYde%6%<%<R%@^c^i^ij+l:BB2F"[[);ARAXAXY[A\_`A`au||Jv}}MOaab%%VFR_hlhshs%tD' '!	
 	
r|   NNNNNNNNNNN)rr   rs   rt   r   ra   r  r  r    r   r   r  r!  r  r   r   r   r   r   r   r   s   @rf   r  r    s\   	~ 	1/  .215.204596:8<2626:>*.J
##d*J
 ''$.J
 t+	J

 &&-J
 !++d2J
 !&t 3J
 $..5J
 )4/J
 ((4/J
  %0047J
   4'J
 +,J
 
"J
  J
r|   r  c                       e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  de	j                  dz  dee   defd              Z xZS )T5Gemma2ForTokenClassificationr   c                 "   t         |   |       |j                  | _        |j                  j                  | _        t        |      | _        t        |dd      }t        | j                  | j                  |      | _	        | j                          y r  r  r  s      rf   ra   z'T5Gemma2ForTokenClassification.__init__  sp      ++!>>55"6*
$V-FL/0@0@$//Sef
r|   c                 6    | j                   j                         S r   r  r  s    rf   r  z3T5Gemma2ForTokenClassification.get_input_embeddings  r  r|   c                 :    | j                   j                  |       y r   r  r  s     rf   r  z3T5Gemma2ForTokenClassification.set_input_embeddings  r  r|   Nr3  r  rw  r  r  r  r  r  rx  r  r  rd   r   c                    |	|
#t        d| j                  j                   d      |t        d      || j	                  |      } | j
                  |f||||||||	|
dd
|}|j                  }|j                  }|j                  }| j                  |      }d}|| j                  ||| j                        }t        ||||      S )r  Nr  r  r  Fr  r   )r!  r   rr   r   rb  r<  r  r  r  r  r  r   r   )rc   r3  r  rw  r  r  r  r  r  rx  r  r  rd   r  r  r   rC  r  r  s                      rf   r   z&T5Gemma2ForTokenClassification.forward  s   4 $(=(I%J4>>KbKbJccde  <==$ $ J J9 U&0djj'
%)%/#9!5+'"7'
 '
 $5555//
-.%%ffdkkBD$'!	
 	
r|   r(  )rr   rs   rt   r   ra   r  r  r    r   r   r  r!  r  r   r   r   r   r   r   r   s   @rf   r*  r*    s\   
~ 
1/  .215.204596:8<2626:>*.@
##d*@
 ''$.@
 t+	@

 &&-@
 !++d2@
 !&t 3@
 $..5@
 )4/@
 ((4/@
  %0047@
   4'@
 +,@
 
@
  @
r|   r*  )
r   r;   r~   r   r  r  r  r;  r  r*  )T)kr  collections.abcr   typingr   r   r   torch.nnr    r   rM  cache_utilsr   r	   r
   configuration_utilsr   r   
generationr   r   r   masking_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r    r!   r"   utils.genericr#   utils.output_capturingr$   r%   autor'   gemma3.configuration_gemma3r(   r)   gemma3.modeling_gemma3r*   r+   r,   r-   r.   r/   r0   r1   r2   r3   r4   siglipr5   t5gemma.modeling_t5gemmar6   r7   r8   r9   
get_loggerrr   r   r;   r~   r   r   r   r   r   r   r   rw   r  r  r  r#  r%  r'  r*  r;  rk  r  r  r  r  r  r*  __all__r[   r|   rf   <module>rF     sJ    $     & I I J K K 6 B   G F &  8 E  H    (  
		H	%D,)+; D,NL D,,.> D,NvJ% vJr	m 		) 	\3 \O `Bo `BF  &	. 	.. .b	] 		!: 	!"; !
 &C  * L!3 L! L!^b
1 b
Je- ePI
- I
X \
+ \
 \
~J:'> J:Z ^
(? ^
 ^
B U
%< U
 U
pr|   