
    qi                       d dl Z d dlmZ d dlmZ d dlZd dlmZ ddlm	Z
 ddlmZ ddlmZmZmZmZ ddlmZmZmZ dd	lmZmZ dd
lmZmZmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2m3Z3m4Z4m5Z5 ddl6m7Z7m8Z8 ddl9m:Z:m;Z; ddl<m=Z= ddl>m?Z?m@Z@mAZAmBZB  G d dej                        ZD G d dej                        ZE G d dej                        ZFd ZG ed      dRd        ZHd!ej                  d"eJd#ej                  fd$ZK	 	 	 dSd%ej                  d&ej                  d'ej                  d(ej                  d)ej                  dz  d*eLd+eLdz  d,eLdz  d#eMej                  ej                  f   fd-ZN eeH       G d. d/ej                               ZO eeH       G d0 d1ej                               ZP G d2 d3e       ZQ G d4 d5e       ZR G d6 d7ej                        ZS G d8 d9ej                        ZT G d: d;ej                        ZU G d< d=ej                        ZWe3 G d> d?e.             ZXdTd@eJd#efdAZY G dB dCeX      ZZ G dD dEeX      Z[d)ej                  dz  d#efdFZ\ G dG dHeX      Z]e3 G dI dJeX             Z^ G dK dLeXe      Z_e3 G dM dNeX             Z`e3 G dO dPeX             Zag dQZby)U    N)Callable)Optional   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCacheStaticCache)GenerationConfigGenerationMixinGenerationMode)use_kernel_func_from_hubuse_kernelized_func)create_bidirectional_maskcreate_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPoolingSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)maybe_autocastmerge_with_config_defaults)OutputRecordercapture_outputs   )	AutoModel   )T5Gemma2ConfigT5Gemma2DecoderConfigT5Gemma2EncoderConfigT5Gemma2TextConfigc                   <     e Zd Zddedef fdZd Zd Zd Z xZ	S )T5Gemma2RMSNormdimepsc                     t         |           || _        t        j                  t        j                  |            | _        y N)super__init__r4   nn	Parametertorchzerosweight)selfr3   r4   	__class__s      `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/t5gemma2/modeling_t5gemma2.pyr8   zT5Gemma2RMSNorm.__init__8   s.    ll5;;s#34    c                     |t        j                  |j                  d      j                  dd      | j                  z         z  S )Nr*   T)keepdim)r;   rsqrtpowmeanr4   )r>   xs     r@   _normzT5Gemma2RMSNorm._norm=   s4    5;;quuQx}}R}>IJJJrA   c                     | j                  |j                               }|d| j                  j                         z   z  }|j                  |      S )N      ?)rI   floatr=   type_as)r>   rH   outputs      r@   forwardzT5Gemma2RMSNorm.forward@   sC    AGGI& 3!2!2!445~~a  rA   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler=   shaper4   r>   s    r@   
extra_reprzT5Gemma2RMSNorm.extra_reprG   s'    ))*+6$((<<rA   )gư>)
__name__
__module____qualname__intrL   r8   rI   rO   rT   __classcell__r?   s   @r@   r2   r2   7   s&    5C 5e 5
K!=rA   r2   c                   *     e Zd Zdef fdZd Z xZS )T5Gemma2MLPconfigc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        t        j                  |j                        | _        y )NFbias)r7   r8   r]   hidden_sizeintermediate_sizer9   Linear	gate_projup_proj	down_projr   hidden_activationact_fnDropoutdropout_ratedropoutr>   r]   r?   s     r@   r8   zT5Gemma2MLP.__init__L   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556zz&"5"56rA   c                     | j                  | j                  |            | j                  |      z  }| j                  |      }| j	                  |      }|S r6   )rh   rd   re   rk   rf   )r>   rH   hidden_statesrf   s       r@   rO   zT5Gemma2MLP.forwardW   sH    DNN1$56aH]3NN=1	rA   )rU   rV   rW   r0   r8   rO   rY   rZ   s   @r@   r\   r\   K   s    	71 	7rA   r\   c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 	 ddedz  de	d   de
dz  dedz  d	ed
ef   f
d       Z ej                         edd              Z xZS )T5Gemma2RotaryEmbeddinginv_freqNr]   c                 v   t         |           |j                  | _        |j                  | _        || _        t        t        |j                              | _        i | _	        | j                  D ]  }| j
                  j                  |   }||d   | j                  |<   | j                  }| j                  |   dk7  rt        | j                  |      } || j
                  ||      \  }}| j                  | d|d       | j                  | d|j                         d       t        | | d|        y )	N	rope_typedefault
layer_type	_inv_freqF
persistent_original_inv_freq_attention_scaling)r7   r8   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr]   listsetlayer_typesrs   rope_parameterscompute_default_rope_parametersr   register_bufferclonesetattr)	r>   r]   devicerv   rope_paramsrope_init_fncurr_inv_freqcurr_attention_scalingr?   s	           r@   r8   z T5Gemma2RotaryEmbedding.__init__a   s8   "("@"@$*$B$B!F$6$6 78** 	UJ++55jAK")4[)ADNN:&%)%I%IL~~j)Y624>>*3MN4@fak4l1M1  J<y!9=UZ [  J</A!BMDWDWDYfk lDZL(:;=ST	UrA   r   ztorch.deviceseq_lenrv   returnztorch.Tensorc                     | j                   |   d   }t        | dd      xs | j                  | j                  z  }d}d|t	        j
                  d|dt        j                        j                  |t        j                        |z  z  z  }||fS )	a|  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
            layer_type (`str`, *optional*):
                The current layer type if the model has different RoPE parameters per type.
                Should not be used unless `config.layer_types is not None`

        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNrK   r   r*   dtyper   r   )	r   getattrra   num_attention_headsr;   arangeint64torL   )r]   r   r   rv   baser3   attention_factorrq   s           r@   r   z7T5Gemma2RotaryEmbedding.compute_default_rope_parametersx   s    2 %%j1,?fj$/c63E3EIcIc3c U\\!S!5;;?BB&X]XcXcBdgjjk
 )))rA   c                 N   t        | | d      }t        | | d      }|d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d	      5  |j                         |j                         z  j                  dd
      }	t        j                  |	|	fd      }
|
j                         |z  }|
j                         |z  }d d d        j	                  |j                        j	                  |j                        fS # 1 sw Y   AxY w)Nrw   r{   r   rC   r,   mpscpuF)device_typeenabledr*   r3   r   )r   rL   expandrR   r   r   
isinstancetypestrr&   	transposer;   catcossinr   )r>   rH   position_idsrv   rq   attention_scalinginv_freq_expandedposition_ids_expandedr   freqsembr   r   s                r@   rO   zT5Gemma2RotaryEmbedding.forward   sl    4J<y!9:#DZL8J*KL$T1d]399;BB<CUCUVWCXZ\^_`ccdedldlm ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfkUC 	0&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')//C'')//C		0 vvAGGv$cff177f&;;;	0 	0s   *A1FF$r6   NNNN)rU   rV   rW   r;   Tensor__annotations__r0   r8   staticmethodr   rX   r   rQ   rL   r   no_gradr   rO   rY   rZ   s   @r@   rp   rp   ^   s    llU1 U. ,0+/"!%	!*"T)!*(!* t!* $J	!*
 
~u$	%!* !*F U]]_<  <rA   rp   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..NrC   r*   r   )rR   r;   r   )rH   x1x2s      r@   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''rA   rotary_pos_embc                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkr   r   unsqueeze_dimq_embedk_embeds          r@   apply_rotary_pos_embr      sY    & --
&C
--
&C3w;q>C/0G3w;q>C/0GGrA   rn   n_repr   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r,   N)rR   r   reshape)rn   r   batchnum_key_value_headsslenr   s         r@   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTrA   modulequerykeyvalueattention_maskrk   scalingsoftcapc                 |   || j                   dz  }t        || j                        }	t        || j                        }
t        j                  ||	j                  dd            |z  }|||z  }t        j                  |      }||z  }|||z   }t        j                  j                  |dt        j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||
      }|j                  dd      j                         }||fS )N      r*   r   rC   )r3   r   )ptrainingr,   )r   r   num_key_value_groupsr;   matmulr   tanhr9   
functionalsoftmaxfloat32r   r   rk   r   
contiguous)r   r   r   r   r   rk   r   r   kwargs
key_statesvalue_statesattn_weightsattn_outputs                r@   eager_attention_forwardr      s    //4'3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL#g-zz,/#g-!#n4 ==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$rA   c                       e Zd ZdZdedef fdZ	 	 	 	 ddej                  dej                  dej                  dz  d	e	dz  d
ej                  dz  dee   deej                  ej                  dz  eej                     dz  f   fdZ xZS )T5Gemma2SelfAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr]   	layer_idxc                    t         |           t        |d      r|j                  |   nd | _        || _        || _        t        |d|j                  |j                  z        | _
        |j                  |j                  z  | _        |j                  dz  | _        | j
                  j                  | _        d| _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  | j                  z  |j                  |j&                        | _        | j
                  j0                  | _        | j                  dk(  r|j2                  nd | _        | j                  dk(  | _        t7        |j                  |j8                        | _        t7        |j                  |j8                        | _        y Nr   r   r   Fr_   sliding_attention)r3   r4   r7   r8   hasattrr   rv   r]   r   r   ra   r   r   r   r   query_pre_attn_scalarr   attention_dropout	is_causalr9   rc   attention_biasq_projk_projv_projo_projattn_logit_softcappingsliding_window
is_slidingr2   rms_norm_epsq_normk_normr>   r]   r   r?   s      r@   r8   zT5Gemma2SelfAttention.__init__     ;B6=;Y&,,Y7_c"
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+kk&H&H#7;J]7]f33cg//-@@%&//v?R?RS%&//v?R?RSrA   Nrn   position_embeddingsr   past_key_valuescache_positionr   r   c                 r   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }| j                  |	      }	| j                  |
      }
|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        j                  | j                  j                  t               } || |	|
||f| j"                  r| j$                  nd| j&                  | j(                  d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )NrC   r,   r*   r   r   r           )rk   r   r   )rR   r   r   viewr   r   r   r   r   r   updater   r   get_interfacer]   _attn_implementationr   r   r   r   r   r   r   r   )r>   rn   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r@   rO   zT5Gemma2SelfAttention.forward   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8
%
 /3mmD**LL..
%
 
%
!\ *k));;;;FFHkk+.L((rA   r   )rU   rV   rW   __doc__r0   rX   r8   r;   r   r   
LongTensorr!   r"   rQ   rO   rY   rZ   s   @r@   r   r      s    GT1 Tc TB -1.2(,26-)||-) #\\-) t+	-)
 -) ((4/-) +,-) 
u||U\\D0%2E2LL	M-)rA   r   c                   N    e Zd ZdZdedef fdZ	 	 ddej                  de	ej                  ej                  f   dej                  dz  d	ej                  d
e
dz  dej                  dz  dee   de	ej                  ej                  dz  e	ej                     dz  f   fdZ xZS )T5Gemma2MergedAttentionz6Merged self-attention and cross-attention for decoder.r]   r   c                    t         |           t        |d      r|j                  |   nd | _        || _        || _        t        |d|j                  |j                  z        | _
        |j                  |j                  z  | _        |j                  dz  | _        | j
                  j                  | _        d| _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  |j                  | j                  z  |j&                        | _        t#        j$                  |j                  | j                  z  |j                  |j&                        | _        | j
                  j0                  | _        | j                  dk(  r|j2                  nd | _        | j                  dk(  | _        t7        |j                  |j8                        | _        t7        |j                  |j8                        | _        y r   r   r   s      r@   r8   z T5Gemma2MergedAttention.__init__T  r   rA   Nrn   r   merged_attention_maskencoder_hidden_statesr   r   r   r   c                    |j                   d d }g |d| j                  }	|j                   d d }
g |
d| j                  }| j                  |      j                  |	      j	                  dd      }| j                  |      j                  |	      j	                  dd      }| j                  |      j                  |	      j	                  dd      }| j                  |      }| j                  |      }|\  }}t        ||||      \  }}|d|||d}|j                  }|j                  ||| j                  |      \  }}|j                  j                  | j                        }|j                  }|s| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      }|j                  ||| j                        \  }}d|j                  | j                  <   nFj                   | j                     j"                  }|j                   | j                     j$                  }|}|
d   }t'        j(                  ||gd      }t'        j(                  ||gd      }t+        j,                  | j.                  j0                  t2              } || ||||f| j4                  r| j6                  nd| j8                  d|\  }} |j:                  g |d j=                         }| j?                  |      }||d	d | f   }|d	| d f   }nd
\  }}|||fS )NrC   r,   r*   r   Tr   r   )rk   r   .NN) rR   r   r   r   r   r   r   r   r   r   self_attention_cacher   r   
is_updatedgetcross_attention_cachelayerskeysvaluesr;   r   r   r   r]   r  r   r   r   r   r   r   r   )r>   rn   r   r  r  r   r   r   r  r  cross_input_shapecross_hidden_shaper  r   r   r   r   r  r  r  r  cross_key_statescross_value_statescross_key_sizer  r   r   self_attn_weightscross_attn_weightss                                r@   rO   zT5Gemma2MergedAttention.forwardr  s    $))#2.88b8$--8177<D0D"DdmmD {{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&S#7jRUWZ#[ j& $'snUL#2#G#G ';'B'BL$..,($J
 )3377GJ$3$I$I!"*#{{+@AFFGYZddefhij!%-B!C!H!HI[!\!f!fghjk!l#{{+;<*7L7S7S$&8$..84 "4 >B**4>>:4;;DNNKPP!6!=!=dnn!M!T!T $*1-YY
,<=1E
yy,0B!CK(?(M(MKK,,.E)
 %8!	%
 /3mmD**LL	%
 	%
!\ *k));;;;FFHkk+. # ,S2BN?2B-B C!-cN?3C.C!D4>11-/AAArA   r  )rU   rV   rW   r  r0   rX   r8   r;   r   rQ   r
   r  r!   r   rO   rY   rZ   s   @r@   r
  r
  P  s    @T1 Tc TN 7;26YB ||YB #5<<#=>	YB
  %||d2YB  %||YB -t3YB ((4/YB -.YB 
u||U\\D0%2E2LL	MYBrA   r
  c                        e Zd ZdZdef fdZ	 	 	 ddej                  deej                  ej                  f   dz  dej                  dz  dej                  dz  d	eej                  f   f
d
Z xZS )T5Gemma2EncoderLayerzEncoder sub-layer.r   c                 D   t         |           |j                  | _        || _        || _        |j
                  |   | _        t        ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t#        j$                  |j&                        | _        y N)r]   r   r4   )r7   r8   ra   r]   r   r   attention_typer   	self_attnr2   r   pre_self_attn_layernormpost_self_attn_layernormr\   mlppre_feedforward_layernormpost_feedforward_layernormr9   ri   rj   rk   r   s      r@   r8   zT5Gemma2EncoderLayer.__init__  s    !--"$00;.
 (7v7I7IvObOb'c$(78J8JPVPcPc(d%v&)89K9KQWQdQd)e&*9&:L:LRXReRe*f'zz&"5"56rA   Nrn   r   r   r   r   c           	      >   |}| j                  |      } | j                  d||||d d|\  }}| j                  |      }|| j                  |      z   }|}| j	                  |      }| j                  |      }| j                  |      }|| j                  |      z   }|S )N)rn   r   r   r   r    r%  r$  r&  rk   r(  r'  r)  )r>   rn   r   r   r   r   residual_s           r@   rO   zT5Gemma2EncoderLayer.forward  s     !44]C)4>> 
' 3)% 
 
q 55mD 4<<#>> 66}E/77F 4<<#>>rA   )NNN)rU   rV   rW   r  rX   r8   r;   r   rQ   r  FloatTensorrO   rY   rZ   s   @r@   r  r    s    7# 7. IM.204|| #5<<#=>E t+	
 &&- 
u  !	"rA   r  c                   0    e Zd ZdZdef fdZ	 	 	 	 	 	 ddej                  deej                  ej                  f   dej                  dz  dej                  dz  d	e
dz  d
edz  dej                  dz  dej                  dz  dej                  fdZ xZS )T5Gemma2DecoderLayerzFDecoder sub-layer: merged attention instead of vanilla self-attention.r   c                 D   t         |           |j                  | _        || _        || _        |j
                  |   | _        t        ||      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t        |      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        t#        j$                  |j&                        | _        y r!  )r7   r8   ra   r]   r   r   r#  r
  r$  r2   r   r%  r&  r\   r'  r(  r)  r9   ri   rj   rk   r   s      r@   r8   zT5Gemma2DecoderLayer.__init__  s    !--"$00; 1
 (7v7I7IvObOb'c$(78J8JPVPcPc(d%v&)89K9KQWQdQd)e&*9&:L:LRXReRe*f'zz&"5"56rA   Nrn   r   r  r   r   	use_cacher   r  r   c	                 F   |}
| j                  |      } | j                  d||||||||d|	\  }}}| j                  |      }|
| j                  |      z   }|}
| j	                  |      }| j                  |      }| j                  |      }|
| j                  |      z   }|S )N)rn   r   r  r   r   r3  r   r  r+  r,  )r>   rn   r   r  r   r   r3  r   r  r   r-  r.  s               r@   rO   zT5Gemma2DecoderLayer.forward  s     !44]C,dnn 

' 3"7%+)"7

 

q! 55mD 4<<#>> 66}E/77F 4<<#>>rA   )NNNFNN)rU   rV   rW   r  rX   r8   r;   r   rQ   r  r
   boolr/  rO   rY   rZ   s   @r@   r1  r1    s    P7# 72 6:046:!&2659"||" #5<<#=>"  %||d2	"
 &&-" -t3" $;" ((4/"  %||d2" 
		"rA   r1  c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ	 xZ
S )
T5Gemma2LMHeadz.Head for language modeling (generation) tasks.ra   
vocab_sizer`   c                 \    t         |           t        j                  |||      | _        y )Nr_   )r7   r8   r9   rc   out_proj)r>   ra   r8  r`   r?   s       r@   r8   zT5Gemma2LMHead.__init__B  s"    		+zErA   rn   r   c                 (    | j                  |      }|S r6   )r:  )r>   rn   logitss      r@   rO   zT5Gemma2LMHead.forwardF  s    }-rA   )F)rU   rV   rW   r  rX   r5  r8   r;   r   rO   rY   rZ   s   @r@   r7  r7  ?  s?    8FC FS F FU\\ ell rA   r7  c                   j     e Zd ZdZd	dededef fdZdej                  dej                  fdZ	 xZ
S )
T5Gemma2ClassificationHeadz-Head for sentence-level classification tasks.ra   
num_labelsclassifier_dropout_ratec                     t         |           t        j                  |      | _        t        j
                  ||      | _        y )N)r   )r7   r8   r9   ri   rk   rc   r:  )r>   ra   r?  r@  r?   s       r@   r8   z#T5Gemma2ClassificationHead.__init__N  s1    zz$;<		+z:rA   rn   r   c                 J    | j                  |      }| j                  |      }|S r6   )rk   r:  )r>   rn   s     r@   rO   z"T5Gemma2ClassificationHead.forwardS  s$    ]3m4rA   )r   rU   rV   rW   r  rX   rL   r8   r;   r   rO   rY   rZ   s   @r@   r>  r>  K  s<    7;C ;S ;SX ;
U\\ ell rA   r>  c                   D     e Zd Zdef fdZdej                  fdZ xZS )T5Gemma2MultiModalProjectorr]   c                    t         |           t        j                  t	        j
                  |j                  j                  |j                  j                              | _	        t        |j                  j                  |j                  j                        | _        t        |j                  j                  |j                  j                  z        | _        t        |j"                  dz        | _        | j                   | j$                  z  | _        t        j(                  | j&                  | j&                        | _        y )Nr"        ?)kernel_sizestride)r7   r8   r9   r:   r;   r<   vision_configra   text_configmm_input_projection_weightr2   layer_norm_epsmm_soft_emb_normrX   
image_size
patch_sizepatches_per_imagemm_tokens_per_imagetokens_per_siderH  	AvgPool2davg_poolrl   s     r@   r8   z$T5Gemma2MultiModalProjector.__init__Z  s    *,,,KK,,88&:L:L:X:XY+
' !0  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[rA   vision_outputsc                    |j                   \  }}}|j                  dd      }|j                  ||| j                  | j                        }|j	                         }| j                  |      }|j                  d      }|j                  dd      }| j                  |      }t        j                  || j                        }|j                  |      S )Nr,   r*   )rR   r   r   rQ  r   rU  flattenrN  r;   r   rL  rM   )	r>   rV  
batch_sizer.  ra   reshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            r@   rO   z#T5Gemma2MultiModalProjector.forwardj  s    %3%9%9"
A{"0":":1a"@"9"A"AT%;%;T=S=S#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EtGfGf#g '//??rA   )	rU   rV   rW   r/   r8   r;   r   rO   rY   rZ   s   @r@   rE  rE  Y  s$    \4 \ @ell @rA   rE  c                   b     e Zd ZdZ	 	 d
dededededef
 fdZdej                  f fd	Z	 xZ
S )T5Gemma2TextScaledWordEmbeddingzCT5Gemma2 Embedding: override to add eoi token embedding separately.num_embeddingsembedding_dimpadding_idxembed_scaleeoi_token_indexc                     t         |   |||       || _        | j                  dt	        j
                  |      d       || _        t        j                  t	        j                  | j                              | _        y )Nrc  Frx   )r7   r8   scalar_embed_scaler   r;   tensorrd  r9   r:   r<   ra  eoi_embedding)r>   r`  ra  rb  rc  rd  r?   s         r@   r8   z(T5Gemma2TextScaledWordEmbedding.__init__  se     	D"-]ELL,ERWX.\\%++d6H6H*IJrA   	input_idsc                     t         |   |      | j                  j                  | j                  j
                        z  }| j                  j                  |j
                        ||| j                  k(  <   |S r6   )r7   rO   rc  r   r=   r   rh  rd  )r>   ri  input_embeddingsr?   s      r@   rO   z'T5Gemma2TextScaledWordEmbedding.forward  sf     7?958H8H8K8KDKKL]L]8^^>B>P>P>S>STdTjTj>kd&:&::;rA   )rK     rC  rZ   s   @r@   r_  r_  }  s^    M !&KK K 	K
 K K     rA   r_  c                        e Zd ZU eed<   dZdZg dZdgZdZ	dZ
dZdZdZeeg eedd	       eedd	       eed
d	      gdZdZ ej,                          fd       Zd Z xZS )T5Gemma2PreTrainedModelr]   modelT)r  r1  SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadr   Fr,   r$  )index
layer_namer*   
cross_attn)rn   
attentions)imagetextc                    t         |   |       t        |t              r t	        j
                  |j                         y t        |t              rJt	        j
                  |j                         t	        j                  |j                  |j                         y t        |t              r|j                  j                  j                  d   dz  }t	        j                   |j                  j                  d| j"                  j$                  |z         t'        |j                  d      rA|j                  j(                  *t	        j
                  |j                  j(                         y y y d|j*                  j,                  v r t	        j
                  |j                         y t        |t.              r|j0                  D ]  }|j2                  }|j4                  |   dk7  rt6        |j4                  |      } ||j"                  |      \  }}t	        j8                  t;        || d	      |       t	        j8                  t;        || d
      |        y y )Nr   r   r   )rG   stdr`   RMSNormrt   ru   rw   rz   )r7   _init_weightsr   rE  initzeros_rL  r_  rh  	constant_rc  rf  r>  r:  r=   rR   normal_r]   initializer_ranger   r`   r?   rU   rp   r   r   rs   r   copy_r   )r>   r   scalerv   r   r   r.  r?   s          r@   r|  z%T5Gemma2PreTrainedModel._init_weights  s   f%f9:KK99: ?@KK,,-NN6--v/H/HI :;OO**003t;ELL//ct{{?\?\_d?dev/FOO4H4H4TFOO001 5U/ &**333KK& 78$00 ^
%EE##J/9<#6v7G7G
7S#TL#/*#U q

76j\+CDmT

76j\9K+LM}]^ 9rA   c                 <   | j                   j                  }|j                  }|j                  }|t	        d      |j                  |j                        }|dddf   j                         |dddf<   ||d<   |t	        d      |j                  |dk(  |       |S )	z
        Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
        pad_token_id replacement for labels that were -100.
        This is a common preparation step for decoder inputs in sequence-to-sequence models.
        Nz:self.model.config.decoder.bos_token_id has to be defined. .rC   r,   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	r]   decoderbos_token_idpad_token_id
ValueError	new_zerosrR   r   masked_fill_)r>   ri  decoder_configdecoder_start_token_idr  shifted_input_idss         r@   %prepare_decoder_input_ids_from_labelsz=T5Gemma2PreTrainedModel.prepare_decoder_input_ids_from_labels  s     ,,!/!<!<%22!)YZZ &//	@%.sCRCx%8%>%>%@#qr'"$:&!XYY 	&&'8D'@,O  rA   )rU   rV   rW   r-   r   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr  r1  r(   r   r
  _can_record_outputsinput_modalitiesr;   r   r|  r  rY   rZ   s   @r@   rn  rn    s    &*# $5"5 !N!"&.0DE0kR2!T2!U
 )U]]_^ ^0!rA   rn  r   c           
      T     dt         dt         dt         dt         dt        f
 fd}|S )zL
    This creates uni/bidirectional attention mask with sliding window.
    	batch_idxhead_idxq_idxkv_idxr   c                 t    	r
d}}n
dz   dz  
dz  dz   }}||z
  }|dk\  ||k  z  }|dk  | |k  z  }||z  S )Nr   r,   r*   r+  )r  r  r  r  left_window_sizeright_window_sizedist	left_mask
right_maskr   r   s            r@   
inner_maskz0sliding_window_mask_function.<locals>.inner_mask  sp    2@!/4BQ4F13L~bcNcfgNg/v~QY4*:#:;	QhD5+<#<=
:%%rA   rX   r5  )r   r   r  s   `` r@   sliding_window_mask_functionr    s3    
	&c 	&S 	& 	&c 	&d 	& rA   c                       e Zd ZU eed<   eedZ	 ddedef fdZ	e
ee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	ej                   dz  d
ej                  dz  dee   defd                     Z xZS )T5Gemma2TextEncoderr]   )rv  rn   rd  c           	      ^   t         |   |       |j                  | _        |j                  | _        t        |j                  |j                  | j                  |j                  dz  |      | _        t        |j                  |j                        | _
        d| _        t        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t        j$                  |j&                        | _        t+        |      | _        | j/                          y c c}w NrG  )rc  rd  r"  F)r7   r8   r  rb  r8  r_  ra   embed_tokensr2   r   normgradient_checkpointingr9   
ModuleListrangenum_hidden_layersr  r  ri   rj   rk   rp   
rotary_emb	post_initr>   r]   rd  r   r?   s       r@   r8   zT5Gemma2TextEncoder.__init__  s    
 	 !.. ++;**C/+
 $F$6$6F<O<OP	&+#mmFKFLdLdFef!&)4f
 zz&"5"561&9 	 g   D*Nri  r   r   inputs_embedstoken_type_idsr   r   c           
      0   |d u |d uz  rt        d      |j                  dd        || j                  |      }|>t        j                  d|j
                  d   |j                        j                  d      }t        |x}t              sJ| j                  ||d}t        di |t        di |dt        | j                  j                  d	      id
}|}	i }
| j                  j                  D ]  }| j                  |	||      |
|<    | j!                  |	      }	| j"                  d | j                  j$                   D ](  } ||	|
|j&                     ||j&                     |fi |}	* | j)                  |	      }	| j!                  |	      }	t+        |	      S )N:You must specify exactly one of input_ids or inputs_embedsr   r   r,   r   )r]   r  r   and_mask_functionF)r   full_attentionr   )last_hidden_stater+  )r  popr  r;   r   rR   r   r   r   dictr]   r   r  r   r   r  rk   r  r  r#  r  r   )r>   ri  r   r   r  r  r   self_attn_mask_mappingmask_kwargsrn   r   rv   layer_modules                r@   rO   zT5Gemma2TextEncoder.forward!  s    -t";<YZZ 	

$d+  --i8M <<=+>+>q+A-J^J^_iijklLNB0DI++!."0K #<"Jk"J%> &!&&B4;;C]C]in&o&&" & !++11 	gJ.2oom\[e.f
+	g ]3 KK(G$++*G*GH 	L(#L$?$?@&|'B'BC	
 M	 		-0]3+
 	
rA   rl  )NNNNN)rU   rV   rW   r0   r   r   r  r  rX   r8   r'   r)   r#   r;   r  r   r/  r!   r"   r   rO   rY   rZ   s   @r@   r  r    s    +-  '" 8   .2.20426.2<
##d*<
 t+<
 &&-	<

 ((4/<
 t+<
 +,<
 
<
    <
rA   r  c                       e Zd ZU eed<   	 ddedef fdZd Zd Ze	e
dej                  dee   deez  fd	              Zd
ej$                  dz  dej&                  dz  dej&                  fdZe
	 	 	 	 	 	 dd
ej$                  dz  dej                  dz  dej$                  dz  dej&                  dz  dej&                  dz  dej                  dz  dee   defd       Z xZS )T5Gemma2Encoderr]   rd  c                     t         |   |       t        j                  |j                  |      | _        t        j                  |j                        | _	        t        |      | _        | j                          y )N)rd  r]   )r7   r8   r  _from_configrK  
text_modelr+   from_configrJ  vision_towerrE  multi_modal_projectorr  )r>   r]   rd  r?   s      r@   r8   zT5Gemma2Encoder.__init__f  sb    
 	 -::6;M;M_n:o%119M9MN%@%H" 	rA   c                 6    | j                   j                         S r6   )r  get_input_embeddingsrS   s    r@   r  z$T5Gemma2Encoder.get_input_embeddingst  s    3355rA   c                 8    | j                   j                  |      S r6   )r  set_input_embeddingsr>   new_embeddingss     r@   r  z$T5Gemma2Encoder.set_input_embeddingsw  s    33NCCrA   pixel_valuesr   r   c                 x     | j                   d|dd|}|j                  }| j                  |      }||_        |S )NT)r  return_dictr+  )r  r  r  pooler_output)r>   r  r   rV  r  image_featuress         r@   get_image_featuresz"T5Gemma2Encoder.get_image_featuresz  sM     +**aRVaZ`a*<<334EF'5$rA   ri  Nr  r  c                 D   | j                   j                  }|f|t        d      | | j                         t	        j
                  |t        j                  |j                              k(  }|j                  d      }n||k(  }|j                         }|j                  d      j                  |      j                  |j                        }|j                  d   |j                  d   z  }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        z9Either `input_ids` or `inputs_embeds` has to be provided.)r   r   rC   r   r,   z6Image features and image tokens do not match: tokens: z, features )r]   image_token_idr  r  r;   rg  longr   allsumr   	expand_asr   rR   r%   numel)r>   ri  r  r  r  special_image_maskn_image_tokensn_image_featuress           r@   get_image_placeholder_maskz*T5Gemma2Encoder.get_image_placeholder_mask  s"    33$ !\]]!.2M$2K2K2M^5::mFZFZ[3 " "4!7!7!;!*n!<+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL,-3359M9M9OOD^DTT_`p_qr	
 "!rA   r   r   r  c                 j   |d u |d uz  rt        d      || j                  j                  |      }|i| j                  |d      j                  }|j                  |j                  |j                        }| j                  |||      }	|j                  |	|      } | j                  d|||d|}
|
S )Nr  T)r  )r  r  )r  r   r   r+  )
r  r  r  r  r  r   r   r   r  masked_scatter)r>   ri  r   r   r  r  r  r   r  
image_maskoutputss              r@   rO   zT5Gemma2Encoder.forward  s     -t";<YZZ  OO88CM#!44\t4TbbN+..}/C/C]EXEXYN88~ 9 J *88^TM!$// 
')%
 	
 rA   r  )NNNNNN)rU   rV   rW   r/   r   rX   r8   r  r  r$   r#   r;   r   r!   r"   rQ   r   r  r  r/  r  r   rO   rY   rZ   s   @r@   r  r  c  sk   !!
  '% 6D 
!LL
4:;M4N
	+	+
  
"##d*" ((4/" ))	"<  .2.2042615.2!##d*! t+! &&-	!
 ((4/! ''$.! t+! +,! 
! !rA   r  c           
      P     dt         dt         dt         dt         dt        f
 fd}|S )z4
    This creates bidirectional attention mask.
    r  r  r  r  r   c                     %t        j                  dt         j                        S | |f   j                  t         j                        S )Nr+  r   )r;   onesr5  r   )r  r  r  r  r   s       r@   r  z/bidirectional_mask_function.<locals>.inner_mask  s=    !::b

33i/033EJJ??rA   r  )r   r  s   ` r@   bidirectional_mask_functionr    s9    
@c @S @ @c @d @
 rA   c                       e Zd ZU eed<    eed       eed      edZddede	f fdZ
eee	 	 	 	 	 	 	 	 	 dd	ej                  dz  d
ej                   dz  dej                  dz  dedz  dej$                  dz  dedz  dej                  dz  dej                   dz  dej                   dz  dee   defd                     Z xZS )T5Gemma2Decoderr]   r,   )rs  r*   )rv  cross_attentionsrn   rd  c           	      ^   t         |   |       |j                  | _        |j                  | _        t        |j                  |j                  |j                  |j                  dz  |      | _        t        |j                  |j                        | _
        d| _        t        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t        j$                  |j&                        | _        t+        |      | _        | j/                          y c c}w r  )r7   r8   r  rb  r8  r_  ra   r  r2   r   r  r  r9   r  r  r  r1  r  ri   rj   rk   rp   r  r  r  s       r@   r8   zT5Gemma2Decoder.__init__  s     !.. ++;**C/+
 $F$6$6F<O<OP	&+#mmFKFLdLdFef!&)4f
 zz&"5"561&9	 gr  Nri  r   r   r   r  r3  r   r  encoder_attention_maskr   r   c
                    |d u |d uz  rt        d      |t        d      || j                  |      }| j                  s,|r*|(t        t	        | j
                        t	                     }|F||j                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      }t        |x}t              s>| j
                  |||||j                  nd |d}d |d	<   t        di |t!        di |d
}t        |	x}t              s-| j
                  ||	|d d d}dt        di |dt#        |	      ii}t        j$                  |d   |d   gd      t        j$                  |d   |d   gd      d
}|}i }| j
                  j&                  D ]  }| j)                  |||      ||<    | j+                  |      }| j,                  d | j
                  j.                   D ],  } ||||j0                     ||j0                     |||||fi |
}. | j3                  |      }| j+                  |      }t5        ||      S )Nr  z0`encoder_hidden_states` must be given in decoderr  r   r,   r  )r]   r  r   r   r   r   c                  L    t        j                  dt         j                        S )NTr   )r;   rg  r5  )argss    r@   <lambda>z)T5Gemma2Decoder.forward.<locals>.<lambda>%  s    U\\$V[V`V`=a rA   r  r  r  or_mask_functionrC   r   r   )r  r   r+  )r  r  r   r
   r	   r]   get_seq_lengthr;   r   rR   r   r   r   r  r  r   r   r  r   r   r  rk   r  r  r#  r  r   )r>   ri  r   r   r   r  r3  r   r  r  r   past_seen_tokensr  r  cross_attn_mask_mappingmerged_attn_mask_mappingrn   r   rv   r  s                       r@   rO   zT5Gemma2Decoder.forward  s     -t";<YZZ (OPP  --i8M}}/F1,dkk2RT`TbcO!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6LNB0DI++!."0"0KZKf?#G#Glp ,K 0bK+,"4"C{"C%F%U%U&"
 5KK1TR++!6"8"0#' $K !"4 #!#%@AW%X#'# $ii'(89;RSc;dekm "''(;<>UVf>ghnp"	$
  & !++11 	gJ.2oom\[e.f
+	g ]3 KK(G$++*G*GH 	L(#L$?$?@()D)DE%
 
M	 		-0]38++
 	
rA   r  )	NNNNNNNNN)rU   rV   rW   r.   r   r(   r
  r1  r  rX   r8   r'   r)   r#   r;   r  r   r
   r/  r5  r!   r"   r   rO   rY   rZ   s   @r@   r  r    sN   !!$%<AF*+B!L-4 s ,   .2.2046:26!%26596:h
##d*h
 t+h
 &&-	h

 -t3h
 ((4/h
 $;h
 ((4/h
  %||d2h
 !&t 3h
 +,h
 
3h
    h
rA   r  c            !           e Zd ZdddZdef fdZd Zd Zd Zd	 Z	e
e	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  ded
z  ded
z  dej$                  d
z  dej$                  d
z  ded
z  dej                  d
z  dee   defd              Z xZS )T5Gemma2Modelz&encoder.text_model.embed_tokens.weightz-encoder.text_model.embed_tokens.eoi_embedding)zdecoder.embed_tokens.weightz"decoder.embed_tokens.eoi_embeddingr]   c                     t         |   |       t        |j                  |j                        | _        t        |j                  |j                        | _        | j                          y r6   )r7   r8   r  encoderrd  r  r  r  rl   s     r@   r8   zT5Gemma2Model.__init__k  sL      'v~~v7M7MN&v~~v7M7MNrA   c                     | j                   S r6   )r  rS   s    r@   get_encoderzT5Gemma2Model.get_encodert      ||rA   c                     | j                   S r6   r  rS   s    r@   get_decoderzT5Gemma2Model.get_decoderw  r  rA   c                 6    | j                   j                         S r6   )r  r  rS   s    r@   r  z"T5Gemma2Model.get_input_embeddingsz  s    ||0022rA   c                 8    | j                   j                  |      S r6   )r  r  r  s     r@   r  z"T5Gemma2Model.set_input_embeddings}  s    ||00@@rA   Nri  r  r   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsr   r  decoder_inputs_embedsr3  r   r   r   c                 P   | | j                   d||||
|dd|}|j                  } | j                  d|||||	||||dd
|}t        |j                  |j                  |j
                  |j                  |j                  |j                  |j
                  |j                        S )aX  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        T)ri  r   r   r  r  r  )
ri  r   r   r  r   r  r  r3  r   r  )r  r   decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentionsr+  )r  r  r  r   r   rn   rv  r  )r>   ri  r  r   r   r  r  r	  r
  r   r  r  r3  r   r   r  decoder_outputss                    r@   rO   zT5Gemma2Model.forward  s    8 "*dll #-)+)  O !0 A A '$,, 
'1-/+"7#1)
 
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
rA   )NNNNNNNNNNNNN)rU   rV   rW   _tied_weights_keysr-   r8   r   r  r  r  r$   r#   r;   r  r/  
BoolTensorr   r
   r   r5  r!   r"   r   rO   rY   rZ   s   @r@   r  r  d  s    (P.]
~ 3A  .215370459:>8<266:-159!%26#?
 ##d*?
 ''$.	?

 ))D0?
 &&-?
 !++d2?
 !& 0 04 7?
 $..5?
 )4/?
 -t3?
 ||d*?
  %||d2?
  $;!?
" ((4/#?
$ +,%?
& 
'?
  ?
rA   r  c            &           e Zd ZddiZddiZddgdgfiZdef fdZd	 Zd
 Z	d Z
d Zd Zd Zeedej"                  dee   deez  fd              Zed        Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d*dej2                  dz  dej4                  dz  dej4                  dz  dej2                  dz  dej2                  dz  dej6                  dz  dej2                  dz  dedz  dedz  dej4                  dz  dej4                  dz  dej2                  dz  d edz  d!ej2                  dz  d"eej"                  z  dee   deej4                     e z  f"d#              Z!d$e"d%e#d&e$d'ed(edef fd)Z% xZ&S )+ T5Gemma2ForConditionalGenerationzlm_head.out_proj.weightz,model.encoder.text_model.embed_tokens.weightzlm_head.out_projcolwise_gather_outputrn   r<  r]   c                    t         |   |       t        |      | _        |j                  j
                  | _        t        |j                  j                  | j
                        | _        d| _	        | j                          y )NForMaskedLM)r7   r8   r  ro  r  r8  r7  ra   lm_head	loss_typer  rl   s     r@   r8   z)T5Gemma2ForConditionalGeneration.__init__  sZ     "6*
 ..33%fnn&@&@$//R&rA   c                 &    || j                   _        y r6   r  r:  r  s     r@   set_output_embeddingsz6T5Gemma2ForConditionalGeneration.set_output_embeddings  s     .rA   c                 .    | j                   j                  S r6   r  rS   s    r@   get_output_embeddingsz6T5Gemma2ForConditionalGeneration.get_output_embeddings  s    ||$$$rA   c                 6    | j                   j                         S r6   ro  r  rS   s    r@   r  z5T5Gemma2ForConditionalGeneration.get_input_embeddings      zz..00rA   c                 :    | j                   j                  |       y r6   ro  r  r>   r   s     r@   r  z5T5Gemma2ForConditionalGeneration.set_input_embeddings      

''.rA   c                 6    | j                   j                         S r6   )ro  r   rS   s    r@   r   z,T5Gemma2ForConditionalGeneration.get_encoder      zz%%''rA   c                 6    | j                   j                         S r6   )ro  r  rS   s    r@   r  z,T5Gemma2ForConditionalGeneration.get_decoder  r(  rA   r  r   r   c                 D     | j                         j                  |fi |S r6   )r   r  )r>   r  r   s      r@   r  z3T5Gemma2ForConditionalGeneration.get_image_features  s%    
 5t!44\LVLLrA   c                 6    | j                         j                  S r6   )r   r  rS   s    r@   r  z-T5Gemma2ForConditionalGeneration.vision_tower  s    !...rA   Nri  r   r   r  r  r	  r
  r   r  r  labelsr3  r   logits_to_keepc                    |||| j                  |      } | j                  d|||||||||	|
|||d|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }| j                  j                  }|j                  3||j                  z  }t        j                  |      }||j                  z  }d}| | j                  ||| j                  fi |}t        |||j                  |j                   |j"                  |j$                  |j&                  |j(                  |j*                  	      S )a  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)ri  r  r   r   r  r  r	  r
  r   r  r  r3  r   )	lossr<  r   r  r  r  r  r  r  r+  )r  ro  r  r   rX   slicer  r]   r  final_logit_softcappingr;   r   loss_functionr8  r   r   r  r  r  r  r  r  )r>   ri  r  r   r   r  r  r	  r
  r   r  r  r,  r3  r   r-  r   r  rn   slice_indicesr<  r  r/  s                          r@   rO   z(T5Gemma2ForConditionalGeneration.forward  su   D "3";@U@] $ J J6 R.8djj /
%)%/#9!5++'"7)/
 /
" (998B>SV8W~ot4]kmA}a,?@A,,11=nDDDFZZ'FnDDDF%4%%ffdooPPD+;;"1"G"G.AA,==&5&O&O"1"G"G.AA

 
	
rA   generation_configmodel_kwargsgeneration_moderY  max_cache_lengthc           	      P   t         |   |||||       |j                  du ry|j                  }|d}nd|j                  v }t	        j
                  | j                  j                  d            }|`|`	||d}	|j                  d      }
|
t        |
t              st        d      t        |
j                        d	kD  r|
j                  j                  d	      ryt!        |
j"                        }|t$        k(  r|d
   d	   j&                  d   |	d<    |di |	|
_        n=t        t)        di | j                  j                  d      |dt)                     |d<   t+        | d      r=| j,                  0t        | j,                  t              st        d      |d   | _        yyy)zMOverride cache preparation to support T5Gemma2-specific EncoderDecoder Cache.FN	offloadedTr  )r]   
offloadingr   zaThe `past_key_values` in `model_kwargs` must be of type `EncoderDecoderCache` for T5Gemma2 model.r   r
  r,   max_cache_len_cachezLThe internal cache must be of type `EncoderDecoderCache` for T5Gemma2 model.r+  )r7   _prepare_cache_for_generationr3  cache_implementationcopydeepcopyr]   get_text_configr   r   r  r   r
   r  lenr  r   r  r   rR   r	   r   r<  )r>   r4  r5  r6  rY  r7  r>  offload_cachecross_attn_configcross_attn_cache_kwargsr   cross_attn_clsr?   s               r@   r=  z>T5Gemma2ForConditionalGeneration._prepare_cache_for_generationE  s    	-	
 &&%/0EE'!M'+<+Q+QQM !MM$++*E*Ed*E*ST ,) ('#

 '**+<=&o/BC w 
 ?--.27Q7Q7U7UVW7X!/"G"GHN,;GHY;Z[\;];c;cde;f'84B4]E\4]O1 /B "&++"="=d"="K&3 /L*+ 4"t{{'>dkk+>? !opp&'89DK	 (?"rA   )NNNNNNNNNNNNNNr   )'rU   rV   rW   r  _tp_plan_pp_planr-   r8   r  r  r  r  r   r  r$   r#   r;   r   r!   r"   rQ   r   r  propertyr  r  r/  r  r   r
   r5  rX   r   rO   r   r  r   r=  rY   rZ   s   @r@   r  r    s   !#Q #$;<H"o%6
$CDH~ /%1/(( M!LLM4:;M4NM	+	+M  M
 / /  .215370459:>8<266:26:>*.!%26-.'O
 ##d*O
 ''$.	O

 ))D0O
 &&-O
 !++d2O
 !& 0 04 7O
 $..5O
 )4/O
 -t3O
 ((4/O
  %0047O
    4'!O
" $;#O
$ ((4/%O
& ell*'O
( +,)O
* 
u  	!O	3+O
  O
bI:+I: I: (	I:
 I: I: 
I: I:rA   r  c                       e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  de	j                  dz  dee   defd              Z xZS )!T5Gemma2ForSequenceClassificationr]   c                 "   t         |   |       |j                  | _        |j                  j                  | _        t        |      | _        t        |dd      }t        | j                  | j                  |      | _	        | j                          y Nr@  g?r7   r8   r?  r  ra   r  ro  r   r>  scorer  r>   r]   classifier_dropoutr?   s      r@   r8   z*T5Gemma2ForSequenceClassification.__init__  sp      ++!>>55"6*
$V-FL/0@0@$//Sef
rA   c                 6    | j                   j                         S r6   r!  rS   s    r@   r  z6T5Gemma2ForSequenceClassification.get_input_embeddings  r"  rA   c                 :    | j                   j                  |       y r6   r$  r%  s     r@   r  z6T5Gemma2ForSequenceClassification.set_input_embeddings  r&  rA   Nri  r  r   r   r  r  r	  r
  r  r  r,  r   r   c                 v   |	|
#t        d| j                  j                   d      |t        d      || j	                  |      } | j
                  |f||||||||	|
dd
|}|j                  }|j                  }|j                  }| j                  |      }|j                  d   }|| j                  j                  k7  j                  |j                  t        j                         }t        j"                  |j                  d   |j                  t        j                   	      }||z  j%                  d      }t        j&                  ||j                  d   d
z
        }|t        j"                  ||j                        |f   }d}|| j)                  |||| j                        }t+        ||||      S )  
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
            config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N8Passing input embeddings is currently not supported for .You have to specify input_idsF
r  r   r   r  r  r	  r
  r  r  r3  r   rC   r   r,   )maxr  )r<  r,  pooled_logitsr]   r/  r<  rn   rv  )NotImplementedErrorr?   rU   r  r  ro  r  r  r  rO  rR   r]   r  r   r   r;   int32r   argmaxclampr2  r   )r>   ri  r  r   r   r  r  r	  r
  r  r  r,  r   r  r  rn   rv  r<  rY  non_pad_masktoken_indiceslast_non_pad_tokenr[  r/  s                           r@   rO   z)T5Gemma2ForSequenceClassification.forward  s   4 $(=(I%J4>>KbKbJccde  <==$ $ J J9 U&0djj'
%)%/#9!5+'"7'
 '
 $5555//
-.__Q'
)T[[-E-EEII&--Y^YdYde%6%<%<R%@^c^i^ij+l:BB2F"[[);ARAXAXY[A\_`A`au||Jv}}MOaab%%VFR_hlhshs%tD' '!	
 	
rA   NNNNNNNNNNN)rU   rV   rW   r-   r8   r  r  r$   r#   r;   r  r/  r   r   r!   r"   r   rO   rY   rZ   s   @r@   rK  rK    s\   	~ 	1/  .215.204596:8<2626:>*.J
##d*J
 ''$.J
 t+	J

 &&-J
 !++d2J
 !&t 3J
 $..5J
 )4/J
 ((4/J
  %0047J
   4'J
 +,J
 
"J
  J
rA   rK  c                       e Zd Zdef fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  de	j                  dz  dee   defd              Z xZS )T5Gemma2ForTokenClassificationr]   c                 "   t         |   |       |j                  | _        |j                  j                  | _        t        |      | _        t        |dd      }t        | j                  | j                  |      | _	        | j                          y rM  rN  rP  s      r@   r8   z'T5Gemma2ForTokenClassification.__init__  sp      ++!>>55"6*
$V-FL/0@0@$//Sef
rA   c                 6    | j                   j                         S r6   r!  rS   s    r@   r  z3T5Gemma2ForTokenClassification.get_input_embeddings  r"  rA   c                 :    | j                   j                  |       y r6   r$  r%  s     r@   r  z3T5Gemma2ForTokenClassification.set_input_embeddings  r&  rA   Nri  r  r   r   r  r  r	  r
  r  r  r,  r   r   c                    |	|
#t        d| j                  j                   d      |t        d      || j	                  |      } | j
                  |f||||||||	|
dd
|}|j                  }|j                  }|j                  }| j                  |      }d}|| j                  ||| j                        }t        ||||      S )rU  NrV  rW  rX  FrY  r\  )r]  r?   rU   r  r  ro  r  r  r  rO  r2  r]   r   )r>   ri  r  r   r   r  r  r	  r
  r  r  r,  r   r  r  rn   rv  r<  r/  s                      r@   rO   z&T5Gemma2ForTokenClassification.forward  s   4 $(=(I%J4>>KbKbJccde  <==$ $ J J9 U&0djj'
%)%/#9!5+'"7'
 '
 $5555//
-.%%ffdkkBD$'!	
 	
rA   rd  )rU   rV   rW   r-   r8   r  r  r$   r#   r;   r  r/  r   r   r!   r"   r   rO   rY   rZ   s   @r@   rf  rf    s\   
~ 
1/  .215.204596:8<2626:>*.@
##d*@
 ''$.@
 t+	@

 &&-@
 !++d2@
 !&t 3@
 $..5@
 )4/@
 ((4/@
  %0047@
   4'@
 +,@
 
@
  @
rA   rf  )r  r  r  rn  rK  rf  )r,   )r   NN)T)cr?  collections.abcr   typingr   r;   torch.nnr9    r   r}  activationsr   cache_utilsr   r	   r
   r   
generationr   r   r   integrationsr   r   masking_utilsr   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r    processing_utilsr!   utilsr"   r#   r$   r%   utils.genericr&   r'   utils.output_capturingr(   r)   autor+   configuration_t5gemma2r-   r.   r/   r0   Moduler2   r\   rp   r   r   r   rX   r   rL   rQ   r   r   r
  r  r1  r7  r>  rE  	Embeddingr_  rn  r  r  r  r  r  r  r  rK  rf  __all__r+  rA   r@   <module>r     s$  *  $    & ! P P K K I m m B 9   L F & a a G E  t t=bii =(")) &N<bii N<b( *+ ,2	UU\\ 	U# 	U%,, 	U$   %II%<<% 
% <<	%
 LL4'% % T\% T\% 5<<%&%D )*N)BII N) +N)b )*zBbii zB +zBz15 1h:5 :z	RYY 	 !@")) !@H bll  . S!o S! S!l  &b
1 b
Je- eP
t0C 
 
I
- I
X \
+ \
 \
~J:'> J:Z ^
(? ^
 ^
B U
%< U
 U
prA   