
    qi-                       d Z ddlZddlZddlZddlmZ ddlmZmZmZ ddl	m
Z ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z*  e'jV                  e,      Z- G d dej\                        Z/ G d dej\                        Z0 G d dej\                        Z1 G d dej\                        Z2 G d dej\                        Z3 G d dej\                        Z4 G d dej\                        Z5 G d d e      Z6 G d! d"ej\                        Z7e& G d# d$e"             Z8 G d% d&e8      Z9e& G d' d(e8             Z: e&d)*       G d+ d,e8e             Z;e& G d- d.e8             Z< e&d/*       G d0 d1e8             Z=e& G d2 d3e8             Z>e& G d4 d5e8             Z?g d6Z@y)7zPyTorch mT5 model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_bidirectional_maskcreate_causal_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput#Seq2SeqQuestionAnsweringModelOutputSeq2SeqSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)DUMMY_INPUTS
DUMMY_MASKauto_docstringloggingtorch_compilable_check   )	MT5Configc                   &     e Zd Zd fd	Zd Z xZS )MT5LayerNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zd
        Construct a layernorm module in the MT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      V/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/mt5/modeling_mt5.pyr$   zMT5LayerNorm.__init__0   s1     	ll5::k#:; #    c                    |j                  t        j                        j                  d      j	                  dd      }|t        j
                  || j                  z         z  }| j                  j                  t        j                  t        j                  fv r%|j                  | j                  j                        }| j                  |z  S )N   T)keepdim)tor&   float32powmeanrsqrtr)   r(   dtypefloat16bfloat16)r*   hidden_statesvariances      r.   forwardzMT5LayerNorm.forward8   s     !##EMM266q9>>r4>P%Ht?T?T4T(UU ;; ??),,T[[->->?M{{]**r/   )gư>)__name__
__module____qualname__r$   r>   __classcell__r-   s   @r.   r!   r!   /   s    $+r/   r!   c                   *     e Zd Zdef fdZd Z xZS )MT5DenseActDenseconfigc                 ^   t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _
        t        |j                     | _        y NFbias)r#   r$   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr	   dense_act_fnactr*   rF   r-   s     r.   r$   zMT5DenseActDense.__init__J   sn    ))FNNFKKeD))FKKeDzz&"5"56&--.r/   c                    | j                  |      }| j                  |      }| j                  |      }t        | j                  j
                  t        j                        r|j                  | j                  j
                  j                  k7  r`| j                  j
                  j                  t        j                  k7  r/|j                  | j                  j
                  j                        }| j	                  |      }|S N)rN   rT   rR   
isinstancerO   r(   r&   Tensorr9   int8r4   r*   r<   s     r.   r>   zMT5DenseActDense.forwardQ   s    ./]3tww~~u||4##tww~~';';;$$

2),,TWW^^-A-ABM.r/   r?   r@   rA   r   r$   r>   rB   rC   s   @r.   rE   rE   I   s    /y /r/   rE   c                   *     e Zd Zdef fdZd Z xZS )MT5DenseGatedActDenserF   c                    t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _        t        |j                     | _        y rH   )r#   r$   r   rK   rL   rM   wi_0wi_1rO   rP   rQ   rR   r	   rS   rT   rU   s     r.   r$   zMT5DenseGatedActDense.__init__a   s    IIfnnfkkF	IIfnnfkkF	))FKKeDzz&"5"56&--.r/   c                 ,   | j                  | j                  |            }| j                  |      }||z  }| j                  |      }t	        | j
                  j                  t        j                        r|j                  | j
                  j                  j                  k7  r`| j
                  j                  j                  t        j                  k7  r/|j                  | j
                  j                  j                        }| j                  |      }|S rW   )rT   r`   ra   rR   rX   rO   r(   r&   rY   r9   rZ   r4   )r*   r<   hidden_geluhidden_linears       r.   r>   zMT5DenseGatedActDense.forwardi   s    hhtyy78		-0#m3]3 tww~~u||4##tww~~';';;$$

2),,TWW^^-A-ABM.r/   r\   rC   s   @r.   r^   r^   `   s    /y /r/   r^   c                   *     e Zd Zdef fdZd Z xZS )
MT5LayerFFrF   c                    t         |           |j                  rt        |      | _        nt        |      | _        t        |j                  |j                        | _	        t        j                  |j                        | _        y )Nr,   )r#   r$   is_gated_actr^   DenseReluDenserE   r!   rL   layer_norm_epsilon
layer_normr   rP   rQ   rR   rU   s     r.   r$   zMT5LayerFF.__init__   s_    "7"?D"26":D&v~~6;T;TUzz&"5"56r/   c                 r    | j                  |      }| j                  |      }|| j                  |      z   }|S rW   )rl   rj   rR   )r*   r<   forwarded_statess      r.   r>   zMT5LayerFF.forward   s=    ??=9../?@%5E(FFr/   r\   rC   s   @r.   rf   rf   ~   s    7y 7r/   rf   c                   f     e Zd Z	 	 ddededz  f fdZed	d       Zd
dZ	 	 	 	 	 	 	 	 ddZ	 xZ
S )MT5AttentionNrF   	layer_idxc                    t         |           |j                  | _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | _
        |j                  | _        | j                  | j                  z  | _        || _        |9| j                  r-t        j!                  d| j"                  j$                   d       t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        | j                  r/t'        j2                  | j                  | j                        | _        d| _        y )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrI   )r#   r$   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerL   d_kvkey_value_proj_dim	num_headsn_headsrQ   rR   	inner_dimrq   loggerwarning_oncer-   r?   r   rK   qkvo	Embeddingrelative_attention_biasgradient_checkpointingr*   rF   rt   rq   r-   s       r.   r$   zMT5Attention.__init__   si    	 +++F(.4.S.S+/5/U/U,~~"(++''**(?(??"*4>>+B+B*C D, , 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(&+#r/   c                 T   d}|rC|dz  }|| dkD  j                  t        j                        |z  z  }t        j                  |       } n*t        j                  | t        j
                  |              } |dz  }| |k  }|t        j                  | j                         |z        t        j                  ||z        z  ||z
  z  j                  t        j                        z   }t        j                  |t        j                  ||dz
              }|t        j                  || |      z  }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r1   r   )r4   r&   longabsmin
zeros_likelogfloatmath	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r.   _relative_position_bucketz&MT5Attention._relative_position_bucket   s(   , AK!2Q!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$y0 &/II'--/);<hh|i/01Y&( "UZZ.	&"
 &+YY&8RT_bcTc(d&
" 	EKK2CE_``r/   c                    | | j                   j                  j                  }|.t        j                  |t        j
                  |      dddf   }n|dddf   j                  |      }t        j                  |t        j
                  |      dddf   }||z
  }| j                  || j                   | j                  | j                        }| j                  |      }	|	j                  g d      j                  d      }	|	S )z%Compute binned relative position biasN)r9   device)r   r   r   )r1   r   r   r   )r   r(   r   r&   aranger   r4   r   rs   ru   rv   permute	unsqueeze)
r*   query_length
key_lengthr   cache_positioncontext_positionmemory_positionr   relative_position_bucketvaluess
             r.   compute_biaszMT5Attention.compute_bias   s    >1188??F!$||L

SYZ[\^b[bc-ag699&A,,zFSTXZ[T[\+.>>#'#A#A#.;;==	 $B $
  --.FG	*44Q7r/   c
                    |j                   dd \  }
}|du}| j                  |      }|j                  |
d| j                  | j                        j                  dd      }d}t        |t              rA|j                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|rK|I|rG|j                  | j                     j                  }|j                  | j                     j                  }n| j!                  |      }| j#                  |      }|j                  |
d| j                  | j                        j                  dd      }|j                  |
d| j                  | j                        j                  dd      }|T|s|	nd}	|j%                  ||| j                  d|	i      \  }}|r)t        |t              rd|j                  | j                  <   t'        j(                  ||j                  dd            }||j                   d	   }||n|	d   dz   }| j*                  sZt'        j,                  d| j                  ||f|j.                  |j0                  
      }| j2                  rE| j4                  r9d|_        n1| j9                  |||j.                  |	      }|dddd| dddf   }|#|ddddddd|j                   d	   f   }||z   }|}||z  }t:        j<                  j?                  |jA                         d      jC                  |      }t:        j<                  jE                  || jD                  | j4                        }t'        j(                  ||      }|j                  dd      jG                         }|j                  |
d| jH                        }| jK                  |      }||f}|r||fz   }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr1   r2   r   Fr   Tr   )r   r9   )r   r   dim)ptraining)&shaper~   viewrz   rx   	transposerX   r   
is_updatedgetrq   cross_attention_cacheself_attention_cachelayerskeysr   r   r   updater&   matmulrt   zerosr   r9   r   r   requires_gradr   r   
functionalsoftmaxr   type_asrR   
contiguousr{   r   )r*   r<   maskkey_value_statesposition_biaspast_key_valuesr   	use_cacheoutput_attentionsr   
batch_size
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_statesscoresr   real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputss                              r.   r>   zMT5Attention.forward   s   " "/!4!4Ra!8
J .T9vvm,#((RtG^G^_iijkmno 
o':;(3377GJ!'6'L'L$'6'K'K$#2 -?)]/"=*-44T^^DIIJ/66t~~FMML/J66.1L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL*7It+?+F+Fdnn?OQ_>`,(
L &*_FY*ZAEO..t~~> lJ,@,@A,FG #))"-J.:.FlN[]L^abLbO33 %j*=fmm[a[g[g! ..4==26M/ $ 1 1#ZVd !2 ! !.aZKL!.C D"1a,Bj.>.>r.B,B#BC - ;,&& }},,V\\^,DLLVT}},,\T\\TXTaTa,bll<>!++Aq1<<>!&&z2t~~Fff[)./Gr/   FN)T       )NN)NNNNNFFN)r?   r@   rA   r   intr$   staticmethodr   r   r>   rB   rC   s   @r.   rp   rp      sa     %* $	 , , :	 ,D -  - ^. br/   rp   c                   @     e Zd Zddedz  f fdZ	 	 	 	 	 	 ddZ xZS )MT5LayerSelfAttentionNrq   c                     t         |           t        |||      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )Nrt   rq   rh   )r#   r$   rp   SelfAttentionr!   rL   rk   rl   r   rP   rQ   rR   r   s       r.   r$   zMT5LayerSelfAttention.__init___  sT    )0KW`
 'v~~6;T;TUzz&"5"56r/   c           	          | j                  |      }| j                  |||||||      }	|| j                  |	d         z   }|f|	dd  z   }
|
S )N)r   r   r   r   r   r   r   r   )rl   r   rR   )r*   r<   attention_maskr   r   r   r   r   normed_hidden_statesattention_outputr   s              r.   r>   zMT5LayerSelfAttention.forwardg  ss      $}=-- '+/) . 
 &5Ea5H(II "%5ab%99r/   r   )NNNFFNr?   r@   rA   r   r$   r>   rB   rC   s   @r.   r   r   ^  s-    7SSWZ 7 r/   r   c                   B     e Zd Zddedz  f fdZ	 	 	 	 	 	 	 ddZ xZS )MT5LayerCrossAttentionNrq   c                     t         |           t        |d|      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )NFr   rh   )r#   r$   rp   EncDecAttentionr!   rL   rk   rl   r   rP   rQ   rR   )r*   rF   rq   r-   s      r.   r$   zMT5LayerCrossAttention.__init__  sO    +FPUajk&v~~6;T;TUzz&"5"56r/   c
                     | j                  |      }
| j                  |
||||||||		      }|| j                  |d         z   }|f|dd  z   }|S )N)r   r   r   r   r   r   r   r   r   r   )rl   r   rR   )r*   r<   r   r   r   r   r   r   r   r   r   r   layer_outputr   s                 r.   r>   zMT5LayerCrossAttention.forward  sx      $}=// -'+%/) 0 

 %t||4DQ4G'HH/$4QR$88r/   rW   )NNNFNFNr   rC   s   @r.   r   r     s/    7#* 7 r/   r   c                   H     e Zd Zddedz  f fdZ	 	 	 	 	 	 	 	 	 	 ddZ xZS )MT5BlockNrq   c                 p   t         |           |j                  | _        t        j                         | _        | j
                  j                  t        |||             | j                  r&| j
                  j                  t        ||             | j
                  j                  t        |             y )Nr   )rq   )
r#   r$   rs   r   
ModuleListlayerappendr   r   rf   r   s       r.   r$   zMT5Block.__init__  s     ++]]_


!&Famvw	
 ??JJ4VyQR

*V,-r/   c           
          | j                   d   ||||||	|      }|d   }|dd  }|j                  t        j                  k(  rt        j                  t        j
                  |      j                         t        j                  |j                        j                  dz
  t        j                  |j                        j                        }t        j                  || |      }| j                  xr |d u}|r | j                   d   ||||||d   dz   ||	      }|d   }|j                  t        j                  k(  rt        j                  t        j
                  |      j                         t        j                  |j                        j                  dz
  t        j                  |j                        j                        }t        j                  || |      }||dd  z   } | j                   d   |      }|j                  t        j                  k(  rt        j                  t        j
                  |      j                         t        j                  |j                        j                  dz
  t        j                  |j                        j                        }t        j                  || |      }|f}||z   S )Nr   )r   r   r   r   r   r   r   i  )r   maxr2   )r   r   r   r   r   r   r   )r   r9   r&   r:   r   isinfanyfinfor   clamprs   )r*   r<   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr   r   r   return_dictr   self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   s                     r.   r>   zMT5Block.forward  sl    "/A)'+/)"
 /q12126 %--/++M*..0M//044t;M//044K
 "KKK<[YM!__R1Fd1R&3djjm!65; /+B/!3#"3	'# 4A6M ""emm3#kkKK.224KK 3 34884?KK 3 3488
 !&M|Q\ ] !24KAB4O O '

2}5 %--/++M*..0M//044t;M//044K
 "KKK<[YM " ''	
r/   r   )
NNNNNNFFTNr   rC   s   @r.   r   r     s:    
.SSWZ 
. "#&*M
r/   r   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )MT5ClassificationHeadz-Head for sentence-level classification tasks.rF   c                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _
        y )N)r   )r#   r$   r   rK   rL   denserP   classifier_dropoutrR   
num_labelsout_projrU   s     r.   r$   zMT5ClassificationHead.__init__  sZ    YYv~~v~~>
zzF$=$=>		&..&2C2CDr/   r<   returnc                     | j                  |      }| j                  |      }t        j                  |      }| j                  |      }| j	                  |      }|S rW   )rR   r   r&   tanhr  r[   s     r.   r>   zMT5ClassificationHead.forward  sN    ]3

=1

=1]3m4r/   )
r?   r@   rA   __doc__r   r$   r&   rY   r>   rB   rC   s   @r.   r   r     s/    7Ey EU\\ ell r/   r   c                   t    e Zd ZU eed<   dZdZdZdgZdgZ	e
d        Z ej                         d        Zd Zy	)
MT5PreTrainedModelrF   transformerTr   rO   c                 v    t        j                  t              }t        j                  t              }|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r&   tensorr   r   )r*   r  
input_maskdummy_inputss       r.   r  zMT5PreTrainedModel.dummy_inputs!  s6    LL.	\\*-
!*"&0

 r/   c                 r   | j                   j                  }t        |t              r$t	        j
                  |j                  |dz         y	t        |t        t        t        t        f      rt	        j                  |j                  j                  d|dz         t        |d      rE| j                   j                  s/t	        j                  |j                  j                  d|dz         t        |d      rpt	        j                  |j                   j                  d|| j                   j"                  dz  z         t	        j$                  |j                   j&                         y	y	t        |t(              rft        |d      rYt	        j                  |j*                  j                  d|dz         t	        j$                  |j*                  j&                         y	y	t        |t,              r9t	        j                  |j.                  j                  d|| j                   j"                  dz  z         t        |j.                  d      r?|j.                  j&                  )t	        j$                  |j.                  j&                         t	        j                  |j0                  j                  d|| j                   j"                  dz  z         t        |j0                  d      rA|j0                  j&                  *t	        j$                  |j0                  j&                         y	y	y	t        |t2              r9t	        j                  |j4                  j                  d|| j                   j"                  dz  z         t        |j4                  d      r?|j4                  j&                  )t	        j$                  |j4                  j&                         t	        j                  |j6                  j                  d|| j                   j8                  dz  z         t        |j6                  d      rA|j6                  j&                  *t	        j$                  |j6                  j&                         y	y	y	t        |t:              rt	        j                  |j<                  j                  d|| j                   j"                  dz  z         t        |j<                  d      r?|j<                  j&                  )t	        j$                  |j<                  j&                         t	        j                  |j>                  j                  d|| j                   j"                  dz  z         t        |j>                  d      r?|j>                  j&                  )t	        j$                  |j>                  j&                         t	        j                  |j6                  j                  d|| j                   j8                  dz  z         t        |j6                  d      rA|j6                  j&                  *t	        j$                  |j6                  j&                         y	y	y	t        |t@              rP| j                   j"                  }| j                   jB                  }| j                   jD                  }t	        j                  |jF                  j                  d|||z  dz  z         t	        j                  |jH                  j                  d||dz  z         t	        j                  |jJ                  j                  d||dz  z         t	        j                  |jL                  j                  d|||z  dz  z         |jN                  r3t	        j                  |jP                  j                  d||dz  z         y	y	y	)
zInitialize the weightsg      ?g        )r7   stdlm_head
qa_outputsg      
classifierrJ   N))rF   initializer_factorrX   r!   init	constant_r(   MT5ModelMT5ForConditionalGenerationMT5EncoderModelMT5ForQuestionAnsweringnormal_sharedhasattrtie_word_embeddingsr  r  rL   zeros_rJ   MT5ForTokenClassificationr  r   r   r  rE   rN   rO   rM   r^   r`   ra   rp   rw   ry   r~   r   r   r   rt   r   )r*   modulefactorrL   rx   rz   s         r.   _init_weightsz MT5PreTrainedModel._init_weights,  s7    //fl+NN6==&3,72OE\]
 LL--CVc\Jvy)$++2Q2QV^^22&3,Ov|,V..55CVPTP[P[PcPchlOlEmnF--223 -  9:v|,V..55CVc\RF--223 -  56LL,,3Ft{{GZGZ_cFc<dev||V,1B1B1NFLL--.LL//cv$++J]J]bfIf?ghv/FOO4H4H4TFOO001 5U/ 01LL))DKKDWDW\`C`9abvyy&)fiinn.HFIINN+LL))DKKDTDTY]C]9^_vyy&)fiinn.HFIINN+ /I) 56LL++#6dkkFYFY^bEb;cdv{{F+0@0@0LFKK,,-LL++#6dkkFYFY^bEb;cdv{{F+0@0@0LFKK,,-LL))DKKDTDTY]C]9^_vyy&)fiinn.HFIINN+ /I)-kk))G!%!1!1kk++GLLs7M_C_dhBh8ijLLs'4-8PQLLs'4-8PQLLs7M_C_dhBh8ij11V;;BBRX]dim\mRno 2 .r/   c                 8   | j                   j                  }| j                   j                  }|t        d      |j	                  |j
                        }|dd df   j                         |ddd f<   ||d<   |t        d      |j                  |dk(  |       |S )Nzself.model.config.decoder_start_token_id has to be defined. In MT5 it is usually set to the pad_token_id. See MT5 docs for more information..r2   r   ).r   z1self.model.config.pad_token_id has to be defined.)rF   decoder_start_token_idpad_token_id
ValueError	new_zerosr   clonemasked_fill_)r*   r  r(  r)  shifted_input_idss        r.   _shift_rightzMT5PreTrainedModel._shift_rightc  s    !%!C!C{{//!)5 
 &//	@%.sCRCx%8%>%>%@#qr'"$:&!PQQ&&'8D'@,O  r/   N)r?   r@   rA   r   __annotations__base_model_prefixsupports_gradient_checkpointing_can_compile_fullgraph_no_split_modules_keep_in_fp32_modulespropertyr  r&   no_gradr%  r/   r/   r.   r  r    sb     %&*#!#!F  U]]_4p 4pl!r/   r  c                   B     e Zd Z fdZd Z	 	 	 	 	 	 	 	 	 	 	 ddZ xZS )MT5Stackc                    t         |   |       t        j                  |j                  |j
                        | _        |j                  | _        t        j                  t        |j                        D cg c]  }t        |t        |dk(        |       c}      | _        t        |j
                  |j                        | _        t        j"                  |j$                        | _        | j)                          d| _        y c c}w )Nr   r   rh   F)r#   r$   r   r   
vocab_sizerL   embed_tokensrs   r   range
num_layersr   boolblockr!   rk   final_layer_normrP   rQ   rR   	post_initr   )r*   rF   ir-   s      r.   r$   zMT5Stack.__init__{  s     LL):):FNNK ++]]^cdjdudu^vwYZXf$qAv,RSTw

 !-V^^AZAZ [zz&"5"56 	&+# xs   7!Dc                     || _         y rW   )r=  r*   new_embeddingss     r.   set_input_embeddingszMT5Stack.set_input_embeddings  s
    *r/   c                 X   ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
|$|"| j
                  rdnd}t        d| d| d      |&|j                         }|j                  d|d         }n8||j                         d d }n"| j
                  rdnd}t        d| d| d	      | j                  r%| j                  r|rt        j                  d
       d}|(| j                  t        d      | j                  |      }|\  }}|du r| j
                  st        d|  d      | j
                  rf|rr|p| j                   j                  r5t        t!        | j                         t!        | j                               }n%t!        | j                         }n| j
                  sd }||j#                         nd}|%t%        j&                  |||z   |j(                        }| j                   j
                  r7t+        | j                   |||t-        |t              r|j.                  n|      }nt1        | j                   ||      }d }| j
                  r|t1        | j                   |||      }|	rdnd }|rdnd }|r| j
                  rdnd }d }d }| j3                  |      }| j4                  D ]`  }|	r||fz   } |||||||||||
|      }|d   }|d   }| j
                  r|	||rdnd   }|sB||d   fz   }| j
                  sX||d   fz   }b | j7                  |      }| j3                  |      }|	r||fz   }|
st9        d |||||fD              S t;        |||||      S )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer2   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoder)rF   r   )r   )rF   rL  r   r   r   )rF   rL  r   )rF   rL  r   r   r8  )r   r   r   r   r   r   r   r1      c              3   $   K   | ]  }|| 
 y wrW   r8  ).0r   s     r.   	<genexpr>z#MT5Stack.forward.<locals>.<genexpr>!  s      
 = 
s   )last_hidden_stater   r<   
attentionscross_attentions)rF   r   r   output_hidden_statesuse_return_dictrs   r*  sizer   r   r   r|   r}   r=  is_encoder_decoderr   r   get_seq_lengthr&   r   r   r   rX   r   r   rR   rA  rB  tupler   )r*   r  r   r   r   rL  r   r   r   rT  r   r   kwargserr_msg_prefixinput_shaper   r   past_key_values_lengthencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr   r   r<   layer_modulelayer_outputss                              r.   r>   zMT5Stack.forward  s4    "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>+/??ZN*>*:.HXXvw  "#..*K!r;r?;I&',,.s3K+/??ZN:>:J-XfWggtuvv&&4==##p "	   ( !_`` --i8M!,
J?? #LTFRg!hii??_4;;11&9$DKK8,dkk:Z'O '3$++&FO #OETE`!?!?!Afg!"\\&(>(KTaThThN ;;!!/{{+--o/BC !0 D D$N 7{{+-N +/'??4@.G{{+5&;	/+ #7BD0d&7DOOrRV(,%]3 JJ 	VL#$58H$H!(%/- /#"3'-M *!,M
 *!,M#8#D0=CTaZ[0\- !/=3C2E!E??+?=QRCSBU+U(=	V@ --m<]3   1]4D D 
 "#%"(
 
 
 9+++%1
 	
r/   )NNNNNNNNNNN)r?   r@   rA   r$   rH  r>   rB   rC   s   @r.   r:  r:  z  s6    , +
 "#!d
r/   r:  c                       e Zd ZU dZdZeed<   dgZdddZdef fdZ	d Z
d	 Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  d
z  dej                  d
z  dej                  d
z  dej                   d
z  deeej                        d
z  ded
z  dej&                  d
z  dej&                  d
z  ded
z  ded
z  ded
z  ded
z  dej                  d
z  deej                     ez  fd       Z xZS )r  aw  
    Examples:

    ```python
    >>> from transformers import MT5Model, AutoTokenizer

    >>> model = MT5Model.from_pretrained("google/mt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, return_tensors="pt")
    >>> labels = tokenizer(text_target=summary, return_tensors="pt")

    >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
    >>> hidden_states = outputs.last_hidden_state
    ```mt5rF   Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightshared.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightc                    t         |   |       t        j                  |j                  |j
                        | _        t        j                  |      }d|_	        d|_
        t        |      | _        t        j                  |      }d|_	        |j                  |_        t        |      | _        | j!                          y NFT)r#   r$   r   r   r<  rL   r  copydeepcopyrs   r   r:  encodernum_decoder_layersr?  decoderrC  r*   rF   encoder_configdecoder_configr-   s       r.   r$   zMT5Model.__init__Q  s     ll6#4#4fnnEv.$)!#( /v.$(!$*$=$=!/ 	r/   c                     | j                   S rW   r  r*   s    r.   get_input_embeddingszMT5Model.get_input_embeddingsc      {{r/   c                 ~    || _         | j                  j                  |       | j                  j                  |       y rW   r  ro  rH  rq  rF  s     r.   rH  zMT5Model.set_input_embeddingsg  -    $)).9)).9r/   Nr  r   r  r  encoder_outputsr   rL  decoder_inputs_embedsr   r   rT  r   r   r  c                 F   |	|	n| j                   j                  }	||n| j                   j                  }|| j                  ||||
||      }nI|rGt	        |t
              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }| j                  |||||||	|
|||      }|s||z   S t        |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                        S )	a\
  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. MT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [MT5 Training](./mt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            MT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [MT5
            Training](./mt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
        >>> model = MT5Model.from_pretrained("google/mt5-small")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for MT5Model.
        >>> # This is not needed for torch's MT5ForConditionalGeneration as it does this internally using labels arg.
        >>> decoder_input_ids = model._shift_right(decoder_input_ids)

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  r   rL  r   rT  r   r   r   r1   rQ  r<   rR  r  r   rL  r   r   r   r   r   rT  r   r   )rQ  r   decoder_hidden_statesdecoder_attentionsrS  encoder_last_hidden_stater   encoder_attentions)rF   r   rU  ro  rX   r   lenrq  r   rQ  r   r<   rR  rS  )r*   r  r   r  r  r}  r   rL  r~  r   r   rT  r   r   rZ  r<   decoder_outputss                    r.   r>   zMT5Model.forwardl  sR   H "+!6IDKK<Q<Q	%0%<k$++B]B] ""ll#-+"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (* ,,'1/+"/#1/!5#) ' 
 "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r/   NNNNNNNNNNNNN)r?   r@   rA   r  
model_typer   r0  "_keys_to_ignore_on_load_unexpected_tied_weights_keysr$   rx  rH  r   r&   
LongTensorFloatTensor
BoolTensorrY  r
   rY   r@  r   r>   rB   rC   s   @r.   r  r  5  s   " J*r)s&'6'6y $:
  .23759:>BF(,-159!%)-,0#'26s
##d*s
 ))D0s
 !++d2	s

 !& 0 04 7s
 uU%6%6784?s
 s
 ||d*s
  %||d2s
 $;s
  $;s
 #Tks
 D[s
 ((4/s
  
u  	!$6	6!s
 s
r/   r  z;
    MT5 Model with a `language modeling` head on top.
    )custom_introc            !           e Zd ZU dZdZeed<   dgZddddZdef fdZ	d Z
d	 Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  d
z  dej                  d
z  dej                  d
z  dej                   d
z  deeej$                        d
z  ded
z  dej                  d
z  dej                  d
z  dej                  d
z  ded
z  ded
z  ded
z  ded
z  dej                  d
z  deej                     ez  fd       Zdej$                  fdZ xZS )r  a  
    Examples:

    ```python
    >>> from transformers import MT5ForConditionalGeneration, AutoTokenizer

    >>> model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
    ```re  rF   rf  rg  )ri  rj  zlm_head.weightc                    t         |   |       |j                  | _        t	        j
                  |j                  |j                        | _        t        j                  |      }d|_
        d|_        t        |      | _        t        j                  |      }d|_
        |j                  |_        t        |      | _        t	        j"                  |j                  |j                  d      | _        | j'                          y )NFTrI   )r#   r$   rL   	model_dimr   r   r<  r  rm  rn  rs   r   r:  ro  rp  r?  rq  rK   r  rC  rr  s       r.   r$   z$MT5ForConditionalGeneration.__init__  s     ll6#4#4fnnEv.$)!#( /v.$(!$*$=$=!/yy1B1BO 	r/   c                     | j                   S rW   rv  rw  s    r.   rx  z0MT5ForConditionalGeneration.get_input_embeddings  ry  r/   c                 ~    || _         | j                  j                  |       | j                  j                  |       y rW   r{  rF  s     r.   rH  z0MT5ForConditionalGeneration.set_input_embeddings  r|  r/   Nr  r   r  r  r}  r   rL  r~  labelsr   r   rT  r   r   r  c                 t   |
|
n| j                   j                  }
||n| j                   j                  }|| j                  ||||||      }nI|rGt	        |t
              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }|	||| j                  |	      }| j                  |||||||
||||      }|d   }| j                  |      }d}|	^t        d	      }|	j                  |j                        }	 ||j                  d
|j                  d
            |	j                  d
            }|s|f|dd z   |z   }||f|z   S |S t        |||j                   |j"                  |j$                  |j&                  |j(                  |j"                  |j$                  	      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. MT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [MT5 Training](./mt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            MT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [MT5
            Training](./mt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, MT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
        >>> model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer(
        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        >>> # studies have shown that owning a dog is good for you.
        ```Nr  r   r   r1   r  r  r'  ignore_indexr2   	losslogitsr   r  r  rS  r  r   r  )rF   r   rU  ro  rX   r   r  r/  rq  r  r   r4   r   r   rV  r   r   r<   rR  rS  rQ  )r*   r  r   r  r  r}  r   rL  r~  r  r   r   rT  r   r   rZ  r<   r  sequence_output	lm_logitsr  loss_fctoutputs                          r.   r>   z#MT5ForConditionalGeneration.forward!  s    T "+!6IDKK<Q<Q	%0%<k$++B]B] ""ll#-+"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 ,,'1/+"/#1/!5#) ' 
 *!,LL1	'T:HYYy//0FINN2y~~b/ABFKKPROTD\OAB$77/IF)-)9TGf$EvE+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r/   c                 $    | j                  |      S rW   )r/  )r*   r  s     r.   %prepare_decoder_input_ids_from_labelszAMT5ForConditionalGeneration.prepare_decoder_input_ids_from_labels  s      ((r/   )NNNNNNNNNNNNNN)r?   r@   rA   r  r  r   r0  r  r  r$   rx  rH  r   r&   r  r  r  rY  rY   r
   r@  r   r>   r  rB   rC   s   @r.   r  r    s     J*r)s&'6'6)y *:
  .23759:>=A(,26:>*.!%)-,0#'26L
##d*L
 ))D0L
 !++d2	L

 !& 0 04 7L
 uU\\23d:L
 L
 ((4/L
  %0047L
   4'L
 $;L
  $;L
 #TkL
 D[L
 ((4/L
" 
u  	!O	3#L
 L
^)ELL )r/   r  c                       e Zd ZU dZdZeed<   ddiZdef fdZd Z	d Z
e	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  ded	z  ded	z  ded	z  deej                     ez  fd       Z xZS )r  a  
    Examples:

    ```python
    >>> from transformers import MT5EncoderModel, AutoTokenizer

    >>> model = MT5EncoderModel.from_pretrained("google/mt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> input_ids = tokenizer(article, return_tensors="pt").input_ids
    >>> outputs = model(input_ids)
    >>> hidden_state = outputs.last_hidden_state
    ```re  rF   ri  rg  c                     t         |   |       t        j                  |j                  |j
                        | _        |}d|_        d|_        t        |      | _
        | j                          y )NF)r#   r$   r   r   r<  rL   r  r   rW  r:  ro  rC  )r*   rF   rs  r-   s      r.   r$   zMT5EncoderModel.__init__  sY     ll6#4#4fnnE#( ,1)/ 	r/   c                     | j                   S rW   rv  rw  s    r.   rx  z$MT5EncoderModel.get_input_embeddings  ry  r/   c                 H    || _         | j                  j                  |       y rW   )r  ro  rH  rF  s     r.   rH  z$MT5EncoderModel.set_input_embeddings  s    $)).9r/   Nr  r   rL  r   rT  r   r  c                 h    ||n| j                   j                  }| j                  ||||||      }|S )aJ  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. MT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [MT5 Training](./mt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, MT5EncoderModel

        >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
        >>> model = MT5EncoderModel.from_pretrained("google/mt5-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```r  )rF   rU  ro  )	r*   r  r   rL  r   rT  r   rZ  r}  s	            r.   r>   zMT5EncoderModel.forward  sH    F &1%<k$++B]B],,)'/!5# ' 
 r/   )NNNNNN)r?   r@   rA   r  r  r   r0  r  r$   rx  rH  r   r&   r  r  r@  rY  r   r>   rB   rC   s   @r.   r  r    s     J%

y 
:  .23726)-,0#',##d*, ))D0, ((4/	,
  $;, #Tk, D[, 
u  	!O	3, ,r/   r  z
    MT5 model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE
    tasks.
    c                   ~    e Zd ZdgZdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e
ej                     dz  d
ej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  deez  fd       Z xZS )MT5ForSequenceClassificationrf  rF   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y rW   )r#   r$   r  r	  r   classification_headrC  rU   s     r.   r$   z%MT5ForSequenceClassification.__init__  s6     #F+#8#@  	r/   Nr  r   r  r  r}  rL  r~  r  r   r   rT  r   r  c                    ||n| j                   j                  }|d}	|$|"t        d| j                  j                         | ||t        d      | j                  |      }| j                  ||||||||	|
||      }|d   }|j                  | j                   j                        j                  |j                        }t        t        j                  |j                  d            j!                         dk(  d       |j"                  \  }}}||ddf   j%                  |d	|      ddd	ddf   }| j'                  |      }d}||j                  |j                        }| j                   j(                  | j                   j*                  dk(  rd
| j                   _        nv| j                   j*                  dkD  rL|j,                  t        j.                  k(  s|j,                  t        j0                  k(  rd| j                   _        nd| j                   _        | j                   j(                  d
k(  rSt3               }| j                   j*                  dk(  r& ||j5                         |j5                               }n |||      }n| j                   j(                  dk(  rGt7               } ||j%                  d	| j                   j*                        |j%                  d	            }n,| j                   j(                  dk(  rt9               } |||      }|s|f|dd z   }||f|z   S |S t;        |||j<                  |j>                  |j@                  |jB                  |jD                  |jF                  |jH                  	      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. MT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [MT5 Training](./mt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            MT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [MT5
            Training](./mt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NFz8Passing input embeddings is currently not supported for If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.)
r   r  r  r}  rL  r~  r   r   rT  r   r   r   z7All examples must have the same number of <eos> tokens.r2   
regressionsingle_label_classificationmulti_label_classificationr  )%rF   rU  NotImplementedErrorr-   r?   r*  r/  r	  eqeos_token_idr4   r   r   r&   unique_consecutivesumnumelr   r   r  problem_typer  r9   r   r   r   squeezer   r   r   r   r  r  rS  r  r   r  )r*   r  r   r  r  r}  rL  r~  r  r   r   rT  r   rZ  r   r  eos_maskr   _r+   sentence_representationr  r  r  r  s                            r.   r>   z$MT5ForSequenceClassification.forward$  sN   b &1%<k$++B]B]I!:%J4>>KbKbJcd  $)>)F  U 
 !% 1 1) <"")/#9+'"7/!5# # 
 "!*<< 8 89<<_=S=ST$$X\\!_5;;=BE	
 &5%:%:"
A{"1(A+">"C"CJPRT_"`abdfhiai"j))*ABYYv}}-F{{''/;;))Q./;DKK,[[++a/V\\UZZ5OSYS_S_chclclSl/LDKK,/KDKK,{{''<7"9;;))Q.#FNN$4fnn6FGD#FF3D))-JJ+-B0F0F GUWY))-II,./Y,F)-)9TGf$EvE.#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r/   )NNNNNNNNNNNN)r?   r@   rA   r  r   r$   r   r&   r  rY   listr  r@  rY  r   r>   rB   rC   s   @r.   r  r    sU    +s)s&y   .2.259:>:>26:>*.!%)-,0#'A
##d*A
 t+A
 !++d2	A

 !& 0 04 7A
 e//047A
 ((4/A
  %0047A
   4'A
 $;A
  $;A
 #TkA
 D[A
 
0	0A
 A
r/   r  c                        e Zd Zdef fdZe	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  d	edz  d
edz  de	ej                     e
z  fd       Z xZS )r"  rF   c                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y rW   )r#   r$   r  r  r	  r   rP   r   rR   rK   r+   r  rC  rU   s     r.   r$   z"MT5ForTokenClassification.__init__  sj      ++*62zz&";";<))F$6$68I8IJ 	r/   Nr  r   rL  r  r   rT  r   r  c                    ||n| j                   j                  }| j                  ||||||      }	|	d   }
| j                  |
      }
| j	                  |
      }d}|<t               } ||j                  d| j                        |j                  d            }|s||	dd f}||f|z   S |S t        |||	j                  |	j                        S )a>  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. MT5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [MT5 Training](./t5#training).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        N)r   rL  r   rT  r   r   r2   r1   )r  r  r<   rR  )rF   rU  r	  rR   r  r   r   r  r   r<   rR  )r*   r  r   rL  r  r   rT  r   rZ  r   r<   r  r  r  r  s                  r.   r>   z!MT5ForTokenClassification.forward  s    6 &1%<k$++B]B]"")'/!5# # 
  
]3/')HFKKDOO<fkk"oNDgam,F)-)9TGf$EvE$!//))	
 	
r/   )NNNNNNN)r?   r@   rA   r   r$   r   r&   rY   r@  rY  r   r>   rB   rC   s   @r.   r"  r"    s    	y 	  *..2-1&*)-,0#'6
<<$&6
 t+6
 ||d*	6

 t#6
  $;6
 #Tk6
 D[6
 
u||	4	46
 6
r/   r"  c                       e Zd ZdgZdddZdef fdZd Zd Ze		 	 	 	 	 	 	 	 	 	 	 	 	 dd	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  deee
j                        dz  de
j                  dz  de
j                  dz  de
j                  dz  de
j                  dz  dedz  dedz  dedz  dedz  dee
j                     ez  fd       Z xZS )r  rf  rg  rh  rF   c                 $   t         |   |       |j                  | _        t	        j
                  |j                  |j                        | _        t        j                  |      }d|_
        d|_        t        |      | _        t        j                  |      }d|_
        |j                  |_        t        |      | _        |j"                  | _        t	        j$                  |j&                  |j"                        | _        | j+                          y rl  )r#   r$   rL   r  r   r   r<  r  rm  rn  rs   r   r:  ro  rp  r?  rq  r  rK   r+   r  rC  rr  s       r.   r$   z MT5ForQuestionAnswering.__init__  s     ll6#4#4fnnEv.$)!#( /v.$(!$*$=$=!/ ++))F$6$68I8IJ 	r/   c                     | j                   S rW   rv  rw  s    r.   rx  z,MT5ForQuestionAnswering.get_input_embeddings  ry  r/   c                 ~    || _         | j                  j                  |       | j                  j                  |       y rW   r{  rF  s     r.   rH  z,MT5ForQuestionAnswering.set_input_embeddings  r|  r/   Nr  r   r  r  r}  start_positionsend_positionsrL  r~  r   r   rT  r   r  c                    ||n| j                   j                  }|
|
n| j                   j                  }
||d}
| |	|t        d      | j	                  |      }|
|
n| j                   j                  }
||n| j                   j                  }|| j                  ||||||      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }| j                  |||	d|||
|||	
      }|d   }| j                  |      }|j                  dd
      \  }}|j                  d
      j                         }|j                  d
      j                         }d}||t        |j                               dkD  r*|j                  d
      j                  |j                         }t        |j                               dkD  r*|j                  d
      j                  |j                         }|j                  d      }|j#                  d|      }|j#                  d|      }t%        |      } |||      } |||      }||z   dz  }|s||f|dd z   |z   }||f|z   S |S t'        ||||j(                  |j*                  |j,                  |j.                  |j0                  |j*                  |j,                  
      S )az  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. T5 is a model with relative position embeddings so you
            should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [T5 Training](./t5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        NFr  r  r   r   r1   r  )
r  r   rL  r   r   r   r   r   rT  r   r2   r   r  )
r  start_logits
end_logitsr   r  r  rS  r  r   r  )rF   rU  r   r*  r/  ro  rX   r   r  rq  r  splitr  r   rV  r4   r   r   r   r   r   r<   rR  rS  rQ  )r*   r  r   r  r  r}  r  r  rL  r~  r   r   rT  r   rZ  r<   r  r  r  r  r  
total_lossignored_indexr  
start_lossend_lossr  s                              r.   r>   zMT5ForQuestionAnswering.forward  s0   ^ &1%<k$++B]B]!*!6IDKK<Q<Q	&=+DI
 $)>)F  U 
 !% 1 1) <!*!6IDKK<Q<Q	%0%<k$++B]B] ""ll#-+"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (* ,,'1/ "/#1/!5# ' 
 *!,1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""="@"@ATAT"U=%%'(1, - 5 5b 9 < <Z=N=N O(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J//!"2EEWF/9/EZMF*Q6Q2%!+;;"1"?"?.99,==&5&G&G"1"?"?.99
 	
r/   r  )r?   r@   rA   r  r  r   r$   rx  rH  r   r&   r  r  r  rY  rY   r@  r   r>   rB   rC   s   @r.   r  r    s   *r)s&'6'6y .:
  .23759:>=A371526:>!%)-,0#'I
##d*I
 ))D0I
 !++d2	I

 !& 0 04 7I
 uU\\23d:I
 ))D0I
 ''$.I
 ((4/I
  %0047I
 $;I
  $;I
 #TkI
 D[I
  
u  	!$G	G!I
 I
r/   r  )r  r  r  r  r"  r  r  )Ar  rm  r   r&   r   torch.nnr   r   r   rK  r   r  activationsr	   cache_utilsr
   r   r   
generationr   masking_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   utilsr   r   r   r   r   configuration_mt5r   
get_loggerr?   r|   Moduler!   rE   r^   rf   rp   r   r   r   r   r  r:  r  r  r  r  r"  r  __all__r8  r/   r.   <module>r     s        A A & ! C C ) J 9   . ^ ^ ( 
		H	%+299 +4ryy .BII < &I299 IZBII F!RYY !JZ
) Z
|BII $ ^! ^! ^!Dx
! x
v k
! k
 k
\ 
I)"4o I)
I)X Y( Y Yx O
#5 O
O
d E
 2 E
 E
P s
0 s
 s
lr/   