Ë
    «q±iÊU ã                   óÆ  — d Z ddlZddlZddlmZ ddlZddlmZmZ ddlm	Z	 ddl
mZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ  ej<                  e«      Z d?d„Z!d„ Z"d?d„Z#d„ Z$e ed¬«       G d„ de«      «       «       Z%e ed¬«       G d„ de«      «       «       Z&e ed¬«       G d„ de«      «       «       Z'e ed¬«       G d„ de«      «       «       Z(e G d „ d!e«      «       Z) G d"„ d#ejT                  «      Z+ G d$„ d%ejX                  «      Z- G d&„ d'ejX                  «      Z. G d(„ d)ejX                  «      Z/ G d*„ d+e«      Z0 G d,„ d-e«      Z1 ed.¬«       G d/„ d0e)«      «       Z2 ed1¬«       G d2„ d3e)«      «       Z3e G d4„ d5e)«      «       Z4 ed6¬«       G d7„ d8e)e«      «       Z5 ed9¬«       G d:„ d;e)e«      «       Z6 G d<„ d=e)«      Z7g d>¢Z8y)@zRPyTorch ProphetNet model, ported from ProphetNet repo(fairsequery_states version).é    N)Ú	dataclass)ÚTensorÚnn)Ú	LayerNormé   )ÚACT2FN)ÚCacheÚDynamicCacheÚEncoderDecoderCache)ÚGenerationMixin)ÚGradientCheckpointingLayer)ÚBaseModelOutput)ÚPreTrainedModel)ÚModelOutputÚauto_docstringÚloggingé   )ÚProphetNetConfigc                 óÄ   — |r/t         j                  j                  | j                  «       |¬«      S t         j                  j                  | |t        j
                  ¬«      S )N©Údim©r   Údtype)r   Ú
functionalÚsoftmaxÚfloatÚtorchÚfloat32)Úhidden_stater   Ú
onnx_traces      úd/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/prophetnet/modeling_prophetnet.pyr   r   %   sH   € ÙÜ}‰}×$Ñ$ \×%7Ñ%7Ó%9¸sÐ$ÓCÐCä}‰}×$Ñ$ \°sÄ%Ç-Á-Ð$ÓPÐPó    c                 óz  — t        j                  || | f||¬«      t        j                  |«      j                  z  }|j	                  «       j                  «       }t        |«      D ]0  }||   j                  dd¬«       ||   j                  | dz   «       Œ2 d|dd…dd…df<   t        j                  ||gd¬«      S )	z@
    This function computes the bias for the predict stream
    )Údevicer   r   F)Úwrapr   Né   r   )
r   ÚonesÚfinfoÚminÚdetachÚcloneÚrangeÚfill_diagonal_Útriu_Úcat)Úsequence_lengthÚngramr$   r   Ú
left_blockÚright_blockÚ
stream_idxs          r!   Úngram_attention_biasr5   ,   s¾   € ô
 	
‰
E˜?¨OÐ<ÀVÐSXÔYÔ\a×\gÑ\gÐhmÓ\n×\rÑ\rÑrð ð ×#Ñ#Ó%×+Ñ+Ó-€Kä˜E“lò 6ˆ
ØJÑ×.Ñ.¨q°uÐ.Ô=Ø:Ñ×$Ñ$ j [°1¡_Õ5ð6ð €JŠq’!QˆwÑÜ9‰9j +Ð.°AÔ6Ð6r"   c                 ó¦  — | }d}|rX| dz  } |t        j                  |t        j                  |«      «      j                  «       | z  z   }t        j                  |«      }n)t        j
                  |t        j                  |«      «      }| dz  }t        j                  ||«      }|t        j                  |j                  «       |z  «      t        j                  ||z  «      z  | |z
  z  z   }t        j                  |t        j                  |«      | dz
  z  «      j                  «       }|t        j                  ||j                  «       |«      z   }|S )zo
    This function computes individual parts of the relative position buckets. For more detail, see paper.
    r   r&   r   )r   ÚltÚ
zeros_likeÚintÚabsÚmaxÚlogr   Úmathr)   Ú	ones_likeÚwhere)	Únum_bucketsÚmax_distanceÚrelative_positionsÚis_bidirectionalÚinv_relative_positionsÚrel_positions_bucketÚ	max_exactÚis_smallÚval_if_larges	            r!   Úcompute_relative_bucketsrI   =   sG  € ð 1Ð0ÐØÐáØ! QÑ&ˆà Üh‰hÐ-¬u×/?Ñ/?Ð@VÓ/WÓX×\Ñ\Ó^ÐalÑlñmð 	ô "'§¡Ð+AÓ!BÑä!&§¡Ð+AÄ5×CSÑCSÐTjÓCkÓ!lÐà˜qÑ €IÜx‰xÐ.°	Ó:€HØœuŸy™yÐ)?×)EÑ)EÓ)GÈ)Ñ)SÓTÔW[×W_ÑW_ØyÑ óXñ  à	yÑ	 ñ "ñ "€Lô —9‘9˜\¬5¯?©?¸<Ó+HÈKÐZ[ÉOÑ+\Ó]×aÑaÓc€LØ/´%·+±+¸hÐH^×HbÑHbÓHdÐfrÓ2sÑsÐØÐr"   c                 ó’  — |j                  d«      j                  d|j                  d«      d«      }||j                  d«      z
  }t        j                  |dz
  |fd¬«      j                  d«      }|j                  d|j                  d«      d«      }||j                  d«      z
  }t        | ||d¬«      }t        | ||d¬«      }||fS )zm
    This function computes both main and predict relative position buckets. For more detail, see paper.
    r   éÿÿÿÿr   F)rC   )Ú	unsqueezeÚrepeatÚsizer   r/   rI   )r@   rA   Úposition_idsÚmain_stream_relative_positionsÚ$predicting_stream_relative_positionsÚmain_relative_position_bucketsÚ!predict_relative_position_bucketss          r!   Ú#compute_all_stream_relative_bucketsrT   X   sí   € ð
 &2×%;Ñ%;¸AÓ%>×%EÑ%EÀaÈ×IZÑIZÐ[]ÓI^Ð`aÓ%bÐ"Ø%CÀl×F\ÑF\Ð]_ÓF`Ñ%`Ð"ô ,1¯9©9°lÀQÑ6FÈÐ5UÐ[]Ô+^×+hÑ+hÐijÓ+kÐ(Ø+O×+VÑ+VÐWXÐZf×ZkÑZkÐlnÓZoÐqrÓ+sÐ(Ø+OÐR^×RhÑRhÐikÓRlÑ+lÐ(ô &>Ø\Ð#AÐTYô&Ð"ô )AØ\Ð#GÐZ_ô)Ð%ð *Ð+LÐLÐLr"   zF
    Base class for sequence-to-sequence language models outputs.
    )Úcustom_introc                   ó  — e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
edz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed	<   dZeej                     dz  ed
<   dZeej                     dz  ed<   dZej                  dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   y)ÚProphetNetSeq2SeqLMOutputaÖ  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the self-attention heads.
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    NÚlossÚlogitsÚlogits_ngramÚpast_key_valuesÚdecoder_hidden_statesÚdecoder_ngram_hidden_statesÚdecoder_attentionsÚdecoder_ngram_attentionsÚcross_attentionsÚencoder_last_hidden_stateÚencoder_hidden_statesÚencoder_attentions)Ú__name__Ú
__module__Ú__qualname__Ú__doc__rX   r   ÚFloatTensorÚ__annotations__rY   rZ   r[   r	   r\   Útupler]   r^   r_   r`   ra   rb   rc   © r"   r!   rW   rW   o   s)  … ñð< &*€Dˆ%×
Ñ
˜dÑ
"Ó)Ø'+€FˆE×Ñ Ñ$Ó+Ø-1€L%×#Ñ# dÑ*Ó1Ø$(€OU˜T‘\Ó(Ø=AÐ˜5 ×!2Ñ!2Ñ3°dÑ:ÓAØCGÐ  u×'8Ñ'8Ñ!9¸DÑ!@ÓGØ:>Ð˜˜e×/Ñ/Ñ0°4Ñ7Ó>Ø@DÐ˜e E×$5Ñ$5Ñ6¸Ñ=ÓDØ8<Ðe˜E×-Ñ-Ñ.°Ñ5Ó<Ø:>Ð˜u×0Ñ0°4Ñ7Ó>Ø=AÐ˜5 ×!2Ñ!2Ñ3°dÑ:ÓAØ:>Ð˜˜e×/Ñ/Ñ0°4Ñ7Ô>r"   rW   z‹
    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
    decoding.
    c                   óÖ  — e Zd ZU dZej
                  ed<   dZej
                  dz  ed<   dZe	dz  ed<   dZ
eej
                     dz  ed<   dZeej
                     dz  ed<   dZeej
                     dz  ed<   dZeej
                     dz  ed	<   dZeej
                     dz  ed
<   dZej
                  dz  ed<   dZeej
                     dz  ed<   dZeej
                     dz  ed<   y)ÚProphetNetSeq2SeqModelOutputaÀ  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size,ngram * decoder_sequence_length, config.vocab_size)`, *optional*):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    Úlast_hidden_stateNÚlast_hidden_state_ngramr[   r\   r]   r^   r_   r`   ra   rb   rc   )rd   re   rf   rg   r   rh   ri   ro   r[   r	   r\   rj   r]   r^   r_   r`   ra   rb   rc   rk   r"   r!   rm   rm   ¢   s  … ñð: ×(Ñ(Ó(Ø8<Ð˜U×.Ñ.°Ñ5Ó<Ø$(€OU˜T‘\Ó(Ø=AÐ˜5 ×!2Ñ!2Ñ3°dÑ:ÓAØCGÐ  u×'8Ñ'8Ñ!9¸DÑ!@ÓGØ:>Ð˜˜e×/Ñ/Ñ0°4Ñ7Ó>Ø@DÐ˜e E×$5Ñ$5Ñ6¸Ñ=ÓDØ8<Ðe˜E×-Ñ-Ñ.°Ñ5Ó<Ø:>Ð˜u×0Ñ0°4Ñ7Ó>Ø=AÐ˜5 ×!2Ñ!2Ñ3°dÑ:ÓAØ:>Ð˜˜e×/Ñ/Ñ0°4Ñ7Ô>r"   rm   zs
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                   óR  — e Zd ZU dZej
                  ed<   dZej
                  dz  ed<   dZe	dz  ed<   dZ
eej
                     dz  ed<   dZeej
                     dz  ed<   dZeej
                     dz  ed<   dZeej
                     dz  ed	<   dZeej
                     dz  ed
<   y)ÚProphetNetDecoderModelOutputaÇ  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states_ngram (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    rn   Nro   r[   Úhidden_statesÚhidden_states_ngramÚ
attentionsÚngram_attentionsr`   )rd   re   rf   rg   r   rh   ri   ro   r[   r	   rr   rj   rs   rt   ru   r`   rk   r"   r!   rq   rq   Ô   s¾   … ñð6 ×(Ñ(Ó(Ø8<Ð˜U×.Ñ.°Ñ5Ó<Ø$(€OU˜T‘\Ó(Ø59€M5˜×*Ñ*Ñ+¨dÑ2Ó9Ø;?Ð˜˜u×0Ñ0Ñ1°DÑ8Ó?Ø26€Je×'Ñ'Ñ(¨4Ñ/Ó6Ø8<Ðe˜E×-Ñ-Ñ.°Ñ5Ó<Ø8<Ðe˜E×-Ñ-Ñ.°Ñ5Ô<r"   rq   c                   ó„  — e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
edz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed	<   dZeej                     dz  ed
<   dZeej                     dz  ed<   y)ÚProphetNetDecoderLMOutputa¶	  
    ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states_ngram (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    NrX   rY   rZ   r[   rr   rs   rt   ru   r`   )rd   re   rf   rg   rX   r   rh   ri   rY   rZ   r[   r	   rr   rj   rs   rt   ru   r`   rk   r"   r!   rw   rw      sÜ   … ñ ðD &*€Dˆ%×
Ñ
˜dÑ
"Ó)Ø'+€FˆE×Ñ Ñ$Ó+Ø-1€L%×#Ñ# dÑ*Ó1Ø$(€OU˜T‘\Ó(Ø59€M5˜×*Ñ*Ñ+¨dÑ2Ó9Ø;?Ð˜˜u×0Ñ0Ñ1°DÑ8Ó?Ø26€Je×'Ñ'Ñ(¨4Ñ/Ó6Ø8<Ðe˜E×-Ñ-Ñ.°Ñ5Ó<Ø8<Ðe˜E×-Ñ-Ñ.°Ñ5Ô<r"   rw   c                   ó&   — e Zd ZU eed<   dZdZd„ Zy)ÚProphetNetPreTrainedModelÚconfigÚ
prophetnetTc                 ó‚  — | j                   j                  }| j                   j                  }|€J d«       ‚|j                  |j                  «      }|dd d…f   j                  «       |ddd …f<   ||d<   |€J d«       ‚|j                  |dk(  |«       t        j                  |dk\  «      j                  «       sJ d	«       ‚|S )
Nz™self.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the pad_token_id. See ProphetNet docs for more information.rK   r   ).r   z1self.model.config.pad_token_id has to be defined.éœÿÿÿr   z8Verify that `shifted_input_ids` has only positive values)
rz   Údecoder_start_token_idÚpad_token_idÚ	new_zerosÚshaper+   Úmasked_fill_r   ÚallÚitem)ÚselfÚ	input_idsr~   r   Úshifted_input_idss        r!   Ú_shift_rightz&ProphetNetPreTrainedModel._shift_right:  sØ   € Ø!%§¡×!CÑ!CÐØ—{‘{×/Ñ/ˆà%Ð1ð 	
ðFó	
Ð1ð &×/Ñ/°	·±Ó@ÐØ%.¨s°C°R°C¨xÑ%8×%>Ñ%>Ó%@Ð˜#˜q™r˜'Ñ"Ø$:Ð˜&Ñ!àÐ'Ð\Ð)\Ó\Ð'à×&Ñ&Ð'8¸DÑ'@À,ÔOäy‰yÐ*¨aÑ/Ó0×5Ñ5Ô7ÐsÐ9sÓsÐ7à Ð r"   N)rd   re   rf   r   ri   Úbase_model_prefixÚsupports_gradient_checkpointingrˆ   rk   r"   r!   ry   ry   4  s   … àÓØ$ÐØ&*Ð#ó!r"   ry   c                   óB   ‡ — e Zd ZdZdeddfˆ fd„Zdˆ fd„	Zˆ fd„Zˆ xZS )	ÚProphetNetPositionalEmbeddingsa  
    This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
    based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
    the forward function.
    rz   ÚreturnNc                 ó†   •— |j                   | _        t        ‰|   |j                   |j                  |j
                  «       y ©N)Úmax_position_embeddingsÚ
max_lengthÚsuperÚ__init__Úhidden_sizer   ©r…   rz   Ú	__class__s     €r!   r“   z'ProphetNetPositionalEmbeddings.__init__X  s3   ø€ Ø ×8Ñ8ˆŒÜ‰Ñ˜×7Ñ7¸×9KÑ9KÈV×M`ÑM`Õar"   c                 óD  •— || j                   J d«       ‚|€ø|k|j                  «       dk7  rX|j                  «       }|d   |z   }t        j                  dt        j                  |¬«      t        | j                   |z   «      z  }n‹|€&t        j                  |t        j                  |¬«      }t        j                  |d¬«      j                  |«      |z  j	                  «       | j                   z   }|j                  d| j                  dz
  «      }t        ‰| -  |«      |fS )NzCIf position_ids is pre-computed then padding_idx should not be set.r   r   )r   r   ©r   r$   r   )Úpadding_idxÚget_seq_lengthr   r'   Úlongr9   ÚcumsumÚtype_asÚclampr‘   r’   Úforward)	r…   Úinputs_shaper$   Úattention_maskr[   rO   Úprev_num_input_idsÚnum_input_idsr–   s	           €r!   rŸ   z&ProphetNetPositionalEmbeddings.forward\  s  ø€ ØÐ$¨$×*:Ñ*:Ð*Bð 	
ØQó	
ÐCð ÐØÐ*¨×/MÑ/MÓ/OÐSTÒ/Tð &5×%CÑ%CÓ%EÐ"Ø ,¨Q¡Ð2DÑ DÜ$Ÿz™z¨&¼¿
¹
È6ÔRÜ˜×(Ñ(¨=Ñ8Ó9ñ ‘ð "Ð)Ü%*§Z¡Z°ÄEÇJÁJÐW]Ô%^Nô —L‘L °QÔ7×?Ñ?ÀÓOÐR`Ñ`ß‘$“&˜4×+Ñ+ñ ,ð
  ,×1Ñ1°!°T·_±_ÀqÑ5HÓIä‰w‰˜|Ó,¨lÐ:Ð:r"   c                 ó"   •— t         ‰|   |«      S r   )r’   rŸ   )r…   rO   r–   s     €r!   Ú_forwardz'ProphetNetPositionalEmbeddings._forwardx  s   ø€ Ü‰w‰˜|Ó,Ð,r"   )NNN)	rd   re   rf   rg   r   r“   rŸ   r¥   Ú__classcell__©r–   s   @r!   rŒ   rŒ   Q  s.   ø„ ñðbÐ/ð b°Dõ bõ;÷8-ð -r"   rŒ   c                   ó¦   ‡ — e Zd ZdZddedededz  fˆ fd„Z	 	 	 	 	 ddedz  dedz  d	edz  d
e	dz  de
j                  dz  deeedz  f   fd„Zˆ xZS )ÚProphetNetAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNrz   Únum_attn_headsÚ	layer_idxc                 ó¸  •— t         ‰|   «        |j                  }|j                  | _        |j                  | _        || _        ||z  | _        || _        | j                  |z  |k(  sJ d«       ‚t        j                  ||«      | _
        t        j                  ||«      | _        t        j                  ||«      | _        t        j                  ||«      | _        y )Nzw`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and `config.num_decoder_attention_heads`)r’   r“   r”   Úattention_dropoutÚdropoutrª   Úhead_dimr«   r   ÚLinearÚkey_projÚ
value_projÚ
query_projÚout_proj)r…   rz   rª   r«   r”   r–   s        €r!   r“   zProphetNetAttention.__init__  sº   ø€ Ü‰ÑÔØ×(Ñ(ˆà!'×!9Ñ!9ˆÔØ—~‘~ˆŒØ,ˆÔØ# ~Ñ5ˆŒØ"ˆŒà}‰}˜~Ñ-°Ò<ð 	
ð4ó	
Ð<ô
 Ÿ	™	 +¨{Ó;ˆŒÜŸ)™) K°Ó=ˆŒÜŸ)™) K°Ó=ˆŒäŸ	™	 +¨{Ó;ˆr"   Úkey_value_statesr¡   r[   Úoutput_attentionsÚcache_positionr   c                 ó˜  — |j                  «       \  }}}	|d u}
t        |j                  «       «      |||	gk(  sJ d|||	f› d|j                  «       › «       ‚| j                  |«      | j                  dz  z  }d}|St	        |t
        «      rA|j                  j                  | j                  «      }|
r|j                  }n|j                  }n|}|
r|n|}|
rK|I|rGj                  | j                     j                  }|j                  | j                     j                  }nè| j                  |«      }| j                  |«      }|j!                  |d| j"                  | j                  «      j%                  dd«      }|j!                  |d| j"                  | j                  «      j%                  dd«      }|T|
s|nd }j'                  ||| j                  d|i«      \  }}|
r)t	        |t
        «      rd	|j                  | j                  <   |j!                  ||| j"                  | j                  «      j%                  dd«      }|j                  d«      }t)        j*                  d
||j%                  dd«      «      }|| j"                  ||f}|j                  «       |k7  rt-        d|› d|j                  «       › «      ‚||j/                  «       dk(  rd }|| j"                  d|f}|2|j                  «       |k7  rt-        d|› d|j                  «       › «      ‚|||z   }|r|}nd }t0        j2                  j5                  |d¬«      }t0        j2                  j7                  || j8                  | j:                  ¬«      }t)        j*                  d
||«      }|| j"                  || j                  f}|j                  «       |k7  rt-        d|› d|j                  «       › «      ‚|j%                  dd«      j=                  |||	«      }| j?                  |«      }t0        j2                  j7                  || j6                  | j:                  ¬«      }||fS )Nz Size of hidden states should be z	, but is ç      à?FrK   r   r&   r·   Tzbsij,bsjk->bsikr   z#Attention weights should have size r   z Attention mask should have size r   ©ÚpÚtrainingz `attn_output` should have shape ú, but is of shape ) rN   Úlistr³   r¯   Ú
isinstancer   Ú
is_updatedÚgetr«   Úcross_attention_cacheÚself_attention_cacheÚlayersÚkeysÚvaluesr±   r²   Úviewrª   Ú	transposeÚupdater   ÚeinsumÚ
ValueErrorr   r   r   r   r®   r­   r¼   Úreshaper´   )r…   rr   rµ   r¡   r[   r¶   r·   Ú
batch_sizeÚtgt_lenr”   Úis_cross_attentionÚquery_statesrÀ   Úcurr_past_key_valuesÚcurrent_statesÚ
key_statesÚvalue_statesÚsrc_lenÚattn_weightsÚexpected_shapeÚattn_weights_reshapedÚ
attn_probsÚattn_outputs                          r!   rŸ   zProphetNetAttention.forward”  sO  € ð ,9×+=Ñ+=Ó+?Ñ(ˆ
G˜[ð .°TÐ9ÐÜM×&Ñ&Ó(Ó)ØØØð.
ò 
ð 	pð .¨j¸'À;Ð.NÐ-OÈyÐYf×YkÑYkÓYmÐXnÐoó		pð 
ð —‘ }Ó5¸¿¹ÈÑ9KÑLˆàˆ
ØÐ&Ü˜/Ô+>Ô?Ø,×7Ñ7×;Ñ;¸D¿N¹NÓK
Ù%à+:×+PÑ+PÑ(à+:×+OÑ+OÑ(à'6Ð$á-?Ñ)À]ˆÙ /Ð"=Á*à-×4Ñ4°T·^±^ÑD×IÑIˆJØ/×6Ñ6°t·~±~ÑF×MÑM‰LàŸ™ ~Ó6ˆJØŸ?™?¨>Ó:ˆLØ#Ÿ™¨°R¸×9LÑ9LÈdÏmÉmÓ\×fÑfÐghÐjkÓlˆJØ'×,Ñ,¨Z¸¸T×=PÑ=PÐRV×R_ÑR_Ó`×jÑjÐklÐnoÓpˆLàÐ*á7I¡ÈtØ+?×+FÑ+FØ ¨d¯n©nÐ?OÐQ_Ð>`ó,Ñ(
˜Lñ &¬*°_ÔFYÔ*ZØAEO×.Ñ.¨t¯~©~Ñ>à#×(Ñ(¨°W¸d×>QÑ>QÐSW×S`ÑS`Óa×kÑkÐlmÐopÓqˆØ—/‘/ !Ó$ˆä—|‘|Ð$5°|ÀZ×EYÑEYÐZ[Ð]^ÓE_Ó`ˆØ$ d×&9Ñ&9¸7ÀGÐLˆØ×ÑÓ .Ò0ÜÐBÀ>ÐBRÐR[Ð\h×\mÑ\mÓ\oÐ[pÐqÓrÐrð Ð%¨.×*<Ñ*<Ó*>À!Ò*CØ!ˆNà$ d×&9Ñ&9¸1¸gÐFˆØÐ%¨.×*=Ñ*=Ó*?À>Ò*QÜÐ?ÀÐ?OÈyÐYg×YlÑYlÓYnÐXoÐpÓqÐqØÐ%Ø'¨.Ñ8ˆLÙØ$0Ñ!à$(Ð!ä—}‘}×,Ñ,¨\¸rÐ,ÓBˆä—]‘]×*Ñ*ØØ×$Ñ$Ø—]‘]ð +ó 
ˆ
ô
 —l‘lÐ#4°jÀ,ÓOˆØ$ d×&9Ñ&9¸7ÀDÇMÁMÐRˆØ×ÑÓ Ò/ÜÐ?ÀÐ?OÐOaÐbm×brÑbrÓbtÐauÐvÓwÐwà!×+Ñ+¨A¨qÓ1×9Ñ9¸*ÀgÈ{Ó[ˆØ—m‘m KÓ0ˆä—m‘m×+Ñ+¨K¸4¿<¹<ÐRV×R_ÑR_Ð+Ó`ˆØÐ1Ð1Ð1r"   r   )NNNFN)rd   re   rf   rg   r   r9   r“   r   r	   Úboolr   rj   rŸ   r¦   r§   s   @r!   r©   r©   |  s¨   ø„ ÙGñ<Ð/ð <Àð <ÐQTÐW[ÑQ[õ <ð0 +/Ø(,Ø(,Ø).Ø.2ñ^2ð ! 4™-ð^2ð  ™ð	^2ð
  ™ð^2ð   $™;ð^2ð Ÿ™ tÑ+ð^2ð 
ˆvv ‘}Ð$Ñ	%÷^2r"   r©   c                   ó2   ‡ — e Zd ZdZdedefˆ fd„Zd„ Zˆ xZS )ÚProphetNetFeedForwardzm
    This is the residual two feed-forward layer block based on the original Transformer implementation.
    rz   Úffn_dimc                 ó*  •— t         ‰|   «        t        |j                     | _        t        j                  |j                  |«      | _        t        j                  ||j                  «      | _	        |j                  | _
        |j                  | _        y r   )r’   r“   r   Úactivation_functionÚactivation_fnr   r°   r”   ÚintermediateÚoutputÚactivation_dropoutr®   )r…   rz   rÞ   r–   s      €r!   r“   zProphetNetFeedForward.__init__ú  sk   ø€ Ü‰ÑÔÜ# F×$>Ñ$>Ñ?ˆÔÜŸI™I f×&8Ñ&8¸'ÓBˆÔÜ—i‘i ¨×);Ñ);Ó<ˆŒØ"(×";Ñ";ˆÔØ—~‘~ˆr"   c                 óD  — | j                  |«      }| j                  |«      }t        j                  j	                  || j
                  | j                  ¬«      }| j                  |«      }t        j                  j	                  || j                  | j                  ¬«      }|S )Nrº   )râ   rá   r   r   r®   rä   r¼   rã   )r…   rr   s     r!   rŸ   zProphetNetFeedForward.forward  s„   € Ø×)Ñ)¨-Ó8ˆØ×*Ñ*¨=Ó9ˆäŸ™×-Ñ-¨m¸t×?VÑ?VÐae×anÑanÐ-ÓoˆØŸ™ MÓ2ˆÜŸ™×-Ñ-¨m¸t¿|¹|ÐVZ×VcÑVcÐ-ÓdˆØÐr"   )	rd   re   rf   rg   r   r9   r“   rŸ   r¦   r§   s   @r!   rÝ   rÝ   õ  s!   ø„ ñð&Ð/ð &¸#õ &ör"   rÝ   c                   ó`   ‡ — e Zd Zd
defˆ fd„Zd„ Zd„ Z	 	 	 	 	 	 	 ddedz  fd„Zd„ Z	d	„ Z
ˆ xZS )ÚProphetNetNgramSelfAttentionNrz   c                 ó²  •— t         ‰|   «        |j                  | _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | j                  z  | _	        |j                  | _
        || _        | j                  | j                  z  |j                  k(  sJ d«       ‚t        j                  |j                  |j                  «      | _        t        j                  |j                  |j                  «      | _        t        j                  |j                  |j                  «      | _        t        j                  |j                  |j                  «      | _        t        j                  |j                  | j                  | j                  z  «      | _        d| _        y )Nz6config.hidden_size must be divisible by num_attn_headsF)r’   r“   r”   r@   Úrelative_max_distanceÚnum_decoder_attention_headsrª   r®   r­   r¯   r1   r«   r   r°   r±   r²   r³   r´   Úrelative_pos_embeddingsr    ©r…   rz   r«   r–   s      €r!   r“   z%ProphetNetNgramSelfAttention.__init__  sh  ø€ Ü‰ÑÔØ!×-Ñ-ˆÔà!×-Ñ-ˆÔØ%+×%AÑ%AˆÔ"Ø$×@Ñ@ˆÔØ—~‘~ˆŒØ!'×!9Ñ!9ˆÔØ×*Ñ*¨d×.AÑ.AÑAˆŒØ—\‘\ˆŒ
Ø"ˆŒà}‰}˜t×2Ñ2Ñ2°f×6HÑ6HÒHð 	
ØDó	
ÐHô Ÿ	™	 &×"4Ñ"4°f×6HÑ6HÓIˆŒÜŸ)™) F×$6Ñ$6¸×8JÑ8JÓKˆŒÜŸ)™) F×$6Ñ$6¸×8JÑ8JÓKˆŒô Ÿ	™	 &×"4Ñ"4°f×6HÑ6HÓIˆŒô (*§y¡y°×1CÑ1CÀT×EUÑEUÐX\×XkÑXkÑEkÓ'lˆÔ$ð  ˆr"   c                 óŽ   — |j                  ||| j                  | j                  «      j                  dd«      j	                  «       S ©Nr   r&   )rÇ   rª   r¯   rÈ   Ú
contiguous)r…   ÚtensorÚseq_lenrÍ   s       r!   Ú_shapez#ProphetNetNgramSelfAttention._shape+  s9   € Ø{‰{˜: w°×0CÑ0CÀTÇ]Á]ÓS×]Ñ]Ð^_ÐabÓc×nÑnÓpÐpr"   c                 ó   — d| _         y )NT)r    ©r…   s    r!   Úprepare_for_onnx_export_z5ProphetNetNgramSelfAttention.prepare_for_onnx_export_.  s	   € Øˆr"   r[   c	           	      ó8  — |j                  «       \  }	}
}t        |j                  «       «      |	|
|gk(  sJ d|	|
|f› d|j                  › «       ‚| j                  |«      }| j	                  |«      }| j                  |«      }|| j                  dz  z  }| j                  ||
|	«      }| j                  |d|	«      }| j                  |d|	«      }|	| j                  d| j                  f} |j                  |Ž } |j                  |Ž } |j                  |Ž }|j                  d| j                  z   d¬«      }|j                  d| j                  z   d¬«      }|j                  d| j                  z   d¬«      }|j                  d| j                  z   d¬«      }|d   |dd  }}|d   |dd  }}|d   |dd  }}|d   |dd  }}|Bt        |t        «      r|j                  }n|}|j                  ||| j                   d	|i«      \  }}|
d| j                  z   z  }t#        j$                  d
||j'                  dd«      «      }| j)                  ||||«      }||z   }|||z   }t+        |d| j,                  ¬«      j/                  |«      } t0        j2                  j5                  | | j6                  | j8                  ¬«      } t#        j$                  d
| |«      }!|!j'                  dd«      j                  |	d||«      }!| j;                  |!«      }!t#        j<                  |d«      j?                  |	| j                  | j                  || j                  «      }"t#        j<                  |D #cg c]  }#t#        j@                  ||#gd«      ‘Œ c}#d«      }$t#        j<                  |d¬«      }%t#        j@                  |D &cg c])  }&t#        j@                  ||&gd«      jC                  d«      ‘Œ+ c}&d«      }'t#        j$                  d|"|$f«      }(| jE                  |%|(||«      })|(|)z   }(|5|jG                  ddddd«      }|jI                  |(jJ                  «      }|(|z   }(t+        |(d| j,                  ¬«      j/                  |(«      }*t0        j2                  j5                  |*| j6                  | j8                  ¬«      }*t#        j$                  d|*|'j'                  dd«      f«      }+|+j'                  dd«      }+|+j                  |	| j                  ||«      }+| j;                  |+«      }+t#        j@                  |!|+gd«      j?                  |	d|«      },| j?                  |	| j                  |d«      } t0        j2                  j5                  |,| j4                  | j8                  ¬«      },|,| |*fS c c}#w c c}&w )Nz#`hidden_states` should be of shape r½   r¹   rK   r   r   r&   r   r·   zbntc,bncs->bntsr   )r   r    rº   zbnhtc,bnhsc->bnhtsé   zbnhts,bnhsc->bnhtc)&rN   r¾   r   r³   r±   r²   r¯   rò   rª   rÌ   Úchunkr1   r¿   r   rÃ   rÉ   r«   r   rÊ   rÈ   Ú get_main_relative_pos_embeddingsr   r    r   r   r   r®   r­   r¼   r´   ÚstackrÇ   r/   rL   Ú#get_predict_relative_pos_embeddingsÚpermuteÚtor   )-r…   rr   r[   r¡   Úextended_predict_attention_maskrR   rS   rO   r·   rÍ   Úngram_sequence_lengthr”   rÐ   rÓ   rÔ   Ú
proj_shapeÚhidden_states_listÚquery_states_listÚkey_states_listÚvalue_states_listÚmain_hidden_statesÚhidden_states_predict_listÚmain_query_statesÚpredict_query_states_listÚmain_key_statesÚpredict_key_states_listÚmain_value_statesÚpredict_value_states_listrÑ   r0   Úmain_attn_weightsÚmain_relative_pos_embeddingsÚmain_attn_probsÚmain_attn_outputÚpredict_query_statesÚkeyÚpredict_key_statesÚpredict_hidden_statesÚv_pÚpredict_value_statesÚpredict_attn_weightsÚpredict_relative_pos_embeddingsÚpredict_attn_probsÚpredict_attn_outputrÚ   s-                                                r!   rŸ   z$ProphetNetNgramSelfAttention.forward1  s  € ð :G×9KÑ9KÓ9MÑ6ˆ
Ð)¨;ÜM×&Ñ&Ó(Ó)¨jÐ:OÐQ\Ð-]Ò]ð 	
Ø1°*Ð>SÐU`Ð2`Ð1að bØ×#Ñ#Ð$ð&ó	
Ð]ð —‘ }Ó5ˆØ—]‘] =Ó1ˆ
Ø—‘ }Ó5ˆð $ t§}¡}°cÑ'9Ñ:ˆð —{‘{ <Ð1FÈ
ÓSˆØ—[‘[ ¨R°Ó<ˆ
Ø—{‘{ <°°ZÓ@ˆØ  $×"5Ñ"5°r¸4¿=¹=ÐIˆ
à+|×+Ñ+¨ZÐ8ˆØ'Z×'Ñ'¨Ð4ˆ
Ø+|×+Ñ+¨ZÐ8ˆð +×0Ñ0°°T·Z±Z±ÀQÐ0ÓGÐØ(×.Ñ.¨q°4·:±:©~À1Ð.ÓEÐØ$×*Ñ*¨1¨t¯z©z©>¸qÐ*ÓAˆØ(×.Ñ.¨q°4·:±:©~À1Ð.ÓEÐà9KÈAÑ9NÐPbÐcdÐceÐPfÐ6ÐØ7HÈÑ7KÐM^Ð_`Ð_aÐMbÐ4ÐØ3BÀ1Ñ3EÀÐWXÐWYÐGZÐ0ˆØ7HÈÑ7KÐM^Ð_`Ð_aÐMbÐ4Ðð Ð&Ü˜/Ô+>Ô?Ø'6×'KÑ'KÑ$à'6Ð$Ø1E×1LÑ1LØÐ!2°D·N±NÐEUÐWeÐDfó2Ñ.ˆOÐ.ð
 0°A¸¿
¹
±NÑCˆô "ŸL™LÐ):Ð<MÈ×OhÑOhÐijÐlmÓOnÓoÐð (,×'LÑ'LØÐ 1°<ÐA_ó(
Ð$ð .Ð0LÑLÐàÐ%Ø 1°NÑ BÐä!ØØØ—‘ô
÷ ‰'Ð#Ó
$ð	 	ô Ÿ-™-×/Ñ/°À4×CYÑCYÐdh×dqÑdqÐ/Órˆô
 !Ÿ<™<Ð(9¸?ÐL]Ó^Ðà+×5Ñ5°a¸Ó;×CÑCÀJÐPQÐSbÐdoÓpÐØŸ=™=Ð)9Ó:Ðô  %Ÿ{™{Ð+DÀaÓH×MÑMØ˜Ÿ
™
 D×$7Ñ$7¸È$Ï-É-ó 
Ðô
 #Ÿ[™[ÐZqÖ)rÐSV¬%¯)©)°_ÀcÐ4JÈAÕ*NÒ)rÐtuÓvÐô !&§¡Ð,FÈAÔ NÐô  %Ÿy™yØLeÖfÀSŒUY‰YÐ)¨3Ð/°Ó3×=Ñ=¸aÕ@ÒfÐhió 
Ðô  %Ÿ|™|Ð,@ÐCWÐYkÐBlÓmÐð +/×*RÑ*RØ!Ð#7¸ÐGhó+
Ð'ð
  4Ð6UÑUÐà*Ð6à.M×.UÑ.UÐVWÐYZÐ\]Ð_`ÐbcÓ.dÐ+Ø.M×.PÑ.PÐQe×QkÑQkÓ.lÐ+Ø#7Ð:YÑ#YÐ ä$Ø ØØ—‘ô
÷ ‰'Ð&Ó
'ð	 	ô  Ÿ]™]×2Ñ2Ø $×"8Ñ"8À4Ç=Á=ð 3ó 
Ðô $Ÿl™lØ Ð#5Ð7K×7UÑ7UÐVWÐYZÓ7[Ð"\ó
Ðð 2×;Ñ;¸A¸qÓAÐØ1×9Ñ9¸*ÀdÇjÁjÐRaÐcnÓoÐØ"Ÿm™mÐ,?Ó@Ðô —i‘iÐ!1Ð3FÐ GÈÓK×PÑPÐQ[Ð]_ÐalÓmˆà)×.Ñ.¨z¸4×;NÑ;NÐP_ÐacÓdˆä—m‘m×+Ñ+¨K¸4¿<¹<ÐRV×R_ÑR_Ð+Ó`ˆà˜OÐ-?Ð?Ð?ùò{ *sùò gs   ÍVÎ,.Vc                 ó  — |j                   \  }}}}|j                  ||||«      }|€Ç|j                   d d \  }}	t        j                  d|j                   d   dz   «      j	                  d«      j	                  d«      j                  ||	d«      j                  |j                  «      }
|
|j	                  d«      j                  ||	d«      z
  }
t        | j                  | j                  |
d«      }| j                  |«      }|j                  |j                   d d | j                  | j                  fz   «      }|j                  dddd«      }|j                  |j                   d d dz   «      }|j                  d| j                  d«      }|j                  d|j                   d   «      }|j                  «       }|j                  d|j!                  d«      «      }t        j"                  |d|¬«      }|j                  |||d«      }|S )	Nr&   r   rK   r   Fr   )rK   ©r   Úindex)r   rÇ   r   ÚarangerL   rM   rý   r$   rI   r@   ré   rë   rª   rü   rÌ   r›   rN   Úgather)r…   rr   rÖ   rO   rR   rÍ   rª   rÎ   rÕ   r0   rB   Úrel_pos_embeddingsr  s                r!   rù   z=ProphetNetNgramSelfAttention.get_main_relative_pos_embeddingsÔ  s  € ð 8D×7IÑ7IÑ4ˆ
N G¨WØ#×(Ñ(¨°^ÀWÈgÓVˆØ)Ð1Ø*7×*=Ñ*=¸b¸qÐ*AÑ'ˆJ˜ä—‘˜Q × 2Ñ 2°2Ñ 6¸Ñ :Ó;ß‘˜1“ß‘˜1“ß‘˜
 O°QÓ7ß‘L×'Ñ'Ó(ð ð "4°l×6LÑ6LÈQÓ6O×6VÑ6VÐWaÐcrÐtuÓ6vÑ!vÐÜ-EØ× Ñ  $×"<Ñ"<Ð>PÐRWó.Ð*ð
 "×9Ñ9¸-ÓHÐØ/×4Ñ4Ø×$Ñ$ R aÐ(¨D×,<Ñ,<¸d×>QÑ>QÐ+RÑRó
Ðð 0×7Ñ7¸¸1¸aÀÓCÐà/×7Ñ7¸×8JÑ8JÈ2ÈAÐ8NÐQVÑ8VÓWÐà)G×)NÑ)NÈqÐRV×ReÑReÐghÓ)iÐ&à)G×)LÑ)LØÐ.×4Ñ4°RÑ8ó*
Ð&ð *H×)LÑ)LÓ)NÐ&à/×7Ñ7¸Ð<N×<SÑ<SÐTVÓ<WÓXÐä',§|¡|Ð4FÈAÐUsÔ'tÐ$Ø'C×'HÑ'HÈÐUcÐelÐnpÓ'qÐ$Ø+Ð+r"   c                 ó(  — |j                   dd \  }}|€É|j                   d   }|d   d   |dz
  k(  sJ d«       ‚t        j                  d|«      j                  d«      j                  d«      j	                  ||d«      j                  |j                  «      }||j                  d«      j	                  ||d«      z
  }t        | j                  | j                  |d«      }|j                  dd«      }| j                  |«      }	|	j                  |j                   d d | j                  | j                  fz   «      }	|	j                  ddddd«      }	|	j                  d| j                  «      }	|j                  d«      }|j	                  | j                   d| j                  d«      }|j                  d|j#                  d«      «      j%                  «       }t        j&                  |	d|¬	«      }
|
j                  || j                   | j                  |d«      }
|
S )
Nr   r&   rK   r   zb`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)Fr÷   r   r  )r   r   r  rL   rM   rý   r$   rI   r@   ré   rÈ   rë   rÇ   rª   rü   rÌ   r1   rN   r›   r  )r…   rr   rÖ   rO   rS   rÍ   r0   Úkey_sequence_lengthrB   r   r  s              r!   rû   z@ProphetNetNgramSelfAttention.get_predict_relative_pos_embeddings  s+  € ð '4×&9Ñ&9¸!¸AÐ&>Ñ#ˆ
Oà,Ð4Ø".×"4Ñ"4°RÑ"8ÐØ ‘? 1Ñ%Ð)<¸qÑ)@Ò@ð ØtóÐ@ô —‘˜QÐ 3Ó4ß‘˜1“ß‘˜1“ß‘˜
 O°QÓ7ß‘L×'Ñ'Ó(ð ð "4°l×6LÑ6LÈQÓ6O×6VÑ6VÐWaÐcrÐtuÓ6vÑ!vÐÜ0HØ× Ñ  $×"<Ñ"<Ð>PÐRWó1Ð-ð
 &×/Ñ/°°1Ó5ˆØ!×9Ñ9¸-ÓHÐð 0×4Ñ4Ø×Ñ  Ð$¨×(8Ñ(8¸$×:MÑ:MÐ'NÑNó
Ðð 0×7Ñ7¸¸1¸aÀÀAÓFÐà/×7Ñ7¸¸D×<LÑ<LÓMÐà,M×,WÑ,WÐXYÓ,ZÐ)Ø,M×,TÑ,TØJ‰J˜˜4×.Ñ.°ó-
Ð)ð -N×,RÑ,RØÐ1×6Ñ6°rÓ:ó-
ç
‰$‹&ð 	*ô +0¯,©,Ø AÐ-Nô+
Ð'ð
 +J×*NÑ*NØ˜Ÿ
™
 D×$7Ñ$7¸È"ó+
Ð'ð /Ð.r"   r   )NNNNNNN)rd   re   rf   r   r“   rò   rõ   r	   rŸ   rù   rû   r¦   r§   s   @r!   rç   rç     sU   ø„ ñ Ð/õ  ò<qòð )-ØØ(,Ø'+Ø*.ØØña@ð  ™óa@òF+,öZ9/r"   rç   c                   ó8   ‡ — e Zd ZdZdefˆ fd„Z	 ddefd„Zˆ xZS )ÚProphetNetEncoderLayerz&
    Encoder block for Prophetnet
    rz   c                 óö   •— t         ‰|   «        t        ||j                  «      | _        t        |j                  «      | _        t        ||j                  «      | _
        t        |j                  «      | _        y r   )r’   r“   r©   Únum_encoder_attention_headsÚ	self_attnr   r”   Úself_attn_layer_normrÝ   Úencoder_ffn_dimÚfeed_forwardÚfeed_forward_layer_normr•   s     €r!   r“   zProphetNetEncoderLayer.__init__B  s_   ø€ Ü‰ÑÔä,¨V°V×5WÑ5WÓXˆŒÜ$-¨f×.@Ñ.@Ó$AˆÔ!ô 2°&¸&×:PÑ:PÓQˆÔÜ'0°×1CÑ1CÓ'DˆÕ$r"   r¶   c                 ó¼   — | j                  |||¬«      \  }}| j                  ||z   «      }| j                  |«      }| j                  ||z   «      }|f}|r||fz  }|S )N)rr   r¡   r¶   )r'  r(  r*  r+  )r…   rr   r¡   r¶   Úattention_outputrÖ   Úfeed_forward_outputÚoutputss           r!   rŸ   zProphetNetEncoderLayer.forwardL  sƒ   € ð *.¯©Ø'Ø)Ø/ð *8ó *
Ñ&Ð˜,ð
 ×1Ñ1Ð2BÀ]Ñ2RÓSˆð #×/Ñ/°Ó>ÐØ×4Ñ4Ð5HÈ=Ñ5XÓYˆà Ð"ˆáØ˜Ñ&ˆGàˆr"   ©F)	rd   re   rf   rg   r   r“   rÛ   rŸ   r¦   r§   s   @r!   r$  r$  =  s+   ø„ ñðEÐ/õ Eð #(ñ	ð  ÷	r"   r$  c                   ó|   ‡ — e Zd ZdZd	defˆ fd„Z	 	 	 	 	 	 	 	 	 	 	 d
dedz  dedz  dej                  dz  fd„Z	ˆ xZ
S )ÚProphetNetDecoderLayerz&
    Decoder block for Prophetnet
    Nrz   c                 ój  •— t         ‰|   «        t        ||¬«      | _        t	        |j
                  «      | _        |j                  r7t        ||j                  |¬«      | _
        t	        |j
                  «      | _        t        ||j                  «      | _        t	        |j
                  «      | _        y )N©r«   )r’   r“   rç   r'  r   r”   r(  Úadd_cross_attentionr©   rê   Ú
cross_attnÚcross_attn_layer_normrÝ   Údecoder_ffn_dimr*  r+  rì   s      €r!   r“   zProphetNetDecoderLayer.__init__k  sŽ   ø€ Ü‰ÑÔä5°fÈ	ÔRˆŒÜ$-¨f×.@Ñ.@Ó$AˆÔ!ð ×%Ò%Ü1°&¸&×:\Ñ:\ÐhqÔrˆDŒOÜ)2°6×3EÑ3EÓ)FˆDÔ&ô 2°&¸&×:PÑ:PÓQˆÔÜ'0°×1CÑ1CÓ'DˆÕ$r"   Ú	use_cacher¶   r·   c           	      ó,  — | j                  ||	|||||¬«      \  }}}| j                  ||z   «      }d }|-| j                  ||||	|¬«      \  }}| j                  ||z   «      }| j	                  |«      }| j                  ||z   «      }|f}|r||||fz  }|S )N)rr   r[   r¡   rþ   rR   rS   rO   )rr   rµ   r¡   r[   r¶   )r'  r(  r6  r7  r*  r+  )r…   rr   r¡   rb   Úencoder_attn_maskrþ   rR   rS   rO   r[   r9  r¶   r·   Úngram_attention_outputÚself_attn_weightsÚself_attn_weights_ngramÚcross_attn_weightsr-  r.  r/  s                       r!   rŸ   zProphetNetDecoderLayer.forwardz  sî   € ð  NRÏ^É^Ø'Ø+Ø)Ø,KØ+IØ.OØ%ð N\ó N
ÑJÐÐ 1Ð3Jð ×1Ñ1°-ÐBXÑ2XÓYˆà!ÐØ Ð,à37·?±?Ø+Ø!6Ø0Ø /Ø"3ð 4Có 4Ñ0ÐÐ0ð !×6Ñ6Ð7GÈ-Ñ7WÓXˆMð #×/Ñ/°Ó>ÐØ×4Ñ4Ð5HÈ=Ñ5XÓYˆà Ð"ˆáØÐ)Ð+BÐDVÐWÑWˆGàˆr"   r   )NNNNNNNNTFN)rd   re   rf   rg   r   r“   rÛ   r   r   rŸ   r¦   r§   s   @r!   r2  r2  f  sn   ø„ ññEÐ/õ Eð$ Ø"ØØ(,Ø'+Ø*.ØØØ!%Ø).Ø.2ñ0ð ˜$‘;ð0ð   $™;ð0ð Ÿ™ tÑ+÷0r"   r2  z=
    The standalone encoder part of the ProphetNetModel.
    c                   óÒ   ‡ — e Zd Zdefˆ fd„Zd„ Zd„ Ze	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	e
dz  d
e
dz  de
dz  deez  fd„«       Zˆ xZS )ÚProphetNetEncoderrz   c                 ó®  •— t         ‰|   |«       t        j                  |j                  |j
                  |j                  ¬«      | _        t        |«      | _	        t        |j
                  «      | _        t        j                  t        |j                  «      D cg c]  }t        |«      ‘Œ c}«      | _        d| _        | j%                  «        y c c}w )N©r™   F)r’   r“   r   Ú	EmbeddingÚ
vocab_sizer”   r   Úword_embeddingsrŒ   Úposition_embeddingsr   Úembeddings_layer_normÚ
ModuleListr,   Únum_encoder_layersr$  rÄ   Úgradient_checkpointingÚ	post_init)r…   rz   Ú_r–   s      €r!   r“   zProphetNetEncoder.__init__³  s   ø€ Ü‰Ñ˜Ô ä!Ÿ|™|¨F×,=Ñ,=¸v×?QÑ?QÐ_e×_rÑ_rÔsˆÔÜ#AÀ&Ó#IˆÔ Ü%.¨v×/AÑ/AÓ%BˆÔ"ä—m‘mÌUÐSY×SlÑSlÓMmÖ$nÈÔ%;¸FÕ%CÒ$nÓoˆŒà&+ˆÔ#à‰Õùò	 %os   ÂCc                 ó   — | j                   S r   ©rF  rô   s    r!   Úget_input_embeddingsz&ProphetNetEncoder.get_input_embeddingsÀ  ó   € Ø×#Ñ#Ð#r"   c                 ó   — || _         y r   rO  ©r…   Úvalues     r!   Úset_input_embeddingsz&ProphetNetEncoder.set_input_embeddingsÃ  ó
   € Ø$ˆÕr"   Nr†   r¡   Úinputs_embedsr¶   Úoutput_hidden_statesÚreturn_dictr   c                 óü  — ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|€|€t	        d«      ‚||t	        d«      ‚||€| j                  |«      }||d|dd…dddd…f   j                  d| j                   j                  dd«      z
  t        j                  | j                  «      j                  z  }|j                  |j                  «      }nd}| j                  |j                  dd |j                  «      \  }	}
||	z   }| j!                  |«      }t"        j$                  j'                  || j                   j&                  | j(                  ¬«      }|rdnd}|rdnd}t+        | j,                  «      D ])  \  }}|r||fz   } ||||¬	«      }|d
   }|sŒ!||d   fz   }Œ+ |r||fz   }|st/        d„ |||fD «       «      S t1        |||¬«      S )a	  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetEncoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetEncoder.from_pretrained("patrickvonplaten/prophetnet-large-uncased-standalone")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```Nz3Either input_ids or inputs_embeds has to be passed.z2Make sure to only pass input_ids or inputs_embeds.ç      ð?r   r&   rº   rk   )r¡   r¶   r   c              3   ó&   K  — | ]	  }|€Œ|–— Œ y ­wr   rk   ©Ú.0Úvs     r!   ú	<genexpr>z,ProphetNetEncoder.forward.<locals>.<genexpr>  s   è ø€ Òl˜qÐ^_Ñ^kœÑlùó   ‚Š)rn   rr   rt   )rz   r¶   rX  Úuse_return_dictrË   rF  rM   r&  r   r(   r   r)   rý   rG  r   r$   rH  r   r   r®   r¼   Ú	enumeraterÄ   rj   r   )r…   r†   r¡   rW  r¶   rX  rY  ÚkwargsÚextended_attention_maskrG  rO   rr   rb   Úall_attentionsÚidxÚencoder_layerÚlayer_outputss                    r!   rŸ   zProphetNetEncoder.forwardÆ  sD  € ð4 2CÐ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ð$DÑ È$Ï+É+×JjÑJjð 	ð &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆàÐ Ð!6ÜÐRÓSÐSØÐ" }Ð'@ÜÐQÓRÐRØÐ" }Ð'<Ø ×0Ñ0°Ó;ˆMð Ð%àn¢Q¨¨d²AÐ%5Ñ6×=Ñ=¸aÀÇÁ×AhÑAhÐjkÐmnÓoÑoÜ—‘˜DŸJ™JÓ'×+Ñ+ñ',Ð#ð '>×&@Ñ&@À×ATÑATÓ&UÑ#à&*Ð#à,0×,DÑ,DÀ]×EXÑEXÐY[ÐZ[ÐE\Ð^k×^rÑ^rÓ,sÑ)Ð˜\à%Ð(;Ñ;ˆØ×2Ñ2°=ÓAˆÜŸ™×-Ñ-¨m¸t¿{¹{×?RÑ?RÐ]a×]jÑ]jÐ-Ókˆá&:¡ÀÐÙ0™°dˆä"+¨D¯K©KÓ"8ò 	FÑˆCÙ#Ø(=ÀÐ@PÑ(PÐ%á)ØØ6Ø"3ôˆMð *¨!Ñ,ˆMâ Ø!/°=ÀÑ3CÐ2EÑ!E‘ð	Fñ  Ø$9¸]Ð<LÑ$LÐ!áÜÑl ]Ð4IÈ>Ð$ZÔlÓlÐlÜØ+Ð;PÐ]kô
ð 	
r"   )NNNNNN)rd   re   rf   r   r“   rP  rU  r   r   r   rÛ   rj   r   rŸ   r¦   r§   s   @r!   rA  rA  ­  s»   ø„ ðÐ/õ ò$ò%ð ð *.Ø.2Ø-1Ø)-Ø,0Ø#'ñN
à—<‘< $Ñ&ðN
ð Ÿ™ tÑ+ðN
ð —|‘| dÑ*ð	N
ð
   $™;ðN
ð # T™kðN
ð ˜D‘[ðN
ð 
Ñ	 òN
ó ôN
r"   rA  z=
    The standalone decoder part of the ProphetNetModel.
    c                   ó\  ‡ — e Zd Zdefˆ fd„Zd„ Zd„ Ze	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e
dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deez  fd„«       Zd„ Zd„ Zd„ Zˆ xZS )ÚProphetNetDecoderrz   c           	      ó¼  •— t         ‰|   |«       |j                  | _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        t        j                  |j                  |j                  |j                  ¬«      | _        t        |«      | _        t        j                  | j                  |j                  d «      | _        t        j"                  t%        |j&                  «      D cg c]  }t)        ||¬«      ‘Œ c}«      | _        t-        |j                  «      | _        d| _        | j3                  «        y c c}w )NrC  r4  F)r’   r“   r1   r@   ré   r®   r   Úmax_target_positionsr   rD  rE  r”   r   rF  rŒ   rG  Úngram_embeddingsrI  r,   Únum_decoder_layersr2  rÄ   r   rH  rK  rL  )r…   rz   Úir–   s      €r!   r“   zProphetNetDecoder.__init__  s  ø€ Ü‰Ñ˜Ô à—\‘\ˆŒ
Ø!×-Ñ-ˆÔØ%+×%AÑ%AˆÔ"Ø—~‘~ˆŒØ$*×$BÑ$BˆÔ!ä!Ÿ|™|¨F×,=Ñ,=¸v×?QÑ?QÐ_e×_rÑ_rÔsˆÔÜ#AÀ&Ó#IˆÔ ä "§¡¨T¯Z©Z¸×9KÑ9KÈTÓ RˆÔÜ—m‘mÜBGÈ×HaÑHaÓBbÖc¸QÔ# F°aÖ8Òcó
ˆŒô &/¨v×/AÑ/AÓ%BˆÔ"à&+ˆÔ#à‰Õùò ds   ÄEc                 ó   — | j                   S r   rO  rô   s    r!   rP  z&ProphetNetDecoder.get_input_embeddings4  rQ  r"   c                 ó   — || _         y r   rO  rS  s     r!   rU  z&ProphetNetDecoder.set_input_embeddings7  rV  r"   Nr†   r¡   rb   Úencoder_attention_maskr[   rW  r9  r¶   rX  rY  r·   r   c                 ó

  — ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
|€|€t        d«      ‚||t        d«      ‚||€| j                  |«      }|j                  dd \  }}| j                  r%| j                  r|rt        j                  d«       d}|rd|€b|€| j                   j                  r4t        t        | j                   ¬«      t        | j                   ¬«      «      nt        | j                   ¬«      }||j                  «       nd}| j!                  ||f|j"                  |¬	«      \  }}|dk7  rd
\  }}n| j%                  |«      \  }}| j                   j'                  |dz   «      }||z   }| j(                  j*                  }|dk7  r\|j-                  d«      dk(  sJ d«       ‚t/        | j0                  «      D cg c]  }||dz
     |z   j3                  |dd«      ‘Œ  }}d}d}nOt/        | j0                  «      D cg c]  }||dz
     |z   ‘Œ }}| j5                  ||«      }| j7                  ||«      }||d|dd…dddd…f   j3                  d| j                   j8                  dd«      z
  t;        j<                  | j>                  «      j@                  z  }|jC                  |j>                  «      }nd}t;        jD                  |g|z   d«      }| jF                  r| jG                  |«      }tH        jJ                  jM                  || jL                  | j                  ¬«      }|	rdnd}|	r| j                   j0                  dkD  rdnd}|rdnd}|rdnd}|r| j                   jN                  rdnd} tQ        | jR                  «      D ]Œ  \  }!}"|	r7||dd…d|…f   fz  }| j                   j0                  dkD  r||dd…|d…f   fz  } |"||||||||||||¬«      }#|#d   }|sŒ[||#d   fz  }||#d   fz  }| j                   jN                  sŒ„| |#d   fz  } ŒŽ |	r7||dd…d|…f   fz  }| j                   j0                  dkD  r||dd…|d…f   fz  }|dd…d|…f   }$| j                   j0                  dkD  r|dd…|d…f   nd}%|
stU        d„ |$|%|||||| fD «       «      S tW        |$|%|||||| ¬«      S c c}w c c}w )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetDecoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetDecoder.from_pretrained("microsoft/prophetnet-large-uncased", add_cross_attention=False)
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```NzGEither `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.zFMake sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.r&   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rz   r   )r$   r[   )NNr   zOAt the moment `use_cache` is only supported for `decoder_input_ids` of length 1r[  rº   rk   )	r;  rþ   rR   rS   rO   r[   r9  r¶   r·   r   c              3   ó$   K  — | ]  }||–— Œ
 y ­wr   rk   r]  s     r!   r`  z,ProphetNetDecoder.forward.<locals>.<genexpr>ß  s   è ø€ ò àð =ô ñùs   ‚)rn   ro   r[   rr   rs   rt   ru   r`   ),rz   r9  r¶   rX  rb  rË   rF  r   rK  r¼   ÚloggerÚwarning_onceÚis_encoder_decoderr   r
   rš   rG  r$   Ú!compute_buffered_relative_bucketsr¥   rn  ÚweightrN   r,   r1   rM   Úprepare_attention_maskÚprepare_predict_attention_maskrê   r   r(   r   r)   rý   r/   rH  r   r   r®   r5  rc  rÄ   rj   rq   )&r…   r†   r¡   rb   rs  r[   rW  r9  r¶   rX  rY  r·   rd  rÍ   r0   Úpast_key_values_lengthÚmain_stream_pos_embedrO   rR   rS   Úpredicting_stream_pos_embedrr   rn  r1   Úngram_hidden_statesre  rþ   Úextended_encoder_attention_maskÚall_main_stream_hidden_statesÚall_ngram_stream_hidden_statesÚall_main_stream_attnsÚall_ngram_stream_attnsÚall_cross_attnsrg  Údecoder_layerri  rn   ro   s&                                         r!   rŸ   zProphetNetDecoder.forward:  sð  € ð< "+Ð!6‘I¸D¿K¹K×<QÑ<Qˆ	Ø1BÐ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ð$DÑ È$Ï+É+×JjÑJjð 	ð &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆàÐ Ð!6ÜÐfÓgÐgØÐ" }Ð'@ÜÐeÓfÐfØÐ" }Ð'<Ø ×0Ñ0°Ó;ˆMà&3×&9Ñ&9¸"¸1Ð&=Ñ#ˆ
Oà×&Ò&¨4¯=ª=ÙÜ×#Ñ#Øpôð "	á˜Ð0ð )Ð4¸¿¹×8VÒ8Vô $¤L¸¿¹Ô$DÄlÐZ^×ZeÑZeÔFfÔgä!¨¯©Ô5ð ð FUÐE` ×!?Ñ!?Ô!AÐfgÐà.2×.FÑ.FØ˜Ð)Ø ×'Ñ'Ø+ð /Gó /
Ñ+Ð˜|ð " QÒ&ØPZÑMÐ*Ñ,Mð
 ×6Ñ6°|ÓDñØ.Ø1à&*×&>Ñ&>×&GÑ&GÈÐWXÑHXÓ&YÐ#ð &Ð(=Ñ=ˆà×0Ñ0×7Ñ7Ðð " QÒ&Ø ×%Ñ% aÓ(¨AÒ-ð ØaóÐ-ô # 4§:¡:Ó.ö#àð " %¨!¡)Ñ,Ð/JÑJ×RÑRÐS]Ð_`ÐbcÕdð#Ðð #ð '+Ð#Ø.2Ñ+ô Z_Ð_c×_iÑ_iÓYjö#ØPUÐ! %¨!¡)Ñ,Ð/JÓJð#Ðð #ð '+×&AÑ&AÀ-ÐQ_Ó&`Ð#Ø.2×.QÑ.QÐR_ÐaoÓ.pÐ+ð "Ð-àÐ,ªQ°°dºAÐ-=Ñ>×EÑEÀaÈÏÉ×IpÑIpÐrsÐuvÓwÑwÜ—‘˜DŸJ™JÓ'×+Ñ+ñ/,Ð+ð /N×.PÑ.PÐQ^×QdÑQdÓ.eÑ+à.2Ð+äŸ	™	 = /Ð4GÑ"GÈÓKˆà×%Ò%Ø ×6Ñ6°}ÓEˆMäŸ™×-Ñ-¨m¸t¿|¹|ÐVZ×VcÑVcÐ-Ódˆñ /C©ÈÐ%Ù/CÈÏÉ×HYÑHYÐ\]ÒH]©ÐcgÐ&á&7¡¸TÐÙ'8¡¸dÐÙ 1°d·k±k×6UÒ6U™"Ð[_ˆä"+¨D¯K©KÓ"8ò 	;ÑˆCÙ#à-°-ÂÐCSÀOÐCSÐ@SÑ2TÐ1VÑVÐ-Ø—;‘;×$Ñ$ qÒ(Ø2°}ÂQÈÑHXÐEXÑ7YÐ6[Ñ[Ð2á)ØØ'Ø%Ø"AØ0OØ/MØ2SØ)Ø /Ø#Ø"3Ø-ôˆMð *¨!Ñ,ˆMÚ Ø%¨-¸Ñ*:Ð)<Ñ<Ð%Ø&¨=¸Ñ+;Ð*=Ñ=Ð&à—;‘;×2Ó2Ø#¨°aÑ(8Ð':Ñ:‘Oð9	;ñ<  Ø)¨mºAÐ?OÀÐ?OÐ<OÑ.PÐ-RÑRÐ)Ø{‰{× Ñ  1Ò$Ø.°=ÂÀOÑDTÐATÑ3UÐ2WÑWÐ.ð *ª!Ð-=¨oÐ-=Ð*=Ñ>ÐØHLÏÉ×HYÑHYÐ\]ÒH] -²°?Ñ3CÐ0CÒ"DÐcgÐáÜñ ð &Ø+Ø#Ø1Ø2Ø)Ø*Ø#ð	ôó ð ô ,Ø/Ø$;Ø+Ø7Ø >Ø,Ø3Ø,ô	
ð 		
ùòw#ùò#s   È#S;ÉT c           	      óð  — |j                   \  }}t        j                  d| j                  «      j	                  |j
                  «      j                  dd«      }t        | j                  | j                  |«      \  }}|d d …d |…d |…f   j                  |dd«      }t        j                  |d d …d |…d |…f   |d d …d |…| j                  | j                  |z   …f   gd«      j                  |dd«      }||fS rî   )r   r   r  rm  rý   r$   rM   rT   r@   ré   r/   )r…   rO   rÍ   r0   Úmain_relative_bucketsÚpredict_relative_bucketss         r!   ry  z3ProphetNetDecoder.compute_buffered_relative_bucketsø  s!  € Ø&2×&8Ñ&8Ñ#ˆ
Oä—|‘| A t×'@Ñ'@ÓA×DÑDÀ\×EXÑEXÓY×`Ñ`ÐabÐdeÓfˆÜ:]Ø×Ñ˜d×8Ñ8¸,ó;
Ñ7ÐÐ7ð
 !6²aÐ9I¸/Ð9IÐK[ÈOÐK[Ð6[Ñ \× cÑ cÐdnÐpqÐstÓ uÐÜ#(§9¡9à(ªÐ,<¨_Ð,<Ð>N¸Ð>NÐ)NÑOØ(ÚÐ'˜Ð'¨×)BÑ)BÀT×E^ÑE^ÐapÑEpÐ)pÐpñðð ó$
÷ ‰&˜Q Ó
"ð 	!ð %Ð&>Ð>Ð>r"   c                 óL  — |j                   d d \  }}t        j                  ||ft        j                  |j                  «      j
                  |j                  |j                  ¬«      }t        j                  |d«      }|d |…d |…f   d d d d …d d …f   j                  || j                  j                  f|j                   z   «      }|@d|d d …d d d d …f   z
  t        j                  | j                  «      j
                  z  }||z   }n|}|j                  |j                  «      S )Nr&   r˜   r   r[  )r   r   Úfullr(   r   r)   r$   ÚtriuÚexpandrz   rê   rý   )r…   rr   r¡   rÍ   Ú
seq_lengthÚcausal_maskÚextended_causal_maskre  s           r!   r{  z(ProphetNetDecoder.prepare_attention_mask  s%  € Ø!.×!4Ñ!4°R°aÐ!8Ñˆ
Jô —j‘jØ˜Ð$ÜK‰K˜×+Ñ+Ó,×0Ñ0Ø×%Ñ%Ø ×'Ñ'ô	
ˆô —j‘j ¨aÓ0ˆà*¨;¨J¨;¸¸¸Ð+CÑDÀTÈ4ÒQRÒTUÐEUÑV×]Ñ]Ø˜Ÿ™×@Ñ@ÐAÀK×DUÑDUÑUó 
Ðð
 Ð%Ø'*¨^ºA¸tÀTÊ1Ð<LÑ-MÑ'MÔQV×Q\ÑQ\Ð]a×]gÑ]gÓQh×QlÑQlÑ&lÐ#Ø&:Ð=TÑ&TÑ#à&:Ð#Ø&×)Ñ)¨-×*=Ñ*=Ó>Ð>r"   c           	      ó&  — |j                   d d \  }}t        | j                  | j                  |j                  |j
                  «      }t        j                  |d d …d |…d |…f   |d d …d |…| j                  | j                  |z   …f   gd¬«      }|d d d d …d d …d d …f   j                  || j                  j                  f|j                   z   «      }|¡d|d d …d d d d d …f   z
  t        j                  | j
                  «      j                  z  }|j                  || j                  j                  | j                  ||f«      }t        j                  |t        j                  |«      gd¬«      }||z   }n|}|j                  |j
                  «      S )Nr&   rK   r   r[  )r   r5   rm  r1   r$   r   r   r/   rŽ  rz   rê   r(   r)   r8   rý   )	r…   rr   r¡   rÍ   r  Úpredict_causal_maskÚextended_predict_causal_maskre  rþ   s	            r!   r|  z0ProphetNetDecoder.prepare_predict_attention_mask&  sª  € Ø!.×!4Ñ!4°R°aÐ!8Ñˆ
Jô 3Ø×%Ñ% t§z¡z°=×3GÑ3GÈ×I\ÑI\ó
Ðô $Ÿi™ià#¢A {¨
 {°K°Z°KÐ$?Ñ@Ø#Ú{˜
{ D×$=Ñ$=À×@YÑ@YÐ\fÑ@fÐ$fÐfñðð ô
Ðð (;¸4ÀÂqÊ!ÊQÐ;NÑ'O×'VÑ'VØ˜Ÿ™×@Ñ@ÐAÐDW×D]ÑD]Ñ]ó(
Ð$ð
 Ð%Ø'*¨^ºA¸tÀTÈ4ÒQRÐ<RÑ-SÑ'SÔW\×WbÑWbÐcg×cmÑcmÓWn×WrÑWrÑ&rÐ#Ø&=×&DÑ&DØ˜TŸ[™[×DÑDÀdÇjÁjÐR\Ð^hÐió'Ð#ô ',§i¡iØ(¬%×*:Ñ*:Ð;RÓ*SÐTÐZ\ô'Ð#ð /KÐMdÑ.dÑ+à.JÐ+Ø.×1Ñ1°-×2EÑ2EÓFÐFr"   ©NNNNNNNNNNN)rd   re   rf   r   r“   rP  rU  r   r   r   r	   rÛ   rj   rq   rŸ   ry  r{  r|  r¦   r§   s   @r!   rk  rk    s6  ø„ ðÐ/õ ò,$ò%ð ð *.Ø.2Ø59Ø6:Ø(,Ø-1Ø!%Ø)-Ø,0Ø#'Ø.2ñ{
à—<‘< $Ñ&ð{
ð Ÿ™ tÑ+ð{
ð  %Ÿ|™|¨dÑ2ð	{
ð
 !&§¡¨tÑ 3ð{
ð  ™ð{
ð —|‘| dÑ*ð{
ð ˜$‘;ð{
ð   $™;ð{
ð # T™kð{
ð ˜D‘[ð{
ð Ÿ™ tÑ+ð{
ð 
Ð-Ñ	-ò{
ó ð{
òz?ò,?ö0!Gr"   rk  c                   ó€  ‡ — e Zd ZdddœZdefˆ fd„Zd„ Zd„ Ze	 	 	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  dedz  dedz  de	j                  dz  de	j                  dz  dedz  dedz  dedz  dedz  de	j                  dz  deez  fd„«       Zˆ xZS )ÚProphetNetModelúword_embeddings.weight)zencoder.word_embeddings.weightúdecoder.word_embeddings.weightrz   c                 ój  •— t         ‰|   |«       t        j                  |j                  |j
                  |j                  ¬«      | _        t        j                  |«      }d|_
        t        |«      | _        t        j                  |«      }d|_        t        |«      | _        | j!                  «        y )NrC  FT)r’   r“   r   rD  rE  r”   r   rF  ÚcopyÚdeepcopyr9  rA  ÚencoderÚ
is_decoderrk  ÚdecoderrL  )r…   rz   Úencoder_configÚdecoder_configr–   s       €r!   r“   zProphetNetModel.__init__Q  sˆ   ø€ Ü‰Ñ˜Ô Ü!Ÿ|™|¨F×,=Ñ,=¸v×?QÑ?QÐ_e×_rÑ_rÔsˆÔäŸ™ vÓ.ˆØ#(ˆÔ Ü(¨Ó8ˆŒäŸ™ vÓ.ˆØ$(ˆÔ!Ü(¨Ó8ˆŒð 	‰Õr"   c                 ó   — | j                   S r   rO  rô   s    r!   rP  z$ProphetNetModel.get_input_embeddings`  rQ  r"   c                 ó~   — || _         | j                   | j                  _         | j                   | j                  _         y r   )rF  r  rŸ  rS  s     r!   rU  z$ProphetNetModel.set_input_embeddingsc  s.   € Ø$ˆÔØ'+×';Ñ';ˆ‰Ô$Ø'+×';Ñ';ˆ‰Õ$r"   Nr†   r¡   Údecoder_input_idsÚdecoder_attention_maskÚencoder_outputsr[   rW  Údecoder_inputs_embedsr9  r¶   rX  rY  r·   r   c                 óX  — |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|€| j                  ||||
||¬«      }| j                  |||d   ||||
||	||¬«      }|s||z   S t        |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  ¬«      S )añ  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetModel

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetModel.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
        >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
        ```)r†   r¡   rW  r¶   rX  rY  r   )r†   r¡   rb   rs  r[   rW  r¶   rX  r9  rY  r·   )rn   ro   r[   r\   r]   r^   r_   r`   ra   rb   rc   )rz   r9  r¶   rX  rb  r  rŸ  rm   rn   ro   r[   rr   rs   rt   ru   r`   )r…   r†   r¡   r¤  r¥  r¦  r[   rW  r§  r9  r¶   rX  rY  r·   rd  Údecoder_outputss                   r!   rŸ   zProphetNetModel.forwardh  sN  € ðf "+Ð!6‘I¸D¿K¹K×<QÑ<Qˆ	Ø1BÐ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ð$DÑ È$Ï+É+×JjÑJjð 	ð &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆàÐ"Ø"Ÿl™lØ#Ø-Ø+Ø"3Ø%9Ø'ð +ó ˆOð Ÿ,™,Ø'Ø1Ø"1°!Ñ"4Ø#1Ø+Ø/Ø/Ø!5ØØ#Ø)ð 'ó 
ˆñ Ø" _Ñ4Ð4Ü+Ø-×?Ñ?Ø$3×$KÑ$KØ+×;Ñ;Ø"1×"?Ñ"?Ø(7×(KÑ(KØ.×9Ñ9Ø%4×%EÑ%EØ,×=Ñ=Ø&5×&GÑ&GØ"1×"?Ñ"?Ø.×9Ñ9ô
ð 	
r"   )NNNNNNNNNNNNN)rd   re   rf   Ú_tied_weights_keysr   r“   rP  rU  r   r   r   Ú
BoolTensorrj   r	   rÛ   rm   rŸ   r¦   r§   s   @r!   r—  r—  J  sa  ø„ ð +CØ*BñÐð
Ð/õ ò$ò<ð
 ð *.Ø.2Ø15Ø:>Ø(,Ø(,Ø-1Ø59Ø!%Ø)-Ø,0Ø#'Ø.2ñ`
à—<‘< $Ñ&ð`
ð Ÿ™ tÑ+ð`
ð !Ÿ<™<¨$Ñ.ð	`
ð
 !&× 0Ñ 0°4Ñ 7ð`
ð  ™ð`
ð  ™ð`
ð —|‘| dÑ*ð`
ð  %Ÿ|™|¨dÑ2ð`
ð ˜$‘;ð`
ð   $™;ð`
ð # T™kð`
ð ˜D‘[ð`
ð Ÿ™ tÑ+ð`
ð  
Ð-Ñ	-ò!`
ó ô`
r"   r—  zh
    The ProphetNet Model with a language modeling head. Can be used for sequence generation tasks.
    c            !       óà  ‡ — e Zd ZddiZdefˆ fd„Zd„ Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deez  fd„«       Zdd„Zdej                  fd„Zdˆ fd„	Zˆ xZS )Ú"ProphetNetForConditionalGenerationúlm_head.weightú!prophetnet.word_embeddings.weightrz   c                 ó
  •— t         ‰|   |«       t        |«      | _        |j                  | _        |j                  | _        t        j                  |j                  |j                  d¬«      | _        | j                  «        y )NF©Úbias)r’   r“   r—  r{   r   r™   Údisable_ngram_lossr   r°   r”   rE  Úlm_headrL  r•   s     €r!   r“   z+ProphetNetForConditionalGeneration.__init__Ö  sd   ø€ Ü‰Ñ˜Ô Ü)¨&Ó1ˆŒØ!×.Ñ.ˆÔØ"(×";Ñ";ˆÔä—y‘y ×!3Ñ!3°V×5FÑ5FÈUÔSˆŒð 	‰Õr"   c                 ó.   — | j                   j                  S r   )r{   rF  rô   s    r!   rP  z7ProphetNetForConditionalGeneration.get_input_embeddingsá  s   € Ø‰×.Ñ.Ð.r"   Nr†   r¡   r¤  r¥  r¦  r[   rW  r§  Úlabelsr9  r¶   rX  rY  r·   r   c                 ó\  — ||n| j                   j                  }|	|€|€| j                  |	«      }| j                  |||||||||
||||¬«      }||j                  n|j                  dd \  }}|d   j                  || j                   j                  |d«      }| j                  |«      }|dd…df   }| j                   j                  dkD  r|dd…dd…f   nd}|j                  «       s|j                  «       }d}|	| j                  ||	«      }|s*t        d„ ||fD «       «      }||f|z   |dd z   S ||dd z   S t        ||||j                  |j                  |j                  |j                   |j"                  |j$                  |j&                  |j(                  |j*                  ¬«      S )	a…  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> logits_next_token = outputs.logits  # logits to predict next token as usual
        >>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
        ```N)r†   r¡   r¤  r¥  r¦  r[   rW  r§  r9  r¶   rX  rY  r·   r&   r   rK   r   c              3   ó&   K  — | ]	  }|€Œ|–— Œ y ­wr   rk   r]  s     r!   r`  z=ProphetNetForConditionalGeneration.forward.<locals>.<genexpr>D  ó   è ø€ ÒR QÀAÁMœqÑRùra  )rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   )rz   rb  rˆ   r{   r   rÇ   r1   r´  Úis_contiguousrï   Ú_compute_lossrj   rW   r[   r\   r]   r^   r_   r`   ra   rb   rc   )r…   r†   r¡   r¤  r¥  r¦  r[   rW  r§  r¶  r9  r¶   rX  rY  r·   rd  r/  rÍ   r0   Úpredicting_streamsÚpredict_logitsrY   rZ   rX   Ú
all_logitss                            r!   rŸ   z*ProphetNetForConditionalGeneration.forwardä  só  € ðp &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆàÐÐ"3Ð";Ð@UÐ@]à $× 1Ñ 1°&Ó 9Ðà—/‘/ØØ)Ø/Ø#9Ø+Ø+Ø'Ø"7ØØ/Ø!5Ø#Ø)ð "ó 
ˆð  (9Ð'DÐ×#Ò#ÐJ_×JeÑJeÐfhÐghÐJiñ 	$ˆ
Oð % Q™ZŸ_™_¨Z¸¿¹×9JÑ9JÈOÐ]_Ó`ÐØŸ™Ð&8Ó9ˆà¢ 1 Ñ%ˆØ04·±×0AÑ0AÀAÒ0E~¢a¨© eÒ,È4ˆð ×#Ñ#Ô%Ø×&Ñ&Ó(ˆFàˆØÐØ×%Ñ% n°fÓ=ˆDáÜÑR¨6°<Ð*@ÔRÓRˆJØ9=Ð9ID7˜ZÑ'¨'°!°"¨+Ñ5ÐgÈzÐ\cÐdeÐdfÐ\gÑOgÐgä,ØØØ)Ø '× 7Ñ 7Ø&-×&CÑ&CØ,3×,OÑ,OØ#*×#=Ñ#=Ø)0×)IÑ)IØ!(×!9Ñ!9Ø*1×*KÑ*KØ&-×&CÑ&CØ#*×#=Ñ#=ôð r"   c                 óÌ  — |j                  | j                  j                  |j                  d«      |j                  d«      «      j	                  |«      }t        | j                  j                  «      D ]!  }|dkD  r| j                  r n|||d d …d d …f<   Œ# |j                  dd«      j                  «       }t        j                  j                  |j                  d|j                  d«      «      dt        j                  ¬«      }t        j                  j                  ||j                  d«      d¬«      }| j                  j                   dkD  r“|j#                  dd¬	«       }|j%                  |«      j                  d«      }	||	   }|j'                  «       }| j                  j                   |j                  d«      z  }
d
| j                  j                   z
  |z  |
|z  z   }|S ©Nr   r   rK   r   Úmean)Ú	reductiong        T)r   Úkeepdimr[  ©r€   rz   r1   rN   Úfill_r,   r³  rÈ   rï   r   r   Úlog_softmaxrÇ   r   r   Únll_lossÚepsÚsumÚnerÁ  ©r…   rY   r¶  Úignore_indexÚexpend_targetsrp  ÚlprobsrX   Úsmooth_lossÚnon_masked_tokensÚeps_is              r!   r»  z0ProphetNetForConditionalGeneration._compute_lossV  ó–  € Ø×)Ñ)¨$¯+©+×*;Ñ*;¸V¿[¹[È»^ÈVÏ[É[ÐYZË^Ó\×bÑbÐcoÓpˆät—{‘{×(Ñ(Ó)ò 	-ˆAØ1Šu˜×0Ò0ÙØ&,ˆN˜1ša¢˜7Ò#ð	-ð
 ×!Ñ! ! QÓ'×2Ñ2Ó4ˆÜ—‘×*Ñ*ØK‰K˜˜FŸK™K¨›OÓ,ØÜ—-‘-ð +ó 
ˆô }‰}×%Ñ% f¨n×.AÑ.AÀ"Ó.EÐQWÐ%ÓXˆà;‰;?‰?˜SÒ Ø!Ÿ:™:¨"°d˜:Ó;Ð;ˆKØ .× 1Ñ 1°,Ó ?× DÑ DÀRÓ HÐØ%Ð&7Ñ8ˆKØ%×*Ñ*Ó,ˆKà—K‘K—O‘O f§k¡k°"£oÑ5ˆEØ˜$Ÿ+™+Ÿ/™/Ñ)¨TÑ1°E¸KÑ4GÑGˆDàˆr"   c                 ó$   — | j                  |«      S r   )rˆ   )r…   r¶  s     r!   Ú%prepare_decoder_input_ids_from_labelszHProphetNetForConditionalGeneration.prepare_decoder_input_ids_from_labelsr  s   € Ø× Ñ  Ó(Ð(r"   c                 óT   •— |€| j                   j                  S t        ‰|   |¬«      S )N)Úmodality)r{   r  r’   Úget_encoder)r…   rÖ  r–   s     €r!   r×  z.ProphetNetForConditionalGeneration.get_encoderu  s,   ø€ ØÐØ—?‘?×*Ñ*Ð*ä‘7Ñ&°Ð&Ó9Ð9r"   )NNNNNNNNNNNNNN©r}   r   )rd   re   rf   rª  r   r“   rP  r   r   r   r«  r	   rÛ   rj   rW   rŸ   r»  rÔ  r×  r¦   r§   s   @r!   r­  r­  Ì  s—  ø„ ð 	Ð=ðÐð	Ð/õ 	ò/ð ð *.Ø.2Ø15Ø:>Ø/3Ø(,Ø-1Ø59Ø&*Ø!%Ø)-Ø,0Ø#'Ø.2ñoà—<‘< $Ñ&ðoð Ÿ™ tÑ+ðoð !Ÿ<™<¨$Ñ.ð	oð
 !&× 0Ñ 0°4Ñ 7ðoð Ÿ™¨Ñ,ðoð  ™ðoð —|‘| dÑ*ðoð  %Ÿ|™|¨dÑ2ðoð —‘˜tÑ#ðoð ˜$‘;ðoð   $™;ðoð # T™kðoð ˜D‘[ðoð Ÿ™ tÑ+ðoð" 
Ð*Ñ	*ò#oó ðoóbð8)¸E¿L¹Ló )÷:ñ :r"   r­  zt
    The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal
    c                   óp  ‡ — e Zd ZdddœZdefˆ fd„Zd„ Zd„ Ze	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  dedz  dedz  dedz  deez  fd„«       Zdd„Z	 	 	 	 dˆ fd„	Zˆ xZS )ÚProphetNetForCausalLMr¯  )r®  z)prophetnet.decoder.word_embeddings.weightrz   c                 óP  •— t        j                  |«      }d|_        d|_        t        ‰|   |«       t        |«      | _        |j                  | _	        |j                  | _
        t        j                  |j                  |j                  d¬«      | _        | j!                  «        y )NTFr±  )r›  rœ  rž  rx  r’   r“   ÚProphetNetDecoderWrapperr{   r   r™   r³  r   r°   r”   rE  r´  rL  r•   s     €r!   r“   zProphetNetForCausalLM.__init__‡  s‚   ø€ ä—‘˜vÓ&ˆØ ˆÔØ$)ˆÔ!Ü‰Ñ˜Ô Ü2°6Ó:ˆŒà!×.Ñ.ˆÔØ"(×";Ñ";ˆÔä—y‘y ×!3Ñ!3°V×5FÑ5FÈUÔSˆŒð 	‰Õr"   c                 óB   — | j                   j                  j                  S r   ©r{   rŸ  rF  rô   s    r!   rP  z*ProphetNetForCausalLM.get_input_embeddings—  s   € Ø‰×&Ñ&×6Ñ6Ð6r"   c                 ó:   — || j                   j                  _        y r   rÞ  rS  s     r!   rU  z*ProphetNetForCausalLM.set_input_embeddingsš  s   € Ø27ˆ‰×ÑÕ/r"   Nr†   r¡   rb   rs  r[   rW  r¶  r9  r¶   rX  rY  r   c                 óº  — ||n| j                   j                  }| j                  j                  ||||||||	|
|¬«
      }||j                  n|j                  dd \  }}|d   j                  || j                   j                  |d«      }| j                  |«      }|dd…df   }| j                   j                  dkD  r|dd…dd…f   nd}d}|| j                  ||«      }|s*t        d„ ||fD «       «      }||f|z   |dd z   S ||dd z   S t        ||||j                  |j                  |j                  |j                  |j                  |j                   ¬«	      S )	aª  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForCausalLM
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForCausalLM.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits

        >>> # Model can also be used with EncoderDecoder framework
        >>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
        >>> import torch

        >>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
        >>> tokenizer_dec = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        ...     "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased"
        ... )

        >>> ARTICLE = (
        ...     "the us state department said wednesday it had received no "
        ...     "formal word from bolivia that it was expelling the us ambassador there "
        ...     "but said the charges made against him are `` baseless ."
        ... )
        >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
        >>> labels = tokenizer_dec(
        ...     "us rejects charges against its ambassador in bolivia", return_tensors="pt"
        ... ).input_ids
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])

        >>> loss = outputs.loss
        ```N)
r†   r¡   rb   rs  r[   rW  r9  r¶   rX  rY  r&   r   rK   r   c              3   ó&   K  — | ]	  }|€Œ|–— Œ y ­wr   rk   r]  s     r!   r`  z0ProphetNetForCausalLM.forward.<locals>.<genexpr>õ  r¹  ra  )	rX   rY   rZ   r[   rr   rs   rt   ru   r`   )rz   rb  r{   rŸ  r   rÇ   r1   r´  r»  rj   rw   r[   rr   rs   rt   ru   r`   )r…   r†   r¡   rb   rs  r[   rW  r¶  r9  r¶   rX  rY  rd  r/  rÍ   r0   r¼  r½  rY   rZ   rX   r¾  s                         r!   rŸ   zProphetNetForCausalLM.forward  s’  € ðv &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆð —/‘/×)Ñ)ØØ)Ø"7Ø#9Ø+Ø'ØØ/Ø!5Ø#ð *ó 
ˆð :CÐ9N i§o¢oÐTa×TgÑTgÐhjÐijÐTkÑ#ˆ
Oà$ Q™ZŸ_™_¨Z¸¿¹×9JÑ9JÈOÐ]_Ó`ÐØŸ™Ð&8Ó9ˆà¢ 1 Ñ%ˆØ04·±×0AÑ0AÀAÒ0E~¢a¨© eÒ,È4ˆàˆØÐØ×%Ñ% n°fÓ=ˆDáÜÑR¨6°<Ð*@ÔRÓRˆJØ9=Ð9ID7˜ZÑ'¨'°!°"¨+Ñ5ÐgÈzÐ\cÐdeÐdfÐ\gÑOgÐgä,ØØØ)Ø '× 7Ñ 7Ø%×3Ñ3Ø$+×$?Ñ$?Ø"×-Ñ-Ø!(×!9Ñ!9Ø!(×!9Ñ!9ô
ð 
r"   c                 óÌ  — |j                  | j                  j                  |j                  d«      |j                  d«      «      j	                  |«      }t        | j                  j                  «      D ]!  }|dkD  r| j                  r n|||d d …d d …f<   Œ# |j                  dd«      j                  «       }t        j                  j                  |j                  d|j                  d«      «      dt        j                  ¬«      }t        j                  j                  ||j                  d«      d¬«      }| j                  j                   dkD  r“|j#                  dd¬	«       }|j%                  |«      j                  d«      }	||	   }|j'                  «       }| j                  j                   |j                  d«      z  }
d
| j                  j                   z
  |z  |
|z  z   }|S rÀ  rÄ  rË  s              r!   r»  z#ProphetNetForCausalLM._compute_loss  rÒ  r"   c                 óV   •— t        ‰|   |f||||dœ|¤Ž}|j                  dd «       |S )N)r[   r¡   r9  Úis_first_iterationr·   )r’   Úprepare_inputs_for_generationÚpop)	r…   r†   r[   r¡   r9  rä  rd  Úmodel_inputsr–   s	           €r!   rå  z3ProphetNetForCausalLM.prepare_inputs_for_generation   sK   ø€ ô ‘wÑ<Øð
à+Ø)ØØ1ñ
ð ñ
ˆð 	×ÑÐ)¨4Ô0àÐr"   r•  rØ  )NNNF)rd   re   rf   rª  r   r“   rP  rU  r   r   r   r	   rÛ   rj   rw   rŸ   r»  rå  r¦   r§   s   @r!   rÚ  rÚ  |  sS  ø„ ð >Ø5XñÐð
Ð/õ ò 7ò8ð ð *.Ø.2Ø59Ø6:Ø(,Ø-1Ø&*Ø!%Ø)-Ø,0Ø#'ñdà—<‘< $Ñ&ðdð Ÿ™ tÑ+ðdð  %Ÿ|™|¨dÑ2ð	dð
 !&§¡¨tÑ 3ðdð  ™ðdð —|‘| dÑ*ðdð —‘˜tÑ#ðdð ˜$‘;ðdð   $™;ðdð # T™kðdð ˜D‘[ðdð 
Ð*Ñ	*òdó ðdóLð> ØØØ ÷ñ r"   rÚ  c                   ó6   ‡ — e Zd ZdZddiZdefˆ fd„Zd„ Zˆ xZS )rÜ  z„
    This is a wrapper class, so that [`ProphetNetForCausalLM`] can correctly be loaded from pretrained prophetnet
    classes.
    r™  r˜  rz   c                 óÚ   •— t         ‰|   |«       t        j                  |j                  |j
                  |j                  ¬«      | _        t        |«      | _	        | j                  «        y )NrC  )r’   r“   r   rD  rE  r”   r   rF  rk  rŸ  rL  r•   s     €r!   r“   z!ProphetNetDecoderWrapper.__init__C  sP   ø€ Ü‰Ñ˜Ô ä!Ÿ|™|¨F×,=Ñ,=¸v×?QÑ?QÐ_e×_rÑ_rÔsˆÔÜ(¨Ó0ˆŒð 	‰Õr"   c                 ó&   —  | j                   |i |¤ŽS r   )rŸ  )r…   Úargsrd  s      r!   rŸ   z ProphetNetDecoderWrapper.forwardL  s   € Øˆt|‰|˜TÐ, VÑ,Ð,r"   )	rd   re   rf   rg   rª  r   r“   rŸ   r¦   r§   s   @r!   rÜ  rÜ  9  s*   ø„ ñð 	)Ð*BðÐðÐ/õ ö-r"   rÜ  )rk  rA  rÚ  r­  r—  ry   r0  )9rg   r›  r=   Údataclassesr   r   r   r   Útorch.nnr   Úactivationsr   Úcache_utilsr	   r
   r   Ú
generationr   Úmodeling_layersr   Úmodeling_outputsr   Úmodeling_utilsr   Úutilsr   r   r   Úconfiguration_prophetnetr   Ú
get_loggerrd   rv  r   r5   rI   rT   rW   rm   rq   rw   ry   rD  rŒ   ÚModuler©   rÝ   rç   r$  r2  rA  rk  r—  r­  rÚ  rÜ  Ú__all__rk   r"   r!   ú<module>rù     s€  ðñ Yã Û Ý !ã ß Ý å !ß CÑ CÝ )Ý 9Ý /Ý -ß 9Ñ 9Ý 6ð 
ˆ×	Ñ	˜HÓ	%€óQò7ó" ò6Mð. Ùðôô
*? ó *?óó ð*?ðZ Ùðôô(? ;ó (?óó ð(?ðV Ùðôô
#= ;ó #=óó ð#=ðL Ùðôô
+= ó +=óó ð+=ð\ ô! ó !ó ð!ô8(- R§\¡\ô (-ôVv2˜"Ÿ)™)ô v2ôr˜BŸI™Iô ô.n/ 2§9¡9ô n/ôb	&Ð7ô &ôRDÐ7ô DñN ðôô
c
Ð1ó c
óð
c
ñL ðôô
jGÐ1ó jGóð
jGðZ	 ô~
Ð/ó ~
ó ð~
ñB ðôô
h:Ð)BÀOó h:óð
h:ñV ðôô
uÐ5°ó uóð
uôp-Ð8ô -ò.r"   