
    qiU                       d Z ddlZddlZddlmZ ddlZddlmZmZ ddlm	Z	 ddl
mZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ  ej<                  e      Z d?dZ!d Z"d?dZ#d Z$e ed       G d de                    Z%e ed       G d de                    Z&e ed       G d de                    Z'e ed       G d de                    Z(e G d  d!e             Z) G d" d#ejT                        Z+ G d$ d%ejX                        Z- G d& d'ejX                        Z. G d( d)ejX                        Z/ G d* d+e      Z0 G d, d-e      Z1 ed.       G d/ d0e)             Z2 ed1       G d2 d3e)             Z3e G d4 d5e)             Z4 ed6       G d7 d8e)e             Z5 ed9       G d: d;e)e             Z6 G d< d=e)      Z7g d>Z8y)@zRPyTorch ProphetNet model, ported from ProphetNet repo(fairsequery_states version).    N)	dataclass)Tensornn)	LayerNorm   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer)BaseModelOutput)PreTrainedModel)ModelOutputauto_docstringlogging   )ProphetNetConfigc                     |r/t         j                  j                  | j                         |      S t         j                  j                  | |t        j
                        S )Ndimr   dtype)r   
functionalsoftmaxfloattorchfloat32)hidden_stater   
onnx_traces      d/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/prophetnet/modeling_prophetnet.pyr   r   %   sH    }}$$\%7%7%9s$CC}}$$\s%--$PP    c                 z   t        j                  || | f||      t        j                  |      j                  z  }|j	                         j                         }t        |      D ]0  }||   j                  dd       ||   j                  | dz          2 d|dddddf<   t        j                  ||gd      S )	z@
    This function computes the bias for the predict stream
    )devicer   r   F)wrapr   N   r   )
r   onesfinfomindetachclonerangefill_diagonal_triu_cat)sequence_lengthngramr$   r   
left_blockright_block
stream_idxs          r!   ngram_attention_biasr5   ,   s    
 	

E?O<VSXY\a\g\ghm\n\r\rr  ##%++-KEl 6
J..qu.=:$$j[1_56 Jq!Qw99j+.A66r"   c                    | }d}|rX| dz  } |t        j                  |t        j                  |            j                         | z  z   }t        j                  |      }n)t        j
                  |t        j                  |            }| dz  }t        j                  ||      }|t        j                  |j                         |z        t        j                  ||z        z  | |z
  z  z   }t        j                  |t        j                  |      | dz
  z        j                         }|t        j                  ||j                         |      z   }|S )zo
    This function computes individual parts of the relative position buckets. For more detail, see paper.
    r   r&   r   )r   lt
zeros_likeintabsmaxlogr   mathr)   	ones_likewhere)	num_bucketsmax_distancerelative_positionsis_bidirectionalinv_relative_positionsrel_positions_bucket	max_exactis_smallval_if_larges	            r!   compute_relative_bucketsrI   =   sG    10!Q& hh-u/?/?@V/WX\\^allm 	 "'+A!B!&+A5CSCSTjCk!lq Ixx.	:Huyy)?)E)E)G))STW[W_W_y X  	y	  " "L 99\5??<+HKZ[O+\]aacL/%++hH^HbHbHdfr2ssr"   c                    |j                  d      j                  d|j                  d      d      }||j                  d      z
  }t        j                  |dz
  |fd      j                  d      }|j                  d|j                  d      d      }||j                  d      z
  }t        | ||d      }t        | ||d      }||fS )zm
    This function computes both main and predict relative position buckets. For more detail, see paper.
    r   r   F)rC   )	unsqueezerepeatsizer   r/   rI   )r@   rA   position_idsmain_stream_relative_positions$predicting_stream_relative_positionsmain_relative_position_buckets!predict_relative_position_bucketss          r!   #compute_all_stream_relative_bucketsrT   X   s    
 &2%;%;A%>%E%EaIZIZ[]I^`a%b"%ClF\F\]_F`%`" ,199lQ6F5U[]+^+h+hij+k(+O+V+VWXZfZkZklnZoqr+s(+OR^RhRhikRl+l( &>\#ATY&" )A\#GZ_)% *+LLLr"   zF
    Base class for sequence-to-sequence language models outputs.
    )custom_introc                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
edz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed	<   dZeej                     dz  ed
<   dZeej                     dz  ed<   dZej                  dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   y)ProphetNetSeq2SeqLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the self-attention heads.
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    Nlosslogitslogits_ngrampast_key_valuesdecoder_hidden_statesdecoder_ngram_hidden_statesdecoder_attentionsdecoder_ngram_attentionscross_attentionsencoder_last_hidden_stateencoder_hidden_statesencoder_attentions)__name__
__module____qualname____doc__rX   r   FloatTensor__annotations__rY   rZ   r[   r	   r\   tupler]   r^   r_   r`   ra   rb   rc    r"   r!   rW   rW   o   s)   < &*D%

d
")'+FE$+-1L%##d*1$(OUT\(=A5!2!23d:ACGu'8'8!9D!@G:>e//047>@DeE$5$56=D8<eE--.5<:>u0047>=A5!2!23d:A:>e//047>r"   rW   z
    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
    decoding.
    c                      e Zd ZU dZej
                  ed<   dZej
                  dz  ed<   dZe	dz  ed<   dZ
eej
                     dz  ed<   dZeej
                     dz  ed<   dZeej
                     dz  ed<   dZeej
                     dz  ed	<   dZeej
                     dz  ed
<   dZej
                  dz  ed<   dZeej
                     dz  ed<   dZeej
                     dz  ed<   y)ProphetNetSeq2SeqModelOutputa  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size,ngram * decoder_sequence_length, config.vocab_size)`, *optional*):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder of the model.
    last_hidden_stateNlast_hidden_state_ngramr[   r\   r]   r^   r_   r`   ra   rb   rc   )rd   re   rf   rg   r   rh   ri   ro   r[   r	   r\   rj   r]   r^   r_   r`   ra   rb   rc   rk   r"   r!   rm   rm      s   : (((8<U..5<$(OUT\(=A5!2!23d:ACGu'8'8!9D!@G:>e//047>@DeE$5$56=D8<eE--.5<:>u0047>=A5!2!23d:A:>e//047>r"   rm   zs
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
    c                   R   e Zd ZU dZej
                  ed<   dZej
                  dz  ed<   dZe	dz  ed<   dZ
eej
                     dz  ed<   dZeej
                     dz  ed<   dZeej
                     dz  ed<   dZeej
                     dz  ed	<   dZeej
                     dz  ed
<   y)ProphetNetDecoderModelOutputa  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
        Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
        hidden_size)` is output.
    last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states_ngram (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    rn   Nro   r[   hidden_stateshidden_states_ngram
attentionsngram_attentionsr`   )rd   re   rf   rg   r   rh   ri   ro   r[   r	   rr   rj   rs   rt   ru   r`   rk   r"   r!   rq   rq      s    6 (((8<U..5<$(OUT\(59M5**+d29;?u001D8?26Je''(4/68<eE--.5<8<eE--.5<r"   rq   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
edz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed	<   dZeej                     dz  ed
<   dZeej                     dz  ed<   y)ProphetNetDecoderLMOutputa	  
    ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss.
    logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the main stream language modeling head (scores for each vocabulary token before
        SoftMax).
    logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
        Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
        SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
        used (see `past_key_values` input) to speed up sequential decoding.
    hidden_states_ngram (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
        shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.

        Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
        outputs.
    ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
        decoder_sequence_length, decoder_sequence_length)`.

        Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
        weighted average in the
    NrX   rY   rZ   r[   rr   rs   rt   ru   r`   )rd   re   rf   rg   rX   r   rh   ri   rY   rZ   r[   r	   rr   rj   rs   rt   ru   r`   rk   r"   r!   rw   rw      s     D &*D%

d
")'+FE$+-1L%##d*1$(OUT\(59M5**+d29;?u001D8?26Je''(4/68<eE--.5<8<eE--.5<r"   rw   c                   &    e Zd ZU eed<   dZdZd Zy)ProphetNetPreTrainedModelconfig
prophetnetTc                    | j                   j                  }| j                   j                  }|J d       |j                  |j                        }|dd df   j                         |ddd f<   ||d<   |J d       |j                  |dk(  |       t        j                  |dk\        j                         sJ d	       |S )
Nzself.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the pad_token_id. See ProphetNet docs for more information.rK   r   ).r   z1self.model.config.pad_token_id has to be defined.r   z8Verify that `shifted_input_ids` has only positive values)
rz   decoder_start_token_idpad_token_id	new_zerosshaper+   masked_fill_r   allitem)self	input_idsr~   r   shifted_input_idss        r!   _shift_rightz&ProphetNetPreTrainedModel._shift_right:  s    !%!C!C{{//%1 	
F	
1 &//	@%.sCRCx%8%>%>%@#qr'"$:&!'\)\\'&&'8D'@,Oyy*a/0557s9ss7  r"   N)rd   re   rf   r   ri   base_model_prefixsupports_gradient_checkpointingr   rk   r"   r!   ry   ry   4  s    $&*#!r"   ry   c                   B     e Zd ZdZdeddf fdZd fd	Z fdZ xZS )	ProphetNetPositionalEmbeddingsa  
    This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
    based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
    the forward function.
    rz   returnNc                     |j                   | _        t        |   |j                   |j                  |j
                         y N)max_position_embeddings
max_lengthsuper__init__hidden_sizer   r   rz   	__class__s     r!   r   z'ProphetNetPositionalEmbeddings.__init__X  s3     88779K9KVM`M`ar"   c                 D   || j                   J d       ||k|j                         dk7  rX|j                         }|d   |z   }t        j                  dt        j                  |      t        | j                   |z         z  }n|&t        j                  |t        j                  |      }t        j                  |d      j                  |      |z  j	                         | j                   z   }|j                  d| j                  dz
        }t        | -  |      |fS )NzCIf position_ids is pre-computed then padding_idx should not be set.r   r   )r   r   r   r$   r   )padding_idxget_seq_lengthr   r'   longr9   cumsumtype_asclampr   r   forward)	r   inputs_shaper$   attention_maskr[   rO   prev_num_input_idsnum_input_idsr   s	           r!   r   z&ProphetNetPositionalEmbeddings.forward\  s   $$*:*:*B 	
Q	
C */M/M/OST/T &5%C%C%E" ,Q2D D$zz&

6R((=89  ")%*ZZEJJW]%^N LLQ7??OR``$&4++ ,
  ,11!T__q5HIw|,l::r"   c                 "    t         |   |      S r   )r   r   )r   rO   r   s     r!   _forwardz'ProphetNetPositionalEmbeddings._forwardx  s    w|,,r"   )NNN)	rd   re   rf   rg   r   r   r   r   __classcell__r   s   @r!   r   r   Q  s.    b/ bD b;8- -r"   r   c                        e Zd ZdZddedededz  f fdZ	 	 	 	 	 ddedz  dedz  d	edz  d
e	dz  de
j                  dz  deeedz  f   fdZ xZS )ProphetNetAttentionz=Multi-headed attention from 'Attention Is All You Need' paperNrz   num_attn_heads	layer_idxc                    t         |           |j                  }|j                  | _        |j                  | _        || _        ||z  | _        || _        | j                  |z  |k(  sJ d       t        j                  ||      | _
        t        j                  ||      | _        t        j                  ||      | _        t        j                  ||      | _        y )Nzw`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and `config.num_decoder_attention_heads`)r   r   r   attention_dropoutdropoutr   head_dimr   r   Linearkey_proj
value_proj
query_projout_proj)r   rz   r   r   r   r   s        r!   r   zProphetNetAttention.__init__  s    ((!'!9!9~~,#~5"}}~-< 	
4	
<
 		+{;))K=))K=		+{;r"   key_value_statesr   r[   output_attentionscache_positionr   c                    |j                         \  }}}	|d u}
t        |j                               |||	gk(  sJ d|||	f d|j                                 | j                  |      | j                  dz  z  }d}|St	        |t
              rA|j                  j                  | j                        }|
r|j                  }n|j                  }n|}|
r|n|}|
rK|I|rGj                  | j                     j                  }|j                  | j                     j                  }n| j                  |      }| j                  |      }|j!                  |d| j"                  | j                        j%                  dd      }|j!                  |d| j"                  | j                        j%                  dd      }|T|
s|nd }j'                  ||| j                  d|i      \  }}|
r)t	        |t
              rd	|j                  | j                  <   |j!                  ||| j"                  | j                        j%                  dd      }|j                  d      }t)        j*                  d
||j%                  dd            }|| j"                  ||f}|j                         |k7  rt-        d| d|j                                ||j/                         dk(  rd }|| j"                  d|f}|2|j                         |k7  rt-        d| d|j                                |||z   }|r|}nd }t0        j2                  j5                  |d      }t0        j2                  j7                  || j8                  | j:                        }t)        j*                  d
||      }|| j"                  || j                  f}|j                         |k7  rt-        d| d|j                                |j%                  dd      j=                  |||	      }| j?                  |      }t0        j2                  j7                  || j6                  | j:                        }||fS )Nz Size of hidden states should be z	, but is       ?FrK   r   r&   r   Tzbsij,bsjk->bsikr   z#Attention weights should have size r   z Attention mask should have size r   ptrainingz `attn_output` should have shape , but is of shape ) rN   listr   r   
isinstancer   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   viewr   	transposeupdater   einsum
ValueErrorr   r   r   r   r   r   r   reshaper   )r   rr   r   r   r[   r   r   
batch_sizetgt_lenr   is_cross_attentionquery_statesr   curr_past_key_valuescurrent_states
key_statesvalue_statessrc_lenattn_weightsexpected_shapeattn_weights_reshaped
attn_probsattn_outputs                          r!   r   zProphetNetAttention.forward  sO    ,9+=+=+?(
G[ .T9M&&().
 
 	p .j';.N-OyYfYkYkYmXno		p 
 }59KL
&/+>?,77;;DNNK
%+:+P+P(+:+O+O('6$-?)]/"=*-44T^^DIIJ/66t~~FMML~6J??>:L#R9L9Ldmm\ffghjklJ',,ZT=P=PRVR_R_`jjklnopL*7It+?+F+Fdnn?OQ_>`,(
L &*_FY*ZAEO..t~~>#((Wd>Q>QSWS`S`akklmopq//!$||$5|ZEYEYZ[]^E_`$d&9&97GL.0B>BRR[\h\m\m\o[pqrr %.*<*<*>!*C!N$d&9&91gF%.*=*=*?>*Q??OyYgYlYlYnXopqq%'.8L$0!$(!}},,\r,B]]**$$]] + 


 ll#4j,O$d&9&97DMMR/??OOabmbrbrbtauvww!++Aq199*g{[mmK0mm++K4<<RVR_R_+`111r"   r   )NNNFN)rd   re   rf   rg   r   r9   r   r   r	   boolr   rj   r   r   r   s   @r!   r   r   |  s    G</ < <QTW[Q[ <0 +/(,(,)..2^2 !4-^2 	^2
 ^2  $;^2 t+^2 
vv}$	%^2r"   r   c                   2     e Zd ZdZdedef fdZd Z xZS )ProphetNetFeedForwardzm
    This is the residual two feed-forward layer block based on the original Transformer implementation.
    rz   ffn_dimc                 *   t         |           t        |j                     | _        t        j                  |j                  |      | _        t        j                  ||j                        | _	        |j                  | _
        |j                  | _        y r   )r   r   r   activation_functionactivation_fnr   r   r   intermediateoutputactivation_dropoutr   )r   rz   r   r   s      r!   r   zProphetNetFeedForward.__init__  sk    #F$>$>?IIf&8&8'Bii););<"(";";~~r"   c                 D   | j                  |      }| j                  |      }t        j                  j	                  || j
                  | j                        }| j                  |      }t        j                  j	                  || j                  | j                        }|S )Nr   )r   r   r   r   r   r   r   r   )r   rr   s     r!   r   zProphetNetFeedForward.forward  s    ))-8**=9--mt?V?Vaeanan-oM2--mt||VZVcVc-dr"   )	rd   re   rf   rg   r   r9   r   r   r   r   s   @r!   r   r     s!    &/ &# &r"   r   c                   `     e Zd Zd
def fdZd Zd Z	 	 	 	 	 	 	 ddedz  fdZd Z	d	 Z
 xZS )ProphetNetNgramSelfAttentionNrz   c                    t         |           |j                  | _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | j                  z  | _	        |j                  | _
        || _        | j                  | j                  z  |j                  k(  sJ d       t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  | j                  | j                  z        | _        d| _        y )Nz6config.hidden_size must be divisible by num_attn_headsF)r   r   r   r@   relative_max_distancenum_decoder_attention_headsr   r   r   r   r1   r   r   r   r   r   r   r   relative_pos_embeddingsr    r   rz   r   r   s      r!   r   z%ProphetNetNgramSelfAttention.__init__  sh   !--!--%+%A%A"$@@~~!'!9!9**d.A.AA\\
"}}t222f6H6HH 	
D	
H 		&"4"4f6H6HI))F$6$68J8JK))F$6$68J8JK 		&"4"4f6H6HI (*yy1C1CTEUEUX\XkXkEk'l$  r"   c                     |j                  ||| j                  | j                        j                  dd      j	                         S Nr   r&   )r   r   r   r   
contiguous)r   tensorseq_lenr   s       r!   _shapez#ProphetNetNgramSelfAttention._shape+  s9    {{:w0C0CT]]S]]^_abcnnppr"   c                     d| _         y )NT)r    r   s    r!   prepare_for_onnx_export_z5ProphetNetNgramSelfAttention.prepare_for_onnx_export_.  s	    r"   r[   c	           	      8   |j                         \  }	}
}t        |j                               |	|
|gk(  sJ d|	|
|f d|j                          | j                  |      }| j	                  |      }| j                  |      }|| j                  dz  z  }| j                  ||
|	      }| j                  |d|	      }| j                  |d|	      }|	| j                  d| j                  f} |j                  | } |j                  | } |j                  | }|j                  d| j                  z   d      }|j                  d| j                  z   d      }|j                  d| j                  z   d      }|j                  d| j                  z   d      }|d   |dd  }}|d   |dd  }}|d   |dd  }}|d   |dd  }}|Bt        |t              r|j                  }n|}|j                  ||| j                   d	|i      \  }}|
d| j                  z   z  }t#        j$                  d
||j'                  dd            }| j)                  ||||      }||z   }|||z   }t+        |d| j,                        j/                  |      } t0        j2                  j5                  | | j6                  | j8                        } t#        j$                  d
| |      }!|!j'                  dd      j                  |	d||      }!| j;                  |!      }!t#        j<                  |d      j?                  |	| j                  | j                  || j                        }"t#        j<                  |D #cg c]  }#t#        j@                  ||#gd       c}#d      }$t#        j<                  |d      }%t#        j@                  |D &cg c])  }&t#        j@                  ||&gd      jC                  d      + c}&d      }'t#        j$                  d|"|$f      }(| jE                  |%|(||      })|(|)z   }(|5|jG                  ddddd      }|jI                  |(jJ                        }|(|z   }(t+        |(d| j,                        j/                  |(      }*t0        j2                  j5                  |*| j6                  | j8                        }*t#        j$                  d|*|'j'                  dd      f      }+|+j'                  dd      }+|+j                  |	| j                  ||      }+| j;                  |+      }+t#        j@                  |!|+gd      j?                  |	d|      },| j?                  |	| j                  |d      } t0        j2                  j5                  |,| j4                  | j8                        },|,| |*fS c c}#w c c}&w )Nz#`hidden_states` should be of shape r   r   rK   r   r   r&   r   r   zbntc,bncs->bntsr   )r   r    r   zbnhtc,bnhsc->bnhts   zbnhts,bnhsc->bnhtc)&rN   r   r   r   r   r   r   r   r   r   chunkr1   r   r   r   r   r   r   r   r    get_main_relative_pos_embeddingsr   r    r   r   r   r   r   r   r   stackr   r/   rL   #get_predict_relative_pos_embeddingspermutetor   )-r   rr   r[   r   extended_predict_attention_maskrR   rS   rO   r   r   ngram_sequence_lengthr   r   r   r   
proj_shapehidden_states_listquery_states_listkey_states_listvalue_states_listmain_hidden_stateshidden_states_predict_listmain_query_statespredict_query_states_listmain_key_statespredict_key_states_listmain_value_statespredict_value_states_listr   r0   main_attn_weightsmain_relative_pos_embeddingsmain_attn_probsmain_attn_outputpredict_query_stateskeypredict_key_statespredict_hidden_statesv_ppredict_value_statespredict_attn_weightspredict_relative_pos_embeddingspredict_attn_probspredict_attn_outputr   s-                                                r!   r   z$ProphetNetNgramSelfAttention.forward1  s    :G9K9K9M6
);M&&()j:OQ\-]] 	
1*>SU`2`1a b##$&	
] }5]]=1
}5 $t}}c'9: {{<1F
S[[R<
{{<Z@ $"5"5r4==I
+|++Z8'Z''4
+|++Z8 +00TZZQ0G(..q4::~1.E$**1tzz>q*A(..q4::~1.E9KA9NPbcdcePf67H7KM^_`_aMb43B13EWXWYGZ07H7KM^_`_aMb4 &/+>?'6'K'K$'6$1E1L1L!2DNNEUWeDf2.O.
 0A

NC "LL):<MOhOhijlmOno (,'L'L 1<A_(
$ .0LL% 1N B!
 '#
$	 	 --//4CYCYdhdqdq/r
 !<<(9?L]^+55a;CCJPQSbdop==)9:  %{{+DaHMM

D$7$7$-- 

 #[[Zq)rSV%))_c4JA*N)rtuv !&,FA N  %yyLefSUYY)3/3==a@fhi 
  %||,@CWYkBlm +/*R*R!#7Gh+
'
  46UU*6.M.U.UVWYZ\]_`bc.d+.M.P.PQeQkQk.l+#7:Y#Y $ 
 '&
'	 	  ]]22$"8"84== 3 
 $ll #57K7U7UVWYZ7["\
 2;;AqA199*djjRacno"mm,?@ ii!13F GKPPQ[]_alm)..z4;N;NP_acdmm++K4<<RVR_R_+`O-???{ *s gs   V,.Vc                    |j                   \  }}}}|j                  ||||      }||j                   d d \  }}	t        j                  d|j                   d   dz         j	                  d      j	                  d      j                  ||	d      j                  |j                        }
|
|j	                  d      j                  ||	d      z
  }
t        | j                  | j                  |
d      }| j                  |      }|j                  |j                   d d | j                  | j                  fz         }|j                  dddd      }|j                  |j                   d d dz         }|j                  d| j                  d      }|j                  d|j                   d         }|j                         }|j                  d|j!                  d            }t        j"                  |d|      }|j                  |||d      }|S )	Nr&   r   rK   r   Fr   )rK   r   index)r   r   r   arangerL   rM   r   r$   rI   r@   r   r   r   r   r   r   rN   gather)r   rr   r   rO   rR   r   r   r   r   r0   rB   rel_pos_embeddingsr  s                r!   r   z=ProphetNetNgramSelfAttention.get_main_relative_pos_embeddings  s    8D7I7I4
NGW#((^WgV)1*7*=*=bq*A'JQ 2 22 6 :;11
OQ7L''(  "4l6L6LQ6O6V6VWacrtu6v!v-E  $"<"<>PRW.*
 "99-H/44$$Ra(D,<,<d>Q>Q+RR
 0771aC/778J8J2A8NQV8VW)G)N)NqRVReRegh)i&)G)L)L.44R8*
& *H)L)L)N&/77<N<S<STV<WX',||4FAUs't$'C'H'HUcelnp'q$++r"   c                 (   |j                   dd \  }}||j                   d   }|d   d   |dz
  k(  sJ d       t        j                  d|      j                  d      j                  d      j	                  ||d      j                  |j                        }||j                  d      j	                  ||d      z
  }t        | j                  | j                  |d      }|j                  dd      }| j                  |      }	|	j                  |j                   d d | j                  | j                  fz         }	|	j                  ddddd      }	|	j                  d| j                        }	|j                  d      }|j	                  | j                   d| j                  d      }|j                  d|j#                  d            j%                         }t        j&                  |	d|	      }
|
j                  || j                   | j                  |d      }
|
S )
Nr   r&   rK   r   zb`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)Fr   r   r  )r   r   r  rL   rM   r   r$   rI   r@   r   r   r   r   r   r   r   r1   rN   r   r  )r   rr   r   rO   rS   r   r0   key_sequence_lengthrB   r   r  s              r!   r   z@ProphetNetNgramSelfAttention.get_predict_relative_pos_embeddings  s+    '4&9&9!A&>#
O,4"."4"4R"8?1%)<q)@@ t@ Q 3411
OQ7L''(  "4l6L6LQ6O6V6VWacrtu6v!v0H  $"<"<>PRW1-
 &//15!99-H 044$(8(8$:M:M'NN
 0771aAF/77D<L<LM,M,W,WXY,Z),M,T,TJJ4..-
) -N,R,R166r:-

$& 	* +0,,A-N+
'
 +J*N*N

D$7$7"+
' /.r"   r   )NNNNNNN)rd   re   rf   r   r   r   r   r	   r   r   r   r   r   s   @r!   r   r     sU     /  <q )-(,'+*.a@ a@F+,Z9/r"   r   c                   8     e Zd ZdZdef fdZ	 ddefdZ xZS )ProphetNetEncoderLayerz&
    Encoder block for Prophetnet
    rz   c                     t         |           t        ||j                        | _        t        |j                        | _        t        ||j                        | _
        t        |j                        | _        y r   )r   r   r   num_encoder_attention_heads	self_attnr   r   self_attn_layer_normr   encoder_ffn_dimfeed_forwardfeed_forward_layer_normr   s     r!   r   zProphetNetEncoderLayer.__init__B  s_    ,VV5W5WX$-f.@.@$A! 2&&:P:PQ'01C1C'D$r"   r   c                     | j                  |||      \  }}| j                  ||z         }| j                  |      }| j                  ||z         }|f}|r||fz  }|S )N)rr   r   r   )r'  r(  r*  r+  )r   rr   r   r   attention_outputr   feed_forward_outputoutputss           r!   r   zProphetNetEncoderLayer.forwardL  s     *.')/ *8 *
&,
 112B]2RS #//>445H=5XY "&Gr"   F)	rd   re   rf   rg   r   r   r   r   r   r   s   @r!   r$  r$  =  s+    E/ E #(	  	r"   r$  c                   |     e Zd ZdZd	def fdZ	 	 	 	 	 	 	 	 	 	 	 d
dedz  dedz  dej                  dz  fdZ	 xZ
S )ProphetNetDecoderLayerz&
    Decoder block for Prophetnet
    Nrz   c                 j   t         |           t        ||      | _        t	        |j
                        | _        |j                  r7t        ||j                  |      | _
        t	        |j
                        | _        t        ||j                        | _        t	        |j
                        | _        y )Nr   )r   r   r   r'  r   r   r(  add_cross_attentionr   r   
cross_attncross_attn_layer_normr   decoder_ffn_dimr*  r+  r   s      r!   r   zProphetNetDecoderLayer.__init__k  s    5f	R$-f.@.@$A! %%1&&:\:\hqrDO)263E3E)FD& 2&&:P:PQ'01C1C'D$r"   	use_cacher   r   c           	      ,   | j                  ||	|||||      \  }}}| j                  ||z         }d }|-| j                  ||||	|      \  }}| j                  ||z         }| j	                  |      }| j                  ||z         }|f}|r||||fz  }|S )N)rr   r[   r   r   rR   rS   rO   )rr   r   r   r[   r   )r'  r(  r6  r7  r*  r+  )r   rr   r   rb   encoder_attn_maskr   rR   rS   rO   r[   r9  r   r   ngram_attention_outputself_attn_weightsself_attn_weights_ngramcross_attn_weightsr-  r.  r/  s                       r!   r   zProphetNetDecoderLayer.forwardz  s      NR^^'+),K+I.O% N\ N
J 13J 11-BX2XY! ,37??+!60 /"3 4C 400 !667G-7WXM #//>445H=5XY ")+BDVWWGr"   r   )NNNNNNNNTFN)rd   re   rf   rg   r   r   r   r   r   r   r   r   s   @r!   r2  r2  f  sn    E/ E$ "(,'+*.!%)..20 $;0  $;0 t+0r"   r2  z=
    The standalone encoder part of the ProphetNetModel.
    c                        e Zd Zdef fdZd Zd Ze	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	e
dz  d
e
dz  de
dz  deez  fd       Z xZS )ProphetNetEncoderrz   c                    t         |   |       t        j                  |j                  |j
                  |j                        | _        t        |      | _	        t        |j
                        | _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _        | j%                          y c c}w )Nr   F)r   r   r   	Embedding
vocab_sizer   r   word_embeddingsr   position_embeddingsr   embeddings_layer_norm
ModuleListr,   num_encoder_layersr$  r   gradient_checkpointing	post_init)r   rz   _r   s      r!   r   zProphetNetEncoder.__init__  s     !||F,=,=v?Q?Q_e_r_rs#A&#I %.v/A/A%B"mmUSYSlSlMm$n%;F%C$no&+#	 %os   Cc                     | j                   S r   rF  r   s    r!   get_input_embeddingsz&ProphetNetEncoder.get_input_embeddings      ###r"   c                     || _         y r   rO  r   values     r!   set_input_embeddingsz&ProphetNetEncoder.set_input_embeddings  
    $r"   Nr   r   inputs_embedsr   output_hidden_statesreturn_dictr   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      ||t	        d      ||| j                  |      }||d|ddddddf   j                  d| j                   j                  dd      z
  t        j                  | j                        j                  z  }|j                  |j                        }nd}| j                  |j                  dd |j                        \  }	}
||	z   }| j!                  |      }t"        j$                  j'                  || j                   j&                  | j(                        }|rdnd}|rdnd}t+        | j,                        D ])  \  }}|r||fz   } ||||	      }|d
   }|s!||d   fz   }+ |r||fz   }|st/        d |||fD              S t1        |||      S )a	  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetEncoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetEncoder.from_pretrained("patrickvonplaten/prophetnet-large-uncased-standalone")
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```Nz3Either input_ids or inputs_embeds has to be passed.z2Make sure to only pass input_ids or inputs_embeds.      ?r   r&   r   rk   )r   r   r   c              3   &   K   | ]	  }||  y wr   rk   .0vs     r!   	<genexpr>z,ProphetNetEncoder.forward.<locals>.<genexpr>  s     lq^_^kl   )rn   rr   rt   )rz   r   rX  use_return_dictr   rF  rM   r&  r   r(   r   r)   r   rG  r   r$   rH  r   r   r   r   	enumerater   rj   r   )r   r   r   rW  r   rX  rY  kwargsextended_attention_maskrG  rO   rr   rb   all_attentionsidxencoder_layerlayer_outputss                    r!   r   zProphetNetEncoder.forward  sD   4 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]!6RSS"}'@QRR"}'< 00;M %nQdA%56==aAhAhjkmnooDJJ'++',# '>&@&@ATAT&U#&*#,0,D,D]EXEXY[Z[E\^k^r^r,s)\%(;;22=A--mt{{?R?R]a]j]j-k&:0d"+DKK"8 	FC#(=@P(P%)6"3M *!,M !/=3C2E!E	F  $9]<L$L!l]4I>$Zlll+;P]k
 	
r"   )NNNNNN)rd   re   rf   r   r   rP  rU  r   r   r   r   rj   r   r   r   r   s   @r!   rA  rA    s    / $%  *..2-1)-,0#'N
<<$&N
 t+N
 ||d*	N

  $;N
 #TkN
 D[N
 
	 N
 N
r"   rA  z=
    The standalone decoder part of the ProphetNetModel.
    c                   \    e Zd Zdef fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e
dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deez  fd       Zd Zd Zd Z xZS )ProphetNetDecoderrz   c           	         t         |   |       |j                  | _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        t        j                  |j                  |j                  |j                        | _        t        |      | _        t        j                  | j                  |j                  d       | _        t        j"                  t%        |j&                        D cg c]  }t)        ||       c}      | _        t-        |j                        | _        d| _        | j3                          y c c}w )NrC  r4  F)r   r   r1   r@   r   r   r   max_target_positionsr   rD  rE  r   r   rF  r   rG  ngram_embeddingsrI  r,   num_decoder_layersr2  r   r   rH  rK  rL  )r   rz   ir   s      r!   r   zProphetNetDecoder.__init__  s    \\
!--%+%A%A"~~$*$B$B!!||F,=,=v?Q?Q_e_r_rs#A&#I  "TZZ9K9KT RmmBGHaHaBbcQ#Fa8c
 &/v/A/A%B"&+# ds   Ec                     | j                   S r   rO  r   s    r!   rP  z&ProphetNetDecoder.get_input_embeddings4  rQ  r"   c                     || _         y r   rO  rS  s     r!   rU  z&ProphetNetDecoder.set_input_embeddings7  rV  r"   Nr   r   rb   encoder_attention_maskr[   rW  r9  r   rX  rY  r   r   c                 

   ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	|
|
n| j                   j                  }
||t        d      ||t        d      ||| j                  |      }|j                  dd \  }}| j                  r%| j                  r|rt        j                  d       d}|rd|b|| j                   j                  r4t        t        | j                         t        | j                               nt        | j                         }||j                         nd}| j!                  ||f|j"                  |	      \  }}|dk7  rd
\  }}n| j%                  |      \  }}| j                   j'                  |dz         }||z   }| j(                  j*                  }|dk7  r\|j-                  d      dk(  sJ d       t/        | j0                        D cg c]  }||dz
     |z   j3                  |dd        }}d}d}nOt/        | j0                        D cg c]  }||dz
     |z    }}| j5                  ||      }| j7                  ||      }||d|ddddddf   j3                  d| j                   j8                  dd      z
  t;        j<                  | j>                        j@                  z  }|jC                  |j>                        }nd}t;        jD                  |g|z   d      }| jF                  r| jG                  |      }tH        jJ                  jM                  || jL                  | j                        }|	rdnd}|	r| j                   j0                  dkD  rdnd}|rdnd}|rdnd}|r| j                   jN                  rdnd} tQ        | jR                        D ]  \  }!}"|	r7||ddd|f   fz  }| j                   j0                  dkD  r||dd|df   fz  } |"||||||||||||      }#|#d   }|s[||#d   fz  }||#d   fz  }| j                   jN                  s| |#d   fz  }  |	r7||ddd|f   fz  }| j                   j0                  dkD  r||dd|df   fz  }|ddd|f   }$| j                   j0                  dkD  r|dd|df   nd}%|
stU        d |$|%|||||| fD              S tW        |$|%||||||       S c c}w c c}w )a  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetDecoder
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetDecoder.from_pretrained("microsoft/prophetnet-large-uncased", add_cross_attention=False)
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        ```NzGEither `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.zFMake sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.r&   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)rz   r   )r$   r[   )NNr   zOAt the moment `use_cache` is only supported for `decoder_input_ids` of length 1r[  r   rk   )	r;  r   rR   rS   rO   r[   r9  r   r   r   c              3   $   K   | ]  }|| 
 y wr   rk   r]  s     r!   r`  z,ProphetNetDecoder.forward.<locals>.<genexpr>  s       = s   )rn   ro   r[   rr   rs   rt   ru   r`   ),rz   r9  r   rX  rb  r   rF  r   rK  r   loggerwarning_onceis_encoder_decoderr   r
   r   rG  r$   !compute_buffered_relative_bucketsr   rn  weightrN   r,   r1   rM   prepare_attention_maskprepare_predict_attention_maskr   r   r(   r   r)   r   r/   rH  r   r   r   r5  rc  r   rj   rq   )&r   r   r   rb   rs  r[   rW  r9  r   rX  rY  r   rd  r   r0   past_key_values_lengthmain_stream_pos_embedrO   rR   rS   predicting_stream_pos_embedrr   rn  r1   ngram_hidden_statesre  r   extended_encoder_attention_maskall_main_stream_hidden_statesall_ngram_stream_hidden_statesall_main_stream_attnsall_ngram_stream_attnsall_cross_attnsrg  decoder_layerri  rn   ro   s&                                         r!   r   zProphetNetDecoder.forward:  s   < "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]!6fgg"}'@eff"}'< 00;M&3&9&9"1&=#
O&&4==##p "	0 )48V8V $L$DlZ^ZeZeFfg!5  FUE`!?!?!Afg.2.F.F) ''+ /G /
+| "Q&PZM*,M
 66|D.1&*&>&>&G&GWXHX&Y# &(==0077 "Q& %%a(A- a- #4::.# "%!),/JJRRS]_`bcd# # '+#.2+ Z__c_i_iYj#PU!%!),/JJ# # '+&A&A-Q_&`#.2.Q.QR_ao.p+ "-,QdA-=>EEaIpIprsuvwwDJJ'++/,+ /N.P.PQ^QdQd.e+.2+		=/4G"GK%% 66}EM--mt||VZVcVc-d /C%/CHYHY\]H]cg&&7T'8d 1dkk6U6U"[_"+DKK"8 	;C#--CSOCS@S2T1VV-;;$$q(2}QHXEX7Y6[[2)'%"A0O/M2S) /#"3-M *!,M %-*:)<<%&=+;*==&;;22#a(8'::O9	;<  )mA?O?O<O.P-RR){{  1$.=ODTAT3U2WW. *!-=o-=*=>HLHYHY\]H]-?3C0C"Dcg  &+#12)*#	   ,/$;+7 >,3,	
 		
w##s   #S;T c           	         |j                   \  }}t        j                  d| j                        j	                  |j
                        j                  dd      }t        | j                  | j                  |      \  }}|d d d |d |f   j                  |dd      }t        j                  |d d d |d |f   |d d d || j                  | j                  |z   f   gd      j                  |dd      }||fS r   )r   r   r  rm  r   r$   rM   rT   r@   r   r/   )r   rO   r   r0   main_relative_bucketspredict_relative_bucketss         r!   ry  z3ProphetNetDecoder.compute_buffered_relative_buckets  s!   &2&8&8#
O||At'@'@ADD\EXEXY``abdef:]d88,;
77
 !6a9I/9IK[OK[6[ \ c cdnpqst u#(99(,<_,<>N>N)NO('')B)BTE^E^apEp)pp $
 &Q
" 	! %&>>>r"   c                 L   |j                   d d \  }}t        j                  ||ft        j                  |j                        j
                  |j                  |j                        }t        j                  |d      }|d |d |f   d d d d d d f   j                  || j                  j                  f|j                   z         }|@d|d d d d d d f   z
  t        j                  | j                        j
                  z  }||z   }n|}|j                  |j                        S )Nr&   r   r   r[  )r   r   fullr(   r   r)   r$   triuexpandrz   r   r   )r   rr   r   r   
seq_lengthcausal_maskextended_causal_maskre  s           r!   r{  z(ProphetNetDecoder.prepare_attention_mask  s%   !.!4!4Ra!8
J jj$KK++,00%% ''	
 jja0*;J;+CDT4QRTUEUV]]@@AKDUDUU 

 %'*^AtT1<L-M'MQVQ\Q\]a]g]gQhQlQl&l#&:=T&T#&:#&))-*=*=>>r"   c           	      &   |j                   d d \  }}t        | j                  | j                  |j                  |j
                        }t        j                  |d d d |d |f   |d d d || j                  | j                  |z   f   gd      }|d d d d d d d d f   j                  || j                  j                  f|j                   z         }|d|d d d d d d d f   z
  t        j                  | j
                        j                  z  }|j                  || j                  j                  | j                  ||f      }t        j                  |t        j                  |      gd      }||z   }n|}|j                  |j
                        S )Nr&   rK   r   r[  )r   r5   rm  r1   r$   r   r   r/   r  rz   r   r(   r)   r8   r   )	r   rr   r   r   r  predict_causal_maskextended_predict_causal_maskre  r   s	            r!   r|  z0ProphetNetDecoder.prepare_predict_attention_mask&  s   !.!4!4Ra!8
J 3%%tzz=3G3GI\I\
 $ii#A{
{KZK$?@#{
{D$=$=@Y@Y\f@f$ff 
 (;4q!Q;N'O'V'V@@ADWD]D]](
$
 %'*^AtT4QR<R-S'SW\WbWbcgcmcmWnWrWr&r#&=&D&DT[[DDdjjR\^hi'# ',ii(%*:*:;R*STZ\'# /KMd.d+.J+.11-2E2EFFr"   NNNNNNNNNNN)rd   re   rf   r   r   rP  rU  r   r   r   r	   r   rj   rq   r   ry  r{  r|  r   r   s   @r!   rk  rk    s6   / ,$%  *..2596:(,-1!%)-,0#'.2{
<<$&{
 t+{
  %||d2	{

 !&t 3{
 {
 ||d*{
 $;{
  $;{
 #Tk{
 D[{
 t+{
 
-	-{
 {
z?,?0!Gr"   rk  c                       e Zd ZdddZdef fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  dedz  dedz  de	j                  dz  de	j                  dz  dedz  dedz  dedz  dedz  de	j                  dz  deez  fd       Z xZS )ProphetNetModelword_embeddings.weight)zencoder.word_embeddings.weightdecoder.word_embeddings.weightrz   c                 j   t         |   |       t        j                  |j                  |j
                  |j                        | _        t        j                  |      }d|_
        t        |      | _        t        j                  |      }d|_        t        |      | _        | j!                          y )NrC  FT)r   r   r   rD  rE  r   r   rF  copydeepcopyr9  rA  encoder
is_decoderrk  decoderrL  )r   rz   encoder_configdecoder_configr   s       r!   r   zProphetNetModel.__init__Q  s     !||F,=,=v?Q?Q_e_r_rsv.#( (8v.$(!(8 	r"   c                     | j                   S r   rO  r   s    r!   rP  z$ProphetNetModel.get_input_embeddings`  rQ  r"   c                 ~    || _         | j                   | j                  _         | j                   | j                  _         y r   )rF  r  r  rS  s     r!   rU  z$ProphetNetModel.set_input_embeddingsc  s.    $'+';';$'+';';$r"   Nr   r   decoder_input_idsdecoder_attention_maskencoder_outputsr[   rW  decoder_inputs_embedsr9  r   rX  rY  r   r   c                 X   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|| j                  ||||
||      }| j                  |||d   ||||
||	||      }|s||z   S t        |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                        S )a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetModel

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetModel.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
        >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
        ```)r   r   rW  r   rX  rY  r   )r   r   rb   rs  r[   rW  r   rX  r9  rY  r   )rn   ro   r[   r\   r]   r^   r_   r`   ra   rb   rc   )rz   r9  r   rX  rb  r  r  rm   rn   ro   r[   rr   rs   rt   ru   r`   )r   r   r   r  r  r  r[   rW  r  r9  r   rX  rY  r   rd  decoder_outputss                   r!   r   zProphetNetModel.forwardh  sN   f "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]""ll#-+"3%9' + O ,,'1"1!"4#1+//!5#) ' 
 "_44+-??$3$K$K+;;"1"?"?(7(K(K.99%4%E%E,==&5&G&G"1"?"?.99
 	
r"   )NNNNNNNNNNNNN)rd   re   rf   _tied_weights_keysr   r   rP  rU  r   r   r   
BoolTensorrj   r	   r   rm   r   r   r   s   @r!   r  r  J  sa    +C*B
/ $<
  *..215:>(,(,-159!%)-,0#'.2`
<<$&`
 t+`
 !<<$.	`

 !& 0 04 7`
 `
 `
 ||d*`
  %||d2`
 $;`
  $;`
 #Tk`
 D[`
 t+`
  
-	-!`
 `
r"   r  zh
    The ProphetNet Model with a language modeling head. Can be used for sequence generation tasks.
    c            !           e Zd ZddiZdef fdZd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deez  fd       ZddZdej                  fdZd fd	Z xZS )"ProphetNetForConditionalGenerationlm_head.weight!prophetnet.word_embeddings.weightrz   c                 
   t         |   |       t        |      | _        |j                  | _        |j                  | _        t        j                  |j                  |j                  d      | _        | j                          y )NFbias)r   r   r  r{   r   r   disable_ngram_lossr   r   r   rE  lm_headrL  r   s     r!   r   z+ProphetNetForConditionalGeneration.__init__  sd     )&1!.."(";";yy!3!3V5F5FUS 	r"   c                 .    | j                   j                  S r   )r{   rF  r   s    r!   rP  z7ProphetNetForConditionalGeneration.get_input_embeddings  s    ...r"   Nr   r   r  r  r  r[   rW  r  labelsr9  r   rX  rY  r   r   c                 \   ||n| j                   j                  }|	||| j                  |	      }| j                  |||||||||
||||      }||j                  n|j                  dd \  }}|d   j                  || j                   j                  |d      }| j                  |      }|dddf   }| j                   j                  dkD  r|ddddf   nd}|j                         s|j                         }d}|	| j                  ||	      }|s*t        d ||fD              }||f|z   |dd z   S ||dd z   S t        ||||j                  |j                  |j                  |j                   |j"                  |j$                  |j&                  |j(                  |j*                        S )	a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            ProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

        >>> logits_next_token = outputs.logits  # logits to predict next token as usual
        >>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
        ```N)r   r   r  r  r  r[   rW  r  r9  r   rX  rY  r   r&   r   rK   r   c              3   &   K   | ]	  }||  y wr   rk   r]  s     r!   r`  z=ProphetNetForConditionalGeneration.forward.<locals>.<genexpr>D       RQAMqRra  )rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   )rz   rb  r   r{   r   r   r1   r  is_contiguousr   _compute_lossrj   rW   r[   r\   r]   r^   r_   r`   ra   rb   rc   )r   r   r   r  r  r  r[   rW  r  r  r9  r   rX  rY  r   rd  r/  r   r0   predicting_streamspredict_logitsrY   rZ   rX   
all_logitss                            r!   r   z*ProphetNetForConditionalGeneration.forward  s   p &1%<k$++B]B]"3";@U@] $ 1 1& 9//)/#9++'"7/!5#) " 
  (9'D##J_JeJefhghJi 	$
O %QZ__Z9J9JO]_`&891%040A0AA0E~ae,4 ##%&&(F%%nf=DR6<*@RRJ9=9ID7Z''!"+5gz\cdedf\gOgg,) ' 7 7&-&C&C,3,O,O#*#=#=)0)I)I!(!9!9*1*K*K&-&C&C#*#=#= r"   c                    |j                  | j                  j                  |j                  d      |j                  d            j	                  |      }t        | j                  j                        D ]!  }|dkD  r| j                  r n|||d d d d f<   # |j                  dd      j                         }t        j                  j                  |j                  d|j                  d            dt        j                        }t        j                  j                  ||j                  d      d      }| j                  j                   dkD  r|j#                  dd	       }|j%                  |      j                  d      }	||	   }|j'                         }| j                  j                   |j                  d      z  }
d
| j                  j                   z
  |z  |
|z  z   }|S Nr   r   rK   r   mean)	reductiong        T)r   keepdimr[  r   rz   r1   rN   fill_r,   r  r   r   r   r   log_softmaxr   r   r   nll_lossepssumner  r   rY   r  ignore_indexexpend_targetsrp  lprobsrX   smooth_lossnon_masked_tokenseps_is              r!   r  z0ProphetNetForConditionalGeneration._compute_lossV     ))$++*;*;V[[^V[[YZ^\bbcopt{{(() 	-A1u00&,N1a7#	-
 !!!Q'224**KKFKKO,-- + 
 }}%%fn.A.A".EQW%X;;??S !::"d:;;K . 1 1, ? D DR H%&78K%**,KKKOOfkk"o5E$++//)T1EK4GGDr"   c                 $    | j                  |      S r   )r   )r   r  s     r!   %prepare_decoder_input_ids_from_labelszHProphetNetForConditionalGeneration.prepare_decoder_input_ids_from_labelsr  s      ((r"   c                 T    || j                   j                  S t        |   |      S )N)modality)r{   r  r   get_encoder)r   r  r   s     r!   r  z.ProphetNetForConditionalGeneration.get_encoderu  s,    ??***7&&99r"   )NNNNNNNNNNNNNNr}   r   )rd   re   rf   r  r   r   rP  r   r   r   r  r	   r   rj   rW   r   r  r  r  r   r   s   @r!   r  r    s    	=	/ 	/  *..215:>/3(,-159&*!%)-,0#'.2o<<$&o t+o !<<$.	o
 !& 0 04 7o ,o o ||d*o  %||d2o t#o $;o  $;o #Tko D[o t+o" 
*	*#o ob8)ELL ): :r"   r  zt
    The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal
    c                   p    e Zd ZdddZdef fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  dedz  de	j                  dz  de	j                  dz  dedz  dedz  dedz  dedz  deez  fd       ZddZ	 	 	 	 d fd	Z xZS )ProphetNetForCausalLMr  )r  z)prophetnet.decoder.word_embeddings.weightrz   c                 P   t        j                  |      }d|_        d|_        t        |   |       t        |      | _        |j                  | _	        |j                  | _
        t        j                  |j                  |j                  d      | _        | j!                          y )NTFr  )r  r  r  rx  r   r   ProphetNetDecoderWrapperr{   r   r   r  r   r   r   rE  r  rL  r   s     r!   r   zProphetNetForCausalLM.__init__  s    v& $)! 26:!.."(";";yy!3!3V5F5FUS 	r"   c                 B    | j                   j                  j                  S r   r{   r  rF  r   s    r!   rP  z*ProphetNetForCausalLM.get_input_embeddings  s    &&666r"   c                 :    || j                   j                  _        y r   r  rS  s     r!   rU  z*ProphetNetForCausalLM.set_input_embeddings  s    27/r"   Nr   r   rb   rs  r[   rW  r  r9  r   rX  rY  r   c                    ||n| j                   j                  }| j                  j                  ||||||||	|
|
      }||j                  n|j                  dd \  }}|d   j                  || j                   j                  |d      }| j                  |      }|dddf   }| j                   j                  dkD  r|ddddf   nd}d}|| j                  ||      }|s*t        d ||fD              }||f|z   |dd z   S ||dd z   S t        ||||j                  |j                  |j                  |j                  |j                  |j                   	      S )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ProphetNetForCausalLM
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = ProphetNetForCausalLM.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits

        >>> # Model can also be used with EncoderDecoder framework
        >>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
        >>> import torch

        >>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
        >>> tokenizer_dec = AutoTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        ...     "google-bert/bert-large-uncased", "microsoft/prophetnet-large-uncased"
        ... )

        >>> ARTICLE = (
        ...     "the us state department said wednesday it had received no "
        ...     "formal word from bolivia that it was expelling the us ambassador there "
        ...     "but said the charges made against him are `` baseless ."
        ... )
        >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
        >>> labels = tokenizer_dec(
        ...     "us rejects charges against its ambassador in bolivia", return_tensors="pt"
        ... ).input_ids
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])

        >>> loss = outputs.loss
        ```N)
r   r   rb   rs  r[   rW  r9  r   rX  rY  r&   r   rK   r   c              3   &   K   | ]	  }||  y wr   rk   r]  s     r!   r`  z0ProphetNetForCausalLM.forward.<locals>.<genexpr>  r  ra  )	rX   rY   rZ   r[   rr   rs   rt   ru   r`   )rz   rb  r{   r  r   r   r1   r  r  rj   rw   r[   rr   rs   rt   ru   r`   )r   r   r   rb   rs  r[   rW  r  r9  r   rX  rY  rd  r/  r   r0   r  r  rY   rZ   rX   r  s                         r!   r   zProphetNetForCausalLM.forward  s   v &1%<k$++B]B] //)))"7#9+'/!5# * 
 :C9NiooTaTgTghjijTk#
O$QZ__Z9J9JO]_`&891%040A0AA0E~ae,4%%nf=DR6<*@RRJ9=9ID7Z''!"+5gz\cdedf\gOgg,) ' 7 7%33$+$?$?"--!(!9!9!(!9!9
 
r"   c                    |j                  | j                  j                  |j                  d      |j                  d            j	                  |      }t        | j                  j                        D ]!  }|dkD  r| j                  r n|||d d d d f<   # |j                  dd      j                         }t        j                  j                  |j                  d|j                  d            dt        j                        }t        j                  j                  ||j                  d      d      }| j                  j                   dkD  r|j#                  dd	       }|j%                  |      j                  d      }	||	   }|j'                         }| j                  j                   |j                  d      z  }
d
| j                  j                   z
  |z  |
|z  z   }|S r  r  r  s              r!   r  z#ProphetNetForCausalLM._compute_loss  r  r"   c                 V    t        |   |f||||d|}|j                  dd        |S )N)r[   r   r9  is_first_iterationr   )r   prepare_inputs_for_generationpop)	r   r   r[   r   r9  r  rd  model_inputsr   s	           r!   r  z3ProphetNetForCausalLM.prepare_inputs_for_generation   sK     w<
+)1
 
 	)40r"   r  r  )NNNF)rd   re   rf   r  r   r   rP  rU  r   r   r   r	   r   rj   rw   r   r  r  r   r   s   @r!   r  r  |  sS    >5X
/  78  *..2596:(,-1&*!%)-,0#'d<<$&d t+d  %||d2	d
 !&t 3d d ||d*d t#d $;d  $;d #Tkd D[d 
*	*d dL>   r"   r  c                   6     e Zd ZdZddiZdef fdZd Z xZS )r  z
    This is a wrapper class, so that [`ProphetNetForCausalLM`] can correctly be loaded from pretrained prophetnet
    classes.
    r  r  rz   c                     t         |   |       t        j                  |j                  |j
                  |j                        | _        t        |      | _	        | j                          y )NrC  )r   r   r   rD  rE  r   r   rF  rk  r  rL  r   s     r!   r   z!ProphetNetDecoderWrapper.__init__C  sP     !||F,=,=v?Q?Q_e_r_rs(0 	r"   c                 &     | j                   |i |S r   )r  )r   argsrd  s      r!   r   z ProphetNetDecoderWrapper.forwardL  s    t||T,V,,r"   )	rd   re   rf   rg   r  r   r   r   r   r   s   @r!   r  r  9  s*     	)*B/ -r"   r  )rk  rA  r  r  r  ry   r0  )9rg   r  r=   dataclassesr   r   r   r   torch.nnr   activationsr   cache_utilsr	   r
   r   
generationr   modeling_layersr   modeling_outputsr   modeling_utilsr   utilsr   r   r   configuration_prophetnetr   
get_loggerrd   rv  r   r5   rI   rT   rW   rm   rq   rw   ry   rD  r   Moduler   r   r   r$  r2  rA  rk  r  r  r  r  __all__rk   r"   r!   <module>r     s   Y   !    ! C C ) 9 / - 9 9 6 
		H	%Q7" 6M. 
*? *? *?Z (?; (? (?V 
#=; #= #=L 
+= += +=\ ! ! !8(-R\\ (-Vv2")) v2rBII .n/299 n/b	&7 &RD7 DN 
c
1 c

c
L 
jG1 jG
jGZ	 ~
/ ~
 ~
B 
h:)BO h:
h:V 
u5 u
up-8 -.r"   