
    qiQI                        d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZmZmZmZ ddlmZ ddlmZ ddlm Z   G d de      Z!e G d de             Z" G d de      Z# G d de      Z$e G d de"             Z% ed       G d de"e             Z& G d  d!e      Z' G d" d#e      Z( G d$ d%e      Z)g d&Z*y)'zPyTorch PLBART model.    N)nn)CrossEntropyLoss   )initialization)Cache)GenerationMixin)BaseModelOutputSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)auto_docstring   )BartClassificationHeadBartDecoderBartEncoderBartForCausalLMBartScaledWordEmbedding)'BigBirdPegasusForSequenceClassification)shift_tokens_right   )PLBartConfigc                       e Zd Zy)PLBartScaledWordEmbeddingN__name__
__module____qualname__     [/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/plbart/modular_plbart.pyr   r   ,       r   r   c                   F     e Zd ZU eed<   dZdZddgZdZdZ	dZ
 fdZ xZS )PLBartPreTrainedModelconfigmodelTPLBartDecoderLayerPLBartEncoderLayerc                     t         |   |       t        |t              r t	        j
                  |j                         y y N)super_init_weights
isinstancePLBartForConditionalGenerationinitzeros_final_logits_bias)selfmodule	__class__s     r    r+   z#PLBartPreTrainedModel._init_weights:   s2    f%f<=KK001 >r   )r   r   r   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attnr+   __classcell__r3   s   @r    r#   r#   0   s<    &*#-/CDN2 2r   r#   c                       e Zd Zy)PLBartEncoderNr   r   r   r    r>   r>   @   r!   r   r>   c                       e Zd Zy)PLBartDecoderNr   r   r   r    r@   r@   D   r!   r   r@   c                       e Zd ZdddZdef fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  dee	j                     dz  dedz  de	j                  dz  de	j                  dz  dedz  dedz  dedz  dedz  de	j                  dz  dee	j                     ez  fd       Z xZS )PLBartModelzshared.weight)zencoder.embed_tokens.weightzdecoder.embed_tokens.weightr$   c                 J   t         |   |       |j                  |j                  }}|j                  rt        j                  |j                        nd}t        ||j                  ||      | _	        t        |      | _        t        |      | _        | j                          y )Ng      ?)embed_scale)r*   __init__pad_token_id
vocab_sizescale_embeddingmathsqrtd_modelr   sharedr>   encoderr@   decoder	post_init)r1   r$   padding_idxrG   rD   r3   s        r    rE   zPLBartModel.__init__O   sz     "("5"5v7H7HZ393I3Idii/s/
FNNKepq$V,$V,r   c                     | j                   S r)   )rL   )r1   s    r    get_input_embeddingsz PLBartModel.get_input_embeddings[   s    {{r   c                 ~    || _         | j                   | j                  _        | j                   | j                  _        y r)   )rL   rM   embed_tokensrN   )r1   values     r    set_input_embeddingsz PLBartModel.set_input_embeddings^   s)    $(KK!$(KK!r   N	input_idsattention_maskdecoder_input_idsdecoder_attention_maskencoder_outputspast_key_valuesinputs_embedsdecoder_inputs_embeds	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictcache_positionreturnc                    |
|
n| j                   j                  }
||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }|"| t        || j                   j                        }|| j                  ||||
||      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }| j                  |||d   ||||	|
|||      }|s||z   S t        |j                  |j                  |j                  |j                   |j"                  |j                  |j                  |j                         S )	a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (:
            obj:*torch.LongTensor* of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior:
            generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default.
        N)rW   rX   r]   r`   ra   rb   r   r   r   )last_hidden_statehidden_states
attentions)rW   rX   encoder_hidden_statesencoder_attention_maskr\   r]   r_   r`   ra   rb   rc   )rf   r\   decoder_hidden_statesdecoder_attentionscross_attentionsencoder_last_hidden_stateri   encoder_attentions)r$   r`   ra   r_   use_return_dictr   rF   rM   r,   r	   lenrN   r   rf   r\   rg   rh   rm   )r1   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   ra   rb   rc   kwargsdecoder_outputss                   r    forwardzPLBartModel.forwardc   s   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] $)>)F 29dkk>V>V W""ll#-+"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO ,,'1"1!"4#1+//!5#) ' 
 "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r   )NNNNNNNNNNNNN)r   r   r   _tied_weights_keysr   rE   rR   rV   r   torch
LongTensorTensorlistFloatTensorr   booltupler   rt   r;   r<   s   @r    rB   rB   H   s~    (7'6

| 
0
  .226596::>(,26:>!%)-,0#'26_
##d*_
 ((4/_
 !++d2	_

 !&t 3_
 e//047_
 _
 ((4/_
  %0047_
 $;_
  $;_
 #Tk_
 D[_
 ((4/_
  
u||	1	1!_
 _
r   rB   zv
    The PLBART Model with a language modeling head. Can be used for code-to-text, text-to-code and code-to-code.
    )custom_introc            !       :    e Zd ZdZdgZddiZdef fdZ	 dded	edz  d
e	de
j                  f fdZdeddfdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                   dz  dej                   dz  dej                   dz  dej"                  dz  deej&                     dz  dedz  dej&                  dz  dej&                  dz  dej"                  dz  de	dz  de	dz  de	dz  de	dz  dej                   dz  deej"                     ez  fd       Zdej"                  fdZ xZS ) r-   r%   r0   zlm_head.weightzmodel.shared.weightr$   c                 x   t         |   |       t        |      | _        | j	                  dt        j                  d| j                  j                  j                  f             t        j                  |j                  | j                  j                  j                  d      | _        | j                          y )Nr0   r   F)bias)r*   rE   rB   r%   register_bufferrv   zerosrL   num_embeddingsr   LinearrK   lm_headrO   )r1   r$   r3   s     r    rE   z'PLBartForConditionalGeneration.__init__   s~      (
0%++q$**BSBSBbBb>c2deyy1B1B1Q1QX]^r   Nnew_num_tokenspad_to_multiple_ofmean_resizingrd   c                 z    t         |   |||      }| j                  |j                  j                  d          |S )Nr   )r*   resize_token_embeddings_resize_final_logits_biasweightshape)r1   r   r   r   new_embeddingsr3   s        r    r   z6PLBartForConditionalGeneration.resize_token_embeddings   s?     8I[]jk&&~'<'<'B'B1'EFr   c                 6   | j                   j                  d   }||k  r| j                   d d d |f   }nSt        j                  d||z
  f| j                   j                        }t        j
                  | j                   |gd      }| j                  d|       y )Nr   )device)dimr0   )r0   r   rv   r   r   catr   )r1   r   old_num_tokensnew_bias
extra_biass        r    r   z8PLBartForConditionalGeneration._resize_final_logits_bias   s    //55b9^+--a..@AHa.)H%IRVRhRhRoRopJyy$"8"8*!E1MH0(;r   rW   rX   rY   rZ   r[   r\   r]   r^   labelsr_   r`   ra   rb   rc   c                    ||n| j                   j                  }|	$|"| t        |	| j                   j                        }| j	                  |||||||||
||||      }| j                  |d         }|| j                  j                  |j                        z   }d}|	Ft               } ||j                  d| j                   j                        |	j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                   |j"                  |j$                  |j&                  	      S )a
  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (:
            obj:*torch.LongTensor* of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior:
            generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example Mask-filling:

        ```python
        >>> from transformers import AutoTokenizer, PLBartForConditionalGeneration

        >>> model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")

        >>> # en_XX is the language symbol id <LID> for English
        >>> TXT = "<s> Is 0 the <mask> Fibonacci number ? </s> en_XX"
        >>> input_ids = tokenizer([TXT], add_special_tokens=False, return_tensors="pt").input_ids

        >>> logits = model(input_ids).logits
        >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
        >>> probs = logits[0, masked_index].softmax(dim=0)
        >>> values, predictions = probs.topk(5)

        >>> tokenizer.decode(predictions).split()
        ['first', 'same', 'highest', 'result', 'number']
        ```
        N)rX   rY   r[   rZ   r\   r]   r^   r_   r`   ra   rb   rc   r   r   r   )	losslogitsr\   rk   rl   rm   rn   ri   ro   )r$   rp   r   rF   r%   r   r0   tor   r   viewrG   r
   r\   rk   rl   rm   rn   ri   ro   )r1   rW   rX   rY   rZ   r[   r\   r]   r^   r   r_   r`   ra   rb   rc   rr   outputs	lm_logitsmasked_lm_lossloss_fctoutputs                        r    rt   z&PLBartForConditionalGeneration.forward   st   D &1%<k$++B]B] (-B-J$6vt{{?W?W$X!**)/+#9+'"7/!5#)  
 LL,	 6 6 9 9):J:J KK	')H%innR9O9O&PRXR]R]^`RabN\GABK/F3A3M^%.YSYY#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r   c                 B    t        || j                  j                        S r)   )r   r$   rF   )r1   r   s     r    %prepare_decoder_input_ids_from_labelszDPLBartForConditionalGeneration.prepare_decoder_input_ids_from_labelsY  s    !&$++*B*BCCr   )NT)NNNNNNNNNNNNNN)r   r   r   r5   _keys_to_ignore_on_load_missingru   r   rE   intr{   r   	Embeddingr   r   r   rv   rw   rx   ry   rz   r   r|   r
   rt   r   r;   r<   s   @r    r-   r-      s     ':&;#/|  ae!7:TzY]	< < <  .226596::>(,26:>&*!%)-,0#'26l
##d*l
 ((4/l
 !++d2	l

 !&t 3l
 e//047l
 l
 ((4/l
  %0047l
 t#l
 $;l
  $;l
 #Tkl
 D[l
 ((4/l
" 
u||		.#l
 l
\DELL Dr   r-   c                       e Zd Zy)PLBartClassificationHeadNr   r   r   r    r   r   ]  r!   r   r   c                        e Zd Z fdZ xZS )PLBartForSequenceClassificationc                  :     t               j                  di |  y)a  
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`] or [`PLBartMultiTokenizer`] depending on the checkpoint.
            See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            PLBart uses a specific language id token as the starting token for `decoder_input_ids` generation that
            varies according to source and target language, *e.g.* 50003 for *en_XX*, and 50001 for *java*. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            For translation and summarization training, `decoder_input_ids` should be provided. If no
            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
            for denoising pre-training following the paper.
        decoder_attention_mask (:
            obj:*torch.LongTensor* of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior:
            generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r*   rt   super_kwargsr3   s    r    rt   z'PLBartForSequenceClassification.forwardb  s    4 	','r   )r   r   r   rt   r;   r<   s   @r    r   r   a  s    ( (r   r   c                   (     e Zd Ze fd       Z xZS )PLBartForCausalLMc                  :     t               j                  di |  y)aF  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, PLBartForCausalLM

        >>> tokenizer = AutoTokenizer.from_pretrained("uclanlp/plbart-base")
        >>> model = PLBartForCausalLM.from_pretrained("uclanlp/plbart-base")
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
        >>> list(logits.shape) == expected_shape
        True
        ```Nr   r   r   s    r    rt   zPLBartForCausalLM.forward  s    0 	','r   )r   r   r   r   rt   r;   r<   s   @r    r   r     s    ( (r   r   )r   r-   r   rB   r#   )+__doc__rI   rv   r   torch.nnr    r   r.   cache_utilsr   
generationr   modeling_outputsr	   r
   r   modeling_utilsr   utilsr   bart.modeling_bartr   r   r   r   r   (bigbird_pegasus.modeling_bigbird_pegasusr   mbart.modeling_mbartr   configuration_plbartr   r   r#   r>   r@   rB   r-   r   r   r   __all__r   r   r    <module>r      s       % &   ) 
 . #  _ 5 .	 7 	 2O 2 2	K 		K 	 z
' z
 z
z 
OD%:O OD
ODd	5 	(&M (<( (8r   