
    qiGZ                        d Z ddlZddlmZ ddlmZmZmZ ddlmZ	 ddl
mZ ddlmZmZmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z'  ejP                  e)      Z* G d de!      Z+ G d de%      Z, G d de       Z- G d de"      Z.e G d de             Z/e G d de$             Z0 G d de#      Z1 G d de      Z2 ed !       G d" d#e/e             Z3e G d$ d%e/             Z4 ed&!       G d' d(e/             Z5e G d) d*e/             Z6e G d+ d,e/             Z7e G d- d.e/             Z8g d/Z9y)0zPyTorch Data2VecText model.    N)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)GenerationMixin),BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuple   )RobertaClassificationHeadRobertaCrossAttentionRobertaEmbeddingsRobertaLayerRobertaLMHeadRobertaModelRobertaSelfAttention   )Data2VecTextConfigc                       e Zd Zy)Data2VecTextEmbeddingsN__name__
__module____qualname__     d/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/data2vec/modular_data2vec_text.pyr!   r!   2       r'   r!   c                       e Zd Zy)Data2VecTextSelfAttentionNr"   r&   r'   r(   r+   r+   6   r)   r'   r+   c                       e Zd Zy)Data2VecTextCrossAttentionNr"   r&   r'   r(   r-   r-   :   r)   r'   r-   c                       e Zd Zy)Data2VecTextLayerNr"   r&   r'   r(   r/   r/   >   r)   r'   r/   c                   N     e Zd ZeZdZdZddgZdZdZ	dZ
dZeeedZ fdZ xZS )Data2VecTextPreTrainedModeldata2vec_textTData2VecTextForTextEmbeddingsr/   )hidden_states
attentionscross_attentionsc                 6   t         |   |       t        |t              ryt	        j
                  |j                  t        j                  |j                  j                  d         j                  d             t	        j                  |j                         y y )N)r   r8   )super_init_weights
isinstancer!   initcopy_position_idstorcharangeshapeexpandzeros_token_type_ids)selfmodule	__class__s     r(   r:   z)Data2VecTextPreTrainedModel._init_weightsR   sm    f%f45JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 6r'   )r#   r$   r%   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr/   r+   r-   _can_record_outputsr:   __classcell__rG   s   @r(   r1   r1   B   sR    %L'&*#8:MNN"&*/6/ /r'   r1   c                       e Zd Zy)Data2VecTextModelNr"   r&   r'   r(   rT   rT   Y   s    r'   rT   c                       e Zd Zy)Data2VecTextLMHeadNr"   r&   r'   r(   rV   rV   ^   r)   r'   rV   c                       e Zd Zy)Data2VecTextClassificationHeadNr"   r&   r'   r(   rX   rX   b   r)   r'   rX   zX
    Data2VecText Model with a `language modeling` head on top for CLM fine-tuning.
    )custom_introc                        e Zd ZdddZ fdZd Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  deee	j                        dz  dedz  de	j                  dz  dee	j                  z  dee   deez  fd              Z xZS )Data2VecTextForCausalLM/data2vec_text.embeddings.word_embeddings.weightlm_head.biaszlm_head.decoder.weightzlm_head.decoder.biasc                     t         |   |       |j                  st        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzTIf you want to use `Data2VecTextLMHeadModel` as a standalone, add `is_decoder=True.`Fadd_pooling_layer
r9   __init__
is_decoderloggerwarningrT   r2   rV   lm_head	post_initrE   configrG   s     r(   rc   z Data2VecTextForCausalLM.__init__q   sM       NNqr.vO)&1 	r'   c                 .    | j                   j                  S Nrg   decoderrE   s    r(   get_output_embeddingsz-Data2VecTextForCausalLM.get_output_embeddings}       ||###r'   c                 &    || j                   _        y rl   rm   rE   new_embeddingss     r(   set_output_embeddingsz-Data2VecTextForCausalLM.set_output_embeddings       -r'   N	input_idsattention_maskrD   r>   inputs_embedsencoder_hidden_statesencoder_attention_masklabelspast_key_values	use_cachecache_positionlogits_to_keepkwargsreturnc                    |d}
 | j                   |f|||||||	|
|dd
|}|j                  }t        |t              rt	        | d      n|}| j                  |dd|ddf         }d}|* | j                  d||| j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )aA  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Data2VecTextForCausalLM, Data2VecTextConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
        >>> config = Data2VecTextConfig.from_pretrained("facebook/data2vec-text-base")
        >>> config.is_decoder = True
        >>> model = Data2VecTextForCausalLM.from_pretrained("facebook/data2vec-text-base", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```NFT)
rx   rD   r>   ry   rz   r{   r}   r~   r   return_dict)logitsr|   
vocab_size)lossr   r}   r4   r5   r6   r&   )r2   last_hidden_stater;   intslicerg   loss_functionrj   r   r
   r}   r4   r5   r6   )rE   rw   rx   rD   r>   ry   rz   r{   r|   r}   r~   r   r   r   outputsr4   slice_indicesr   r   s                      r(   forwardzData2VecTextForCausalLM.forward   s   P I@R@R@RA
))%'"7#9+)A
 A
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD0#33!//))$55
 	
r'   )NNNNNNNNNNNr   )r#   r$   r%   _tied_weights_keysrc   rp   ru   r   r   r?   
LongTensorFloatTensortupleboolTensorr   r   r   r
   r   rQ   rR   s   @r(   r[   r[   f   s    #T .

$.  .237260426:>;?*.BF!%.2-.H
##d*H
 ))D0H
 ((4/	H

 &&-H
 ((4/H
  %0047H
 !& 1 1D 8H
   4'H
 uU%6%6784?H
 $;H
 t+H
 ell*H
 +,H
 
2	2H
  H
r'   r[   c                   f    e Zd ZdddZ fdZd Zd Zee	 	 	 	 	 	 	 	 dde	j                  dz  d	e	j                  dz  d
e	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  de	j                  dz  dee   deez  fd              Z xZS )Data2VecTextForMaskedLMr\   r]   r^   c                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzsIf you want to use `Data2VecTextForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr`   rb   ri   s     r(   rc   z Data2VecTextForMaskedLM.__init__   sS     NN1
 /vO)&1 	r'   c                 .    | j                   j                  S rl   rm   ro   s    r(   rp   z-Data2VecTextForMaskedLM.get_output_embeddings   rq   r'   c                 &    || j                   _        y rl   rm   rs   s     r(   ru   z-Data2VecTextForMaskedLM.set_output_embeddings   rv   r'   Nrw   rx   rD   r>   ry   rz   r{   r|   r   r   c	                 t    | j                   |f||||||dd|	}
|
d   }| j                  |      }d}|at               }|j                  |j                        } ||j                  d| j                  j                        |j                  d            }t        |||
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        T)rx   rD   r>   ry   rz   r{   r   r   Nr8   r   r   r4   r5   )r2   rg   r   todeviceviewrj   r   r   r4   r5   )rE   rw   rx   rD   r>   ry   rz   r{   r|   r   r   sequence_outputprediction_scoresmasked_lm_lossloss_fcts                  r(   r   zData2VecTextForMaskedLM.forward   s    ( %$$$

))%'"7#9

 

 "!* LL9')HYY0778F%&7&<&<RAWAW&XZ`ZeZefhZijN$!//))	
 	
r'   )NNNNNNNN)r#   r$   r%   r   rc   rp   ru   r   r   r?   r   r   r   r   r   r   r   rQ   rR   s   @r(   r   r      s    #T .
$.  .237260426:>;?*.,
##d*,
 ))D0,
 ((4/	,

 &&-,
 ((4/,
  %0047,
 !& 1 1D 8,
   4',
 +,,
 
	,
  ,
r'   r   z
    Data2VecText Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                       e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	e
   d
eez  fd              Z xZS )%Data2VecTextForSequenceClassificationc                     t         |   |       |j                  | _        || _        t	        |d      | _        t        |      | _        | j                          y NFr`   )	r9   rc   
num_labelsrj   rT   r2   rX   
classifierrh   ri   s     r(   rc   z.Data2VecTextForSequenceClassification.__init__$  sK      ++.vO8@ 	r'   Nrw   rx   rD   r>   ry   r|   r   r   c           	          | j                   |f||||dd|}|d   }	| j                  |	      }
d}||j                  |
j                        }| j                  j
                  | j                  dk(  rd| j                  _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                  _        nd| j                  _        | j                  j
                  dk(  rIt               }| j                  dk(  r& ||
j                         |j                               }n ||
|      }n| j                  j
                  dk(  r=t               } ||
j                  d	| j                        |j                  d	            }n,| j                  j
                  dk(  rt               } ||
|      }t!        ||
|j"                  |j$                  
      S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Trx   rD   r>   ry   r   r   Nr   
regressionsingle_label_classificationmulti_label_classificationr8   r   )r2   r   r   r   rj   problem_typer   dtyper?   longr   r   squeezer   r   r   r   r4   r5   rE   rw   rx   rD   r>   ry   r|   r   r   r   r   r   r   s                r(   r   z-Data2VecTextForSequenceClassification.forward/  s   $ %$$$
))%'
 
 "!*1YYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r'   NNNNNN)r#   r$   r%   rc   r   r   r?   r   r   r   r   r   r   r   rQ   rR   s   @r(   r   r     s    	  .237260426*.:
##d*:
 ))D0:
 ((4/	:

 &&-:
 ((4/:
   4':
 +,:
 
)	):
  :
r'   r   c                       e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	e
   d
eez  fd              Z xZS )Data2VecTextForMultipleChoicec                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y )Nr   )r9   rc   rT   r2   nnDropouthidden_dropout_probdropoutLinearhidden_sizer   rh   ri   s     r(   rc   z&Data2VecTextForMultipleChoice.__init__p  sW     .v6zz&"<"<=))F$6$6: 	r'   Nrw   rD   rx   r|   r>   ry   r   r   c           	      "   ||j                   d   n|j                   d   }|!|j                  d|j                  d            nd}	|!|j                  d|j                  d            nd}
|!|j                  d|j                  d            nd}|!|j                  d|j                  d            nd}|1|j                  d|j                  d      |j                  d            nd} | j                  |	f|
|||dd|}|d   }| j	                  |      }| j                  |      }|j                  d|      }d}|.t               }|j                  |j                        } |||      }t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        Nr   r8   T)r>   rD   rx   ry   r   r   )rA   r   sizer2   r   r   r   r   r   r   r4   r5   )rE   rw   rD   rx   r|   r>   ry   r   num_choicesflat_input_idsflat_position_idsflat_token_type_idsflat_attention_maskflat_inputs_embedsr   pooled_outputr   reshaped_logitsr   r   s                       r(   r   z%Data2VecTextForMultipleChoice.forwardz  s   T -6,Aiooa(}GZGZ[\G]CLCXINN2,>?^bLXLdL--b,2C2CB2GHjnR`Rln11"n6I6I"6MNrvR`Rln11"n6I6I"6MNrv ( r=#5#5b#9=;M;Mb;QR 	 %$$$
*..,
 
  
]3/ ++b+6')HYY556FOV4D("!//))	
 	
r'   r   )r#   r$   r%   rc   r   r   r?   r   r   r   r   r   r   r   rQ   rR   s   @r(   r   r   n  s      .22637*.0426O
##d*O
 ((4/O
 ))D0	O

   4'O
 &&-O
 ((4/O
 +,O
 
*	*O
  O
r'   r   c                       e Zd Z fdZee	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	e	e
   d
eez  fd              Z xZS )"Data2VecTextForTokenClassificationc                 d   t         |   |       |j                  | _        t        |d      | _        |j
                  |j
                  n|j                  }t        j                  |      | _	        t        j                  |j                  |j                        | _        | j                          y r   )r9   rc   r   rT   r2   classifier_dropoutr   r   r   r   r   r   r   rh   )rE   rj   r   rG   s      r(   rc   z+Data2VecTextForTokenClassification.__init__  s      ++.vO)/)B)B)NF%%TZTnTn 	 zz"45))F$6$68I8IJ 	r'   Nrw   rx   rD   r>   ry   r|   r   r   c           	      ~    | j                   |f||||dd|}|d   }	| j                  |	      }	| j                  |	      }
d}|Wt               }|j	                  |
j
                        } ||
j                  d| j                        |j                  d            }t        ||
|j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Tr   r   Nr8   r   )r2   r   r   r   r   r   r   r   r   r4   r5   r   s                r(   r   z*Data2VecTextForTokenClassification.forward  s      %$$$
))%'
 
 "!*,,71')HYYv}}-FFKKDOO<fkk"oND$!//))	
 	
r'   r   )r#   r$   r%   rc   r   r   r?   r   r   r   r   r   r   r   rQ   rR   s   @r(   r   r     s      .237260426*.)
##d*)
 ))D0)
 ((4/	)

 &&-)
 ((4/)
   4')
 +,)
 
&	&)
  )
r'   r   c                   0    e Zd Z fdZee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	e
   deez  fd              Z xZS ) Data2VecTextForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y r   )
r9   rc   r   rT   r2   r   r   r   
qa_outputsrh   ri   s     r(   rc   z)Data2VecTextForQuestionAnswering.__init__  sV      ++.vO))F$6$68I8IJ 	r'   Nrw   rx   rD   r>   ry   start_positionsend_positionsr   r   c           	          | j                   |f||||dd|}	|	d   }
| j                  |
      }|j                  dd      \  }}|j                  d      j	                         }|j                  d      j	                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }t        ||||	j                  |	j                  	      S )
NTr   r   r   r8   )dim)ignore_indexr   )r   start_logits
end_logitsr4   r5   )r2   r   splitr   
contiguouslenr   clampr   r   r4   r5   )rE   rw   rx   rD   r>   ry   r   r   r   r   r   r   r   r   
total_lossignored_indexr   
start_lossend_losss                      r(   r   z(Data2VecTextForQuestionAnswering.forward  s    %$$$
))%'
 
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J+%!!//))
 	
r'   )NNNNNNN)r#   r$   r%   rc   r   r   r?   r   r   r   r   r   r   r   rQ   rR   s   @r(   r   r     s      .23726042637153
##d*3
 ))D03
 ((4/	3

 &&-3
 ((4/3
 ))D03
 ''$.3
 +,3
 
-	-3
  3
r'   r   )r[   r   r   r   r   r   rT   r1   ):__doc__r?   torch.nnr   r   r   r    r   r<   
generationr   modeling_outputsr	   r
   r   r   r   r   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   roberta.modeling_robertar   r   r   r   r   r   r   configuration_data2vec_textr   
get_loggerr#   re   r!   r+   r-   r/   r1   rT   rV   rX   r[   r   r   r   r   r   __all__r&   r'   r(   <module>r      s   "   A A & )   . & @ @ -   < 
		H	%	. 		 4 		!6 		 	 // / /, 	 	 		 		%> 	 
b
9? b

b
J I
9 I
 I
X H
,G H
H
V \
$? \
 \
~ :
)D :
 :
z @
'B @
 @
F	r'   