
    qi J                     ~   d dl Z d dl mZ d dlmZmZmZ ddlmZ ddlm	Z	m
Z
mZmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZmZmZ  G d de      Z G d de      Z G d de      Z G d de      Z  G d de      Z!e G d de              Z"e G d de              Z#e G d de              Z$g dZ%y)    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )create_bidirectional_mask)BaseModelOutputMaskedLMOutputSequenceClassifierOutputTokenClassifierOutput)RopeParameters)Unpack)auto_docstring)TransformersKwargscan_return_tuple   )LlamaConfig)LlamaAttention
LlamaModelLlamaPreTrainedModelLlamaRMSNormc                   p     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeeeef   z  dz  f fdZ xZ	S )EuroBertConfiga  
    This is the configuration class to store the configuration of a [`EuroBertModel`]. It is used to instantiate an EuroBert
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of [EuroBERT/EuroBERT-210m](https://huggingface.co/EuroBERT/EuroBERT-210m).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 128256):
            Vocabulary size of the EuroBert model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`EuroBertModel`]
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with. EuroBert supports up to 8192 tokens,
            EuroBert-pretrained up to 2048.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        bos_token_id (`int`, *optional*, defaults to 128000):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 128001):
            End of stream token id.
        pad_token_id (`int`, *optional*, defaults to 128001):
            Padding token id.
        mask_token_id (`int`, *optional*, defaults to 128002):
            Mask token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to
            understand more about it. This value is necessary to ensure exact reproducibility of the pretraining
            results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        head_dim (`int`, *optional*):
            The attention head dimension. If None, it will default to hidden_size // num_attention_heads
        classifier_pooling (`str`, *optional*, defaults to `"late"`):
            The pooling strategy to use for the classifier. Can be one of ['bos', 'mean', 'late'].

    ```python
    >>> from transformers import EuroBertModel, EuroBertConfig

    >>> # Initializing a EuroBert eurobert-base style configuration
    >>> configuration = EuroBertConfig()

    >>> # Initializing a model from the eurobert-base style configuration
    >>> model = EuroBertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```eurobertNrope_parametersc                     ||}|j                  dd        t        |   di d|d|d|d|d|d|d|d	|d
|	d|
ddd|d|d|d|d|d|d|d|d|d|| || _        || _        d| _        y )N	use_cache
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actmax_position_embeddingsinitializer_rangerms_norm_epsFbos_token_ideos_token_idpad_token_idpretraining_tptie_word_embeddingsr   attention_biasattention_dropoutmlp_biashead_dim )popsuper__init__mask_token_idclassifier_pooling	is_causal)selfr   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r5   r+   r,   r   r-   r.   r/   r0   r6   kwargs	__class__s                           _/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/eurobert/modular_eurobert.pyr4   zEuroBertConfig.__init__q   s   4 &"5

;% 	
!	
#	
 0	
 0		

 !4	
 !4	
 "	
 %<	
 0	
 &	
 	
 &	
 &	
 &	
 *	
  !4!	
" ,#	
$ *%	
& 0'	
( )	
* -	
0 +"4    )i  i   i      r=   Nsilui    g{Gz?h㈵>i   r@   i    FNFg        FNlate)
__name__
__module____qualname____doc__
model_typer   dictstrr4   __classcell__r:   s   @r;   r   r      s}    N` J   $!MQ!/8$ ($sN/B*CCdJ%8 8r<   r   c                         e Zd Zd fd	Z xZS )EuroBertRMSNormc                 &    t         |   ||       y N)r3   r4   )r8   r   epsr:   s      r;   r4   zEuroBertRMSNorm.__init__   s    c*r<   )r?   )rC   rD   rE   r4   rJ   rK   s   @r;   rM   rM      s    + +r<   rM   c                   (     e Zd Zdedef fdZ xZS )EuroBertAttentionconfig	layer_idxc                 4    t         |   ||       d| _        y )NF)r3   r4   r7   )r8   rS   rT   r:   s      r;   r4   zEuroBertAttention.__init__   s    +r<   )rC   rD   rE   r   intr4   rJ   rK   s   @r;   rR   rR      s    ~ #  r<   rR   c                       e Zd Zy)EuroBertPreTrainedModelN)rC   rD   rE   r1   r<   r;   rX   rX      s    r<   rX   c                       e Zd Z	 	 	 	 d	dej                  dej
                  dz  dej                  dz  dej                  dz  dee   de	e
z  fdZy)
EuroBertModelN	input_idsattention_maskposition_idsinputs_embedsr9   returnc                    |d u |d uz  rt        d      || j                  |      }|=t        j                  |j                  d   |j
                        j                  d      }t        | j                  ||      }|}| j                  ||      }| j                  d | j                  j                   D ]  }	 |	|f|||d|} | j                  |      }t        |      S )	Nz:You must specify exactly one of input_ids or inputs_embedsrA   )devicer   )rS   r^   r\   )r]   )r\   position_embeddingsr]   )last_hidden_state)
ValueErrorembed_tokenstorcharangeshapera   	unsqueezer   rS   
rotary_emblayersr!   normr	   )
r8   r[   r\   r]   r^   r9   bidirectional_maskhidden_statesrb   encoder_layers
             r;   forwardzEuroBertModel.forward   s    -t";<YZZ *.*;*;I*FM <<(;(;A(>}G[G[\ffghiL6;;')
 &"oom,oW![[)H4;;+H+HI 	M)1$7)	
 M	 		-0+
 	
r<   )NNNN)rC   rD   rE   rf   
LongTensorTensorFloatTensorr   r   tupler	   rp   r1   r<   r;   rZ   rZ      s     '+.20426&
##&
 t+&
 &&-	&

 ((4/&
 +,&
 
	 &
r<   rZ   c                   0    e Zd ZddiZddiZddgdgfiZdef fdZee		 	 	 	 	 dd
e
j                  d	z  de
j                  d	z  de
j                  d	z  de
j                  d	z  de
j                  d	z  dee   dee
j                     ez  fd              Z xZS )EuroBertForMaskedLMzlm_head.weightzmodel.embed_tokens.weightlm_headcolwise_gather_outputrn   logitsrS   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  |j                        | _	        | j                          y rO   )r3   r4   rZ   modelr   Linearr   r   r/   rw   	post_initr8   rS   r:   s     r;   r4   zEuroBertForMaskedLM.__init__   sL     "6*
yy!3!3V5F5FX 	r<   Nr[   r\   r]   r^   labelsr9   r_   c                     | j                   d||||d|}| j                  |j                        }d}	|* | j                  d||| j                  j
                  d|}	t        |	||j                  |j                        S )a)  
        Example:

        ```python
        >>> from transformers import AutoTokenizer, EuroBertForMaskedLM

        >>> model = EuroBertForMaskedLM.from_pretrained("EuroBERT/EuroBERT-210m")
        >>> tokenizer = AutoTokenizer.from_pretrained("EuroBERT/EuroBERT-210m")

        >>> text = "The capital of France is <|mask|>."
        >>> inputs = tokenizer(text, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # To get predictions for the mask:
        >>> masked_index = inputs["input_ids"][0].tolist().index(tokenizer.mask_token_id)
        >>> predicted_token_id = outputs.logits[0, masked_index].argmax(axis=-1)
        >>> predicted_token = tokenizer.decode(predicted_token_id)
        >>> print("Predicted token:", predicted_token)
        Predicted token:  Paris
        ```)r[   r\   r]   r^   N)ry   r   r   lossry   rn   
attentionsr1   )	r{   rw   rc   loss_functionrS   r   r
   rn   r   )
r8   r[   r\   r]   r^   r   r9   outputsry   r   s
             r;   rp   zEuroBertForMaskedLM.forward   s    > $.4:: $
)%'	$

 $
 g778%4%%pVFt{{OeOepiopD!//))	
 	
r<   NNNNN)rC   rD   rE   _tied_weights_keys_tp_plan_pp_planr   r4   r   r   rf   rq   rr   rs   r   r   rt   r
   rp   rJ   rK   s   @r;   rv   rv      s    *,GH23H_-z:;H~   .2.20426*./
##d*/
 t+/
 &&-	/

 ((4//
   4'/
 +,/
 
u||	~	-/
  /
r<   rv   c                       e Zd Zdef fdZee	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ee   d
eej                     ez  fd              Z xZS )!EuroBertForSequenceClassificationrS   c                    t         |   |       |j                  | _        |j                  | _        t	        |      | _        t        j                  |j                  |j                        | _	        t        j                         | _        t        j                  |j                  | j                        | _        | j                          y rO   )r3   r4   
num_labelsr6   rZ   r{   r   r|   r   denseGELU
activation
classifierr}   r~   s     r;   r4   z*EuroBertForSequenceClassification.__init__)  s      ++"(";";"6*
YYv1163E3EF
'')))F$6$6Hr<   Nr[   r\   r]   r^   r   r9   r_   c                 b    | j                   |f|||d|}|d   }| j                  dv r| j                  dk(  r
|d d df   }	n^| j                  dk(  rO||j                  d      }	n:||j                  d      z  j	                  d      }	|	|j	                  dd	
      z  }	| j                  	      }	| j                  |	      }	| j                  |	      }
n| j                  dk(  r| j                  |      }| j                  |      }| j                  |      }
||
j                  d      }
n:|
|j                  d      z  j	                  d      }
|
|j	                  dd	
      z  }
d }||j                  
j                        }| j                  j                  | j                  dk(  rd| j                  _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                   k(  rd| j                  _        nd| j                  _        | j                  j                  dk(  rIt#               }| j                  dk(  r& ||
j%                         |j%                               }n ||
|      }n| j                  j                  dk(  r=t'               } ||
j)                  d| j                        |j)                  d            }n,| j                  j                  dk(  rt+               } ||
|      }t-        |
|j.                  |j0                        S )Nr\   r]   r^   r   )bosmeanr   r   rA   )dimT)r   keepdimrB   
regressionsingle_label_classificationmulti_label_classificationr   )r{   r6   r   ri   sumr   r   r   tora   rS   problem_typer   dtyperf   longrV   r   squeezer   viewr   r   rn   r   )r8   r[   r\   r]   r^   r   r9   encoder_outputrc   pooled_outputry   xr   loss_fcts                 r;   rp   z)EuroBertForSequenceClassification.forward4  s    $
)%'	

 
 +1-""o5&&%/ 1!Q$ 7((F2!)$5$:$:q$:$AM%69Q9QRT9U%U$Z$Z_`$Z$aM!^%7%7At%7%LLM JJ}5M OOM:M__]3F$$.

,-A"A__Q'F%+ >#;#;B#??DDDK.,,D,AAYYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'(66%00	
 	
r<   r   )rC   rD   rE   r   r4   r   r   rf   rq   rr   rs   r   r   rt   r   rp   rJ   rK   s   @r;   r   r   '  s    	~ 	  .2.20426*.H
##d*H
 t+H
 &&-	H

 ((4/H
   4'H
 +,H
 
u||	7	7H
  H
r<   r   c                       e Zd Zdef fdZd Zd Zee	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  dee   deez  fd              Z xZS )EuroBertForTokenClassificationrS   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y rO   )
r3   r4   r   rZ   r{   r   r|   r   r   r}   r~   s     r;   r4   z'EuroBertForTokenClassification.__init__  sQ      ++"6*
))F$6$68I8IJr<   c                 .    | j                   j                  S rO   r{   re   )r8   s    r;   get_input_embeddingsz3EuroBertForTokenClassification.get_input_embeddings  s    zz&&&r<   c                 &    || j                   _        y rO   r   )r8   values     r;   set_input_embeddingsz3EuroBertForTokenClassification.set_input_embeddings  s    "'

r<   Nr[   r\   r]   r^   r   r9   r_   c                 "    | j                   |f|||d|}|d   }| j                  |      }	d}
|<t               } ||	j                  d| j                        |j                  d            }
t        |
|	|j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r   r   Nr   r   )r{   r   r   r   r   r   rn   r   )r8   r[   r\   r]   r^   r   r9   r   sequence_outputry   r   r   s               r;   rp   z&EuroBertForTokenClassification.forward  s    " $**
)%'	

 
 "!*1')HFKKDOO<fkk"oND$!//))	
 	
r<   r   )rC   rD   rE   r   r4   r   r   r   r   rf   rq   rr   rs   r   r   rt   r   rp   rJ   rK   s   @r;   r   r     s    ~ '(  .2.20426*.#
##d*#
 t+#
 &&-	#

 ((4/#
   4'#
 +,#
 
&	&#
  #
r<   r   )r   rX   rZ   rv   r   r   )&rf   r   torch.nnr   r   r   masking_utilsr   modeling_outputsr	   r
   r   r   modeling_rope_utilsr   processing_utilsr   utilsr   utils.genericr   r   llamar   llama.modeling_llamar   r   r   r   r   rM   rR   rX   rZ   rv   r   r   __all__r1   r<   r;   <module>r      s       A A 6 p p 1 & # A  a aK[ K\+l +
 	2 	'
J '
T >
1 >
 >
B V
(? V
 V
r 4
%< 4
 4
nr<   