
    qi9                     x    d dl mZ ddlmZmZ ddlmZ ddlmZ  ej                  e
      Z G d de      ZdgZy)	    )Literal   )PreTrainedConfiglayer_type_validation)RopeParameters)loggingc            F       8    e Zd ZdZdZdgZdddZ fdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d2d	edz  d
edz  dedz  dedz  dedz  de	dz  dedz  de
dz  de
dz  de
dz  dedz  dedz  dedz  dedz  dedz  dedz  dedz  de
dz  dee	   dz  deed   ef   dz  dedz  de
dz  d edz  d!e
dz  d"edz  d#ed$   d%e
dz  d&edz  d'e	dz  d(edz  d)edz  d*edz  d+edz  d,edz  fD fd-Zd3d.Z fd/Zed0        Zej*                  d1        Z xZS )4ModernBertConfigaO  
    This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate an ModernBert
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the ModernBERT-base.
    e.g. [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 50368):
            Vocabulary size of the ModernBert model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`ModernBertModel`]
        hidden_size (`int`, *optional*, defaults to 768):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 1152):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 22):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer decoder.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu"`
            if not specified.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_cutoff_factor (`float`, *optional*, defaults to 2.0):
            The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
        norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        norm_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the normalization layers.
        pad_token_id (`int`, *optional*, defaults to 50283):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 50282):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 50281):
            Beginning of stream token id.
        cls_token_id (`int`, *optional*, defaults to 50281):
            Classification token id.
        sep_token_id (`int`, *optional*, defaults to 50282):
            Separation token id.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        rope_parameters (`dict`, *optional*):
            Dictionary mapping attention patterns (`"full_attention"`, `"sliding_attention"`) to `RopeParameters`.
            Each value should be a dictionary containing `rope_type` and optional scaling parameters.
        local_attention (`int`, *optional*, defaults to 128):
            The window size for local attention.
        embedding_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the embeddings.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the MLP layers.
        mlp_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the MLP layers.
        decoder_bias (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the decoder layers.
        classifier_pooling (`str`, *optional*, defaults to `"cls"`):
            The pooling method for the classifier. Should be either `"cls"` or `"mean"`. In local attention layers, the
            CLS token doesn't attend to all tokens on long sequences.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        classifier_bias (`bool`, *optional*, defaults to `False`):
            Whether to use bias in the classifier.
        classifier_activation (`str`, *optional*, defaults to `"gelu"`):
            The activation function for the classifier.
        deterministic_flash_attn (`bool`, *optional*, defaults to `False`):
            Whether to use deterministic flash attention. If `False`, inference will be faster but not deterministic.
        sparse_prediction (`bool`, *optional*, defaults to `False`):
            Whether to use sparse prediction for the masked language model instead of returning the full dense logits.
        sparse_pred_ignore_index (`int`, *optional*, defaults to -100):
            The index to ignore for the sparse prediction.
        reference_compile (`bool`, *optional*):
            Whether to compile the layers of the model which were compiled during pretraining. If `None`, then parts of
            the model will be compiled if 1) `triton` is installed, 2) the model is not on MPS, 3) the model is not
            shared between devices, and 4) the model is not resized after initialization. If `True`, then the model may
            be faster in some scenarios. This argument is deprecated and will be removed in a future version.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings

    Examples:

    ```python
    >>> from transformers import ModernBertModel, ModernBertConfig

    >>> # Initializing a ModernBert style configuration
    >>> configuration = ModernBertConfig()

    >>> # Initializing a model from the modernbert-base style configuration
    >>> model = ModernBertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
modernbertpast_key_valuesg     Ag     @)globallocalc                 b    |dk(  r|t         j                  d       d }t        |   ||       y )Nreference_compilezThe `reference_compile` argument is deprecated and will be removed in `transformers v5.2.0`Use `torch.compile()` directly on the model instead.)loggerwarning_oncesuper__setattr__)selfnamevalue	__class__s      i/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/modernbert/configuration_modernbert.pyr   zModernBertConfig.__setattr__   s;    &&5+<G ED%(    N
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headshidden_activationmax_position_embeddingsinitializer_rangeinitializer_cutoff_factornorm_eps	norm_biaspad_token_ideos_token_idbos_token_idcls_token_idsep_token_idattention_biasattention_dropoutlayer_typesrope_parameters)full_attentionsliding_attentionlocal_attentionembedding_dropoutmlp_biasmlp_dropoutdecoder_biasclassifier_poolingclsmeanclassifier_dropoutclassifier_biasclassifier_activationdeterministic_flash_attnsparse_predictionsparse_pred_ignore_indexr   tie_word_embeddingsc#                 b   || _         || _        || _        || _        || _        |"| _        || _        || _        || _        || _	        || _
        || _        || _        |	| _        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        | | _        |!| _        | j0                  dvrtA        d| j0                   d      || _!        |#jE                  dd      | _#        | jB                  BtI        | j                        D $cg c]  }$tK        |$| jF                  z        rdnd  c}$| _!        tM        | jB                  | j                         || _'        tQ        %|   di |# y c c}$w )	Nr7   zQInvalid value for `classifier_pooling`, should be either "cls" or "mean", but is .global_attn_every_n_layersr   r0   r/    )*r&   r(   r'   r)   r*   r@   r   r!   r   r   r   r   r"   r#   r$   r%   r+   r,   r    r1   r2   r3   r4   r5   r6   r:   r;   r<   r=   r>   r?   r   
ValueErrorr-   getrC   rangeboolr   r.   r   __init__)&r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r1   r2   r3   r4   r5   r6   r:   r;   r<   r=   r>   r?   r   r@   kwargsir   s&                                        r   rI   zModernBertConfig.__init__   s   L )((((#6 $'>$&!2!2#6 !2)B& ",!2!2.!2 &("4"4.%:"(@%!2(@%!2""/9cdhd{d{c||}~  ' +1**5QST*U'# t556  (,A0O0O,O'P#Vff D 	d..0F0FG."6" s   
#F,c                    |j                  dd       }ddiddid}| j                  | j                  n|| _        |<| j                  d   j                  |       | j                  d   j                  |       | j                  j                  d      ddi| j                  d<   | j                  d   j	                  d|j                  d| j
                  d	                | j                  j                  d      ddi| j                  d<   | j                  d   j	                  d|j                  d
| j
                  d                | j                          | j                  |       |S )Nrope_scaling	rope_typedefault)r0   r/   r/   r0   
rope_thetaglobal_rope_thetar   local_rope_thetar   )ignore_keys)popr.   updaterF   
setdefaultdefault_thetastandardize_rope_paramsvalidate_rope)r   ignore_keys_at_rope_validationrJ   rM   default_rope_paramss        r   convert_rope_params_to_dictz,ModernBertConfig.convert_rope_params_to_dict   su   zz.$7
 #.y!9*I6
 8<7K7K7Wt33]p#  !1299,G  !45<<\J ##$45=6A95MD  !12-.99&**%8$:L:LX:VW	
 ##$78@9Di8PD  !4501<<&**%79K9KG9TU	

 	$$&'EFr   c                 H    t         |          }|j                  dd        |S )Nr   )r   to_dictrT   )r   outputr   s     r   r^   zModernBertConfig.to_dict  s#    "

&-r   c                      | j                   dz  S )zKHalf-window size: `local_attention` is the total window, so we divide by 2.   r1   )r   s    r   sliding_windowzModernBertConfig.sliding_window  s     ##q((r   c                     |dz  | _         y)z<Set sliding_window by updating local_attention to 2 * value.ra   Nrb   )r   r   s     r   rc   zModernBertConfig.sliding_window  s      %qyr   )"i  i   i        gelui    g{Gz?g       @gh㈵>Fik  j  i  ri   rh   F        NN   rj   Frj   Tr8   rj   Frg   FFiNT)N)__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencerW   r   intstrfloatrH   listdictr   r   rI   r\   r^   propertyrc   setter__classcell__)r   s   @r   r
   r
       s   cJ J#4"5(8<M) "'"%(,(**,(..2*.25!%!&#(#(#(#(#(&+*-(,gk&)*- %$'$(5:+.',,205)./3)-+/GY#$JY# 4ZY# :	Y#
 :Y# !4ZY# :Y# "%tY# !4<Y# $)4<Y# $,Y# $;Y# DjY# DjY# DjY#  Dj!Y#" Dj#Y#$ t%Y#& !4<'Y#( #Y%)Y#* g&KLn\]`dd+Y#, t-Y#. !4</Y#0 +1Y#2 T\3Y#4 Tk5Y#6 $M27Y#8 "DL9Y#: ;Y#<  #Tz=Y#> #'+?Y#@  $;AY#B #&*CY#D  $;EY#F "D[GY#v>
 ) ) ) )r   r
   N)typingr   configuration_utilsr   r   modeling_rope_utilsr   utilsr   
get_loggerrl   r   r
   __all__rD   r   r   <module>r      sA   ,  J 1  
		H	%z)' z)z 
r   