
"""Ministral model configuration"""

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters
from ...utils import logging


logger = logging.get_logger(__name__)


class Ministral3Config(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Ministral3Model`]. It is used to instantiate a
    Ministral model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the mistralai/Ministral-3-8B-Base-2512, mistralai/Ministral-3-8B-Instruct-2512 or mistralai/Ministral-3-8B-Reasoning-2512.

    [mistralai/Ministral-3-8B-Base-2512](https://huggingface.co/mistralai/Ministral-3-8B-Base-2512)
    [mistralai/Ministral-3-8B-Instruct-2512](https://huggingface.co/mistralai/Ministral-3-8B-Instruct-2512)
    [mistralai/Ministral-3-8B-Reasoning-2512](https://huggingface.co/mistralai/Ministral-3-8B-Reasoning-2512)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 131072):
            Vocabulary size of the Ministral3 model. Defines the number of different tokens that can be represented by
            the `input_ids` passed when calling [`Ministral3Model`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimensionality of the embeddings and hidden states.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimensionality of the intermediate (feed-forward) layer.
        num_hidden_layers (`int`, *optional*, defaults to 34):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key/value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used (see the
            sanity check below the example).
        head_dim (`int`, *optional*, defaults to 128):
            The attention head dimension. If not specified, will default to `hidden_size // num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 262144):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 11):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        rope_parameters (`RopeParameters | dict[str, RopeParameters]`, *optional*, defaults to `{'type': 'yarn', 'rope_theta': 1000000.0, 'factor': 16.0, 'original_max_position_embeddings': 16384, 'beta_fast': 32.0, 'beta_slow': 1.0, 'mscale_all_dim': 1.0, 'mscale': 1.0, 'llama_4_scaling_beta': 0.1}`):
            Dictionary containing the configuration parameters for the RoPE embeddings, including optional YaRN scaling
            settings such as `factor`, `original_max_position_embeddings`, `mscale`, and `llama_4_scaling_beta`.
        sliding_window (`int`, *optional*):
            Sliding window attention window size. If `None`, full attention is used.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    Example:

    ```python
    >>> from transformers import Ministral3Config, Ministral3ForCausalLM, Mistral3Config, Mistral3ForConditionalGeneration, PixtralVisionConfig

    >>> # Initializing a Pixtral-vision config
    >>> vision_config = PixtralVisionConfig()

    >>> # Initializing a Ministral3 config
    >>> text_config = Ministral3Config()

    >>> # Initializing a Mistral3 configuration
    >>> configuration = Mistral3Config(vision_config, text_config)

    >>> # Initializing a model from the Ministral3 configuration
    >>> text_model = Ministral3ForCausalLM(text_config)

    >>> # Initializing a model from the Mistral3 configuration
    >>> model = Mistral3ForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
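
    A quick sanity check of the defaults documented above (a sketch; it relies only on attributes assigned
    directly in `Ministral3Config.__init__`):

    ```python
    >>> config = Ministral3Config()

    >>> # GQA grouping: 32 query heads share 8 key/value heads, i.e. 4 queries per KV head
    >>> config.num_attention_heads // config.num_key_value_heads
    4

    >>> # The default head_dim of 128 coincides with hidden_size // num_attention_heads (4096 // 32)
    >>> config.head_dim
    128
    ```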
    """

    model_type = "ministral3"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }
    def __init__(
        self,
        vocab_size: int | None = 131072,
        hidden_size: int | None = 4096,
        intermediate_size: int | None = 14336,
        num_hidden_layers: int | None = 34,
        num_attention_heads: int | None = 32,
        num_key_value_heads: int | None = 8,
        head_dim: int | None = 128,
        hidden_act: str | None = "silu",
        max_position_embeddings: int | None = 262144,
        initializer_range: float | None = 0.02,
        rms_norm_eps: float | None = 1e-05,
        use_cache: bool | None = True,
        pad_token_id: int | None = 11,
        bos_token_id: int | None = 1,
        eos_token_id: int | None = 2,
        tie_word_embeddings: bool | None = False,
        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
        sliding_window: int | None = None,
        attention_dropout: float | None = 0.0,
        **kwargs,
    ):
        # Default RoPE setup: YaRN scaling that stretches the original 16384-token
        # context by a factor of 16, matching max_position_embeddings (262144).
        rope_parameters = (
            rope_parameters
            if rope_parameters is not None
            else {
                "type": "yarn",
                "rope_theta": 1000000.0,
                "factor": 16.0,
                "original_max_position_embeddings": 16384,
                "beta_fast": 32.0,
                "beta_slow": 1.0,
                "mscale_all_dim": 1.0,
                "mscale": 1.0,
                "llama_4_scaling_beta": 0.1,
            }
        )

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window
        self.head_dim = head_dim if head_dim is not None else hidden_size // num_attention_heads

        # Backward compatibility: without an explicit key/value head count, fall back to MHA.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.attention_dropout = attention_dropout

        if "layer_types" in kwargs:
            logger.warning_once(
                "Detected Mistral model with layer_types. Consider using AutoModel or Ministral classes instead "
                "to enable alternating attention compatibility."
            )
        self.rope_parameters = rope_parameters

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            ignore_keys_at_rope_validation={"llama_4_scaling_beta"},
            **kwargs,
        )


__all__ = ["Ministral3Config"]
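

# Minimal usage sketch, guarded so that importing this module stays side-effect
# free. It assumes `PreTrainedConfig.__init__` leaves `rope_parameters` indexable
# as the dict built above. Run it as
# `python -m transformers.models.ministral3.configuration_ministral3` (invoking
# the file directly would fail on the relative imports).
if __name__ == "__main__":
    config = Ministral3Config()

    # YaRN stretches the 16384-token original context window by a factor of 16,
    # matching the 262144-token max_position_embeddings (16.0 * 16384 == 262144).
    print(config.max_position_embeddings)  # 262144
    print(config.rope_parameters["factor"])  # 16.0
    print(config.rope_parameters["original_max_position_embeddings"])  # 16384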