"""JetMoe model configuration"""

from ...configuration_utils import PreTrainedConfig
from ...modeling_rope_utils import RopeParameters
from ...utils import logging


logger = logging.get_logger(__name__)


class JetMoeConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`JetMoeModel`]. It is used to instantiate a
    JetMoe model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a configuration of the JetMoe-4B.

    [jetmoe/jetmoe-8b](https://huggingface.co/jetmoe/jetmoe-8b)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the JetMoe model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`JetMoeModel`].
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 16):
            Number of key/value heads in each attention layer of the Transformer encoder. Because attention heads are
            routed as experts (MoA), the effective `num_attention_heads` is `num_key_value_heads * num_experts_per_tok`.
        kv_channels (`int`, *optional*, defaults to 128):
            Defines the number of channels for the key and value tensors.
        intermediate_size (`int`, *optional*, defaults to 5632):
            Dimension of the MLP representations.
        max_position_embeddings (`int`, *optional*, defaults to 4096):
            The maximum sequence length that this model might ever be used with. JetMoe's attention allows sequences
            of up to 4096 tokens.
        activation_function (`str`, *optional*, defaults to `"silu"`):
            Defines the activation function for MLP experts.
        num_local_experts (`int`, *optional*, defaults to 8):
            Defines the number of experts in the MoE and MoA.
        num_experts_per_tok (`int`, *optional*, defaults to 2):
            The number of experts each token is routed to, in both the MoE and MoA layers (see the routing example in
            the code block below).
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss.
        aux_loss_coef (`float`, *optional*, defaults to 0.01):
            The coefficient for the auxiliary loss.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether the model's input and output word embeddings should be tied.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and, optionally, scaling parameters for using RoPE with a longer
            `max_position_embeddings` (a minimal override is sketched after this list).
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        initializer_range (`float`, *optional*, defaults to 0.01):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
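
    A minimal sketch, not from the original documentation, of overriding the RoPE settings: the `rope_theta` key is
    the one this docstring requires; the `rope_type` key is assumed here from the library-wide RoPE parameter
    convention.

    ```python
    >>> from transformers import JetMoeConfig

    >>> # Hypothetical override: plain RoPE with an explicit base frequency.
    >>> configuration = JetMoeConfig(rope_parameters={"rope_type": "default", "rope_theta": 10000.0})
    ```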

    ```python
    >>> from transformers import JetMoeModel, JetMoeConfig

    >>> # Initializing a JetMoe 4B style configuration
    >>> configuration = JetMoeConfig()

    >>> # Initializing a model from the JetMoe 4B style configuration
    >>> model = JetMoeModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
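
    >>> # Illustrative sketch, not from the original example: the MoE/MoA routing knobs.
    >>> # `num_experts_per_tok` may not exceed `num_local_experts`, and the config derives
    >>> # num_attention_heads = num_key_value_heads * num_experts_per_tok (16 * 2 = 32 here).
    >>> moe_configuration = JetMoeConfig(num_local_experts=8, num_experts_per_tok=2)
    >>> moe_configuration.num_attention_heads
    32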
    ```"""

    model_type = "jetmoe"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {"head_dim": "kv_channels"}

vocab_sizehidden_sizenum_hidden_layersnum_key_value_headsintermediate_sizemax_position_embeddingsactivation_functionnum_local_expertsnum_experts_per_tokoutput_router_logitsaux_loss_coef	use_cachebos_token_ideos_token_idpad_token_idtie_word_embeddingsrope_parametersrms_norm_epsinitializer_rangeattention_dropoutc                 ~   |
|	kD  rt        d      || _        || _        || _        ||
z  | _        || _        || _        || _        || _        || _	        |	| _
        |
| _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        t/        | `  di | y )NzG`num_experts_per_tok` must be less than or equal to `num_local_experts` )
ValueErrorr   r   r   num_attention_headsr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   super__init__)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   kwargs	__class__s                          a/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/jetmoe/configuration_jetmoe.pyr%   zJetMoeConfig.__init__g   s    2 !22fgg$&!2#69L#L #6 &!2'>$#6 !2#6 $8!*"!2!2((((.#6 "6"    )i }  i            i   i   silu      F{Gz?T   r0   NTNgư>r1   g        )__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferenceattribute_mapintstrboolfloatr   dictr%   __classcell__)r(   s   @r)   r   r      s   HT J#4"5/M "'"&(**,"%(,.2*0()*+,1&*!%#$#$#'+/MQ#'*.*--2#$J2# 4Z2# :	2#
 !4Z2# 4Z2# :2# "%t2# !4Z2# :2# !4Z2# #Tk2# t|2# $;2# Dj2#  Dj!2#" Dj#2#$ "D[%2#& ($sN/B*CCdJ'2#( Dj)2#* !4<+2#, !4<-2# 2#r*   r   N)r6   configuration_utilsr   modeling_rope_utilsr   utilsr   
get_loggerr3   loggerr   __all__r!   r*   r)   <module>rF      sA    ! 3 1  
		H	%A## A#H 
r*   