
    qi(                         d Z ddlmZ ddlmZ ddlmZ ddlmZ  ej                  e
      Z G d de      Z G d	 d
e      Z G d de      ZdgZy)zDBRX model configuration    )Any   )PreTrainedConfig)RopeParameters)loggingc            	       F     e Zd ZdZdZ	 	 	 d	dededz  dedef fdZ xZ	S )


class DbrxAttentionConfig(PreTrainedConfig):
    """Configuration class for Dbrx Attention.

    This is the configuration class for the [`DbrxAttention`] class. It is used to instantiate attention layers
    according to the specified arguments, defining the layers' architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        attn_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the attention layers.
        clip_qkv (`float`, *optional*):
            If set, clip the queries, keys, and values in the attention layer to this value.
        kv_n_heads (`int`, *optional*, defaults to 1):
            For grouped-query attention only, allows the user to specify the number of key/value heads.
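
    Example (a minimal sketch; the values shown are illustrative, not DBRX release defaults):

    ```python
    >>> from transformers.models.dbrx.configuration_dbrx import DbrxAttentionConfig

    >>> # Clip QKV activations at 8.0 and use 8 key/value heads (grouped-query attention).
    >>> attn_config = DbrxAttentionConfig(attn_pdrop=0.0, clip_qkv=8.0, kv_n_heads=8)
    >>> attn_config.kv_n_heads
    8
    ```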
    """

    base_config_key = "attn_config"

    def __init__(
        self,
        attn_pdrop: float = 0.0,
        clip_qkv: float | None = None,
        kv_n_heads: int = 1,
        **kwargs: Any,
    ):
        super().__init__(**kwargs)
        self.attn_pdrop = attn_pdrop
        self.clip_qkv = clip_qkv
        self.kv_n_heads = kv_n_heads


class DbrxFFNConfig(PreTrainedConfig):
    """Configuration class for Dbrx FFN.

    This is the configuration class for the [`DbrxFFN`] class. It is used to instantiate feedforward layers according
    to the specified arguments, defining the layers' architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        ffn_act_fn (`dict`, *optional*, defaults to `None`): A dict specifying activation function for the FFN.
            The dict should have a key 'name' with the value being the name of the activation function along with
            any additional keyword arguments. If `None`, then set to `{"name": "silu"}`.
        ffn_hidden_size (`int`, *optional*, defaults to 3584): The hidden size of the feedforward network.
        moe_num_experts (`int`, *optional*, defaults to 4): The number of experts in the mixture of experts layer.
        moe_top_k (`int`, *optional*, defaults to 1): The number of experts to use in the mixture of experts layer.
        moe_jitter_eps (`float`, *optional*, defaults to `None`): If not `None`, the jitter epsilon for the mixture of experts layer.
        moe_loss_weight (`float`, *optional*, defaults to 0.01): The loss weight for the mixture of experts layer.
        moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights.
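
    Example (a minimal sketch; the values shown are illustrative, not DBRX release defaults):

    ```python
    >>> from transformers.models.dbrx.configuration_dbrx import DbrxFFNConfig

    >>> # A small top-1 mixture-of-experts FFN: four experts, one active per token.
    >>> ffn_config = DbrxFFNConfig(ffn_hidden_size=1024, moe_num_experts=4, moe_top_k=1)
    >>> ffn_config.moe_top_k
    1
    ```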
    
    """

    base_config_key = "ffn_config"

    def __init__(
        self,
        ffn_act_fn: dict | None = None,
        ffn_hidden_size: int = 3584,
        moe_num_experts: int = 4,
        moe_top_k: int = 1,
        moe_jitter_eps: float | None = None,
        moe_loss_weight: float = 0.01,
        moe_normalize_expert_weights: float | None = 1.0,
        **kwargs: Any,
    ):
        super().__init__()
        if ffn_act_fn is None:
            ffn_act_fn = {"name": "silu"}
        self.ffn_act_fn = ffn_act_fn
        self.hidden_size = ffn_hidden_size
        self.moe_num_experts = moe_num_experts
        self.moe_top_k = moe_top_k
        self.moe_jitter_eps = moe_jitter_eps
        self.moe_loss_weight = moe_loss_weight
        self.moe_normalize_expert_weights = moe_normalize_expert_weights

        for k in [
            "model_type",
            "attn_implementation",
            "experts_implementation",
            "transformers_version",
            "_commit_hash",
            "torch_dtype",
            "dtype",
        ]:
            if k in kwargs:
                kwargs.pop(k)
        if len(kwargs) != 0:
            raise ValueError(f"Found unknown {kwargs=}")
 $9 $9 $9 $9 ',dl$9 $9 $9r   r#   c            %       "    e Zd ZdZdZeedZdddddZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde	d	z  de	d	z  de	d	z  de	d	z  d
e	d	z  de
d	z  de
d	z  ded	z  ded	z  ded	z  de
d	z  ded	z  deeeef   z  d	z  de	d	z  de	d	z  de	d	z  ded	z  def$ fdZ xZS )
DbrxConfiga  

    This is the configuration class to store the configuration of a [`DbrxModel`]. It is used to instantiate a Dbrx model according to the
    specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a different configuration to that of the [transformers-community/dbrx-instruct](https://huggingface.co/transformers-community/dbrx-instruct) architecture.
    Note: this link points to a re-upload; the original repository is closed.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        d_model (`int`, *optional*, defaults to 2048):
            Dimensionality of the embeddings and hidden states.
        n_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        n_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        max_seq_len (`int`, *optional*, defaults to 2048):
            The maximum sequence length of the model.
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Dbrx model. Defines the maximum number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`DbrxModel`].
        resid_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability applied to the attention output before combining with residual.
        emb_pdrop (`float`, *optional*, defaults to 0.0):
            The dropout probability for the embedding layer.
        attn_config (`dict`, *optional*):
            A dictionary used to configure the model's attention module.
        ffn_config (`dict`, *optional*):
            A dictionary used to configure the model's FFN module.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        output_router_logits (`bool`, *optional*, defaults to `False`):
            Whether or not the router logits should be returned by the model. Enabling this will also
            allow the model to output the auxiliary loss. See [here]() for more details.


    Example:
    ```python
    >>> from transformers import DbrxConfig, DbrxModel

    >>> # Initializing a Dbrx configuration
    >>> configuration = DbrxConfig(n_layers=2, d_model=256, n_heads=8, vocab_size=128)

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = DbrxModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
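
    The sub-configurations may also be passed as plain dicts (a sketch with illustrative values; the
    dicts are converted to [`DbrxAttentionConfig`] and [`DbrxFFNConfig`] instances):

    ```python
    >>> configuration = DbrxConfig(
    ...     n_layers=2,
    ...     d_model=256,
    ...     n_heads=8,
    ...     vocab_size=128,
    ...     attn_config={"kv_n_heads": 2},
    ...     ffn_config={"moe_num_experts": 4, "moe_top_k": 1},
    ... )
    >>> configuration.num_key_value_heads
    2
    ```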
    """

    model_type = "dbrx"
    sub_configs = {"attn_config": DbrxAttentionConfig, "ffn_config": DbrxFFNConfig}
    attribute_map = {
        "num_attention_heads": "n_heads",
        "hidden_size": "d_model",
        "num_hidden_layers": "n_layers",
        "max_position_embeddings": "max_seq_len",
    }

    def __init__(
        self,
        d_model: int | None = 2048,
        n_heads: int | None = 16,
        n_layers: int | None = 24,
        max_seq_len: int | None = 2048,
        vocab_size: int | None = 32000,
        resid_pdrop: float | None = 0.0,
        emb_pdrop: float | None = 0.0,
        attn_config: dict | None = None,
        ffn_config: dict | None = None,
        use_cache: bool | None = True,
        initializer_range: float | None = 0.02,
        output_router_logits: bool | None = False,
        rope_parameters: RopeParameters | dict[str, RopeParameters] | None = None,
        pad_token_id: int | None = None,
        bos_token_id: int | None = None,
        eos_token_id: int | None = None,
        tie_word_embeddings: bool | None = False,
        **kwargs: Any,
    ):
        if attn_config is None:
            self.attn_config = DbrxAttentionConfig()
        elif isinstance(attn_config, dict):
            self.attn_config = DbrxAttentionConfig(**attn_config)
        else:
            self.attn_config = attn_config

        if ffn_config is None:
            self.ffn_config = DbrxFFNConfig()
        elif isinstance(ffn_config, dict):
            self.ffn_config = DbrxFFNConfig(**ffn_config)
        else:
            self.ffn_config = ffn_config

        self.d_model = d_model
        self.n_heads = n_heads
        self.n_layers = n_layers
        self.max_seq_len = max_seq_len
        self.vocab_size = vocab_size
        self.resid_pdrop = resid_pdrop
        self.emb_pdrop = emb_pdrop
        self.use_cache = use_cache
        self.initializer_range = initializer_range
        self.output_router_logits = output_router_logits
        self.rope_parameters = rope_parameters
        self.num_key_value_heads = self.attn_config.kv_n_heads

        if tie_word_embeddings:
            raise ValueError("tie_word_embeddings is not supported for DBRX models.")

        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.tie_word_embeddings = tie_word_embeddings

        super().__init__(**kwargs)


__all__ = ["DbrxConfig"]