from ...configuration_utils import PreTrainedConfig, layer_type_validation
from ...modeling_rope_utils import RopeParameters


class Cohere2Config(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Cohere2Model`]. It is used to instantiate a
    Cohere2 model according to the specified arguments, defining the model architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the [CohereForAI/c4ai-command-r-v01](https://huggingface.co/CohereForAI/c4ai-command-r-v01) model.


    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Cohere2 model. Defines the number of different tokens that can be represented by
            the `input_ids` passed when calling [`Cohere2Model`].
        hidden_size (`int`, *optional*, defaults to 8192):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 22528):
            Dimension of the MLP representations.
        logit_scale (`float`, *optional*, defaults to 0.0625):
            The scaling factor for the output logits.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise, GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, it will default to
            `num_attention_heads`. See the illustration right after this argument list.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 5):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 255001):
            End of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie the input and output word embeddings.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`. An illustrative dictionary is shown after the argument list below.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        sliding_window (`int`, *optional*, defaults to 4096):
            Size of the sliding window attention context.
        layer_types (`list`, *optional*):
            Attention type for each layer, either `"sliding_attention"` or `"full_attention"`. If not given, a
            pattern alternating sliding-window and full attention is derived from `sliding_window_pattern`; see the
            example after the usage snippet below.
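
    For illustration, grouped-query attention is selected purely through the head-count arguments; the values below
    are arbitrary and only show the relationship (`num_key_value_heads < num_attention_heads` gives GQA, equal
    counts give MHA, and `1` gives MQA):

    ```python
    >>> from transformers import Cohere2Config

    >>> # 64 query heads sharing 8 key/value heads (GQA); illustrative values, not tuned settings.
    >>> configuration = Cohere2Config(num_attention_heads=64, num_key_value_heads=8)  # doctest: +SKIP
    ```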

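    `rope_parameters` can also be given as a plain dictionary. The snippet below is a minimal sketch assuming the
    default rotary variant; the `"rope_type"` key is shown for illustration only, and the exact set of accepted keys
    is defined by [`RopeParameters`]:

    ```python
    >>> # Arbitrary illustrative values, not recommended settings.
    >>> configuration = Cohere2Config(rope_parameters={"rope_type": "default", "rope_theta": 10000.0})  # doctest: +SKIP
    ```
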
    ```python
    >>> from transformers import Cohere2Model, Cohere2Config

    >>> # Initializing a Cohere2 model configuration
    >>> configuration = Cohere2Config()

    >>> # Initializing a model from the Cohere2 configuration
    >>> model = Cohere2Model(configuration) # doctest: +SKIP

    >>> # Accessing the model configuration
    >>> configuration = model.config # doctest: +SKIP
    ```
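
    When `layer_types` is not passed, the default pattern alternates sliding-window and full attention based on the
    `sliding_window_pattern` keyword (assumed default of 4 here), marking every fourth layer as full attention. A
    minimal sketch:

    ```python
    >>> configuration = Cohere2Config(num_hidden_layers=8)
    >>> configuration.layer_types  # doctest: +SKIP
    ['sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention', 'sliding_attention', 'sliding_attention', 'sliding_attention', 'full_attention']
    ```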
    cohere2past_key_valuescolwiserowwise)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormN
vocab_sizehidden_sizeintermediate_sizelogit_scalenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actmax_position_embeddingsinitializer_rangelayer_norm_eps	use_cachepad_token_idbos_token_ideos_token_idtie_word_embeddingsrope_parametersattention_biasattention_dropoutsliding_windowlayer_typesc                    || _         |	| _        || _        || _        || _        || _        || _        ||}|| _        || _        |
| _	        || _
        || _        || _        || _        || _        || _        ||z  | _        || _        || _        || _        || _        |j+                  dd      | _        | j                  Wt/        | dd      | _        t1        | j
                        D cg c]!  }t3        |dz   | j,                  z        rdnd# c}| _        t5        | j                  | j
                         || _        t9        | t  di | y c c}w )Nsliding_window_pattern      sliding_attentionfull_attention )r   r   r   r   r   r   r   r   r   r   r   r   r$   r%   r&   r'   head_dimr   r    r!   r"   get_sliding_window_patterngetattrrangeboolr   r#   super__init__)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   r&   r'   kwargsi	__class__s                           c/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/cohere2/configuration_cohere2.pyr6   zCohere2Config.__init__x   sn   2 %'>$&&!2!2#6  &"5#6 $!2,",!2,& $'::(((#6  (.zz2JA'N$#+249QST+UD( t556  (,QUd6R6R,R'S#Yii D 	d..0F0FG."6" s   (&E)i      i X  g      ?(   @   Nsilur<   g{Gz?gh㈵>T       i TNFg        i   N)__name__
__module____qualname____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planintfloatstrr4   r   dictlistr6   __classcell__)r:   s   @r;   r   r      s   KZ J#4"5%.%.%.%."+ )"+ &(9:#%568IJ!"_$56 "("&(-$*(**,*.!'.2*.%) $#$#$#)+/MQ&+*-%)(,-D#$JD# 4ZD# :	D#
 T\D# :D# !4ZD# !4ZD# $JD# "%tD# !4<D# d
D# :D# DjD# DjD#  Dj!D#" "D[#D#$ ($sN/B*CCdJ%D#& t'D#( !4<)D#* d
+D#, #Y%-D# D#    r   N)configuration_utilsr   r   modeling_rope_utilsr   r   __all__r.   rP   r;   <module>rT      s(   * K 1c#$ c#L 
rP   