
    qi"                     T    d dl mZ ddlmZmZ  G d de      Z G d de      ZddgZy)	   )PreTrainedConfig   )CONFIG_MAPPING
AutoConfigc                   >     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )GlmAsrEncoderConfiga`  
    This is the configuration class to store the configuration of a [`GlmAsrEncoder`]. It is used to instantiate a
    glmasr audio encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the audio encoder of the glmasr
    architecture.

    e.g. [zai-org/GLM-ASR-Nano-2512](https://huggingface.co/zai-org/GLM-ASR-Nano-2512)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1280):
            Dimensionality of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 5120):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 20):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
        max_position_embeddings (`int`, *optional*, defaults to 1500):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        num_mel_bins (`int`, *optional*, defaults to 128):
            Number of mel features used per input features. Should correspond to the value used in the
            `GlmAsrProcessor` class.

    ```python
    >>> from transformers import GlmAsrEncoderConfig, GlmAsrEncoder

    >>> # Initializing a GlmAsrEncoderConfig
    >>> configuration = GlmAsrEncoderConfig()

    >>> # Initializing a GlmAsrEncoder (with random weights)
    >>> model = GlmAsrEncoder(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```glmasr_encoderc                     || _         || _        || _        || _        ||}|| _        || _        || _        ||z  | _        || _        |	| _	        |
| _
        || _        |j                  dd       t        | 8  di | y )Npartial_rotary_factorg      ? )hidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actinitializer_rangehead_dimmax_position_embeddingsrope_parametersattention_dropoutnum_mel_bins
setdefaultsuper__init__)selfr   r   r   r   r   r   r   r   r   r   r   kwargs	__class__s                a/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/glmasr/configuration_glmasr.pyr   zGlmAsrEncoderConfig.__init__O   s     '!2!2#6 &"5#6 $!2#'::'>$.!2(137"6"    )i   i          Ngelui  g{Gz?Ng           )__name__
__module____qualname____doc__
model_typer   __classcell__r   s   @r   r   r      s=    7r "J   $# #r    r   c                   `     e Zd ZdZdZeedZdddddd	d
ddg dddddZ	 	 	 	 d fd	Z xZ	S )GlmAsrConfiga  
    This is the configuration class to store the configuration of a [`GlmAsrForConditionalGeneration`]. It is used to instantiate an
    glmasr model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the glmasr-Mini-3B.

    e.g. [zai-org/GLM-ASR-Nano-2512](https://huggingface.co/zai-org/GLM-ASR-Nano-2512)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        audio_config (`Union[AutoConfig, dict]`, *optional*):
            The config object or dictionary of the audio encoder.
        text_config (`Union[AutoConfig, dict]`, *optional*):
            The config object or dictionary of the text model.
        audio_token_id (`int`, *optional*, defaults to 59260):
            The audio token index to encode the audio prompt.
        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The activation function (function or string) in the multi-modal projector.

    ```python
    >>> from transformers import GlmAsrForConditionalGeneration, GlmAsrConfig

    >>> # Initializing a glmasr configuration
    >>> configuration = GlmAsrConfig()

    >>> # Initializing a GLM-ASR-Nano-2512 model with random weights
    >>> model = GlmAsrForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```glmasr)text_configaudio_configi  i   i            i    gh㈵>T)in  iu  iw  g     @default)
rope_theta	rope_type)
vocab_sizer   r   r   r   r   r   rms_norm_eps	use_cacheeos_token_idr   c                    t        |t              r'|j                  dd      |d<   t        |d      di |}n|t        d          }|| _        t        |t              r5|j                  dd      |d<   t        |d      di i | j
                  |}n|t        d   di | j
                  }|| _        |j                  | _        |j                  | _        || _	        || _
        t        | 0  di | y )Nr)   r	   llamar   )
isinstancedictgetr   r0   _default_text_config_kwargsr/   r7   r   audio_token_idprojector_hidden_actr   r   )r   r0   r/   rA   rB   r   r   s         r   r   zGlmAsrConfig.__init__   s    lD))5)9)9,HX)YL&),|*DEUUL!)*:;=L(k4((3g(NK%(\)BC ET55EEK  (1UD4T4TUK&%00&22,$8!"6"r    )NNi|  r#   )
r%   r&   r'   r(   r)   r   sub_configsr@   r   r*   r+   s   @r   r-   r-   q   sb    B J",jIK !! #'-*1	J#  ## #r    r-   N)configuration_utilsr   autor   r   r   r-   __all__r   r    r   <module>rG      s9    4 -[#* [#|P## P#f !.
1r    