
    qi3G                         d dl mZ d dlmZ d dlmZ ddlmZ  ej                  e	      Z
 G d de      Z G d d	e      Zdd	gZy
)   )PreTrainedConfig)RopeParameters)logging   )
AutoConfigc            ,       L    e Zd ZdZdZdZdgZddiZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd	e	dz  d
e	dz  de	dz  de	dz  de	dz  de	dz  de	dz  de	dz  de	dz  de	dz  de
dz  de	dz  dedz  de	dz  de	dz  de	dz  deeeef   z  dz  dedz  de
dz  dedz  de	dz  f* fdZ xZS )CsmDepthDecoderConfiga|  
    This is the configuration class to store the configuration of a [`CsmDepthDecoderModel`]. It is used to instantiate an CSM depth decoder
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
    a similar configuration to that of the csm-1b.

    e.g. [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        num_codebooks (`int`, *optional*, defaults to 32):
            Number of codebooks used in the underlying codec model responsible for tokenizing the audio.
        backbone_hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations of the backbone model used with this depth decoder.
        vocab_size (`int`, *optional*, defaults to 2051):
            Vocabulary size of the CsmDepthDecoder model. Defines the number of different audio tokens that can be represented by each codebook.
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 4):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 33):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 2050):
            Padding token id.
        bos_token_id (`int`, *optional*):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*):
            End of stream token id.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        head_dim (`int`, *optional*):
            The attention head dimension. If None, it will default to hidden_size // num_attention_heads

    ```python
    >>> from transformers import CsmDepthDecoder, CsmDepthDecoderConfig

    >>> # Initializing a CsmDepthDecoder
    >>> configuration = CsmDepthDecoderConfig()
    >>> model = CsmDepthDecoderModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```csm_depth_decoder_modeldepth_decoder_configpast_key_valuescodebook_size
vocab_size    ANnum_codebooksbackbone_hidden_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_heads
hidden_actmax_position_embeddingsinitializer_rangerms_norm_eps	use_cachepad_token_idbos_token_ideos_token_idrope_parametersattention_biasattention_dropoutmlp_biashead_dimc                    |j                  dd      rt        d      || _        || _        || _        || _        || _        || _        |
| _        || _	        || _
        || _        || _        ||}|| _        |	| _        || _        || _        || _        || _        || _        || _        ||n| j                  | j                  z  | _        || _        t/        | `  di | y )Ntie_word_embeddingsFzE`tie_word_embeddings=True` is not supported for CsmDepthDecoderConfig )pop
ValueErrorr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r   super__init__)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   kwargs	__class__s                          [/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/csm/configuration_csm.pyr*   zCsmDepthDecoderConfig.__init__m   s    2 ::+U3dee(((*$$8!'>$&!2!2#6  &"5#6 $!2(",!2 $,$8d>N>NRVRjRj>j."6"    )         i             r   silu!   {Gz?h㈵>TNNNNF        FN)__name__
__module____qualname____doc__
model_typebase_config_keykeys_to_ignore_at_inferenceattribute_mapdefault_thetaintfloatboolr   dictstrr*   __classcell__r-   s   @r.   r	   r	      s   IV +J,O#4"5M M %'+/!%"&(,()*+*+!'.0*.#'!%#'#'#'MQ&+*- %#-6#Tz6# "Dj6# $J	6#
 4Z6# :6# :6# !4Z6# !4Z6# $J6# "%t6# !4<6# Dj6# $;6# Dj6#  Dj!6#" Dj#6#$ ($sN/B*CCdJ%6#& t'6#( !4<)6#* ++6#, *-6# 6#r/   r	   c            :           e Zd ZdZdZdZdgZdZee	dZ
ddiZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d&d
ed	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  ded	z  deeeef   z  d	z  ded	z  ded	z  d ed	z  d!ed	z  d"ed	z  d#ed	z  d$ed	z  f8 fd%Z xZS )'	CsmConfigaW  
    This is the configuration class to store the configuration of a [`CsmForConditionalGeneration`]. It is used to instantiate an CSM
    model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the csm-1b.

    e.g. [sesame/csm-1b](https://huggingface.co/sesame/csm-1b)

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        num_codebooks (`int`, *optional*, defaults to 32):
            Number of codebooks used in the underlying codec model responsible for tokenizing the audio.
        vocab_size (`int`, *optional*, defaults to 2051):
            Vocabulary size of the Csm model. Defines the number of different audio tokens that can be represented by each codebook.
        text_vocab_size (`int`, *optional*, defaults to 128256):
            Vocabulary size of the text input for the Csm model. Defines the number of different text tokens that can be represented.
        hidden_size (`int`, *optional*, defaults to 2048):
            Dimension of the hidden representations of the backbone model.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimension of the MLP representations of the backbone model.
        num_hidden_layers (`int`, *optional*, defaults to 16):
            Number of hidden layers in the backbone model Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the backbone model Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245).
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the backbone model Transformer decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 128002):
            Padding token id.
        codebook_pad_token_id (`int`, *optional*, defaults to 2050):
            Padding token id for codebook tokens.
        codebook_eos_token_id (`int`, *optional*, defaults to 0):
            End of stream token id for codebook tokens.
        bos_token_id (`int`, *optional*, defaults to 128000):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*):
            End of stream token id.
        audio_token_id (`int`, *optional*, defaults to 128002):
            Audio token id in the text input.
        audio_eos_token_id (`int`, *optional*, defaults to 128003):
            End of stream token id for audio in the text input.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
        head_dim (`int`, *optional*):
            The attention head dimension. If None, it will default to hidden_size // num_attention_heads
        tie_codebooks_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie the codebook tokens embeddings of the backbone model to the codebook tokens embeddings of the depth decoder.
        depth_decoder_config (`CsmDepthDecoderConfig`, *optional*):
            Configuration for the depth decoder.
        codec_config (`PreTrainedConfig`, *optional*):
            Configuration for the codec.

    ```python
    >>> from transformers import CsmForConditionalGeneration, CsmConfig

    >>> # Initializing a CsmConfig
    >>> configuration = CsmConfig()

    >>> # Initializing a model
    >>> model = CsmForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```csm
csm_configr   r   )codec_configr   r   r   Nr   text_vocab_sizer   r   r   r   r   r   r   r   r   r   r   codebook_pad_token_idcodebook_eos_token_idr   r   audio_token_idaudio_eos_token_idr   r    r!   r"   r#   tie_codebooks_embeddingsr   rO   c                    |j                  dd      rt        d      |%t               | _        t        j                  d       n8t        |t              rt        di || _        nt        |t              r|| _        |0t        j                  d      | _
        t        j                  d       nBt        |t              rt        j                  di || _
        nt        |t              r|| _
        || _        || _        || _        || _        || _        || _        || _        || _        |
| _        || _        || _        || _        || _        ||}|| _        |	| _        || _        || _        || _        || _        || _        || _         ||n| j*                  | j0                  z  | _!        || _"        || _#        || _$        || _%        d| _&        tO        |   di | y )Nr%   Fz9`tie_word_embeddings=True` is not supported for CsmConfigzAdepth_decoder_config is None, using default depth decoder config.mimiz9codec_config is None, using default audio encoder config.r&   ))r'   r(   r	   r   loggerinfo
isinstancerG   r   	for_modelrO   r   rP   r   rS   rT   rQ   rR   rU   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r   r   r   r   r%   r)   r*   )r+   r   r   rP   r   r   r   r   r   r   r   r   r   r   r   rQ   rR   r   r   rS   rT   r   r    r!   r"   r#   rU   r   rO   r,   r-   s                                 r.   r*   zCsmConfig.__init__  s   @ ::+U3XYY'(=(?D%KK[\,d3(=(U@T(UD%,.CD(<D% * 4 4V <DKKSTd+ * 4 4 D| DD&67 ,D.*,"4%:"%:"(@%$'>$&!2!2#6  &"5#6 $!2(",!2 $,$8d>N>NRVRjRj>j.(((#( "6"r/   )r0   r2   i  r1   r3      r0   r5   r6   r1   r8   r9   T i      i  Nr]   i NFr:   FNTNN)r;   r<   r=   r>   r?   r@   rA   rC   r   r	   sub_configsrB   rD   rH   rE   rF   r   rG   r*   rI   rJ   s   @r.   rL   rL      sS   Wr J"O#4"5M" 5K
 	M %'!%&,"&(,(**,*+!'.2*.#'!%#),0,-#)#'%+)/MQ&+*- %#04,0$(;U#TzU# $JU# t	U#
 4ZU# :U# :U# !4ZU# !4ZU# $JU# "%tU# !4<U# DjU# $;U# DjU#   #Tz!U#"  #Tz#U#$ Dj%U#& Dj'U#( d
)U#*  $J+U#, ($sN/B*CCdJ-U#. t/U#0 !4<1U#2 +3U#4 *5U#6 #'+7U#8 #Tk9U#: Tk;U# U#r/   rL   N)configuration_utilsr   modeling_rope_utilsr   utilsr   auto.configuration_autor   
get_loggerr;   rX   r	   rL   __all__r&   r/   r.   <module>rf      sV     4 1  0 
		H	%J#, J#Z{#  {#~ r/   