
    qiY                     P    d Z ddlZddlZddlmZ ddlmZmZ  G d de      Z	dgZ
y)	zXcodec model configuration    N   )PreTrainedConfig   )CONFIG_MAPPING
AutoConfigc                   D    e Zd ZdZdZeedZdg ddg dddZi Zd	d
dddgddgddgddd	dd	d	fde	e
   d	z  dedede	e
   de	e   de	e   dededed	z  de
f fdZedefd       Zedefd       Zedefd       Zedefd       Zedefd       Zedefd       Z xZS ) XcodecConfiga
  
    This is the configuration class to store the configuration of an [`XcodecModel`]. It is used to instantiate a
    Xcodec model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the
    [Manel/X-Codec](https://huggingface.co/Manel/X-Codec) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        target_bandwidths (`List[float]`, *optional*, defaults to `[0.5, 1, 1.5, 2, 4]`):
            The range of different bandwidths (in kbps) the model can encode audio with.
        sample_rate (`int`, *optional*, defaults to 16000):
            The sampling rate at which the audio waveform should be digitalized, in hertz (Hz).
        kernel_size (`int`, *optional*, defaults to 3):
            Kernel size for the initial semantic convolution.
        channel_ratios (`List[float]`, *optional*, defaults to `[1, 1]`):
            Expansion factors for the number of output channels in each semantic block.
        strides (`List[int]`, *optional*, defaults to `[1, 1]`):
            Strides for each semantic encoder block.
        block_dilations (`List[int]`, *optional*, defaults to `[1, 1]`):
            Dilation factors for the residual units in semantic blocks.
        unit_kernel_size (`int`, *optional*, defaults to 3):
            Kernel size inside each ResidualUnit in semantic blocks.
        codebook_size (`int`, *optional*, defaults to 1024):
            Number of entries in each residual quantizer's codebook.
        codebook_dim (`int`, *optional*):
            Dimensionality of each codebook vector. Defaults to sum of hidden size of acoustic and semantic models.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated normal initializer for all weight matrices.
        acoustic_model_config (`Union[Dict, DacConfig]`, *optional*):
            An instance of the configuration for the acoustic (DAC) model.
        semantic_model_config (`Union[Dict, HubertConfig, WavLMConfig]`, *optional*):
            An instance of the configuration object for the semantic (HuBERT) model.

    Example:

    ```python
    >>> from transformers import XcodecModel, XcodecConfig

    >>> # Initializing configuration
    >>> configuration = XcodecConfig()

    >>> # Initializing a model (with random weights) from the configuration
    >>> model = XcodecModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```xcodec)acoustic_model_configsemantic_model_config@   )         r   i      )encoder_hidden_sizedownsampling_ratiosdecoder_hidden_sizeupsampling_ratioshidden_sizeNi>  r      g{Gz?target_bandwidthssample_ratekernel_sizechannel_ratiosstridesblock_dilationsunit_kernel_sizecodebook_sizecodebook_diminitializer_rangec                    t        |t              r5|j                  dd      |d<   t        |d      di i | j                  |}n|t        d   di | j                  }|| _        t        |t              r5|j                  dd      |d<   t        |d      di i | j                  |}n|t        d   di | j                  }|| _        |g d}|| _        || _	        || _
        || _        || _        || _        || _        || _        |
| _        |	-| j
                  j"                  | j                  j"                  z   }	|	| _        t'        | P  di | y )N
model_typedachubert)g      ?r   g      ?r   r    )
isinstancedictgetr   %_default_acoustic_model_config_kwargsr   %_default_semantic_model_config_kwargsr   r   r   r   r   r   r   r   r   r!   r   r    super__init__)selfr   r   r   r   r   r   r   r   r    r!   r   r   kwargs	__class__s                 a/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/xcodec/configuration_xcodec.pyr-   zXcodecConfig.__init__^   s     +T22G2K2KLZ_2`!,/$23H3V$W %YT??YCXY%! #*$25$9$gD<f<f$g!%:"+T22G2K2KLZb2c!,/$23H3V$W %YT??YCXY%! #*$28$<$jt?i?i$j!%:"$ 3!2&&,. 0*!255AADD^D^DjDjjL("6"    returnc                 Z    t        j                  | j                  | j                  z        S N)mathceilr   
hop_lengthr.   s    r1   
frame_ratezXcodecConfig.frame_rate   s     yy))DOO;<<r2   c                 .    | j                   j                  S r5   )r   r   r9   s    r1   semantic_hidden_sizez!XcodecConfig.semantic_hidden_size   s    ))555r2   c                 f    t        t        j                  | j                  j                              S r5   )intnpprodr   r   r9   s    r1   r8   zXcodecConfig.hop_length   s"    277455IIJKKr2   c                 f    t        j                  t        j                  | j                              S r5   )r6   r7   log2r   r9   s    r1   codebook_nbitszXcodecConfig.codebook_nbits   s     yy4#5#5677r2   c                 \    | j                   j                  | j                  j                  z   S r5   )r   r   r   r9   s    r1   r   zXcodecConfig.hidden_size   s%    ))558R8R8^8^^^r2   c                 l    t        d| j                  d   z  | j                  | j                  z  z        S )Ni  )r>   r   r:   rC   r9   s    r1   num_quantizerszXcodecConfig.num_quantizers   s1    4$00444K^K^9^_``r2   )__name__
__module____qualname____doc__r#   r   sub_configsr*   r+   listfloatr>   r-   propertyr:   r<   r8   rC   r   rG   __classcell__)r0   s   @r1   r	   r	      s   0d J ",!+K  "  ,#)-) -/) 15 '(!fV&'V !!#'#'""2#;-2# 2# 	2#
 U2# c2# c2# 2# 2# Dj2# !2#h =C = = 6c 6 6 LC L L 8 8 8 _S _ _ a a ar2   r	   )rK   r6   numpyr?   configuration_utilsr   autor   r   r	   __all__r&   r2   r1   <module>rU      s2    !   3 -Pa# Paf 
r2   