
"""T5 model configuration"""

from ...configuration_utils import PreTrainedConfig
from ...utils import logging


logger = logging.get_logger(__name__)


class T5Config(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`T5Model`]. It is used to
    instantiate a T5 model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the T5
    [google-t5/t5-small](https://huggingface.co/google-t5/t5-small) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Arguments:
        vocab_size (`int`, *optional*, defaults to 32128):
            Vocabulary size of the T5 model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`T5Model`].
        d_model (`int`, *optional*, defaults to 512):
            Dimensionality of the encoder and decoder layers (the model's hidden size).
        d_kv (`int`, *optional*, defaults to 64):
            Size of the key, query, value projections per attention head. The `inner_dim` of the projection layer will
            be defined as `num_heads * d_kv`.
        d_ff (`int`, *optional*, defaults to 2048):
            Size of the intermediate feed forward layer in each `T5Block`.
        num_layers (`int`, *optional*, defaults to 6):
            Number of hidden layers in the Transformer encoder.
        num_decoder_layers (`int`, *optional*):
            Number of hidden layers in the Transformer decoder. Will use the same value as `num_layers` if not set.
        num_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        relative_attention_num_buckets (`int`, *optional*, defaults to 32):
            The number of relative position buckets to use for each attention layer.
        relative_attention_max_distance (`int`, *optional*, defaults to 128):
            The maximum relative distance used for the bucket separation; longer relative positions all share the
            final bucket.
        dropout_rate (`float`, *optional*, defaults to 0.1):
            The ratio for all dropout layers.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        layer_norm_epsilon (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept at 1.0; values other than 1.0 are used
            internally for initialization testing).
        feed_forward_proj (`str`, *optional*, defaults to `"relu"`):
            Type of feed forward layer to be used. Should be one of `"relu"` or `"gated-gelu"`. T5v1.1 uses the
            `"gated-gelu"` feed forward projection. Original T5 uses `"relu"`.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
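
    Example (a minimal usage sketch; it relies only on `T5Config` and the `T5Model` class referenced above):

    ```python
    >>> from transformers import T5Config, T5Model

    >>> # Initializing a google-t5/t5-small style configuration
    >>> # (with the defaults, the attention inner projection is num_heads * d_kv = 8 * 64 = 512)
    >>> configuration = T5Config()

    >>> # Initializing a model (with random weights) from that configuration
    >>> model = T5Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # A T5v1.1-style variant only differs here by its gated feed-forward projection
    >>> gated_configuration = T5Config(feed_forward_proj="gated-gelu")
    ```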
    """

    model_type = "t5"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {
        "hidden_size": "d_model",
        "num_attention_heads": "num_heads",
        "num_hidden_layers": "num_layers",
        "head_dim": "d_kv",
    }

    def __init__(
        self,
        vocab_size=32128,
        d_model=512,
        d_kv=64,
        d_ff=2048,
        num_layers=6,
        num_decoder_layers=None,
        num_heads=8,
        relative_attention_num_buckets=32,
        relative_attention_max_distance=128,
        dropout_rate=0.1,
        layer_norm_epsilon=1e-6,
        initializer_factor=1.0,
        feed_forward_proj="relu",
        is_encoder_decoder=True,
        use_cache=True,
        pad_token_id=0,
        eos_token_id=1,
        classifier_dropout=0.0,
        tie_word_embeddings=True,
        is_decoder=False,
        **kwargs,
    ):
        self.is_decoder = is_decoder
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.d_kv = d_kv
        self.d_ff = d_ff
        self.num_layers = num_layers
        self.num_decoder_layers = num_decoder_layers if num_decoder_layers is not None else self.num_layers  # default = symmetry
        self.num_heads = num_heads
        self.relative_attention_num_buckets = relative_attention_num_buckets
        self.relative_attention_max_distance = relative_attention_max_distance
        self.dropout_rate = dropout_rate
        self.classifier_dropout = classifier_dropout
        self.layer_norm_epsilon = layer_norm_epsilon
        self.initializer_factor = initializer_factor
        self.feed_forward_proj = feed_forward_proj
        self.use_cache = use_cache
        self.pad_token_id = pad_token_id
        self.eos_token_id = eos_token_id

        # `feed_forward_proj` is either a plain activation name (e.g. "relu") or "gated-<ACT_FN>" (e.g. "gated-gelu")
        act_info = self.feed_forward_proj.split("-")
        self.dense_act_fn = act_info[-1]
        self.is_gated_act = act_info[0] == "gated"

        if len(act_info) > 1 and act_info[0] != "gated" or len(act_info) > 2:
            raise ValueError(
                f"`feed_forward_proj`: {feed_forward_proj} is not a valid activation function of the dense layer. "
                "Please make sure `feed_forward_proj` is of the format `gated-{ACT_FN}` or `{ACT_FN}`, e.g. "
                "'gated-gelu' or 'relu'"
            )

        # for backwards compatibility
        if feed_forward_proj == "gated-gelu":
            self.dense_act_fn = "gelu_new"

        # The original T5 rescales the decoder output by `d_model**-0.5` before the LM head, which is only
        # appropriate when the input and output embeddings are tied.
        self.scale_decoder_outputs = tie_word_embeddings
        self.tie_word_embeddings = tie_word_embeddings

        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)


__all__ = ["T5Config"]