
    qi!                     |    d Z ddlmZ ddlmZ ddlmZ ddlmZm	Z	  ej                  e      Z G d de      ZdgZy	)
zFuyu model configuration   )PreTrainedConfig)RopeParameters)logging   )CONFIG_MAPPING
AutoConfigc            0       `    e Zd ZdZdZdeiZdgZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde	dz  de	dz  d	e	dz  d
e	dz  de	dz  de
dz  de	dz  de	dz  de	dz  de	dz  dedz  de	dz  dedz  dedz  deee
ef   z  dz  dedz  dedz  dedz  de	dz  de	dz  de	dz  de	dz  dedz  f. fdZ xZS )
FuyuConfiga_  
    This is the configuration class to store the configuration of a [`FuyuForCausalLM`]. It is used to instantiate an
    Fuyu model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the
    [adept/fuyu-8b](https://huggingface.co/adept/fuyu-8b).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 262144):
            Vocabulary size of the Fuyu model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`FuyuForCausalLM`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 16384):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 36):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 64):
            Number of attention heads for each attention layer in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 16384):
            The maximum sequence length that this model might ever be used with.
        image_size (`int`, *optional*, defaults to 300):
            The input image size.
        patch_size (`int`, *optional*, defaults to 30):
            The input vision transformer encoding patch size.
        num_channels (`int`, *optional*, defaults to 3):
            The input image number of channels.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`. Whether to tie weight embeddings
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie input and output embeddings.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        qk_layernorm (`bool`, *optional*, defaults to `True`):
            Whether or not to normalize the Queries and Keys after projecting the hidden states
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio after applying the MLP to the hidden states.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio after computing the attention scores.
        pad_token_id (`int`, *optional*):
            The id of the *padding* token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the *beginning-of-sequence* token.
        eos_token_id (`Union[int, list[int]]`, *optional*, defaults to 2):
            The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
        image_token_id (`int`, *optional*, defaults to 71011):
            The id of the image placeholder token.
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize the `language``[`Aut`].

    ```python
    >>> from transformers import FuyuConfig

    >>> # Initializing a Fuyu fuyu-7b style configuration
    >>> configuration = FuyuConfig()
    ```fuyutext_configpast_key_valuesg     j@N
vocab_sizehidden_sizeintermediate_sizenum_hidden_layersnum_attention_heads
hidden_actmax_position_embeddings
image_size
patch_sizenum_channelsinitializer_rangelayer_norm_eps	use_cachetie_word_embeddingsrope_parametersqk_layernormhidden_dropoutattention_dropoutpad_token_idbos_token_ideos_token_idimage_token_idc                 ^   |Ji d|d|d|d|d|d|d|d|d	|d
|d|d|d|d|d|d|d|}t         j                  d       |j                  dd      }t        |   di || _        || _        || _        || _        |	| _        |
| _	        || _
        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        |j/                  dd       || _        || _        || _        || _        t9        | t  di | y )Nr   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   zEtext_config is None. initializing the text model with default values.
model_type	persimmonpartial_rotary_factorg      ? )loggerinfogetr   r   _vocab_sizer   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r#   r   
setdefaultr   r    r!   r"   super__init__)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r   kwargstext_model_type	__class__s                             ]/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/fuyu/configuration_fuyu.pyr/   zFuyuConfig.__init__d   s   6 j)+B { $%6	
 $%6 &': j $%6 !. Y "?  !. $%6   !" #K& KK_`%//,D)/:I[I%'>$$$(&!2!2#6 $!2,"(,!2,.137#6 ((("6"    )i   i    @  $   @   relu2r6   i,     r   g{Gz?gh㈵>TFNT        r;   N   r   ic N)__name__
__module____qualname____doc__r%   r   sub_configskeys_to_ignore_at_inferencedefault_thetaintstrfloatboolr   dictr/   __classcell__)r3   s   @r4   r
   r
      s   CJ J *-K#4"5M "("&(-(**,!(.3!$!##$*.%)!%+0MQ$('**-#'#$#$%*#'1K#$JK# 4ZK# :	K#
 :K# !4ZK# $JK# "%tK# $JK# $JK# DjK# !4<K# d
K# $;K# "D[K#  ($sN/B*CCdJ!K#" Tk#K#$ %K#& !4<'K#( Dj)K#* Dj+K#, Dj-K#. d
/K#0 D[1K# K#r5   r
   N)r@   configuration_utilsr   modeling_rope_utilsr   utilsr   autor   r   
get_loggerr=   r)   r
   __all__r(   r5   r4   <module>rP      sC     3 1  - 
		H	%V#! V#r .r5   