
    qi%>                     f    d dl mZ d dlmZ  G d de      Z G d de      Z G d de      Zg d	Zy
)   )PreTrainedConfig)RopeParametersc                   J     e Zd ZdZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Glm4vVisionConfiga  
    This is the configuration class to store the configuration of a [`Glm4vVisionModel`]. It is used to instantiate an Glm4vVisionModel
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
    a similar configuration to that of
    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).

    Args:
            depth (`int`, *optional*, defaults to 24):
                Number of layers (depth) in the model.
            hidden_size (`int`, *optional*, defaults to 1536):
                Dimensionality of the encoder layers and the pooler layer.
            hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
                The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
                `"relu"`, `"selu"` and `"gelu_new"` are supported.
            attention_bias (`bool`, *optional*, defaults to `False`):
                Whether to add a bias to the queries, keys and values.
            attention_dropout (`float`, *optional*, defaults to 0.0):
                Dropout probability for attention weights.
            num_heads (`<fill_type>`, *optional*, defaults to 12): <fill_docstring>
            in_channels (`<fill_type>`, *optional*, defaults to 3): <fill_docstring>
            image_size (`int` or `list[int]`, *optional*, defaults to 336):
                The size (resolution) of each image.
            patch_size (`int`, *optional*, defaults to 14):
                The size (resolution) of each patch.
            rms_norm_eps (`float`, *optional*, defaults to 1e-05):
                The epsilon used by the rms normalization layers.
            spatial_merge_size (`int`, *optional*, defaults to 2):
                The size used for merging spatial dimensions.
            temporal_patch_size (`int`, *optional*, defaults to 2):
                The size used for patches along the temporal dimension.
            out_hidden_size (`int`, *optional*, defaults to 4096):
                The output hidden size of the vision model.
            intermediate_size (`int`, *optional*, defaults to 13696):
                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
            initializer_range (`float`, *optional*, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    Example:

    ```python
    >>> from transformers import Glm4vVisionConfig, Glm4vVisionModel

    >>> # Initializing a Glm4vVisionConfig GLM-4.1V-9B style configuration
    >>> configuration = Glm4vVisionConfig()

    >>> # Initializing a model (with random weights) from the GLM-4.1V-9B configuration
    >>> model = Glm4vVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```glm4v_visionvision_configc                     t        |   di | || _        || _        || _        || _        || _        || _        |	| _        || _	        || _
        || _        || _        || _        |
| _        || _        || _        y )N )super__init__depthhidden_size
hidden_act	num_headsin_channels
image_size
patch_sizespatial_merge_sizetemporal_patch_sizeout_hidden_sizeintermediate_sizeinitializer_rangerms_norm_epsattention_biasattention_dropout)selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   kwargs	__class__s                    _/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/glm4v/configuration_glm4v.pyr   zGlm4vVisionConfig.__init__O   s    & 	"6"
&$"&$$"4#6 .!2!2(,!2    )   i   siluF           r   iP     h㈵>   r'      5  {Gz?)__name__
__module____qualname____doc__
model_typebase_config_keyr   __classcell__r   s   @r   r   r      sN    1f  J%O !#3 #3r    r   c                   $    e Zd ZdZdZdZdgZddddddd	Zd
gdgfddgdgfdgdgfdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde	dz  de	dz  de	dz  de	dz  de	dz  de	dz  de
dz  de	dz  dedz  de	dz  dedz  dedz  deee
ef   z  dz  de	dz  f fdZ xZS ) Glm4vTextConfiga  
    This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a
    GLM-4.1V model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of
    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 151552):
            Vocabulary size of the Glm4v model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Glm4vModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 13696):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.


    ```python
    >>> from transformers import Glm4vTextModel, Glm4vConfig

    >>> # Initializing a GLM-4.1V style configuration
    >>> configuration = Glm4vConfig()

    >>> # Initializing a model from the GLM-4.1V style configuration
    >>> model = Glm4vTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
glm4v_texttext_configpast_key_valuescolwiserowwisecolwise_gather_outputrowwise_split_input)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormN
vocab_sizer   r   num_hidden_layersnum_attention_headsnum_key_value_headsr   max_position_embeddingsr   r   	use_cacher   rope_parameterspad_token_idc                     || _         || _        || _        || _        || _        || _        ||}|| _        || _        |	| _        |
| _	        || _
        || _        || _        || _        t        | <  dddhi| y )Nignore_keys_at_rope_validationmrope_sectionr
   )rC   rG   r   r   rD   rE   rF   r   r   r   rH   r   rI   rJ   r   r   )r   rC   r   r   rD   rE   rF   r   rG   r   r   rH   r   rI   rJ   r   r   s                   r   r   zGlm4vTextConfig.__init__   s    $ %'>$&!2!2#6  &"5#6 $!2("!2.(T8ITVTr    )i P r(   r)   (       r'   r"   i   r*   r&   Tr#   NN)r+   r,   r-   r.   r/   r0   keys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planintstrfloatboolr   dictr   r1   r2   s   @r   r4   r4   u   s   <| J#O#4"5 &/%.%.%.%<"7 &(9:#%568IJ!"_$56 "("&(-(**,*+!'.3*.#(!%*-MQ#'&U$J&U 4Z&U :	&U
 :&U !4Z&U !4Z&U $J&U "%t&U !4<&U Dj&U $;&U !4<&U ($sN/B*CCdJ&U Dj&U &Ur    r4   c                   J     e Zd ZdZdZeedZdgZ	 	 	 	 	 	 	 	 	 d fd	Z	 xZ
S )Glm4vConfiga  
    This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a
    GLM-4.1V model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of
    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vTextConfig`):
            The config object or dictionary of the text backbone.
        vision_config (`Union[PreTrainedConfig, dict]`,  *optional*, defaults to `Glm4vVisionConfig`):
            The config object or dictionary of the vision backbone.
        image_token_id (`int`, *optional*, defaults to 151343):
            The image token index to encode the image prompt.
        video_token_id (`int`, *optional*, defaults to 151344):
            The video token index to encode the image prompt.
        image_start_token_id (`int`, *optional*, defaults to 151339):
            The image start token index to encode the start of image.
        image_end_token_id (`int`, *optional*, defaults to 151340):
            The image end token index to encode the end of image.
        video_start_token_id (`int`, *optional*, defaults to 151341):
            The video start token index to encode the start of video.
        video_end_token_id (`int`, *optional*, defaults to 151342):
            The video end token index to encode the end of video.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.

    ```python
    >>> from transformers import Glm4vForConditionalGeneration, Glm4vConfig

    >>> # Initializing a GLM-4.1V style configuration
    >>> configuration = Glm4vConfig()

    >>> # Initializing a model from the GLM-4.1V style configuration
    >>> model = Glm4vForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```glm4v)r   r6   r7   c
                    t        |t              r | j                  d   di || _        n| | j                  d          | _        t        |t              r | j                  d   di || _        n| | j                  d   di |
| _        || _        || _        || _        || _        || _	        || _
        |	| _        t        | 4  di |
 y )Nr   r6   r
   )
isinstancerW   sub_configsr   r6   image_token_idvideo_token_idvideo_start_token_idvideo_end_token_idimage_start_token_idimage_end_token_idtie_word_embeddingsr   r   )r   r6   r   r^   r_   rb   rc   r`   ra   rd   r   r   s              r   r   zGlm4vConfig.__init__  s     mT*!B!1!1/!B!S]!SD"!B!1!1/!B!DDk4(>t//>MMD >t//>HHD,,$8!"4$8!"4#6 "6"r    )	NNi/O i0O i+O i,O i-O i.O F)r+   r,   r-   r.   r/   r   r4   r]   rP   r   r1   r2   s   @r   rY   rY      sJ    )V J$5oVK#4"5 #!#!!# #r    rY   )rY   r4   r   N)configuration_utilsr   modeling_rope_utilsr   r   r4   rY   __all__r
   r    r   <module>rh      sG   ( 4 1Z3( Z3zwU& wUtO#" O#d Br    