
    qi#                    8   d dl Z d dlmZ d dlZd dlZd dlmZ d dlmc m	Z
 d dlmZ ddlmZ ddlmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$ ddl%m&Z&m'Z' ddl(m)Z) ddl*m+Z+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9 ddl:m;Z;m<Z<m=Z=m>Z> ddl?m@Z@mAZAmBZBmCZCmDZDmEZEmFZFmGZGmHZHmIZI ddlJmKZK ddlLmMZMmNZN  e1j                  eP      ZQ G d de      ZR G d de      ZS G d  d!e      ZT G d" d#e<      ZU G d$ d%eD      ZV G d& d'e@      ZW G d( d)eA      ZX G d* d+ej                        ZZ G d, d-ej                        Z[ G d. d/eH      Z\ G d0 d1eI      Z] G d2 d3e=      Z^d4 Z_dOd5Z` G d6 d7ej                        Za G d8 d9e;      Zb G d: d;e      Zc G d< d=eE      Zd G d> d?eF      Ze G d@ dAee      Zf G dB dCeG      Zg G dD dEeK      Zh G dF dGeB      Zi G dH dIeC      Zj G dJ dKeN      Zk G dL dMeM      Zlg dNZmy)P    N)Callable)	LayerNorm   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)BatchFeature)
ImageInput)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPooling)RopeParameters)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)PreTokenizedInput	TextInput)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)maybe_autocastmerge_with_config_defaults)capture_outputs)
VideoInput   )Glm4MLPGlm4RMSNormGlm4RotaryEmbeddingeager_attention_forward)
Qwen2_5_VisionPatchEmbedQwen2_5_VisionRotaryEmbedding Qwen2_5_VLCausalLMOutputWithPast"Qwen2_5_VLForConditionalGenerationQwen2_5_VLMLPQwen2_5_VLModelOutputWithPastQwen2_5_VLPreTrainedModelQwen2_5_VLTextModelQwen2_5_VLVisionAttentionQwen2_5_VLVisionBlock)Qwen2VLModel)Qwen2VLProcessorQwen2VLProcessorKwargsc                   J     e Zd ZdZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Glm4vVisionConfiga  
    This is the configuration class to store the configuration of a [`Glm4vVisionModel`]. It is used to instantiate an Glm4vVisionModel
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield
    a similar configuration to that of
    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).

    Args:
            depth (`int`, *optional*, defaults to 24):
                Number of layers (depth) in the model.
            hidden_size (`int`, *optional*, defaults to 1536):
                Dimensionality of the encoder layers and the pooler layer.
            hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
                The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
                `"relu"`, `"selu"` and `"gelu_new"` are supported.
            attention_bias (`bool`, *optional*, defaults to `False`):
                Whether to add a bias to the queries, keys and values.
            attention_dropout (`float`, *optional*, defaults to 0.0):
                Dropout probability for attention weights.
            num_heads (`<fill_type>`, *optional*, defaults to 12): <fill_docstring>
            in_channels (`<fill_type>`, *optional*, defaults to 3): <fill_docstring>
            image_size (`int` or `list[int]`, *optional*, defaults to 336):
                The size (resolution) of each image.
            patch_size (`int`, *optional*, defaults to 14):
                The size (resolution) of each patch.
            rms_norm_eps (`float`, *optional*, defaults to 1e-05):
                The epsilon used by the rms normalization layers.
            spatial_merge_size (`int`, *optional*, defaults to 2):
                The size used for merging spatial dimensions.
            temporal_patch_size (`int`, *optional*, defaults to 2):
                The size used for patches along the temporal dimension.
            out_hidden_size (`int`, *optional*, defaults to 4096):
                The output hidden size of the vision model.
            intermediate_size (`int`, *optional*, defaults to 13696):
                Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
            initializer_range (`float`, *optional*, defaults to 0.02):
                The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    Example:

    ```python
    >>> from transformers import Glm4vVisionConfig, Glm4vVisionModel

    >>> # Initializing a Glm4vVisionConfig GLM-4.1V-9B style configuration
    >>> configuration = Glm4vVisionConfig()

    >>> # Initializing a model (with random weights) from the GLM-4.1V-9B configuration
    >>> model = Glm4vVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```glm4v_visionvision_configc                     t        |   di | || _        || _        || _        || _        || _        || _        |	| _        || _	        || _
        || _        || _        || _        |
| _        || _        || _        y )N )super__init__depthhidden_size
hidden_act	num_headsin_channels
image_size
patch_sizespatial_merge_sizetemporal_patch_sizeout_hidden_sizeintermediate_sizeinitializer_rangerms_norm_epsattention_biasattention_dropout)selfr;   r<   r=   rH   rI   r>   r?   r@   rA   rG   rB   rC   rD   rE   rF   kwargs	__class__s                    Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/glm4v/modular_glm4v.pyr:   zGlm4vVisionConfig.__init__}   s    & 	"6"
&$"&$$"4#6 .!2!2(,!2    )   i   siluF           r   iP     h㈵>r!   r!      5  {Gz?)__name__
__module____qualname____doc__
model_typebase_config_keyr:   __classcell__rL   s   @rM   r4   r4   F   sN    1f  J%O !#3 #3rN   r4   c                   $    e Zd ZdZdZdZdgZddddddd	Zd
gdgfddgdgfdgdgfdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde	dz  de	dz  de	dz  de	dz  de	dz  de	dz  de
dz  de	dz  dedz  de	dz  dedz  dedz  deee
ef   z  dz  de	dz  f fdZ xZS ) Glm4vTextConfiga  
    This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a
    GLM-4.1V model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of
    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 151552):
            Vocabulary size of the Glm4v model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`Glm4vModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 13696):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 40):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 2):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 32768):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        rope_parameters (`RopeParameters`, *optional*):
            Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain
            a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE
            with longer `max_position_embeddings`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.


    ```python
    >>> from transformers import Glm4vTextModel, Glm4vConfig

    >>> # Initializing a GLM-4.1V style configuration
    >>> configuration = Glm4vConfig()

    >>> # Initializing a model from the GLM-4.1V style configuration
    >>> model = Glm4vTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
glm4v_texttext_configpast_key_valuescolwiserowwisecolwise_gather_outputrowwise_split_input)zlayers.*.self_attn.q_projzlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.o_projzlayers.*.mlp.gate_up_projzlayers.*.mlp.down_proj	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormN
vocab_sizer<   rE   num_hidden_layersnum_attention_headsnum_key_value_headsr=   max_position_embeddingsrF   rG   	use_cacherI   rope_parameterspad_token_idc                     || _         || _        || _        || _        || _        || _        ||}|| _        || _        |	| _        |
| _	        || _
        || _        || _        || _        t        | <  dddhi| y )Nignore_keys_at_rope_validationmrope_sectionr8   )rp   rt   r<   rE   rq   rr   rs   r=   rF   rG   ru   rI   rv   rw   r9   r:   )rJ   rp   r<   rE   rq   rr   rs   r=   rt   rF   rG   ru   rI   rv   rw   rK   rL   s                   rM   r:   zGlm4vTextConfig.__init__   s    $ %'>$&!2!2#6  &"5#6 $!2("!2.(T8ITVTrN   )i P rU   rV   (       r!   rP   i   rW   rT   TrQ   NN)rX   rY   rZ   r[   r\   r]   keys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planintstrfloatboolr   dictr:   r^   r_   s   @rM   ra   ra      s   <| J#O#4"5 &/%.%.%.%<"7 &(9:#%568IJ!"_$56 "("&(-(**,*+!'.3*.#(!%*-MQ#'&U$J&U 4Z&U :	&U
 :&U !4Z&U !4Z&U $J&U "%t&U !4<&U Dj&U $;&U !4<&U ($sN/B*CCdJ&U Dj&U &UrN   ra   c                   J     e Zd ZdZdZeedZdgZ	 	 	 	 	 	 	 	 	 d fd	Z	 xZ
S )Glm4vConfiga  
    This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a
    GLM-4.1V model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of
    GLM-4.1V-9B-Thinking [THUDM/GLM-4.1V-9B-Thinking](https://huggingface.co/THUDM/GLM-4.1V-9B-Thinking).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vTextConfig`):
            The config object or dictionary of the text backbone.
        vision_config (`Union[PreTrainedConfig, dict]`,  *optional*, defaults to `Glm4vVisionConfig`):
            The config object or dictionary of the vision backbone.
        image_token_id (`int`, *optional*, defaults to 151343):
            The image token index to encode the image prompt.
        video_token_id (`int`, *optional*, defaults to 151344):
            The video token index to encode the image prompt.
        image_start_token_id (`int`, *optional*, defaults to 151339):
            The image start token index to encode the start of image.
        image_end_token_id (`int`, *optional*, defaults to 151340):
            The image end token index to encode the end of image.
        video_start_token_id (`int`, *optional*, defaults to 151341):
            The video start token index to encode the start of video.
        video_end_token_id (`int`, *optional*, defaults to 151342):
            The video end token index to encode the end of video.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.

    ```python
    >>> from transformers import Glm4vForConditionalGeneration, Glm4vConfig

    >>> # Initializing a GLM-4.1V style configuration
    >>> configuration = Glm4vConfig()

    >>> # Initializing a model from the GLM-4.1V style configuration
    >>> model = Glm4vForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```glm4v)r6   rc   rd   c
                    t        |t              r | j                  d   di || _        n| | j                  d          | _        t        |t              r | j                  d   di || _        n| | j                  d   di |
| _        || _        || _        || _        || _        || _	        || _
        |	| _        t        | 4  di |
 y )Nr6   rc   r8   )
isinstancer   sub_configsr6   rc   image_token_idvideo_token_idvideo_start_token_idvideo_end_token_idimage_start_token_idimage_end_token_idtie_word_embeddingsr9   r:   )rJ   rc   r6   r   r   r   r   r   r   r   rK   rL   s              rM   r:   zGlm4vConfig.__init__M  s     mT*!B!1!1/!B!S]!SD"!B!1!1/!B!DDk4(>t//>MMD >t//>HHD,,$8!"4$8!"4#6 "6"rN   )	NNi/O i0O i+O i,O i-O i.O F)rX   rY   rZ   r[   r\   r4   ra   r   r}   r:   r^   r_   s   @rM   r   r     sJ    )V J$5oVK#4"5 #!#!!# #rN   r   c                       e Zd Zy)Glm4vRMSNormNrX   rY   rZ   r8   rN   rM   r   r   p      rN   r   c                   &     e Zd Zddef fdZ xZS )Glm4VisionMlpbiasc                 H    t         |   ||       |j                  | _        y N)r9   r:   rD   rE   )rJ   configr   rL   s      rM   r:   zGlm4VisionMlp.__init__u  s     &!'!7!7rN   F)rX   rY   rZ   r   r:   r^   r_   s   @rM   r   r   t  s    8T 8 8rN   r   c                       e Zd ZdeddfdZy)Glm4vVisionPatchEmbedr   returnNc                 t   t         j                  j                  |        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        | j                  | j                  | j                  g}t        j                  | j
                  | j                  ||      | _	        y )N)kernel_sizestride)
nnModuler:   rA   rC   r?   r<   	embed_dimConv3dproj)rJ   r   r   s      rM   r:   zGlm4vVisionPatchEmbed.__init__{  s    
		4  ++#)#=#= !--++//$//RIId..K`kl	rN   )rX   rY   rZ   r4   r:   r8   rN   rM   r   r   z  s    m0 mT mrN   r   c                       e Zd Zy)Glm4vVisionRotaryEmbeddingNr   r8   rN   rM   r   r     r   rN   r   c                   n     e Zd Zd
dededededdf
 fdZdej                  dej                  fd	Z	 xZ
S )Glm4vVisionPatchMergerdimcontext_dimr=   r   r   Nc                 x   t         |           t        j                  |||      | _        t        |      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _	        t        j                         | _        t        |   | _        y )Nr   )r9   r:   r   Linearr   r   post_projection_norm	gate_projup_proj	down_projGELUact1r   act_fn)rJ   r   r   r=   r   rL   s        rM   r:   zGlm4vVisionPatchMerger.__init__  s    IIc3T2	$-cN!3$?yyk=;$?GGI	Z(rN   hidden_statec                     | j                  |      }| j                  | j                  |            }| j                  | j	                  | j                  |            | j                  |      z        S r   )r   r   r   r   r   r   r   )rJ   r   s     rM   forwardzGlm4vVisionPatchMerger.forward  sY    yy.yy!:!:<!HI~~dkk$..*FG$,,WcJddeerN   r   )rX   rY   rZ   r   r   r   r:   torchTensorr   r^   r_   s   @rM   r   r     sJ    )C )c )s )$ )[_ )fELL fU\\ frN   r   c                   D     e Zd Zdef fdZdej                  fdZ xZS )Glm4vVisionEmbeddingsr   c                 f   t         |           || _        |j                  | _        |j
                  | _        |j                  | _        | j
                  | j                  z  dz  | _        | j                  | _        t        j                  | j                  | j                        | _        d| _        y )Nr!   bicubic)r9   r:   r   r<   r   r@   rA   num_patchesnum_positionsr   	Embeddingposition_embeddinginterpolated_methodrJ   r   rL   s     rM   r:   zGlm4vVisionEmbeddings.__init__  s    ++ ++ ++ OOt>1D!--"$,,t/A/A4>>"R#, rN   r   c           	      2   | j                   j                  }|j                  d   }|j                  }t	        |t
              r&t        j                  ||t        j                        }|j                  d   }	t        |	dz        }
|j                  |
|
|      j                  ddd      j                  d      j                  |t        j                        }t        j                  t!        t#        |            D cg c]  }||df   j%                  ||          c}      j                  |t        j                        }t        j                  t!        t#        |            D cg c]  }||df   j%                  ||          c}      j                  |t        j                        }|dz   |z  dz  dz
  }|dz   |z  dz  dz
  }t        j&                  ||fd      j                  d      j                  d      }t)        j*                  ||| j,                  dd	
      }|j/                  d      j/                  d      j                  dd      }|j                  |j0                        j                  |j                        }||z   }|S c c}w c c}w )a  
        Forward pass with integrated position encoding adaptation using 2D interpolation.

        Args:
            embeddings: Input embeddings tensor
            lengths (torch.Tensor): Sequence lengths for each image in the batch.
            image_shapes (torch.Tensor): Tensor of shape [batch_size, 3] representing the image shapes (t, h, w).
            h_coords (torch.Tensor): Tensor of shape [total_seq] representing the h coordinate for each patch.
            w_coords (torch.Tensor): Tensor of shape [total_seq] representing the w coordinate for each patch.

        Returns:
            torch.Tensor: Embeddings with adapted position encoding added.
           )devicedtyper   g      ?r!   r   Fborder)modealign_cornerspadding_mode)r   weightshaper   r   listr   tensorlongr   viewpermute	unsqueezetofloat32catrangelenrepeatstackFgrid_sampler   squeezer   )rJ   
embeddingslengthsimage_shapesh_coordsw_coordspos_embed_weightr<   r   orig_size_sq	orig_sizepos_embed_2ditarget_htarget_wnorm_wnorm_hgridinterpolated_embed_fp32adapted_pos_embed_fp32adapted_pos_embeds                        rM   r   zGlm4vVisionEmbeddings.forward  s^     2299&,,Q/!(( gt$ll76LG (--a0c)*	!!)YDWQ1Yq\RvU]]R3	 	 99USVW^S_M`al1a4077
Cabee f 
 99USVW^S_M`al1a4077
Cabee f 

 c>X-2Q6c>X-2Q6 {{FF+4>>qAKKAN #$--$T%=%=Uai#

 "9!@!@!C!K!KB!O!W!WXY[\!]2556F6L6LMPPQ[QbQbc  "33
3 b bs   < J' J)	rX   rY   rZ   r4   r:   r   r   r   r^   r_   s   @rM   r   r     s#    
-0 
-;PUP\P\ ;rN   r   c                   (     e Zd Zdeddf fdZ xZS )Glm4vVisionAttentionr   r   Nc                 $   t         |   |       |j                  | _        t        j                  |j
                  |j
                  dz  |j                        | _        t        j                  |j
                  |j
                  d      | _        y )Nr   r   F)	r9   r:   rI   r   r   r<   rH   qkvr   r   s     rM   r:   zGlm4vVisionAttention.__init__  si     !'!9!999V//1C1Ca1GfNcNcdIIf00&2D2D5Q	rN   )rX   rY   rZ   r4   r:   r^   r_   s   @rM   r   r     s     R0 RT R RrN   r   c                         e Zd Zd fdZ xZS )Glm4vVisionBlockc                     t         |   |       t        |j                  |j                        | _        t        |j                  |j                        | _        t        |      | _        t        |d      | _
        y )NepsFr   )r9   r:   r   r<   rG   norm1norm2r   attnr   mlpr   s     rM   r:   zGlm4vVisionBlock.__init__  s^     !&"4"4&:M:MN
!&"4"4&:M:MN
(0	 e4rN   r   N)rX   rY   rZ   r:   r^   r_   s   @rM   r   r     s    5 5rN   r   c                   2     e Zd Zddef fdZd Zd Z xZS )Glm4vTextRotaryEmbeddingr   c                 h    t         |           |j                  j                  dg d      | _        y )Nrz   )   rR   rR   )r9   r:   rv   getrz   )rJ   r   r   rL   s      rM   r:   z!Glm4vTextRotaryEmbedding.__init__  s)    #3377UrN   c                 ^   | j                   d d d d d f   j                         j                  d|j                  d   dd      }|d d d d d d d f   j                         }t	        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }| j                  || j                        }t        j                  ||fd	      }|j                         | j                  z  }|j!                         | j                  z  }	d d d        j#                  |j$                  
      	j#                  |j$                  
      fS # 1 sw Y   AxY w)Nr   r   r   mpscpuF)device_typeenabledr!   r   r   )inv_freqr   expandr   r   r   typer   r   	transposeapply_mroperz   r   r   cosattention_scalingsinr   r   )
rJ   xposition_idsinv_freq_expandedposition_ids_expandedr  freqsembr  r  s
             rM   r   z Glm4vTextRotaryEmbedding.forward  s`    !MM$a*=>DDFMMaQ]QcQcdeQfhjlmn ,Q4] ; A A C'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E$$UD,>,>?E))UEN3C'')d444C'')d444C	5 vvAGGv$cff177f&;;;	5 	5s   B!F##F,c           	          |}|j                  |d      }t        j                  t        |      D cg c]  \  }}||dz      c}}d      }|S c c}}w )Nr   r   r   )splitr   r   	enumerate)rJ   r  rz   sectionchunksr   chunkresults           rM   r  z$Glm4vTextRotaryEmbedding.apply_mrope  sQ    W"-69JKXQE!a%LKQST Ls   A
r   )rX   rY   rZ   ra   r:   r   r  r^   r_   s   @rM   r  r    s    V V< rN   r  c                 |    | ddddf   }| ddddf   }t        j                  | |fd      j                  d      S )	z*Rotates half the hidden dims of the input..r   Nr!   r   r   r   )r   r   flatten)r  x1x2s      rM   rotate_half_llmr(    sJ    	
319B	
319B;;Ryb)11"55rN   c                    |j                  |      }|j                  |      }|dd|j                  d   dz  f   j                  dd      }|dd|j                  d   dz  f   j                  dd      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }	}||z  t        |      |z  z   }
||z  t        |      |z  z   }t	        j
                  |
|gd      }
t	        j
                  ||	gd      }|
|fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Nr   r!   r   )r   r   repeat_interleaver(  r   r   )qkr  r  unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds               rM   apply_rotary_pos_embr5    sD   $ --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{u5;<Gs{u5;<G ii&)r2Gii&)r2GGrN   c                   H    e Zd ZdZddededz  f fdZ	 	 	 	 ddej                  de	ej                  ej                  f   dz  dej                  dz  d	e
dz  d
ej                  dz  dee   de	ej                  ej                  dz  e	ej                     dz  f   fdZ xZS )Glm4vTextAttentionz
    Multi-headed attention from 'Attention Is All You Need' paper.
    and "Generating Long Sequences with Sparse Transformers".
    Nr   	layer_idxc                    t         |           || _        || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        |j                  | _        | j                  | j                  z  | _	        d| _
        |j                  | _        |j                  | _        | j                  dz  | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  z  | j                  d      | _        y )NTg      r   F)r9   r:   r   r8  r<   rr   r>   head_dimrs   num_key_value_groups	is_causalrI   rv   scalingr   r   q_projk_projv_projo_projrJ   r   r8  rL   s      rM   r:   zGlm4vTextAttention.__init__H  sI   "!--33((DNN:#)#=#= $(NNd6N6N$N!!'!9!9%55}}d*ii 0 0$..4==2PW[\ii 0 0$2J2JT]]2Zaefii 0 0$2J2JT]]2Zaefii >@P@PW\]rN   rk   position_embeddingsrl   rd   cache_positionrK   r   c                 T   |j                         \  }}}	| j                  |      }
| j                  |      }| j                  |      }|
j	                  ||d| j
                        j                  dd      }
|j	                  ||d| j
                        j                  dd      }|j	                  ||d| j
                        j                  dd      }|\  }}t        |
|||      \  }
}|'|||d}|j                  ||| j                  |      \  }}t        j                  | j                  j                  t              } || |
|||f| j                  sdn| j                   | j"                  d|\  }}|j%                  ||d      j'                         }| j)                  |      }||fS )Nr   r   r!   )r  r  rD  rQ   )dropoutr=  )sizer>  r?  r@  r   r:  r  r5  updater8  r   get_interfacer   _attn_implementationr%   trainingrI   r=  reshape
contiguousrA  )rJ   rk   rC  rl   rd   rD  rK   bszq_len_query_states
key_statesvalue_statesr  r  cache_kwargsattention_interfaceattn_outputattn_weightss                      rM   r   zGlm4vTextAttention.forward\  s    &**,UA{{=1[[/
{{=1#((eRGQQRSUVW__S%T]]CMMaQRS
#((eRGQQRSUVW&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ "))#ub9DDFkk+.L((rN   r   NNNN)rX   rY   rZ   r[   ra   r   r:   r   r   tupler   
LongTensorr   r   r   r^   r_   s   @rM   r7  r7  B  s    
^ ^3: ^. IM.2(,26+)||+) #5<<#=>E+) t+	+)
 +) ((4/+) -.+) 
u||U\\D0%2E2LL	M+)rN   r7  c                       e Zd Zy)Glm4vTextMLPNr   r8   rN   rM   r\  r\    r   rN   r\  c                   d    e Zd Zdedef fdZe	 	 	 	 	 	 ddej                  de	ej                  ej                  f   dz  dej                  dz  dej                  dz  d	edz  d
edz  dej                  dz  de	ej                  e	ej                  ej                  f   dz  f   fd       Z xZS )Glm4vTextDecoderLayerr   r8  c                    t         |           |j                  | _        t        ||      | _        t        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr   )r9   r:   r<   r7  	self_attnr\  r  r   rG   input_layernormpost_attention_layernormpost_self_attn_layernormpost_mlp_layernormrB  s      rM   r:   zGlm4vTextDecoderLayer.__init__  s    !--+FI>'+F,>,>FDWDWX(4V5G5GVM`M`(a%(4V5G5GVM`M`(a%".v/A/AvGZGZ"[rN   Nrk   rC  rl   r  rd   ru   rD  r   c                    |}	| j                  |      } | j                  d|||||||d|\  }}
| j                  |      }|	|z   }|}	| j                  |      }| j	                  |      }| j                  |      }|	|z   }|S )N)rk   rC  rl   r  rd   ru   rD  r8   )ra  r`  rc  rb  r  rd  )rJ   rk   rC  rl   r  rd   ru   rD  rK   residualrP  s              rM   r   zGlm4vTextDecoderLayer.forward  s     !,,]; *4>> 	
' 3)%+)	
 	
q 55mD =0 !55mD///> =0rN   )NNNNFN)rX   rY   rZ   ra   r   r:   r   r   r   rY  rZ  r   r   FloatTensorr   r^   r_   s   @rM   r^  r^    s    \ \3 \  IM.204(,!&26%||% #5<<#=>E% t+	%
 &&-% % $;% ((4/% 
u  %(9(95;L;L(L"MPT"TT	U% %rN   r^  c                       e Zd Zy)Glm4vModelOutputWithPastNr   r8   rN   rM   ri  ri    r   rN   ri  c                   $    e Zd ZddgZeedZd Zy)Glm4vPreTrainedModelr^  r   rk   
attentionsc                 .   t        j                  | |       t        |t              rod|j                  t        j                  d|j                  dt
        j                        |j                  z  z  z  }t        j                  |j                  |       y y )Ng      ?r   r!   r  )r   _init_weightsr   r   thetar   aranger   r   initcopy_r  )rJ   moduler  s      rM   ro  z"Glm4vPreTrainedModel._init_weights  sm    %%dF3f89fllu||Avzz1TYT_T_/`cicmcm/mnoHJJv1 :rN   N)rX   rY   rZ   _no_split_modulesr^  r7  _can_record_outputsro  r8   rN   rM   rk  rk    s!    02DE.(
2rN   rk  c                        e Zd ZU eed<   dZdgZeedZ	d fdZ
d Zeeedej                   d	ej                   d
ee   deez  fd                     Z xZS )Glm4vVisionModelr   )imagevideor   rl  r   c                 F   t         |   |       |j                  | _        |j                  | _        t	        |      | _        t        |      | _        |j                  |j                  z  }t        |dz        | _        t        j                  t        |j                        D cg c]  }t!        |       c}      | _        t%        |j&                  |j(                  |j*                        | _        t/        |j                  |j0                        | _        t        j4                  |j                  |j&                  |j                  |j                        | _        t/        |j                  |j0                        | _        d| _        | j=                          y c c}w )Nr!   )r   r   r=   r   )r?   out_channelsr   r   F)r9   r:   rB   rA   r   r   r   patch_embedr<   r>   r   rotary_pos_embr   
ModuleListr   r;   r   blocksr   rD   rE   r=   mergerr   rG   post_conv_layernormConv2d
downsamplepost_layernormgradient_checkpointing	post_init)rJ   r   r:  rP  rL   s       rM   r:   zGlm4vVisionModel.__init__  sA    "(";"; ++/708%%)9)998QGmmuV\\GZ$[!%5f%=$[\,&&F4L4LY_YjYj
 $00B0BH[H[#\ ))**//11,,	
 +6+=+=6CVCVW&+# %\s   %Fc                    g }|D ]s  \  }}}t        j                  |      j                  d      j                  d|      }|j	                  || j
                  z  | j
                  || j
                  z  | j
                        }|j                  dddd      }|j                         }t        j                  |      j                  d      j                  |d      }|j	                  || j
                  z  | j
                  || j
                  z  | j
                        }|j                  dddd      }|j                         }|j                  t        j                  ||gd      j                  |d             v t        j                  |d      }|d d dd f   j                         }| j                  |      }	|	|   j                  d      }
|
|fS )Nr   r   r   r!   r   r   )r   rq  r   r  rL  rB   r   r%  appendr   r   r   maxr~  )rJ   grid_thwpos_idsthwhpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullr~  s              rM   rot_pos_embzGlm4vVisionModel.rot_pos_emb  s    	SGAq!||A003::2qAH''T,,,''T,,,''	H  ''1a3H'')H||A003::1bAH''T,,,''T,,,''	H  ''1a3H'')HNN5;;(';DKKAqQR)	S* ))G+ AB++-"11-@,W5==a@w&&rN   rk   r  rK   c           	      n   | j                  |      }| j                  |      }| j                  |      \  }}t        j                  ||fd      }|j                         |j                         f}t        j                  |dddf   |dddf   z  |dddf         j                  dt        j                  j                         r|j                  nt        j                        }t        j                  |dd	      }|dd |dd z
  j                         }	| j!                  ||	||dddf   j#                  |j$                        |dddf   j#                  |j$                              }| j&                  D ]  }
 |
|f||d
|} | j)                  |      }|j+                  d| j,                  | j,                  |j.                  d         }|j1                  dddd      }| j3                  |      j+                  d| j4                  j6                        }| j9                  |      }t;        ||      S )a\  
        hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
            The final hidden states of the model.
        grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
            The temporal, height and width of feature shape of each image in LLM.

        Returns:
            `torch.Tensor`: hidden_states.
        r   r   Nr   r!   r   )r   r   )r   r   )value)
cu_seqlensrC  r   )last_hidden_statepooler_output)r}  r  r  r   r   r  r  r*  cumsumjit
is_tracingr   int32r   padtolistr   r   r   r  r  r   rB   r   r   r  r   rD   r  r   )rJ   rk   r  rK   r~  image_type_idsr  rC  r  seqlensblkmerged_hidden_statess               rM   r   zGlm4vVisionModel.forward  s&    ((700?)-)9)9()C&ii8bA"wwy#'')4,,Xad^hq!tn-LhWXZ[W[n]dd
 %*II$8$8$:(.. e 

 UU:vQ7
ab>JsO3;;=1a4 ##M$8$891a4 ##M$8$89
 ;; 	C%$7 	M	 ++M:%**'')@)@-BUBUVXBY
 &--aAq96;;B@[@[\#{{=9)+.
 	
rN   r  )rX   rY   rZ   r4   __annotations__input_modalitiesru  r   r   rv  r:   r  r   r   r   r   r   r   r   rY  r   r   r^   r_   s   @rM   rx  rx    s    )+,)*
8':  9
"\\9
5:\\9
MSTfMg9
	+	+9
    9
rN   rx  c                       e Zd Zdef fdZeee	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	edz  d
ej                  dz  dee   deez  fd                     Z xZS )Glm4vTextModelr   c           	      .   t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t        |      | _        | `| `y c c}w )Nr   r   )r9   r:   r   r  r   rq   r^  rn   r   r<   rG   ro   r  
rotary_embrJ  has_sliding_layersrB  s      rM   r:   zGlm4vTextModel.__init__V  s{     mmGLVMeMeGfg)"695g
 !!3!39L9LM	2&A%# hs   BNri   rl   r  rd   rj   ru   rD  rK   r   c           
      ^   |d u |d uz  rt        d      |r6|4t        j                  j                         st	        | j
                        }|| j                  |      }|F||j                         nd}	t        j                  |	|	|j                  d   z   |j                        }|2|j                  ddd      j                  d|j                  d   d      }n2|j                  dk(  r#|d	   j                  d|j                  d   d      }|j                  dk(  r|j                  d   d
k(  r|d   }
|dd  }nd }
| j
                  |||||
d}t        di |}|}| j                  ||      }| j                   D ]  } ||f||
|||d|}|} | j#                  |      }t%        ||      S )N:You must specify exactly one of input_ids or inputs_embedsr  r   r   r   r   r   r!   )N.   )r   rj   rl   rD  rd   r  )r  )rl   r  rd   rD  rC  )r  rd   r8   )
ValueErrorr   r  r  r	   r   rm   get_seq_lengthrq  r   r   r   r  ndimr   r  rn   ro   r   )rJ   ri   rl   r  rd   rj   ru   rD  rK   past_seen_tokenstext_position_idsmask_kwargscausal_maskrk   rC  decoder_layerlayer_outputss                    rM   r   zGlm4vTextModel.forward`  s    -t";<YZZ 09M9M9O*$++>O  --i8M!CRC^==?de"\\ "2]5H5H5K"KTaThThN
 )..q!R8??=CVCVWXCY[]^L!#'	299!\=O=OPQ=RTVWL !l&8&8&;q&@ ,Q'+L !% kk*,,.-
 )7;7%"oom,oW![[ 
	*M)*. /-$7 M *M
	* 		-0&++
 	
rN   )NNNNNNN)rX   rY   rZ   ra   r:   r   r   r   r   rZ  r   r   rg  r   r   r   rY  r   r   r^   r_   s   @rM   r  r  U  s    $ $  .2.204(,26!%26Q
##d*Q
 t+Q
 &&-	Q

 Q
 ((4/Q
 $;Q
 ((4/Q
 -.Q
 
(	(Q
    Q
rN   r  c                    |    e Zd Zi ZddgZ fdZee	 ddej                  dej                  dz  dee   deez  fd	              Z	 	 dd
ej                  dej                  dej                  dz  dej                  dz  fdZ	 	 	 dd
ej                  dej"                  dej                  dz  dej                  dz  dej$                  dz  deej$                  ej$                  f   fdZee	 	 	 	 	 	 	 	 	 	 	 	 dd
ej                  dz  dej$                  dz  dej                  dz  dedz  dej                  dz  dej$                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej"                  dz  dej                  dz  dee   deez  fd              Z xZS )
Glm4vModelr^  r   c                 l    t         |   |       t        j                  |j                        | _        y r   )r9   r:   rx  _from_configr6   visualr   s     rM   r:   zGlm4vModel.__init__  s(     &33F4H4HIrN   Npixel_values_videosvideo_grid_thwrK   r   c                 4   |j                  | j                  j                        }g }|j                         }|D ]N  \  }}}t	        j
                  d||g      j                  d      j                  |d      }	|j                  |	       P t	        j                  |d      }
 | j                  |f|
dd|}|j                  d      | j                  j                  dz  z  j                         }t	        j                  |j                  |      }||_        |S )a[  
        pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input videos.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.
        r   r   r   T)r  return_dictr   r!   )r  r  r   r  r   r   r   r   r  r   prodrB   r  r  )rJ   r  r  rK   temp_frames_hwvideo_grid_thw_listr  r  r  repeated_rowflattened_video_grid_thwvision_outputssplit_sizesvideo_embedss                 rM   get_video_featureszGlm4vModel.get_video_features  s    266t{{7H7HI,335* 	0GAq! <<Aq	2<<Q?FFq!LL!!,/	0 $)99^#C $
*BPT
X^
 &**2.$++2P2PRS2SS[[]{{>#?#?M'3$rN   ri   rj   image_featuresvideo_featuresc                 T   || | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n2|| j                  j                  k(  }|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }|At        ||   j                         |j                         k(  d| d|j                  d           |j                         }|j                  d      j                  |      j                  |j                        }|At        ||   j                         |j                         k(  d| d|j                  d           ||fS )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r   r   r   z6Image features and image tokens do not match, tokens: z, features: r   z6Video features and video tokens do not match, tokens: )get_input_embeddingsr   r   r   r   r   r   allr   sumr   	expand_asr   r   numelr   )	rJ   ri   rj   r  r  special_image_maskspecial_video_maskn_image_tokensn_video_tokenss	            rM   get_placeholder_maskzGlm4vModel.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!; "+dkk.H.H!H!*dkk.H.H!H+//1/99"=GGVYYZgZnZno%"01779^=Q=Q=SSHHXXdeseyeyz{e|d}~
 ,//1/99"=GGVYYZgZnZno%"01779^=Q=Q=SSHHXXdeseyeyz{e|d}~ "#555rN   mm_token_type_idsimage_grid_thwrl   c           	      0   | j                   j                  j                  }g }t        j                  d|j
                  d   |j
                  d   |j                  |j                        }	|t        |      nd|t        |      ndd}
t        |      D ]K  \  }}||   }|,|||   j                            }|||   j                            }g }t        j                  t        |j                               d       D ]7  \  }}t        |      }|d   d   }|d   d   dz   }|j                  |||f       9 d}d}g }|D ]  \  }}}|dk(  r^||z
  }|j                  t        j                   ||j                  	      j#                  dd      j%                  dd      |z          ||z  }j|d
k(  r%|dk(  rt'        |
|         }|dz  }|d   k\  rdn|}nt'        |
|         }|d   }| j)                  |||||j                  	      }|j                  |       |t+        |d   |d
         |z  z  } t        j,                  |d      j/                  dd      }|5|j1                  |	j                        |	dd|||   j                         f<   n"|j1                  |	j                        |	dd|f<   |j                  |j+                         dz   t3        |      z
         N t        j4                  ||j                  	      j7                  d      }|	|fS )u	  
        Calculate the 3D rope index based on image and video's sizes. The utility expects a `vision + text`
        sequence and will error out otherwise. For pure text sequence, please rely on model's auto-inferred
        position ids. In a mixed vision + text sequence, vision tokens use 3D RoPE (temporal, height, width)
        while text tokens use standard 1D RoPE.

        Example:
            Temporal patches: 3; Height patches: 2; Width patches: 2
            Each vision input results in (temporal x height × width) positions. Here: 3 x 2 × 2 = 12 positions total.

            Temporal position IDs are spaced by:
                `interval = tokens_per_second * temporal_patch_size / fps`

                If fps = 1; tokens_per_second = 25; temporal_patch_size = 2, temporal IDs increase by 50 for each temporal patch:
                `[0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]`

            Height IDs repeat per row: `[0, 0, 1, 1, ...]`
            Width IDs alternate per column: `[0, 1, 0, 1, ...]`
            Text tokens follow standard 1D RoPE and the position IDs grow consequently with a step of `1`

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
                it.
            mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`):
                Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2).
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
                The temporal, height and width of feature shape of each video in LLM.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

        Returns:
            position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
            mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
        r   r   r   r  N)r   r!   c                     | d   S )Nr   r8   )r  s    rM   <lambda>z+Glm4vModel.get_rope_index.<locals>.<lambda>O  s    `abc`d rN   r   r  r!   r   )r   r6   rB   r   zerosr   r   r   iterr  r   	itertoolsgroupbyr  r   r  rq  r   r  nextget_vision_position_idsr  r   rL  r   r   r   r   )rJ   ri   r  r  r  rl   rK   rB   mrope_position_deltasr  
grid_iters	batch_idxcurrent_input_idsinput_token_typeinput_type_groupkeygroupstart_index	end_indexcurrent_posvideo_group_indexllm_pos_ids_listmodality_type	start_idxend_idxtext_lenr  temp_merge_sizevision_position_idsllm_positionss                                 rM   get_rope_indexzGlm4vModel.get_rope_index  sj   b "[[66II "{{OOAOOA//##
 (6'AtN#t'5'AtN#t


 -6i,@ 1	[(I(0;)$5nY6O6T6T6V$W!#3N94M4R4R4T#U !'//	:J:Q:Q:S0TVde G
UU#Ahqk!"IaL1,	 ''k9(EF	G K !!5E W1y' A%&2H$++Xi6F6FGLLQPRSZZ[\^`adoo  8+K %),1'+J},E'FH)Q.)1Bhqk1QAWh)#'
=(A#B '/qkO*.*F*F#X@R[d[k[k +G +' %++,?@3x{HQK#@DV#VVK7W8 "II&6A>FFq"MM)O\O_O_`l`s`sOtQ	>)+D+I+I+KKL-:-=-=l>Q>Q-RQ	\*!(():):)<q)@3GXCY)YZc1	[d !&-B9K[K[ \ f fgh i222rN   r  rd   pixel_valuesrope_deltasrD  c           
         |du |duz  rt        d      | | j                         |      }|| j                  ||d      j                  }t	        j
                  |d      j                  |j                  |j                        }| j                  |||      \  }}|j                  ||      }|| j                  ||	d      j                  }t	        j
                  |d      j                  |j                  |j                        }| j                  |||      \  }}|j                  ||      }|| j                  |||	||||	      } | j                  dd|||||d
|}t        di |d| j                  iS )a  
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.
        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
            The rope index difference between sequence length and multimodal rope.
        Nr  T)r  r   r   )r  )r  )ri   r  r  rj   rl   rd   r  )ri   r  rl   rd   rj   rD  r  r8   )r  r  get_image_featuresr  r   r   r   r   r   r  masked_scatterr  compute_3d_position_idslanguage_modelri  r  )rJ   ri   rl   r  rd   rj   r  r  r  r  r  r  rD  rK   image_embeds
image_maskrP  r  
video_maskoutputss                       rM   r   zGlm4vModel.forward}  s   4 -t";<YZZ 7D557	BM#22<]a2bppL 99\q9<<]=Q=QS`SfSfgL 55i_k5lMJ)88\RM*223Fdh2iwwL 99\q9<<]=Q=QS`SfSfgL 55i_k5lMAz)88\RM77#--+- /"3 8 L &$%% 
%)+')
 
 ( 

((
 	
rN   r   )NNNNN)NNNNNNNNNNNN)rX   rY   rZ   _checkpoint_conversion_mappingru  r:   r   r   r   rg  rZ  r   r   rY  r   r  r  	IntTensorr   r  r   ri  r   r^   r_   s   @rM   r  r    s   %'"02DEJ  37".. ((4/ +,	
 
+	+  B 4837(6##(6 (((6 ))D0	(6
 ))D0(6\ 3726.2s3##s3 !??s3 ((4/	s3
 ((4/s3 t+s3 
u||U\\)	*s3j  .2.204(,26,08<2626/34826B
##d*B
 t+B
 &&-	B

 B
 ((4/B
 llT)B
 #..5B
 ((4/B
 ((4/B
 %%,B
 !??T1B
 ((4/B
 +,B
 
)	)B
  B
rN   r  c                       e Zd Zy)Glm4vCausalLMOutputWithPastNr   r8   rN   rM   r  r    r   rN   r  c                    b    e Zd Zi Z	 	 	 	 	 	 	 	 	 	 	 	 	 ddej
                  dz  dej                  dz  dej
                  dz  dedz  dej                  dz  dej
                  dz  dej                  dz  d	ej                  dz  d
ej
                  dz  dej
                  dz  dej                  dz  dej
                  dz  de
ej                  z  dee   deez  fdZ	 	 	 	 	 	 	 	 	 	 	 d fd	Z	 ddej
                  dz  dej                  dz  deej                  ej                  f   fdZ xZS )Glm4vForConditionalGenerationNri   rl   r  rd   rj   labelsr  r  r  r  r  rD  logits_to_keeprK   r   c                     | j                   d||||	|
||||||d|}|d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }d}|2| j                  ||| j                  j                  j                        }t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Glm4vForConditionalGeneration

        >>> model = Glm4vForConditionalGeneration.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")
        >>> processor = AutoProcessor.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")

        >>> messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            },
        ]
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
        ```)ri   r  r  r  r  r  r  rl   rd   rj   rD  r   N)logitsr  rp   )lossr  rd   rk   rm  r  r8   )modelr   r   slicelm_headloss_functionr   rc   rp   r  rd   rk   rm  r  )rJ   ri   rl   r  rd   rj   r  r  r  r  r  r  rD  r  rK   r  rk   slice_indicesr  r  s                       rM   r   z%Glm4vForConditionalGeneration.forward  s    v $** 
% 3))/%)+')
 
  
 9C>SV8W~ot4]kmA}a,?@A%%VFt{{OfOfOqOq%rD*#33!//))++
 	
rN   c                 \    t        |   |f|||||||	|
|||d|}|s|r
d |d<   d |d<   |S )N)rd   rl   rj   rD  r  r  r  r  r  ru   is_first_iterationr  r  )r9   prepare_inputs_for_generation)rJ   ri   rd   rl   rj   rD  r  ru   r  r  r  r  r  rK   model_inputsrL   s                  rM   r  z;Glm4vForConditionalGeneration.prepare_inputs_for_generation(  si    $ w<
+)')%% 3))1
 
  "i+/L(26L./rN   c                    || | j                         t        j                  | j                  j                  t        j
                  |j                              k(  d   }| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  d   }| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  d   }nK|| j                  j                  k(  }|| j                  j                  k(  }|| j                  j                  k(  }t        j                  |j                         |j                         z
  d      }|dkD  }|| z  }|j                  d      }	|j                  d      }
|	|
fS )aa  
        Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
        These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

        Returns:
            image_nums (`torch.LongTensor` of shape `(batch_size, num_images_sample)`)
            video_nums (`torch.LongTensor` of shape `(batch_size, num_videos_sample)`)
        r  ).r   r   r   r   )r  r   r   r   r   r   r   r   r   r  r   r  )rJ   ri   rj   is_imageis_video_startis_video_endvideo_levelinside_videostandalone_imagesimage_countsvideo_countss              rM   _get_image_nums_and_video_numsz<Glm4vForConditionalGeneration._get_image_nums_and_video_numsP  s   $ $.4,,.LL!A!A\i\p\pq H .4,,.LL!A!A\i\p\pq N .4,,.LL!?!?uzzZgZnZno L !DKK$D$DDH&$++*J*JJN$(F(FFL ll>#5#5#7,:J:J:L#LRST"Q %6 ),,,3%))a)0\))rN   )NNNNNNNNNNNNr   )NNNNNTNNNNFr   )rX   rY   rZ   r  r   rZ  r   r   rg  r  r   r   r   rY  r  r   r  r"  r^   r_   s   @rM   r
  r
    s   %'" .2.204(,26*.,08<26264826-.[
##d*[
 t+[
 &&-	[

 [
 ((4/[
   4'[
 llT)[
 #..5[
 ((4/[
 ((4/[
 !??T1[
 ((4/[
 ell*[
 +,[
  
,	,![
@   &V .26*##d*6* ||d*6* 
u||U\\)	*	6*rN   r
  c                   "    e Zd ZddddddidZy)Glm4vProcessorKwargsFT)paddingreturn_token_type_idsreturn_mm_token_type_idsreturn_metadata)text_kwargsvideos_kwargsN)rX   rY   rZ   	_defaultsr8   rN   rM   r$  r$    s#     %*(,

 ,T2IrN   r$  c                   z     e Zd Zd
 fd	Z	 	 	 ddedz  deez  ee   z  ee   z  dedz  de	e
   def
dZd	 Z xZS )Glm4vProcessorNc                     t         |   ||||       t        |d      sdn|j                  | _        t        |d      sdn|j                  | _        |j                  d      | _        |j                  d      | _        y )N)chat_templateimage_tokenz	<|image|>video_tokenz	<|video|>z<|begin_of_video|>z<|end_of_video|>)r9   r:   hasattrr0  r1  convert_tokens_to_idsvideo_start_idvideo_end_id)rJ   image_processor	tokenizervideo_processorr/  rK   rL   s         rM   r:   zGlm4vProcessor.__init__  sv    )_Tab.5i.O;U^UjUj.5i.O;U^UjUj'==>RS%;;<NOrN   imagestextvideosrK   r   c                 L	    | j                   t        fd| j                  j                  i|}| | j                  dd|i|d   }|d   }ni }d}|E | j
                  dd|i|d   }|j                  d      s|j                  d	      }	n|d	   }	|d
   }
ni }d}
t        |t              s|g}|j                         }|| j                  j                  dz  }d}t        t        |            D ]  }| j                  ||   v rS||   j                         |z  }||   j!                  | j                  d|z  d      ||<   |dz  }| j                  ||   v rS||   j!                  d| j                        ||<    |
| j
                  j                  dz  }d}t        t        |            D ]  }| j"                  ||   v r|
|   d   }d}	|   }|j$                  t&        j)                  d       |j$                  dn|j$                  |_        |j*                  ddd   }g }t        dt        |            D ]  }|j-                  ||           |d| }t        |      |k  r'|j-                  |r|d   nd       t        |      |k  r't        |      D ]  }||   }| j/                  |      }||z  } ||   j!                  | j"                  |d      ||<   |
|   j                         |z  |
|   d   z  }t        |      D ]:  }| j                  ||   v s||   j!                  | j                  d|z  d      ||<   < |dz  }| j"                  ||   v r||   j!                  d| j                        ||<    |d   j                  dd      }|d   j                  dd      } | j                  |fi |d   }| j1                  ||ddg       |rt3        j4                  |d         }t3        j6                  |d         }t3        j8                  || j:                  k(  d      }t3        j8                  || j<                  k(  d      } || kD  }!d||| j>                  k(  |!z  <   d||| j>                  k(  |! z  <   |jA                         |d<   tC        i ||||      S )a  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
        tokenizer_init_kwargsNr9  images_kwargsr  r;  r*  r(  video_metadatar  r!   r   z<|placeholder|>r    a  SmolVLM requires frame timestamps to construct prompts, but the `fps` of the input video could not be inferred. Probably `video_metadata` was missing from inputs and you passed pre-sampled frames. Defaulting to `fps=24`. Please provide `video_metadata` for more accurate results.rO   r   r)  return_tensorsr'  Fry  rz  )
modalitiesri   )axisr  )datatensor_typer8   )"_merge_kwargsr$  r7  init_kwargsr6  r8  r  popr   r   copy
merge_sizer   r   r0  r  replacer1  fpsloggerwarning_once
timestampsr  replace_frame_token_id_check_special_mm_tokensnparray
zeros_liker  r4  r5  r   r  r   )"rJ   r9  r:  r;  rK   output_kwargsimage_inputsr  videos_inputsr?  r  merge_lengthindexr   num_image_tokensvideo_index
num_framesvideo_structuremetadatarO  unique_timestampsidxselected_timestamps	frame_idxtimestamp_secframe_structurerA  r'  text_inputs	array_idsr  startsendsis_video_modalitys"                                     rM   __call__zGlm4vProcessor.__call__  sP   ( +** 
"&.."<"<
 

 /4//`v`A_`L)*:;NL!N0D00aa-P_B`aM::/0!.!2!23C!D!./?!@*+;<NM!N$%6Dyy{%//::A=LE3t9% O&&$q'1'5e'<'A'A'C|'S$"1good.>.>@QTd@dfghDGQJE &&$q'1 q'//*;T=M=MNQO %//::A=LK3t9% &O&&$q'1!/!<Q!?J&(O-k:H||+++q
 *2)=28<<HL!)!4!4SqS!9J(*%$QJ8 B)00CAB +<KZ*H'12Z?+22Na3Fr3Jghi 12Z? &+:%6 ;	(;I(F*.*E*Em*T'?:;
 #1good.>.>QRSDG&{388:lJn]hNijkNll % &+:%6 q	++tAw6&*1good6F6FHY\lHlno&pDGq  1$KG &&$q'1J q'//*;T=M=MNQM&ON '}599:JDQ#0#?#C#CD^`e#f $dnnTJ]=-IJ%%dKWgDV%W#[!9:I "k+.F G
 YYyD,?,??aHF99Y$*;*;;!DD &XYyD,?,??CTTU[\yD,?,??EVDVWX/@/G/G/IK+,!QK!Q<!Q=!Q_mnnrN   c                 8    d| j                    dt        |       S )Nz<|begin_of_image|>z<|end_of_image|>)r0  r   )rJ   rc  s     rM   rP  z%Glm4vProcessor.replace_frame_token_id  s#    #D$4$4#55Ec-FXEYZZrN   rX  r  )rX   rY   rZ   r:   r   r   r   r   r    r   r$  r   rj  rP  r^   r_   s   @rM   r-  r-    s    P %)Z^$(	woT!wo ++d9o=EV@WWwo T!	wo
 -.wo 
wor[rN   r-  )	r   ra   r4   r
  r  rk  r-  r  rx  )r   )nr  collections.abcr   numpyrR  r   torch.nnr   torch.nn.functional
functionalr   r   r@  r   rr  activationsr   cache_utilsr   r	   configuration_utilsr
   feature_extraction_utilsr   image_utilsr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   tokenization_utils_baser   r   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr   video_utilsr    glm4.modeling_glm4r"   r#   r$   r%   qwen2_5_vl.modeling_qwen2_5_vlr&   r'   r(   r)   r*   r+   r,   r-   r.   r/   qwen2_vl.modeling_qwen2_vlr0   qwen2_vl.processing_qwen2_vlr1   r2   
get_loggerrX   rM  r4   ra   r   r   r   r   r   r   r   r   r   r   r  r(  r5  r7  r\  r^  ri  rk  rx  r  r  r  r
  r$  r-  __all__r8   rN   rM   <module>r     s
    $       & ! . 3 4 % / B 9 S 1 F & C  H 5 % c c   6 
		H	%Z3( Z3zwU& wUtO#" O#f	; 	8M 8	m4 	m	!> 	fRYY f"HBII HVR4 R5, 52 86%PE) E)P	7 	16 1h	< 	24 2~
+ ~
B_
( _
DJ
 J
Z	"B 	~*$F ~*B1 B[% B[J
rN   