
    qi5                       d dl Z d dlmZ d dlmZ d dlmZmZ d dlZd dl	m
Z
 d dlm
c mZ d dl	mZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0m1Z1 ddl2m3Z3m4Z4m5Z5 ddl6m7Z7 ddl8m9Z9m:Z:m;Z;  ed       G d de
jx                               Z= G d de
jx                        Z> G d de
jx                        Z? G d d e
jx                        Z@ G d! d"e
jx                        ZA G d# d$e
jx                        ZBd% ZCd&ej                  d'ej                  d(ej                  d)ej                  d*eEej                  ej                  f   f
d+ZFd,ej                  d-eGd*ej                  fd.ZH	 dXd/e
jx                  d0ej                  d1ej                  d2ej                  d3ej                  dz  d4eId5eId6e,e.   fd7ZJ G d8 d9e
jx                        ZK G d: d;e       ZL G d< d=e
jx                        ZMd> ZNdYd?ZO G d@ dAe
jx                        ZP G dB dCe
jx                        ZQ G dD dEe       ZRe e/dFG       G dH dIe$                    ZSe/ G dJ dKe*             ZT G dL dMeT      ZUe/ G dN dOeT             ZVe/ G dP dQeT             ZWe e/dRG       G dS dTe$                    ZX G dU dVeTe      ZYg dWZZy)Z    N)Callable)	dataclass)AnyOptional)	LayerNorm   )initialization)ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastBaseModelOutputWithPoolingModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)is_flash_attention_requestedmaybe_autocastmerge_with_config_defaults)capture_outputs   )Glm4vConfigGlm4vTextConfigGlm4vVisionConfigRMSNormc                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	Glm4vRMSNormepsreturnNc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        Glm4vRMSNorm is equivalent to T5LayerNorm
        N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizer)   	__class__s      Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/glm4v/modeling_glm4v.pyr-   zGlm4vRMSNorm.__init__2   s1     	ll5::k#:; #    hidden_statesc                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor0   float32powmeanrsqrtr3   r2   )r4   r9   input_dtypevariances       r7   forwardzGlm4vRMSNorm.forward:   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r8   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler2   shaper3   r4   s    r7   
extra_reprzGlm4vRMSNorm.extra_reprA   s*    ))*+6$2G2G1HIIr8   )gư>)
__name__
__module____qualname__floatr-   r0   TensorrF   rK   __classcell__r6   s   @r7   r(   r(   0   s7    $ $$ $;U\\ ;ell ;Jr8   r(   c                   ,     e Zd Zddef fdZd Z xZS )Glm4VisionMlpbiasc                    t         |           |j                  | _        |j                  | _        t        j                  | j                  | j                  |      | _        t        j                  | j                  | j                  |      | _        t        j                  | j                  | j                  |      | _	        t        |j                     | _        y NrU   )r,   r-   r5   out_hidden_sizeintermediate_sizer.   Linear	gate_projup_proj	down_projr
   
hidden_actact_fn)r4   configrU   r6   s      r7   r-   zGlm4VisionMlp.__init__F   s    !--!'!7!74#3#3T5K5KRVWyy!1!143I3IPTU4#9#94;K;KRVWV../r8   c                     | j                  | j                  | j                  |            | j                  |      z        S N)r^   r`   r\   r]   r4   hidden_states     r7   rF   zGlm4VisionMlp.forwardO   s2    ~~dkk$..*FG$,,WcJddeer8   F)rL   rM   rN   boolr-   rF   rQ   rR   s   @r7   rT   rT   E   s    0T 0fr8   rT   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )Glm4vVisionPatchEmbedra   r*   Nc                 T   t         |           |j                  | _        |j                  | _        |j                  | _        |j
                  | _        | j                  | j                  | j                  g}t        j                  | j                  | j                  ||      | _	        y )N)kernel_sizestride)
r,   r-   
patch_sizetemporal_patch_sizein_channelsr5   	embed_dimr.   Conv3dproj)r4   ra   rk   r6   s      r7   r-   zGlm4vVisionPatchEmbed.__init__T   s     ++#)#=#= !--++//$//RIId..K`kl	r8   r9   c                 6   | j                   j                  j                  }|j                  d| j                  | j
                  | j                  | j                        }| j                  |j                  |            j                  d| j                        }|S )Nr<   r>   )	rr   r2   r>   viewro   rn   rm   r?   rp   )r4   r9   target_dtypes      r7   rF   zGlm4vVisionPatchEmbed.forward^   s~    yy''--%**  $":":DOOT__
 		-"2"2"2"FGLLRQUQ_Q_`r8   	rL   rM   rN   r%   r-   r0   rP   rF   rQ   rR   s   @r7   ri   ri   S   s5    m0 mT mU\\ ell r8   ri   c                   r     e Zd ZU ej                  ed<   d	dededdf fdZdedej                  fdZ	 xZ
S )
Glm4vVisionRotaryEmbeddinginv_freqdimthetar*   Nc                     t         |           || _        || _        d|t	        j
                  d|dt        j                        |z  z  z  }| j                  d|d       y )N      ?r   r;   rt   rz   F
persistent)r,   r-   r{   r|   r0   arangerO   register_buffer)r4   r{   r|   rz   r6   s       r7   r-   z#Glm4vVisionRotaryEmbedding.__init__j   sY    
%ELLC%++$NQT$TUVZeDr8   seqlenc                     t        j                  || j                  j                  | j                  j                        }t        j
                  || j                        }|S )Ndevicer>   )r0   r   rz   r   r>   outer)r4   r   seqfreqss       r7   rF   z"Glm4vVisionRotaryEmbedding.forwardq   sA    ll6$--*>*>dmmFYFYZC/r8   )g     @)rL   rM   rN   r0   rP   __annotations__intrO   r-   rF   rQ   rR   s   @r7   ry   ry   g   sB    llEC E ED Ec ell r8   ry   c                   n     e Zd Zd
dededededdf
 fdZdej                  dej                  fd	Z	 xZ
S )Glm4vVisionPatchMergerr{   context_dimr_   rU   r*   Nc                 x   t         |           t        j                  |||      | _        t        |      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _	        t        j                         | _        t        |   | _        y rW   )r,   r-   r.   r[   rr   r   post_projection_normr\   r]   r^   GELUact1r
   r`   )r4   r{   r   r_   rU   r6   s        r7   r-   zGlm4vVisionPatchMerger.__init__x   s    IIc3T2	$-cN!3$?yyk=;$?GGI	Z(r8   re   c                     | j                  |      }| j                  | j                  |            }| j                  | j	                  | j                  |            | j                  |      z        S rc   )rr   r   r   r^   r`   r\   r]   rd   s     r7   rF   zGlm4vVisionPatchMerger.forward   sY    yy.yy!:!:<!HI~~dkk$..*FG$,,WcJddeer8   rf   )rL   rM   rN   r   strrg   r-   r0   rP   rF   rQ   rR   s   @r7   r   r   w   sJ    )C )c )s )$ )[_ )fELL fU\\ fr8   r   c                   D     e Zd Zdef fdZdej                  fdZ xZS )Glm4vVisionEmbeddingsra   c                 f   t         |           || _        |j                  | _        |j
                  | _        |j                  | _        | j
                  | j                  z  dz  | _        | j                  | _        t        j                  | j                  | j                        | _        d| _        y )Nr;   bicubic)r,   r-   ra   r5   rp   
image_sizerm   num_patchesnum_positionsr.   	Embeddingposition_embeddinginterpolated_methodr4   ra   r6   s     r7   r-   zGlm4vVisionEmbeddings.__init__   s    ++ ++ ++ OOt>1D!--"$,,t/A/A4>>"R#, r8   r*   c           	      2   | j                   j                  }|j                  d   }|j                  }t	        |t
              r&t        j                  ||t        j                        }|j                  d   }	t        |	dz        }
|j                  |
|
|      j                  ddd      j                  d      j                  |t        j                        }t        j                  t!        t#        |            D cg c]  }||df   j%                  ||          c}      j                  |t        j                        }t        j                  t!        t#        |            D cg c]  }||df   j%                  ||          c}      j                  |t        j                        }|dz   |z  dz  dz
  }|dz   |z  dz  dz
  }t        j&                  ||fd      j                  d      j                  d      }t)        j*                  ||| j,                  dd	
      }|j/                  d      j/                  d      j                  dd      }|j                  |j0                        j                  |j                        }||z   }|S c c}w c c}w )a  
        Forward pass with integrated position encoding adaptation using 2D interpolation.

        Args:
            embeddings: Input embeddings tensor
            lengths (torch.Tensor): Sequence lengths for each image in the batch.
            image_shapes (torch.Tensor): Tensor of shape [batch_size, 3] representing the image shapes (t, h, w).
            h_coords (torch.Tensor): Tensor of shape [total_seq] representing the h coordinate for each patch.
            w_coords (torch.Tensor): Tensor of shape [total_seq] representing the w coordinate for each patch.

        Returns:
            torch.Tensor: Embeddings with adapted position encoding added.
        r"   r   r   g      ?r;   r<   r{   Fborder)modealign_cornerspadding_mode)r   r2   rI   r   
isinstancelistr0   tensorlongr   ru   permute	unsqueezer?   r@   catrangelenrepeatstackFgrid_sampler   squeezer>   )r4   
embeddingslengthsimage_shapesh_coordsw_coordspos_embed_weightr5   r   orig_size_sq	orig_sizepos_embed_2ditarget_htarget_wnorm_wnorm_hgridinterpolated_embed_fp32adapted_pos_embed_fp32adapted_pos_embeds                        r7   rF   zGlm4vVisionEmbeddings.forward   s^     2299&,,Q/!(( gt$ll76LG (--a0c)*	!!)YDWQ1Yq\RvU]]R3	 	 99USVW^S_M`al1a4077
Cabee f 
 99USVW^S_M`al1a4077
Cabee f 

 c>X-2Q6c>X-2Q6 {{FF+4>>qAKKAN #$--$T%=%=Uai#

 "9!@!@!C!K!KB!O!W!WXY[\!]2556F6L6LMPPQ[QbQbc  "33
3 b bs   < J' Jrw   rR   s   @r7   r   r      s#    
-0 
-;PUP\P\ ;r8   r   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )*Rotates half the hidden dims of the input..Nr<   r;   r   )rI   r0   r   xx1x2s      r7   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r8   qkcossinr*   c                    | j                   }|j                   }| j                         |j                         }} |j                  d      j                         |j                  d      j                         }}| |z  t        |       |z  z   }||z  t        |      |z  z   }|j	                  |      }|j	                  |      }||fS )N)r>   rO   r   r   r?   )r   r   r   r   orig_q_dtypeorig_k_dtypeq_embedk_embeds           r7   apply_rotary_pos_emb_visionr      s     77L77L779aggiqA}}R &&(#--*;*A*A*CC3w;q>C/0G3w;q>C/0Gjj&Gjj&GGr8   r9   n_repc                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r"   N)rI   expandreshape)r9   r   batchnum_key_value_headsslenhead_dims         r7   	repeat_kvr      so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr8   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
||
|z   }
t
        j                  j                  |
dt        j                        j                  |j                        }
t
        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr;   r   r<   r{   r>   )ptrainingr"   )r   num_key_value_groupsr0   matmul	transposer.   
functionalsoftmaxr@   r?   r>   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightsattn_outputs               r7   eager_attention_forwardr      s     3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r8   c                        e Zd Zdeddf fdZ	 	 d
dej                  dej                  dej                  dz  deej                  ej                  f   dz  dej                  f
d	Z xZ	S )Glm4vVisionAttentionra   r*   Nc                    t         |           |j                  | _        |j                  | _        | j                  | j                  z  | _        d| _        t        j                  |j                  |j                  dz  |j                        | _
        t        j                  |j                  |j                  d      | _        | j
                  dz  | _        || _        |j                  | _        d| _        y )Nr"   r   rX   F      )r,   r-   r5   r{   	num_headsr   r   r.   r[   attention_biasqkvrr   r   ra   attention_dropout	is_causalr   s     r7   r-   zGlm4vVisionAttention.__init__  s    %%))DNN2$%!99V//1C1Ca1GfNcNcdIIf00&2D2D5Q	}}d*!'!9!9r8   r9   
cu_seqlensrotary_pos_embposition_embeddingsc                    |j                   d   }| j                  |      j                  |d| j                  d      j	                  dddd      j                  d      \  }}}	|\  }
}t        |||
|      \  }}|j                  dd      j                  d      }|j                  dd      j                  d      }|	j                  dd      j                  d      }	t        j                  | j                  j                  t              }t        | j                        rT|dd  |d d z
  j                         } || |||	fd | j                   | j"                  sdn| j$                  ||||dd|\  }}n|dd  |d d z
  }|||	fD cg c](  }t'        j(                  ||j+                         d	      * }}t-        | D cg c]<  \  }}} || |||fd | j                   | j"                  sdn| j$                  dd
|d   > }}}}t'        j.                  |d	      }|j                  |d      j1                         }| j3                  |      }|S c c}w c c}}}w )Nr   r   r<   r"   r;           F)r   r   r   cu_seq_lens_qcu_seq_lens_kmax_length_qmax_length_kr   r   )r   r   r   r   )rI   r   r   r   r   unbindr   r   r   r   get_interfacera   _attn_implementationr   r   maxr   r   r   r0   splittolistzipr   r   rr   )r4   r9   r  r  r  r   
seq_lengthquery_statesr   r   r   r   attention_interface
max_seqlenr   _r   r   splitsr   r   vattn_outputss                          r7   rF   zGlm4vVisionAttention.forward  s    #((+
HH]#++J4>>2NVVWXZ[]^`abiijkl 	/j, 'S#>|ZY\^a#b j#--a3==a@))!Q/99!<
#--a3==a@(?(M(MKK,,.E)
 (4$QR.:cr?:??AJ0	
  $#'==d6L6L(('' NK" !nz#26GLXZdfrKsAGFGNN$4!<F    #F|  Aq! $	

 $( LL'+}}C$:P:P#
 
 
L   ))La8K!))*b9DDFii,-s   -I?AINN)
rL   rM   rN   r%   r-   r0   rP   rH   rF   rQ   rR   s   @r7   r   r     s    0 T " /3HLB||B LLB t+	B
 #5<<#=>EB 
Br8   r   c                        e Zd Zd	 fdZ	 	 d
dej
                  dej
                  dej
                  dz  deej
                  ej
                  f   dz  dej
                  f
dZ xZS )Glm4vVisionBlockr*   Nc                     t         |           t        |j                  |j                        | _        t        |j                  |j                        | _        t        |      | _        t        |d      | _
        y )Nr)   FrX   )r,   r-   r(   r5   rms_norm_epsnorm1norm2r   attnrT   mlpr   s     r7   r-   zGlm4vVisionBlock.__init__a  s\    !&"4"4&:M:MN
!&"4"4&:M:MN
(0	 e4r8   r9   r  r  r  c                     | | j                   | j                  |      f|||d|z   }|| j                  | j                  |            z   }|S )N)r  r  r  )r!  r  r"  r   )r4   r9   r  r  r  r   s         r7   rF   zGlm4vVisionBlock.forwardh  sc     &			JJ}%)
!) 3	)

 )
 
 &M1J(KKr8   r*   Nr  )	rL   rM   rN   r-   r0   rP   rH   rF   rQ   rR   s   @r7   r  r  `  sq    5 /3HL|| LL t+	
 #5<<#=>E 
r8   r  c                        e Zd ZU ej                  ed<   ddef fdZe	 	 	 ddedz  de	d   de
dz  ded	ef   fd
       Z ej                         ed               Zd Z xZS )Glm4vTextRotaryEmbeddingrz   Nra   c                    t         |           |j                  | _        |j                  | _        || _        | j
                  j                  d   | _        | j                  }| j                  dk7  rt        | j                     } || j
                  |      \  }| _
        | j                  d|d       | j                  d|j                         d       |j                  j                  dg d      | _        y )	N	rope_typedefaultrz   Fr   original_inv_freqmrope_section)      r-  )r,   r-   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenra   rope_parametersr(  compute_default_rope_parametersr   attention_scalingr   clonegetr+  )r4   ra   r   rope_init_fnrz   r6   s        r7   r-   z!Glm4vTextRotaryEmbedding.__init__~  s    "("@"@$*$B$B!44[A!%!E!E>>Y&.t~~>L+7V+L($(ZeD0(..2BuU#3377Ur8   r   ztorch.deviceseq_lenr*   ztorch.Tensorc                 n   | j                   d   }| j                   j                  dd      }t        | dd      xs | j                  | j                  z  }t        ||z        }d}d|t        j                  d|dt        j                        j                  |t        j                  	      |z  z  z  }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetapartial_rotary_factorr~   r   Nr   r;   rt   r   )r1  r5  getattrr5   num_attention_headsr   r0   r   int64r?   rO   )	ra   r   r7  baser:  r   r{   attention_factorrz   s	            r7   r2  z8Glm4vTextRotaryEmbedding.compute_default_rope_parameters  s    & %%l3 & 6 6 : :;RTW X6:t4h8J8JfNhNh8h(223 U\\!S!5;;?BB&X]XcXcBdgjjk
 )))r8   c                 ^   | j                   d d d d d f   j                         j                  d|j                  d   dd      }|d d d d d d d f   j                         }t	        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        |d      5  |j                         |j                         z  j                  dd      }| j                  || j                        }t        j                  ||fd	      }|j                         | j                  z  }|j!                         | j                  z  }	d d d        j#                  |j$                  
      	j#                  |j$                  
      fS # 1 sw Y   AxY w)Nr   r"   r<   mpscpuF)device_typeenabledr;   r   rt   )rz   rO   r   rI   r   r   typer   r   r   apply_mroper+  r0   r   r   r3  r   r?   r>   )
r4   r   position_idsinv_freq_expandedposition_ids_expandedrC  r   embr   r   s
             r7   rF   z Glm4vTextRotaryEmbedding.forward  s`   
 !MM$a*=>DDFMMaQ]QcQcdeQfhjlmn ,Q4] ; A A C'1!((--'E!((--[`J`ahhmmfkUC 	5&,,.1F1L1L1NNYYZ[]^_E$$UD,>,>?E))UEN3C'')d444C'')d444C	5 vvAGGv$cff177f&;;;	5 	5s   B!F##F,c           	          |}|j                  |d      }t        j                  t        |      D cg c]  \  }}||dz      c}}d      }|S c c}}w )Nr<   r   r   )r  r0   r   	enumerate)r4   r   r+  sectionchunksr   chunkresults           r7   rF  z$Glm4vTextRotaryEmbedding.apply_mrope  sQ    W"-69JKXQE!a%LKQST Ls   A
rc   NNN)rL   rM   rN   r0   rP   r   r$   r-   staticmethodr   r   rH   rO   r2  no_gradr   rF   rF  rQ   rR   s   @r7   r&  r&  {  s    llV V" )-+/"*$&*(* t* 
~u$	%	* *> U]]_<  < r8   r&  c                 |    | ddddf   }| ddddf   }t        j                  | |fd      j                  d      S )	r   .r   Nr;   r"   r<   r   r   )r0   r   flattenr   s      r7   rotate_half_llmrV    sJ    	
319B	
319B;;Ryb)11"55r8   c                    |j                  |      }|j                  |      }|dd|j                  d   dz  f   j                  dd      }|dd|j                  d   dz  f   j                  dd      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }	}||z  t        |      |z  z   }
||z  t        |      |z  z   }t	        j
                  |
|gd      }
t	        j
                  ||	gd      }|
|fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .Nr<   r;   r   )r   rI   repeat_interleaverV  r0   r   )r   r   r   r   unsqueeze_dim
rotary_dimq_rotq_passk_rotk_passr   r   s               r7   apply_rotary_pos_embr_    sD   $ --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{u5;<Gs{u5;<G ii&)r2Gii&)r2GGr8   c                   H    e Zd ZdZddededz  f fdZ	 	 	 	 ddej                  de	ej                  ej                  f   dz  dej                  dz  d	e
dz  d
ej                  dz  dee   de	ej                  ej                  dz  e	ej                     dz  f   fdZ xZS )Glm4vTextAttentionz
    Multi-headed attention from 'Attention Is All You Need' paper.
    and "Generating Long Sequences with Sparse Transformers".
    Nra   	layer_idxc                    t         |           || _        || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        |j                  | _        | j                  | j                  z  | _	        d| _
        |j                  | _        |j                  | _        | j                  dz  | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  | j                  z  d      | _        t        j                  | j                  | j                  z  | j                  d      | _        y )NTr   rX   F)r,   r-   ra   rb  r5   r<  r   r   r   r   r   r   r1  r   r.   r[   q_projk_projv_projo_projr4   ra   rb  r6   s      r7   r-   zGlm4vTextAttention.__init__  sI   "!--33((DNN:#)#=#= $(NNd6N6N$N!!'!9!9%55}}d*ii 0 0$..4==2PW[\ii 0 0$2J2JT]]2Zaefii 0 0$2J2JT]]2Zaefii >@P@PW\]r8   r9   r  r   past_key_valuescache_positionr   r*   c                 T   |j                         \  }}}	| j                  |      }
| j                  |      }| j                  |      }|
j	                  ||d| j
                        j                  dd      }
|j	                  ||d| j
                        j                  dd      }|j	                  ||d| j
                        j                  dd      }|\  }}t        |
|||      \  }
}|'|||d}|j                  ||| j                  |      \  }}t        j                  | j                  j                  t              } || |
|||f| j                  sdn| j                   | j"                  d|\  }}|j%                  ||d      j'                         }| j)                  |      }||fS )Nr<   r"   r;   )r   r   rj  r  )r   r   )sizerd  re  rf  ru   r   r   r_  updaterb  r   r  ra   r  r   r   r   r   r   r   rg  )r4   r9   r  r   ri  rj  r   bszq_lenr  r  r   r   r   r   cache_kwargsr  r   r   s                      r7   rF   zGlm4vTextAttention.forward  s    &**,UA{{=1[[/
{{=1#((eRGQQRSUVW__S%T]]CMMaQRS
#((eRGQQRSUVW&S#7jRUWZ#[ j&#&snUL'6'='=j,X\XfXfht'u$J(?(M(MKK,,.E)
 %8	%
  $}}C$2H2HLL	%
 	%
!\ "))#ub9DDFkk+.L((r8   rc   )NNNN)rL   rM   rN   __doc__r$   r   r-   r0   rP   rH   r   
LongTensorr   r   rF   rQ   rR   s   @r7   ra  ra    s    
^ ^3: ^. IM.2(,26+)||+) #5<<#=>E+) t+	+)
 +) ((4/+) -.+) 
u||U\\D0%2E2LL	M+)r8   ra  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Glm4vTextMLPc                 *   t         |           || _        t        j                  |j
                  d|j                  z  d      | _        t        j                  |j                  |j
                  d      | _        t        |j                     | _        y )Nr;   FrX   )r,   r-   ra   r.   r[   r5   rZ   gate_up_projr^   r
   r_   activation_fnr   s     r7   r-   zGlm4vTextMLP.__init__@  sp    IIf&8&8!f>V>V:V]bc6#;#;V=O=OV[\#F$5$56r8   r9   r*   c                     | j                  |      }|j                  dd      \  }}|| j                  |      z  }| j                  |      S )Nr;   r<   r   )rv  rO  rw  r^   )r4   r9   	up_statesgates       r7   rF   zGlm4vTextMLP.forwardH  sL    %%m4	#//!/4i 2 24 88	~~i((r8   )rL   rM   rN   r-   r0   FloatTensorrF   rQ   rR   s   @r7   rt  rt  ?  s'    7)U%6%6 )5;L;L )r8   rt  c                   d    e Zd Zdedef fdZe	 	 	 	 	 	 ddej                  de	ej                  ej                  f   dz  dej                  dz  dej                  dz  d	edz  d
edz  dej                  dz  de	ej                  e	ej                  ej                  f   dz  f   fd       Z xZS )Glm4vTextDecoderLayerra   rb  c                    t         |           |j                  | _        t        ||      | _        t        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr  )r,   r-   r5   ra  	self_attnrt  r"  r(   r  input_layernormpost_attention_layernormpost_self_attn_layernormpost_mlp_layernormrh  s      r7   r-   zGlm4vTextDecoderLayer.__init__R  s    !--+FI>'+F,>,>FDWDWX(4V5G5GVM`M`(a%(4V5G5GVM`M`(a%".v/A/AvGZGZ"[r8   Nr9   r  r   rG  ri  	use_cacherj  r*   c                    |}	| j                  |      } | j                  d|||||||d|\  }}
| j                  |      }|	|z   }|}	| j                  |      }| j	                  |      }| j                  |      }|	|z   }|S )N)r9   r  r   rG  ri  r  rj   )r  r  r  r  r"  r  )r4   r9   r  r   rG  ri  r  rj  r   residualr  s              r7   rF   zGlm4vTextDecoderLayer.forward\  s     !,,]; *4>> 	
' 3)%+)	
 	
q 55mD =0 !55mD///> =0r8   )NNNNFN)rL   rM   rN   r$   r   r-   r   r0   rP   rH   rr  r   rg   r{  rF   rQ   rR   s   @r7   r}  r}  Q  s    \ \3 \  IM.204(,!&26%||% #5<<#=>E% t+	%
 &&-% % $;% ((4/% 
u  %(9(95;L;L(L"MPT"TT	U% %r8   r}  zJ
    Base class for Llava outputs, with hidden states and attentions.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZe	dz  ed<   dZ
eej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)Glm4vModelOutputWithPasta[  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
        The rope index difference between sequence length and multimodal rope.
    Nlast_hidden_stateri  r9   
attentionsrope_deltas)rL   rM   rN   rq  r  r0   r{  r   ri  r   r9   rH   r  r  rr  r  r8   r7   r  r    sv     37u((4/6$(OUT\(59M5**+d2926Je''(4/6+/K!!D(/r8   r  c                   \     e Zd ZU eed<   dZdZdZddgZdZ	dZ
dZdZdZeedZ fd	Z xZS )
Glm4vPreTrainedModelra   model)imagevideotextTr}  r  ri  r9   r  c                 "   t         |   |       t        |t              rod|j                  t        j                  d|j                  dt
        j                        |j                  z  z  z  }t        j                  |j                  |       y y )Nr~   r   r;   rt   )r,   _init_weightsr   ry   r|   r0   r   r{   rO   initcopy_rz   )r4   r   rz   r6   s      r7   r  z"Glm4vPreTrainedModel._init_weights  sk    f%f89fllu||Avzz1TYT_T_/`cicmcm/mnoHJJv1 :r8   )rL   rM   rN   r#   r   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr}  ra  _can_record_outputsr  rQ   rR   s   @r7   r  r    s\    1&*#02DE"3N!"&.(
2 2r8   r  c                        e Zd ZU eed<   dZdgZeedZ	d fdZ
d Zeeedej                   d	ej                   d
ee   deez  fd                     Z xZS )Glm4vVisionModelra   )r  r  r  r  r*   c                 F   t         |   |       |j                  | _        |j                  | _        t	        |      | _        t        |      | _        |j                  |j                  z  }t        |dz        | _        t        j                  t        |j                        D cg c]  }t!        |       c}      | _        t%        |j&                  |j(                  |j*                        | _        t/        |j                  |j0                        | _        t        j4                  |j                  |j&                  |j                  |j                        | _        t/        |j                  |j0                        | _        d| _        | j=                          y c c}w )Nr;   )r{   r   r_   r  )ro   out_channelsrk   rl   F)r,   r-   spatial_merge_sizerm   r   r   ri   patch_embedr5   r   ry   r  r.   
ModuleListr   depthr  blocksr   rY   rZ   r_   mergerr(   r  post_conv_layernormConv2d
downsamplepost_layernormgradient_checkpointing	post_init)r4   ra   r   r  r6   s       r7   r-   zGlm4vVisionModel.__init__  sA    "(";"; ++/708%%)9)998QGmmuV\\GZ$[!%5f%=$[\,&&F4L4LY_YjYj
 $00B0BH[H[#\ ))**//11,,	
 +6+=+=6CVCVW&+# %\s   %Fc                    g }|D ]s  \  }}}t        j                  |      j                  d      j                  d|      }|j	                  || j
                  z  | j
                  || j
                  z  | j
                        }|j                  dddd      }|j                         }t        j                  |      j                  d      j                  |d      }|j	                  || j
                  z  | j
                  || j
                  z  | j
                        }|j                  dddd      }|j                         }|j                  t        j                  ||gd      j                  |d             v t        j                  |d      }|d d dd f   j                         }| j                  |      }	|	|   j                  d      }
|
|fS )Nr"   r<   r   r;   r   r   )r0   r   r   r   r   r  r   rU  appendr   r   r   r  r  )r4   grid_thwpos_idsthwhpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullr  s              r7   rot_pos_embzGlm4vVisionModel.rot_pos_emb  s    	SGAq!||A003::2qAH''T,,,''T,,,''	H  ''1a3H'')H||A003::1bAH''T,,,''T,,,''	H  ''1a3H'')HNN5;;(';DKKAqQR)	S* ))G+ AB++-"11-@,W5==a@w&&r8   r9   r  r   c           	      n   | j                  |      }| j                  |      }| j                  |      \  }}t        j                  ||fd      }|j                         |j                         f}t        j                  |dddf   |dddf   z  |dddf         j                  dt        j                  j                         r|j                  nt        j                        }t        j                  |dd	      }|dd |dd z
  j                         }	| j!                  ||	||dddf   j#                  |j$                        |dddf   j#                  |j$                              }| j&                  D ]  }
 |
|f||d
|} | j)                  |      }|j+                  d| j,                  | j,                  |j.                  d         }|j1                  dddd      }| j3                  |      j+                  d| j4                  j6                        }| j9                  |      }t;        ||      S )a\  
        hidden_states (`torch.Tensor` of shape `(seq_len, hidden_size)`):
            The final hidden states of the model.
        grid_thw (`torch.Tensor` of shape `(num_images_or_videos, 3)`):
            The temporal, height and width of feature shape of each image in LLM.

        Returns:
            `torch.Tensor`: hidden_states.
        r<   r   Nr"   r;   r   r   )r"   r   )r   )r  r  r   )r  pooler_output)r  r  r  r0   r   r   r   rX  cumsumjit
is_tracingr>   int32r   padr  r   r?   r   r  r  ru   r  rI   r   r  ra   rY   r  r   )r4   r9   r  r   r  image_type_idsrJ  r  r  seqlensblkmerged_hidden_statess               r7   rF   zGlm4vVisionModel.forward  s&    ((700?)-)9)9()C&ii8bA"wwy#'')4,,Xad^hq!tn-LhWXZ[W[n]dd
 %*II$8$8$:(.. e 

 UU:vQ7
ab>JsO3;;=1a4 ##M$8$891a4 ##M$8$89
 ;; 	C%$7 	M	 ++M:%**'')@)@-BUBUVXBY
 &--aAq96;;B@[@[\#{{=9)+.
 	
r8   r$  )rL   rM   rN   r%   r   r  r  r  r   r  r-   r  r    r!   r   r0   rP   r   r   rH   r   rF   rQ   rR   s   @r7   r  r    s    )+,)*
8':  9
"\\9
5:\\9
MSTfMg9
	+	+9
    9
r8   r  c                   (    e Zd ZU eed<   dZdef fdZeee		 	 	 	 	 	 	 dde
j                  dz  de
j                  dz  de
j                  dz  dedz  d	e
j                  dz  d
edz  de
j                  dz  dee   deez  fd                     Z xZS )Glm4vTextModelra   )r  c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        |j                  |j                        | _        t#        |      | _        d| _        | j)                          y c c}w )Nr  ra   F)r,   r-   pad_token_idpadding_idx
vocab_sizer.   r   r5   embed_tokensr  r   num_hidden_layersr}  layersr(   r  normr&  
rotary_embr  r  rh  s      r7   r-   zGlm4vTextModel.__init__<  s     !.. ++LL):):F<N<NPTP`P`ammGLVMeMeGfg)"695g
 !!3!39L9LM	2&A&+# hs   DN	input_idsr   rG  ri  inputs_embedsr  rj  r   r*   c           
      ^   |d u |d uz  rt        d      |r6|4t        j                  j                         st	        | j
                        }|| j                  |      }|F||j                         nd}	t        j                  |	|	|j                  d   z   |j                        }|2|j                  ddd      j                  d|j                  d   d      }n2|j                  dk(  r#|d	   j                  d|j                  d   d      }|j                  dk(  r|j                  d   d
k(  r|d   }
|dd  }nd }
| j
                  |||||
d}t        di |}|}| j                  ||      }| j                   D ]  } ||f||
|||d|}|} | j#                  |      }t%        ||      S )N:You must specify exactly one of input_ids or inputs_embedsr  r   r"   r   r<   r   r;   N.   )ra   r  r   rj  ri  rG  )rG  )r   rG  ri  rj  r  )r  ri  r  )
ValueErrorr0   r  r  r   ra   r  get_seq_lengthr   rI   r   ru   r   ndimr   r  r  r  r   )r4   r  r   rG  ri  r  r  rj  r   past_seen_tokenstext_position_idsmask_kwargscausal_maskr9   r  decoder_layerlayer_outputss                    r7   rF   zGlm4vTextModel.forwardL  s    -t";<YZZ 09M9M9O*$++>O  --i8M!CRC^==?de"\\ "2]5H5H5K"KTaThThN
 )..q!R8??=CVCVWXCY[]^L!#'	299!\=O=OPQ=RTVWL !l&8&8&;q&@ ,Q'+L !% kk*,,.-
 )7;7%"oom,oW![[ 
	*M)*. /-$7 M *M
	* 		-0&++
 	
r8   )NNNNNNN)rL   rM   rN   r$   r   r  r-   r   r    r!   r0   rr  rP   r   r{  rg   r   r   rH   r   rF   rQ   rR   s   @r7   r  r  7  s        .2.204(,26!%26Q
##d*Q
 t+Q
 &&-	Q

 Q
 ((4/Q
 $;Q
 ((4/Q
 -.Q
 
(	(Q
    Q
r8   r  c                    n    e Zd ZdZi ZdZddgZ fdZd Zd Z		 	 	 	 d&d	e
d
ee
e
e
f   ej                  z  de
de
de
deej                  z  dz  fdZ	 	 	 d'dej"                  dej$                  dej"                  dz  dej"                  dz  dej                  dz  deej                  ej                  f   fdZee	 d(dej.                  dej"                  dz  dee   deez  fd              Zee	 d(dej.                  dej"                  dz  dee   deez  fd              Z	 	 d)dej"                  dej.                  dej.                  dz  dej.                  dz  fdZ	 	 	 	 	 d*dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d ej                  dz  dej$                  dz  dej                  dz  fd!Zee	 	 	 	 	 	 	 	 	 	 	 	 d+dej"                  dz  dej                  dz  d"ej"                  dz  d edz  dej.                  dz  dej                  dz  dej.                  dz  dej"                  dz  dej"                  dz  d#ej"                  dz  dej$                  dz  d$ej"                  dz  dee   dee z  fd%              Z! xZ"S ),
Glm4vModelr  Fr}  r  c                     t         |   |       t        j                  |j                        | _        t        j                  |j                        | _        d | _	        | j                          y rc   )r,   r-   r  _from_configvision_configvisualr  text_configlanguage_modelr  r  r   s     r7   r-   zGlm4vModel.__init__  sU     &33F4H4HI,99&:L:LM 	r8   c                 6    | j                   j                         S rc   )r  get_input_embeddingsrJ   s    r7   r  zGlm4vModel.get_input_embeddings  s    ""7799r8   c                 :    | j                   j                  |       y rc   )r  set_input_embeddingsr4   r   s     r7   r  zGlm4vModel.set_input_embeddings  s    007r8   Nstart_positionr  temp_merge_sizer  time_intervalr   c                    |d   j                         |z  |d   j                         |z  |d   j                         |z  }	}}||	z  |z  }
t        j                  |||	z   |      j                  ||z        }t        j                  |||z   |      j	                  |	|z        }t        j
                  |
f||t        j                        }||z  }t        j                  |||gd      }|S )a  
        Compute 3D positional indices for vision tokens derived from a single image or video input.

        The positions are generated from the input grid defined by temporal (T), height (H), and
        width (W) dimensions. Temporal and spatial dimensions can be downscaled according to the
        merge sizes used in the vision backbone. The resulting positions are offset by `start_position`.

        Args:
            start_position (`int`):
                Offset added to all computed positional indices.
            grid_thw (`Sequence[int]` or `torch.Tensor` of shape `(3,)`):
                The (T, H, W) grid representing the feature layout of the current image or video after patch embedding.
            temp_merge_size (`int`, *optional*):
                Factor by which the temporal dimension is reduced in the backbone. The temporal grid size is divided
                by this value. Defaults to 1.
            spatial_merge_size (`int`, *optional*):
                Factor by which the spatial dimensions (H and W) are reduced in the backbone. Both H and W are divided
                by this value. Defaults to 1.
            time_interval (`int`, *optional*):
                Spacing factor applied between consecutive temporal position indices.Defaults to 1.
            device (`str` or `torch.device`, *optional*):
                Device on which the resulting tensor is allocated. If `None`, uses the current default device.

        Returns:
            torch.LongTensor of shape (3, sequence_length):
                Positional indices for temporal, height, and width dimensions,
                flattened into sequence form and offset by `start_position`.
        r   r"   r;   r  r   r   )itemr0   r   r   rX  fullr   r   )r4   r  r  r  r  r  r   
llm_grid_t
llm_grid_h
llm_grid_wimage_seq_lengthposition_widthposition_heightposition_temporalvision_position_idss                  r7   get_vision_position_idsz"Glm4vModel.get_vision_position_ids  s   L QK/1QK"44QK"44 !+J
 &
2Z?nnz6QZ`ahh#
  ,,~~
7R[abtt#
 "JJ(8':NSYafakakl-=#kk+<o~*^def""r8   r  mm_token_type_idsimage_grid_thwvideo_grid_thwr   r*   c           	      0   | j                   j                  j                  }g }t        j                  d|j
                  d   |j
                  d   |j                  |j                        }	|t        |      nd|t        |      ndd}
t        |      D ]K  \  }}||   }|,|||   j                            }|||   j                            }g }t        j                  t        |j                               d       D ]7  \  }}t        |      }|d   d   }|d   d   dz   }|j                  |||f       9 d}d}g }|D ]  \  }}}|dk(  r^||z
  }|j                  t        j                   ||j                  	      j#                  dd      j%                  dd      |z          ||z  }j|d
k(  r%|dk(  rt'        |
|         }|dz  }|d   k\  rdn|}nt'        |
|         }|d   }| j)                  |||||j                  	      }|j                  |       |t+        |d   |d
         |z  z  } t        j,                  |d      j/                  dd      }|5|j1                  |	j                        |	dd|||   j                         f<   n"|j1                  |	j                        |	dd|f<   |j                  |j+                         dz   t3        |      z
         N t        j4                  ||j                  	      j7                  d      }|	|fS )u	  
        Calculate the 3D rope index based on image and video's sizes. The utility expects a `vision + text`
        sequence and will error out otherwise. For pure text sequence, please rely on model's auto-inferred
        position ids. In a mixed vision + text sequence, vision tokens use 3D RoPE (temporal, height, width)
        while text tokens use standard 1D RoPE.

        Example:
            Temporal patches: 3; Height patches: 2; Width patches: 2
            Each vision input results in (temporal x height × width) positions. Here: 3 x 2 × 2 = 12 positions total.

            Temporal position IDs are spaced by:
                `interval = tokens_per_second * temporal_patch_size / fps`

                If fps = 1; tokens_per_second = 25; temporal_patch_size = 2, temporal IDs increase by 50 for each temporal patch:
                `[0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]`

            Height IDs repeat per row: `[0, 0, 1, 1, ...]`
            Width IDs alternate per column: `[0, 1, 0, 1, ...]`
            Text tokens follow standard 1D RoPE and the position IDs grow consequently with a step of `1`

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
                it.
            mm_token_type_ids (`torch.IntTensor` of shape `(batch_size, sequence_length)`):
                Token type ids matching each modality to a different value in the input sequence, i.e. text (0), image (1), video (2).
            image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
                The temporal, height and width of feature shape of each image in LLM.
            video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
                The temporal, height and width of feature shape of each video in LLM.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

        Returns:
            position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`)
            mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`)
        r   r   r"   r>   r   N)r"   r;   c                     | d   S )Nr"   r  )r   s    r7   <lambda>z+Glm4vModel.get_rope_index.<locals>.<lambda>9  s    `abc`d r8   r<   r  r;   r   )ra   r  r  r0   zerosrI   r>   r   iterrL  rg   	itertoolsgroupbyr  r   r  r   ru   r   nextr  r  r   r   r?   r   r   r   )r4   r  r  r  r  r   r   r  mrope_position_deltasrG  
grid_iters	batch_idxcurrent_input_idsinput_token_typeinput_type_groupr   groupstart_index	end_indexcurrent_posvideo_group_indexllm_pos_ids_listmodality_type	start_idxend_idxtext_lenr  r  r   llm_positionss                                 r7   get_rope_indexzGlm4vModel.get_rope_index  sj   b "[[66II "{{OOAOOA//##
 (6'AtN#t'5'AtN#t


 -6i,@ 1	[(I(0;)$5nY6O6T6T6V$W!#3N94M4R4R4T#U !'//	:J:Q:Q:S0TVde G
UU#Ahqk!"IaL1,	 ''k9(EF	G K !!5E W1y' A%&2H$++Xi6F6FGLLQPRSZZ[\^`adoo  8+K %),1'+J},E'FH)Q.)1Bhqk1QAWh)#'
=(A#B '/qkO*.*F*F#X@R[d[k[k +G +' %++,?@3x{HQK#@DV#VVK7W8 "II&6A>FFq"MM)O\O_O_`l`s`sOtQ	>)+D+I+I+KKL-:-=-=l>Q>Q-RQ	\*!(():):)<q)@3GXCY)YZc1	[d !&-B9K[K[ \ f fgh i222r8   pixel_values_videosr   c                 4   |j                  | j                  j                        }g }|j                         }|D ]N  \  }}}t	        j
                  d||g      j                  d      j                  |d      }	|j                  |	       P t	        j                  |d      }
 | j                  |f|
dd|}|j                  d      | j                  j                  dz  z  j                         }t	        j                  |j                  |      }||_        |S )[  
        pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input videos.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.
        r"   r   r   Tr  return_dictr<   r;   )rE  r  r>   r  r0   r   r   r   r  r   prodr  r  r  )r4   r   r  r   temp_frames_hwvideo_grid_thw_listr  r  r  repeated_rowflattened_video_grid_thwvision_outputssplit_sizesvideo_embedss                 r7   get_video_featureszGlm4vModel.get_video_featuresg  s    266t{{7H7HI,335* 	0GAq! <<Aq	2<<Q?FFq!LL!!,/	0 $)99^#C $
*BPT
X^
 &**2.$++2P2PRS2SS[[]{{>#?#?M'3$r8   pixel_valuesc                 <   |j                  | j                  j                        } | j                  |f|dd|}|j                  d      | j                  j                  dz  z  j                         }t        j                  |j                  |      }||_        |S )T  
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        Tr#  r<   r;   )	rE  r  r>   r%  r  r  r0   r  r  )r4   r.  r  r   r*  r+  image_embedss          r7   get_image_featureszGlm4vModel.get_image_features  s     $(():):;$\gNX\g`fg%**2.$++2P2PRS2SS[[]{{>#?#?M'3$r8   r  image_featuresvideo_featuresc                 T   || | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n2|| j                  j                  k(  }|| j                  j                  k(  }|j                         }|j                  d      j                  |      j                  |j                        }|At        ||   j                         |j                         k(  d| d|j                  d           |j                         }|j                  d      j                  |      j                  |j                        }|At        ||   j                         |j                         k(  d| d|j                  d           ||fS )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r  r<   z6Image features and image tokens do not match, tokens: z, features: r   z6Video features and video tokens do not match, tokens: )r  r0   r   ra   image_token_idr   r   allvideo_token_idsumr   	expand_asr?   r   numelrI   )	r4   r  r  r3  r4  special_image_maskspecial_video_maskn_image_tokensn_video_tokenss	            r7   get_placeholder_maskzGlm4vModel.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!; "+dkk.H.H!H!*dkk.H.H!H+//1/99"=GGVYYZgZnZno%"01779^=Q=Q=SSHHXXdeseyeyz{e|d}~
 ,//1/99"=GGVYYZgZnZno%"01779^=Q=Q=SSHHXXdeseyeyz{e|d}~ "#555r8   ri  c                 L   |dn|j                         }|d uxr |d uxr
 |d uxs |d u}	|	r3| j                  |dk(  r"| j                  |||||      \  }
}|| _        |
S | j                  5|j                  \  }}}|u|j	                         j                  d      dz
  }
|
j                  |dk(  d      }
|
j                  d|d      j                  ddd      j                  |j                        }
nVt        j                  |||z         }
|
j                  ddd      j                  d|d      j                  |j                        }
| j                  j                  || j                  j                  d   z  d      }|
|j                  |j                        z   }
|
S d }
|
S )Nr   )r  r  r   r  r<   r"   r   r   r  )r  r  r  rI   r   r  masked_fillru   r   r?   r   r0   r   r   rX  )r4   r  r  r  r  r   ri  r  past_key_values_lengthcan_compute_mroperG  r  
batch_sizer  r  deltas                   r7   compute_3d_position_idsz"Glm4vModel.compute_3d_position_ids  s    '6&=?CaCaCcT! K!-Kt+I~T/I 	 $"2"2":>TXY>Y(,(;(;---"3 )< )%L+  +D   )(5(;(;%J
A)-224;;B?!C+77!8KQO+00JCJJ1aQRSVVWdWkWkl$||,BDZ]gDgh+00Ar:AA!ZQSTWWXeXlXlm$$66zTEUEUE[E[\]E^7^de6fE'%((-:N:N(*OOL   Lr8   rG  r  rj  c           
         |du |duz  rt        d      | | j                         |      }|| j                  ||d      j                  }t	        j
                  |d      j                  |j                  |j                        }| j                  |||      \  }}|j                  ||      }|| j                  ||	d      j                  }t	        j
                  |d      j                  |j                  |j                        }| j                  |||      \  }}|j                  ||      }|| j                  |||	||||	      } | j                  dd|||||d
|}t        di |d| j                  iS )a  
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.
        rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
            The rope index difference between sequence length and multimodal rope.
        Nr  T)r$  r   r   )r3  )r4  )r  r  r  r  r   ri  r  )r  rG  r   ri  r  rj  r  r  )r  r  r2  r  r0   r   r?   r   r>   r@  masked_scatterr-  rG  r  r  r  )r4   r  r   rG  ri  r  r.  r   r  r  r  r  rj  r   r1  
image_maskr  r,  
video_maskoutputss                       r7   rF   zGlm4vModel.forward  s   4 -t";<YZZ 7D557	BM#22<]a2bppL 99\q9<<]=Q=QS`SfSfgL 55i_k5lMJ)88\RM*223Fdh2iwwL 99\q9<<]=Q=QS`SfSfgL 55i_k5lMAz)88\RM77#--+- /"3 8 L &$%% 
%)+')
 
 ( 

((
 	
r8   )r"   r"   r"   NrQ  rc   r  )NNNNN)NNNNNNNNNNNN)#rL   rM   rN   r  _checkpoint_conversion_mappingaccepts_loss_kwargsr  r-   r  r  r   r   r0   rP   r   r   r  rr  	IntTensorrH   r  r   r   r{  r   r   r   r-  r2  r@  rG  r   r  rF   rQ   rR   s   @r7   r  r    sW   %'"02DE:8  !"#,06#6# sC}%46# 	6#
  6# 6# ell"T)6#x 3726.2s3##s3 !??s3 ((4/	s3
 ((4/s3 t+s3 
u||U\\)	*s3j  37".. ((4/ +,	
 
+	+  :  37'' ((4/ +,	
 
+	+  0 4837(6##(6 (((6 ))D0	(6
 ))D0(6\ /3.2.2/348)<<$&) ||d*) t+	)
 t+) t+) ,) !??T1) 
	)V  .2.204(,26,08<2626/34826B
##d*B
 t+B
 &&-	B

 B
 ((4/B
 llT)B
 #..5B
 ((4/B
 ((4/B
 %%,B
 !??T1B
 ((4/B
 +,B
 
)	)B
  B
r8   r  zQ
    Base class for Glm4v causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	Glm4vCausalLMOutputWithPasta  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
        The rope index difference between sequence length and multimodal rope.
    Nlosslogitsri  r9   r  r  )rL   rM   rN   rq  rR  r0   r{  r   rS  ri  r   r9   rH   r  r  rr  r  r8   r7   rQ  rQ  8  s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/6+/K!!D(/r8   rQ  c            "           e Zd Zi ZddiZdZ fdZd Zd Ze		 d de
j                  d	e
j                  dz  d
ee   deez  fd       Ze		 d de
j                  de
j                  dz  d
ee   deez  fd       Zee		 	 	 	 	 	 	 	 	 	 	 	 	 d!de
j                  dz  de
j(                  dz  de
j                  dz  dedz  de
j                  dz  de
j                  dz  de
j(                  dz  de
j                  dz  de
j                  dz  d	e
j                  dz  de
j,                  dz  de
j                  dz  dee
j(                  z  d
ee   deez  fd              Z	 	 	 	 	 	 	 	 	 	 	 d" fd	Z fdZ	 d de
j                  dz  de
j(                  dz  dee
j(                  e
j(                  f   fdZ	 	 	 d#dedede
j                  dz  dee
j                  eee f   f   fdZ! xZ"S )$Glm4vForConditionalGenerationzlm_head.weightz(model.language_model.embed_tokens.weightFc                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NFrX   )r,   r-   r  r  r.   r[   r  r5   r  lm_headr  r   s     r7   r-   z&Glm4vForConditionalGeneration.__init__[  sS     '
yy!3!3!?!?ASASA^A^ejkr8   c                 6    | j                   j                         S rc   )r  r  rJ   s    r7   r  z2Glm4vForConditionalGeneration.get_input_embeddingsb  s    zz..00r8   c                 :    | j                   j                  |       y rc   )r  r  r  s     r7   r  z2Glm4vForConditionalGeneration.set_input_embeddingse  s    

''.r8   Nr   r  r   r*   c                 @     | j                   j                  d||d|S )r"  )r   r  r  )r  r-  )r4   r   r  r   s       r7   r-  z0Glm4vForConditionalGeneration.get_video_featuresh  s/     -tzz,, 
 3N
V\
 	
r8   r.  r  c                 @     | j                   j                  d||d|S )r0  )r.  r  r  )r  r2  )r4   r.  r  r   s       r7   r2  z0Glm4vForConditionalGeneration.get_image_featuresy  s'     -tzz,,p,Wepioppr8   r  r   rG  ri  r  labelsr  rj  logits_to_keepc                     | j                   d||||	|
||||||d|}|d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }d}|2| j                  ||| j                  j                  j                        }t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
            The temporal, height and width of feature shape of each video in LLM.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Glm4vForConditionalGeneration

        >>> model = Glm4vForConditionalGeneration.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")
        >>> processor = AutoProcessor.from_pretrained("zai-org/GLM-4.1V-9B-Thinking")

        >>> messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
                    {"type": "text", "text": "What is shown in this image?"},
                ],
            },
        ]
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
        ```)r  r.  r   r  r  r  rG  r   ri  r  rj  r   N)rS  r\  r  )rR  rS  ri  r9   r  r  r  )r  r   r   slicerW  loss_functionra   r  r  rQ  ri  r9   r  r  )r4   r  r   rG  ri  r  r\  r.  r   r  r  r  rj  r]  r   rL  r9   slice_indicesrS  rR  s                       r7   rF   z%Glm4vForConditionalGeneration.forward  s    z $** 
% 3))/%)+')
 
  
 9C>SV8W~ot4]kmA}a,?@A%%VFt{{OfOfOqOq%rD*#33!//))++
 	
r8   c                 \    t        |   |f|||||||	|
|||d|}|s|r
d |d<   d |d<   |S )N)ri  r   r  rj  rG  r.  r   r  r  r  is_first_iterationr.  r   )r,   prepare_inputs_for_generation)r4   r  ri  r   r  rj  rG  r  r.  r   r  r  rc  r   model_inputsr6   s                  r7   rd  z;Glm4vForConditionalGeneration.prepare_inputs_for_generation  si    $ w<
+)')%% 3))1
 
  "i+/L(26L./r8   c                    t         |   ||      }d}|j                  d      x}|j                         }|dk7  r4| j                  j
                  |d   | j                  j
                  z   }|S d|v r|d   j                  d   dkD  r|d   }t        |j                        dk(  xr, |j                  t        j                  t        j                  fv }|r|j                  d      }|j                  d      |j                  d	      [|j                         D 	ci c]  \  }}	|dk7  s||	 }}}	 | j                  j                  |fi |\  }
}|| j                  _        no|j                  d      j                  d
dd      }
t        j                   |j                  d   dt        j                  |j"                        | j                  _        |d   }t        j$                  ||
gd      }|S c c}	}w )Nr   ri  r  r  r"   r;   r  r  r  r   r<   r  r   )r,   $_prepare_position_ids_for_generationr5  r  r  r  rI   r   r>   r0   r   r   itemsr  r   r   r	  r   r   )r4   inputs_tensormodel_kwargstext_positionspast_lengthcacherG  is_input_idsr   r  vision_positionsr  r6   s               r7   rg  zBGlm4vForConditionalGeneration._prepare_position_ids_for_generation  s    EmUab !%%&788EE..0K!

 6 6 B))4tzz7M7MML ,&<+D+J+J1+MPQ+Q(5M=../14g9L9LQVQZQZ\a\f\fPg9g  !45A!!"23?<CSCSTdCeCq-9-?-?-AVTQQ+EUAqDVLV,EDJJ,E,Em,dWc,d)k%0DJJ"-77:AA!RL%*[[##A&MDXDX&DJJ"
 (	2yy.2B!CK Ws   G3*G3c                    || | j                         t        j                  | j                  j                  t        j
                  |j                              k(  d   }| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  d   }| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  d   }nK|| j                  j                  k(  }|| j                  j                  k(  }|| j                  j                  k(  }t        j                  |j                         |j                         z
  d      }|dkD  }|| z  }|j                  d      }	|j                  d      }
|	|
fS )aa  
        Get the number of images and videos for each sample to calculate the separation length of the sample tensor.
        These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

        Returns:
            image_nums (`torch.LongTensor` of shape `(batch_size, num_images_sample)`)
            video_nums (`torch.LongTensor` of shape `(batch_size, num_videos_sample)`)
        r  ).r   r"   r   r   )r  r0   r   ra   image_start_token_idr   r   video_start_token_idvideo_end_token_idr  r   r9  )r4   r  r  is_imageis_video_startis_video_endvideo_levelinside_videostandalone_imagesimage_countsvideo_countss              r7   _get_image_nums_and_video_numsz<Glm4vForConditionalGeneration._get_image_nums_and_video_nums5  s   $ $.4,,.LL!A!A\i\p\pq H .4,,.LL!A!A\i\p\pq N .4,,.LL!?!?uzzZgZnZno L !DKK$D$DDH&$++*J*JJN$(F(FFL ll>#5#5#7,:J:J:L#LRST"Q %6 ),,,3%))a)0\))r8   expand_sizeis_encoder_decoderc                      dk(  rfS g d fd}fd} |      j                  d       |      |r*j                  d      t        d       |d         d<   fS )	Nr"   )r.  r  r   r  second_per_grid_tsc                 4   j                  dd       }j                  dd       }j                  j                  dd             \  }}d }| D ]9  }|dk(  rct        j                  |t	        |            }|D cg c]'  }t        j
                  |d      j                         ) }	} || |   |	
	      | |<   l|dk(  rt	        |      }	 || |   |	
	      | |<   |d
k(  rct        j                  |t	        |            }|D cg c]'  }t        j
                  |d      j                         ) }	} || |   |	
	      | |<   |dk(  rt	        |      }	 || |   |	
	      | |<   |dk(  s  || |   t	        |      
	      | |<   < | S c c}w c c}w )Nr  r  r  )r  c                     t        j                  | |      }|gdg| j                         dz
  z  z   }t        j                  |D cg c]  } |j                  |  c}d      }|S c c}w )Nr"   r   r   )r0   r  r{   r   r   )r   r   repeat_timessamplesrepeat_argssamplerP  s          r7   _repeat_interleave_sampleszGlm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation_visual.<locals>._repeat_interleave_samples  sa    ++a1+nsaeegk/BBg#VFMFMM;$?#V\]^ $Ws   A&r.  r"   r   )r   r  r   r  )r5  r|  r0   r  r   r%  r9  )dict_to_expandr  r  
image_nums
video_numsr  r   r  r  r   r}  r  rj  r4   s             r7   "_expand_dict_for_generation_visualzgGlm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation_visual~  s   )--.>EN)--.>EN%)%H%H)9)9/4)P &I &"J
 & .(#kk.$z:JKGMTU6uzz&a8<<>UGU*D&s+W;+N3' ,,":.G*D&s+W;+N3' 11#kk.$z:JKGMTU6uzz&a8<<>UGU*D&s+W;+N3' ,,":.G*D&s+W;+N3' 00*D&s+T*5ET_+N3'7< "!3 V Vs   =,F,Fc                    | D ]{  }|dk(  r,| |   j                   dk(  r| |   j                  d      | |<   4|dk7  s:| |   @t        | |   t        j                        s^|vsc| |   j                  d      | |<   } | S )NrG  r   r"   r   rj  r   )r  rX  r   r0   rP   )r  r   r}  visual_keyss     r7   _expand_dict_for_generationz`Glm4vForConditionalGeneration._expand_inputs_for_generation.<locals>._expand_dict_for_generation  s    % 	d.(^C-@-E-E-J*8*=*O*OP[ab*O*cN3'++&s+7">##6E;.*8*=*O*OP[ab*O*cN3'	d "!r8   r   r   encoder_outputszMIf `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.)rX  r5  r  )r4   r}  r~  r  rj  r  r  r  s   `` ``  @r7   _expand_inputs_for_generationz;Glm4vForConditionalGeneration._expand_inputs_for_generationm  s     !l**w+	"Z	" :,G !33KQ3GI2<@ 12: !pqq.I,WhJi.jL*+,&&r8   rc   )NNNNNNNNNNNNr   )NNNNNTNNNNF)r"   FN)#rL   rM   rN   rM  _tied_weights_keysrN  r-   r  r  r   r0   r{  rr  r   r   rH   r   r-  r2  r   rP   r   rO  r   rQ  rF   rd  rg  r|  rg   dictr   r   r  rQ   rR   s   @r7   rU  rU  U  s9   %'"*,VW1/  37
"..
 ((4/
 +,	

 
+	+
 
   37q''q ((4/q +,	q
 
+	+q q  .2.204(,26*.,08<26264826-.[
##d*[
 t+[
 &&-	[

 [
 ((4/[
   4'[
 llT)[
 #..5[
 ((4/[
 ((4/[
 !??T1[
 ((4/[
 ell*[
 +,[
  
,	,![
  [
@   &P$R .26*##d*6* ||d*6* 
u||U\\)	*	6*t #(-1	W'W' !W' ##d*	W' 
uc3h/	0W'r8   rU  )rU  r  r  r  r  )r  )r"   )[r  collections.abcr   dataclassesr   typingr   r   r0   torch.nnr.   torch.nn.functionalr   r   r    r	   r  activationsr
   cache_utilsr   r   
generationr   integrationsr   masking_utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   r    utils.output_capturingr!   configuration_glm4vr#   r$   r%   Moduler(   rT   ri   ry   r   r   r   rP   rH   r   r   r   rO   r   r   r  r&  rV  r_  ra  rt  r}  r  r  r  r  r  rQ  rU  __all__r  r8   r7   <module>r     s  (  $ !        & ! . ) 7 / B 9 ` ` K F & a a e e 5 P P Y'J299 J (J(fBII fBII (  fRYY f"HBII HV(||+0<<>Cll
5<<%&	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 LL4'% % % '(%2P299 Pf1 6Jryy JZ6%PE) E)P)299 )$16 1h 
0{ 0 0$ 2? 2 20~
+ ~
B h
) h
 h
V Q
% Q
 Q
h 
0+ 0 0.o'$8/ o'd xr8   