
    qi6"                     4   d dl ZddlmZ ddlmZ ddlmZmZm	Z	 ddl
mZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ  G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Zg dZy)    N   )PreTrainedConfig)VideoMetadata   )CONFIG_MAPPING
AutoConfig	AutoModel)Glm4vImageProcessor)Glm4vImageProcessorFast)Glm4vForConditionalGeneration
Glm4vModelGlm4vPreTrainedModel)Glm4vProcessor)Glm4vVideoProcessorc                   J     e Zd ZdZdZeedZdgZ	 	 	 	 	 	 	 	 	 d fd	Z xZ	S )Glm46VConfiga  
    This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a
    GLM-4.6V model according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of
    GLM-4.1V-9B-Thinking [zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        text_config (`Union[PreTrainedConfig, dict]`, *optional*, defaults to `Glm4vTextConfig`):
            The config object or dictionary of the text backbone.
        vision_config (`Union[PreTrainedConfig, dict]`,  *optional*, defaults to `Glm4vVisionConfig`):
            The config object or dictionary of the vision backbone.
        image_token_id (`int`, *optional*, defaults to 151343):
            The image token index to encode the image prompt.
        video_token_id (`int`, *optional*, defaults to 151344):
            The video token index to encode the image prompt.
        image_start_token_id (`int`, *optional*, defaults to 151339):
            The image start token index to encode the start of image.
        image_end_token_id (`int`, *optional*, defaults to 151340):
            The image end token index to encode the end of image.
        video_start_token_id (`int`, *optional*, defaults to 151361):
            The video start token index to encode the start of video.
        video_end_token_id (`int`, *optional*, defaults to 151362):
            The video end token index to encode the end of video.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings

    ```python
    >>> from transformers import Glm46VForConditionalGeneration, Glm46VConfig

    >>> # Initializing a GLM-4.6V style configuration
    >>> configuration = Glm46VConfig()

    >>> # Initializing a model from the GLM-4.6V style configuration
    >>> model = Glm4vForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```glm46v)text_configvision_configpast_key_valuesc
                    t        |t              r,|j                  dd      |d<   t        |d      di || _        n|t        d          | _        t        |t              r,|j                  dd      |d<   t        |d      di || _        n|t        d          | _        || _        || _        || _        || _	        || _
        || _        |	| _        t        | 8  di |
 y )N
model_typeglm4v_vision
glm4v_text )
isinstancedictgetr   r   r   image_token_idvideo_token_idvideo_start_token_idvideo_end_token_idimage_start_token_idimage_end_token_idtie_word_embeddingssuper__init__)selfr   r   r   r    r#   r$   r!   r"   r%   kwargs	__class__s              [/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/glm46v/modular_glm46v.pyr'   zGlm46VConfig.__init__K   s     mT**7*;*;L.*YM,'!/l0K!L!]}!]D"!/!?!ADk4((3l(SK%-k,.GHW;WD -l;=D,,$8!"4$8!"4#6 "6"    )	NNi/O i0O i+O i,O iAO iBO F)
__name__
__module____qualname____doc__r   r   sub_configskeys_to_ignore_at_inferencer'   __classcell__r*   s   @r+   r   r      sJ    (T J",zJK#4"5 #!#!!!# !#r,   r   c                       e Zd ZdZdZd Zy)Glm46VPreTrainedModelNc                     t        d      )Nz
Not needed)AttributeError)r(   modules     r+   _init_weightsz#Glm46VPreTrainedModel._init_weightss   s    \**r,   )r-   r.   r/   _can_record_outputs_no_split_modulesr:   r   r,   r+   r6   r6   o   s    +r,   r6   c                   "     e Zd ZdZ fdZ xZS )Glm46VModelNc                     t         |   |       t        j                  |j                        | _        t        j                  |j                        | _        y N)r&   r'   r	   from_configr   visualr   language_model)r(   configr*   s     r+   r'   zGlm46VModel.__init__z   sA     ++F,@,@A'33F4F4FGr,   )r-   r.   r/   r<   r'   r3   r4   s   @r+   r>   r>   w   s    H Hr,   r>   c                       e Zd Zy)Glm46VForConditionalGenerationNr-   r.   r/   r   r,   r+   rF   rF          r,   rF   c                       e Zd Zd Zy)Glm46VProcessorc                 *    d| j                    d|ddS )Nz<|begin_of_image|>z<|end_of_image|>z.1fz seconds)image_token)r(   timestamp_secs     r+   replace_frame_token_idz&Glm46VProcessor.replace_frame_token_id   s$    #D$4$4#55EmTWEXX`aar,   N)r-   r.   r/   rN   r   r,   r+   rJ   rJ      s    br,   rJ   c                       e Zd Zy)Glm46VImageProcessorNrG   r   r,   r+   rP   rP      rH   r,   rP   c                       e Zd Zy)Glm46VImageProcessorFastNrG   r   r,   r+   rR   rR      rH   r,   rR   c                   ,    e Zd Z	 ddedeez  dz  fdZy)Glm46VVideoProcessorNmetadatafpsc                    |t        |dd       t        d      |j                  }|dz
  }|j                  xs t	        ||j
                  z        dz   }dddd}d}d}	t        ||	      }
|
d	k  r|d	   }n|
d
k  r|d
   }n|d   }t        |
|z  | j                  z        }t        ||      }d|j
                  z  }t        |      D cg c]  }||z  	 }}t        |      }||k  r/t        j                  d|dz
  |t              j                         }nLg }d}d| j                  |z  z  }t        |      D ](  }||   |k\  s||z  }|j                  |       ||k\  s( n t        |      |k  rVt        |      dk(  rdt        |dz
  d      }}n
|d   |d   }}t        j                  |||t              j                         }n<t        |      |kD  r.t        j                  d|dz
  |t              j                         }t!               g }}|D ])  }||vs|j#                  |       |j                  |       + t        |      dz  r|j                  |d          t        j$                  |      S c c}w )NrV   zAsked to sample frames per second but no video metadata was provided which is required when sampling in Glm46V. Please pass in `VideoMetadata` object or set `do_sample_frames=False`   r   g      ?)   ,  `	  i  r[   rY   rZ   r   )dtype)getattr
ValueErrortotal_num_framesdurationroundrV   mininttemporal_patch_sizerangenplinspacetolistappendlenmaxsetaddarray)r(   rU   rV   r)   total_framesmax_frame_idxra   DYNAMIC_FPS_THRESMAX_FRAME_COUNT_DYNAMICMAX_DURATIONeffective_duration
target_fps	extract_tduration_per_framei
timestamps
max_secondframe_indicescurrent_secondinv_fpsframe_indexstartendseenuniqidxs                             r+   sample_framesz"Glm46VVideoProcessor.sample_frames   s    wx=EX 
  00$q($$Omhll.J(Ka(O!"#6"% <8#*2.J3&*3/J*40J*Z7$:R:RRS		#:;	-6;L6IJa,,J
J]
)#KK<!+;YcRYY[MMN433j@AG$\2 k*n<"g-N!((5%3 }	)=!Q&L1$4a 8s*1-}R/@sKKsISIPPRM)+KK<!+;YcRYY[MUBd  	!C$C 	!
 t9q=KKR!xx~E Ks   	I3r@   )r-   r.   r/   r   rd   floatr   r   r,   r+   rT   rT      s*     #'@@ 5[4@r,   rT   )r   r>   r6   rF   rJ   rP   rR   rT   )numpyrg   configuration_utilsr   video_utilsr   autor   r   r	   glm4v.image_processing_glm4vr
   !glm4v.image_processing_glm4v_fastr   glm4v.modeling_glm4vr   r   r   glm4v.processing_glm4vr   glm4v.video_processing_glm4vr   r   r6   r>   rF   rJ   rP   rR   rT   __all__r   r,   r+   <module>r      s      3 ( 8 8 > G b b 3 >P## P#f+0 +H* H	%B 	bn b
	. 		6 	A. AH	r,   