
    qi-                         d dl Zd dlZddlmZ ddlmZmZmZm	Z	m
Z
mZ ddlmZmZ ddlmZmZ ddlmZmZ ddlmZmZmZ d	d
lmZ  G d ded      Z eded       G d de             ZdgZy)    N   )BatchFeature)OPENAI_CLIP_MEANOPENAI_CLIP_STDChannelDimensionPILImageResamplingSizeDictget_image_size)UnpackVideosKwargs)
TensorTypeadd_start_docstrings)BASE_VIDEO_PROCESSOR_DOCSTRINGBaseVideoProcessor)VideoMetadatagroup_videos_by_shapereorder_videos   )smart_resizec                   J    e Zd ZU eeef   ed<   eed<   eed<   eed<   eed<   y)Glm46VVideoProcessorInitKwargsmax_image_size
patch_sizetemporal_patch_size
merge_sizemax_durationN)__name__
__module____qualname__dictstrint__annotations__     d/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/glm46v/video_processing_glm46v.pyr   r   )   s&    cN"OOr%   r   F)totalzfConstructs a fast GLM-4V image processor that dynamically resizes videos based on the original videos.aj  
        patch_size (`int`, *optional*, defaults to 14):
            The spacial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            The temporal patch size of the vision encoder.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    c                       e Zd Zej                  ZdddZddiZeZ	e
ZdZdZdZdZdZdZdZdZdZeZd	ZdZd
dgZdee   f fdZ	 d$dedz  def fdZ	 d$dede e!z  dz  fdZ"dddej                  dddddddddfde#e$jJ                     de&de&dedz  dede&de!de&de!e#e!   z  dz  de!e#e!   z  dz  de dz  d e dz  d!e dz  d"e'e(z  dz  fd#Z) xZ*S )%Glm46VVideoProcessori 1  i )shortest_edgelongest_edger+   T      ,     pixel_values_videosvideo_grid_thwkwargsc                     t        |   di | | j                  D| j                  j                  dd       | j                  j                  dd       t	        d      y y )Nr*   r+   :size must contain 'shortest_edge' and 'longest_edge' keys.r$   )super__init__sizeget
ValueError)selfr2   	__class__s     r&   r6   zGlm46VVideoProcessor.__init__R   s^    "6"99 IIMM/408DIIMM.Z^<_<gYZZ =h !r%   Nr7   returnc                 P    |d|vsd|vrt        d      t        |   dd|i|S )z
        Update kwargs that need further processing before being validated
        Can be overridden by subclasses to customize the processing of kwargs.
        r*   r+   r4   r7   r$   )r9   r5   _further_process_kwargs)r:   r7   r2   r;   s      r&   r>   z,Glm46VVideoProcessor._further_process_kwargsY   s>     !<VZ@ZYZZw.CDCFCCr%   metadatafpsc                    |t        |dd      t        d      |j                  }|dz
  }|j                  xs t	        ||j
                  z        dz   }dddd}d}d	}	t        ||	      }
|
d
k  r|d
   }n|
dk  r|d   }n|d	   }t        |
|z  | j                  z        }t        ||      }d|j
                  z  }t        |      D cg c]  }||z  	 }}t        |      }||k  r/t        j                  d|dz
  |t              j                         }nLg }d}d| j                  |z  z  }t        |      D ](  }||   |k\  s||z  }|j                  |       ||k\  s( n t        |      |k  rVt        |      dk(  rdt        |dz
  d      }}n
|d   |d   }}t        j                  |||t              j                         }n<t        |      |kD  r.t        j                  d|dz
  |t              j                         }t!               g }}|D ])  }||vs|j#                  |       |j                  |       + t        |      dz  r|j                  |d          t        j$                  |      S c c}w )a  
        Args:
            metadata (`VideoMetadata`):
                Metadata of the video containing information about total duration, fps and total number of frames.
            fps (`int` or `float`, *optional*):
                Target frames to sample per second. Defaults to `self.fps`.
        Returns:
            np.ndarray:
                Indices to sample video frames.
        Nr@   zAsked to sample frames per second but no video metadata was provided which is required when sampling in Glm46V. Please pass in `VideoMetadata` object or set `do_sample_frames=False`r   r   g      ?)   r.   `	  i  rC   rB   r.   r   )dtype)getattrr9   total_num_framesdurationroundr@   minr"   r   rangenplinspacetolistappendlenmaxsetaddarray)r:   r?   r@   r2   total_framesmax_frame_idxrH   DYNAMIC_FPS_THRESMAX_FRAME_COUNT_DYNAMICMAX_DURATIONeffective_duration
target_fps	extract_tduration_per_framei
timestamps
max_secondframe_indicescurrent_secondinv_fpsframe_indexstartendseenuniqidxs                             r&   sample_framesz"Glm46VVideoProcessor.sample_framesg   s     wx=EX 
  00$q($$Omhll.J(Ka(O!"#6"% <8#*2.J3&*3/J*40J*Z7$:R:RRS		#:;	-6;L6IJa,,J
J]
)#KK<!+;YcRYY[MMN433j@AG$\2 k*n<"g-N!((5%3 }	)=!Q&L1$4a 8s*1-}R/@sKKsISIPPRM)+KK<!+;YcRYY[MUBd  	!C$C 	!
 t9q=KKR!xx~E Ks   	I3gp?videosdo_convert_rgb	do_resizeinterpolation
do_rescalerescale_factordo_normalize
image_mean	image_stdr   r   r   return_tensorsc                    t        |      \  }}i }|j                         D ]  \  }}|j                  \  }}}}}|||}}}|rwt        ||||||z  |j                  |j
                        \  }}|j                  ||z  |||      }| j                  |t        ||      |      }|j                  |||||      }|||<    t        ||      }t        |      \  }}i } i }!|j                         D ]  \  }}t        |d   t        j                        \  }}| j                  |||||	|
      }|}"|"j                  d   |z  dk7  r:|"d d dd f   j                  d|dz
  ddd      }#t        j                   |"|#gd      }"|"j                  d d	 \  }$}%}&|%|z  }%||z  ||z  }(}'|"j                  |$|%||&|'|z  |||(|z  ||
      }"|"j#                  ddd
dddd	ddd
      }"|"j%                  |$|%|'z  |(z  |&|z  |z  |z        })|)| |<   |%|'|(gg|$z  |!|<     t        | |      }*t        |!|      }!t        j                   |*d      }+t        j&                  |!      },|+|,d}-t)        |-|      S )N)
num_framesheightwidthtemporal_factorfactor
min_pixels
max_pixels)rw   rx   )r7   rn   r   )channel_dimr   rE   )dimr               r-      	   )r0   r1   )datatensor_type)r   itemsshaper   r*   r+   viewresizer	   r   r
   r   FIRSTrescale_and_normalizerepeattorchcatpermutereshapetensorr   ).r:   rk   rl   rm   r7   rn   ro   rp   rq   rr   rs   r   r   r   rt   r2   grouped_videosgrouped_videos_indexresized_videos_groupedr   stacked_videosBTCHWrv   rw   rx   resized_heightresized_widthresized_videosprocessed_videos_groupedprocessed_gridspatchesrepeats
batch_sizegrid_tchannelgrid_hgrid_wflatten_patchesprocessed_videosr0   r1   r   s.                                                 r&   _preprocessz Glm46VVideoProcessor._preprocess   s   $ 0EV/L,,!#%3%9%9%; 	;!E>*00MAq!Q()1aJ0<)!$7%
2#11#001- "0!4!4QUAq!!D!%"!}M"/ "- "
 "0!4!4Q1nm!\,:"5))	;* ((>@TU 0E^/T,,#% %3%9%9%; %	M!E>,:>!;LZjZpZp,q)NM "77
NL*V_N %G }}Q"55:!!RS&/004G!4KQPQSTU))Wg$6A>*1--*;'J22F+z9=J;VFFll#*$*$G ooaAq!Q1aCG%oo&(--
:ZGO />$U+'-vv&>%?*%LOE"K%	MN **BDXY(:NO#ii(8a@o6#6,

 >BBr%   )N)+r   r   r   r   BICUBICresampler7   r   r   rr   r   rs   rm   ro   rq   rl   do_sample_framesr   r   r   r   r   valid_kwargsrv   r@   model_input_namesr   r6   r	   r    r>   r   r"   floatrj   listr   Tensorboolr!   r   r   __classcell__)r;   s   @r&   r)   r)   1   s    "))H&8KLD$&9:N!JIIJLNJLJ1LJ
C.0@A[(F!G [ !%DoD 
	D" #'JJ 5[4J^  $ $,>,F,F )!1504!%*.!%26aCU\\"aC aC 	aC
 oaC *aC aC aC aC DK'$.aC 4;&-aC $JaC !4ZaC $JaC j(4/aCr%   r)   )numpyrL   r   image_processing_utilsr   image_utilsr   r   r   r   r	   r
   processing_utilsr   r   utilsr   r   video_processing_utilsr   r   video_utilsr   r   r   image_processing_glm46vr   r   r)   __all__r$   r%   r&   <module>r      sy   ,   2  5 5 X O O 1\  l"WC- WCWCt "
"r%   