
    qi&                         d Z ddlZddlZddlZddlmZ ddlmZm	Z	m
Z
mZmZmZ ddlmZmZ ddlmZmZ ddlmZmZ dd	lmZmZmZ d
dlmZ  G d ded      Z eded       G d de             ZdgZy)z#video processor class for GLM-4.1V.    N   )BatchFeature)OPENAI_CLIP_MEANOPENAI_CLIP_STDChannelDimensionPILImageResamplingSizeDictget_image_size)UnpackVideosKwargs)
TensorTypeadd_start_docstrings)BASE_VIDEO_PROCESSOR_DOCSTRINGBaseVideoProcessor)VideoMetadatagroup_videos_by_shapereorder_videos   )smart_resizec                   J    e Zd ZU eeef   ed<   eed<   eed<   eed<   eed<   y)Glm4vVideoProcessorInitKwargsmax_image_size
patch_sizetemporal_patch_size
merge_sizemax_durationN)__name__
__module____qualname__dictstrint__annotations__     b/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/glm4v/video_processing_glm4v.pyr   r   %   s&    cN"OOr%   r   F)totalzfConstructs a fast GLM-4V image processor that dynamically resizes videos based on the original videos.aj  
        patch_size (`int`, *optional*, defaults to 14):
            The spacial patch size of the vision encoder.
        temporal_patch_size (`int`, *optional*, defaults to 2):
            The temporal patch size of the vision encoder.
        merge_size (`int`, *optional*, defaults to 2):
            The merge size of the vision encoder to llm encoder.
    c                       e Zd Zej                  ZdddZddiZeZ	e
ZdZdZdZdZdZdZdZdZdZeZd	ZdZd
dgZdee   f fdZ	 d$dedz  def fdZ	 d$dede e!z  dz  fdZ"dddej                  dddddddddfde#e$jJ                     de&de&dedz  dede&de!de&de!e#e!   z  dz  de!e#e!   z  dz  de dz  d e dz  d!e dz  d"e'e(z  dz  fd#Z) xZ*S )%Glm4vVideoProcessori 1  i )shortest_edgelongest_edger+   T      i,     pixel_values_videosvideo_grid_thwkwargsc                     t        |   di | | j                  D| j                  j                  dd       | j                  j                  dd       t	        d      y y )Nr*   r+   :size must contain 'shortest_edge' and 'longest_edge' keys.r$   )super__init__sizeget
ValueError)selfr1   	__class__s     r&   r5   zGlm4vVideoProcessor.__init__N   s^    "6"99 IIMM/408DIIMM.Z^<_<gYZZ =h !r%   Nr6   returnc                 P    |d|vsd|vrt        d      t        |   dd|i|S )z
        Update kwargs that need further processing before being validated
        Can be overridden by subclasses to customize the processing of kwargs.
        r*   r+   r3   r6   r$   )r8   r4   _further_process_kwargs)r9   r6   r1   r:   s      r&   r=   z+Glm4vVideoProcessor._further_process_kwargsU   s>     !<VZ@ZYZZw.CDCFCCr%   metadatafpsc                    |t        |dd      t        d      |j                  }||n| j                  }|dz
  }|j                  xs t        ||j                  z        dz   }|| j                  k  rpt        t        j                  ||z              }t        |      D 	cg c]:  }	t        |t        t        j                  |	|j                  z  |z                    < }
}	nt        | j                  |z        }||k\  rt        t        |            }
n[t        j                  d||d      }|D cg c]7  }t        |t        t        j                  ||j                  z                    9 }
}t!               g }}|
D ])  }||vs|j#                  |       |j%                  |       + t'        |      dz  r|j%                  |d          t        j(                  |      S c c}	w c c}w )	a  
        Args:
            metadata (`VideoMetadata`):
                Metadata of the video containing information about total duration, fps and total number of frames.
            fps (`int` or `float`, *optional*):
                Target frames to sample per second. Defaults to `self.fps`.
        Returns:
            np.ndarray:
                Indices to sample video frames.
        Nr?   zAsked to sample frames per second but no video metadata was provided which is required when sampling in GLM4V. Please pass in `VideoMetadata` object or set `do_sample_frames=False`r   r   T)endpoint)getattrr8   total_num_framesr?   durationroundr   r"   mathfloorrangeminceillistnplinspacesetaddappendlenarray)r9   r>   r?   r1   total_framesrequested_fpsmax_frame_idxrE   niframe_indicesnum_samplestarget_secondstseenuniqidxs                    r&   sample_framesz!Glm4vVideoProcessor.sample_framesc   s     wx=EX 
  00"DHH$q($$Omhll.J(Ka(Ot(((DJJx-789AkpqrkstfgSDIIa(,,>NQ^>^4_0`atMtd//-?@Kl* $U<%8 9!#Q+PT!U_m nZ[]C		!hllBR8S4T!U n nUBd  	!C$C 	!
 t9q=KKR!xx~% u !os   "?G"3<G'gp?videosdo_convert_rgb	do_resizeinterpolation
do_rescalerescale_factordo_normalize
image_mean	image_stdr   r   r   return_tensorsc                    t        |      \  }}i }|j                         D ]  \  }}|j                  \  }}}}}|||}}}|rwt        ||||||z  |j                  |j
                        \  }}|j                  ||z  |||      }| j                  |t        ||      |      }|j                  |||||      }|||<    t        ||      }t        |      \  }}i } i }!|j                         D ]  \  }}t        |d   t        j                        \  }}| j                  |||||	|
      }|}"|"j                  d   |z  dk7  r:|"d d dd f   j                  d|dz
  ddd      }#t        j                   |"|#gd      }"|"j                  d d	 \  }$}%}&|%|z  }%||z  ||z  }(}'|"j                  |$|%||&|'|z  |||(|z  ||
      }"|"j#                  ddd
dddd	ddd
      }"|"j%                  |$|%|'z  |(z  |&|z  |z  |z        })|)| |<   |%|'|(gg|$z  |!|<     t        | |      }*t        |!|      }!t        j                   |*d      }+t        j&                  |!      },|+|,d}-t)        |-|      S )N)
num_framesheightwidthtemporal_factorfactor
min_pixels
max_pixels)rm   rn   )r6   rd   r   )channel_dimr   rB   )dimr               r-      	   )r/   r0   )datatensor_type)r   itemsshaper   r*   r+   viewresizer	   r   r
   r   FIRSTrescale_and_normalizerepeattorchcatpermutereshapetensorr   ).r9   ra   rb   rc   r6   rd   re   rf   rg   rh   ri   r   r   r   rj   r1   grouped_videosgrouped_videos_indexresized_videos_groupedr~   stacked_videosBTCHWrl   rm   rn   resized_heightresized_widthresized_videosprocessed_videos_groupedprocessed_gridspatchesrepeats
batch_sizegrid_tchannelgrid_hgrid_wflatten_patchesprocessed_videosr/   r0   r{   s.                                                 r&   _preprocesszGlm4vVideoProcessor._preprocess   s   $ 0EV/L,,!#%3%9%9%; 	;!E>*00MAq!Q()1aJ0<)!$7%
2#11#001- "0!4!4QUAq!!D!%"!}M"/ "- "
 "0!4!4Q1nm!\,:"5))	;* ((>@TU 0E^/T,,#% %3%9%9%; %	M!E>,:>!;LZjZpZp,q)NM "77
NL*V_N %G }}Q"55:!!RS&/004G!4KQPQSTU))Wg$6A>*1--*;'J22F+z9=J;VFFll#*$*$G ooaAq!Q1aCG%oo&(--
:ZGO />$U+'-vv&>%?*%LOE"K%	MN **BDXY(:NO#ii(8a@o6#6,

 >BBr%   )N)+r   r   r   r   BICUBICresampler6   r   r   rh   r   ri   rc   re   rg   rb   do_sample_framesr   r   r   r   r   valid_kwargsrl   r?   model_input_namesr   r5   r	   r    r=   r   r"   floatr`   rL   r   Tensorboolr!   r   r   __classcell__)r:   s   @r&   r)   r)   -   s    "))H&8KLD$&9:N!JIIJLNJLJ0LJ
C.0@A[(E!F [ !%DoD 
	D" #'00 5[40j  $ $,>,F,F )!1504!%*.!%26aCU\\"aC aC 	aC
 oaC *aC aC aC aC DK'$.aC 4;&-aC $JaC !4ZaC $JaC j(4/aCr%   r)   ) __doc__rG   numpyrM   r   image_processing_utilsr   image_utilsr   r   r   r   r	   r
   processing_utilsr   r   utilsr   r   video_processing_utilsr   r   video_utilsr   r   r   image_processing_glm4vr   r   r)   __all__r$   r%   r&   <module>r      s    *    2  5 5 X O O 0L  l"}C, }C}C@ !
!r%   