
    qi                         d Z ddlmZ ddlZddlmc mc mZ ddl	m
Z
 ddlmZmZmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZmZmZ  G d ded      Z G d de      ZdgZy)z(Fast Video processor class for InternVL.    )OptionalN   )BatchFeature)OPENAI_CLIP_MEANOPENAI_CLIP_STDPILImageResamplingSizeDict)UnpackVideosKwargs)
TensorType)BaseVideoProcessor)VideoMetadatagroup_videos_by_shapereorder_videosc                   $    e Zd ZU eez  ez  ed<   y) InternVLVideoProcessorInitKwargsinitial_shiftN)__name__
__module____qualname__boolfloatint__annotations__     h/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/internvl/video_processing_internvl.pyr   r      s    %<#%%r   r   F)totalc                   4    e Zd Zej                  ZeZeZ	dddZ
dZdZdZdZdZdZeZdee   f fdZ	 	 	 dded	edz  d
eez  dz  deez  ez  dz  fdZ	 dded   dedededed   dedededededeee   z  dz  deee   z  dz  deez  dz  defdZ  xZ!S ) InternVLVideoProcessori  )heightwidthTFkwargsc                 $    t        |   di | y )Nr   )super__init__)selfr#   	__class__s     r   r&   zInternVLVideoProcessor.__init__.   s    "6"r   Nmetadata
num_framesfpsr   c                 d   ||n| j                   }||n| j                  }|j                  }|6|4||j                  t	        d      t        ||j                  z  |z        }|du r||z  dz  }||kD  rt	        d| d| d      t        j                  ||||z        j                         }|S )a  
        Default sampling function which uniformly samples the desired number of frames between 0 and total number of frames.
        If `fps` is passed along with metadata, `fps` frames per second are sampled uniformty. Arguments `num_frames`
        and `fps` are mutually exclusive.

        Args:
            metadata (`VideoMetadata`):
                Metadata of the video containing information about total duration, fps and total number of frames.
            num_frames (`int`, *optional*):
                Maximum number of frames to sample. Defaults to `self.num_frames`.
            fps (`int` or `float`, *optional*):
                Target frames to sample per second. Defaults to `self.fps`.
            initial_shift (`bool`, `float` or `int`, defaults to `self.initial_shift`):
                The initial shift to apply when sampling frames. If `True`, the shift is set so that frames are sampled from the middle of the video.

        Returns:
            np.ndarray:
                Indices to sample video frames.
        zAsked to sample `fps` frames per second but no video metadata was provided which is required when sampling with `fps`. Please pass in `VideoMetadata` object or use a fixed `num_frames` per input videoT   z(Video can't be sampled. The `num_frames=z` exceeds `total_num_frames=z`. )r*   r   total_num_framesr+   
ValueErrorr   torcharange)r'   r)   r*   r+   r   r#   r.   indicess           r   sample_framesz$InternVLVideoProcessor.sample_frames1   s    6 $.#9Zt
)6)BHZHZ#44 #/8<<#7 h  -<sBCJD ,z9A=M((::,Fbcsbttwx  ,,}.>@PS]@]^bbdr   videosztorch.Tensordo_convert_rgb	do_resizesizeinterpolationztvF.InterpolationModedo_center_crop	crop_size
do_rescalerescale_factordo_normalize
image_mean	image_stdreturn_tensorsreturnc           	         t        |      \  }}i }|j                         D ]3  \  }}|r| j                  |      }|r| j                  |||      }|||<   5 t	        ||      }t        |      \  }}i }|j                         D ]4  \  }}|r| j                  ||      }| j                  |||	|
||      }|||<   6 t	        ||      }t        d|i|      S )N)r7   r8   pixel_values_videos)datatensor_type)r   itemsconvert_to_rgbresizer   center_croprescale_and_normalizer   )r'   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   r#   grouped_videosgrouped_videos_indexresized_videos_groupedshapestacked_videosresized_videosprocessed_videos_groupedprocessed_videoss                          r   _preprocessz"InternVLVideoProcessor._preprocessd   s   $ 0EV/L,,!#%3%9%9%; 	;!E>!%!4!4^!D!%^$Vc!d,:"5)	; ((>@TU 0E^/T,,#% %3%9%9%; 	=!E>!%!1!1.)!L!77
NL*V_N /=$U+	= **BDXY"79I!JXfggr   )NNN)N)"r   r   r   r   BICUBICresampler   r>   r   r?   r7   r6   r;   r=   r5   r   do_sample_framesr   valid_kwargsr
   r&   r   r   r   r   r3   listr	   r   strr   r   rS   __classcell__)r(   s   @r   r    r    !   s~   !))H!JIC(DIJLNM3L#(H!I # "&"&3711 $J1 5[4	1
 e|c)D01B 37+h^$+h +h 	+h
 +h   78+h +h +h +h +h +h DK'$.+h 4;&-+h j(4/+h  
!+hr   r    )__doc__typingr   r0   $torchvision.transforms.v2.functional
transformsv2
functionaltvFimage_processing_utilsr   image_utilsr   r   r   r	   processing_utilsr
   r   utilsr   video_processing_utilsr   video_utilsr   r   r   r   r    __all__r   r   r   <module>ri      sX    /   2 2 2 Z Z 4  8 O O&|5 &nh/ nhb $
$r   