
    qi)                         d Z ddlmZ ddlmZ ddlmZmZmZm	Z	m
Z
 ddlmZmZmZ ddlmZ  ej"                  e      Ze G d de             ZdgZy	)
zq
Processor class for InstructBLIP. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former.
   )BatchFeature)ProcessorMixin)
AddedTokenPaddingStrategyPreTokenizedInput	TextInputTruncationStrategy)
TensorTypeauto_docstringlogging)
VideoInputc            $           e Zd Zd fd	Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddedz  deez  ee   z  ee   z  de	de	e
z  ez  de	e
z  ez  dedz  d	ed
edz  de	dz  de	de	de	de	de	de	de
ez  dz  def"d       Zed        Z xZS )InstructBlipVideoProcessorNc                     t        |d      s2t        ddd      | _        |j                  | j                  gd       n|j                  | _        || _        t
        |   |||       y)a&  
        qformer_tokenizer (`AutoTokenizer`):
            An instance of ['PreTrainedTokenizer`]. The Q-Former tokenizer is a required input.
        num_query_tokens (`int`, *optional*):
            Number of tokens used by the Qformer as queries, should be same as in model's config.
        video_tokenz<video>FT)
normalizedspecial)special_tokensN)hasattrr   r   
add_tokensnum_query_tokenssuper__init__)selfvideo_processor	tokenizerqformer_tokenizerr   kwargs	__class__s         t/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/processing_instructblipvideo.pyr   z#InstructBlipVideoProcessor.__init__$   sd     y-0))tTD  $"2"2!3D I(44D 0)5FG    imagestextadd_special_tokenspadding
truncation
max_lengthstridepad_to_multiple_ofreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_token_type_idsreturn_lengthverbosereturn_tensorsreturnc                    ||t        d      i }|5t        |t              r|g}n.t        |t              st        |d   t              st        d       | j                  d||||||||	|
||||||d|}|j                  d      |d<   |j                  d      |d<   ||| j                  z  } | j                  d||||||||	|
|||||d d|}|c| j                  j                  | j                  z  d	z  }| j                  |d
|	|
||||d 	      }|D ]  }||   D cg c]
  }||   |z    c}||<     |j                  |       |$| j                  ||      }|j                  |       t        ||      }|S c c}w )Nz3You have to specify at least one of images or text.    zAInvalid input text. Please provide a string, or a list of strings)r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   	input_idsqformer_input_idsattention_maskqformer_attention_mask   F)r$   r*   r+   r,   r-   r.   r/   r1   )r1   )tensor_type )
ValueError
isinstancestrlistr   popr   r   r   contentupdater   r   )r   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r   encodingqformer_text_encodingtext_encodingvideo_tokensvideo_text_encodingksampleimage_encodings                             r    __call__z#InstructBlipVideoProcessor.__call__3   s   * >dlRSS$$vd+JtAw4L !dee$:D$:$: %#5%%#5&;*C+E'=&;+-%  !%!$ -B,E,Ek,RH()1F1J1JK[1\H-. %d333
*DNN #5%%#5&;*C+E'=&;+#  !M& !#//77$:O:OORSS&*nn ',*?.G/I+A*?"/#' '5 
'# ' hAVcdeVf'gF(;A(>(G'gM!$hOOM*!11&1XNOON+nE (hs   "E?c                 r    | j                   j                  }| j                  j                  }ddg}||z   |z   S )Nr6   r8   )r   model_input_namesr   )r   tokenizer_input_namesvideo_processor_input_namesqformer_input_namess       r    rM   z,InstructBlipVideoProcessor.model_input_names   sB     $ @ @&*&:&:&L&L#24LM$'BBEXXXr!   )N)NNTFNNr4   NNFFFFFTN)__name__
__module____qualname__r   r   r   r   r   r?   boolr>   r   r	   intr
   r   rK   propertyrM   __classcell__)r   s   @r    r   r   "   si   H  %)Z^#'056:!%)--1*/+0',&+#26#`T!` ++d9o=EV@WW` !	`
 o-` 3J!33` $J` `  $J`  $d{` $(` %)` !%`  $` `  !`" j(4/#`& 
'` `D Y Yr!   r   N)__doc__image_processing_utilsr   processing_utilsr   tokenization_utils_baser   r   r   r   r	   utilsr
   r   r   video_utilsr   
get_loggerrQ   loggerr   __all__r;   r!   r    <module>ra      sg    3 .  9 8 % 
		H	% xY xY xYv (
(r!   