import math
from collections.abc import Callable
from dataclasses import dataclass
from typing import Any

import torch
from torch import nn

from ... import initialization as init
from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
    CausalLMOutputWithPast,
    Seq2SeqLMOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import apply_chunking_to_forward
from ...utils import (
    ModelOutput,
    TransformersKwargs,
    auto_docstring,
    can_return_tuple,
    logging,
    torch_int,
)
from ...utils.generic import OutputRecorder, capture_outputs, merge_with_config_defaults
from ..auto import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from .configuration_instructblipvideo import (
    InstructBlipVideoConfig,
    InstructBlipVideoQFormerConfig,
    InstructBlipVideoVisionConfig,
)


logger = logging.get_logger(__name__)


class InstructBlipVideoVisionEmbeddings(nn.Module):
    def __init__(self, config: InstructBlipVideoVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
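
        Example (illustrative): for a checkpoint pre-trained at 224x224 with patch size 14 (a 16x16
        grid of patches), passing 448x448 frames with `interpolate_pos_encoding=True` bicubically
        resizes the position-embedding grid to 32x32 before it is added to the patch embeddings.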
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embedding.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding

        class_pos_embed = self.position_embedding[:, :1]
        patch_pos_embed = self.position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        if interpolate_pos_encoding:
            position_embedding = self.interpolate_pos_encoding(embeddings, height, width)
        else:
            position_embedding = self.position_embedding
        embeddings = embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype)

        return embeddings


class InstructBlipVideoQFormerEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

        self.config = config

    def forward(
        self,
        input_ids=None,
        position_ids=None,
        query_embeds=None,
        past_key_values_length=0,
    ):
        if input_ids is not None:
            seq_length = input_ids.size()[1]
        else:
            seq_length = 0

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()

        if input_ids is not None:
            embeddings = self.word_embeddings(input_ids)
            position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
            embeddings = embeddings + position_embeddings

            if query_embeds is not None:
                embeddings = torch.cat((query_embeds, embeddings), dim=1)
        else:
            embeddings = query_embeds

        embeddings = embeddings.to(self.layernorm.weight.dtype)
        embeddings = self.layernorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


@auto_docstring
class InstructBlipVideoPreTrainedModel(PreTrainedModel):
    config: InstructBlipVideoConfig
    base_model_prefix = "blip"
    input_modalities = ["video", "text"]
    supports_gradient_checkpointing = True
    _supports_attention_backend = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True

    _no_split_modules = [
        "InstructBlipVideoQFormerEmbeddings",
        "InstructBlipVideoAttention",
        "InstructBlipVideoQFormerMultiHeadAttention",
        "InstructBlipVideoQFormerSelfOutput",
    ]

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        super()._init_weights(module)
        factor = self.config.initializer_range
        if isinstance(module, InstructBlipVideoVisionEmbeddings):
            init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
            init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
        elif isinstance(module, (InstructBlipVideoForConditionalGeneration, InstructBlipVideoModel)):
            init.zeros_(module.query_tokens)
        elif isinstance(module, InstructBlipVideoQFormerEmbeddings):
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class InstructBlipVideoAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.is_causal = False
        self.attention_dropout = config.attention_dropout

        # small tweak here compared to CLIP: no bias on the fused qkv projection
        # (the query/value biases are concatenated in below when `config.qkv_bias` is set)
        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False)

        if config.qkv_bias:
            q_bias = nn.Parameter(torch.zeros(self.embed_dim))
            v_bias = nn.Parameter(torch.zeros(self.embed_dim))
        else:
            q_bias = None
            v_bias = None

        if q_bias is not None:
            qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
            self.qkv.bias = nn.Parameter(qkv_bias)

        self.projection = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""
        bsz, tgt_len, embed_dim = hidden_states.size()

        mixed_qkv = self.qkv(hidden_states)
        mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute(
            2, 0, 3, 1, 4
        )
        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            None,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scale,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        output = self.projection(attn_output)

        return output, attn_weights


class InstructBlipVideoMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class InstructBlipVideoEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: InstructBlipVideoConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = InstructBlipVideoAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = InstructBlipVideoMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(self, hidden_states: torch.Tensor, **kwargs: Unpack[TransformersKwargs]) -> torch.Tensor:
        residual = hidden_states
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, _ = self.self_attn(hidden_states=hidden_states, **kwargs)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class InstructBlipVideoEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`InstructBlipVideoEncoderLayer`].

    Args:
        config (`InstructBlipVideoConfig`):
            The corresponding vision configuration for the `InstructBlipVideoEncoder`.
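
    A minimal construction sketch (illustrative only; `vision_config` stands in for a real
    `InstructBlipVideoVisionConfig`, and 257 assumes a 16x16 patch grid plus the CLS embedding):

    ```python
    >>> import torch

    >>> encoder = InstructBlipVideoEncoder(vision_config)
    >>> inputs_embeds = torch.randn(1, 257, vision_config.hidden_size)
    >>> last_hidden_state = encoder(inputs_embeds=inputs_embeds).last_hidden_state
    ```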
    """

    def __init__(self, config: InstructBlipVideoConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([InstructBlipVideoEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(self, inputs_embeds, **kwargs: Unpack[TransformersKwargs]) -> BaseModelOutput:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(hidden_states, **kwargs)

        return BaseModelOutput(last_hidden_state=hidden_states)


class InstructBlipVideoVisionModel(InstructBlipVideoPreTrainedModel):
    main_input_name = "pixel_values"
    config: InstructBlipVideoVisionConfig
    _can_record_outputs = {
        "hidden_states": InstructBlipVideoEncoderLayer,
        "attentions": InstructBlipVideoAttention,
    }

    def __init__(self, config: InstructBlipVideoVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = InstructBlipVideoVisionEmbeddings(config)
        self.encoder = InstructBlipVideoEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.post_init()

    @capture_outputs
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | BaseModelOutputWithPooling:
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        encoder_outputs = self.encoder(inputs_embeds=hidden_states, **kwargs)

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )

    def get_input_embeddings(self):
        return self.embeddings


class InstructBlipVideoQFormerMultiHeadAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
                % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        if is_cross_attention:
            self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
            self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
        else:
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.save_attention = False

    def save_attn_gradients(self, attn_gradients):
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        **kwargs: Unpack[TransformersKwargs],
    ):
        # If this is instantiated as a cross-attention module, the keys and values come from an
        # encoder; the attention mask needs to be such that the encoder's padding tokens are not
        # attended to.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        mixed_query_layer = self.query(hidden_states)
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        attention_scores_dtype = attention_scores.dtype

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the model's forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores).to(attention_scores_dtype)

        if is_cross_attention and self.save_attention:
            self.save_attention_map(attention_probs)
            attention_probs.register_hook(self.save_attn_gradients)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs_dropped = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs_dropped, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        return context_layer, attention_probs


class InstructBlipVideoQFormerSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class InstructBlipVideoQFormerAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.attention = InstructBlipVideoQFormerMultiHeadAttention(config, is_cross_attention)
        self.output = InstructBlipVideoQFormerSelfOutput(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.FloatTensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.FloatTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        self_output, _ = self.attention(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            **kwargs,
        )
        attention_output = self.output(self_output, hidden_states)
        return attention_output


class InstructBlipVideoQFormerIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class InstructBlipVideoQFormerOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class InstructBlipVideoQFormerLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = InstructBlipVideoQFormerAttention(config)

        self.layer_idx = layer_idx

        if layer_idx % config.cross_attention_frequency == 0:
            self.crossattention = InstructBlipVideoQFormerAttention(config, is_cross_attention=True)
            self.has_cross_attention = True
        else:
            self.has_cross_attention = False

        self.intermediate = InstructBlipVideoQFormerIntermediate(config)
        self.output = InstructBlipVideoQFormerOutput(config)

        self.intermediate_query = InstructBlipVideoQFormerIntermediate(config)
        self.output_query = InstructBlipVideoQFormerOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        query_length=0,
        **kwargs: Unpack[TransformersKwargs],
    ):
        attention_output = self.attention(hidden_states, attention_mask=attention_mask, **kwargs)

        if query_length > 0:
            query_attention_output = attention_output[:, :query_length, :]

            if self.has_cross_attention:
                if encoder_hidden_states is None:
                    raise ValueError("encoder_hidden_states must be given for cross-attention layers")
                query_attention_output = self.crossattention(
                    query_attention_output,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    **kwargs,
                )

            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk_query,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                query_attention_output,
            )

            if attention_output.shape[1] > query_length:
                layer_output_text = apply_chunking_to_forward(
                    self.feed_forward_chunk,
                    self.chunk_size_feed_forward,
                    self.seq_len_dim,
                    attention_output[:, query_length:, :],
                )
                layer_output = torch.cat([layer_output, layer_output_text], dim=1)
        else:
            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                attention_output,
            )

        return layer_output

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output

    def feed_forward_chunk_query(self, attention_output):
        intermediate_output = self.intermediate_query(attention_output)
        layer_output = self.output_query(intermediate_output, attention_output)
        return layer_output


class InstructBlipVideoQFormerEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [InstructBlipVideoQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        query_length=0,
        **kwargs: Unpack[TransformersKwargs],
    ):
        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            hidden_states = layer_module(
                hidden_states,
                attention_mask,
                encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                query_length=query_length,
                **kwargs,
            )

        return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=hidden_states)


class InstructBlipVideoQFormerModel(InstructBlipVideoPreTrainedModel):
    """
    Querying Transformer (Q-Former), used in InstructBlipVideo. Slightly modified from BLIP-2 as it also takes the
    instruction as input.
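
    A minimal usage sketch (illustrative only; `qformer_config` stands in for a real
    `InstructBlipVideoQFormerConfig`, and 32 is the usual number of query tokens):

    ```python
    >>> import torch

    >>> qformer = InstructBlipVideoQFormerModel(qformer_config)
    >>> query_tokens = torch.zeros(1, 32, qformer_config.hidden_size)
    >>> instruction_ids = torch.tensor([[101, 2054, 2003, 102]])  # hypothetical instruction token ids
    >>> image_embeds = torch.randn(1, 257, qformer_config.encoder_hidden_size)
    >>> outputs = qformer(instruction_ids, query_embeds=query_tokens, encoder_hidden_states=image_embeds)
    >>> query_features = outputs.last_hidden_state[:, :32, :]  # the query slots come first
    ```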
    """

    _supports_attention_backend = False
    _supports_flash_attn = False
    _supports_sdpa = False
    _supports_flex_attn = False
    _can_record_outputs = {
        "hidden_states": InstructBlipVideoQFormerLayer,
        "attentions": [
            OutputRecorder(InstructBlipVideoQFormerMultiHeadAttention, index=1, layer_name=".attention")
        ],
        "cross_attentions": [
            OutputRecorder(InstructBlipVideoQFormerMultiHeadAttention, index=1, layer_name=".crossattention")
        ],
    }

    def __init__(self, config: InstructBlipVideoQFormerConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = InstructBlipVideoQFormerEmbeddings(config)
        self.encoder = InstructBlipVideoQFormerEncoder(config)

        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def get_extended_attention_mask(
        self,
        attention_mask: torch.Tensor,
        input_shape: tuple[int],
        device: torch.device,
        has_query: bool = False,
    ) -> torch.Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`tuple[int]`):
                The shape of the input to the model.
            device: (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
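
        For example, a padding mask `[1, 1, 0]` of shape `(1, 3)` is reshaped to `(1, 1, 1, 3)` and
        mapped through `(1.0 - mask) * -10000.0` to `[0.0, 0.0, -10000.0]`, so that adding it to the
        raw attention scores effectively removes the padded position from the softmax.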
        """
        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            # Provided a padding mask of dimensions [batch_size, seq_length];
            # make it broadcastable to [batch_size, num_heads, seq_length, seq_length].
            extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
            )

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for masked
        # positions, this creates a tensor which is 0.0 for positions to attend and -10000.0
        # for masked positions, which is added to the raw scores before the softmax.
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        query_embeds: torch.Tensor | None = None,
        encoder_hidden_states: torch.FloatTensor | None = None,
        encoder_attention_mask: torch.FloatTensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple[torch.FloatTensor] | BaseModelOutputWithPoolingAndCrossAttentions:
        r"""
        query_embeds (`torch.FloatTensor`  of shape `(batch_size, sequence_length, hidden_size)`):
            Hidden states to be used in the attention computation. If cross-attention,
            will be used for the query (i.e., key and value will use the encoder_hidden_states).
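
            With `query_embeds` of shape `(batch_size, num_query_tokens, hidden_size)` and `input_ids`
            of length `seq_len`, the returned hidden states cover `num_query_tokens + seq_len`
            positions, with the query slots placed first.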
        """
        if input_ids is None and query_embeds is None:
            raise ValueError("You have to specify query_embeds when input_ids is None")

        query_length = query_embeds.shape[1] if query_embeds is not None else 0

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            query_embeds=query_embeds,
        )

        input_shape = embedding_output.size()[:-1]
        batch_size, seq_length = input_shape
        device = embedding_output.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)

        # We can provide a self-attention mask of dimensions
        # [batch_size, from_seq_length, to_seq_length]
        # ourselves in which case we just need to make it broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # make it broadcastable to [batch_size, num_heads, seq_length, seq_length].
        if encoder_hidden_states is not None:
            if isinstance(encoder_hidden_states, list):
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
            else:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

            if isinstance(encoder_attention_mask, list):
                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
            elif encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            query_length=query_length,
            **kwargs,
        )
        sequence_output = encoder_outputs.last_hidden_state
        pooled_output = sequence_output[:, 0, :]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
        )


@dataclass
@auto_docstring(
    custom_intro="""
    Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`].
    """
)
class InstructBlipVideoForConditionalGenerationModelOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Language modeling loss from the language model.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head of the language model.
    vision_outputs (`BaseModelOutputWithPooling`):
        Outputs of the vision encoder.
    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
        Outputs of the Q-Former (Querying Transformer).
    language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
        Outputs of the language model.
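
    A sketch of how the grouped sub-outputs can be inspected (illustrative; `model` and `inputs`
    are placeholders):

    ```python
    >>> outputs = model(**inputs)
    >>> vision_hidden = outputs.vision_outputs.last_hidden_state
    >>> pooled_query = outputs.qformer_outputs.pooler_output
    >>> lm_logits = outputs.logits
    ```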
    """

    loss: tuple[torch.FloatTensor] | None = None
    logits: tuple[torch.FloatTensor] | None = None
    vision_outputs: BaseModelOutputWithPooling | None = None
    qformer_outputs: BaseModelOutputWithPoolingAndCrossAttentions | None = None
    language_model_outputs: CausalLMOutputWithPast | Seq2SeqLMOutput | None = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k]
            if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"]
            else getattr(self, k).to_tuple()
            for k in self.keys()
        )


@auto_docstring(
    custom_intro="""
    InstructBlipVideo base Model consisting of language model, qformer and vision encoder.
    """
)
class InstructBlipVideoModel(InstructBlipVideoPreTrainedModel):
    main_input_name = "pixel_values"
    _keep_in_fp32_modules = ["query_tokens"]

    def __init__(self, config: InstructBlipVideoConfig):
        super().__init__(config)

        self.vision_model = InstructBlipVideoVisionModel(config.vision_config)

        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
        self.qformer = InstructBlipVideoQFormerModel(config.qformer_config)

        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)
        self.language_model = AutoModel.from_config(config.text_config)

        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def _preprocess_accelerate(self):
        r"""
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
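
        An illustrative `device_map` that avoids the warning below by pinning the language model to
        its own device (the keys follow this class's attribute names):

        ```python
        >>> device_map = {"vision_model": 0, "qformer": 0, "query_tokens": 0, "language_projection": 0, "language_model": 1}
        >>> model = InstructBlipVideoModel.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map=device_map)
        ```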
        """
        hf_device_map = self.hf_device_map

        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
            # warn users about unexpected behavior when using multi-GPU + InstructBlipVideo + `accelerate`
            logger.warning(
                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
                " Please pass a `device_map` that contains `language_model` to remove this warning."
                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
                " more details on creating a `device_map` for large models."
            )

        if hasattr(self.language_model, "_hf_hook"):
            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility

    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
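
        The returned boolean mask has the same shape as `inputs_embeds` and is `True` exactly at the
        positions holding the video placeholder token (`config.video_token_id`), i.e. where the
        projected Q-Former features are later scattered into the text embedding sequence.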
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.video_token_id

        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        return special_image_mask

    @merge_with_config_defaults
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: torch.LongTensor | None = None,
        input_ids: torch.FloatTensor | None = None,
        attention_mask: torch.LongTensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
        interpolate_pos_encoding: bool = False,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput:
        r"""
        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
            to serve as text prompt, which the Q-Former model will encode.

            Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
            details.

            [What are input IDs?](../glossary#input-ids)
        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            Only relevant in case an encoder-decoder language model (like T5) is used.
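
        Example (illustrative sketch; `clip` is a placeholder for a `(num_frames, height, width, 3)`
        array of sampled video frames):

        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoModel

        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
        >>> model = InstructBlipVideoModel.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> inputs = processor(text="What is happening in the video?", images=clip, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> lm_outputs = outputs.language_model_outputs
        ```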
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # step 1: forward the video frames through the vision encoder,
        # flattening the frame dimension into the batch dimension
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        image_embeds = vision_outputs[0]

        # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # step 3: project the query output and scatter it into the language model inputs,
        # unflattening the frame dimension again
        language_model_inputs = self.language_projection(query_output)
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)

        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )

        return InstructBlipVideoForConditionalGenerationModelOutput(
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )


@dataclass
@auto_docstring
class BaseModelOutputWithVisionQformerOutputs(BaseModelOutput):
    r"""
    vision_outputs (`BaseModelOutputWithPooling`):
        Outputs of the vision encoder.
    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
        Outputs of the Q-Former (Querying Transformer).
    """

    vision_outputs: BaseModelOutputWithPooling | None = None
    qformer_outputs: BaseModelOutputWithPoolingAndCrossAttentions | None = None


@auto_docstring(
    custom_intro="""
    InstructBlipVideo Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    """
)
class InstructBlipVideoForConditionalGeneration(InstructBlipVideoPreTrainedModel, GenerationMixin):
    config: InstructBlipVideoConfig
    main_input_name = "pixel_values"
    _keep_in_fp32_modules = ["query_tokens"]

    def __init__(self, config: InstructBlipVideoConfig):
        super().__init__(config)

        self.vision_model = InstructBlipVideoVisionModel._from_config(config.vision_config)

        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
        self.qformer = InstructBlipVideoQFormerModel._from_config(config.qformer_config)

        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)

        if config.use_decoder_only_language_model:
            language_model = AutoModelForCausalLM.from_config(config.text_config)
        else:
            language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)
        self.language_model = language_model

        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def get_output_embeddings(self) -> nn.Module:
        return self.language_model.get_output_embeddings()

    def get_encoder(self, modality=None):
        if modality is None:
            return self.language_model.get_encoder()
        return super().get_encoder(modality)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def _preprocess_accelerate(self):
        r"""
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        """
        hf_device_map = self.hf_device_map

        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
            # warn users about unexpected behavior when using multi-GPU + InstructBlipVideo + `accelerate`
            logger.warning(
                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
                " Please pass a `device_map` that contains `language_model` to remove this warning."
                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
                " more details on creating a `device_map` for large models."
            )

        if hasattr(self.language_model, "_hf_hook"):
            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility

    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.video_token_id

        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        return special_image_mask

    @merge_with_config_defaults
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: torch.LongTensor | None = None,
        input_ids: torch.FloatTensor | None = None,
        attention_mask: torch.LongTensor | None = None,
        decoder_input_ids: torch.LongTensor | None = None,
        decoder_attention_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        labels: torch.LongTensor | None = None,
        return_dict: bool | None = None,
        interpolate_pos_encoding: bool = False,
        use_cache: bool | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> tuple | InstructBlipVideoForConditionalGenerationModelOutput:
        r"""
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.

        Examples:

        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the video
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # step 1: encode the video and project the Q-Former queries into the language model's embedding space
        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
            **kwargs,
        )

        inputs_embeds = self.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # step 2: scatter the projected video features into the video placeholder positions
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_video_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, language_model_inputs)

        # step 3: run the language model on the merged sequence
        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
            logits = outputs.logits if return_dict else outputs[0]
            loss = None
            if labels is not None:
                loss = self.loss_function(
                    logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
                )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                labels=labels,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
            loss = outputs.loss if return_dict else outputs[0]
            logits = outputs.logits if return_dict else outputs[1]

        if not return_dict:
            output = (logits, vision_outputs, query_outputs, outputs)
            return ((loss,) + output) if loss is not None else output

        return InstructBlipVideoForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )

    @torch.no_grad()
    def generate(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor | None = None,
        qformer_attention_mask: torch.LongTensor | None = None,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        interpolate_pos_encoding: bool = False,
        **generate_kwargs,
    ) -> torch.LongTensor:
        r"""
|
j                  }||| j
                  j                  g| j
                  j                  z  dz  }|| j
                  j                  j                  gz   }t        j                  |gt        j                  |j                        }|j                  |	d      } | j                         |      }|t        j                   |      }|j#                  |j                  |j$                        }| j'                  ||      }|j)                  ||      }||d	}| j*                  j
                  j,                  s||d
<    | j*                  j.                  di ||}|S )a  
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)` or
                `(batch_size, num_frames, num_channels, height, width)`): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

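        Example (an illustrative sketch, assuming `model`, `processor` and `inputs` were prepared as in the
        example shown in `forward` above):

        ```python
        >>> generated_ids = model.generate(**inputs, max_new_tokens=64)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        ```
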
        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        """
        if hasattr(self, "hf_device_map"):
            # preprocess for `accelerate`
            self._preprocess_accelerate()

        batch_size = pixel_values.shape[0]
        language_model_inputs = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )

        if input_ids is None:
            # BOS prompt preceded by one placeholder per query token and per frame
            # (videos are sampled at 4 frames)
            video_tokens = [self.config.video_token_id] * self.config.num_query_tokens * 4
            start_tokens = video_tokens + [self.config.text_config.bos_token_id]
            input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device)
            input_ids = input_ids.repeat(batch_size, 1)

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # scatter the projected video features into the placeholder positions
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_video_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, language_model_inputs)

        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
        if not self.language_model.config.is_encoder_decoder:
            inputs["input_ids"] = input_ids

        outputs = self.language_model.generate(**inputs, **generate_kwargs)
        return outputs

    def get_video_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: torch.LongTensor | None = None,
        interpolate_pos_encoding: bool = False,
        return_dict: bool = False,
        **kwargs,
    ):
        r"""
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, image_size, image_size)`):
            The tensors corresponding to the input videos.
        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices.
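
        Returns:
            language_model_inputs (`torch.FloatTensor` of shape `(batch_size, num_query_tokens * num_frames, hidden_size)`):
                The projected Q-Former outputs, ready to be scattered into the language model's input embeddings.
                When `return_dict=True`, the vision encoder and Q-Former outputs are returned alongside them.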
        """
        batch_size, frames, channel, height, width = pixel_values.shape
        # fold the frame dimension into the batch dimension for the image encoder
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
            **kwargs,
        )
        image_embeds = vision_outputs[0]

        # forward the query tokens through the Q-Former, using the frame embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        # difference with BLIP-2 here: we also feed the instruction prompt to the Q-Former
        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        # repeat the instruction for every frame, since the frames were folded into the batch dimension
        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=True,
            **kwargs,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # project the Q-Former output into the language model's embedding space
        language_model_inputs = self.language_projection(query_output)
        # unfold the frame dimension: each video contributes `num_query_tokens * num_frames` embeddings
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        if return_dict:
            return language_model_inputs, vision_outputs, query_outputs
        return language_model_inputs
 ;?.22659:>26)-,0*.#').!%P
''P
 !,,P
 !& 0 04 7	P

 $$t+P
 ((4/P
 !++d2P
 !& 0 04 7P
 ((4/P
  $;P
 #TkP
   4'P
 D[P
 #'P
 $;P
  +,!P
" 
E	E#P
  P
d U]]_ 6::>-12626).D''D !++d2D !& 0 04 7	D
 ##d*D ((4/D ((4/D #'D 
		D DL 
 ;?05E''E !++E !& 0 04 7	E
 #'+E +,E 
8	8E  ErC   r   )r  r   r  r   r   )r   )RrB  collections.abcr   dataclassesr   typingr   r7   r    r   r   activationsr	   
generationr
   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   utils.genericr   utils.output_capturingr   r   autor!   r"   r#   configuration_instructblipvideor%   r&   r'   
get_loggerro   r  r%  r)   ry   r   rr   floatr   r   r   r   r  r  r   r   r]  rd  rj  rn  r  r  r  r   r  r   __all__r  rC   rB   <module>r:     s  ,  $ !    & ! ) B 9  G & 6 j j 7 E I I  
		H	%G		 GT/ /d i i iR %II%<<% 
% <<	%
 LL4'% % %.G) G)T299 $> @@ryy @@3#C 3l^. ^.B  		  2299 RYY R$> Rj!
bii !
HY
$D Y
x 

; 
 
: 
{
= {

{
| 	P.H 	P  	P u0PRa uuprC   