
    qik                        d dl Z d dlmZmZ d dlmZmZmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ  ej<                  e      Z  G d de      Z! G d de      Z" G d de      Z# G d de	      Z$ G d de      Z% G d de
      Z& G d de      Z' G d de      Z( G d de      Z)g dZ*y)     N)InstructBlipQFormerConfigInstructBlipVisionConfig)'BaseModelOutputWithVisionQformerOutputs$InstructBlipForConditionalGeneration/InstructBlipForConditionalGenerationModelOutputInstructBlipModelInstructBlipPreTrainedModelInstructBlipQFormerModelInstructBlipVisionModelTransformersKwargs   )PreTrainedConfig)FlashAttentionKwargs)BaseModelOutputWithPooling)!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)Unpack)auto_docstringcan_return_tuplelogging   )CONFIG_MAPPING
AutoConfigc                       e Zd Zy)InstructBlipVideoVisionConfigN__name__
__module____qualname__     q/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/modular_instructblipvideo.pyr   r   -       r    r   c                       e Zd Zy)InstructBlipVideoQFormerConfigNr   r   r    r!   r$   r$   1   r"   r    r$   c                   F     e Zd ZdZdZddiZeeedZ		 	 	 	 	 d fd	Z
 xZS )InstructBlipVideoConfiga
  
    [`InstructBlipVideoConfig`] is the configuration class to store the configuration of a
    [`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate a Instructblipvideo model according to the specified
    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the Instructblipvideo
    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`].
        qformer_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize any [`PreTrainedConfig`].
        num_query_tokens (`int`, *optional*, defaults to 32):
            The number of query tokens passed through the Transformer.

        video_token_index (`int`, *optional*):
            Token index of special video token.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     InstructBlipVideoVisionConfig,
    ...     InstructBlipVideoQFormerConfig,
    ...     OPTConfig,
    ...     InstructBlipVideoConfig,
    ...     InstructBlipVideoForConditionalGeneration,
    ... )

    >>> # Initializing a InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
    >>> configuration = InstructBlipVideoConfig()

    >>> # Initializing a InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
    >>> model = InstructBlipVideoForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a InstructBlipVideoConfig from a InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PreTrainedConfig

    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
    >>> vision_config = InstructBlipVideoVisionConfig()
    >>> qformer_config = InstructBlipVideoQFormerConfig()
    >>> text_config = OPTConfig()

    >>> config = InstructBlipVideoConfig(vision_config=vision_config, qformer_config=qformer_config, text_config=text_config)
    ```instructblipvideovideo_token_idvideo_token_index)text_configqformer_configvision_configc                    |#t        d          }t        j                  d       n0t        |t              r |j                  dd      }t        |   di |}| t               }t        j                  d       nt        |t              rt        di |}| t               }t        j                  d       nt        |t              rt        di |}|| _        || _	        || _
        || _        || _        | j                  j                  | j                  _        | j                  j                  t         v | _        d| _        d| _        t)        | T  di | y )	NoptzTtext_config is None. Initializing the text config with default values (`OPTConfig`).
model_typez\qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.z``vision_config` is `None`. initializing the `InstructBlipVideoVisionConfig` with default values.g      ?g{Gz?r   )r   loggerinfo
isinstancedictgetr$   r   r*   r,   r+   num_query_tokensr)   hidden_sizeencoder_hidden_sizer/   r   use_decoder_only_language_modelinitializer_factorinitializer_rangesuper__init__)	selfr,   r+   r*   r5   r)   kwargstext_model_type	__class__s	           r!   r<   z InstructBlipVideoConfig.__init__w   s5    (/1KKKnoT*)oolEBO(9HKHK!;=NKKvw-;MnMN 9;MKKr t,9JMJM&*, 0!2262D2D2P2P//3/?/?/J/JNo/o,"%!%"6"r    )NNN    N)r   r   r   __doc__r/   attribute_mapr   r$   r   sub_configsr<   __classcell__)r@   s   @r!   r&   r&   5   sJ    5n %J-M "86K (# (#r    r&   c                       e Zd ZdZy) InstructBlipVideoPreTrainedModel)videotextNr   r   r   input_modalitiesr   r    r!   rG   rG      s    (r    rG   c                       e Zd ZdZy)InstructBlipVideoVisionModelrH   NrJ   r   r    r!   rM   rM      s    r    rM   c                       e Zd Zy)InstructBlipVideoQFormerModelNr   r   r    r!   rO   rO      r"   r    rO   c                       e Zd Zy)4InstructBlipVideoForConditionalGenerationModelOutputNr   r   r    r!   rQ   rQ      r"   r    rQ   c                    P   e Zd Z	 	 	 	 	 	 	 	 	 	 	 ddej                  dej                  dej
                  dz  dej                  dz  dej
                  dz  dej
                  dz  dej
                  dz  d	ej                  dz  d
edz  dedz  dedz  dededz  dee	   de
ez  fdZy)InstructBlipVideoModelNpixel_valuesqformer_input_idsqformer_attention_mask	input_idsattention_maskdecoder_input_idsdecoder_attention_maskinputs_embedsoutput_attentionsoutput_hidden_statesreturn_dictinterpolate_pos_encoding	use_cacher>   returnc                    ||n| j                   j                  }|j                  \  }}}}}|j                  ||z  |||      }| j	                  ||	|
||      }|d   }t        j                  |j                         d d t
        j                  |j                        }| j                  j                  |j                  d   dd      }t        j                  |j                         d d t
        j                  |j                        }|t        j                  |      }|j                  |d      }|j                  |d      }t        j                  ||gd      }| j                  ||||||	|
|      }|d   d d d |j                  d      d d f   }| j!                  |      }|j                  || j                   j"                  |z  d      }|Q | j$                  j'                         |      }|| j                   j(                  k(  }|t        j                  |      }nl| | j'                         t        j*                  | j                   j(                  t
        j                  |j                              k(  }|j-                  d      }|j/                  d      j1                  |      j3                  |j                        }|j3                  |j                  |j4                        }|j7                  ||      }| j                   j8                  r | j$                  d|||	|
||d|}n | j$                  d|||||	|
||d	|}t;        |||
      S )N)rT   r\   r]   r^   r_   r   dtypedevicedim   )rW   rX   query_embedsencoder_hidden_statesencoder_attention_maskr\   r]   r^   r[   rX   r\   r]   r^   r`   )r[   rX   rY   rZ   r\   r]   r^   r`   )vision_outputsqformer_outputslanguage_model_outputsr   )configuse_return_dictshapereshapevision_modeltorchonessizelongrf   query_tokensexpand	ones_likerepeat_interleavecatqformerlanguage_projectionr5   language_modelget_input_embeddingsr(   tensorall	unsqueeze	expand_astore   masked_scatterr8   rQ   )r=   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   r>   
batch_sizeframeschannelheightwidthrn   image_embedsimage_attention_maskrz   query_attention_maskquery_outputsquery_outputlanguage_model_inputsspecial_image_maskoutputss                                 r!   forwardzInstructBlipVideoModel.forward   s   " &1%<k$++B]B] 6B5G5G2
FGVU#++J,?&RWX**%/!5#%= + 
 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!&,@BX+Y_`!a'1%".#7/!5# % 	
 %Q'+A\->->q-A+A1(DE !% 8 8 F !6 = =j$++JfJfioJoqs t FD//DDFyQM!*dkk.H.H!H%!&!;!.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;/99"=GGVYYZgZnZno 5 8 89M9M}ObOb c%445GI^_;;66)d)) +-"3%9'# G *d)) 
+-"3'="3%9'#
 
G D))#*
 	
r    )NNNNNNNNNFN)r   r   r   rv   FloatTensor
LongTensorTensorboolr   r   tuplerQ   r   r   r    r!   rS   rS      s6   
 ;?.22659:>-1)-,0#').!%i
''i
 !,,i
 !& 0 04 7	i

 $$t+i
 ((4/i
 !++d2i
 !& 0 04 7i
 ||d*i
  $;i
 #Tki
 D[i
 #'i
 $;i
 -.i
  
E	E!i
r    rS   c            "       D   e Zd Zee	 	 ddej                  dej                  dej                  dz  dedz  de	e
   deez  fd              Zd	 Zd
ej                  dej                  fdZ	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dej                  dej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dej                  dz  dedz  dededz  de	e
   deez  f dZ ej$                         	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dedej                  fd       Zy))InstructBlipVideoForConditionalGenerationNrT   rU   rV   r_   r>   ra   c           
         |j                   \  }}}}	}
|j                  ||z  ||	|
      } | j                  d||dd|}t        |j                  |j
                  |j                  |j                  |d      }|d   }t        j                  |j                         dd t        j                  |j                        }| j                  j                  |j                   d   dd      }t        j                  |j                         dd t        j                  |j                        }|t        j                  |      }|j!                  |d      }|j!                  |d      }t        j"                  ||gd	      } | j$                  d|||||dd
|}||_        |d   ddd|j                  d	      ddf   }| j)                  |      }|j                  || j*                  j,                  |z  d      }||_        |S )a  
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.
        T)rT   r_   r^   N)last_hidden_statepooler_outputhidden_states
attentionsrn   ro   r   rc   rd   rg   ri   )rW   rX   rj   rk   rl   r^   r   )rs   rt   ru   r   r   r   r   r   rv   rw   rx   ry   rf   rz   r{   r|   r}   r~   r   ro   r   rq   r5   )r=   rT   rU   rV   r_   r>   r   r   r   r   r   rn   r   r   rz   r   ro   r   video_featuress                      r!   get_video_featuresz<InstructBlipVideoForConditionalGeneration.get_video_features   s.   ( 6B5G5G2
FGVU#++J,?&RWX5FT5F5F 6
%%=6
 	6
 A,>>(66(66%00) 
 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!&,@BX+Y_`!a&$,, 
'1%".#7
 
 *9&&q)!-C|/@/@/C-CQ*FG 11,? (//
DKK<X<X[a<acef'5$r    c                      t        d      )Nz=No need to inherit as this architecture only supports videos.)AttributeError)super_kwargss    r!   get_image_featuresz<InstructBlipVideoForConditionalGeneration.get_image_featuresi  s    \]]r    rW   r[   c                    |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                  d      j                  |      j                  |j                        }|S )zZ
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        rd   rc   )r   rv   r   rq   r(   ry   rf   r   r   r   r   )r=   rW   r[   r   s       r!   get_placeholder_maskz>InstructBlipVideoForConditionalGeneration.get_placeholder_maskl  s     !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H/99"=GGVYYZgZnZno!!r    rX   rY   rZ   r\   r]   labelsr^   r`   c                 <   ||n| j                   j                  } | j                  |f|||dd|}|j                  }|j                  }|j
                  }| | j                         |      }|t        j                  |      }|j                  |j                  |j                        }| j                  ||      }|j                  ||      }| j                   j                  re | j                  d|||	|
||d|}|r|j                   n|d   }d}|w | j"                  d||| j                   j$                  j&                  d|}nB | j                  d|||||	|
|||d	|}|r|j(                  n|d   }|r|j                   n|d	   }t+        |||||
      S )a  
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.

        Examples:

        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the videWhy is this video funny?o
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```NTrU   rV   r_   r^   r[   rm   r   )logitsr   
vocab_size)	r[   rX   rY   rZ   r\   r]   r^   r   r`   ri   )lossr   rn   ro   rp   r   )rq   rr   r   r   ro   rn   r   rv   r|   r   rf   re   r   r   r8   r   r   loss_functionr*   r   r   rQ   )r=   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r   r^   r_   r`   r>   r   r   ro   rn   r   r   r   r   s                           r!   r   z1InstructBlipVideoForConditionalGeneration.forward{  s   b &1%<k$++B]B]BY$BYBYC
/#9%=C
 C
 !/ < <(88'66 7D557	BM!"__Y7N 5 8 89M9M}ObOb c!66yP]6^%445GI^_;;66)d)) +-"3%9'# G (3W^^
FD!)t)) !&T[[=T=T=_=_ci
 *d)) +-"3'="3%9'# G $/7<<GAJD'2W^^
FC)+#*
 	
r    c                 X   t        | d      r| j                          |j                  d   }	| j                  ||||d      }
|
j                  }||| j
                  j                  g| j
                  j                  z  dz  }|| j
                  j                  j                  gz   }t        j                  |gt        j                  |j                        }|j                  |	d      } | j                         |      }|t        j                   |      }|j#                  |j                  |j$                        }| j'                  ||      }|j)                  ||      }||d	}| j*                  j
                  j,                  s||d
<    | j*                  j.                  di ||}|S )a  
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        hf_device_mapr   Tr      rd   ri   r   )r[   rX   rW   r   )hasattr_preprocess_acceleraters   r   r   rq   r)   r5   r*   bos_token_idrv   r   ry   rf   repeatr   r|   r   re   r   r   r   is_encoder_decodergenerate)r=   rT   rU   rV   rW   rX   r[   r_   generate_kwargsr   r   r   video_tokensstart_tokensr   inputsr   s                    r!   r   z2InstructBlipVideoForConditionalGeneration.generate  s   D 4)'')!''*
BFBYBY/#9%= CZ C
 !/ < <   $ = =>A]A]]`aa+t{{/F/F/S/S.TT!LL,uzzR^ReRef	%,,Z;	7D557	BM!"__Y7N 5 8 89M9M}ObOb c!66yP]6^%445GI^_#0NS""))<<"+F;.$%%..KK?Kr    )NF)NNNNNNNNNNFN)NNNNNF)r   r   r   r   r   rv   r   r   r   r   r   r   r   r   r   r   rQ   r   no_gradr   r   r    r!   r   r     s   
 ;?05E''E !++E !& 0 04 7	E
 #'+E +,E 
8	8E  EN^"e.>.> "uO`O` "& ;?.22659:>26)-,0*.#').!%P
''P
 !,,P
 !& 0 04 7	P

 $$t+P
 ((4/P
 !++d2P
 !& 0 04 7P
 ((4/P
  $;P
 #TkP
   4'P
 D[P
 #'P
 $;P
  +,!P
" 
E	E#P
d U]]_ 6::>-12626).D''D !++d2D !& 0 04 7	D
 ##d*D ((4/D ((4/D #'D 
		D Dr    r   )r&   r$   r   rM   rG   rO   rS   r   )+rv   ;transformers.models.instructblip.configuration_instructblipr   r   6transformers.models.instructblip.modeling_instructblipr   r   r   r   r	   r
   r   r   configuration_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   models.auto.modeling_autor   processing_utilsr   utilsr   r   r   autor   r   
get_loggerr   r0   r   r$   r&   rG   rM   rO   rQ   rS   r   __all__r   r    r!   <module>r      s     	 	 	 4 B : J & > > - 
		H	%	$< 		%> 	j#. j#Z)'B )#: 	$< 		;j 	j
. j
Zs0T sl		r    