Ë
    «q±iïk  ã                   ó–  — d dl Z d dlmZmZ d dlmZmZmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ  ej<                  e«      Z  G d„ de«      Z! G d„ de«      Z" G d„ de«      Z# G d„ de	«      Z$ G d„ de«      Z% G d„ de
«      Z& G d„ de«      Z' G d„ de«      Z( G d„ de«      Z)g d¢Z*y) é    N)ÚInstructBlipQFormerConfigÚInstructBlipVisionConfig)Ú'BaseModelOutputWithVisionQformerOutputsÚ$InstructBlipForConditionalGenerationÚ/InstructBlipForConditionalGenerationModelOutputÚInstructBlipModelÚInstructBlipPreTrainedModelÚInstructBlipQFormerModelÚInstructBlipVisionModelÚTransformersKwargsé   )ÚPreTrainedConfig)ÚFlashAttentionKwargs)ÚBaseModelOutputWithPooling)Ú!MODEL_FOR_CAUSAL_LM_MAPPING_NAMES)ÚUnpack)Úauto_docstringÚcan_return_tupleÚloggingé   )ÚCONFIG_MAPPINGÚ
AutoConfigc                   ó   — e Zd Zy)ÚInstructBlipVideoVisionConfigN©Ú__name__Ú
__module__Ú__qualname__© ó    úq/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/modular_instructblipvideo.pyr   r   -   ó   „ Ør    r   c                   ó   — e Zd Zy)ÚInstructBlipVideoQFormerConfigNr   r   r    r!   r$   r$   1   r"   r    r$   c                   óF   ‡ — e Zd ZdZdZddiZeeedœZ		 	 	 	 	 dˆ fd„	Z
ˆ xZS )ÚInstructBlipVideoConfiga¾
  
    [`InstructBlipVideoConfig`] is the configuration class to store the configuration of a
    [`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate a Instructblipvideo model according to the specified
    arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
    the defaults will yield a similar configuration to that of the Instructblipvideo
    [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`].
        qformer_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize any [`PreTrainedConfig`].
        num_query_tokens (`int`, *optional*, defaults to 32):
            The number of query tokens passed through the Transformer.

        video_token_index (`int`, *optional*):
            Token index of special video token.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import (
    ...     InstructBlipVideoVisionConfig,
    ...     InstructBlipVideoQFormerConfig,
    ...     OPTConfig,
    ...     InstructBlipVideoConfig,
    ...     InstructBlipVideoForConditionalGeneration,
    ... )

    >>> # Initializing a InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
    >>> configuration = InstructBlipVideoConfig()

    >>> # Initializing a InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
    >>> model = InstructBlipVideoForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a InstructBlipVideoConfig from a InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PreTrainedConfig

    >>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
    >>> vision_config = InstructBlipVideoVisionConfig()
    >>> qformer_config = InstructBlipVideoQFormerConfig()
    >>> text_config = OPTConfig()

    >>> config = InstructBlipVideoConfig(vision_config=vision_config, qformer_config=qformer_config, text_config=text_config)
    ```ÚinstructblipvideoÚvideo_token_idÚvideo_token_index)Útext_configÚqformer_configÚvision_configc                 ó°  •— |€#t        d   «       }t        j                  d«       n0t        |t        «      r |j                  dd«      }t        |   di |¤Ž}|€ t        «       }t        j                  d«       nt        |t        «      rt        di |¤Ž}|€ t        «       }t        j                  d«       nt        |t        «      rt        di |¤Ž}|| _        || _	        || _
        || _        || _        | j                  j                  | j                  _        | j                  j                  t         v | _        d| _        d| _        t)        ‰| T  di |¤Ž y )	NÚoptzTtext_config is None. Initializing the text config with default values (`OPTConfig`).Ú
model_typez\qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.z``vision_config` is `None`. initializing the `InstructBlipVideoVisionConfig` with default values.g      ð?g{®Gáz”?r   )r   ÚloggerÚinfoÚ
isinstanceÚdictÚgetr$   r   r*   r,   r+   Únum_query_tokensr)   Úhidden_sizeÚencoder_hidden_sizer/   r   Úuse_decoder_only_language_modelÚinitializer_factorÚinitializer_rangeÚsuperÚ__init__)	Úselfr,   r+   r*   r5   r)   ÚkwargsÚtext_model_typeÚ	__class__s	           €r!   r<   z InstructBlipVideoConfig.__init__w   s5  ø€ ð ÐÜ(¨Ñ/Ó1ˆKÜK‰KÐnÕoÜ˜¤TÔ*Ø)Ÿo™o¨l¸EÓBˆOÜ(¨Ñ9ÑH¸KÑHˆKàÐ!Ü;Ó=ˆNÜK‰KÐvÕwÜ˜¬Ô-Ü;ÑM¸nÑMˆNàÐ Ü9Ó;ˆMÜK‰KØrõô ˜¤tÔ,Ü9ÑJ¸MÑJˆMà&ˆÔØ*ˆÔØ,ˆÔà 0ˆÔØ!2ˆÔØ26×2DÑ2D×2PÑ2Pˆ×ÑÔ/Ø/3×/?Ñ/?×/JÑ/JÔNoÐ/oˆÔ,Ø"%ˆÔØ!%ˆÔÜ‰ÑÑ"˜6Ó"r    )NNNé    N)r   r   r   Ú__doc__r/   Úattribute_mapr   r$   r   Úsub_configsr<   Ú__classcell__)r@   s   @r!   r&   r&   5   sJ   ø„ ñ5ðn %€JàÐ-ð€Mð "Ø8Ø6ñ€Kð ØØØØ÷(#ñ (#r    r&   c                   ó   — e Zd ZdZy)Ú InstructBlipVideoPreTrainedModel)ÚvideoÚtextN©r   r   r   Úinput_modalitiesr   r    r!   rG   rG   ¢   s   „ Ø(Ñr    rG   c                   ó   — e Zd ZdZy)ÚInstructBlipVideoVisionModelrH   NrJ   r   r    r!   rM   rM   ¦   s   „ ØÑr    rM   c                   ó   — e Zd Zy)ÚInstructBlipVideoQFormerModelNr   r   r    r!   rO   rO   ª   r"   r    rO   c                   ó   — e Zd Zy)Ú4InstructBlipVideoForConditionalGenerationModelOutputNr   r   r    r!   rQ   rQ   ®   r"   r    rQ   c                    óP  — e Zd Z	 	 	 	 	 	 	 	 	 	 	 ddej                  dej                  dej
                  dz  dej                  dz  dej
                  dz  dej
                  dz  dej
                  dz  d	ej                  dz  d
edz  dedz  dedz  dededz  dee	   de
ez  fd„Zy)ÚInstructBlipVideoModelNÚpixel_valuesÚqformer_input_idsÚqformer_attention_maskÚ	input_idsÚattention_maskÚdecoder_input_idsÚdecoder_attention_maskÚinputs_embedsÚoutput_attentionsÚoutput_hidden_statesÚreturn_dictÚinterpolate_pos_encodingÚ	use_cacher>   Úreturnc                 ó¼  — ||n| j                   j                  }|j                  \  }}}}}|j                  ||z  |||«      }| j	                  ||	|
||¬«      }|d   }t        j                  |j                  «       d d t
        j                  |j                  ¬«      }| j                  j                  |j                  d   dd«      }t        j                  |j                  «       d d t
        j                  |j                  ¬«      }|€t        j                  |«      }|j                  |d¬«      }|j                  |d¬«      }t        j                  ||gd¬«      }| j                  ||||||	|
|¬«      }|d   d d …d |j                  d«      …d d …f   }| j!                  |«      }|j                  || j                   j"                  |z  d«      }|€Q | j$                  j'                  «       |«      }|| j                   j(                  k(  }|€‚t        j                  |«      }nl| | j'                  «       t        j*                  | j                   j(                  t
        j                  |j                  ¬«      «      k(  }|j-                  d«      }|j/                  d«      j1                  |«      j3                  |j                  «      }|j3                  |j                  |j4                  «      }|j7                  ||«      }| j                   j8                  r | j$                  d|||	|
||dœ|¤Ž}n | j$                  d|||||	|
||d	œ|¤Ž}t;        |||¬
«      S )N)rT   r\   r]   r^   r_   r   éÿÿÿÿ©ÚdtypeÚdevice©Údimé   )rW   rX   Úquery_embedsÚencoder_hidden_statesÚencoder_attention_maskr\   r]   r^   ©r[   rX   r\   r]   r^   r`   )r[   rX   rY   rZ   r\   r]   r^   r`   )Úvision_outputsÚqformer_outputsÚlanguage_model_outputsr   )ÚconfigÚuse_return_dictÚshapeÚreshapeÚvision_modelÚtorchÚonesÚsizeÚlongrf   Úquery_tokensÚexpandÚ	ones_likeÚrepeat_interleaveÚcatÚqformerÚlanguage_projectionr5   Úlanguage_modelÚget_input_embeddingsr(   ÚtensorÚallÚ	unsqueezeÚ	expand_asÚtore   Úmasked_scatterr8   rQ   )r=   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r^   r_   r`   r>   Ú
batch_sizeÚframesÚchannelÚheightÚwidthrn   Úimage_embedsÚimage_attention_maskrz   Úquery_attention_maskÚquery_outputsÚquery_outputÚlanguage_model_inputsÚspecial_image_maskÚoutputss                                 r!   ÚforwardzInstructBlipVideoModel.forward³   s„  € ð" &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆð 6B×5GÑ5GÑ2ˆ
F˜G V¨UØ#×+Ñ+¨J¸Ñ,?ÀÈ&ÐRWÓXˆà×*Ñ*Ø%Ø/Ø!5Ø#Ø%=ð +ó 
ˆð & aÑ(ˆô  %Ÿz™z¨,×*;Ñ*;Ó*=¸c¸rÐ*BÌ%Ï*É*Ð]i×]pÑ]pÔqÐð ×(Ñ(×/Ñ/°×0BÑ0BÀ1Ñ0EÀrÈ2ÓNˆÜ$Ÿz™z¨,×*;Ñ*;Ó*=¸c¸rÐ*BÌ%Ï*É*Ð]i×]pÑ]pÔqÐà!Ð)Ü%*§_¡_Ð5FÓ%GÐ"à-×?Ñ?ÀÈAÐ?ÓNÐØ!7×!IÑ!IÈ&ÐVWÐ!IÓ!XÐÜ!&§¡Ð,@ÐBXÐ+YÐ_`Ô!aÐØŸ™Ø'Ø1Ø%Ø".Ø#7Ø/Ø!5Ø#ð %ó 	
ˆð % QÑ'ªÐ+A¨\×->Ñ->¸qÓ-AÐ+AÂ1Ð(DÑEˆð !%× 8Ñ 8¸Ó FÐð !6× =Ñ =¸jÈ$Ï+É+×JfÑJfÐioÑJoÐqsÓ tÐØÐ ØF˜D×/Ñ/×DÑDÓFÀyÓQˆMØ!*¨d¯k©k×.HÑ.HÑ!HÐØÐ%Ü!&§¡°Ó!;‘à!.Ð2M°$×2KÑ2KÓ2MÜ—‘˜TŸ[™[×7Ñ7¼u¿z¹zÐR_×RfÑRfÔgó3ñ "Ðð "4×!7Ñ!7¸Ó!;Ðà/×9Ñ9¸"Ó=×GÑGÈÓV×YÑYÐZg×ZnÑZnÓoÐØ 5× 8Ñ 8¸×9MÑ9MÈ}×ObÑObÓ cÐØ%×4Ñ4Ð5GÐI^Ó_ˆà;‰;×6Ò6Ø)d×)Ñ)ð Ø+Ø-Ø"3Ø%9Ø'Ø#ñð ñ‰Gð *d×)Ñ)ð 
Ø+Ø-Ø"3Ø'=Ø"3Ø%9Ø'Ø#ñ
ð ñ
ˆGô DØ)Ø)Ø#*ô
ð 	
r    )NNNNNNNNNFN)r   r   r   rv   ÚFloatTensorÚ
LongTensorÚTensorÚboolr   r   ÚtuplerQ   r–   r   r    r!   rS   rS   ²   s6  „ ð
 ;?Ø.2Ø26Ø59Ø:>Ø-1Ø)-Ø,0Ø#'Ø).Ø!%ñi
à×'Ñ'ði
ð !×,Ñ,ði
ð !&× 0Ñ 0°4Ñ 7ð	i
ð
 ×$Ñ$ tÑ+ði
ð ×(Ñ(¨4Ñ/ði
ð !×+Ñ+¨dÑ2ði
ð !&× 0Ñ 0°4Ñ 7ði
ð —|‘| dÑ*ði
ð   $™;ði
ð # T™kði
ð ˜D‘[ði
ð #'ði
ð ˜$‘;ði
ð Ð-Ñ.ði
ð  
ÐEÑ	Eô!i
r    rS   c            "       óD  — e Zd Zee	 	 ddej                  dej                  dej                  dz  dedz  de	e
   deez  fd„«       «       Zd	„ Zd
ej                  dej                  fd„Z	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dej                  dej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  dej                  dz  dedz  dededz  de	e
   deez  f d„Z ej$                  «       	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dedej                  fd„«       Zy)Ú)InstructBlipVideoForConditionalGenerationNrT   rU   rV   r_   r>   ra   c           
      ó  — |j                   \  }}}}	}
|j                  ||z  ||	|
«      } | j                  d||ddœ|¤Ž}t        |j                  |j
                  |j                  |j                  |d¬«      }|d   }t        j                  |j                  «       dd t        j                  |j                  ¬«      }| j                  j                  |j                   d   dd«      }t        j                  |j                  «       dd t        j                  |j                  ¬«      }|€t        j                  |«      }|j!                  |d¬«      }|j!                  |d¬«      }t        j"                  ||gd	¬«      } | j$                  d|||||dd
œ|¤Ž}||_        |d   dd…d|j                  d	«      …dd…f   }| j)                  |«      }|j                  || j*                  j,                  |z  d«      }||_        |S )a  
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.
        T)rT   r_   r^   N)Úlast_hidden_stateÚpooler_outputÚhidden_statesÚ
attentionsrn   ro   r   rc   rd   rg   ri   )rW   rX   rj   rk   rl   r^   r   )rs   rt   ru   r   rŸ   r    r¡   r¢   rv   rw   rx   ry   rf   rz   r{   r|   r}   r~   r   ro   r€   rq   r5   )r=   rT   rU   rV   r_   r>   r‰   rŠ   r‹   rŒ   r   rn   rŽ   r   rz   r   ro   r’   Úvideo_featuress                      r!   Úget_video_featuresz<InstructBlipVideoForConditionalGeneration.get_video_features   s.  € ð( 6B×5GÑ5GÑ2ˆ
F˜G V¨UØ#×+Ñ+¨J¸Ñ,?ÀÈ&ÐRWÓXˆà5F°T×5FÑ5Fð 6
Ø%Ø%=Øñ6
ð ñ	6
ˆô AØ,×>Ñ>Ø(×6Ñ6Ø(×6Ñ6Ø%×0Ñ0Ø)Ø ô
ˆð & aÑ(ˆô  %Ÿz™z¨,×*;Ñ*;Ó*=¸c¸rÐ*BÌ%Ï*É*Ð]i×]pÑ]pÔqÐð ×(Ñ(×/Ñ/°×0BÑ0BÀ1Ñ0EÀrÈ2ÓNˆÜ$Ÿz™z¨,×*;Ñ*;Ó*=¸c¸rÐ*BÌ%Ï*É*Ð]i×]pÑ]pÔqÐà!Ð)Ü%*§_¡_Ð5FÓ%GÐ"à-×?Ñ?ÀÈAÐ?ÓNÐØ!7×!IÑ!IÈ&ÐVWÐ!IÓ!XÐÜ!&§¡Ð,@ÐBXÐ+YÐ_`Ô!aÐØ&˜$Ÿ,™,ð 
Ø'Ø1Ø%Ø".Ø#7Øñ
ð ñ
ˆð *9ˆÔ&Ø& qÑ)ª!Ð-C¨|×/@Ñ/@ÀÓ/CÐ-CÂQÐ*FÑGˆð ×1Ñ1°,Ó?ˆð (×/Ñ/°
¸D¿K¹K×<XÑ<XÐ[aÑ<aÐceÓfˆØ'5ˆÔ$àÐr    c                  ó   — t        d«      ‚)Nz=No need to inherit as this architecture only supports videos.)ÚAttributeError)Úsuper_kwargss    r!   Úget_image_featuresz<InstructBlipVideoForConditionalGeneration.get_image_featuresi  s   € ÜÐ\Ó]Ð]r    rW   r[   c                 óˆ  — |€m| | j                  «       t        j                  | j                  j                  t        j
                  |j                  ¬«      «      k(  }|j                  d«      }n|| j                  j                  k(  }|j                  d«      j                  |«      j                  |j                  «      }|S )zZ
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        rd   rc   )r‚   rv   rƒ   rq   r(   ry   rf   r„   r…   r†   r‡   )r=   rW   r[   r”   s       r!   Úget_placeholder_maskz>InstructBlipVideoForConditionalGeneration.get_placeholder_maskl  s©   € ð ÐØ!.Ð2M°$×2KÑ2KÓ2MÜ—‘˜TŸ[™[×7Ñ7¼u¿z¹zÐR_×RfÑRfÔgó3ñ "Ðð "4×!7Ñ!7¸Ó!;Ñà!*¨d¯k©k×.HÑ.HÑ!HÐà/×9Ñ9¸"Ó=×GÑGÈÓV×YÑYÐZg×ZnÑZnÓoÐØ!Ð!r    rX   rY   rZ   r\   r]   Úlabelsr^   r`   c                 ó<  — ||n| j                   j                  } | j                  |f|||ddœ|¤Ž}|j                  }|j                  }|j
                  }|€ | j                  «       |«      }|€t        j                  |«      }|j                  |j                  |j                  «      }| j                  ||¬«      }|j                  ||«      }| j                   j                  re | j                  d|||	|
||dœ|¤Ž}|r|j                   n|d   }d}|w | j"                  d||| j                   j$                  j&                  dœ|¤Ž}nB | j                  d|||||	|
|||dœ	|¤Ž}|r|j(                  n|d   }|r|j                   n|d	   }t+        |||||¬
«      S )a˜  
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.

        Examples:

        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the videWhy is this video funny?o
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```NT©rU   rV   r_   r^   ©r[   rm   r   )Úlogitsr«   Ú
vocab_size)	r[   rX   rY   rZ   r\   r]   r^   r«   r`   ri   )Úlossr¯   rn   ro   rp   r   )rq   rr   r¤   r    ro   rn   r‚   rv   r|   r‡   rf   re   rª   rˆ   r8   r   r¯   Úloss_functionr*   r°   r±   rQ   )r=   rT   rU   rV   rW   rX   rY   rZ   r[   r\   r]   r«   r^   r_   r`   r>   r£   r“   ro   rn   r”   r•   r¯   r±   s                           r!   r–   z1InstructBlipVideoForConditionalGeneration.forward{  sý  € ðb &1Ð%<‘kÀ$Ç+Á+×B]ÑB]ˆàBYÀ$×BYÑBYØðC
à/Ø#9Ø%=ØñC
ð ñC
ˆð !/× <Ñ <ÐØ(×8Ñ8ˆØ'×6Ñ6ˆàÐ Ø7˜D×5Ñ5Ó7¸	ÓBˆMàÐ!Ü"Ÿ_™_¨YÓ7ˆNà 5× 8Ñ 8¸×9MÑ9MÈ}×ObÑObÓ cÐØ!×6Ñ6°yÐP]Ð6Ó^ÐØ%×4Ñ4Ð5GÐI^Ó_ˆà;‰;×6Ò6Ø)d×)Ñ)ð Ø+Ø-Ø"3Ø%9Ø'Ø#ñð ñˆGñ (3W—^’^¸À¹
ˆFØˆDØÐ!Ø)t×)Ñ)ð Ø!¨&¸T¿[¹[×=TÑ=T×=_Ñ=_ñØciñ‘ð
 *d×)Ñ)ð Ø+Ø-Ø"3Ø'=Ø"3Ø%9Ø'ØØ#ñð ñˆGñ $/7—<’<°G¸A±JˆDÙ'2W—^’^¸À¹
ˆFäCØØØ)Ø+Ø#*ô
ð 	
r    c                 óX  — t        | d«      r| j                  «        |j                  d   }	| j                  ||||d¬«      }
|
j                  }|€°|€˜| j
                  j                  g| j
                  j                  z  dz  }|| j
                  j                  j                  gz   }t        j                  |gt        j                  |j                  ¬«      }|j                  |	d«      } | j                  «       |«      }|€t        j                   |«      }|j#                  |j                  |j$                  «      }| j'                  ||¬«      }|j)                  ||«      }||d	œ}| j*                  j
                  j,                  s||d
<    | j*                  j.                  di |¤|¤Ž}|S )aÙ  
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        Úhf_device_mapr   Tr­   é   rd   ri   r®   )r[   rX   rW   r   )ÚhasattrÚ_preprocess_acceleraters   r¤   r    rq   r)   r5   r*   Úbos_token_idrv   rƒ   ry   rf   Úrepeatr‚   r|   r‡   re   rª   rˆ   r   Úis_encoder_decoderÚgenerate)r=   rT   rU   rV   rW   rX   r[   r_   Úgenerate_kwargsr‰   r£   r“   Úvideo_tokensÚstart_tokensr”   Úinputsr•   s                    r!   r»   z2InstructBlipVideoForConditionalGeneration.generate  s¢  € ôD 4˜Ô)à×'Ñ'Ô)à!×'Ñ'¨Ñ*ˆ
ØBF×BYÑBYØØ/Ø#9Ø%=Øð CZó C
ˆð !/× <Ñ <ÐàÐ ØÐ Ø $§¡× =Ñ =Ð>ÀÇÁ×A]ÑA]Ñ]Ð`aÑaØ+¨t¯{©{×/FÑ/F×/SÑ/SÐ.TÑTÜ!ŸL™L¨,¨¼u¿z¹zÐR^×ReÑReÔf	Ø%×,Ñ,¨Z¸Ó;	Ø7˜D×5Ñ5Ó7¸	ÓBˆMàÐ!Ü"Ÿ_™_¨YÓ7ˆNà 5× 8Ñ 8¸×9MÑ9MÈ}×ObÑObÓ cÐØ!×6Ñ6°yÐP]Ð6Ó^ÐØ%×4Ñ4Ð5GÐI^Ó_ˆà#0ÀNÑSˆØ×"Ñ"×)Ñ)×<Ò<Ø"+ˆF;Ñà.$×%Ñ%×.Ñ.ÑK°ÐK¸?ÑKˆàˆr    )NF)NNNNNNNNNNFN)NNNNNF)r   r   r   r   r   rv   r—   r˜   rš   r   r   r›   r   r¤   r¨   rª   rQ   r–   Úno_gradr»   r   r    r!   r   r     sÎ  „ ØØð
 ;?Ø05ñEà×'Ñ'ðEð !×+Ñ+ðEð !&× 0Ñ 0°4Ñ 7ð	Eð
 #'¨¡+ðEð Ð+Ñ,ðEð 
Ð8Ñ	8òEó ó ðEòN^ð"¨e×.>Ñ.>ð "Èu×O`ÑO`ó "ð& ;?Ø.2Ø26Ø59Ø:>Ø26Ø)-Ø,0Ø*.Ø#'Ø).Ø!%ñP
à×'Ñ'ðP
ð !×,Ñ,ðP
ð !&× 0Ñ 0°4Ñ 7ð	P
ð
 ×$Ñ$ tÑ+ðP
ð ×(Ñ(¨4Ñ/ðP
ð !×+Ñ+¨dÑ2ðP
ð !&× 0Ñ 0°4Ñ 7ðP
ð ×(Ñ(¨4Ñ/ðP
ð   $™;ðP
ð # T™kðP
ð × Ñ  4Ñ'ðP
ð ˜D‘[ðP
ð #'ðP
ð ˜$‘;ðP
ð  Ð+Ñ,ð!P
ð" 
ÐEÑ	Eó#P
ðd €U‡]]ƒ_ð 6:Ø:>Ø-1Ø26Ø26Ø).ñDà×'Ñ'ðDð !×+Ñ+¨dÑ2ðDð !&× 0Ñ 0°4Ñ 7ð	Dð
 ×#Ñ# dÑ*ðDð ×(Ñ(¨4Ñ/ðDð ×(Ñ(¨4Ñ/ðDð #'ðDð 
×	Ñ	òDó ñDr    r   )r&   r$   r   rM   rG   rO   rS   r   )+rv   Ú;transformers.models.instructblip.configuration_instructblipr   r   Ú6transformers.models.instructblip.modeling_instructblipr   r   r   r   r	   r
   r   r   Úconfiguration_utilsr   Úmodeling_flash_attention_utilsr   Úmodeling_outputsr   Úmodels.auto.modeling_autor   Úprocessing_utilsr   Úutilsr   r   r   Úautor   r   Ú
get_loggerr   r0   r   r$   r&   rG   rM   rO   rQ   rS   r   Ú__all__r   r    r!   ú<module>rÌ      sÑ   ðó  ÷÷	÷ 	ó 	õ 4Ý BÝ :Ý JÝ &ß >Ñ >ß -ð 
ˆ×	Ñ	˜HÓ	%€ô	Ð$<ô 	ô	Ð%>ô 	ôj#Ð.ô j#ôZ)Ð'Bô )ôÐ#:ô ô	Ð$<ô 	ô	Ð;jô 	ôj
Ð.ô j
ôZsÐ0Tô sòl		r    