
    qiA                         d dl mZmZ ddlmZ ddlmZmZ ddlm	Z	m
Z
mZmZ ddlmZmZ ddlmZmZ  e       rd dlZ G d	 d
e
d      Ze G d de             ZdgZy)    )OptionalUnion   )BatchFeature)
ImageInputis_valid_image)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringis_torch_availableNc                   &    e Zd ZddidddddidZy	)
ColQwen2ProcessorKwargspaddinglongestchannels_firstT)data_formatdo_convert_rgbreturn_tensorspt)text_kwargsimages_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults     b/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/colqwen2/processing_colqwen2.pyr   r   "   s,     y
 ,"
 +D1	Ir"   r   F)totalc                   b    e Zd Z	 	 	 	 	 ddedz  dedz  f fdZe	 	 ddedz  deez  e	e   z  e	e   z  de
e   defd	       Zdd
Zed        Zedefd       Z	 ddedz  de
e   defdZdee	e   z  de
e   defdZ	 	 	 ddede	d   f   dede	d   f   deded   dedef   ddfdZ xZS )ColQwen2ProcessorNvisual_prompt_prefixquery_prefixc                     t         |   |||       t        |d      sdn|j                  | _        t        |d      sdn|j                  | _        |xs d| _        |xs d| _        y)	ar  
        visual_prompt_prefix (`str`, *optional*, defaults to `"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>"`):
            A string that gets tokenized and prepended to the image tokens.
        query_prefix (`str`, *optional*, defaults to `"Query: "`):
            A prefix to be used for the query.
        )chat_templateimage_tokenz<|image_pad|>video_tokenz<|video_pad|>zf<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>zQuery: N)super__init__hasattrr+   r,   r'   r(   )selfimage_processor	tokenizerr*   r'   r(   kwargs	__class__s          r#   r.   zColQwen2Processor.__init__1   so     	)=Q29)]2S?YbYnYn29)]2S?YbYnYn$8 %
u 	! )5Ir"   imagestextr3   returnc                 v    | j                   t        fd| j                  j                  i|}|d   j	                  dd      }|du}||t        d      ||t        d      |7t        |      r|g}n^t        |t              rt        |d         rn?t        |t              r$t        |d   t              rt        |d   d         st        d      | j                  gt        |      z  } | j                  dd	|i|d
   }|d   }	|	| j                  j                  dz  }
d}t        t        |            D ]  }| j                  ||   v rQ||   j                  | j                  d|	|   j!                         |
z  z  d      ||<   |dz  }| j                  ||   v rQ||   j                  d| j                        ||<     | j                  |fddi|d   }t#        i ||      }|d   dddf   |d   dddf   z  }t        t%        j&                  |d   |j)                                     }t$        j*                  j,                  j.                  j1                  |d      |d<   |r.|d   j3                  |d   dk(  d      }|j5                  d|i       |S |t        |t6              r|g}n.t        |t              rt        |d   t6              st        d      || j8                  dz  }g }|D ]%  }| j:                  |z   |z   }|j=                  |       '  | j                  |fddi|d   }|S y)a  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsr   suffixNz&Either text or images must be providedz5Only one of text or images can be processed at a timer   zAimages must be an image, list of images or list of list of imagesr5   r   image_grid_thw   z<|placeholder|>   return_token_type_idsF)datapixel_valuesT)batch_first	input_idstoken_type_idsilabelsz*Text must be a string or a list of strings
   r!   )_merge_kwargsr   r2   init_kwargspop
ValueErrorr   
isinstancelistr'   lenr1   
merge_sizeranger+   replaceprodr   torchsplittolistnnutilsrnnpad_sequencemasked_fillupdatestrquery_augmentation_tokenr(   append)r0   r5   r6   r3   output_kwargsr:   r>   	texts_docimage_inputsr;   merge_lengthindexitext_inputsreturn_dataoffsetsr@   rD   texts_queryqueryaugmented_querybatch_querys                         r#   __call__zColQwen2Processor.__call__I   s   " +**#
"&.."<"<
 

 }-11(DA &d 2<FNEFF 2TUUf% FD)nVAY.G .:fQi3NSabhijbklmbnSo !dee223c&kAI/4//`v`A_`L)*:;N)#33>>As9~. ]A**il:'0|';'; ,,.?>RWCXC]C]C_coCo.prs(	! 
	 **il:
 $-Q<#7#78I4K[K[#\IaL] )$..&+  .K ',K{,Kl,KLK ""23AqD9KHX<YZ[]^Z^<__G  K79IJL
 +0((..*<*<*I*I$ +J +K' %$[1==kJZ>[_`>`bfg""Hf#56$$v t,DGS1I !MNN~66;%'K 4"&"3"3e";f"D""?34 )$..&+  .K + r"   c                    i }|t         j                  j                  di       }|j                  |       |j                  dd      xs | j                  j
                  }|D cg c]   } | j                  j                  g || " }}|D cg c]
  }||dz  z   }	}|j                  |	|d       t        di |S c c}w c c}w )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        Nr   rM   r<   )num_image_tokensnum_image_patchesr!   )r   r    getrY   r1   rM   get_number_of_image_patchesr	   )
r0   image_sizesr3   vision_datar   rM   
image_sizerm   num_patchesrl   s
             r#   _get_num_multimodal_tokensz,ColQwen2Processor._get_num_multimodal_tokens   s     "3==AA/SUVM  (&**<>a$BVBVBaBaJ #.! A$$@@\*\m\! ! Sdd;
A!=dd4D[lmn,,,!  es   $%B?Cc                     | j                   j                  }| j                  j                  }|D cg c]	  }|dvs| }}||z   S c c}w )N)pixel_values_videosvideo_grid_thw)r2   model_input_namesr1   )r0   tokenizer_input_namesimage_processor_input_namesnames       r#   rx   z#ColQwen2Processor.model_input_names   s\     $ @ @&*&:&:&L&L#
 9'
DHq<qD'
# '
 %'BBB'
s
   	AAc                 .    | j                   j                  S )z
        Return the query augmentation token.

        Query augmentation buffers are used as reasoning buffers during inference.
        )r2   	pad_token)r0   s    r#   r[   z*ColQwen2Processor.query_augmentation_token   s     ~~'''r"   c                 *     | j                   dd|i|S )a  
        Prepare for the model one or several image(s). This method is a wrapper around the `__call__` method of the ColQwen2Processor's
        [`ColQwen2Processor.__call__`].

        This method forwards the `images` and `kwargs` arguments to the image processor.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        r5   r!   rj   )r0   r5   r3   s      r#   process_imagesz ColQwen2Processor.process_images   s    > t}}5F5f55r"   c                 *     | j                   dd|i|S )ai  
        Prepare for the model one or several texts. This method is a wrapper around the `__call__` method of the ColQwen2Processor's
        [`ColQwen2Processor.__call__`].

        This method forwards the `text` and `kwargs` arguments to the tokenizer.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
        r6   r!   r   )r0   r6   r3   s      r#   process_queriesz!ColQwen2Processor.process_queries  s    < t}}1$1&11r"   query_embeddingsztorch.Tensorpassage_embeddings
batch_sizeoutput_dtypeztorch.dtypeoutput_deviceztorch.devicec           	         t        |      dk(  rt        d      t        |      dk(  rt        d      |d   j                  |d   j                  k7  rt        d      |d   j                  |d   j                  k7  rt        d      ||d   j                  }g }t	        dt        |      |      D ]%  }g }t
        j                  j                  j                  j                  ||||z    dd      }	t	        dt        |      |      D ]  }
t
        j                  j                  j                  j                  ||
|
|z    dd      }|j                  t        j                  d|	|      j                  d	
      d   j                  d
              |j                  t        j                  |d
      j                  |      j                  |             ( t        j                  |d
      S )a[  
        Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
        query embeddings (`qs`) and passage embeddings (`ps`). For ColQwen2, a passage is the
        image of a document page.

        Because the embedding tensors are multi-vector and can thus have different shapes, they
        should be fed as:
        (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
        (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
            obtained by padding the list of tensors.

        Args:
            query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings.
            passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings.
            batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
            output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor.
                If `None`, the dtype of the input embeddings is used.
            output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor.

        Returns:
            `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
            tensor is saved on the "cpu" device.
        r   zNo queries providedzNo passages providedz/Queries and passages must be on the same devicez-Queries and passages must have the same dtypeT)rA   padding_valuezbnd,csd->bcnsr   )dimr<   r=   )rL   rI   devicedtyperN   rQ   rT   rU   rV   rW   r\   einsummaxsumcatto)r0   r   r   r   r   r   scoresrb   batch_scoresbatch_queriesjbatch_passagess               r#   score_retrievalz!ColQwen2Processor.score_retrieval"  s   @  A%233!"a'344A%%);A)>)E)EENOOA$$(:1(=(C(CCLMM+A.44L%'q#./< 	]A/1L!HHNN..;; Q^4$VW < M 1c"45zB !&!3!3!@!@&q1z>:\] "A " ##LL-PTTYZT[\]^bbghbi	 MM%))La8;;LILL][\	] yyQ''r"   )NNNNN)NN)N)   Ncpu)r   r   r   rZ   r.   r   r   r   r   rK   r   r   r   rj   rt   propertyrx   r[   r   r   r   intr   r   __classcell__)r4   s   @r#   r&   r&   /   s    +/#'6
 "Dj6 Dj60  %)Z^fT!f ++d9o=EV@WWf 01	f
 
f fP-4 	C 	C (# ( ( %)6T!6 016 
	6B2$y/)2 012 
	2H 0449>(^0D DE>( ".$~2F"FG>( 	>(
 }->( ^S01>( 
>(r"   r&   )typingr   r   feature_extraction_utilsr   image_utilsr   r   processing_utilsr	   r
   r   r   tokenization_utils_baser   r   rU   r   r   rQ   r   r&   __all__r!   r"   r#   <module>r      s_   * # 4 5 X X C 7 
.e 
 p( p( p(f	 
r"   