
    qi=2                         d dl mZmZ d dlmZmZmZ ddlmZ ddl	m
Z
mZ ddlmZmZ ddlmZmZ ddlmZmZ  e       rd d	lZ ej,                  e      Z G d
 ded      Z G d de      ZdgZy	)    )OptionalUnion)IMAGE_TOKENPaliGemmaProcessorbuild_string_from_input   )BatchFeature)
ImageInputmake_flat_list_of_images)ProcessingKwargsUnpack)PreTokenizedInput	TextInput)is_torch_availableloggingNc                   &    e Zd ZddidddddidZy	)
ColPaliProcessorKwargspaddinglongestchannels_firstT)data_formatdo_convert_rgbreturn_tensorspt)text_kwargsimages_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults     ]/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/colpali/modular_colpali.pyr   r   !   s,     y
 ,"
 +D1	Ir#   r   F)totalc                   4    e Zd Z	 	 	 	 	 ddedef fdZedefd       Z	 	 ddedz  dee	z  e
e   z  e
e	   z  d	ee   defd
Z	 ddedz  d	ee   defdZdee
e   z  d	ee   defdZ	 	 	 ddede
d   f   dede
d   f   deded   dedef   ddfdZ xZS )ColPaliProcessorNvisual_prompt_prefixquery_prefixc                 F    || _         || _        t        |   |||       y)a!  
        visual_prompt_prefix (`str`, *optional*, defaults to `"Describe the image."`):
            A string that gets tokenized and prepended to the image tokens.
        query_prefix (`str`, *optional*, defaults to `"Question: "`):
            A prefix to be used for the query.
        )image_processor	tokenizerchat_templateN)r(   r)   super__init__)selfr+   r,   r-   r(   r)   	__class__s         r$   r/   zColPaliProcessor.__init__/   s*     %9!(I]jkr#   returnc                 .    | j                   j                  S )z
        Return the query augmentation token.

        Query augmentation buffers are used as reasoning buffers during inference.
        )r,   	pad_token)r0   s    r$   query_augmentation_tokenz)ColPaliProcessor.query_augmentation_tokenA   s     ~~'''r#   imagestextkwargsc                 X    | j                   t        fd| j                  j                  i|}|d   j	                  dd      }d}||t        d      ||t        d      |j| j                  j                  |      }t        |      }| j                  gt        |      z  }|D cg c]  }|j                  d       }}t        ||      D 	
cg c]R  \  }	}
t        |	| j                  j                  | j                  t         t#        |
t$              rt        |
      nd	
      T }}	}
 | j                  |fi |d   d   }|d   j'                  dd      |d   dxx   | j                  z  cc<    | j                  |fd|i|d   }i |d|i}|r.|d   j)                  |d   dk(  d      }|j+                  d|i       t-        |      S |t#        |t.              r|g}n.t#        |t$              rt#        |d   t.              st        d      || j0                  dz  }g }|D ]?  }| j                  j                  | j2                  z   |z   |z   dz   }|j5                  |       A |d   j'                  dd      |d   d<    | j                  |fd|i|d   }|S yc c}w c c}
}	w )a  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsr   suffixNTz&Either text or images must be providedz5Only one of text or images can be processed at a timeRGB   )prompt	bos_tokenimage_seq_lenimage_token
num_imagesr   pixel_values
max_lengthreturn_token_type_ids	input_idstoken_type_idsr   ilabels)dataz*Text must be a string or a list of strings
   
2   )_merge_kwargsr   r,   init_kwargspop
ValueErrorr+   fetch_imagesr   r(   lenconvertzipr   r?   image_seq_lengthr   
isinstancelistgetmasked_fillupdater	   strr5   r)   append)r0   r6   r7   r8   output_kwargsr;   rE   	texts_docimager>   
image_listinput_stringsrC   inputsreturn_datarH   texts_queryquerybatch_querys                      r$   __call__zColPaliProcessor.__call__J   s     +**"
"&.."<"<
 

 }-11(DA $<FNEFF 2TUU))66v>F-f5F223c&kAI8>?uemmE*?F? +.i*@	 'FJ (!"nn66"&"7"7 +2<Z2Ns:TU	M 	 04//Y-:XYZhiL ]+//dCOm,\:d>S>SS:#T^^&;  .F CVB^\BK$,88@P9QUV9VX\]""Hf#56[11$$v t,DGS1I !MNN~66;%'K *0043D3DDuLvUX\\""5)* :G}9U9Y9YZfhj9kM-(6($..&;  .K - A @	s   )J!AJ&c                 *     | j                   dd|i|S )a  
        Prepare for the model one or several image(s). This method is a wrapper around the `__call__` method of the ColPaliProcessor's
        [`ColPaliProcessor.__call__`].

        This method forwards the `images` and `kwargs` arguments to the image processor.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        r6   r"   rg   )r0   r6   r8   s      r$   process_imageszColPaliProcessor.process_images   s    > t}}5F5f55r#   c                 *     | j                   dd|i|S )ag  
        Prepare for the model one or several texts. This method is a wrapper around the `__call__` method of the ColPaliProcessor's
        [`ColPaliProcessor.__call__`].

        This method forwards the `text` and `kwargs` arguments to the tokenizer.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
        r7   r"   ri   )r0   r7   r8   s      r$   process_queriesz ColPaliProcessor.process_queries   s    < t}}1$1&11r#   query_embeddingsztorch.Tensorpassage_embeddings
batch_sizeoutput_dtypeztorch.dtypeoutput_deviceztorch.devicec           	         t        |      dk(  rt        d      t        |      dk(  rt        d      |d   j                  |d   j                  k7  rt        d      |d   j                  |d   j                  k7  rt        d      ||d   j                  }g }t	        dt        |      |      D ]%  }g }t
        j                  j                  j                  j                  ||||z    dd      }	t	        dt        |      |      D ]  }
t
        j                  j                  j                  j                  ||
|
|z    dd      }|j                  t        j                  d|	|      j                  d	
      d   j                  d
              |j                  t        j                  |d
      j                  |      j                  |             ( t        j                  |d
      S )aZ  
        Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
        query embeddings (`qs`) and passage embeddings (`ps`). For ColPali, a passage is the
        image of a document page.

        Because the embedding tensors are multi-vector and can thus have different shapes, they
        should be fed as:
        (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
        (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
            obtained by padding the list of tensors.

        Args:
            query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings.
            passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings.
            batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
            output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor.
                If `None`, the dtype of the input embeddings is used.
            output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor.

        Returns:
            `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
            tensor is saved on the "cpu" device.
        r   zNo queries providedzNo passages providedz/Queries and passages must be on the same devicez-Queries and passages must have the same dtypeT)batch_firstpadding_valuezbnd,csd->bcnsr   )dim   r=   )rR   rP   devicedtyperangetorchnnutilsrnnpad_sequencer\   einsummaxsumcatto)r0   rm   rn   ro   rp   rq   scoresibatch_scoresbatch_queriesjbatch_passagess               r$   score_retrievalz ColPaliProcessor.score_retrieval   s   @  A%233!"a'344A%%);A)>)E)EENOOA$$(:1(=(C(CCLMM+A.44L%'q#./< 	]A/1L!HHNN..;; Q^4$VW < M 1c"45zB !&!3!3!@!@&q1z>:\] "A " ##LL-PTTYZT[\]^bbghbi	 MM%))La8;;LILL][\	] yyQ''r#   )NNNzDescribe the image.z
Question: )NN)N)   Ncpu)r   r   r    r[   r/   propertyr5   r
   r   r   rW   r   r   r	   rg   rj   rl   r   intr   r   __classcell__)r1   s   @r$   r'   r'   .   s    $9(l
 "l l$ (# ( ( %)Z^XT!X ++d9o=EV@WWX /0	X
 
Xx %)6T!6 /06 
	6B2$y/)2 /02 
	2H 0449>(^0D DE>( ".$~2F"FG>( 	>(
 }->( ^S01>( 
>(r#   r'   )typingr   r   2transformers.models.paligemma.processing_paligemmar   r   r   feature_extraction_utilsr	   image_utilsr
   r   processing_utilsr   r   tokenization_utils_baser   r   r|   r   r   rz   
get_loggerr   loggerr   r'   __all__r"   r#   r$   <module>r      sg     # w w 4 ? 8 C 0 			H	%
-U 
u() u(r r#   