
    qitO                        d dl mZ d dlmZ d dlmZmZmZ d dlZddl	m
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ ddlmZ ddlmZ ddlm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(  ejR                  e*      Z+ G d de%      Z, G d de(d      Z- ed      e G d de'                    Z.e G d de#             Z/e ed !       G d" d#e                    Z0 ed$!       G d% d&e"             Z1g d'Z2y)(    )deepcopy)	dataclass)AnyOptionalUnionN   )PreTrainedConfig)BatchFeature)
ImageInputis_valid_image)Unpack)	TextInput)ModelOutputTransformersKwargsauto_docstringlogging)can_return_tuple)requires   )CONFIG_MAPPING)	AutoModel)ColPaliForRetrievalColPaliPreTrainedModel)ColQwen2Config)Idefics3ProcessorIdefics3ProcessorKwargsc                   J    e Zd ZU dZdZdeiZeee	f   e
d<   	 	 	 d	dedefdZy)
ColModernVBertConfiga  
    Configuration class to store the configuration of a [`ColModernVBertForRetrieval`]. It is used to instantiate an instance
    of `ColModernVBertForRetrieval` according to the specified arguments, defining the model architecture following the methodology
    from the "ColPali: Efficient Document Retrieval with Vision Language Models" paper.

    Instantiating a configuration with the defaults will yield a similar configuration to the vision encoder used by the pre-trained
    ColModernVBert model, e.g. [ModernVBERT/colmodernvbert-merged](https://huggingface.co/ModernVBERT/colmodernvbert-merged).

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vlm_config (`PreTrainedConfig`, *optional*):
            Configuration of the VLM backbone model.
        embedding_dim (`int`, *optional*, defaults to 128):
            Dimension of the multi-vector embeddings produced by the model.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    Example:

    ```python
    from transformers import ColModernVBertConfig, ColModernVBertForRetrieval

    config = ColModernVBertConfig()
    model = ColModernVBertForRetrieval(config)
    ```
    colmodernvbert
vlm_configsub_configsNembedding_diminitializer_rangec                    |#t        d          }t        j                  d       ndt        |t              r,t        |      }d|vrt        d      t        |d      di |}n(t        |t              st        dt        |       d      t        |d      s|j                         j                  |_        || _        || _        || _        t        j                   di | y )	Nmodernvbertzc`vlm_config` is `None`. Initializing `vlm_config` with the `ModernVBertConfig` with default values.
model_typez^The `model_type` key is missing in the `vlm_config` dictionary. Please provide the model type.zWInvalid type for `vlm_config`. Expected `PreTrainedConfig`, `dict`, or `None`, but got .
vocab_size )r   loggerinfo
isinstancedictr   KeyErrorr	   	TypeErrortypehasattrget_text_configr(   r    r"   r#   __init__)selfr    r"   r#   kwargss        k/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/colmodernvbert/modular_colmodernvbert.pyr3   zColModernVBertConfig.__init__G   s     '68JKKu 
D)!*-J:-t  (
<(@AOJOJJ(89ijnoyjzi{{|}  z<0$.$>$>$@$K$KJ!$*!2!!+F+    )N   g{Gz?)__name__
__module____qualname____doc__r&   r	   r!   r-   strr   __annotations__intfloatr3   r)   r7   r6   r   r   '   sJ    8 "J#/1A"BKc3hB  #'	, , !	,r7   r   c                   (    e Zd ZddiddddddidZy	)
ColModernVBertProcessorKwargspaddinglongestTchannels_first)return_row_col_infodata_formatdo_convert_rgbreturn_tensorspt)text_kwargsimages_kwargscommon_kwargsN)r9   r:   r;   	_defaultsr)   r7   r6   rB   rB   h   s/     y
 $(+"

 +D1
Ir7   rB   F)total)torch)backendsc                        e Zd ZdZ	 	 	 	 	 ddededz  dedz  f fdZ	 ddedz  dee	   d	e
fd
Zdeee   z  dee	   d	e
fdZ	 	 	 ddeded   f   deded   f   deded   dedef   d	dfdZ xZS )ColModernVBertProcessora!  
    Constructs a ColModernVBert processor which wraps a ModernVBertProcessor and special methods to process images and queries, as
    well as to compute the late-interaction retrieval score.

    [`ColModernVBertProcessor`] offers all the functionalities of [`ModernVBertProcessor`]. See the [`~ModernVBertProcessor.__call__`]
    for more information.

    Args:
            image_processor ([`Idefics3ImageProcessor`]): An instance of [`Idefics3ImageProcessor`]. The image processor is a required input.
            tokenizer (`PreTrainedTokenizerFast`, *optional*): An instance of [`PreTrainedTokenizerFast`]. This should correspond with the model's text model. The tokenizer is a required input.
            image_seq_len (`int`, *optional*, defaults to 64): The length of the image sequence i.e. the number of <image> tokens per image in the input.
            visual_prompt_prefix (`Optional`, *optional*): A prefix to be prepended to visual prompts.
            query_prefix (`Optional`, *optional*): A prefix to be prepended to query prompts.
    Nimage_seq_lenvisual_prompt_prefixquery_prefixc                     d}t        |   ||f||d| |xs d| j                   d| _        |xs d| _        | j
                  | _        y)a  
        image_seq_len (`int`, *optional*, defaults to 64):
            The length of the image sequence i.e. the number of <image> tokens per image in the input.
        visual_prompt_prefix (`str`, *optional*):
            A string that gets tokenized and prepended to the image tokens.
        query_prefix (`str`, *optional*):
            A prefix to be used for the query.
        N)chat_templaterT   z<|begin_of_text|>User:z0Describe the image.<end_of_utterance>
Assistant: )superr3   image_tokenrU   rV   end_of_utterance_tokenquery_augmentation_token)	r4   image_processor	tokenizerrX   rT   rU   rV   r5   	__class__s	           r6   r3   z ColModernVBertProcessor.__init__   sw    $ 	
 ('		

 	
 %9 %
$T%5%5$66gh 	! ).B(,(C(C%r7   imagesr5   returnc                 v    | j                   t        fd| j                  j                  i|}|d   j	                  dd      }|du}t        |      r|g}n^t        |t              rt        |d         rn?t        |t              r$t        |d   t              rt        |d   d         st        d      |D cg c]  }|j                  d       }}| j                  | j                  gt        |      z  ||d   |d   	      }|r.|d
   j                  |d   dk(  d      }|j                  d|i       |S c c}w )a  
        Prepare for the model one or several image(s). Handles input validation, RGB conversion,
        and prepends the `visual_prompt_prefix` to each image. Optionally computes labels from
        `token_type_ids` when a `suffix` is provided in `text_kwargs`.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsrK   suffixNr   zAimages must be an image, list of images or list of list of imagesRGBrL   )textra   rL   rK   	input_idstoken_type_idsilabels)_merge_kwargsrB   r_   init_kwargspopr   r,   list
ValueErrorconvert__call__rU   lenmasked_fillupdate)	r4   ra   r5   output_kwargsre   return_token_type_idsimage	batch_docrj   s	            r6   process_imagesz&ColModernVBertProcessor.process_images   s[   < +**)
"&.."<"<
 
 }-11(DA &d 2 &!XF%.*CVT*z&)T/J~^def^ghi^jOk`aa 5;;5%--&;; MM++,s6{:'8%m4	 " 
	 !{+77	BR8SWX8XZ^_Fh/0 <s   8D6rg   c                     | j                   t        fd| j                  j                  i|}|d   j	                  dd      }t        |t              r|g}n.t        |t              rt        |d   t              st        d      || j                  dz  }|D cg c]  }| j                  |z   |z    }}| j                  |d|d   	      }|S c c}w )
ad  
        Prepare for the model one or several text queries. Handles input validation, prepends the
        `query_prefix`, and appends query augmentation tokens (used to pad query embeddings for
        better late-interaction retrieval performance).

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
        rd   rK   re   Nr   z*Text must be a string or a list of strings
   F)rg   rv   rK   )rk   rB   r_   rl   rm   r,   r=   rn   ro   r]   rV   rq   )r4   rg   r5   ru   re   querytexts_querybatch_querys           r6   process_queriesz'ColModernVBertProcessor.process_queries   s    : +**)
"&.."<"<
 
 }-11(DAdC 6DT4(ZQ-EIJJ >22R7F SW!W$"3"3e";f"D!W!Wmm"'%m4 $ 
  "Xs   Cquery_embeddingsztorch.Tensorpassage_embeddings
batch_sizeoutput_dtypeztorch.dtypeoutput_deviceztorch.devicec           	         t        |      dk(  rt        d      t        |      dk(  rt        d      |d   j                  |d   j                  k7  rt        d      |d   j                  |d   j                  k7  rt        d      ||d   j                  }g }t	        dt        |      |      D ]%  }g }t
        j                  j                  j                  j                  ||||z    dd      }	t	        dt        |      |      D ]  }
t
        j                  j                  j                  j                  ||
|
|z    dd      }|j                  t        j                  d|	|      j                  d	
      d   j                  d
              |j                  t        j                  |d
      j                  |      j                  |             ( t        j                  |d
      S )a[  
        Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
        query embeddings (`qs`) and passage embeddings (`ps`). For ColQwen2, a passage is the
        image of a document page.

        Because the embedding tensors are multi-vector and can thus have different shapes, they
        should be fed as:
        (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
        (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
            obtained by padding the list of tensors.

        Args:
            query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings.
            passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings.
            batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
            output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor.
                If `None`, the dtype of the input embeddings is used.
            output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor.

        Returns:
            `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
            tensor is saved on the "cpu" device.
        r   zNo queries providedzNo passages providedz/Queries and passages must be on the same devicez-Queries and passages must have the same dtypeT)batch_firstpadding_valuezbnd,csd->bcnsr   )dimr      )rr   ro   devicedtyperangerP   nnutilsrnnpad_sequenceappendeinsummaxsumcatto)r4   r   r   r   r   r   scoresibatch_scoresbatch_queriesjbatch_passagess               r6   score_retrievalz'ColModernVBertProcessor.score_retrieval%  s   @  A%233!"a'344A%%);A)>)E)EENOOA$$(:1(=(C(CCLMM+A.44L%'q#./< 	]A/1L!HHNN..;; Q^4$VW < M 1c"45zB !&!3!3!@!@&q1z>:\] "A " ##LL-PTTYZT[\]^bbghbi	 MM%))La8;;LILL][\	] yyQ''r7   )NN@   NNN)r8   Ncpu)r9   r:   r;   r<   r?   r=   r3   r   r   rB   r
   ry   r   rn   r   r   r   r   __classcell__r`   s   @r6   rS   rS   v   s'   $ +/#' D
  D "Dj D Dj DH %)@T!@ 67@ 
	@D7$y/)7 677 
	7z 0449>(^0D DE>( ".$~2F"FG>( 	>(
 }->( ^S01>( 
>(r7   rS   c                       e Zd ZU eed<   y)ColModernVBertPreTrainedModelconfigN)r9   r:   r;   r   r>   r)   r7   r6   r   r   f  s      r7   r   z:
    Base class for ColModernVBert embeddings output.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ
eej                     dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   y) ColModernVBertForRetrievalOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The embeddings of the model.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True` and `pixel_values` are provided):
        Tuple of `torch.FloatTensor` (one for the output of the image modality projection + one for the output of each layer) of shape
        `(batch_size, num_channels, image_size, image_size)`.
        Hidden-states of the image encoder at the output of each layer plus the initial modality projection outputs.
    Nloss
embeddingshidden_statesimage_hidden_states
attentions)r9   r:   r;   r<   r   rP   FloatTensorr>   r   Tensorr   tupler   r   r)   r7   r6   r   r   k  s    	 &*D%

d
")&*Jt#*59M5**+d29;?u001D8?26Je''(4/6r7   r   u  
    Following the ColPali approach, ColModernVBert leverages VLMs to construct efficient multi-vector embeddings directly
    from document images (“screenshots”) for document retrieval. The model is trained to maximize the similarity
    between these document embeddings and the corresponding query embeddings, using the late interaction method
    introduced in ColBERT.

    Using ColModernVBert removes the need for potentially complex and brittle layout recognition and OCR pipelines with
    a single model that can take into account both the textual and visual content (layout, charts, ...) of a document.

    ColModernVBert is trained on top of ModernVBert, and was introduced in the following paper:
    [*ModernVBERT: Towards Smaller Visual Document Retrievers*](https://arxiv.org/abs/2510.01149).

    ColModernVBert is part of the ColVision model family, which was introduced with ColPali in the following paper:
    [*ColPali: Efficient Document Retrieval with Vision Language Models*](https://huggingface.co/papers/2407.01449).
    c                        e Zd Zi Zdef fdZee	 	 	 d
dej                  dz  dej                  dz  dej                  dz  dee   def
d	              Z xZS )ColModernVBertForRetrievalr   c                     t         |   |       t        j                  |j                        | _        | j                          y r   )rZ   r3   r   from_configr    vlm	post_init)r4   r   r`   s     r6   r3   z#ColModernVBertForRetrieval.__init__  s2     (():):;r7   Nrh   pixel_valuesattention_maskr5   rb   c                     | j                   d|||d|}|d   }| j                  j                  j                  }| j                  |j	                  |            }||j                  dd      z  }|;|j	                  |j                  |j                        }||j                  d      z  }t        ||j                  |j                  |j                        S )	N)rh   r   r   r   T)r   keepdim)r   r   )r   r   r   r   r)   )r   embedding_proj_layerweightr   r   normr   	unsqueezer   r   r   r   )	r4   rh   r   r   r5   
vlm_outputlast_hidden_states
proj_dtyper   s	            r6   forwardz"ColModernVBertForRetrieval.forward  s     TXX 
)%
 	

 (]..55;;
../A/D/DZ/PQ
  *//b$/"GG
%+..Z5E5EjN_N_.`N#n&>&>r&BBJ/!$22!,, * > >	
 	
r7   )NNN)r9   r:   r;   _checkpoint_conversion_mappingr   r3   r   r   rP   
LongTensorr   r   r   r   r   r   r   r   s   @r6   r   r     s    $ &("3 
  .215.2	
##d*
 ''$.
 t+	

 +,
 
*
  
r7   r   )r   r   r   rS   )3copyr   dataclassesr   typingr   r   r   rP   configuration_utilsr	   feature_extraction_utilsr
   image_utilsr   r   processing_utilsr   tokenization_utils_baser   r   r   r   r   r   utils.genericr   utils.import_utilsr   autor   auto.modeling_autor   colpali.modeling_colpalir   r   colqwen2.configuration_colqwen2r   idefics3.processing_idefics3r   r   
get_loggerr9   r*   r   rB   rS   r   r   r   __all__r)   r7   r6   <module>r      s    ! ' '  3 4 5 & 0 M M - * ! * R < U 
		H	%>,> >,B$;5  
:k(/ k(  k(\ !$: ! ! 
7{ 7 7& "(
!4 (
#"(
Vr7   