
    qi=!                        d dl mZ d dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZmZmZ dd	lmZ d
dlmZ ddlmZ e G d de             Ze ed       G d de                    Z ed       G d de             ZddgZy)    )	dataclassN)nn   )initialization)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstring)can_return_tuple   )	AutoModel   )ColModernVBertConfigc                   ^    e Zd ZU eed<   dZdZg ZdZdZ	dZ
 ej                         d        Zy)ColModernVBertPreTrainedModelconfigmodel)imagetextTc                    t        | j                  d      r| j                  j                  n)| j                  j                  j                  j                  }t        |t        j                  t        j                  f      rOt        j                  |j                  d|       |j                   t        j                  |j                         y y t        |t        j                        rtt        j                  |j                  d|       |j                  Et!        |j                  dd      s-t        j                  |j                  |j                            y y y y )Ninitializer_rangeg        )meanstd_is_hf_initializedF)hasattrr   r   
vlm_configtext_config
isinstancer   LinearConv2dinitnormal_weightbiaszeros_	Embeddingpadding_idxgetattr)selfmoduler   s      l/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/colmodernvbert/modeling_colmodernvbert.py_init_weightsz+ColModernVBertPreTrainedModel._init_weights-   s     t{{$78 KK))''33EE 	 fryy"))45LLSc:{{&FKK( '-LLSc:!!-gfmmMach6iFMM&*<*<=> 7j- .    N)__name__
__module____qualname__r   __annotations__base_model_prefixinput_modalities_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attntorchno_gradr-    r.   r,   r   r   #   sE      (NU]]_? ?r.   r   z:
    Base class for ColModernVBert embeddings output.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ
eej                     dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   y) ColModernVBertForRetrievalOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The embeddings of the model.
    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True` and `pixel_values` are provided):
        Tuple of `torch.FloatTensor` (one for the output of the image modality projection + one for the output of each layer) of shape
        `(batch_size, num_channels, image_size, image_size)`.
        Hidden-states of the image encoder at the output of each layer plus the initial modality projection outputs.
    Nloss
embeddingshidden_statesimage_hidden_states
attentions)r/   r0   r1   __doc__r?   r9   FloatTensorr2   r@   TensorrA   tuplerB   rC   r;   r.   r,   r>   r>   @   s    	 &*D%

d
")&*Jt#*59M5**+d29;?u001D8?26Je''(4/6r.   r>   u  
    Following the ColPali approach, ColModernVBert leverages VLMs to construct efficient multi-vector embeddings directly
    from document images (“screenshots”) for document retrieval. The model is trained to maximize the similarity
    between these document embeddings and the corresponding query embeddings, using the late interaction method
    introduced in ColBERT.

    Using ColModernVBert removes the need for potentially complex and brittle layout recognition and OCR pipelines with
    a single model that can take into account both the textual and visual content (layout, charts, ...) of a document.

    ColModernVBert is trained on top of ModernVBert, and was introduced in the following paper:
    [*ModernVBERT: Towards Smaller Visual Document Retrievers*](https://arxiv.org/abs/2510.01149).

    ColModernVBert is part of the ColVision model family, which was introduced with ColPali in the following paper:
    [*ColPali: Efficient Document Retrieval with Vision Language Models*](https://huggingface.co/papers/2407.01449).
    c                       e Zd Zi Zdef fdZee	 	 	 ddej                  dz  dej                  dz  dej                  dz  dee   def
d	              Zd
 Zd Zd Zd Z	 	 	 ddedz  dedz  dedej.                  fdZ xZS )ColModernVBertForRetrievalr   c                    t         |   |       || _        |j                  j                  j
                  | _        t        j                  |j                        | _        | j                  j                  | _	        t        j                  | j                  j                  j                  j                  | j                        | _        | j                          y N)super__init__r   r   r   
vocab_sizer   from_configvlmembedding_dimr   r    hidden_sizeembedding_proj_layer	post_init)r*   r   	__class__s     r,   rM   z#ColModernVBertForRetrieval.__init__m   s      ++77BB(():):;![[66$&IIKK""..::%
!
 	r.   N	input_idspixel_valuesattention_maskkwargsreturnc                     | j                   d|||d|}|d   }| j                  j                  j                  }| j                  |j	                  |            }||j                  dd      z  }|;|j	                  |j                  |j                        }||j                  d      z  }t        ||j                  |j                  |j                        S )	N)rV   rX   rW   r   T)dimkeepdim)dtypedevice)r@   rA   rC   rB   r;   )rP   rS   r$   r_   tonormr`   	unsqueezer>   rA   rC   rB   )	r*   rV   rW   rX   rY   
vlm_outputlast_hidden_states
proj_dtyper@   s	            r,   forwardz"ColModernVBertForRetrieval.forward{   s     TXX 
)%
 	

 (]..55;;
../A/D/DZ/PQ
  *//b$/"GG
%+..Z5E5EjN_N_.`N#n&>&>r&BBJ/!$22!,, * > >	
 	
r.   c                 6    | j                   j                         S rK   )rP   get_input_embeddingsr*   s    r,   ri   z/ColModernVBertForRetrieval.get_input_embeddings   s    xx,,..r.   c                 :    | j                   j                  |       y rK   )rP   set_input_embeddings)r*   values     r,   rl   z/ColModernVBertForRetrieval.set_input_embeddings   s    %%e,r.   c                 6    | j                   j                         S rK   )rP   get_output_embeddingsrj   s    r,   ro   z0ColModernVBertForRetrieval.get_output_embeddings   s    xx--//r.   c                 :    | j                   j                  |       y rK   )rP   set_output_embeddings)r*   new_embeddingss     r,   rq   z0ColModernVBertForRetrieval.set_output_embeddings   s    &&~6r.   new_num_tokenspad_to_multiple_ofmean_resizingc                 B   | j                   j                  |||      }|j                  | j                  j                  j
                  _        |j                  | j                  j                  _        |j                  | j                   _        |j                  | _        |S )N)rs   rt   ru   )rP   resize_token_embeddingsnum_embeddingsr   r   r   rN   )r*   rs   rt   ru   model_embedss        r,   rw   z2ColModernVBertForRetrieval.resize_token_embeddings   s     xx77)1' 8 
 9E8S8S**5,8,G,G)*99&55r.   )NNN)NNT)r/   r0   r1   _checkpoint_conversion_mappingr   rM   r   r   r9   
LongTensorrE   rF   r   r
   r>   rg   ri   rl   ro   rq   intboolr   r'   rw   __classcell__)rU   s   @r,   rI   rI   Y   s    $ &("3   .215.2	
##d*
 ''$.
 t+	

 +,
 
*
  
@/-07
 &*)-"	d
  $J 	
 
r.   rI   )dataclassesr   r9   r    r   r"   modeling_utilsr   processing_utilsr   utilsr	   r
   r   utils.genericr   auto.modeling_autor   configuration_colmodernvbertr   r   r>   rI   __all__r;   r.   r,   <module>r      s   * "   & - & D D - * > ?O ? ?8 
7{ 7 7& "P!> P#"Pf ()H
Ir.   