
    qi{=                        d dl mZ ddlmZ ddlmZ ddlmZmZ ddl	m
Z
mZmZmZ ddlmZmZ ddlmZmZmZmZmZ d	d
lmZmZ d	dlmZ ddlmZ  e       rd dlZ ej>                  e       Z! G d ded      Z" G d de      Z# G d de      Z$e ed       G d de                    Z% ed       G d de             Z&g dZ'y)    )	dataclass   )Cache)BatchFeature)
ImageInputis_valid_image)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)ModelOutputauto_docstringcan_return_tupleis_torch_availablelogging   )ColPaliForRetrievalColPaliPreTrainedModel)ColPaliProcessor   )ColQwen2ConfigNc                   &    e Zd ZddidddddidZy	)
ColQwen2ProcessorKwargspaddinglongestchannels_firstT)data_formatdo_convert_rgbreturn_tensorspt)text_kwargsimages_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults     _/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/colqwen2/modular_colqwen2.pyr   r   "   s,     y
 ,"
 +D1	Ir+   r   F)totalc            	           e Zd Z	 	 	 	 	 ddedz  dedz  fdZ	 	 ddedz  deez  ee   z  ee   z  de	e
   defd	Zdd
Zed        Zy)ColQwen2ProcessorNvisual_prompt_prefixquery_prefixc                     t        j                  | |||       t        |d      sdn|j                  | _        t        |d      sdn|j                  | _        |xs d| _        |xs d| _        y)	ar  
        visual_prompt_prefix (`str`, *optional*, defaults to `"<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>"`):
            A string that gets tokenized and prepended to the image tokens.
        query_prefix (`str`, *optional*, defaults to `"Query: "`):
            A prefix to be used for the query.
        )chat_templateimage_tokenz<|image_pad|>video_tokenz<|video_pad|>zf<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>zQuery: N)r   __init__hasattrr4   r5   r0   r1   )selfimage_processor	tokenizerr3   r0   r1   kwargss          r,   r6   zColQwen2Processor.__init__0   so     	oyP]^29)]2S?YbYnYn29)]2S?YbYnYn$8 %
u 	! )5Ir+   imagestextr;   returnc                 v    | j                   t        fd| j                  j                  i|}|d   j	                  dd      }|du}||t        d      ||t        d      |7t        |      r|g}n^t        |t              rt        |d         rn?t        |t              r$t        |d   t              rt        |d   d         st        d      | j                  gt        |      z  } | j                  dd	|i|d
   }|d   }	|	| j                  j                  dz  }
d}t        t        |            D ]  }| j                  ||   v rQ||   j                  | j                  d|	|   j!                         |
z  z  d      ||<   |dz  }| j                  ||   v rQ||   j                  d| j                        ||<     | j                  |fddi|d   }t#        i ||      }|d   dddf   |d   dddf   z  }t        t%        j&                  |d   |j)                                     }t$        j*                  j,                  j.                  j1                  |d      |d<   |r.|d   j3                  |d   dk(  d      }|j5                  d|i       |S |t        |t6              r|g}n.t        |t              rt        |d   t6              st        d      || j8                  dz  }g }|D ]%  }| j:                  |z   |z   }|j=                  |       '  | j                  |fddi|d   }|S y)a  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsr#   suffixNz&Either text or images must be providedz5Only one of text or images can be processed at a timer   zAimages must be an image, list of images or list of list of imagesr<   r$   image_grid_thwr   z<|placeholder|>r   return_token_type_idsF)datapixel_valuesT)batch_first	input_idstoken_type_idsilabelsz*Text must be a string or a list of strings
   r*   )_merge_kwargsr   r:   init_kwargspop
ValueErrorr   
isinstancelistr0   lenr9   
merge_sizeranger4   replaceprodr   torchsplittolistnnutilsrnnpad_sequencemasked_fillupdatestrquery_augmentation_tokenr1   append)r8   r<   r=   r;   output_kwargsrA   rC   	texts_docimage_inputsrB   merge_lengthindexitext_inputsreturn_dataoffsetsrE   rI   texts_queryqueryaugmented_querybatch_querys                         r,   __call__zColQwen2Processor.__call__H   s     +**#
"&.."<"<
 

 }-11(DA &d 2<FNEFF 2TUUf% FD)nVAY.G .:fQi3NSabhijbklmbnSo !dee223c&kAI/4//`v`A_`L)*:;N)#33>>As9~. ]A**il:'0|';'; ,,.?>RWCXC]C]C_coCo.prs(	! 
	 **il:
 $-Q<#7#78I4K[K[#\IaL] )$..&+  .K ',K{,Kl,KLK ""23AqD9KHX<YZ[]^Z^<__G  K79IJL
 +0((..*<*<*I*I$ +J +K' %$[1==kJZ>[_`>`bfg""Hf#56$$v t,DGS1I !MNN~66;%'K 4"&"3"3e";f"D""?34 )$..&+  .K + r+   c                    i }|t         j                  j                  di       }|j                  |       |j                  dd      xs | j                  j
                  }|D cg c]   } | j                  j                  g || " }}|D cg c]
  }||dz  z   }	}|j                  |	|d       t        di |S c c}w c c}w )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        Nr$   rR   r   )num_image_tokensnum_image_patchesr*   )r   r)   getr^   r9   rR   get_number_of_image_patchesr	   )
r8   image_sizesr;   vision_datar$   rR   
image_sizerr   num_patchesrq   s
             r,   _get_num_multimodal_tokensz,ColQwen2Processor._get_num_multimodal_tokens   s     "3==AA/SUVM  (&**<>a$BVBVBaBaJ #.! A$$@@\*\m\! ! Sdd;
A!=dd4D[lmn,,,!  es   $%B?Cc                     | j                   j                  }| j                  j                  }|D cg c]	  }|dvs| }}||z   S c c}w )N)pixel_values_videosvideo_grid_thw)r:   model_input_namesr9   )r8   tokenizer_input_namesimage_processor_input_namesnames       r,   r}   z#ColQwen2Processor.model_input_names   s\     $ @ @&*&:&:&L&L#
 9'
DHq<qD'
# '
 %'BBB'
s
   	AA)NNNNN)NNN)r&   r'   r(   r_   r6   r   r   r   rP   r   r   r   ro   ry   propertyr}   r*   r+   r,   r/   r/   /   s     +/#'6
 "Dj6 Dj64 %)Z^fT!f ++d9o=EV@WWf 01	f
 
fP-4 	C 	Cr+   r/   c                       e Zd Zy)ColQwen2PreTrainedModelN)r&   r'   r(   r*   r+   r,   r   r      s    r+   r   z4
    Base class for ColQwen2 embeddings output.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ
edz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   y)ColQwen2ForRetrievalOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        The embeddings of the model.
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    Nloss
embeddingspast_key_valueshidden_states
attentions)r&   r'   r(   __doc__r   rV   FloatTensor__annotations__r   Tensorr   r   r   tupler   r*   r+   r,   r   r      ss    
 &*D%

d
")&*Jt#*$(OUT\(59M5**+d2926Je''(4/6r+   r   uG  
    Following the ColPali approach, ColQwen2 leverages VLMs to construct efficient multi-vector embeddings directly
    from document images (“screenshots”) for document retrieval. The model is trained to maximize the similarity
    between these document embeddings and the corresponding query embeddings, using the late interaction method
    introduced in ColBERT.

    Using ColQwen2 removes the need for potentially complex and brittle layout recognition and OCR pipelines with
    a single model that can take into account both the textual and visual content (layout, charts, ...) of a document.

    ColQwen2 is part of the ColVision model family, which was introduced with ColPali in the following paper:
    [*ColPali: Efficient Document Retrieval with Vision Language Models*](https://huggingface.co/papers/2407.01449).
    c                       e Zd Zi Zdef fdZee	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  dedz  dej                  dz  dej                  dz  dej                  dz  defd              Z xZS )ColQwen2ForRetrievalconfigc                 (    t         |   |       | `y r   )superr6   _tied_weights_keys)r8   r   	__class__s     r,   r6   zColQwen2ForRetrieval.__init__  s     #r+   NrG   attention_maskposition_idsr   rI   inputs_embeds	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictrE   rB   cache_positionr>   c                    |k|i|dddf   |dddf   z  }t        j                  |j                  d   |j                        }|j	                  d      |j	                  d      k  }||   }||n| j
                  j                  }|	|	n| j
                  j                  }	|
|
n| j
                  j                  }
| | j                  j                         |      }|| j                  j                  j                  ||d      j                  }|| j
                  j                  j                  k(  j	                  d      j!                  |      }|j#                  |j                  |j$                        }|j'                  ||      }| j                  j                  d|||||||	|
|	
      }|	r|j(                  nd}|d   }| j*                  j,                  j$                  }| j+                  |j#                  |            }||j/                  dd
      z  }|||j	                  d      z  }t1        ||j2                  ||j4                        S )z
        image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
            The temporal, height and width of feature shape of each image in LLM.
        Nr   r   )devicer   T)grid_thwr   )
rG   r   r   r   r   r   r   r   r   r   )dimkeepdim)r   r   r   r   )rV   arangeshaper   	unsqueezer   r   r   use_return_dictvlmget_input_embeddingsmodelvisualpooler_output
vlm_configimage_token_id	expand_astodtypemasked_scatterr   embedding_proj_layerweightnormr   r   r   )r8   rG   r   r   r   rI   r   r   r   r   r   rE   rB   r   r;   rj   r   maskimage_embeds
image_mask
vlm_outputvlm_hidden_stateslast_hidden_states
proj_dtyper   s                            r,   forwardzColQwen2ForRetrieval.forward
  s\   0 #(B$QT*^AqD-AAG\\,"4"4Q"7OF##A&):):1)==D'-L1B1N-TXT_T_TqTq %9$D $++JjJj 	 &1%<k$++B]B]  ;DHH99;IFM'#xx~~44 >t  5  -  $++"8"8"G"GGRRSUV``ano   ,}/C/C]EXEXY - < <Z VXX^^%)+'/!5#) $ 

 9MJ44RV']..55;;
../A/D/DZ/PQ
  *//b$/"GG
%#n&>&>r&BBJ)!&66+!,,	
 	
r+   )NNNNNNNNNNNNN)r&   r'   r(   _checkpoint_conversion_mappingr   r6   r   r   rV   
LongTensorr   r   r   boolr   r   __classcell__)r   s   @r,   r   r      s^    &("$~ $  .2.204(,*.26!%)-,0#',02626O
##d*O
 t+O
 &&-	O

 O
   4'O
 ((4/O
 $;O
  $;O
 #TkO
 D[O
 llT)O
 ((4/O
 ((4/O
  
$!O
  O
r+   r   )r   r   r/   )(dataclassesr   cache_utilsr   feature_extraction_utilsr   image_utilsr   r   processing_utilsr	   r
   r   r   tokenization_utils_baser   r   rZ   r   r   r   r   r   colpali.modeling_colpalir   r   colpali.processing_colpalir   configuration_colqwen2r   rV   
get_loggerr&   loggerr   r/   r   r   r   __all__r*   r+   r,   <module>r      s    "   4 5 X X C _ _ R 9 2 			H	%
.e 
eC( eCP	4 	 
7 7 7( X
. X
X
vr+   