
    qivf                     ,   d dl Z d dlmZ d dlmZmZmZ d dlZd dl	Z	ddl
mZ ddlmZmZmZ ddlmZmZmZmZ ddlmZmZmZ dd	lmZ dd
lmZ erddlmZ  G d ded      ZdefdZ d Z!d Z"d Z#d Z$ ed      e G d de                    Z%dgZ&y)    N)
accumulate)TYPE_CHECKINGOptionalUnion   )BatchFeature)
ImageInputis_valid_image
load_image)MultiModalDataProcessingKwargsProcessorMixinUnpack)
AddedTokenBatchEncoding	TextInput)auto_docstring)requires)PreTokenizedInputc                   (    e Zd ZddiddddddidZy	)
ColModernVBertProcessorKwargspaddinglongestTchannels_first)return_row_col_infodata_formatdo_convert_rgbreturn_tensorspt)text_kwargsimages_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults     n/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/colmodernvbert/processing_colmodernvbert.pyr   r   (   s/     y
 $(+"

 +D1
Ir(   r   F)totalreturnc                 H    t        | t              xr | j                  d      S )Nhttp)
isinstancestr
startswith)vals    r)   is_urlr2   6   s    c3:CNN6$::r(   c                 2    t        |       xs t        |       S N)r2   r
   )elems    r)   is_image_or_image_urlr6   :   s    $</>$//r(   c           	          d}t        |      D ]4  }t        |      D ]  }|| d|dz    d|dz    dz   | | z  z   z  }! |dz  }6 |d| | z   | | z  z   | z   z  }|S )zKPrompt with expanded image tokens for when the image is split into patches. <row_   _col_>
)range)	image_seq_len
image_rows
image_colsfake_token_around_imageimage_tokenglobal_img_tokentext_split_imagesn_hn_ws	            r)   _prompt_split_imagerH   >   s    Z  "$ 	C*+sQwiuS1WIQ/OOU`TaerRrr	 	T!" 
$%&	 M]
*	+ %%	' r(   c                 &    | | z   | | z  z   | z   S )z5Prompt with expanded image tokens for a single image.r'   )r?   rB   rC   rD   s       r)   _prompt_single_imagerJ   Q   s6     #
#	 M]
*	+ %%	'r(   c                 T    | dk(  r|dk(  rt        ||||      S t        || ||||      S )Nr   )rB   rC   rD   )rJ   rH   )r@   rA   r?   rB   rC   rD   s         r)   get_image_prompt_stringrL   [   sH     Q:?#$;#-	
 	
 z:/FUe r(   )torch)backendsc                   j    e Zd ZdZ	 	 	 	 	 ddededz  dedz  f fdZd Ze	 	 	 dde	e
e	   z  e
e
e	      z  d	eed
e
e   e
d
   f   dedz  dee   def
d       ZddZ	 dde	dz  dee   defdZd	ee
e   z  dee   defdZ	 	 	 ddede
d   f   dede
d   f   deded   dedef   ddfdZ xZS )ColModernVBertProcessora!  
    Constructs a ColModernVBert processor which wraps a ModernVBertProcessor and special methods to process images and queries, as
    well as to compute the late-interaction retrieval score.

    [`ColModernVBertProcessor`] offers all the functionalities of [`ModernVBertProcessor`]. See the [`~ModernVBertProcessor.__call__`]
    for more information.

    Args:
            image_processor ([`Idefics3ImageProcessor`]): An instance of [`Idefics3ImageProcessor`]. The image processor is a required input.
            tokenizer (`PreTrainedTokenizerFast`, *optional*): An instance of [`PreTrainedTokenizerFast`]. This should correspond with the model's text model. The tokenizer is a required input.
            image_seq_len (`int`, *optional*, defaults to 64): The length of the image sequence i.e. the number of <image> tokens per image in the input.
            visual_prompt_prefix (`Optional`, *optional*): A prefix to be prepended to visual prompts.
            query_prefix (`Optional`, *optional*): A prefix to be prepended to query prompts.
    Nr?   visual_prompt_prefixquery_prefixc                    d}t        ddd      j                  | _        t        ddd      j                  | _        t        ddd      j                  | _        d| _        || _        |j                  | j                        | _        |j                  | j                        | _	        |j                  | j
                        | _
        t        d	      D 	cg c]0  }t        d	      D ]   }	|j                  d
|dz    d|	dz    d      " 2 c}	}| _        t        j                  d      | _        d| j                  | j                  | j                  gi}
|j!                  |
       |j                  | j                        | _        t#        | H  ||fd|i| |xs d| j                   d| _        |xs d| _        | j                  | _        yc c}	}w )a  
        image_seq_len (`int`, *optional*, defaults to 64):
            The length of the image sequence i.e. the number of <image> tokens per image in the input.
        visual_prompt_prefix (`str`, *optional*):
            A string that gets tokenized and prepended to the image tokens.
        query_prefix (`str`, *optional*):
            A prefix to be used for the query.
        Nz<fake_token_around_image>FT)
normalizedspecialz<image>z<end_of_utterance>z<global-img>   r9   r:   r;   r<   z*(\n?<global-img>\n?|<row_\d+_col_\d+>\n?)+additional_special_tokenschat_templatez<|begin_of_text|>User:z0Describe the image.<end_of_utterance>
Assistant:r8   )r   contentfake_image_tokenrC   end_of_utterance_tokenglobal_image_tagr?   convert_tokens_to_idsimage_token_idfake_image_token_idglobal_image_token_idr>   row_col_idsrecompile%_regex_to_remove_extra_special_tokensadd_special_tokenssuper__init__rQ   rR   query_augmentation_token)selfimage_processor	tokenizerrX   r?   rQ   rR   kwargsijtokens_to_add	__class__s              r)   rg   z ColModernVBertProcessor.__init__|   s   $  *+FSXbf g o o%iE4PXX&01ERWae&f&n&n# .*'==d>N>NO#,#B#B4CXCX#Y %.%D%DTEZEZ%["SXYZS[
NOejklem
`aI++eAE7%Awa,HI
I
 68ZZ@m5n2 (%%  ++*
 	$$]3'==d>N>NO)[=[TZ[$8 %
$T%5%5$66gh 	! ).B(,(C(C%1
s   5Gc                     g }|D ]_  }g }|D ]E  }t        |      r|j                  |        t        |      s,|j                  t        |             G |j                  |       a |S r4   )r
   appendr2   r   )ri   promptsprompt_imagespromptimagesr5   s         r)   _extract_images_from_promptsz4ColModernVBertProcessor._extract_images_from_prompts   sn     	)FF 4!$'MM$'D\MM*T"23	4
   (	) r(   rv   textr   rl   r+   c                    ||t        d       | j                  t        fd| j                  j                  i|}||n| j
                  }|d   j                  dd      }|d   j                  dd      }g }g }	i }
|jt        |t              r|g}n.t        |t              st        |d   t              st        d	      |D cg c]  }|j                  | j                         }}|Wt        |      r|gg}nt        |t        t        f      rt        |d         r|t        |      t        |      k7  r>t        d
| j                   dt        |       d| j                   dt        |       d	      dgt        t!        |            z   }t#        t        |            D cg c]  }|||   ||dz        }}nO|g}nKt        |t        t        f      s5t        |d   t        t        f      st        |d   d         st        d      |D cg c]  }t        |       }	}|D cg c](  }|D cg c]  }t%        |      rt'        |      n| c}* }}} | j(                  |fi |d   }|
j+                  |       ||	|k7  rt        d| d|	 d      |
j                  d|D cg c]  }dg|z  
 c}      }|
j                  d|D cg c]  }dg|z  
 c}      }| j,                  }| j                  }| j.                  }g }g }t1        |||      D ]  \  }}}g }g }t1        ||      D ]`  \  }}t3        ||||||      }| j
                  dz   |z  dz   }|j5                  | j
                  dz   ||z  z          |j5                  |       b |j5                  |       |j7                  |      } t        |       dk(  rt        d      | d   }t9        |      D ]  \  }}||| |dz      z   z  } |j5                  |         | j                  |fi |d   }!| j;                  ||!dg       |
j+                  |!       nZ|Xt=        |      r%t        dt        |       d| j                   d       | j                  d#d|i|d   }!|
j+                  |!       |rt?        j@                  |
d          }"t?        jB                  |"      }#t9              D ]o  \  }}$t?        jD                  |"|   | jF                  k(        d   }%d}&|$D ];  }'|&t        |%      k\  r E|%|&   }(|(|'z   })d|#||(|)f<   t?        jH                  |%|)      }&= q |#jK                         |
d!<   tM        |
|"      S c c}w c c}w c c}w c c}w c c}}w c c}w c c}w )$a  
        image_seq_len (`int`, *optional*):
            The length of the image sequence. If not provided, the default value of self.image_seq_len is used.
            image_seq_len should be equal to int(((image_size // patch_size) ** 2) / (scale_factor**2))
        Nz+You must provide either `text` or `images`.tokenizer_init_kwargsr    return_mm_token_type_idsFr   r   zAInvalid input text. Please provide a string, or a list of stringszThe total number of zP tokens in the prompts should be the same as the number of images passed. Found  z tokens and z images.r:   zdInvalid input images. Please provide a single image or a list of images or a list of list of images.r!   z!The number of images in the text z and images z should be the same.rowscols)rC   rB   rD      r   z.The image token should be present in the text.image)
modalitieszFound z. tokens in the text but no images were passed.rx   	input_idsmm_token_type_ids)datatensor_typer'   )'
ValueError_merge_kwargsr   rk   init_kwargsr?   popr.   r/   listcountrC   r6   tuplesumlenr   r>   r2   r   rj   updaterZ   r\   ziprL   rr   split	enumerate_check_special_mm_tokensanynparray
zeros_likewherer_   searchsortedtolistr   )*ri   rv   rx   r?   rl   output_kwargsr{   r   n_images_in_textn_images_in_imagesinputssamplecumsum_images_in_textrm   imimage_inputsn_imagesr@   rA   rZ   rC   rD   prompt_stringsbatch_image_seq_lengthssample_rowssample_colsimage_prompt_stringsimage_seq_lengthsn_rowsn_colsimage_prompt_string
row_lengthsplit_sampletext_inputs	array_idsr   seq_lengthsimage_start_positionsrn   seq_lenstartends*                                             r)   __call__z ColModernVBertProcessor.__call__   sO    <FNJKK***)
"&.."<"<
 
 *7)BHZHZ#0#?#C#CD^`e#f &}599:JDQ$$vd+JtAw4L !deeMQR6T-=-= >RR$V,!(FT5M27LVTUY7W#+,F;(243C3C2D E&&)*:&;%<Ad>N>N=O|\_`f\g[hhpr 
 ./C$zBR7S2T,T) "'s+;'<!= 4Q7:OPQTUPU:VWF 
 %XFve}5"6!9tUm<-fQil; z  =C!C&#f+!C!C ]ccRXfMz"~;McFc/4//Y-:XYLMM,'%)99$;<L;M\ZlYm  nB  C  $ZZP`0aH!x0ab
#ZZP`0aH!x0ab
#'#8#8 "..#'#8#8 !#*,'8;D*j8Y 24FK+-((*%*-k;*G I.E"")(34D-=/+ '+&8&81&<%F%J
)00$2D2Dq2HJY_L_1_`,334GHI ,223DE#)<<#<L<(A-()YZZ *!_F2;<P2Q L.."5QU8K"KKL"))&172: -dnn^\}]?[\--nkW^V_-`k*#$ S!1231T5E5E4FFtu  )$..SdSmM6RSKMM+&#!45I "i 8"+,C"D 	D;(*1AYAY1Y(Z[\(]%* DGC 5661!4E'/C67%asl3(=sCAD	D +<*B*B*DF&'^DDY  S "D Nc 1b0as6   "U)&U.U3*	U=3U8U=+V
V
8U=c                    i }|t         j                  j                  di       }|j                  |       |D cg c]   } | j                  j
                  g || " }}| j                  dz   }| j                  dz   }g }	g }
|D ]6  \  }}}||z  dz   }|	j                  |||z  z          |
j                  |       8 |j                  |	|
d       t        di |S c c}w )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        r!   r   r   r:   )num_image_tokensnum_image_patchesr'   )	r   r&   getr   rj   get_number_of_image_patchesr?   rr   r   )ri   image_sizesrl   vision_datar!   
image_sizenum_image_row_colsbase_image_length
col_lengthr   r   num_patchesnum_rowsnum_colsr   s                  r)   _get_num_multimodal_tokensz2ColModernVBertProcessor._get_num_multimodal_tokensN  s    "9CCGGY[\M  ( #." A$$@@\*\m\" "
 !% 2 2Q 6++a/J! "3E 6/Xx'(2Q6
 ''(9Z(=R(ST!((56
 4D[lmn,,,#"s   %Cc                 v    | j                   t        fd| j                  j                  i|}|d   j	                  dd      }|du}t        |      r|g}n^t        |t              rt        |d         rn?t        |t              r$t        |d   t              rt        |d   d         st        d      |D cg c]  }|j                  d       }}| j                  | j                  gt        |      z  ||d   |d   	      }|r.|d
   j                  |d   dk(  d      }|j                  d|i       |S c c}w )a  
        Prepare for the model one or several image(s). Handles input validation, RGB conversion,
        and prepends the `visual_prompt_prefix` to each image. Optionally computes labels from
        `token_type_ids` when a `suffix` is provided in `text_kwargs`.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        rz   r    suffixNr   zAimages must be an image, list of images or list of list of imagesRGBr!   )rx   rv   r!   r    r   token_type_idsilabels)r   r   rk   r   r   r
   r.   r   r   convertr   rQ   r   masked_fillr   )	ri   rv   rl   r   r   return_token_type_idsr   	batch_docr   s	            r)   process_imagesz&ColModernVBertProcessor.process_imagess  s[   < +**)
"&.."<"<
 
 }-11(DA &d 2 &!XF%.*CVT*z&)T/J~^def^ghi^jOk`aa 5;;5%--&;; MM++,s6{:'8%m4	 " 
	 !{+77	BR8SWX8XZ^_Fh/0 <s   8D6c                     | j                   t        fd| j                  j                  i|}|d   j	                  dd      }t        |t              r|g}n.t        |t              rt        |d   t              st        d      || j                  dz  }|D cg c]  }| j                  |z   |z    }}| j                  |d|d   	      }|S c c}w )
ad  
        Prepare for the model one or several text queries. Handles input validation, prepends the
        `query_prefix`, and appends query augmentation tokens (used to pad query embeddings for
        better late-interaction retrieval performance).

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
        rz   r    r   Nr   z*Text must be a string or a list of strings
   F)rx   r   r    )r   r   rk   r   r   r.   r/   r   r   rh   rR   r   )ri   rx   rl   r   r   querytexts_querybatch_querys           r)   process_queriesz'ColModernVBertProcessor.process_queries  s    : +**)
"&.."<"<
 
 }-11(DAdC 6DT4(ZQ-EIJJ >22R7F SW!W$"3"3e";f"D!W!Wmm"'%m4 $ 
  "Xs   Cquery_embeddingsztorch.Tensorpassage_embeddings
batch_sizeoutput_dtypeztorch.dtypeoutput_deviceztorch.devicec           	         t        |      dk(  rt        d      t        |      dk(  rt        d      |d   j                  |d   j                  k7  rt        d      |d   j                  |d   j                  k7  rt        d      ||d   j                  }g }t	        dt        |      |      D ]%  }g }t
        j                  j                  j                  j                  ||||z    dd      }	t	        dt        |      |      D ]  }
t
        j                  j                  j                  j                  ||
|
|z    dd      }|j                  t        j                  d|	|      j                  d	
      d   j                  d
              |j                  t        j                  |d
      j                  |      j                  |             ( t        j                  |d
      S )a[  
        Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
        query embeddings (`qs`) and passage embeddings (`ps`). For ColQwen2, a passage is the
        image of a document page.

        Because the embedding tensors are multi-vector and can thus have different shapes, they
        should be fed as:
        (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
        (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
            obtained by padding the list of tensors.

        Args:
            query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings.
            passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings.
            batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
            output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor.
                If `None`, the dtype of the input embeddings is used.
            output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor.

        Returns:
            `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
            tensor is saved on the "cpu" device.
        r   zNo queries providedzNo passages providedz/Queries and passages must be on the same devicez-Queries and passages must have the same dtypeT)batch_firstpadding_valuezbnd,csd->bcnsr   )dimr   r:   )r   r   devicedtyper>   rM   nnutilsrnnpad_sequencerr   einsummaxr   catto)ri   r   r   r   r   r   scoresrm   batch_scoresbatch_queriesrn   batch_passagess               r)   score_retrievalz'ColModernVBertProcessor.score_retrieval  s   @  A%233!"a'344A%%);A)>)E)EENOOA$$(:1(=(C(CCLMM+A.44L%'q#./< 	]A/1L!HHNN..;; Q^4$VW < M 1c"45zB !&!3!3!@!@&q1z>:\] "A " ##LL-PTTYZT[\]^bbghbi	 MM%))La8;;LILL][\	] yyQ''r(   )NN@   NN)NNNr4   )   Ncpu)r#   r$   r%   __doc__intr/   rg   rw   r   r	   r   r   r   r   r   r   r   r   r   r   r   r   r   __classcell__)rp   s   @r)   rP   rP   j   s   $ +/#'3D
 3D "Dj3D Dj3Dj
  JNbf$(	NET*--T*5E0FFNE I2DOTJ]E^^_NE Tz	NE
 67NE 
NE NE`#-N %)@T!@ 67@ 
	@D7$y/)7 677 
	7z 0449>(^0D DE>( ".$~2F"FG>( 	>(
 }->( ^S01>( 
>(r(   rP   )'rb   	itertoolsr   typingr   r   r   numpyr   rM   feature_extraction_utilsr   image_utilsr	   r
   r   processing_utilsr   r   r   r   tokenization_utils_baser   r   r   r   r   utils.import_utilsr   r   r   boolr2   r6   rH   rJ   rL   rP   __all__r'   r(   r)   <module>r      s   * 
   1 1   4 A A X X K K # * <$4E ;4 ;0& 
:@(n @(  @(F %
%r(   