
    qi:                     (   d Z ddlZddlmZ ddlmZmZ ddlZddl	m
Z
 ddlmZmZmZ ddlmZmZmZmZ dd	lmZmZmZ dd
lmZmZ erddlmZ  ej8                  e      ZdefdZ d Z!d Z"d Z#d Z$ G d ded      Z%e G d de             Z&dgZ'y)z
Processor class for Idefics3.
    N)
accumulate)TYPE_CHECKINGUnion   )BatchFeature)
ImageInputis_valid_image
load_image)MultiModalDataProcessingKwargsProcessorMixinUnpack)
AddedTokenBatchEncoding	TextInput)auto_docstringlogging)PreTokenizedInputreturnc                 H    t        | t              xr | j                  d      S )Nhttp)
isinstancestr
startswith)vals    b/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/idefics3/processing_idefics3.pyis_urlr   %   s    c3:CNN6$::    c                 2    t        |       xs t        |       S N)r   r	   )elems    r   is_image_or_image_urlr"   )   s    $</>$//r   c           	          d}t        |      D ]4  }t        |      D ]  }|| d|dz    d|dz    dz   | | z  z   z  }! |dz  }6 |d| | z   | | z  z   | z   z  }|S )zKPrompt with expanded image tokens for when the image is split into patches. <row_   _col_>
)range)	image_seq_len
image_rows
image_colsfake_token_around_imageimage_tokenglobal_img_tokentext_split_imagesn_hn_ws	            r   _prompt_split_imager4   -   s    Z  "$ 	C*+sQwiuS1WIQ/OOU`TaerRrr	 	T!" 
$%&	 M]
*	+ %%	' r   c                 &    | | z   | | z  z   | z   S )z5Prompt with expanded image tokens for a single image. )r+   r.   r/   r0   s       r   _prompt_single_imager7   @   s6     #
#	 M]
*	+ %%	'r   c                 T    | dk(  r|dk(  rt        ||||      S t        || ||||      S )Nr   )r.   r/   r0   )r7   r4   )r,   r-   r+   r.   r/   r0   s         r   get_image_prompt_stringr9   J   sH     Q:?#$;#-	
 	
 z:/FUe r   c                   $    e Zd ZdddddddidZy)Idefics3ProcessorKwargsTF)add_special_tokenspaddingis_split_into_wordsreturn_mm_token_type_idsreturn_row_col_info)text_kwargsimages_kwargsN)__name__
__module____qualname__	_defaultsr6   r   r   r;   r;   Y   s(     #'#((-	
 "4

Ir   r;   F)totalc                        e Zd Z	 ddededz  f fdZd Ze	 	 	 ddee	e   z  e	e	e      z  de
ede	e   e	d   f   dedz  d	ee   d
ef
d       ZddZ xZS )Idefics3ProcessorNr+   chat_templatec                 :   t        ddd      j                  | _        t        ddd      j                  | _        t        ddd      j                  | _        d| _        || _        |j                  | j                        | _        |j                  | j                        | _	        |j                  | j
                        | _
        t        d      D cg c]0  }t        d      D ]   }|j                  d	|d
z    d|d
z    d      " 2 c}}| _        t        j                  d      | _        d| j                  | j                  | j                  gi}|j!                  |       |j                  | j                        | _        t#        	| H  ||fd|i| yc c}}w )a  
        image_seq_len (`int`, *optional*, defaults to 169):
            The length of the image sequence i.e. the number of <image> tokens per image in the input.
            This parameter is used to build the string from the input prompt and image tokens and should match the
            value the model used. It is computed as: image_seq_len = int(((image_size // patch_size) ** 2) / (scale_factor**2))
        z<fake_token_around_image>FT)
normalizedspecialz<image>z<end_of_utterance>z<global-img>   r%   r&   r'   r(   z*(\n?<global-img>\n?|<row_\d+_col_\d+>\n?)+additional_special_tokensrJ   N)r   contentfake_image_tokenr/   end_of_utterance_tokenglobal_image_tagr+   convert_tokens_to_idsimage_token_idfake_image_token_idglobal_image_token_idr*   row_col_idsrecompile%_regex_to_remove_extra_special_tokensr<   super__init__)
selfimage_processor	tokenizerr+   rJ   kwargsijtokens_to_add	__class__s
            r   r]   zIdefics3Processor.__init__i   s    !++FSXbf g o o%iE4PXX&01ERWae&f&n&n# .*'==d>N>NO#,#B#B4CXCX#Y %.%D%DTEZEZ%["SXYZS[
NOejklem
`aI++eAE7%Awa,HI
I
 68ZZ@m5n2 (%%  ++*
 	$$]3'==d>N>NO)[=[TZ[%
s   5Fc                     g }|D ]_  }g }|D ]E  }t        |      r|j                  |        t        |      s,|j                  t        |             G |j                  |       a |S r    )r	   appendr   r
   )r^   promptsprompt_imagespromptimagesr!   s         r   _extract_images_from_promptsz.Idefics3Processor._extract_images_from_prompts   sn     	)FF 4!$'MM$'D\MM*T"23	4
   (	) r   rk   textr   ra   r   c                    ||t        d       | j                  t        fd| j                  j                  i|}||n| j
                  }|d   j                  dd      }|d   j                  dd      }g }g }	i }
|jt        |t              r|g}n.t        |t              st        |d   t              st        d	      |D cg c]  }|j                  | j                         }}|Wt        |      r|gg}nt        |t        t        f      rt        |d         r|t        |      t        |      k7  r>t        d
| j                   dt        |       d| j                   dt        |       d	      dgt        t!        |            z   }t#        t        |            D cg c]  }|||   ||dz        }}nO|g}nKt        |t        t        f      s5t        |d   t        t        f      st        |d   d         st        d      |D cg c]  }t        |       }	}|D cg c](  }|D cg c]  }t%        |      rt'        |      n| c}* }}} | j(                  |fi |d   }|
j+                  |       ||	|k7  rt        d| d|	 d      |
j                  d|D cg c]  }dg|z  
 c}      }|
j                  d|D cg c]  }dg|z  
 c}      }| j,                  }| j                  }| j.                  }g }g }t1        |||      D ]  \  }}}g }g }t1        ||      D ]`  \  }}t3        ||||||      }| j
                  dz   |z  dz   }|j5                  | j
                  dz   ||z  z          |j5                  |       b |j5                  |       |j7                  |      } t        |       dk(  rt        d      | d   }t9        |      D ]  \  }}||| |dz      z   z  } |j5                  |         | j                  |fi |d   }!| j;                  ||!dg       |
j+                  |!       nZ|Xt=        |      r%t        dt        |       d| j                   d       | j                  d#d|i|d   }!|
j+                  |!       |rt?        j@                  |
d          }"t?        jB                  |"      }#t9              D ]o  \  }}$t?        jD                  |"|   | jF                  k(        d   }%d}&|$D ];  }'|&t        |%      k\  r E|%|&   }(|(|'z   })d|#||(|)f<   t?        jH                  |%|)      }&= q |#jK                         |
d!<   tM        |
|"      S c c}w c c}w c c}w c c}w c c}}w c c}w c c}w )$a  
        image_seq_len (`int`, *optional*):
            The length of the image sequence. If not provided, the default value of self.image_seq_len is used.
            image_seq_len should be equal to int(((image_size // patch_size) ** 2) / (scale_factor**2))
        Nz+You must provide either `text` or `images`.tokenizer_init_kwargsrA   r?   Freturn_tensorsr   zAInvalid input text. Please provide a string, or a list of stringszThe total number of zP tokens in the prompts should be the same as the number of images passed. Found  z tokens and z images.r&   zdInvalid input images. Please provide a single image or a list of images or a list of list of images.rB   z!The number of images in the text z and images z should be the same.rowscols)r/   r.   r0      r   z.The image token should be present in the text.image)
modalitieszFound z. tokens in the text but no images were passed.rm   	input_idsmm_token_type_ids)datatensor_typer6   )'
ValueError_merge_kwargsr;   r`   init_kwargsr+   popr   r   listcountr/   r"   tuplesumlenr   r*   r   r
   r_   updaterQ   rS   zipr9   rg   split	enumerate_check_special_mm_tokensanynparray
zeros_likewhererV   searchsortedtolistr   )*r^   rk   rm   r+   ra   output_kwargsr?   rp   n_images_in_textn_images_in_imagesinputssamplecumsum_images_in_textrb   imimage_inputsn_imagesr,   r-   rQ   r/   r0   prompt_stringsbatch_image_seq_lengthssample_rowssample_colsimage_prompt_stringsimage_seq_lengthsn_rowsn_colsimage_prompt_string
row_lengthsplit_sampletext_inputs	array_idsrx   seq_lengthsimage_start_positionsrc   seq_lenstartends*                                             r   __call__zIdefics3Processor.__call__   sO    <FNJKK***#
"&.."<"<
 
 *7)BHZHZ#0#?#C#CD^`e#f &}599:JDQ$$vd+JtAw4L !deeMQR6T-=-= >RR$V,!(FT5M27LVTUY7W#+,F;(243C3C2D E&&)*:&;%<Ad>N>N=O|\_`f\g[hhpr 
 ./C$zBR7S2T,T) "'s+;'<!= 4Q7:OPQTUPU:VWF 
 %XFve}5"6!9tUm<-fQil; z  =C!C&#f+!C!C ]ccRXfMz"~;McFc/4//Y-:XYLMM,'%)99$;<L;M\ZlYm  nB  C  $ZZP`0aH!x0ab
#ZZP`0aH!x0ab
#'#8#8 "..#'#8#8 !#*,'8;D*j8Y 24FK+-((*%*-k;*G I.E"")(34D-=/+ '+&8&81&<%F%J
)00$2D2Dq2HJY_L_1_`,334GHI ,223DE#)<<#<L<(A-()YZZ *!_F2;<P2Q L.."5QU8K"KKL"))&172: -dnn^\}]?[\--nkW^V_-`k*#$ S!1231T5E5E4FFtu  )$..SdSmM6RSKMM+&#!45I "i 8"+,C"D 	D;(*1AYAY1Y(Z[\(]%* DGC 5661!4E'/C67%asl3(=sCAD	D +<*B*B*DF&'^DDY  S "D Nc 1b0as6   "U)&U.U3*	U=3U8U=+V
V
8U=c                    i }|t         j                  j                  di       }|j                  |       |D cg c]   } | j                  j
                  g || " }}| j                  dz   }| j                  dz   }g }	g }
|D ]6  \  }}}||z  dz   }|	j                  |||z  z          |
j                  |       8 |j                  |	|
d       t        di |S c c}w )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        rB   r   rt   r&   )num_image_tokensnum_image_patchesr6   )	r;   rF   getr   r_   get_number_of_image_patchesr+   rg   r   )r^   image_sizesra   vision_datarB   
image_sizenum_image_row_colsbase_image_length
col_lengthr   r   num_patchesnum_rowsnum_colsr   s                  r   _get_num_multimodal_tokensz,Idefics3Processor._get_num_multimodal_tokens+  s    "3==AA/SUVM  ( #." A$$@@\*\m\" "
 !% 2 2Q 6++a/J! "3E 6/Xx'(2Q6
 ''(9Z(=R(ST!((56
 4D[lmn,,,#"s   %C)N   N)NNNr    )rC   rD   rE   intr   r]   rl   r   r   r   r   r   r   r;   r   r   r   __classcell__)re   s   @r   rI   rI   g   s     fj#\>A#\X[^bXb#\J
  JNbf$(	NET*--T*5E0FFNE I2DOTJ]E^^_NE Tz	NE
 01NE 
NE NE`#-r   rI   )(__doc__rY   	itertoolsr   typingr   r   numpyr   feature_extraction_utilsr   image_utilsr   r	   r
   processing_utilsr   r   r   r   tokenization_utils_baser   r   r   utilsr   r   r   
get_loggerrC   loggerboolr   r"   r4   r7   r9   r;   rI   __all__r6   r   r   <module>r      s    
   '  4 A A X X K K , <			H	%;4 ;0&.e  f- f- f-R 
r   