
    qiB                         d Z ddlmZ ddlmZ ddlmZ ddlmZm	Z	m
Z
mZ ddlmZmZ ddlmZmZ  e       rdd	lZd
Z G d de
d      Z G d ded      ZddZd Zd Zd Ze G d de	             ZdgZy	)z
Processor class for IDEFICS.
    )urlparse   )BatchFeature)
ImageInput)ProcessingKwargsProcessorMixin
TextKwargsUnpack)PreTokenizedInput	TextInput)auto_docstringis_torch_availableN<image>c                   2    e Zd ZU dZedz  ed<   edz  ed<   y)IdeficsTextKwargsaW  
    add_eos_token (`bool`, *optional*, defaults to `False`):
        Whether to add an end-of-sequence token at the end of the text input. When enabled, an EOS token is
        appended to mark the end of the text sequence, which is useful for generation tasks.
    add_end_of_utterance_token (`bool`, *optional*):
        Whether to add an end-of-utterance token to mark the end of a user's message in conversational contexts.
        This token helps the model distinguish between different utterances in a multi-turn conversation and is
        particularly important for chat-based models.
    Nadd_eos_tokenadd_end_of_utterance_token)__name__
__module____qualname____doc__bool__annotations__     `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/idefics/processing_idefics.pyr   r   '   s     $; $t+r   r   F)totalc                   .    e Zd ZU eed<   ddddddidZy)	IdeficsProcessorKwargstext_kwargsFlongest)add_special_tokenspaddingr   return_tensorspt)r    common_kwargsN)r   r   r   r   r   	_defaultsr   r   r   r   r   6   s)    "" #( "

 +D1Ir   r   c                     |dk7  r|dk(  rd| | |k\  <   |dk(  r>| dk(  }d| |<   t         j                  j                  j                  | |      }d||d d f<   S )Nr%   r   num_classes)torchnn
functionalone_hot)incremental_maskr$   r+   	negatives	attn_masks        r   $incremental_to_binary_attention_maskr3   C   sv    bT!@B-<= $*	&'#HH''//0@k/Z	"#	)Q,r   c                 &    |dk(  rt        | |      S y )Nr%   ),image_attention_mask_for_packed_input_ids_pt)	input_ids	tokenizerr$   s      r   )image_attention_mask_for_packed_input_idsr8   T   s    ;IyQQ r   c                    t        j                  | d      }t        j                  | d      }|j                  t              }|j                  }t        | j                  d            D ]K  }d}d}t        | |         D ]4  \  }	}
|
|k(  r|dz  }|||   |	<   d}n|||   |	<   |rd||   |	<   |
|k(  s3d}6 M t        | j                  d            D ]  }d}d}t        | |   j                  d      dz
  dd      D ]9  }	| |   |	   }
|
|k(  r|dz  }|||   |	<   d}n|||   |	<   |
|k(  rd}|s2d||   |	<   ; ||   dk7  }||   |xx   |z  cc<   ||   |xx   dz  cc<    ||fS )Nr)   )
fill_valuer   F   T)r,   	full_likeconvert_tokens_to_idsIMAGE_TOKENeos_token_idrangesize	enumerate)r6   r7   image_attention_masknext_image_attention_maskimage_token_ideod_token_id	batch_idxcountseen_eodidxtoken_idnon_negative_indicess               r   r5   r5   Y   s    ??9D %	b I44[AN))L9>>!,-  	&y';< 	 MC>)
7<$Y/4 7<$Y/479$Y/4<'	  " 9>>!,- I	9-221592rB 	?C +C0H>)
<A))4S9 <A))4S9<'<>))4S9	?  9CrI!),-ABeKB!),-ABbHB)I,  !:::r   c                 d    d| v ryt        |       }t        |j                  |j                  g      S )zChecks if the passed string contains a valid url and nothing else. e.g. if space is included it's immediately
    invalidated the url F)r   allschemenetloc)stringresults     r   is_urlrT      s0     f}fFv}}-..r   c            
            e Zd Zd fd	Ze	 	 d	deee   z  ez  ee   z  eee      z  dee	z  ee   z  ee	   z  eee      z  eee	      z  de
e   defd       Zed        Z xZS )
IdeficsProcessorc                 p   t         |   ||       t        |d      r|j                  n|j	                  t
              | _        | j                  j                  | j                  j                  | j                  j                  f| _	        d| j                  j                  j                  dg       v | _        y)z
        image_size (int, *optional*, defaults to 224):
            The size of the image to be processed.
        add_end_of_utterance_token (bool, *optional*, defaults to None):
            Whether to add the end of utterance token to the text.
        image_token<end_of_utterance>additional_special_tokensN)super__init__hasattrrE   r=   r>   image_processorimage_num_channels
image_sizedefault_image_dimsr7   special_tokens_mapget1tokenizer_was_trained_with_end_of_utterance_token)selfr^   r7   r`   r   kwargs	__class__s         r   r\   zIdeficsProcessor.__init__   s     	)4 y-0 $$00= 	   33  ++  ++#
 !DNN$E$E$I$IJegi$jj 	>r   imagestextrf   returnc                 	  () ||t        d      ||}n|t        |t        t        f      s|g}t        |t              r|g}t        |t        t        f      r"t        |      t        |      k7  rt        d      t        d |D              st        d      t        |d   t        t        f      r"t        ||      D cg c]
  \  }}|g| }}}nt        t        ||            } | j                  t        fd| j                  j                  i|}|d   j                  d	d
      }|d   j                  dd      }	|	| j                  }	t        d D              s|g}d(d)d}
()fd}g }g }|D ],  }| j                  j                   }g }d
}d
}t!        |      D ]  \  }}|dkD  rt#        |       }t        |t              rg|j%                  d      }t'        |      r:| j(                  j+                  |      }| ||      z  }|j-                  |       d}}|	r|r||
z  }||z  }d
}| ||      z  }|j-                  |       d} |r|| j                  j.                  z  } | j(                  |fi |d   }|j-                  |       |j-                  |       / |d   j                  dd      } | j                  |fi |d   }|d   }|d   }t1        d |D              }t1        d|      }t3        d |D              dkD  }g }g }g }t        |||      D ]  \  }}} |}!|!j5                  | j6                        }"t9        |"|      }#| d|# }$t        |$      dkD  rA|dk(  rat;        j<                  |g|$j?                         dd  }%|$|%d|$j?                  d       n%|dk(  r t;        j<                  |g| j@                   }%|j-                  %       |dk(  s|j-                  t;        jB                  |!             |j-                  t;        jB                  |              |dk(  r?t;        jD                  |      }t;        jD                  |      }t;        jD                  |      }|r)tG        || j                  |      \  }&}'tI        |&||      }&nF|dk(  rAt;        j<                  |jJ                  d   |jJ                  d   dt:        j"                        }&tM        |||&d      S c c}}w )a
  
        Returns:
            a dict with entries: `input_ids`, `attention_mask`, `pixel_values`, `image_attention_mask` which can be
            directly passed to `model.generate`

            Detailed explanation:

            Each entry in `text` is either a text to be passed as is or an image that will be processed.

            An image can be either an image object (`PIL.Image`) or a url from which the image can be retrieved.

        When the processor encounters an image it'll inject `<fake_token_around_image><image><fake_token_around_image>`
        entry into the prompt.

        Example:

        ```python
        checkpoint = "HuggingFaceM4/idefics-9b"
        processor = AutoProcessor.from_pretrained(checkpoint)
        url = "https://hips.hearstapps.com/hmg-prod/images/cute-photos-of-cats-in-grass-1593184777.jpg"
        img = processor.image_processor.fetch_images([url])[0]

        prompts = [
            "User:",
            img,
            "Describe this image.\nAssistant: An image of two kittens in grass.\n",
            "User:",
            "https://hips.hearstapps.com/hmg-prod/images/dog-puns-1581708208.jpg",
            "Describe this image.\nAssistant:",
        ]

        inputs = processor(text=prompts, return_tensors="pt")
        generated_ids = model.generate(**inputs, max_length=100)
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```

        In this example the `prompts` will be converted into:

        ```
        <s>User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
        Assistant: An image of two kittens in grass.
        User:<fake_token_around_image><image><fake_token_around_image>Describe this image.
        Assistant:'
        ```

        and the two images will be massaged using [`IdeficsImageProcessor.__call__`] method and placed inside the
        `pixel_values` dict entry of the return value.

        This example also exemplifies that images can be passed as objects or as text urls. It can be seen that the
        first image is passed as object and the second one as a url.

        To do training do:

        ```python
        image_transform = transforms.Compose(
            [
                transforms.RandomResizedCrop(
                    (w, h), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC
                ),
                transforms.ToTensor(),
                transforms.Normalize(mean=self.image_mean, std=self.image_std),
            ]
        )
        inputs = processor(text=prompts, transform=image_transform, return_tensors="pt")
        ```

        In order to help debug prompt generation enable `debug=True` which will show you what's happening.

        Nz9You need to specify either `text` or `images` and `text`.a  When providing both images and text arguments, the number of text prompts should be the same as the number of images.If you want to have several images per prompt, images should be nested as such: images=[[img1, img2], [img3, img4], ...] for text=[prompt1, prompt2, ...].c              3   <   K   | ]  }t        |t                y wN)
isinstancestr.0is     r   	<genexpr>z,IdeficsProcessor.__call__.<locals>.<genexpr>  s     8az!S)8s   zQWhen using the image-text-to-text behavior, the prompts should only contain text.r   tokenizer_init_kwargsr    r   Fr   c              3   H   K   | ]  }t        |t        t        f        y wrm   )rn   listtuplerp   s     r   rs   z,IdeficsProcessor.__call__.<locals>.<genexpr>%  s     AA:a$/As    "z<fake_token_around_image>r   rY   c                 "    | rz   S z   z   S rm   r   )last_was_image
fake_tokenrX   s    r   image_tokensz/IdeficsProcessor.__call__.<locals>.image_tokens,  s!    "Z//!K/*<<r   rN   Timages_kwargsr$   r%   r6   attention_maskc              3   2   K   | ]  }t        |        y wrm   lenrq   xs     r   rs   z,IdeficsProcessor.__call__.<locals>.<genexpr>b  s     8SV8   r;   c              3   2   K   | ]  }t        |        y wrm   r   r   s     r   rs   z,IdeficsProcessor.__call__.<locals>.<genexpr>e  s      <AQ <r   r*   )dtype)r6   r}   pixel_valuesrC   )data)'
ValueErrorrn   rv   rw   ro   r   rO   zip_merge_kwargsr   r7   init_kwargspoprd   any	bos_tokenrB   r   striprT   r^   fetch_imagesappend	eos_tokenmaxsumrH   rE   minr,   zerosrA   ra   tensorstackr8   r3   shaper   )*re   rh   ri   rf   prompts
image_listsampleoutput_kwargsr   r   end_of_utterance_tokenr{   all_prompts
all_images	full_textimage_objectsry   last_was_textrr   itemimager$   text_encoding	all_textsall_attention_masksmax_num_imagesat_least_one_imageoutput_input_idsoutput_imagesoutput_attention_maskstext_singler}   extracted_imagespadded_input_idsimage_countlocal_max_num_imagescurrent_imagespadded_image_tensorrC   _rz   rX   s*                                           @@r   __call__zIdeficsProcessor.__call__   s]   b >dlXYY>G ftUm4 $$v$u.3t9F3K q 
 8488 !tuu&)dE]3KNvW[K\]5GZF0Z0]]s6401***"
"&.."<"<
 
 &m488%P%2=%A%E%EFbdh%i" &-)-)_)_&AAAiG0
!5	= 
 %	-F>>334I M"N!M$V, *4q5$(^);$<MdC(::c?Dd| $ 4 4 A A$ G!\.%AA	%,,U3)- 6-%)??I!T)	). n!==I!((.%)N+*. T^^555	0D00a-P_B`aMy)m,K%	-P '}599:JDQ&{SmM6RS!+.	+,<= 8Z88Q/  < <<q@!#=@L_ak=l 	L9K)9**001D1DEK#&{N#C -.C/CDN>"Q&!T)*/++n*a~GZGZG\]^]_G`*a'DR'(@.*=*=a*@A!T)*/++n*_tG^G^*_'  !45% ''5E(FG&--ell>.JK%	L( T!${{+;<!KK6M%*[[1G%H"&O $...'# ! $H$n.$ 
 %',{{$**1-/?/E/Ea/H!SXS]S]($ -"8 -(<	
 	
y ^s   S3c                 ~    | j                   j                  }| j                  j                  }t        ||z   dgz         S )NrC   )r7   model_input_namesr^   rv   )re   tokenizer_input_namesimage_processor_input_namess      r   r   z"IdeficsProcessor.model_input_names  s?     $ @ @&*&:&:&L&L#),GGKaJbbccr   )N   N)NN)r   r   r   r\   r   r   rv   ro   r   r   r
   r   r   r   propertyr   __classcell__)rg   s   @r   rV   rV      s    
0  UY +/k
T*--3d3i?$tCy/Qk
 

y/  
!" tI
	 
 t%&
'(k
 /0k
 
k
 k
Z d dr   rV   )r)   )r   urllib.parser   feature_extraction_utilsr   image_utilsr   processing_utilsr   r   r	   r
   tokenization_utils_baser   r   utilsr   r   r,   r>   r   r   r3   r8   r5   rT   rV   __all__r   r   r   <module>r      s    " 4 %  D 7  ,
% ,	-U 	"R
,;^/ Kd~ Kd Kd\ 
r   