
    qi,                         d dl ZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZ ddlmZmZ ddlmZmZ ddlmZ  e       rd	d
lmZmZ  G d de
d      Z G d ded      Ze ed       G d de	                    ZdgZy)    N   )BatchFeature)
ImageInput)MultiModalDataProcessingKwargsProcessorMixin
TextKwargsUnpack)PreTokenizedInput	TextInput)auto_docstringis_vision_available)requires   )Emu3ImageProcessorKwargssmart_resizec                       e Zd ZU dZeed<   y)Emu3TextKwargsa  
    return_for_image_generation (`bool`, *optional*, defaults to `False`):
        Whether the processed text is intended for image generation tasks. When `True`, the processor prepares
        inputs for image generation by appending image start tokens and size information to the prompt, and
        images should not be provided. When `False`, the processor prepares inputs for text generation from
        images and text, requiring both inputs to be provided.
    return_for_image_generationN)__name__
__module____qualname____doc__bool__annotations__     Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/emu3/processing_emu3.pyr   r      s     "&%r   r   F)totalc                   8    e Zd ZU eed<   eed<   dddddddZy	)
Emu3ProcessorKwargstext_kwargsimages_kwargsF)r   return_mm_token_type_idsz1:1i  )ratio
image_area)r"   r#   N)r   r   r   r   r   r   	_defaultsr   r   r   r!   r!   +   s/    ++ ,1(-

  
	Ir   r!   )vision)backendsc            
            e Zd Z	 d fd	Ze	 	 ddedz  deez  ee   z  ee   z  dz  de	e
   defd       ZddZd	 Zdefd
Z	 ddZ xZS )Emu3ProcessorNc                 &   |j                   | _         |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        |j                  | _	        d| _
        t        | 1  |||       y )N   )chat_template)image_tokenimage_token_id	boi_tokenimage_start_token	eoi_tokenimage_end_tokenimage_wrapper_tokenfake_token_around_image	eof_token	bos_tokendownsample_ratiosuper__init__)selfimage_processor	tokenizerr.   kwargs	__class__s        r   r;   zEmu3Processor.__init__=   s     %00'66!*!4!4(22'0'D'D$",,",, !)=Qr   imagestextr?   returnc                 |   t        |t              r|g}n.t        |t              st        |d   t              st        d       | j                  t
        fd| j                  j                  i|}|d   j                  dd      }|d   j                  dd	      }|d   j                  d
d	      }|r|t        d      |s||t        d      i }| j                   }	| j                   | j                   }
|s| | j                  |fi |d   }t        |j                        }g }|D ]  }| j                   |v rt#        |      }|\  }}|| j$                  z  }|| j$                  z  }||dz   z  }|	 | d| | j&                   d|z   |
 }|j)                  | j                   |d      }| j*                   | }| j                   |v r|j-                  |        |D cg c]  }|j)                  d| j                           }}nj|rh| j/                  ||| j$                        \  }}|	 | d| | j&                   }|D cg c]  }| j*                   | |  }}||ggt1        |      z  |d<   |d   j                  dd	      }|d   j                  dd      } | j                  |fi |d   dd	i}| j3                  ||dg       |rUt5        j6                  |d         }t5        j8                  |d         }d||| j:                  k(  <   |j=                         |d<   t?        i |||      S c c}w c c}w )aA  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        r   zAInvalid input text. Please provide a string, or a list of stringstokenizer_init_kwargsr"   r   Fr#   r%   Nr&   zGYou should not provide `images` when `return_for_image_generation=True`zOYou must provide either text or images when `return_for_image_generation=False`r   *z<placeholder>image_sizesreturn_tensorsr$   image)
modalities	input_idsmm_token_type_ids)datatensor_type) 
isinstancestrlist	TypeError_merge_kwargsr!   r>   init_kwargspop
ValueErrorr2   r7   r4   r=   iterrG   r/   nextr9   r6   replacer8   appendcalculate_generate_sizelen_check_special_mm_tokensnparray
zeros_liker0   tolistr   )r<   rA   rB   r?   output_kwargsr   r%   r&   image_featuresimage_start_tokensimage_end_tokensrG   prompt_stringssample
image_sizeheightwidthimage_seq_lengthimage_placeholderimage_promptrH   r$   text_inputs	array_idsrL   s                            r   __call__zEmu3Processor.__call__N   s   & dC 6DD$'
47C0H_``***
"&.."<"<
 

 '4M&B&F&FGdfk&l#o.227DA"?377dK
&6+=fgg*t|noo $ 6 67"nn-d.B.B-CD +v/A1T11&[M/<Z[N~99:KN .&&&0!%k!2J$.MFE#t'<'<<F!T%:%::E'-';$+=*>vhawtOkOkNlm|  @P  nP  mQ  Rb  Qc  )d%#^^D,<,<>OQRSF $/x8F &&&0 %%f-. Ucc&FNN?D4D4DEcDc ) 88
DLaLabMFE01&5'$B^B^A_`LLPQ&t~~&vh|n=QDQ.4e_,=D	,IN=) '}599:JDQ#0#?#C#CD^`e#f $dnnT_]=-I_Z^_%%dKWI%N#[!9:I "k+.F GBCi4+>+>>?/@/G/G/IK+,!BK!B>!BP^__+ d Rs   ##L4L9c                    i }|g }|D ]  \  }}t        ||| j                  j                  | j                  j                  | j                  j                        \  }}|| j
                  z  }|| j
                  z  }||dz   z  }|j                  |        dgt        |      z  }|j                  ||d       t        di |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        r   )num_image_tokensnum_image_patchesr   )
r   r=   spatial_factor
min_pixels
max_pixelsr9   rZ   r\   updater   )	r<   rG   r?   vision_datarr   ri   rj   rk   rs   s	            r   _get_num_multimodal_tokensz(Emu3Processor._get_num_multimodal_tokens   s     "!!, : ,((77((33((33!  4#8#88!6!66#)UQY#7  ''(89: "#c+&6 64D[lmn,,,r   c                     t        t        |j                  d            \  }}||z  }||z  dz  }t        t        ||z  |z              }t        t        ||z  |z              }	||	fS )N:g      ?)mapintsplitround)
r<   r%   r&   rt   rj   ri   current_areatarget_ratiotoken_heighttoken_widths
             r   r[   z%Emu3Processor.calculate_generate_size   sp    CS!12vv~"\1c95,!6!GHI% 4~ EFG[((r   c                 <     | j                   j                  |fi |S N)r=   postprocess)r<   rA   r?   s      r   r   zEmu3Processor.postprocess   s     /t##//A&AAr   c                     ||dk(  r | j                   |fd|i|S |dk(  r| j                  |d      }|d   S t        | j                  j                   d| d      )	a  
        Post-process the output of a multimodal model to return the requested modality output.
        If the model cannot generated the requested modality, an error will be raised.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            generation_mode (`str`, *optional*):
                Generation mode indicated which modality to output and can be one of `["text", "image", "audio"]`.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `list[Union[str, PIL.Image.Image]]`: The decoded text or generated image.
        rB   skip_special_tokensrI   zPIL.Image.Image)rH   pixel_valuesz# got an unexpected generation_mode=z.. Supported options are only `text` and `image)post_process_image_text_to_textr   rV   r@   r   )r<   generated_outputsr   generation_moder?   rA   s         r   post_process_multimodal_outputz,Emu3Processor.post_process_multimodal_output   s    * "o&?7477!7JNT  '%%&7HY%ZF.)) >>**++NN_  `N  O r   r   )NN)TN)r   r   r   r;   r   r   r   r   rQ   r
   r!   r   rp   ry   r[   r   r   __classcell__)r@   s   @r   r+   r+   :   s     	R"  %)aeQ`T!Q` ++d9o=EV@WWZ^^Q` ,-	Q`
 
Q` Q`f -D)B* B LP!r   r+   )numpyr^   image_processing_utilsr   image_utilsr   processing_utilsr   r   r   r	   r
   tokenization_utils_baser   r   utilsr   r   utils.import_utilsr   image_processing_emu3r   r   r   r!   r+   __all__r   r   r   <module>r      s}   "  2 % d d C 8 * M	&Zu 	&*%  	;uN u   up 
r   