
    qi;                         d Z ddlmZ ddlmZ ddlmZmZmZm	Z	 ddl
mZmZ ddlmZmZ  ej                   e      ZdZ G d d	ed
      Z G d ded
      Ze G d de             ZdgZy)z
Processor class for Janus.
   )BatchFeature)
ImageInput)ProcessingKwargsProcessorMixin
TextKwargsUnpack)PreTokenizedInput	TextInput)auto_docstringloggingzYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.

c                       e Zd ZU dZeed<   y)JanusTextKwargsas  
    generation_mode (`str`, *optional*, defaults to `"text"`):
        The generation mode indicating which modality to generate. Can be one of `"text"` or `"image"`. When set
        to `"text"`, the processor prepares inputs for text generation. When set to `"image"`, it prepares inputs
        for image generation by appending image start tokens to the prompt.
    generation_modeN)__name__
__module____qualname____doc__str__annotations__     \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/janus/processing_janus.pyr   r   "   s     r   r   F)totalc                   ,    e Zd ZU eed<   dddddidZy)	JanusProcessorKwargstext_kwargsFtext)paddingr   return_tensorspt)r   common_kwargsN)r   r   r   r   r   	_defaultsr   r   r   r   r   -   s      #(VD*D1Ir   r   c            
            e Zd Zd
 fd	Ze	 	 ddeez  ee   z  ee   z  dedz  de	e
   defd       ZdefdZ	 dd	Z xZS )JanusProcessorNc                     d| _         |j                  | _        |j                  | _        |j                  | _        || _        t        | !  |||       y)z
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Use default system prompt for Text Generation.
        i@  )chat_templateN)	num_image_tokensimage_token	boi_tokenimage_start_token	eoi_tokenimage_end_tokenuse_default_system_promptsuper__init__)selfimage_processor	tokenizerr&   r-   kwargs	__class__s         r   r/   zJanusProcessor.__init__7   sS    
 !$$00!*!4!4(22)B&)=Qr   r   imagesr3   returnc                     | j                   t        fd| j                  j                  i|}||t	        d      |Gt        |t              r|g}n3t        |t        t        f      rt        d |D              st	        d      |d   j                  d      }g }| j                  | j                  | j                  z  z   | j                  z   }|D ]]  }|j                  | j                  |      }| j                   r|dk(  r	t"        |z   }|dk(  r|| j                  z  }|j%                  |       _  | j                  |fi |d   }	|"|dk7  r | j&                  dd	|i|d
   d   |	d<   t)        |	      S )aA  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsz'You must specify either text or images.c              3   <   K   | ]  }t        |t                y w)N)
isinstancer   ).0ts     r   	<genexpr>z*JanusProcessor.__call__.<locals>.<genexpr>`   s     =_UVjC>P=_s   zAInvalid input text. Please provide a string, or a list of stringsr   r   r   imager5   images_kwargspixel_values)datar   )_merge_kwargsr   r2   init_kwargs
ValueErrorr:   r   listtupleallpopr*   r(   r'   r,   replacer-   DEFAULT_SYSTEM_PROMPTappendr1   r   )
r0   r   r5   r3   output_kwargsr   prompt_stringsone_img_tokenspromptrA   s
             r   __call__zJanusProcessor.__call__D   s   $ +** 
8<8R8R
V\
 <FNFGG$$v e}5#=_Z^=_:_ !dee'6::;LM //43C3CdF[F[3[\_c_s_ss 	*F^^D$4$4nEF--/V2K.7')$000!!&)	* t~~nMm0LM /W"<#74#7#7#hv#hWfIg#h$D  &&r   c                 <     | j                   j                  |fi |S )z
        Forwards all arguments to the image processor's `postprocess` method.
        Refer to the original method's docstring for more details.
        )r1   postprocess)r0   r5   r3   s      r   rR   zJanusProcessor.postprocessz   s"    
 0t##//A&AAr   c                     ||dk(  r | j                   |fd|i|S |dk(  r1t        |j                               }| j                  |d      }|d   S t	        | j
                  j                   d| d      )	a  
        Post-process the output of a multimodal model to return the requested modality output.
        If the model cannot generated the requested modality, an error will be raised.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            generation_mode (`str`, *optional*):
                Generation mode indicated which modality to output and can be one of `["text", "image", "audio"]`.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `list[Union[str, PIL.Image.Image]]`: The decoded text or generated image.
        r   skip_special_tokensr>   zPIL.Image.Image)r   r@   z# got an unexpected generation_mode=z.. Supported options are only `text` and `image)post_process_image_text_to_textrE   floatrR   rD   r4   r   )r0   generated_outputsrT   r   r3   r5   s         r   post_process_multimodal_outputz-JanusProcessor.post_process_multimodal_output   s    * "o&?7477!7JNT  ' $%6%<%<%> ?%%&7HY%ZF.)) >>**++NN_  `N  O r   )NF)NN)TN)r   r   r   r/   r   r
   r	   rE   r   r   r   r   rP   rR   rX   __classcell__)r4   s   @r   r$   r$   5   s    R  [_$(3'++d9o=EV@WW3' T!3' -.	3'
 
3' 3'jB* B LP"r   r$   N)r   feature_extraction_utilsr   image_utilsr   processing_utilsr   r   r   r   tokenization_utils_baser	   r
   utilsr   r   
get_loggerr   loggerrJ   r   r   r$   __all__r   r   r   <module>rb      s    5 % T T C , 
		H	%N j +5  m^ m m` 
r   