
    qi                         d Z ddlZddlmZ ddlmZmZmZ ddl	m
Z
mZmZmZ ddlmZmZ ddlmZmZ  ej(                  e      Z G d	 d
ed      Ze G d de             ZdgZy)z
Processor class for Llava.
    N   )BatchFeature)
ImageInputget_image_sizeto_numpy_array)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringloggingc                       e Zd ZddddiZy)LlavaProcessorKwargstext_kwargsF)paddingreturn_mm_token_type_idsN)__name__
__module____qualname__	_defaults     \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/llava/processing_llava.pyr   r   #   s    5eLIr   r   F)totalc            
            e Zd Z	 	 	 	 	 	 	 d	 fd	Ze	 	 d
dedz  deez  ee   z  ee   z  de	e
   defd       ZddZ xZS )LlavaProcessorNc                     || _         || _        || _        t        |d      r|j                  n|| _        |j                  | j                  d      d   | _        t        	| !  |||       y)a  
        patch_size (`int`, *optional*):
            Patch size from the vision tower.
        vision_feature_select_strategy (`str`, *optional*):
            The feature selection strategy used to select the vision feature from the vision backbone.
            Should be same as in model's config
        image_token (`str`, *optional*, defaults to `"<image>"`):
            Special token used to denote image location.
        num_additional_image_tokens (`int`, *optional*, defaults to 0):
            Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
            extra tokens appended, no need to set this arg.
        image_tokenF)add_special_tokensr   )chat_templateN)	
patch_sizenum_additional_image_tokensvision_feature_select_strategyhasattrr    encodeimage_token_idsuper__init__)
selfimage_processor	tokenizerr#   r%   r"   r    r$   kwargs	__class__s
            r   r*   zLlavaProcessor.__init__+   so    . %+F(.L+4;I}4U900[f'..t/?/?TY.Z[\])=Qr   imagestextr.   returnc                     ||t        d       | j                  t        fd| j                  j                  i|}| | j
                  |fi |d   }ni }t        |t              r|g}n.t        |t              st        |d   t              st        d      |}|j                  d      |d   }t        t        |d               \  }}	|| j                  z  |	| j                  z  z  | j                  z   }
| j                  dk(  r|
d	z  }
g }|D ]<  }|j!                  | j"                  | j"                  |
z        }|j%                  |       > |d
   j'                  dd      }|d
   j'                  dd      } | j                  |fi |d
   ddi}| j)                  ||dg       |rUt+        j,                  |d         }t+        j.                  |d         }d	||| j0                  k(  <   |j3                         |d<   t5        i |||      S )aA  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        Nz7You have to specify at least one of `images` or `text`.tokenizer_init_kwargsimages_kwargsr   zAInvalid input text. Please provide a string, or a list of stringspixel_valuesdefault   r   return_tensorsr   Fimage)
modalities	input_idsmm_token_type_ids)datatensor_type)
ValueError_merge_kwargsr   r-   init_kwargsr,   
isinstancestrlist	TypeErrorgetr   r   r#   r$   r%   replacer    appendpop_check_special_mm_tokensnparray
zeros_liker(   tolistr   )r+   r0   r1   r.   output_kwargsimage_inputsprompt_stringsr6   heightwidthnum_image_tokenssampler9   r   text_inputs	array_idsr=   s                    r   __call__zLlavaProcessor.__call__I   sD   " >dlVWW*** 
"&.."<"<
 

 /4//Y-:XYLLdC 6DD$'
47C0H_`` N+7'7L*>,q/+JKMFE &$// 9( 00 1 22i? A% N .(8(8$:J:JM]:]^%%f-. '}599:JDQ#0#?#C#CD^`e#f $dnn^i}]7Sidhi%%nkwi%X#[!9:I "k+.F GBCi4+>+>>?/@/G/G/IK+,!@K!@<!@n]]r   c                    i }|t         j                  j                  di       }|j                  |       |j                  dd      xs | j                  j
                  }|d   |d   }}|| j                  z  || j                  z  z  }|| j                  z  }| j                  dk(  r|dz  }|gt        |      z  }dgt        |      z  }	|j                  ||	d       t        d	i |S )
a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        Nr5   	crop_sizerS   rT   r7   r8   )rU   num_image_patchesr   )r   r   rG   updater,   r[   r#   r$   r%   lenr   )
r+   image_sizesr.   vision_datar5   r[   resized_heightresized_widthrU   r\   s
             r   _get_num_multimodal_tokensz)LlavaProcessor._get_num_multimodal_tokens   s     "0::>>PRSM  (%))+t<^@T@T@^@^I,5h,?7ASMN .$// AmW[WfWfFfg @ @@22i? A%  01C4DD!"c+&6 64D[lmn,,,r   )NNNNNz<image>r   )NN)N)r   r   r   r*   r   r   r   r   rE   r   r   r   rY   rc   __classcell__)r/   s   @r   r   r   )   s     '+$%R<  %)Z^>^T!>^ ++d9o=EV@WW>^ -.	>^
 
>^ >^@-r   r   )__doc__numpyrL   feature_extraction_utilsr   image_utilsr   r   r   processing_utilsr   r	   r
   r   tokenization_utils_baser   r   utilsr   r   
get_loggerr   loggerr   r   __all__r   r   r   <module>ro      sw     4 E E  D , 
		H	%+5  }-^ }- }-@ 
r   