
    qi"                         d dl ZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZ ddlmZmZ ddlmZmZ dd	lmZ  G d
 ded      Z G d de	d      Ze G d de
             ZdgZy)    N   )BatchFeature)
ImageInput)ImagesKwargsMultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)
TensorTypeauto_docstring   )AutoTokenizerc                   0    e Zd ZU dZeed<   eed<   eed<   y)AriaImagesKwargsa  
    split_image (`bool`, *optional*, defaults to `False`):
        Whether to split large images into multiple crops. When enabled, images exceeding the maximum size are
        divided into overlapping crops that are processed separately and then combined. This allows processing
        of very high-resolution images that exceed the model's input size limits.
    max_image_size (`int`, *optional*, defaults to `980`):
        Maximum image size (in pixels) for a single image crop. Images larger than this will be split into
        multiple crops when `split_image=True`, or resized if splitting is disabled. This parameter controls
        the maximum resolution of individual image patches processed by the model.
    min_image_size (`int`, *optional*):
        Minimum image size (in pixels) for a single image crop. Images smaller than this will be upscaled to
        meet the minimum requirement. If not specified, images are processed at their original size (subject
        to the maximum size constraint).
    split_imagemax_image_sizemin_image_sizeN)__name__
__module____qualname____doc__bool__annotations__int     Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/aria/processing_aria.pyr   r      s     r   r   F)totalc                   D    e Zd ZU eed<   ddddddej                  dZy)AriaProcessorKwargsimages_kwargsF)paddingreturn_mm_token_type_ids  )r   r   )text_kwargsr#   return_tensorsN)r   r   r   r   r   r   PYTORCH	_defaultsr   r   r   r"   r"   4   s4    ## (-

 " 
 %,,
Ir   r"   c            
            e Zd Z	 	 	 	 ddeez  dedz  deeez  ef   dz  f fdZe		 dde
ez  ee
   z  ee   z  dedz  dee   d	efd
       ZddZed        Z xZS )AriaProcessorN	tokenizerchat_templatesize_conversionc                 *   |ddd}|j                         D ci c]  \  }}t        |      | c}}| _        |j                  | _        |j                  | _        ||j
                  |j                  |_        t        | !  |||       yc c}}w )zx
        size_conversion (`Dict`, *optional*):
            A dictionary indicating size conversions for images.
        N      )i  r&   )r.   )	itemsr   r/   image_tokenimage_token_id	pad_token	unk_tokensuper__init__)selfimage_processorr-   r.   r/   kv	__class__s          r   r9   zAriaProcessor.__init__F   s     "$'c2O6E6K6K6MNdaA	N$00'66 Y%8%8%@"+"5"5I)=Q  Os   Btextimageskwargsreturnc                     | j                   t        fd| j                  j                  i|}t	        |t
              r|g}n.t	        |t              st	        |d   t
              st        d      | | j                  |fi |d   }| j                  |j                  j                  d      }g }|j                  d      |z  }|D ]P  }	|	j                  | j                  j                  | j                  j                  |z        }	|j                  |	       R ni }|}|d   j                  d	d      }
|d   j                  d
d      } | j                  |fi |d   d	di}| j!                  ||dg       |rUt#        j$                  |d         }t#        j&                  |d         }d||| j(                  k(  <   |j+                         |d<   t-        i |||
      S )a  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
            `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
            `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsNr#   r   	num_cropsr'   r(   r%   Fimage)
modalities	input_ids   mm_token_type_ids)datatensor_type)_merge_kwargsr"   r-   init_kwargs
isinstancestrlist	TypeErrorr;   r/   pixel_valuesshapepopreplacer4   append_check_special_mm_tokensnparray
zeros_liker5   tolistr   )r:   r?   r@   rA   output_kwargsimage_inputstokens_per_imageprompt_stringsrE   sampler(   r%   text_inputs	array_idsrJ   s                  r   __call__zAriaProcessor.__call__\   s   " +**
"&.."<"<
 
 dC 6DD$'
47C0H_``/4//Y-:XYL#33L4M4M4S4STU4VWN$((58HHI .(B(BDNND^D^ajDjk%%f-.
 L!N&}599:JDQ#0#?#C#CD^`e#f $dnn^i}]7Sidhi%%nkwi%X#[!9:I "k+.F GBCi4+>+>>?/@/G/G/IK+,!@K!@<!@n]]r   c                    i }|t         j                  j                  di       }|j                  |       |j                  dd      xs | j                  j
                  }|D cg c]   } | j                  j                  g || " }}|D cg c]  }| j                  |   |z   }	}|j                  |	|d       t        di |S c c}w c c}w )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        Nr#   r   )num_image_tokensnum_image_patchesr   )	r"   r*   getupdater;   r   get_number_of_image_patchesr/   r   )
r:   image_sizesrA   vision_datar#   max_size
image_sizerg   num_patchesrf   s
             r   _get_num_multimodal_tokensz(AriaProcessor._get_num_multimodal_tokens   s     "/99==orRM  ($(()94@gDDXDXDgDgH #.! A$$@@\*\m\! ! arrQ\ 4 4X > Lrr4D[lmn,,,!  ss   $%C	Cc                     | j                   j                  }| j                  j                  }|D cg c]
  }|dk7  s	| }}t        t        j                  ||z               S c c}w )NrE   )r-   model_input_namesr;   rQ   dictfromkeys)r:   tokenizer_input_namesimage_processor_input_namesnames       r   rr   zAriaProcessor.model_input_names   se     $ @ @&*&:&:&L&L# 9T&kW[_jWjt&k#&kDMM"7:U"UVWW 'ls
   
A#A#)NNNN)N)r   r   r   r   rP   rs   floatr   r9   r   r   r   rQ   r   r
   r"   r   rd   rp   propertyrr   __classcell__)r>   s   @r   r,   r,   D   s     )-$(9=R !3&R Tz	R
 eck3./$6R,  %)4^++d9o=EV@WW4^ T!4^ ,-	4^
 
4^ 4^l-4 X Xr   r,   )numpyrY   image_processing_utilsr   image_utilsr   processing_utilsr   r   r   r	   r
   tokenization_pythonr   r   utilsr   r   autor   r   r"   r,   __all__r   r   r   <module>r      sh   *  2 % f f ? /  |5 **%   pXN pX pXf 
r   