
    qi`                        d dl mZ d dlZddlmZmZmZmZ ddl	m
Z
mZmZmZmZ ddlmZmZmZmZmZmZmZmZmZmZ ddlmZmZ  ej:                  e      Zdej@                  d	e!d
e"ej@                     fdZ# G d de      Z$dgZ%y)    )IterableN   )BaseImageProcessorBatchFeatureget_patch_output_sizeselect_best_resolution)PaddingModeconvert_to_rgbpadresizeto_channel_dimension_format)
ChannelDimension
ImageInputPILImageResamplingget_image_sizeinfer_channel_dimension_formatis_scaled_imagemake_flat_list_of_imagesto_numpy_arrayvalid_imagesvalidate_preprocess_arguments)
TensorTypeloggingimage
patch_sizereturnc                    g }t        | |      \  }}t        d||      D ]^  }t        d||      D ]L  }|t        j                  k(  r| |||z   |||z   f   }n| dd|||z   |||z   f   }|j	                  |       N ` |S )a  
    Divides an image into patches of a specified size.

    Args:
        image (`np.ndarray`):
            The input image.
        patch_size (`int`):
            The size of each patch.
        input_data_format (`ChannelDimension` or `str`):
            The channel dimension format of the input image.

    Returns:
        list: A list of np.ndarray representing the patches.
    channel_dimr   N)r   ranger   LASTappend)	r   r   input_data_formatpatchesheightwidthijpatchs	            `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/aria/image_processing_aria.pydivide_to_patchesr+   ,   s     G"56GHMFE1fj) "q%, 	"A $4$9$99a!j.0!a*n2DDEaQ^!3QZ5GGHNN5!	"" N    c                   p    e Zd ZdZg dZddddddddddej                  fd	ee   dz  d
ee   dz  de	de	dee
e	e	f      dz  dedz  dedz  dede	ez  dedz  def fdZdddddddddddej                  dfdeee   z  d	eee   z  dz  d
eee   z  dz  de	dz  de	dz  dedz  dedz  dedz  dedz  dedz  dedz  deez  dz  dedz  deez  dz  fdZdej(                  de
dedej(                  fdZde
de
fd Zdej(                  de
dedej(                  fd!Zej2                  d"ddfdej(                  d#e	e
e	e	f   z  ee
e	e	f      z  d$ed%eee   z  deez  dz  deez  dz  dej(                  fd&Zdej(                  d'ee
e	e	f      d(e	dedededeej(                     fd)Zd-d*e	d+e	fd,Z xZS ).AriaImageProcessoraG  
    A vision processor for the Aria model that handles image preprocessing.
    Initialize the AriaImageProcessor.

    Args:
        image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]):
            Mean values for normalization.
        image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]):
            Standard deviation values for normalization.
        max_image_size (`int`, *optional*, defaults to 980):
            Maximum image size.
        min_image_size (`int`, *optional*, defaults to 336):
            Minimum image size.
        split_resolutions (`list`, *optional*, defaults to a list of optimal,resolutions as tuples):
            The optimal resolutions for splitting the image.
        split_image (`bool`, *optional*, defaults to `False`):
            Whether to split the image.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image.
        resample (PILImageResampling, *optional*, defaults to `BICUBIC`):
            The resampling filter to use if resizing the image.
    pixel_values
pixel_mask	num_cropsN  iP  FTgp?
image_mean	image_stdmax_image_sizemin_image_sizesplit_resolutionssplit_imagedo_convert_rgb
do_rescalerescale_factordo_normalizeresamplec                 &   t        |   di | |g d}|g d}|| _        || _        || _        || _        || _        |!g d}|D cg c]  }|d   dz  |d   dz  f }}|| _        || _        || _	        |	| _
        |
| _        || _        y c c}w )N)      ?r@   r@   ))      )rA   r   )rA      )rA      )rA      )rA      )rA      )rB   rC   )rB   r   )rB   rB   )rB   rA   )r   rA   )r   rB   )rC   rA   )rC   rB   )rD   rA   )rE   rA   )rF   rA   )rG   rA   r     rA    )super__init__r6   r7   r4   r5   r9   r8   r:   r;   r<   r=   r>   )selfr4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   kwargsel	__class__s                 r*   rK   zAriaImageProcessor.__init__j   s     	"6"(J'I,,$"&$ !yFW X"Q%#+r!us{!; X X!2,$,(  !Ys   
Bptimagesreturn_tensorsdata_formatr#   c           	      P   ||n| j                   }||n| j                  }||n| j                  }||n| j                  }||n| j                  }||n| j
                  }||n| j                  }|	|	n| j                  }	|
|
n| j                  }
||n| j                  }|dvrt        d      | j                  |      }t        |      }t        |      st        d      t        |
|||||	       |r|D cg c]  }t        |       }}|D cg c]  }t!        |       }}|r#t#        |d         rt$        j'                  d       |t)        |d         }g }g }d}|D ]}  }|r"| j+                  || j,                  ||||      }n|g}|t/        |      |kD  rt/        |      }|D ]2  }t1        |      \  }}|t3        ||      z  }||k\  rt3        t5        ||z        |      |f}n|t3        t5        ||z        |      f}t7        |||||	      }||d   z
  ||d
   z
  }}t9        |d|fd|ff||      }t;        j<                  ||ft>              }d
|d|d   d|d
   f<   |jA                  |       |r| jC                  ||	|      }|
r;| jE                  || j                   | j                  ||      }|tG        |||      n|}|jA                  |       5  tI        t;        jJ                  |d      t;        jJ                  |d      |d|      S c c}w c c}w )aI  
        Process a list of images.

        Args:
            images (ImageInput or list of ImageInput):
                The input image or a list of images.
            image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]):
                Mean values for normalization.
            image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]):
                Standard deviation values for normalization.
            max_image_size (`int`, *optional*, defaults to `self.max_image_size` (980)):
                Maximum image size.
            min_image_size (`int`, *optional*, defaults to `self.min_image_size` (336)):
                Minimum image size.
            split_image (`bool`, *optional*, defaults to `self.split_image` (False)):
                Whether to split the image.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)):
                Whether to convert the image to RGB.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)):
                Whether to normalize the image.
            resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)):
                The resampling filter to use if resizing the image.
            return_tensors (`str` or `TensorType`, *optional*, defaults to "pt"):
                The type of tensor to return.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`:
                        image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`:
                        image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`:
                        image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`:
                        image in (height, width, num_channels) format.
                If unset, will use the inferred format of the input image.

        Returns:
            BatchFeature:
                A BatchFeature object containing:
                - 'pixel_values':
                    Tensor of processed image pixel values.
                - 'pixel_mask':
                    Boolean pixel mask. This mask is a 2D tensor of shape (max_image_size, max_image_size) where:
                    - True (1) values indicate pixels that belong to the original resized image.
                    - False (0) values indicate pixels that are part of the padding.
                  The mask helps distinguish between actual image content and padded areas in subsequent processing steps.
                - 'num_crops':
                    The maximum number of crops across all images.
        N)rH   r3   z(max_image_size must be either 490 or 980zSInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor)r=   r4   r5   r>   r;   r<   r   zIt looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.)rS   r#   )r>   rS   r#   rA   )dtype)r   scaler#   )axisr/   )datatensor_type)&r4   r5   r6   r7   r9   r:   r;   r<   r=   r>   
ValueErrorfetch_imagesr   r   r   r
   r   r   loggerwarning_oncer   get_image_patchesr8   lenr   maxintr   r   npzerosboolr"   rescale	normalizer   r   stack)rL   rQ   r4   r5   r6   r7   r9   r:   r;   r<   r=   r>   rR   rS   r#   r   r0   pixel_masksr2   crop_images
crop_imagehwrV   new_sizecrop_image_resizedpadding_bottompadding_rightcrop_image_paddedr1   s                                 r*   
preprocesszAriaImageProcessor.preprocess   s   R $.#9Zt
!*!6IDNN	+9+E4K^K^+9+E4K^K^%0%<k$BRBR+9+E4K^K^#-#9Zt
+9+E4K^K^'3'?|TEVEV'38+GHH""6*)&1F#rss%%!!)	
 9?@nU+@F@ 6<<E.'<</&)4s
 $ >vay I	 @	7E"44**" 1&7 5   %g C$4y$@,	) 17
%j11&Q26 #CE	NN C^TH .CE	NN0STH%+% 1&7&" 1?!0Ln_ghi_jNj$'&(1m*<= 1&7	%!  XX~~&FdS
;<
=Xa[=-HQK-78"":.(,/~Yj )5 )%  (,)$5*; )7 )% '2 44E{Tef. & ##$56c17@	7B  "A > hh{;&
 '
 	
i A =s   9LL#r   target_resolutionr   c                 H    t        |||      \  }}t        |||f||      }|S )aG  
        Resizes an image to a target resolution while maintaining aspect ratio.

        Args:
            image (np.ndarray):
                The input image.
            target_resolution (tuple):
                The target resolution (height, width) of the image.
            resample (`PILImageResampling`):
                Resampling filter to use if resizing the image.
            input_data_format (`ChannelDimension` or `str`):
                The channel dimension format of the input image.

        Returns:
            np.ndarray: The resized and padded image.
        r>   r#   )r   r   )rL   r   rs   r>   r#   
new_height	new_widthresized_images           r*   _resize_for_patchingz'AriaImageProcessor._resize_for_patchingR  s7    & !6e=NPa b
I uz9&=duvr,   original_resolutionc                 z    |\  }}|\  }}t        ||z
  d      \  }}t        ||z
  d      \  }	}
|	|	|
z   f|||z   ffS )NrB   )divmod)rL   rz   rs   original_heightoriginal_widthtarget_heighttarget_widthpaste_xr_xpaste_yr_ys              r*   _get_padding_sizez$AriaImageProcessor._get_padding_sizel  s]    *='&7#|l^;Q?mo=qA3''7S=)AAAr,   c                 j    t        |||      }| j                  ||      }| j                  ||      }|S )zU
        Pad an image to a target resolution while maintaining aspect ratio.
        )padding)r   r   r   )rL   r   rs   r#   new_resolutionr   padded_images          r*   _pad_for_patchingz$AriaImageProcessor._pad_for_patchings  s?     /u6GIZ[((9JKxxwx7r,   g        r   modeconstant_valuesc                 ^   t        |t              st        |      dk7  rt        ||||||      S |t	        |      }t
        j                  dt
        j                  dt
        j                  dt
        j                  di}t        j                  ||||   |      }|t        |||      }|S |}|S )a	  
        Pads the `image` with the specified `padding` and `mode`. Padding can be in the (`height`, `width`)
        dimension of in the (`num_patches`) dimension. In the second case an iterable if tuples is expected
        as input.

        Args:
            image (`np.ndarray`):
                The image to pad.
            padding (`int` or `tuple[int, int]` or `Iterable[tuple[int, int]]`):
                Padding to apply to the edges of the height, width axes. Can be one of three formats:
                - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis.
                - `((before, after),)` yields same before and after pad for height and width.
                - `(pad,)` or int is a shortcut for before = after = pad width for all axes.
            mode (`PaddingMode`):
                The padding mode to use. Can be one of:
                    - `"constant"`: pads with a constant value.
                    - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the
                    vector along each axis.
                    - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis.
                    - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array.
            constant_values (`float` or `Iterable[float]`, *optional*):
                The value to use for the padding if `mode` is `"constant"`.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use same as the input image.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the input image. Can be one of:
                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                If unset, will use the inferred format of the input image.

        Returns:
            `np.ndarray`: The padded image.

        rC   constantreflectedge	symmetric)r   r   )
isinstancera   r_   r   r   r	   CONSTANTREFLECT	REPLICATE	SYMMETRICrb   r   )rL   r   r   r   r   rS   r#   padding_mode_mappings           r*   r   zAriaImageProcessor.pad  s    ` gs#s7|q'8ugt_kK\]]$ >u E   *!!6!!;	 
 ug,@,FXghR]Ri'{<MN 	  pu 	 r,   grid_pinpointsr   c                     t        |t              st        d      |}t        ||      }t	        ||      }	| j                  ||	||      }
| j                  |
|	|      }t        |||      }|D cg c]  }t        |||       }}|S c c}w )a]  
        Process an image with variable resolutions by dividing it into patches.

        Args:
            image (`np.ndarray`):
                The input image to be processed.
            grid_pinpoints (list[tuple[int, int]]):
                A list of possible resolutions as tuples.
            patch_size (`int`):
                Size of the patches to divide the image into.
            resample (`PILImageResampling`):
                Resampling filter to use if resizing the image.
            data_format (`ChannelDimension` or `str`):
                The channel dimension format for the output image.
            input_data_format (`ChannelDimension` or `str`):
                The channel dimension format of the input image.

        Returns:
            `list[np.ndarray]`: A list of NumPy arrays containing the processed image patches.
        z6grid_pinpoints must be a list of possible resolutions.r   ru   )r#   )r   r#   )r   input_channel_dim)	r   list	TypeErrorr   r   ry   r   r+   r   )rL   r   r   r   r>   rS   r#   possible_resolutions
image_sizebest_resolutionrx   r   r$   r)   s                 r*   r^   z$AriaImageProcessor.get_image_patches  s    : .$/TUU-#E7HI
0=QR11?XIZ 2 
 --m_`q-r#LZ[lm
 !
 (;Zkl
 
 	
s   2Br%   r&   c                     |j                  d| j                        }|j                  d| j                        }t        ||f| j                        \  }}|sd}|S ||z  |z  |z  }|S )a  
        A utility that returns number of image patches for a given image size.

        Args:
            height (`int`):
                Height of the input image.
            width (`int`):
                Width of the input image.
            images_kwargs (`dict`, *optional*)
                Any kwargs to override defaults of the image processor.
        Returns:
            `int`: Number of patches per image.
        r9   r6   rA   )getr9   r6   r   r8   )	rL   r%   r&   images_kwargsr9   r6   resized_heightresized_widthnum_patchess	            r*   get_number_of_image_patchesz.AriaImageProcessor.get_number_of_image_patches  s|     $''t7G7GH&**+;T=P=PQ(>PTPfPf(g%*a 1?.0PS`0`dr0rr,   )N)__name__
__module____qualname____doc__model_input_namesr   BICUBICr   floatra   tuplerd   rK   r   FIRSTr   strr   rr   rb   ndarrayry   r   r   r	   r   r   r   r^   r   __classcell__)rO   s   @r*   r.   r.   H   s   > D *.(,!!:>#(&*&-$('9'A'A"!K$&"! ;%"! 	"!
 "!  c3h047"! D["! t"! "! e"! Tk"! %"!N 2604%)%)#'&*"&'+$(.226/?/E/E;?B
T*--B
 DK'$.B
 4;&-	B

 d
B
 d
B
 D[B
 tB
 4KB
 B
 TkB
 %t+B
 j(4/B
 &,B
 !11D8B
HZZ49Xh	4BU Bu BZZ49N^	" (003659;?@zz@ uS#X&%S/)BB@ 	@
 %0@ ++d2@ !11D8@ 
@D0zz0 U38_-0 	0
 %0 &0 ,0 
bjj	0d# c r,   r.   )&collections.abcr   numpyrb   image_processing_utilsr   r   r   r   image_transformsr	   r
   r   r   r   image_utilsr   r   r   r   r   r   r   r   r   r   utilsr   r   
get_loggerr   r\   r   ra   r   r+   r.   __all__rI   r,   r*   <module>r      s   ( %  u u e e   ) 
		H	%RZZ S PTUWU_U_P` 8+ D  
 r,   