
    qi?                        d Z ddlZddlmZ ddlmZ ddlZddlm	Z	m
Z
 ddlmZmZmZ ddlmZmZmZmZmZmZmZmZmZ dd	lmZ dd
lmZmZmZmZ  ej@                  e!      Z" e       rddl#m$Z$  G d ded      Z% ed      	 d!de&de&de&de&de'de(e&e&f   fd       Z)dejT                  de&dejT                  fdZ+d"dejT                  de&de&de(ejT                  ejT                  f   fdZ, G d d e	      Z-d gZ.y)#z"Image processor class for SigLIP2.    N)	lru_cache)Optional   )BaseImageProcessorBatchFeature)convert_to_rgbresizeto_channel_dimension_format)	ChannelDimension
ImageInputPILImageResamplinginfer_channel_dimension_formatis_scaled_imagemake_flat_list_of_imagesto_numpy_arrayvalid_imagesvalidate_preprocess_arguments)ImagesKwargs)
TensorTypefilter_out_non_signature_kwargsis_vision_availablelogging)Imagec                   &    e Zd ZU dZeed<   eed<   y)Siglip2ImageProcessorKwargsaP  
    patch_size (`int`, *optional*, defaults to 16):
        The size (resolution) of each patch the image will be split to.
    max_num_patches (`int`, *optional*, defaults to 256):
        The image will be resized to have at most this number of patches,
        and then padded in "patch" dimension to match this number exactly.
    
patch_sizemax_num_patchesN)__name__
__module____qualname____doc__int__annotations__     f/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/siglip2/image_processing_siglip2.pyr   r   2   s     Or%   r   F)total   )maxsizeimage_heightimage_widthr   r   epsreturnc                     dt         dt        dt        dt        fd}|dz  d}}||z
  |k\  r:||z   dz  } ||| |      }	 ||||      }
|	|z  |
|z  z  }||k  r|}n|}||z
  |k\  r:|} ||| |      }	 ||||      }
|	|
fS )	a"  
    Determine image size based on max number of patches, ensure dimensions are divisible by patch size and image is at least 1 patch.

    Args:
        image_height (`int`):
            Original image height.
        image_width (`int`):
            Original image width.
        patch_size (`int`):
            Patch size for processing.
        max_num_patches (`int`):
            Maximum number of patches.
        eps (`float`):
            Small threshold for binary search.

    Returns:
        Tuple: (target_height, target_width)
    scalesizer   r-   c                 p    || z  }t        j                  ||z        |z  }t        ||      }t        |      S )N)mathceilmaxr"   )r/   r0   r   scaled_sizes       r&   get_scaled_image_sizezAget_image_size_for_max_num_patches.<locals>.get_scaled_image_sizeV   s:    Uliij 89JF*k2;r%   
   g      Y@   )floatr"   )r*   r+   r   r   r,   r6   	scale_min	scale_maxr/   target_heighttarget_widthnum_patchess               r&   "get_image_size_for_max_num_patchesr?   ?   s    . U  #  3  3   8UyIy S
(Y&!+-e\:N,UKL$z1lZ6OP/)II y S
( E)%zJM(ZHL,&&r%   imagec                     | j                   \  }}}||z  }||z  }| j                  |||||      }|j                  ddddd      }|j                  ||z  d      }|S )z
    Convert 3D array image of shape (image_height, image_width, num_channels) into 2D array of patches of shape
    (num_patches_height * num_patches_width, patch_size * patch_size * num_channels).
    r   r8      r      )shapereshape	transpose)r@   r   r*   r+   num_channelsnum_patches_heightnum_patches_widthpatched_images           r&   convert_image_to_patchesrL   o   s|    
 /4kk+L+|%3#z1MM"4jBSU_amnM!++Aq!Q:M!))*<?P*PRTUMr%   arraytarget_length	pad_valuec                     | j                   d   }||z
  }t        j                  |ft        j                        }|dkD  r8d|fgdg| j                  dz
  z  z   }t        j
                  | |d|      } d|| d | |fS )z2
    Pad the array along the first dimension.
    r   )dtype)r   r   rB   constant)modeconstant_valuesN)rE   nponesint32ndimpad)rM   rN   rO   current_lengthpadding_lengthmaskpaddingss          r&   pad_along_first_dimr^   }   s     [[^N"^3N77M#2884D'(F8uzzA~+FFuhZS!"n_$;r%   c                       e Zd ZdZg dZeZdej                  ddddddddf
de	d	d
de	de
de	de
ee
   z  dz  de
ee
   z  dz  de	dz  dedef fdZ e       	 	 	 	 	 	 	 	 	 	 	 	 ddede	dz  d	ed
   de	dz  de
dz  de	dz  de
ee
   z  dz  de
ee
   z  dz  deez  dz  deez  dz  de	dz  dedz  dedz  ddfd       Z xZS )Siglip2ImageProcessora3	  
    Constructs a SigLIP2 image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's dimensions to fit `max_num_patches` according to given `patch_size`.
            Can be overridden by `do_resize` in the `preprocess` method.
        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image by the specified mean and standard deviation. Can be overridden by
            `do_normalize` in the `preprocess` method.
        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `list[float]`, *optional*, defaults to `[0.5, 0.5, 0.5]`):
            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
            Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch the image will be split to.
        max_num_patches (`int`, *optional*, defaults to 256):
            The image will be resized to have at most this number of patches,
            and then padded in "patch" dimension to match this number exactly.
    pixel_valuespixel_attention_maskspatial_shapesTgp?N   r(   	do_resizeresampler   
do_rescalerescale_factordo_normalize
image_mean	image_stddo_convert_rgbr   r   c                     t        |   di | ||ng d}||ng d}|| _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        |
| _        y )N)      ?ro   ro   r$   )super__init__rf   rg   rh   ri   rj   rk   rl   rm   r   r   )selfrf   rg   rh   ri   rj   rk   rl   rm   r   r   kwargs	__class__s               r&   rq   zSiglip2ImageProcessor.__init__   sw     	"6"#-#9Z
!*!6IO	" $,($",$.r%   imagesreturn_tensorsinput_data_formatr-   zImage.Imagec                    ||n| j                   }||n| j                  }||n| j                  }||n| j                  }||n| j                  }||n| j
                  }||n| j                  }||n| j                  }||n| j                  }||n| j                  }t        j                  }| j                  |      }t        |      }t        |      st        d      t!        |||||       |r|D cg c]  }t#        |       }}|D cg c]  }t%        |       }}|r#t'        |d         rt(        j+                  d       |
t-        |d         }
g }g }g }|D ]  }t/        |||
      }|r=t1        |j2                  d   |j2                  d   ||      \  }}t5        |||f||      }|r| j7                  |||	      }|r| j9                  ||||
      }t;        ||      }t=        ||      \  }}|j2                  d   |z  }|j2                  d   |z  }|j?                  ||f       |j?                  |       |j?                  |        tA        |||d|	      }|S c c}w c c}w )a	  
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
                `True`.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            patch_size (`int`, *optional*, defaults to `self.patch_size`):
                Patch size for processing, same as the patch size used in the model.
            max_num_patches (`int`, *optional*, defaults to `self.max_num_patches`):
                Maximum number of patches per image, the image will be resized to have at most this number of patches.
        zSInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensor)rh   ri   rj   rk   rl   r   zIt looks like you are trying to rescale already rescaled images. If the input images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again.)input_channel_dimrB   )r*   r+   r   r   )r@   r0   rg   rw   )r@   r/   rw   )r@   meanstdrw   ra   )datatensor_type)!rf   rg   rh   ri   rj   rk   rl   rm   r   r   r   LASTfetch_imagesr   r   
ValueErrorr   r   r   r   loggerwarning_oncer   r
   r?   rE   r	   rescale	normalizerL   r^   appendr   )rr   ru   rf   rg   rh   ri   rj   rk   rl   rv   rw   rm   r   r   data_formatr@   pixel_masksrb   rd   heightwidthpatchesr\   rI   rJ   batch_features                             r&   
preprocessz Siglip2ImageProcessor.preprocess   s   x "+!6IDNN	'38#-#9Zt
+9+E4K^K^'3'?|TEVEV#-#9Zt
!*!6IDNN	+9+E4K^K^#-#9Zt
-<-H/dNbNb '++""6*)&1F#rss%!)%!	
 9?@nU+@F@ 6<<E.'<</&)4s
 $ >vay I 	%E/{VghE B!&Q %A)$3	! U&%8grs5ZefUfqr.ujAG/IMGT!&Q:!= %A* <!!#57H"IJ(t$3	%6 % ,(3"0
 '
 o A =s   9I(I-)NNNNNNNNNNNN)r   r   r    r!   model_input_namesr   valid_kwargsr   BILINEARboolr9   listr"   rq   r   r   r   strr   r   r   __classcell__)rt   s   @r&   r`   r`      s    D S.L );)D)D '!1504&*"// '/ 	/
 / / DK'$./ 4;&-/ t/ / /< %& "&37"&'+$(150426;?&*!%&*NN $;N /0	N
 4KN N TkN DK'$.N 4;&-N j(4/N !11D8N tN $JN tN 
N 'Nr%   r`   )gh㈵>)r   )/r!   r2   	functoolsr   typingr   numpyrU   image_processing_utilsr   r   image_transformsr   r	   r
   image_utilsr   r   r   r   r   r   r   r   r   processing_utilsr   utilsr   r   r   r   
get_loggerr   r   PILr   r   r"   r9   tupler?   ndarrayrL   r^   r`   __all__r$   r%   r&   <module>r      s9   )     F 

 
 
 - ^ ^ 
		H	% 
,e 
 3]a,','$','58,'KN,'UZ,'
38_,' ,'^BJJ C BJJ rzz # # V[\^\f\fhjhrhr\rVs S. Sl #
#r%   