
    qiM                        d Z ddlZddlZddlZddlmZ ddlmZm	Z	 ddl
mZmZmZmZ ddlmZmZmZmZmZmZmZ ddlmZ dd	lmZmZmZmZ dd
lmZ  e       rddl Z ddl!m"Z"m#Z#m$Z$  e       rddl%Z% ejL                  e'      Z(dZ) G d ded      Z*d Z+	 	 	 	 	 	 	 	 	 d$de,de-de,de,de-de-de-de-de.dz  de,dz  de"jD                  fdZ/d%dej`                  de,d e,e1z  dz  fd!Z2 G d" d#e      Z3d#gZ4y)&z%Image processor class for Pix2Struct.    N)hf_hub_download   )BaseImageProcessorBatchFeature)convert_to_rgb	normalizeto_channel_dimension_formatto_pil_image)ChannelDimension
ImageInputget_image_sizeinfer_channel_dimension_formatmake_flat_list_of_imagesto_numpy_arrayvalid_images)ImagesKwargs)
TensorTypeis_torch_availableis_vision_availablelogging)requires_backends)Image	ImageDraw	ImageFontzybelkada/fontsc                   V    e Zd ZU dZeed<   eeef   ed<   eed<   e	e   ez  dz  ed<   y)Pix2StructImageProcessorKwargsa  
    max_patches (`int`, *optional*):
        Maximum number of patches to extract.
    patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
        The patch size to use for the image. According to Pix2Struct paper and code, the patch size is 16x16.
    is_vqa (`bool`, *optional*, defaults to `False`):
        Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
        rendered onto the input images.
    header_text (`Union[list[str], str]`, *optional*):
        Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
    max_patches
patch_sizeis_vqaNheader_text)
__name__
__module____qualname____doc__int__annotations__dictstrboollist     l/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/pix2struct/image_processing_pix2struct.pyr   r   2   s5    
 S#XLcS4''r,   r   F)totalc                    t        t        dg       | j                  d      } t        j                  j
                  j                  | ||f||f      }|j                  | j                  d      | j                  d      ||d      }|j                  ddddd      j                  | j                  d      |z  | j                  d      |z  | j                  d      |z  |z        }|j                  d      S )	a  
    Utility function to extract patches from a given image tensor. Returns a tensor of shape
    (1, `rows`, `columns`, `num_channels`x `patch_height` x `patch_width`).

    Args:
        image_tensor (torch.Tensor):
            The image tensor to extract patches from.
        patch_height (int):
            The height of the patches to extract.
        patch_width (int):
            The width of the patches to extract.
    torchr   )stride         r   )
r   torch_extract_patches	unsqueezer0   nn
functionalunfoldreshapesizepermute)image_tensorpatch_heightpatch_widthpatchess       r-   r6   r6   F   s     +gY7))!,Lhh!!((k7R\hju[v(wGool//2L4E4Ea4H,XceghGooaAq!,44!,!+!|+k9G
 Qr,   text	text_size
text_colorbackground_colorleft_paddingright_paddingtop_paddingbottom_padding
font_bytes	font_pathreturnc
                 T   t        t        d       t        j                  d      }
|
j	                  |       }dj                  |      }||	t        j                  |      }n|	|	}nt        t        d      }t        j                  |d|      }t        j                  t        j                  d	d
|            }|j!                  d||      \  }}}}||z   |z   }||z   |z   }t        j                  d	||f|      }t        j                  |      }|j#                  ||f|||       |S )a  
    Render text. This script is entirely adapted from the original script that can be found here:
    https://github.com/google-research/pix2struct/blob/main/pix2struct/preprocessing/preprocessing_utils.py

    Args:
        text (`str`, *optional*, defaults to ):
            Text to render.
        text_size (`int`, *optional*, defaults to 36):
            Size of the text.
        text_color (`str`, *optional*, defaults to `"black"`):
            Color of the text.
        background_color (`str`, *optional*, defaults to `"white"`):
            Color of the background.
        left_padding (`int`, *optional*, defaults to 5):
            Padding on the left.
        right_padding (`int`, *optional*, defaults to 5):
            Padding on the right.
        top_padding (`int`, *optional*, defaults to 5):
            Padding on the top.
        bottom_padding (`int`, *optional*, defaults to 5):
            Padding on the bottom.
        font_bytes (`bytes`, *optional*):
            Bytes of the font to use. If `None`, the default font will be used.
        font_path (`str`, *optional*):
            Path to the font to use. If `None`, the default font will be used.
    visionP   )width)rB   
z	Arial.TTFzUTF-8)encodingr<   RGB)r2   r2   r   r   )xyrB   fillfont)r   render_texttextwrapTextWrapperwrapjoinioBytesIOr   DEFAULT_FONT_PATHr   truetyper   Drawr   newtextbboxrB   )rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   wrapperlineswrapped_textrW   	temp_draw_
text_widthtext_heightimage_widthimage_heightimagedraws                         r-   rX   rX   a   s$   L k8, "",GLLdL#E99U#L)"3zz*%		0+>dW9ED uyy8HIJI$-$6$6v|T$R!Aq*k |+m;K,~=LIIek<8:JKE>>% DII,,<jW[I\Lr,   rm   headerinput_data_formatc                 v   t        t        d       t        | |      } t        |fi |}t	        |j
                  | j
                        }t        | j                  || j
                  z  z        }t        |j                  ||j
                  z  z        }t        j                  d|||z   fd      }|j                  |j                  ||f      d       |j                  | j                  ||f      d|f       t        |      }t        |      t        j                  k(  rt!        |t        j                        }|S )a  
    Renders the input text as a header on the input image.

    Args:
        image (`np.ndarray`):
            The image to render the header on.
        header (`str`):
            The header text.
        data_format (`Union[ChannelDimension, str]`, *optional*):
            The data format of the image. Can be either "ChannelDimension.channels_first" or
            "ChannelDimension.channels_last".

    Returns:
        `np.ndarray`: The image with the header rendered.
    rN   )rp   rS   whiterT   r   )r   render_headerr
   rX   maxrP   r%   heightr   rb   pasteresizer   r   r   LASTr	   )	rm   ro   rp   kwargsheader_image	new_width
new_heightnew_header_height	new_images	            r-   rs   rs      s     mX. 2CDEv00LL&&4IU\\Y%<=>JL//9|?Q?Q3QRS		%)Z:K-K!LgVIOOL''4E(FGPOOELL)Z!89A?P;QR y)I%i04D4I4II/	;K;P;PQ	r,   c                       e Zd ZdZddgZeZ	 	 	 	 	 ddededee	e
f   dz  de
d	ed
df fdZ	 ddej                  de
dede	ez  dz  d
ej                  f
dZ	 	 ddej                  de	ez  dz  de	ez  dz  d
ej                  fdZddddddej"                  dfdede	dz  dedz  dedz  de
dz  dee	e
f   dz  de	ez  dz  dede	ez  dz  d
efdZ xZS )Pix2StructImageProcessoraj  
    Constructs a Pix2Struct image processor.

    Args:
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
            method. According to Pix2Struct paper and code, the image is normalized with its own mean and standard
            deviation.
        patch_size (`dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
            The patch size to use for the image. According to Pix2Struct paper and code, the patch size is 16x16.
        max_patches (`int`, *optional*, defaults to 2048):
            The maximum number of patches to extract from the image as per the [Pix2Struct
            paper](https://huggingface.co/papers/2210.03347).
        is_vqa (`bool`, *optional*, defaults to `False`):
            Whether or not the image processor is for the VQA task. If `True` and `header_text` is passed in, text is
            rendered onto the input images.
    flattened_patchesattention_maskNdo_convert_rgbdo_normalizer   r   r   rL   c                 x    t        |   di | ||nddd| _        || _        || _        || _        || _        y )N   )ru   rP   r+   )super__init__r   r   r   r   r   )selfr   r   r   r   r   ry   	__class__s          r-   r   z!Pix2StructImageProcessor.__init__   sH     	"6"(2(>*r\^D_(,&r,   rm   rp   c           	         t        | j                  d       t        |t        j                  |      }t        j                  |      }|d   |d   }}t        |t        j                        \  }}	t        j                  |||z  z  ||	z  z        }
t        t        t        j                  |
|z  |z        |      d      }t        t        t        j                  |
|	z  |z        |      d      }t        ||z  d      }t        ||z  d      }t
        j                  j                  j                  |j!                  d      ||fddd	      j#                  d      }t%        |||      }|j&                  }|d   }|d
   }|d   }|j)                  ||z  |g      }t        j*                  |      j)                  |dg      j-                  d|      j)                  ||z  dg      }t        j*                  |      j)                  d|g      j-                  |d      j)                  ||z  dg      }|dz  }|dz  }|j/                  t
        j0                        }|j/                  t
        j0                        }t        j2                  |||gd      }t
        j                  j                  j5                  |ddd|||z  z
  g      j7                         }t9        |      }|S )a  
        Extract flattened patches from an image.

        Args:
            image (`np.ndarray`):
                Image to extract flattened patches from.
            max_patches (`int`):
                Maximum number of patches to extract.
            patch_size (`dict`):
                Dictionary containing the patch height and width.

        Returns:
            result (`np.ndarray`):
                A sequence of `max_patches` flattened patches.
        r0   ru   rP   r2   r   bilinearFT)r<   modealign_corners	antialiasr5   r   r3   )r   extract_flattened_patchesr	   r   FIRSTr0   
from_numpyr   mathsqrtrt   minfloorr8   r9   interpolater7   squeezer6   shaper;   arangerepeattofloat32catpadfloatr   )r   rm   r   r   rp   ry   r?   r@   rl   rk   scalenum_feasible_rowsnum_feasible_colsresized_heightresized_widthrA   patches_shaperowscolumnsdepthrow_idscol_idsresults                          r-   r   z2Pix2StructImageProcessor.extract_flattened_patches   s   . 	$88'B ,E3C3I3IK\]  '$.x$8*W:Mk$25:J:P:P$Q!k 		+)DEWbIbcdDJJu|/Cl/R$SU` acdeDJJu{/B[/P$QS^ _abc.=qA-;Q?##//OOA -0 0 
 '!* 	 (|[IQ"a  //4'>5"9: ,,t$,,dAY7>>q'JRRTX[bTbdeSfg,,w'//G=DDT1MUUW[^eWeghVij 	11 **U]]+**U]]+ GWg6; $$((!Q;$QX.;Y1Z[aac'r,   data_formatc           	      n   |j                   t        j                  k(  r|j                  t        j                        }t        j
                  |      }t        j                  |      }t        |dt        j                  t        j                  |j                              z        }t        |f||||d|S )a-  
        Normalize an image. image = (image - image_mean) / image_std.

        Args:
            image (`np.ndarray`):
                Image to normalize.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used.
            input_data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format of the input image. If not provided, it will be inferred.
        g      ?)meanstdr   rp   )dtypenpuint8astyper   r   r   rt   r   r   prodr   r   )r   rm   r   rp   ry   r   r   adjusted_stddevs           r-   r   z"Pix2StructImageProcessor.normalizeF  s    & ;;"(("LL,E wwu~ffUmc32775;;3G)H#HI
#/
 
 	
r,   imagesr    return_tensorsc
           
         ||n| j                   }||n| j                  }||n| j                  }||n| j                  }| j                  }|
j                  d      t        d      t        |      }t        |      st        d      |r|D cg c]  }t        |       }}|D cg c]  }t        |       }}|	t        |d         }	|r}|t        d      |
j                  dd      }|
j                  dd      }t        |t              r|gt        |      z  }t!        |      D cg c]  \  }}t#        |||   ||	       }}}|r |D cg c]  }| j%                  ||	
       }}|D cg c]  }| j'                  ||||	       }}|D cg c]4  }|j)                  d      dk7  j+                  t,        j.                        6 }}t1        ||d|      }|S c c}w c c}w c c}}w c c}w c c}w c c}w )a	  
        Preprocess an image or batch of images. The processor first computes the maximum possible number of
        aspect-ratio preserving patches of size `patch_size` that can be extracted from the image. It then pads the
        image with zeros to make the image respect the constraint of `max_patches`.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images.
            header_text (`Union[list[str], str]`, *optional*):
                Text to render as a header. Only has an effect if `image_processor.is_vqa` is `True`.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB.
            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
                Whether to normalize the image.
            max_patches (`int`, *optional*, defaults to `self.max_patches`):
                Maximum number of patches to extract.
            patch_size (`dict`, *optional*, defaults to `self.patch_size`):
                Dictionary containing the patch height and width.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                    - Unset: Return a list of `np.ndarray`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        Nr   z8data_format is not an accepted input as the outputs are zSInvalid image type. Must be of type PIL.Image.Image, numpy.ndarray, or torch.Tensorr   z.A header text must be provided for VQA models.rJ   rK   )rJ   rK   )rm   rp   )rm   r   r   rp   r3   )axis)r   r   )datatensor_type)r   r   r   r   r   get
ValueErrorr   r   r   r   r   pop
isinstancer(   len	enumeraters   r   r   sumr   r   r   r   )r   r   r    r   r   r   r   r   r   rp   ry   r   rm   rJ   rK   iattention_masksencoded_outputss                     r-   
preprocessz#Pix2StructImageProcessor.preprocessj  s7   ` (4'?|TEVEV+9+E4K^K^#-#9Zt
%0%<k$BRBR::m$0WXX)&1F#rss 9?@nU+@F@ 6<<E.'<<$ >vay I" !QRRL$7J

;5I+s+*mc&k9 !*& 1Au e[^
V_`F 
 djk[`dnn5DUnVkFk  	
  **_p + 
 
 V\\EEII2I.!3;;BJJG\\&'-Q_m
 S A = l
 ]s$   G (G%,G*G01G59G:)TTNi   FN)NN)r!   r"   r#   r$   model_input_namesr   valid_kwargsr)   r'   r(   r%   r   r   ndarrayr   r   r   r   r   r   r   __classcell__)r   s   @r-   r   r      s   ( -.>?1L  $!,0  cNT)	
   
* <@OzzO O 	O
 !11D8O 
Oh 6:;?	"
zz"
 ++d2"
 !11D8	"
 
"
N #'&*$("&,026(8(>(>;?ii 4Zi t	i
 Tki 4Zi cNT)i j(4/i &i !11D8i 
ir,   r   )	$   blackrr      r   r   r   NNr   )5r$   r]   r   numpyr   huggingface_hubr   image_processing_utilsr   r   image_transformsr   r   r	   r
   image_utilsr   r   r   r   r   r   r   processing_utilsr   utilsr   r   r   r   utils.import_utilsr   rY   PILr   r   r   r0   
get_loggerr!   loggerr_   r   r6   r(   r%   bytesrX   r   ChildProcessErrorrs   r   __all__r+   r,   r-   <module>r      sk   , 	   + F d d   - Q Q 3 //			H	%$ (\ (( : ## @
@@ @ 	@
 @ @ @ @ @ Tz@ [[@H% %S %SK\E\_cEc %PF1 FR &
&r,   