
    qi-                        d Z ddlZddlmZ ddlmZmZ ddlm	Z	m
Z
mZmZmZ ddlmZmZmZ ddlmZmZ  ej*                  e      Zd	Z ed
      D  cg c]	  } d| dd c}  ed      D  cg c]	  } d| dd c} z   Z G d de      Z G d de
d      ZdefdZd Zd Z d Z!e G d de             Z"dgZ#yc c} w c c} w )z 
Processor class for PaliGemma.
    N   )BatchFeature)
ImageInputis_valid_image)MultiModalDataProcessingKwargsProcessorMixin
TextKwargsUnpack)
AddedTokenPreTokenizedInput	TextInput)auto_docstringloggingz<image>i   z<locz0>4>   z<segz0>3c                   @    e Zd ZU dZeez  ee   z  ee   z  dz  ed<   y)PaliGemmaTextKwargsa  
    suffix (`str`, `list[str]`, `list[list[str]]`):
        The suffixes or batch of suffixes to be encoded. Only necessary for finetuning. See https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md
        for more information. If your prompt is "<image> What is on the image", the suffix corresponds to the expected prediction "a cow sitting on a bench".
    Nsuffix)__name__
__module____qualname____doc__r   r   list__annotations__     d/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/paligemma/processing_paligemma.pyr   r   '   s/     ))DO;dCT>UUX\\\r   r   c                   ,    e Zd ZU eed<   dddddidZy)PaliGemmaProcessorKwargstext_kwargsF)paddingreturn_mm_token_type_idsdata_formatchannels_first)r!   images_kwargsN)r   r   r   r   r   	_defaultsr   r   r   r    r    1   s)    $$ (-

 +
Ir   r    F)totalreturnc                 H    t        | t              xr | j                  d      S )Nhttp)
isinstancestr
startswith)vals    r   is_urlr0   ?   s    c3:CNN6$::r   c                 2    t        |       xs t        |       S N)r0   r   elems    r   is_image_or_image_urlr5   D   s    $</>$//r   c                 <    t        | t              xs t        |       S r2   )r,   r-   r5   r3   s    r   _is_str_or_imager7   H   s    dS"A&;D&AAr   c                      ||z  |z   | |  dS )aZ  
    Builds a string from the input prompt and image tokens.
    For example, for the call:
    build_string_from_input(
        prompt="Prefix str"
        bos_token="<s>",
        image_seq_len=3,
        image_token="<im>",
    )
    The output will be:
    "<im><im><im><s>Initial str"
    Args:
        prompt (`list[Union[str, ImageInput]]`): The input prompt.
        bos_token (`str`): The beginning of sentence token.
        image_seq_len (`int`): The length of the image sequence.
        image_token (`str`): The image token.
        num_images (`int`): Number of images in the prompt.
    
r   prompt	bos_tokenimage_seq_lenimage_token
num_imagess        r   build_string_from_inputr@   L   s$    & M)J67	{6("MMr   c            
            e Zd Z	 	 	 d
 fd	Ze	 	 ddedz  deez  ee   z  ee   z  de	e
   defd       ZddZed	        Z xZS )PaliGemmaProcessorNc                    t        |d      st        d      |j                  | _        t        |d      sNt        t        dd      }d|gi}|j                  |       |j                  t              | _        t        | _        n"|j                  | _        |j                  | _        |j                  t               d|_        d|_        t        | 9  |||       y )	Nimage_seq_lengthz;Image processor is missing an `image_seq_length` attribute.r>   FT)
normalizedspecialadditional_special_tokens)chat_template)hasattr
ValueErrorrD   r   IMAGE_TOKENadd_special_tokensconvert_tokens_to_idsimage_token_idr>   
add_tokensEXTRA_TOKENSadd_bos_tokenadd_eos_tokensuper__init__)selfimage_processor	tokenizerrH   kwargsr>   tokens_to_add	__class__s          r   rT   zPaliGemmaProcessor.__init__d   s     (:;Z[[ / @ @y-0$[UDQK8;-HM((7"+"A"A+"ND*D"+":":D(44D\*"'	"'	)=Qr   imagestextrX   r)   c                 h    | j                   t        fd| j                  j                  i|}|d   j	                  dd      }d}|t        d      |t        j                  d       d}t        |      r|g}nt        |t              rt        |d	         r	 |#| t        d
 |D              sjt        j                  d       t        |t              rKt        |t              r;t        |      t        |      k7  r$t        dt        |       dt        |       d      t        |      r|gg}nt        |t        t        f      rt        |d	         r|D cg c]  }|g }}nKt        |t        t        f      r*t        |d	   t        t        f      rt        |d	   d	         st        d      t!        ||      D 	cg c]R  \  }}	t#        || j                  j$                  | j&                  t(        t        |	t              rt        |	      nd      T }
}}	ng }|D ]  }|j+                  t(        t(        | j&                  z        }|j-                  t(              }|dk7  r|t        t(              z   nd	}|d| | j                  j$                  z   ||d z   }|j/                  |        |D cg c]  }| d	 }
}|t        |      r|g}|&|D cg c]  }|| j                  j0                  z    }} | j2                  |fi |d   d   }|d   j	                  dd      }|d   j	                  dd      } | j                  
f||d|d   }| j5                  |
|dg       i |d|i}|rIt7        j8                  |d         }d|t7        j8                  |d         d	k(  <   |j;                  d|i       |rUt7        j8                  |d         }t7        j<                  |d         }d||| j>                  k(  <   |jA                         |d<   tC        ||       S c c}w c c}	}w c c}w c c}w )!a  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix`
              is provided, the `input_ids` will also contain the suffix input ids.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **labels** -- Labels compatible with training if `suffix` is not None
        tokenizer_init_kwargsr!   r   NTzF`images` are expected as arguments to a `PaliGemmaProcessor` instance.z]You are using PaliGemma without a text prefix. It will perform as a picture-captioning model. r   c              3   ,   K   | ]  }t         |v   y wr2   )rK   ).0samples     r   	<genexpr>z.PaliGemmaProcessor.__call__.<locals>.<genexpr>   s     @{f,@s   aL  You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens.z	Received z images for zK prompts. Each prompt should be associated with an image or list of images.zAimages must be an image, list of images or list of list of images   r:   r9   r&   pixel_valuesreturn_tensorsr#   )	text_pairreturn_token_type_idsimage)
modalities	input_idsitoken_type_idslabelsmm_token_type_ids)datatensor_type)"_merge_kwargsr    rW   init_kwargspoprJ   loggerwarning_oncer7   r,   r   anywarninglenr   tuplezipr@   r<   rD   rK   replacerfindappend	eos_tokenrV   _check_special_mm_tokensnparrayupdate
zeros_likerN   tolistr   )rU   r[   r\   rX   output_kwargsr   ri   rj   r;   
image_listinput_stringsexpanded_samplesrb   expanded_samplebos_rfind_index	bos_indexsfxrf   rg   r#   inputsreturn_datarn   	array_idsro   s                            r   __call__zPaliGemmaProcessor.__call__   se   ( +**$
"&.."<"<
 

 }-11(DA $>eff<o DD!6Dd#(8a(A 2@4@@< dD)j.F6{c$i/('F}LT  LW  X 
 "&)%hZFu6>&QR);T39:%ug:F:ve}5"6!9tUm<&vay|4$%hii /2$.?	! +
 ,%"&..":":&*&;&;$/6@T6R3z?XY	! 	! $& " =F&,nn[+PTPePeBe&fO&5&;&;K&HOFUY[F[#k2B BabI'
3dnn6N6NNQ`ajakQll $ %++O<= >N N6F82 N N"26":XF@FGcDNN444GFG+t++FUmO6TUVde&}599:JDQ#0#?#C#CD^`d#e 
"7
 M*	
 	%%mV	%R>>> !XXf[12F>BF288F#345:;&12#[!9:I "k+.F GBCi4+>+>>?/@/G/G/IK+,.IIy ;	!( !O
 Hs   
PAP$1P* P/c                     i }|<| j                   gt        |      z  }dgt        |      z  }|j                  ||d       t        di |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (list[list[str]], *optional*):
                The input sizes formatted as (height, width) per each image.
        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        rd   )num_image_tokensnum_image_patchesr   )rD   ry   r   r   )rU   image_sizesrX   vision_datar   r   s         r   _get_num_multimodal_tokensz-PaliGemmaProcessor._get_num_multimodal_tokens   s]     " $ 5 56[9II!"c+&6 64D[lmn,,,r   c                     | j                   j                  ddgz   }| j                  j                  }t        ||z         S )Nrm   rn   )rW   model_input_namesrV   r   )rU   tokenizer_input_namesimage_processor_input_namess      r   r   z$PaliGemmaProcessor.model_input_names  sB     $ @ @DTV^C_ _&*&:&:&L&L#),GGHHr   )NNN)NNr2   )r   r   r   rT   r   r   r   r   r   r   r    r   r   r   propertyr   __classcell__)rZ   s   @r   rB   rB   b   s     	R8  %)Z^xJT!xJ ++d9o=EV@WWxJ 12	xJ
 
xJ xJt-$ I Ir   rB   )$r   numpyr   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r   r	   r
   r   tokenization_utils_baser   r   r   utilsr   r   
get_loggerr   ru   rK   rangerP   r   r    boolr0   r5   r7   r@   rB   __all__)is   0r   <module>r      s     4 5  P O , 
		H	%).t5A$qgQ5RWX[R\8]Q4#wa8]]]* ]
/u 
;4 ;
0BN, nI nI nIb  
 a 68]s   B;-C 