
    qi                         d dl Z d dlZddlmZ ddlmZmZ ddlm	Z	m
Z
mZmZ ddlmZmZ ddlmZmZ  G d d	e
d
      Ze G d de             ZdgZy)    N   )BatchFeature)
ImageInputmake_nested_list_of_images)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstring	to_py_objc                   (    e Zd ZddddddddddZy	)
Gemma3ProcessorKwargsFT)paddingreturn_mm_token_type_ids      g333333?)do_convert_rgbdo_pan_and_scanpan_and_scan_min_crop_sizepan_and_scan_max_num_crops"pan_and_scan_min_ratio_to_activate)text_kwargsimages_kwargsN)__name__
__module____qualname__	_defaults     ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/gemma3/processing_gemma3.pyr   r      s,     (,

 #$*-*+25
Ir!   r   F)totalc            
            e Zd Z	 	 ddef fdZe	 	 ddedz  deez  e	e   z  e	e   z  de
e   defd       Zdd	Zed
        Z xZS )Gemma3ProcessorNimage_seq_lengthc                 $   || _         |j                  | _        |j                  | _        |j                  | _        dj	                  |j                  g|z        }d|j                   | |j
                   d| _        t        |    d|||d| y )N z

)image_processor	tokenizerchat_templater    )	r&   image_token_id	boi_tokenimage_tokenjoin	eoi_tokenfull_image_sequencesuper__init__)selfr)   r*   r+   r&   kwargsimage_tokens_expanded	__class__s          r"   r3   zGemma3Processor.__init__,   s     !1'66",,$00 ")>)>(?BR(R S%))*=*=)>?T>UV_ViViUjjn#o  	
+'	
 		
r!   imagestextr5   returnc           
      `   ||t        d       | j                  t        fd| j                  j                  i|}t        |t              r|g}n.t        |t              st        |d   t              st        d      i }|W| j                  j                  |      }t        |      } | j                  |fi |d   }|s5|D cg c]*  }dj                  | j                  gt        |      z        , }}t        |      t        |      k7  r$t        dt        |       dt        |       d	      t        |j!                  d
            }|D cg c]3  }t#        t        |            D cg c]  }|j!                  d       c}5 }	}}t%        t'        |||	            D ]  \  }
\  }}}t)        j*                  | j                  |      D cg c]  }|j-                          }}t        |      t        |      k7  r$t        dt        |       dt        |       d      t/        t        t'        ||                  D ]a  \  }}|s	d| j                   ddj                  | j                  g|z        z   }|d | |z   ||t        | j                        z   d  z   }|||
<   c  |D cg c](  }|j1                  | j                  | j2                        * }}|d   j!                  dd       }|d   j!                  dd      } | j                  dd|i|d   }| j5                  ||dg       |rRt7        j8                  |d         }t7        j:                  |      }d||| j<                  k(  <   |j?                         |d<   tA        i |||      S c c}w c c}w c c}}w c c}w c c}w )Nz+Provide at least one of `text` or `images`.tokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsr    z1Received inconsistently sized batches of images (z) and text (z).	num_cropszPrompt contained z image tokens but received z images.zHere is the original image z0 and here are some crops to help you see better r   return_tensorsr   Fr9   image)
modalities	input_ids   token_type_ids)datatensor_typer    )!
ValueError_merge_kwargsr   r*   init_kwargs
isinstancestrlist	TypeErrorr)   fetch_imagesr   r/   r-   lenr   poprange	enumerateziprefinditerstartreversedreplacer1   _check_special_mm_tokensnparray
zeros_liker,   tolistr   )r4   r8   r9   r5   output_kwargsimage_inputsbatched_imagesr>   _batch_num_crops	batch_idxpromptmimage_indexesnumidxformatted_image_textr?   r   text_inputs	array_idsmm_token_type_idss                         r"   __call__zGemma3Processor.__call__B   s    <FNJKK***!
"&.."<"<
 
 dC 6DD$'
47C0H_``))66v>F7?N/4//Y-:XYL O]^V$..!1CK!?@^^>"c$i/ GNH[G\\hilmqirhssuv 
 ","2"2;"?@I\jkRX%F:LMQ	a 0MkOk:CCn^mDn:o 16	6FFI46KKPV4W Xq X Xv;#m"44$+C,>+??Z[^_e[fZggop 
 !)c)].K)L M 1HC9$..9IIyz!hh'7#'=>? - "(0D!DvcTWX\XfXfTgNgNiGj!j*0Y11& \``QWFNN4>>43K3KL`D`&}599:JDQ#0#?#C#CD^`e#f $dnnO$O-2NO%%dKWI%N $[!9:I "i 8BCi4+>+>>?,=,D,D,FK()!@K!@<!@n]]W _  Nk X$ as*   /NN ,NN N&,-N+N c                     i }|<| j                   gt        |      z  }dgt        |      z  }|j                  ||d       t        di |S )a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        rC   )num_image_tokensnum_image_patchesr    )r&   rO   updater   )r4   image_sizesr5   vision_dataro   rp   s         r"   _get_num_multimodal_tokensz*Gemma3Processor._get_num_multimodal_tokens   s]     " $ 5 56[9II!"c+&6 64D[lmn,,,r!   c                     | j                   j                  dgz   }| j                  j                  }|D cg c]
  }|dk7  s	| }}t        ||z         S c c}w )NrD   r>   )r*   model_input_namesr)   rL   )r4   tokenizer_input_namesimage_processor_input_namesnames       r"   rv   z!Gemma3Processor.model_input_names   sb     $ @ @DTCU U&*&:&:&L&L#8S&kW[_jWjt&k#&k),GGHH 'ls   
A A)Nr   )NN)N)r   r   r   intr3   r   r   r   r   rL   r
   r   r   rm   rt   propertyrv   __classcell__)r7   s   @r"   r%   r%   *   s      #

 
,  %)Z^G^T!G^ ++d9o=EV@WWG^ ./	G^
 
G^ G^R-. I Ir!   r%   )rT   numpyrZ   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r   r	   r
   tokenization_utils_baser   r   utilsr   r   r   r%   __all__r    r!   r"   <module>r      sZ    
  4 A X X C .,E   ~In ~I ~IB 
r!   