
    qi$%                         d Z ddlZddlmZ ddlmZmZ ddlm	Z	m
Z
mZmZ ddlmZmZ ddlmZmZmZ  e       rd	d
lmZ  ej,                  e      Z G d de
d      ZdefdZd Ze G d de             ZdgZy)z
Processor class for Pixtral.
    N   )BatchFeature)
ImageInputis_valid_image)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)auto_docstringis_vision_availablelogging   )get_resize_output_image_sizec                        e Zd ZdddddidZy)PixtralProcessorKwargsF)paddingreturn_mm_token_type_idsreturn_tensorspt)text_kwargscommon_kwargsN)__name__
__module____qualname__	_defaults     `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/pixtral/processing_pixtral.pyr   r   '   s"     (-

 d
Ir   r   F)totalreturnc                 H    t        | t              xr | j                  d      S )Nhttp)
isinstancestr
startswith)vals    r    is_urlr)   4   s    c3:CNN6$::r   c                 2    t        |       xs t        |       S N)r)   r   )elems    r    is_image_or_image_urlr-   9   s    $</>$//r   c            
            e Zd Z	 	 	 	 	 	 	 	 ddedef fdZe	 	 ddedz  deez  e	e   z  e	e   z  de
e   defd	       Zdd
Zed        Z xZS )PixtralProcessorN
patch_sizespatial_merge_sizec	                    || _         || _        || _        |j                  | j                        | _        || _        || _        |j                  | j                        | _        |j                  | j
                        | _        |j                  | j                        | _        | j                  | j                  | j                  g| _	        t        
| -  |||       y)a  
        patch_size (`int`, *optional*, defaults to 16):
            Patch size from the vision tower.
        spatial_merge_size (`int`, *optional*, defaults to 1):
            The downsampling factor for the spatial merge operation.
        image_token (`str`, *optional*, defaults to `"[IMG]"`):
            Special token used to denote image location.
        image_break_token (`str`, *optional*, defaults to `"[IMG_BREAK]"`):
            Special token used to denote the end of a line of pixels in an image.
        image_end_token (`str`, *optional*, defaults to `"[IMG_END]"`):
            Special token used to denote the end of an image input.
        )chat_templateN)r0   r1   image_tokenconvert_tokens_to_idsimage_token_idimage_break_tokenimage_end_tokenimage_break_token_idimage_end_token_id	image_idssuper__init__)selfimage_processor	tokenizerr0   r1   r3   r4   r7   r8   kwargs	__class__s             r    r=   zPixtralProcessor.__init__?   s    0 %"4&'==d>N>NO!2.'==d>N>NO$-$C$CDDZDZ$[!"+"A"A$BVBV"W--t/H/H$JaJab)=Qr   imagestextrA   r"   c           	          | j                   t        fdt        | j                  di       i|}| j                  | j
                  z  }|||d   d<    | j                  |fi |d   }ni }t        |t              r|g}n.t        |t              st        |d   t              st        d      |}|j                  d      t        |d	         }g }g }	|D ]  }
| j                  |
v rt        |      \  }}||z  }||z  }| j                  g|z  | j                  gz   g|z  }|D cg c]  }|D ]  }|  }}}| j                   |d
<   dj#                  |      }|	j%                  |       |
j'                  | j                  dd      }
| j                  |
v rd|
v r)|	j)                  d      }|
j'                  d|d      }
d|
v r)|j%                  |
        |d   j)                  dd      }|d   j)                  dd      }|d   j)                  dd        | j                  |fi |d   ddi}| j+                  ||dg       |rft-        j.                  |d         }t-        j0                  |d         }d|t-        j2                  || j4                        <   |j7                         |d<   t9        i |||      S c c}}w )a?  
        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
            `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        tokenizer_init_kwargsinit_kwargsNimages_kwargsr0   r   zAInvalid input text. Please provide a string, or a list of stringspixel_valuesimage_sizes z<placeholder>r   r   r   r   Freturn_token_type_idsimage)
modalities	input_idsmm_token_type_ids)datatensor_type)_merge_kwargsr   getattrr@   r0   r1   r?   r%   r&   list	TypeErrorgetiterr4   nextr7   r8   joinappendreplacepop_check_special_mm_tokensnparray
zeros_likeisinr;   tolistr   )r>   rC   rD   rA   output_kwargsr0   image_inputsprompt_stringsrJ   replace_stringssampleheightwidthnum_height_tokensnum_width_tokensreplace_tokenssublistitemreplace_strr   r   text_inputs	array_idsrQ   s                           r    __call__zPixtralProcessor.__call__c   s   $ +**"
")$..-"L
 
 __t'>'>>
;EM/*<8/4//Y-:XYLLdC 6DD$'
47C0H_`` N+7|M:;KN O .&&&0$($5MFE(.*(<%',
':$))*-==AWAW@XX&)&*N ;I%]wU\%]Td%]d%]N%])-)=)=N2&"$''."9K#**;7#^^D,<,<oqQF &&&0 &/"1"5"5a"8K#^^O[!LF &/ %%f-%.( '}599:JDQ#0#?#C#CD^`e#f m$(()@$G$dnn^i}]7Sidhi%%nkwi%X#[!9:I "k+.F GDEbggi@A/@/G/G/IK+,!@K!@<!@n]]1 &^s   %J<c                 
   i }|t         j                  j                  di       }|j                  |       |j                  dd      xs | j                  j
                  }| j                  | j                  z  }g }|D ]W  \  }}	t        t        j                  ||	df      |d   |d   f||f      \  }
}|
|z  }||z  }|j                  |dz   |z         Y dgt        |      z  }|j                  ||d       t        d	i |S )
a  
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
        NrH   sizer   longest_edge)rv   r0   r   )num_image_tokensnum_image_patchesr   )r   r   rX   updater?   rv   r0   r1   r   r`   zerosr\   lenr   )r>   rJ   rA   vision_datarH   rv   r0   rx   rj   rk   resized_heightresized_widthrl   rm   ry   s                  r    _get_num_multimodal_tokensz+PixtralProcessor._get_num_multimodal_tokens   s-    "2<<@@RTUM  ( $$VT2Od6J6J6O6OD4+B+BBJ!!, T0LHHfeQ/0~.^0DE *J71-
 %3j$@!#0J#>  '')9A)=AR(RST "#c+&6 64D[lmn,,,r   c                 l    | j                   j                  }| j                  j                  }||z   dgz   S )NrJ   )r@   model_input_namesr?   )r>   tokenizer_input_namesimage_processor_input_namess      r    r   z"PixtralProcessor.model_input_names   s7     $ @ @&*&:&:&L&L#$'BBm_TTr   )NN   r   Nz[IMG]z[IMG_BREAK]z	[IMG_END])NNr+   )r   r   r   intr=   r   r   r   r   rV   r
   r   r   rt   r   propertyr   __classcell__)rB   s   @r    r/   r/   =   s     "#'#"R 	"R
  "RH  %)Z^M^T!M^ ++d9o=EV@WWM^ /0	M^
 
M^ M^^"-H U Ur   r/   )__doc__numpyr`   feature_extraction_utilsr   image_utilsr   r   processing_utilsr   r   r	   r
   tokenization_utils_baser   r   utilsr   r   r   image_processing_pixtralr   
get_loggerr   loggerr   boolr)   r-   r/   __all__r   r   r    <module>r      s     4 5  D A A F 
		H	%	-U 	;4 ;
0 ]U~ ]U ]U@ 
r   