
    qi\                     R    d Z ddlmZ ddlmZ ddlmZ e G d de             ZdgZy)z(
Image/Text processor class for CLIPSeg
   )ProcessorMixin)BatchEncoding)auto_docstringc                   2     e Zd Zd fd	Zedd       Z xZS )CLIPSegProcessorc                 &    t         |   ||       y )N)super__init__)selfimage_processor	tokenizerkwargs	__class__s       `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/clipseg/processing_clipseg.pyr
   zCLIPSegProcessor.__init__   s    )4    c                    |||t        d      ||t        d       | j                  | j                  fd| j                  j                  i|}| | j                  |fd|i|d   }| | j
                  |fd|i|d   }| | j
                  |fd|i|d   }	||	j                  j                  d}|S ||	j                  d<   |S |S |d	j                  i}|S t        t        di 	|
      S )a  
        visual_prompt (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
            The visual prompt image or batch of images to be prepared. Each visual prompt image can be a PIL image,
            NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape
            (C, H, W), where C is a number of channels, H and W are image height and width.

        Returns:
            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        z9You have to specify either text, visual prompt or images.zMYou have to specify exactly one type of prompt. Either text or visual prompt.tokenizer_init_kwargsreturn_tensorstext_kwargsimages_kwargs)pixel_valuesconditional_pixel_valuesr   r   )datatensor_type )	
ValueError_merge_kwargsvalid_processor_kwargsr   init_kwargsr   r   r   dict)
r   textimagesvisual_promptr   r   output_kwargsencodingprompt_featuresimage_featuress
             r   __call__zCLIPSegProcessor.__call__   s   " <M1fnXYY 9lmm***''
?C~~?Y?Y
]c
 %t~~dj>j][hMijH$2d22.<@Mo@^O 1T11'59F9WN $); . ; ;,;,H,HH O&"4'5'B'BH^$OO&*O,H,HH O d&<^&<.YYr   )NN)NNNN)__name__
__module____qualname__r
   r   r(   __classcell__)r   s   @r   r   r      s    5 8Z 8Zr   r   N)	__doc__processing_utilsr   tokenization_utils_baser   utilsr   r   __all__r   r   r   <module>r2      s>    / 4 # =Z~ =Z =Z@ 
r   