
    qi                         d dl mZ d dlmZmZmZ ddlmZmZm	Z	m
Z
mZ ddlmZmZ  e	       rd dlmZ ddlmZ  e       r
d d	lZdd
lmZ  e
j,                  e      Z e ed             G d de             Zy	)    )UserDict)AnyUnionoverload   )add_end_docstringsis_torch_availableis_vision_availableloggingrequires_backends   )Pipelinebuild_pipeline_init_args)Image)
load_imageN)6MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMEST)has_image_processorc                   P    e Zd ZdZdZdZdZdZ fdZe	de
edf   dee   ded	eeeef      fd
       Ze	dee   ed   z  dee   ded	eeeeef         fd       Zde
eee   ded   f   dee   ded	eeeef      eeeeef         z  f fdZddZ	 	 	 	 ddZd Zd Z xZS )#ZeroShotImageClassificationPipelineaL  
    Zero shot image classification pipeline using `CLIPModel`. This pipeline predicts the class of an image when you
    provide an image and a set of `candidate_labels`.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> classifier = pipeline(model="google/siglip-so400m-patch14-384")
    >>> classifier(
    ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    ...     candidate_labels=["animals", "humans", "landscape"],
    ... )
    [{'score': 0.965, 'label': 'animals'}, {'score': 0.03, 'label': 'humans'}, {'score': 0.005, 'label': 'landscape'}]

    >>> classifier(
    ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    ...     candidate_labels=["black and white", "photorealist", "painting"],
    ... )
    [{'score': 0.996, 'label': 'black and white'}, {'score': 0.003, 'label': 'photorealist'}, {'score': 0.0, 'label': 'painting'}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"zero-shot-image-classification"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-image-classification).
    FTc                 f    t        |   di | t        | d       | j                  t               y )Nvision )super__init__r   check_model_typer   )selfkwargs	__class__s     g/opt/pipecat/venv/lib/python3.12/site-packages/transformers/pipelines/zero_shot_image_classification.pyr   z,ZeroShotImageClassificationPipeline.__init__C   s,    "6"$)TU    imagezImage.Imagecandidate_labelsr   returnc                      y Nr   r   r!   r"   r   s       r   __call__z,ZeroShotImageClassificationPipeline.__call__I   s      #r    c                      y r%   r   r&   s       r   r'   z,ZeroShotImageClassificationPipeline.__call__N   s     &)r    c                 l    d|v r|j                  d      }|t        d      t        |   |fd|i|S )a  
        Assign labels to the image(s) passed as inputs.

        Args:
            image (`str`, `list[str]`, `PIL.Image` or `list[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a http link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

            candidate_labels (`list[str]`):
                The candidate labels for this image. They will be formatted using *hypothesis_template*.

            hypothesis_template (`str`, *optional*, defaults to `"This is a photo of {}"`):
                The format used in conjunction with *candidate_labels* to attempt the image classification by
                replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are
                already formatted.

            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A list of dictionaries containing one entry per proposed label. Each dictionary contains the
            following keys:
            - **label** (`str`) -- One of the suggested *candidate_labels*.
            - **score** (`float`) -- The score attributed by the model to that label. It is a value between
                0 and 1, computed as the `softmax` of `logits_per_image`.
        imageszSCannot call the zero-shot-image-classification pipeline without an images argument!r"   )pop
ValueErrorr   r'   )r   r!   r"   r   r   s       r   r'   z,ZeroShotImageClassificationPipeline.__call__S   sH    J vJJx(E=rsswS8HSFSSr    c                 X    i }d|v r|d   |d<   d|v r|d   |d<   d|v r|d   |d<   |i i fS )Nr"   timeouthypothesis_templater   )r   tokenizer_kwargsr   preprocess_paramss       r   _sanitize_parametersz8ZeroShotImageClassificationPipeline._sanitize_parameters~   sa    '4:;M4N01+1)+<i( F*7=>S7T34 "b((r    c                    |i }t        ||      }| j                  |gd      }|j                  | j                        }||d<   |D cg c]  }|j	                  |       }}ddi}	d| j
                  j                  j                  v r|	j                  dd	d
       |	j                  |        | j                  |fddi|	}
|
g|d<   |S c c}w )N)r.   pt)r*   return_tensorsr"   paddingTsiglip
max_length@   )r6   r8   
truncationr5   text_inputs)
r   image_processortodtypeformatmodelconfig
model_typeupdate	tokenizer)r   r!   r"   r/   r.   r0   inputsx	sequencestokenizer_default_kwargsr;   s              r   
preprocessz.ZeroShotImageClassificationPipeline.preprocess   s     #!5'2%%eWT%J4::&%5!"<LMq(//2M	M$-t#4 tzz((333$++LR\`+a ''(89$dnnY`t`G_`!,} Ns   
Cc                     |j                  d      }|j                  d      }t        |d   t              r|d   }n|d   d   } | j                  di ||}||j                  d}|S )Nr"   r;   r   )r"   logitsr   )r+   
isinstancer   r@   logits_per_image)r   model_inputsr"   r;   outputsmodel_outputss         r   _forwardz,ZeroShotImageClassificationPipeline._forward   s    '++,>?"&&}5k!nh/%a.K &a.+K$**;{;l; !1..
 r    c                    |j                  d      }|d   d   }d| j                  j                  j                  v rHt	        j
                  |      j                  d      }|j                         }t        |t              sH|g}nD|j                  d      j                  d      }|j                         }t        |t              s|g}t        t        ||      d       D cg c]
  \  }}||d	 }}}|S c c}}w )
Nr"   rK   r   r7   )dimc                     | d    S )Nr   r   )rF   s    r   <lambda>zAZeroShotImageClassificationPipeline.postprocess.<locals>.<lambda>   s    _`ab_c^c r    )key)scorelabel)r+   r@   rA   rB   torchsigmoidsqueezetolistrL   listsoftmaxsortedzip)	r   rP   r"   rK   probsscoresrX   candidate_labelresults	            r   postprocessz/ZeroShotImageClassificationPipeline.postprocess   s    (,,-?@x(+tzz((333MM&)11"5E\\^Ffd+ NNrN*2226E\\^Ffd+  +1V=M1NTc*d
& o6
 
 	
s   #C7r%   )NzThis is a photo of {}.NN)__name__
__module____qualname____doc___load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   r   r   strr^   r   dictr'   r2   rI   rQ   rf   __classcell__)r   s   @r   r   r      s`   @ O #OV #3-.#BFs)#WZ#	d38n	# # )#Ym!44)HLS	)]`)	d4S>"	#) ))TS$s)]D4GGH)T s))T 	)T
 
d38n	T$sCx.%9 :	:)TV	) 4."r    r   )collectionsr   typingr   r   r   utilsr   r	   r
   r   r   baser   r   PILr   image_utilsr   rZ   models.auto.modeling_autor   
get_loggerrg   loggerr   r   r    r   <module>r{      su      ' '  5 (b 
		H	% ,FGf( f Hfr    