
    qi(                         d dl mZmZmZ ddlmZmZmZmZm	Z	 ddl
mZmZ  e       rd dlmZ ddlmZmZ  e       rd dlZd d	lmZ dd
lmZ  ej.                  e      Z e ed             G d de             Zy)    )AnyUnionoverload   )add_end_docstringsis_torch_availableis_vision_availableloggingrequires_backends   )ChunkPipelinebuild_pipeline_init_args)Image)
load_imagevalid_imagesN)BaseModelOutput)2MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMEST)has_image_processorc                   n    e Zd ZdZdZdZdZdZ fdZe	de
edf   deee   z  ded	eeeef      fd
       Ze	deeeef      ded	eeeeef         fd       Z	 dde
edeeeef      f   deee   z  dz  ded	eeeef      eeeeef         z  f fdZd ZddZd ZddZddd	eeef   fdZ xZS )ZeroShotObjectDetectionPipelinea  
    Zero shot object detection pipeline using `OwlViTForObjectDetection`. This pipeline predicts bounding boxes of
    objects when you provide an image and a set of `candidate_labels`.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
    >>> detector(
    ...     "http://images.cocodataset.org/val2017/000000039769.jpg",
    ...     candidate_labels=["cat", "couch"],
    ... )
    [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]

    >>> detector(
    ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    ...     candidate_labels=["head", "bird"],
    ... )
    [{'score': 0.119, 'label': 'bird', 'box': {'xmin': 71, 'ymin': 170, 'xmax': 410, 'ymax': 508}}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"zero-shot-object-detection"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-object-detection).
    FTc                 f    t        |   di | t        | d       | j                  t               y )Nvision )super__init__r   check_model_typer   )selfkwargs	__class__s     c/opt/pipecat/venv/lib/python3.12/site-packages/transformers/pipelines/zero_shot_object_detection.pyr   z(ZeroShotObjectDetectionPipeline.__init__=   s,    "6"$)PQ    imagezImage.Imagecandidate_labelsr   returnc                      y Nr   )r   r"   r#   r   s       r    __call__z(ZeroShotObjectDetectionPipeline.__call__C   s      #r!   c                      y r&   r   )r   r"   r   s      r    r'   z(ZeroShotObjectDetectionPipeline.__call__H   s    ber!   Nc           	      :   d|v r|j                  d      }t        |t        t        j                  f      r||d}nNt        |t        t
        f      r5t        |      r*t	        t        |    d t        ||      D        fi |      S 	 |}t        |    |fi |}|S )a|  
        Detect objects (bounding boxes & classes) in the image(s) passed as inputs.

        Args:
            image (`str`, `PIL.Image` or `list[dict[str, Any]]`):
                The pipeline handles three types of images:

                - A string containing an http url pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                You can use this parameter to send directly a list of images, or a dataset or a generator like so:

                ```python
                >>> from transformers import pipeline

                >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
                >>> detector(
                ...     [
                ...         {
                ...             "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
                ...             "candidate_labels": ["cat", "couch"],
                ...         },
                ...         {
                ...             "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
                ...             "candidate_labels": ["cat", "couch"],
                ...         },
                ...     ]
                ... )
                [[{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.25, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}], [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]]
                ```


            candidate_labels (`str` or `list[str]` or `list[list[str]]`):
                What the model should recognize in the image.

            threshold (`float`, *optional*, defaults to 0.1):
                The probability necessary to make a prediction.

            top_k (`int`, *optional*, defaults to None):
                The number of top predictions that will be returned by the pipeline. If the provided number is `None`
                or higher than the number of predictions available, it will default to the number of predictions.

            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.


        Return:
            A list of lists containing prediction results, one list per input image. Each list contains dictionaries
            with the following keys:

            - **label** (`str`) -- Text query corresponding to the found object.
            - **score** (`float`) -- Score corresponding to the object (between 0 and 1).
            - **box** (`dict[str,int]`) -- Bounding box of the detected object in image's original size. It is a
              dictionary with `x_min`, `x_max`, `y_min`, `y_max` keys.
        text_queriesr"   r#   c              3   ,   K   | ]  \  }}||d   yw)r+   Nr   ).0imglabelss      r    	<genexpr>z;ZeroShotObjectDetectionPipeline.__call__.<locals>.<genexpr>   s     pKCs?ps   )
pop
isinstancestrr   listtupler   r   r'   zip)r   r"   r#   r   inputsresultsr   s         r    r'   z(ZeroShotObjectDetectionPipeline.__call__K   s    ~ V#%zz.9ec5;;/0$:JKFe}-,u2E pSVW\^nSop  F'"64V4r!   c                 \    i }d|v r|d   |d<   i }d|v r|d   |d<   d|v r|d   |d<   |i |fS )Ntimeout	thresholdtop_kr   )r   r   preprocess_paramspostprocess_paramss       r    _sanitize_parametersz4ZeroShotObjectDetectionPipeline._sanitize_parameters   sc    +1)+<i(& .4[.A{+f*0/w' "&888r!   c              #     K   t        |d   |      }|d   }t        |t              r|j                  d      }t	        j
                  |j                  |j                  ggt        j                        }t        |      D ]a  \  }}| j                  |d      }| j                  |d      }	|	j                  | j                        }	|t        |      dz
  k(  ||d	||	 c y w)
Nr"   )r:   r#   ,)dtypept)return_tensorsr   )is_lasttarget_sizecandidate_label)r   r2   r3   splittorchtensorheightwidthint32	enumerate	tokenizerimage_processortorB   len)
r   r7   r:   r"   r#   rF   irG   text_inputsimage_featuress
             r    
preprocessz*ZeroShotObjectDetectionPipeline.preprocess   s     6'?G<!"45&,/55c:llU\\5;;$?#@T"+,<"= 
	A...NK!11%1MN+..tzz:N$4 5 99*#2 	
 ! 	
	s   C#C%c                     |j                  d      }|j                  d      }|j                  d      } | j                  di |}|||d|}|S )NrF   rG   rE   )rF   rG   rE   r   )r1   model)r   model_inputsrF   rG   rE   outputsmodel_outputss          r    _forwardz(ZeroShotObjectDetectionPipeline._forward   s_    "&&}5&**+<=""9-$**,|,(3dkwovwr!   c                 j   g }|D ]  }|d   }t        |      }| j                  j                  |||d         d   }|d   j                         D ]I  }|d   |   j	                         }	| j                  |d   |   d         }
|	||
d}|j                  |       K  t        |d d	
      }|r|d | }|S )NrG   rF   )rZ   r;   target_sizesr   scoresboxes)scorelabelboxc                     | d   S )Nra   r   )xs    r    <lambda>z=ZeroShotObjectDetectionPipeline.postprocess.<locals>.<lambda>   s
    '
 r!   T)keyreverse)r   rP   post_process_object_detectionnonzeroitem_get_bounding_boxappendsorted)r   r[   r;   r<   r8   model_outputrb   rZ   indexra   rc   results               r    postprocessz+ZeroShotObjectDetectionPipeline.postprocess   s    ) 	'L !23E*<8L**HH$	UbHc I G !*224 ')%0557,,WW-=e-DQ-GH#(5Ev&'	' &:DIfuoGr!   rc   ztorch.Tensorc                 Z    |j                         j                         \  }}}}||||d}|S )a%  
        Turns list [xmin, xmax, ymin, ymax] into dict { "xmin": xmin, ... }

        Args:
            box (`torch.Tensor`): Tensor containing the coordinates in corners format.

        Returns:
            bbox (`dict[str, int]`): Dict containing the coordinates in corners format.
        )xminyminxmaxymax)inttolist)r   rc   rt   ru   rv   rw   bboxs          r    rl   z1ZeroShotObjectDetectionPipeline._get_bounding_box   s;     "%!1!1!3dD$	
 r!   r&   )g?N)__name__
__module____qualname____doc___load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   r   r   r3   r4   r   dictr'   r?   rV   r\   rr   rx   rl   __classcell__)r   s   @r    r   r      sl   @ O #OR #3-.#BES	/#]`#	d38n	# # ed4S>2eced4PTUXZ]U]P^K_F`e e
 48VS-d38n)==>V S	/D0V 	V
 
d38n	T$sCx.%9 :	:Vp	9&,^ S#X r!   r   )typingr   r   r   utilsr   r   r	   r
   r   baser   r   PILr   image_utilsr   r   rI   transformers.modeling_outputsr   models.auto.modeling_autor   
get_loggerr{   loggerr   r   r!   r    <module>r      sm    ' ' k k 9 6=^			H	% ,FG[m [ H[r!   