
    qil                        d Z ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
mZmZ ddlmZmZmZmZ dd	lmZ  ej(                  e      Z e       rddlZd
 Zd Zd Zd Zd Zd Zd Z ed      e G d de                    ZdgZ y)z
Processor class for SAM3.
    )deepcopyN   )
ImageInput)ProcessorMixin)BatchEncodingPreTokenizedInput	TextInput)
TensorTypeauto_docstringis_torch_availablelogging)requiresc                     | j                  d      \  }}}}|d|z  z
  |d|z  z
  |d|z  z   |d|z  z   g}t        j                  |d      S N      ?dimunbindtorchstackxx_cy_cwhbs         Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/sam3/processing_sam3.pybox_cxcywh_to_xyxyr!   #   sV    XXb\NCa
a-3q=C#'MS37]LA;;qb!!    c                     | j                  d      \  }}}}|d|z  z
  |d|z  z
  ||g}t        j                  |d      S r   r   r   s         r    box_cxcywh_to_xywhr$   )   sF    XXb\NCa
a-3q=A4A;;qb!!r"   c                 t    | j                  d      \  } }}}| || |z   ||z   g}t        j                  |d      S Nr   r   r   r   yr   r   r   s        r    box_xywh_to_xyxyr)   /   >    "JAq!Q
qAEQU$A;;qb!!r"   c                     | j                  d      \  } }}}| d|z  z   |d|z  z   ||g}t        j                  |d      S r   r   r'   s        r    box_xywh_to_cxcywhr,   5   sF    "JAq!Q
cAg+S1WQ0A;;qb!!r"   c                 t    | j                  d      \  } }}}| ||| z
  ||z
  g}t        j                  |d      S r&   r   )r   r(   XYr   s        r    box_xyxy_to_xywhr0   ;   r*   r"   c                     | j                  d      \  }}}}||z   dz  ||z   dz  ||z
  ||z
  g}t        j                  |d      S )Nr      r   r   )r   x0y0x1y1r   s         r    box_xyxy_to_cxcywhr7   A   sN    XXb\NBB
r'QbAR27<A;;qb!!r"   c                 D    | j                  d      \  }}}}||z
  ||z
  z  S )z
    Batched version of box area. Boxes should be in [x0, y0, x1, y1] format.

    Inputs:
    - boxes: Tensor of shape (..., 4)

    Returns:
    - areas: Tensor of shape (...,)
    r   )r   )boxesr3   r4   r5   r6   s        r    box_arear:   G   s-     \\"%NBBGR  r"   )r   )backendsc                       e Zd Z	 ddedz  def fdZe	 	 	 	 	 	 	 d dedz  deez  e	e   z  e	e   z  dz  dedz  de	e	e	e
         ej                  z  dz  d	e	e	e	e         ej                  z  dz  d
e	e	e
      ej                  z  dz  deez  dz  defd       Zd!d"dZd#dZd Zd$dZd%dZd Zd Z	 d$dej                  ej2                  z  e	z  dededededz  de	fdZd&dZd'dZd(dZ	 	 	 d)dZ xZS )*Sam3ProcessorNtarget_sizepoint_pad_valuec                     t        |   ||fi | || _        ||| _        y| j                  j                  d   | _        y)z
        target_size (`int`, *optional*):
            The target size (target_size, target_size) to which the image will be resized.
        point_pad_value (`int`, *optional*, defaults to -10):
            The value used for padding input boxes.
        Nheight)super__init__r?   image_processorsizer>   )selfrD   	tokenizerr>   r?   kwargs	__class__s         r    rC   zSam3Processor.__init__X   sG     	)>v>.*5*A;tG[G[G`G`aiGjr"   imagestextsegmentation_mapsinput_boxesinput_boxes_labelsoriginal_sizesreturn_tensorsreturnc                    d}	| | j                   |f||d|}	nW|Ht        |t        j                        r|j	                         j                         }t        d|i|      }	n|t        d      | j                  ||      }|+| j                  ||dd      }
|	|	j                  |
       n|
}	|,|	d   }| j                  |d	d
dd      }| j                  |ddd      }|| j                  |      dd }|| j                  |      dd }||k7  rt        d      |n| j                  |dgz         }t        j                  |t        j                        }| j!                  ||dd       t#        |      }|	j                  d|i       |J| j                  |      }t        j                  |t        j$                        }|	j                  d|i       |	S )a}  
        images (`ImageInput`, *optional*):
            The image(s) to process.
        text (`str`, `list[str]`, `list[list[str]]`, *optional*):
            The text to process.
        segmentation_maps (`ImageInput`, *optional*):
            The segmentation maps to process.
        input_boxes (`list[list[list[float]]]`, `torch.Tensor`, *optional*):
            The bounding boxes to process.
        input_boxes_labels (`list[list[int]]`, `torch.Tensor`, *optional*):
            The labels for the bounding boxes.
        original_sizes (`list[list[float]]`, `torch.Tensor`, *optional*):
            The original sizes of the images.

        Returns:
            A [`BatchEncoding`] with the following fields:
            - `pixel_values` (`torch.Tensor`): The processed image(s).
            - `original_sizes` (`list[list[float]]`): The original sizes of the images.
            - `labels` (`torch.Tensor`): The processed segmentation maps (if provided).
            - `input_boxes_labels` (`torch.Tensor`): The processed labels for the bounding boxes.
            - `input_boxes` (`torch.Tensor`): The processed bounding boxes.
        N)rL   rP   rO   )tensor_typezKEither images or original_sizes must be provided if input_boxes is not None
max_length    )rP   paddingrT   r   r9   z)[image level, box level, box coordinates]   )expected_depth
input_nameexpected_formatexpected_coord_sizer2   labelsz[image level, box level])rX   rY   rZ   zaInput boxes and labels have inconsistent dimensions. Please ensure they have the same dimensions.)dtypeT)is_bounding_boxpreserve_paddingrM   rN   )rD   
isinstancer   Tensorcputolistr   
ValueError_resolve_text_promptsrG   update_validate_single_input_get_nested_dimensions_pad_nested_listtensorfloat32_normalize_tensor_coordinatesr7   int64)rF   rJ   rK   rL   rM   rN   rO   rP   rH   encodingtext_inputsprocessed_boxesprocessed_boxes_labelsboxes_max_dimsboxes_labels_max_dimspadded_boxesfinal_boxespadded_boxes_labelsfinal_boxes_labelss                      r    __call__zSam3Processor.__call__e   sP   D +t++"3- 	H '.%,,7!/!3!3!5!<!<!>$&6%GUcdH$jkk))$<..nVboq.rK#,& "%&67N"99 " K$% : O &*%@%@" # :	 &A &" *!%!<!<_!Mbq!Q%1(,(C(CDZ([\^]^(_% */E/Q!%::${ 
 *#44_nXYWZFZ[#ll<u}}M22X\ 3  1= <=%1&*&;&;<RTi&j#%*\\2EU[[%Y"!57I JKr"   c                     |\  }}t        |      j                         }|r|j                  ddd      }|d   |z  |d<   |d   |z  |d<   |r|j                  dd      }|S )a  
        Expects a numpy array of length 2 in the final dimension. Requires the original image size in (H, W) format.

        Args:
            target_size (`int`):
                The target size of the image.
            coords (`torch.Tensor`):
                The coordinates to be normalized.
            original_size (`tuple`):
                The original size of the image.
            is_bounding_box (`bool`, *optional*, defaults to `False`):
                Whether the coordinates are bounding boxes.
        r   r2   ).r   ).   rW   )r   floatreshape)rF   coordsoriginal_sizer^   old_hold_ws         r    _normalize_coordinatesz$Sam3Processor._normalize_coordinates   sq     %u&!'')^^B1-F%/v%/v^^B*Fr"   c           	         |yt        |t        j                        rb||dz
  k(  st        |j                        dk  r|j                         j                         S |D cg c]  }| j                  |||dz          c}S t        |t        j                        rT||dz
  k(  st        |j                        dk  r|j                         S |D cg c]  }| j                  |||dz          c}S t        |t              r/||k(  r|S |D cg c]  }|| j                  |||dz         nd c}S t        |t        t        f      r|S t        dt        |             c c}w c c}w c c}w )a  
        Recursively convert various input formats (tensors, numpy arrays, lists) to nested lists.
        Preserves None values within lists.

        Args:
            data: Input data in any format (may be None or contain None values)
            expected_depth: Expected nesting depth
            current_depth: Current depth in recursion

        Returns:
            Nested list representation of the data (or None)
        Nr2   rz   zUnsupported data type: )r`   r   ra   lenshapenumpyrc   _convert_to_nested_listnpndarraylistintr{   rd   type)rF   datarX   current_depthitems        r    r   z%Sam3Processor._convert_to_nested_list   sd    < dELL) 22c$**o6Jzz|**,,jnobf44T>=[\K\]oobjj) 22c$**o6J{{}$jnobf44T>=[\K\]ood#.
 !% ^b]mD00~}WXGXYsww  sEl+K6tDzlCDD' p
 ps    EE!	!E&c                    ||rdS dS t        |t        t        f      s|S t        |      }|r;t        |      t        |      k7  r$t	        dt        |       dt        |       d      t        |      D ]  \  }}|	|s||   d||<    |S )zQ
        Resolve text prompts by setting defaults based on prompt types.
        NvisualzEThe number of text prompts must match the number of input boxes. Got z text prompts and z input boxes.)r`   r   tupler   rd   	enumerate)rF   rK   rM   i
text_values        r    re   z#Sam3Processor._resolve_text_prompts  s    
 <*844$u.K Dz3t9K(884yk!3C4D3E]T  't_ 	#MAz!kk!n6P"Q	# r"   c                    |g }t        |t              s|S t        |      dk(  r|j                  t        |             nt	        |d   t        |            |d<   t        |      dkD  rz|D ]u  }|t        |t              s| j                  |      }t        |      D ]@  \  }}|dz   t        |      k\  r|j                  |       )t	        ||dz      |      ||dz   <   B w |S )a  
        Get the maximum dimensions at each level of nesting, skipping None values.

        Args:
            nested_list (`list`):
                Nested list structure (may contain None values).
            max_dims (`list`, *optional*):
                Current maximum dimensions (for recursion).

        Returns:
            `list`: A list of maximum dimensions for each nesting level.
        r   rz   )r`   r   r   appendmaxrh   r   )rF   nested_listmax_dimsr   sub_dimsr   r   s          r    rh   z$Sam3Processor._get_nested_dimensions.  s     H+t,Ox=AOOC,-hqk3{+;<HQK{a# H<dD)#::4@H"+H"5 H3q5CM1$OOC0.1(1q5/3.GHQUO	HH r"   c                 f   || j                   }|t        |      k\  r|S t        |t              s|g}t        |      }||   }|t        |      dz
  k(  r|j	                  |g||z
  z         n|dkD  rm|t        |      dz
  k  r||dz   d }| j                  ||      }n|g||dz      z  }|j	                  t        ||z
        D 	cg c]  }	t        |       c}	       nK||dz   d }| j                  ||      }|j	                  t        |      D 	cg c]  }	t        |       c}	       |t        |      dz
  k  rmt        t        |            D ]V  }
||
   ||dz   d }| j                  ||      ||
<   &t        ||
   t              s:| j                  ||
   ||dz   |      ||
<   X |S c c}	w c c}	w )a3  
        Recursively pad a nested list to match target dimensions. Replaces None values with padded structures.

        Args:
            nested_list (`list`):
                Nested list to pad (may contain None values).
            target_dims (`list`):
                Target dimensions for each level.
            current_level (`int`, *optional*, defaults to 0):
                Current nesting level.
            pad_value (`int`, *optional*):
                Value to use for padding.

        Returns:
            `list`: The padded nested list.
        Nrz   r   r2   )	r?   r   r`   r   extend_create_empty_nested_structureranger   ri   )rF   r   target_dimscurrent_level	pad_valuecurrent_sizer>   template_dimstemplate_r   s              r    ri   zSam3Processor._pad_nested_listV  s   " ,,IC,, +t,&-K ;'!-0 C,q00	{kL.HIJ a 3{#3a#77$/0A0C$DM#BB=R[\H !*{[9J-KKH""kT`F`@a#b1HX$6#bc !,MA,=,? @>>}iX""k@R#S1HX$6#ST 3{+a//3{+, vq>)$/0A0C$DM%)%H%HXa%bKNA5%)%:%:;q>;XehiXikt%uKNv # $c
 $Ts   F)F.c                     t        |      dk(  r	|g|d   z  S t        |d         D cg c]  }| j                  |dd |       c}S c c}w )a  
        Create an empty nested structure with given dimensions filled with pad_value.

        Args:
            dims (`list`):
                The dimensions of the nested structure.
            pad_value (`int`):
                The value to fill the structure with.
        rz   r   N)r   r   r   )rF   dimsr   r   s       r    r   z,Sam3Processor._create_empty_nested_structure  sT     t9>;a((V[\`ab\cVdeQRD77QR)Leees   Ac                     t        |t              r/t        |      dk(  ry|D ]  }|d| j                  |      z   c S  yt        |t        j
                  t        j                  f      rt        |j                        S y)z
        Get the nesting level of a list structure, skipping None values.

        Args:
            input_list (`list`):
                The list to get the nesting level of.
        r   rz   )	r`   r   r   _get_nesting_levelr   r   r   ra   r   )rF   
input_listr   s      r    r   z Sam3Processor._get_nesting_level  sx     j$':!#" =#t66t<<<= 
RZZ$>?z''((r"   r   rX   rY   rZ   r[   c                    |yt        |t        j                  t        j                  f      ry|j
                  |k7  r"t        d| d| d| d|j
                   d	      |4|j                  d   |k7  r"t        d| d| d|j                  d    d	      | j                  ||      S t        |t              r@| j                  |      }||k7  rt        d| d
| d| d| d	      | j                  ||      S y)a  
                Validate a single input by ensuring proper nesting and raising an error if the input is not valid.

                Args:
                    data (`torch.Tensor`, `np.ndarray`, or `list`):
                        Input data to process.
                    expected_depth (`int`):
                        Expected nesting depth.
                    input_name (`str`):
                        Name of the input for error messages.
                    expected_format (`str`):
                        The expected format of the input.
                    expected_coord_size (`int`, *optional*):
                        Expected coordinate size (4 for boxes, None for labels).
        .
        NzInput z must be a tensor/array with z, dimensions. The expected nesting format is z. Got z dimensions.r   z as the last dimension, got .z must be a nested list with z( levels. The expected nesting format is z levels.)r`   r   ra   r   r   ndimrd   r   r   r   r   )rF   r   rX   rY   rZ   r[   r   s          r    rg   z$Sam3Processor._validate_single_input  s   0 < dU\\2::67yyN* ZL(EnEU  VB  CR  BS  SY  Z^  Zc  Zc  Yd  dp  q  %0::b>%88$ ,IJ]I^^z{  |F  |F  GI  |J  {K  KL  M  //nEE dD! 33D9M. ZL(D^DTT|  ~M  }N  NT  Ub  Tc  ck  l  //nEE "r"   c                 z   |r"|| j                   k7  }|j                  dd      }t        t        |            D ]  }||j                  d   k  s|t        |      k  r||   n|d   }| j                  ||   ||      }	|r5|   }
t        j                  |
j                  ||         |	||         ||<   ||	||<    y)a  
        Helper method to normalize coordinates in a tensor across multiple images.

        Args:
            tensor (`torch.Tensor`):
                Input tensor with coordinates.
            original_sizes (`list`):
                Original image sizes.
            is_bounding_box (`bool`, *optional*, defaults to `False`):
                Whether coordinates are bounding boxes.
            preserve_padding (`bool`, *optional*, defaults to `False`):
                Whether to preserve padding values (for boxes).
        r   T)r   keepdimr   )r^   N)	r?   allr   r   r   r   r   where	expand_as)rF   rj   rO   r^   r_   mask
coord_maskimg_idxr~   normalized_coordsimg_masks              r    rl   z+Sam3Processor._normalize_tensor_coordinates  s     T111Db$7JS01 	8Ga(;BSEX;Xw 7^lmn^o$($?$?7O]O %@ %! $)'2H&+kk **6'?;=NPVW^P_'F7O '8F7O	8r"   c                 <    | j                   j                  |||      S )a  
        Converts the output of [`Sam3Model`] into semantic segmentation maps.

        Args:
            outputs ([`Sam3ImageSegmentationOutput`]):
                Raw outputs of the model containing semantic_seg.
            target_sizes (`list[tuple]` of length `batch_size`, *optional*):
                List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
                predictions will not be resized.
            threshold (`float`, *optional*, defaults to 0.5):
                Threshold for binarizing the semantic segmentation masks.

        Returns:
            semantic_segmentation: `list[torch.Tensor]` of length `batch_size`, where each item is a semantic
            segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
            specified). Each entry is a binary mask (0 or 1).
        )rD   "post_process_semantic_segmentation)rF   outputstarget_sizes	thresholds       r    r   z0Sam3Processor.post_process_semantic_segmentation  s!    $ ##FFwP\^ghhr"   c                 <    | j                   j                  |||      S )a  
        Converts the raw output of [`Sam3Model`] into final bounding boxes in (top_left_x, top_left_y,
        bottom_right_x, bottom_right_y) format. This is a convenience wrapper around the image processor method.

        Args:
            outputs ([`Sam3ImageSegmentationOutput`]):
                Raw outputs of the model containing pred_boxes, pred_logits, and optionally presence_logits.
            threshold (`float`, *optional*, defaults to 0.3):
                Score threshold to keep object detection predictions.
            target_sizes (`list[tuple[int, int]]`, *optional*):
                List of tuples (`tuple[int, int]`) containing the target size `(height, width)` of each image in the
                batch. If unset, predictions will not be resized.

        Returns:
            `list[dict]`: A list of dictionaries, each dictionary containing the following keys:
                - **scores** (`torch.Tensor`): The confidence scores for each predicted box on the image.
                - **boxes** (`torch.Tensor`): Image bounding boxes in (top_left_x, top_left_y, bottom_right_x,
                  bottom_right_y) format.

        Example:

        ```python
        >>> from transformers import AutoModel, AutoProcessor
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> model = AutoModel.from_pretrained("facebook/sam3-base")
        >>> processor = AutoProcessor.from_pretrained("facebook/sam3-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, text="cat", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # Post-process to get bounding boxes
        >>> results = processor.post_process_object_detection(outputs, threshold=0.3, target_sizes=[image.size[::-1]])
        >>> boxes = results[0]["boxes"]
        >>> scores = results[0]["scores"]
        ```
        )rD   post_process_object_detection)rF   r   r   r   s       r    r   z+Sam3Processor.post_process_object_detection&  s!    V ##AA'9Vbccr"   c                 >    | j                   j                  ||||      S )ay	  
        Converts the raw output of [`Sam3Model`] into instance segmentation predictions with bounding boxes and masks.
        This is a convenience wrapper around the image processor method.

        Args:
            outputs ([`Sam3ImageSegmentationOutput`]):
                Raw outputs of the model containing pred_boxes, pred_logits, pred_masks, and optionally
                presence_logits.
            threshold (`float`, *optional*, defaults to 0.3):
                Score threshold to keep instance predictions.
            mask_threshold (`float`, *optional*, defaults to 0.5):
                Threshold for binarizing the predicted masks.
            target_sizes (`list[tuple[int, int]]`, *optional*):
                List of tuples (`tuple[int, int]`) containing the target size `(height, width)` of each image in the
                batch. If unset, predictions will not be resized.

        Returns:
            `list[dict]`: A list of dictionaries, each dictionary containing the following keys:
                - **scores** (`torch.Tensor`): The confidence scores for each predicted instance on the image.
                - **boxes** (`torch.Tensor`): Image bounding boxes in (top_left_x, top_left_y, bottom_right_x,
                  bottom_right_y) format.
                - **masks** (`torch.Tensor`): Binary segmentation masks for each instance, shape (num_instances,
                  height, width).

        Example:

        ```python
        >>> from transformers import AutoModel, AutoProcessor
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> model = AutoModel.from_pretrained("facebook/sam3-base")
        >>> processor = AutoProcessor.from_pretrained("facebook/sam3-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, text="cat", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # Post-process to get instance segmentation
        >>> results = processor.post_process_instance_segmentation(
        ...     outputs, threshold=0.3, target_sizes=[image.size[::-1]]
        ... )
        >>> masks = results[0]["masks"]
        >>> boxes = results[0]["boxes"]
        >>> scores = results[0]["scores"]
        ```
        )rD   "post_process_instance_segmentation)rF   r   r   mask_thresholdr   s        r    r   z0Sam3Processor.post_process_instance_segmentationS  s'    r ##FFY
 	
r"   )Ni)NNNNNNN)F)r}   torch.TensorrQ   r   )r   )N)r   N)FF)Nr   )333333?N)r   r   N) __name__
__module____qualname__r   rC   r   r   r	   r   r   r{   r   ra   strr
   r   rx   r   r   re   rh   ri   r   r   r   r   rg   rl   r   r   r   __classcell__)rI   s   @r    r=   r=   U   s    bek7:Tzk[^k  %)ae/3EIJNBF26fT!f ++d9o=EV@WWZ^^f &,	f
 $tE{+,u||;dBf !d3i1ELL@4Gf T%[)ELL84?f j(4/f 
f fP6(ET6&P@Df8 +/0FllRZZ'$.0F 0F 	0F
 0F !4Z0F 
0Fd!8Fi(+d` ;
r"   r=   )!__doc__copyr   r   r   image_utilsr   processing_utilsr   tokenization_utils_baser   r   r	   utilsr
   r   r   r   utils.import_utilsr   
get_loggerr   loggerr   r!   r$   r)   r,   r0   r7   r:   r=   __all__ r"   r    <module>r      s      % . R R L L * 
		H	%""""""! 
:w
N w
  w
t 
r"   