
    qi1                         d Z ddlmZ ddlmZ ddlZddlmZ ddl	m
Z
mZmZ ddlmZmZmZ dd	lmZmZ  e       rddlZeeeez  dz  d
f      Z G d de
d      Z G d ded      Ze G d de             ZdgZy)z
Processor class for SAM.
    )deepcopy)UnionN   )
ImageInput)ImagesKwargsProcessingKwargsProcessorMixin)BatchEncodingPreTokenizedInput	TextInput)auto_docstringis_torch_available
NestedListc                   x    e Zd ZU dZedz  ed<   ded<   ded<   ded<   edz  ed	<   eeef   ed
<   eeef   ed<   y)SamImagesKwargsa1  
    segmentation_maps (`ImageInput`, *optional*):
        Ground truth segmentation maps to process alongside the input images. These maps are used for training
        or evaluation purposes and are resized and normalized to match the processed image dimensions.
    input_points (`NestedList`, *optional*):
        Input points for prompt-based segmentation. Should be a nested list with structure
        `[image_level, object_level, point_level, [x, y]]` where each point is specified as `[x, y]` coordinates
        in the original image space. Points are normalized to the target image size before being passed to the model.
    input_labels (`NestedList`, *optional*):
        Labels for the input points, indicating whether each point is a foreground (1) or background (0) point.
        Should be a nested list with structure `[image_level, object_level, point_level]`. Must have the same
        structure as `input_points` (excluding the coordinate dimension).
    input_boxes (`NestedList`, *optional*):
        Bounding boxes for prompt-based segmentation. Should be a nested list with structure
        `[image_level, box_level, [x1, y1, x2, y2]]` where each box is specified as `[x1, y1, x2, y2]` coordinates
        in the original image space. Boxes are normalized to the target image size before being passed to the model.
    point_pad_value (`int`, *optional*, defaults to `-10`):
        The value used for padding input points when batching sequences of different lengths. This value marks
        padded positions and is preserved during coordinate normalization to distinguish real points from padding.
    mask_size (`dict[str, int]`, *optional*):
        Dictionary specifying the target mask size with keys `"height"` and `"width"`. This determines the
        resolution of the output segmentation masks generated by the model.
    mask_pad_size (`dict[str, int]`, *optional*):
        Dictionary specifying the padding size for masks with keys `"height"` and `"width"`. This is used when
        batching masks of different sizes to ensure consistent dimensions.
    Nsegmentation_mapsz NestedList | torch.Tensor | Noneinput_pointsz&NestedList | int | torch.Tensor | Noneinput_labelsinput_boxespoint_pad_value	mask_sizemask_pad_size)	__name__
__module____qualname____doc__r   __annotations__intdictstr     X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/sam/processing_sam.pyr   r   #   sL    6 "D((44::334ZCH~S>!r"   r   F)totalc                   $    e Zd ZU eed<   dddiiZy)SamProcessorKwargsimages_kwargsr   N)r   r   r   r   r   	_defaultsr!   r"   r#   r&   r&   H   s    ""s
Ir"   r&   c                        e Zd Z fdZe	 	 ddedz  deez  ee   z  ee   z  dz  de	fd       Z
	 	 	 	 	 ddZd Z	 dd	ed
ej                  dej                  fdZ	 	 	 ddZed        Zd Z xZS )SamProcessorc                 `    t         |   |       | j                  j                  d   | _        y )Nlongest_edge)super__init__image_processorsizetarget_size)selfr0   	__class__s     r#   r/   zSamProcessor.__init__S   s)    )//44^Dr"   Nimagestextreturnc           
          | j                   t        fdi i|}|d   j                  dd       }|d   j                  dd       }|d   j                  dd       }|d   j                  dd       } | j                  |fi |d   }	|	d   }
t	        |
d      r|
j                         }
| j                  |||	      \  }}}| j                  |	|
||||d   j                  d
      |      }	|	S )Ntokenizer_init_kwargsr'   r   r   r   r   original_sizesnumpy)r   r   r   return_tensors)r   r   r   r<   r   )	_merge_kwargsr&   popr0   hasattrr;   _check_and_preprocess_points_normalize_and_convertget)r3   r5   r6   kwargsoutput_kwargsr   r   r   r   encoding_image_processorr:   s              r#   __call__zSamProcessor.__call__W   s?    +**
"$
 

 %_599.$O$_599.$O#O488M'8<<=NPTU#74#7#7$
O,$
  22BC>7++113N262S2S%%# 3T 3
/lK $(#>#>$%%#(9==>NO+ $? $
  ('r"   c           
      6   t        |      t              k7  r/D cg c]"  }| j                  | j                  ||d         $ c}n:t        |      D 	cg c]"  \  }}	| j                  | j                  ||	      $ c}	}t	        fdD              s|| j                  ||      \  }t        j                        |t        j                  |      }|t        |      t        |      k7  r0|D 
cg c]$  }
| j                  | j                  |
|d   d      & }}
n;t        ||      D 
	cg c]$  \  }
}	| j                  | j                  |
|	d      & }}
}	t        j                  |      }|X|dk(  r@t        j                  |      }t        |j                        dk7  r|j                  d      n|}|j                  d|i       X|dk(  r@t        j                        t        j                        d	k7  rj                  d      n|j                  d
i       |X|dk(  r@t        j                  |      }t        |j                        dk7  r|j                  d      n|}|j                  d|i       |S c c}w c c}	}w c c}
w c c}	}
w )Nr   c              3   V   K   | ]   }|j                   d    j                   k(   " ywr   Nshape).0pointr   s     r#   	<genexpr>z6SamProcessor._normalize_and_convert.<locals>.<genexpr>   s$     Vu{{l1o&;&;;Vs   &)T)is_bounding_boxptr      r      r   r   )len_normalize_coordinatesr2   zipall_pad_points_and_labelsnparraytorch
from_numpyrK   	unsqueezeupdate)r3   rE   r:   r   r   r   r<   r   rM   original_sizeboxs      `       r#   rA   z#SamProcessor._normalize_and_convert   s    #>"c,&77iu `eD//0@0@%XYIZ[  14L.0Q ,} //0@0@%W 
 VVV+151L1L$lO2.L, 88L1L#88L1L">"c+&66  + //0@0@#~VWGXjn/o  /2+~.N*] //0@0@#}fj/k  ((;/K"%#..{;:=k>O>O:PTU:Uk33A6[f$++]K,HI#%$//=<?@R@R<SWX<X|55a8^j$++^\,JK#%$//=<?@R@R<SWX<X|55a8^j$++^\,JK''i  $
s   'J'J
9)J4)Jc           	      ^   t        d |D              }g }t        |      D ]  \  }}|j                  d   |k7  r^t        j                  |t        j
                  ||j                  d   z
  df      |z   gd      }t        j                  ||   |g      ||<   |j                  |        |}||fS )zh
        The method pads the 2D points and labels to the maximum number of points in the batch.
        c              3   :   K   | ]  }|j                   d      ywrI   rJ   )rL   rM   s     r#   rN   z6SamProcessor._pad_points_and_labels.<locals>.<genexpr>   s      JEQ Js   r      )axis)max	enumeraterK   rX   concatenatezerosappend)r3   r   r   r   expected_nb_pointsprocessed_input_pointsirM   s           r#   rW   z#SamProcessor._pad_points_and_labels   s     ! J\ JJ!#!,/ 	1HAu{{1~!33BHH&85;;q>&I1%MNQ``ahi #%))LOo=N"OQ"))%0	1 .\))r"   r2   coordsc                    |\  }}| j                   j                  ||      \  }}t        |      j                  t              }|r|j                  ddd      }|d   ||z  z  |d<   |d   ||z  z  |d<   |r|j                  dd      }|S )z~
        Expects a numpy array of length 2 in the final dimension. Requires the original image size in (H, W) format.
        )r-   rb   ).r   ).rQ   rR   )r0   _get_preprocess_shaper   astypefloatreshape)	r3   r2   rl   r^   rO   old_hold_wnew_hnew_ws	            r#   rT   z#SamProcessor._normalize_coordinates   s     %u++AA-^iAju&!((/^^B1-F55=9v55=9v^^B*Fr"   c                    |{t        |d      r|j                         j                         }t        |t              rt        |d   t              st        d      |D cg c]  }t        j                  |       }}nd}|{t        |d      r|j                         j                         }t        |t              rt        |d   t              st        d      |D cg c]  }t        j                  |       }}nd}|t        |d      r|j                         j                         }t        |t              r)t        |d   t              rt        |d   d   t              st        d      |D cg c]4  }t        j                  |      j                  t        j                        6 }}nd}|||fS c c}w c c}w c c}w )a8  
        Check and preprocesses the 2D points, labels and bounding boxes. It checks if the input is valid and if they
        are, it converts the coordinates of the points and bounding boxes. If a user passes directly a `torch.Tensor`,
        it is converted to a `numpy.ndarray` and then to a `list`.
        Nr;   r   z7Input points must be a list of list of floating points.z-Input labels must be a list of list integers.z>Input boxes must be a list of list of list of floating points.)
r?   r;   tolist
isinstancelist
ValueErrorrX   rY   rp   float32)r3   r   r   r   input_pointlabelr_   s          r#   r@   z)SamProcessor._check_and_preprocess_points   s    #|W-+113::<lD1LQROUY9Z !Z[[EQRkBHH[1RLRL#|W-+113::<lD1LQROUY9Z !PQQ9EFBHHUOFLFL"{G,)//188: {D1!+a.$7!+a."3T: !abbGRS288C=//

;SKSK\;669 S G Ts   F6F;39G c                 N    | j                   j                  }t        |ddgz         S )Nr:   reshaped_input_sizes)r0   model_input_namesrz   )r3   image_processor_input_namess     r#   r   zSamProcessor.model_input_names  s,    &*&:&:&L&L#/3CE[2\\]]r"   c                 :     | j                   j                  |i |S )N)r0   post_process_masks)r3   argsrC   s      r#   r   zSamProcessor.post_process_masks   s     6t##66GGGr"   )NN)NNNrP   r(   )F)NNN)r   r   r   r/   r   r   r   r   rz   r
   rF   rA   rW   r   rX   ndarrayrT   r@   propertyr   r   __classcell__)r4   s   @r#   r+   r+   Q   s    E  %)ae+(T!+( ++d9o=EV@WWZ^^+(
 
+( +(b @(D*" TY(*

	. 	-7^ ^ ^Hr"   r+   )r   copyr   typingr   r;   rX   image_utilsr   processing_utilsr   r   r	   tokenization_utils_baser
   r   r   utilsr   r   rZ   rz   rq   r   r   r   r&   r+   __all__r!   r"   r#   <module>r      s       % N N R R 7 %d*L89:
""l% ""J)  OH> OH OHd 
r"   