
    qi<,                     J    d dl Z ddlmZmZ ddlmZ d Z G d de      ZdgZy)	    N   )IMAGENET_STANDARD_MEANIMAGENET_STANDARD_STD   )Sam2ImageProcessorFastc                    t        |t        t        f      rMt        j                  |D cg c]  }|d   	 c}      }t        j                  |D cg c]  }|d   	 c}      }n:t        |t        j
                        r|j                  d      \  }}nt        d      t        j                  ||||gd      }|j                  d      j                  | j                        }| |z  } | S c c}w c c}w )a  
    Scale batch of bounding boxes to the target sizes.

    Args:
        boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
            Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
        target_sizes (`list[tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
            Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.

    Returns:
        `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
    r      z4`target_sizes` must be a list, tuple or torch.Tensordim)
isinstancelisttupletorchtensorTensorunbind	TypeErrorstack	unsqueezetodevice)boxestarget_sizesiimage_heightimage_widthscale_factors         W/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/sam3/modular_sam3.py_scale_boxesr      s     ,u.||<$@aQqT$@All,#?QAaD#?@	L%,,	/$0$7$7$:!kNOO;;\;U[\]L))!,//=LL EL %A#?s   C*C/c                       e Zd ZeZeZdddZdddZ	 dde	e
   dz  defdZddede	e
   dz  fdZ	 	 	 dded	ede	e
   dz  fd
Zy)Sam3ImageProcessorFasti  )heightwidthi   Nr   	thresholdc                 x   |j                   }|t        d      |j                         }|t        |      t        |      k7  rt        d      g }t	        t        |            D ]{  }t
        j                  j                  j                  ||   j                  d      ||   dd      }|d	   |kD  j                  t
        j                        }	|j                  |	       } |S |dddf   |kD  j                  t
        j                        }t	        |j                  d         D 
cg c]  }
||
   	 }}
|S c c}
w )
a  
        Converts the output of [`Sam3Model`] into semantic segmentation maps.

        Args:
            outputs ([`Sam3ImageSegmentationOutput`]):
                Raw outputs of the model containing semantic_seg.
            target_sizes (`list[tuple]` of length `batch_size`, *optional*):
                List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
                predictions will not be resized.
            threshold (`float`, *optional*, defaults to 0.5):
                Threshold for binarizing the semantic segmentation masks.

        Returns:
            semantic_segmentation: `list[torch.Tensor]` of length `batch_size`, where each item is a semantic
            segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
            specified). Each entry is a binary mask (0 or 1).
        NzSemantic segmentation output is not available in the model outputs. Make sure the model was run with semantic segmentation enabled.zTMake sure that you pass in as many target sizes as the batch dimension of the logitsr   r
   bilinearFsizemodealign_corners)r   r   )semantic_seg
ValueErrorsigmoidlenranger   nn
functionalinterpolater   r   longappendshape)selfoutputsr   r$   semantic_logitssemantic_probssemantic_segmentationidxresized_probssemantic_mapr   s              r   "post_process_semantic_segmentationz9Sam3ImageProcessorFast.post_process_semantic_segmentation;   sZ   , ".."R  )002 #?#s<'88 j  %'!S12 	; % 3 3 ? ?"3'11a18%c*#"'	 !@ ! !.d 3i ?CCEJJO%,,\:	; %$ &4AqD%9I%E$I$I%**$U!GLMbMhMhijMkGl$m!%:1%=$m!$m$$ %ns   'D7c                    |j                   }|j                  }|j                  }|j                  d   }|t	        |      |k7  rt        d      |j                         }||j                         }	||	z  }|}
|t        |
|      }
g }t        ||
      D ](  \  }}||kD  }||   }||   }|j                  ||d       * |S )aD  
        Converts the raw output of [`Sam3Model`] into final bounding boxes in (top_left_x, top_left_y,
        bottom_right_x, bottom_right_y) format.

        Args:
            outputs ([`Sam3ImageSegmentationOutput`]):
                Raw outputs of the model containing pred_boxes, pred_logits, and optionally presence_logits.
            threshold (`float`, *optional*, defaults to 0.3):
                Score threshold to keep object detection predictions.
            target_sizes (`list[tuple[int, int]]`, *optional*):
                List of tuples (`tuple[int, int]`) containing the target size `(height, width)` of each image in the
                batch. If unset, predictions will not be resized.

        Returns:
            `list[dict]`: A list of dictionaries, each dictionary containing the following keys:
                - **scores** (`torch.Tensor`): The confidence scores for each predicted box on the image.
                - **boxes** (`torch.Tensor`): Image bounding boxes in (top_left_x, top_left_y, bottom_right_x,
                  bottom_right_y) format.
        r   9Make sure that you pass in as many target sizes as images)scoresr   )
pred_logits
pred_boxespresence_logitsr5   r.   r,   r-   r   zipr4   )r6   r7   r$   r   rB   rC   rD   
batch_sizebatch_scorespresence_scoresbatch_boxesresultsrA   r   keeps                  r   post_process_object_detectionz4Sam3ImageProcessorFast.post_process_object_detectionv   s    ( ))''
!11 &&q)
#L(9Z(GXYY #**,&-557O'/9L ! #&{LAK {; 	?MFEI%DD\F$KENNfu=>		?     mask_thresholdc                    |j                   }|j                  }|j                  }|j                  }|j                  d   }	|t        |      |	k7  rt        d      |j                         }
||j                         }|
|z  }
|j                         }|}|t        ||      }g }t        t        |
||            D ]  \  }\  }}}||kD  }||   }||   }||   }|^||   }t        |      dkD  rKt        j                  j                  j                  |j                  d      |dd      j!                  d      }||kD  j#                  t        j$                        }|j'                  |||d        |S )aQ  
        Converts the raw output of [`Sam3Model`] into instance segmentation predictions with bounding boxes and masks.

        Args:
            outputs ([`Sam3ImageSegmentationOutput`]):
                Raw outputs of the model containing pred_boxes, pred_logits, pred_masks, and optionally
                presence_logits.
            threshold (`float`, *optional*, defaults to 0.3):
                Score threshold to keep instance predictions.
            mask_threshold (`float`, *optional*, defaults to 0.5):
                Threshold for binarizing the predicted masks.
            target_sizes (`list[tuple[int, int]]`, *optional*):
                List of tuples (`tuple[int, int]`) containing the target size `(height, width)` of each image in the
                batch. If unset, predictions will not be resized.

        Returns:
            `list[dict]`: A list of dictionaries, each dictionary containing the following keys:
                - **scores** (`torch.Tensor`): The confidence scores for each predicted instance on the image.
                - **boxes** (`torch.Tensor`): Image bounding boxes in (top_left_x, top_left_y, bottom_right_x,
                  bottom_right_y) format.
                - **masks** (`torch.Tensor`): Binary segmentation masks for each instance, shape (num_instances,
                  height, width).
        r   r@   r&   Fr'   )rA   r   masks)rB   rC   
pred_masksrD   r5   r.   r,   r-   r   	enumeraterE   r   r0   r1   r2   r   squeezer   r3   r4   )r6   r7   r$   rN   r   rB   rC   rQ   rD   rF   rG   rH   batch_masksrI   rJ   r;   rA   r   rP   rK   target_sizes                        r   "post_process_instance_segmentationz9Sam3ImageProcessorFast.post_process_instance_segmentation   s   < ))''
''
!11 &&q)
#L(9Z(GXYY #**,&-557O'/9L !((* ! #&{LAK+4S{T_5`+a 	O'C'&%I%DD\F$KE$KE '*3/u:>!HH//;;*('&+	 < 
 gaj  ^+//

;ENNfuuMN+	O. rM   )N      ?)333333?N)rX   rW   N)__name__
__module____qualname__r   
image_meanr   	image_stdr(   	mask_sizer   r   floatr>   rL   rV    rM   r   r!   r!   5   s    'J%IT*D-I TW9%%)%[4%79%KP9%v1 1[_`e[fim[m 1l  #+/P P 	P
 5kD(PrM   r!   )	r   image_utilsr   r   sam2.image_processing_sam2_fastr   r   r!   __all__r`   rM   r   <module>rd      s4      E8D3 DN $
$rM   