Ë
    «q±i<,  ã                   óJ   — d dl Z ddlmZmZ ddlmZ d„ Z G d„ de«      ZdgZy)	é    Né   )ÚIMAGENET_STANDARD_MEANÚIMAGENET_STANDARD_STDé   )ÚSam2ImageProcessorFastc                 óè  — t        |t        t        f«      rMt        j                  |D cg c]  }|d   ‘Œ	 c}«      }t        j                  |D cg c]  }|d   ‘Œ	 c}«      }n:t        |t        j
                  «      r|j                  d«      \  }}nt        d«      ‚t        j                  ||||gd¬«      }|j                  d«      j                  | j                  «      }| |z  } | S c c}w c c}w )a  
    Scale batch of bounding boxes to the target sizes.

    Args:
        boxes (`torch.Tensor` of shape `(batch_size, num_boxes, 4)`):
            Bounding boxes to scale. Each box is expected to be in (x1, y1, x2, y2) format.
        target_sizes (`list[tuple[int, int]]` or `torch.Tensor` of shape `(batch_size, 2)`):
            Target sizes to scale the boxes to. Each target size is expected to be in (height, width) format.

    Returns:
        `torch.Tensor` of shape `(batch_size, num_boxes, 4)`: Scaled bounding boxes.
    r   é   z4`target_sizes` must be a list, tuple or torch.Tensor©Údim)Ú
isinstanceÚlistÚtupleÚtorchÚtensorÚTensorÚunbindÚ	TypeErrorÚstackÚ	unsqueezeÚtoÚdevice)ÚboxesÚtarget_sizesÚiÚimage_heightÚimage_widthÚscale_factors         úW/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/sam3/modular_sam3.pyÚ_scale_boxesr      sÌ   € ô ,¤¤u Ô.Ü—|‘|°<Ö$@¨a Q q£TÒ$@ÓAˆÜ—l‘l°,Ö#?¨Q A a£DÒ#?Ó@‰Ü	L¤%§,¡,Ô	/Ø$0×$7Ñ$7¸Ó$:Ñ!ˆ‘käÐNÓOÐOä—;‘; ¨\¸;ÈÐUÐ[\Ô]€LØ×)Ñ)¨!Ó,×/Ñ/°·±Ó=€LØLÑ €EØ€Lùò %AùÚ#?s   ªC*ÁC/c                   óŽ   — e Zd ZeZeZdddœZdddœZ	 dde	e
   dz  defd„Zddede	e
   dz  fd„Z	 	 	 dded	ede	e
   dz  fd
„Zy)ÚSam3ImageProcessorFastið  )ÚheightÚwidthi   Nr   Ú	thresholdc                 óx  — |j                   }|€t        d«      ‚|j                  «       }|¸t        |«      t        |«      k7  rt        d«      ‚g }t	        t        |«      «      D ]{  }t
        j                  j                  j                  ||   j                  d¬«      ||   dd¬«      }|d	   |kD  j                  t
        j                  «      }	|j                  |	«       Œ} |S |dd…df   |kD  j                  t
        j                  «      }t	        |j                  d   «      D 
cg c]  }
||
   ‘Œ	 }}
|S c c}
w )
aŽ  
        Converts the output of [`Sam3Model`] into semantic segmentation maps.

        Args:
            outputs ([`Sam3ImageSegmentationOutput`]):
                Raw outputs of the model containing semantic_seg.
            target_sizes (`list[tuple]` of length `batch_size`, *optional*):
                List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
                predictions will not be resized.
            threshold (`float`, *optional*, defaults to 0.5):
                Threshold for binarizing the semantic segmentation masks.

        Returns:
            semantic_segmentation: `list[torch.Tensor]` of length `batch_size`, where each item is a semantic
            segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
            specified). Each entry is a binary mask (0 or 1).
        NzƒSemantic segmentation output is not available in the model outputs. Make sure the model was run with semantic segmentation enabled.zTMake sure that you pass in as many target sizes as the batch dimension of the logitsr   r
   ÚbilinearF©ÚsizeÚmodeÚalign_corners)r   r   )Úsemantic_segÚ
ValueErrorÚsigmoidÚlenÚranger   ÚnnÚ
functionalÚinterpolater   r   ÚlongÚappendÚshape)ÚselfÚoutputsr   r$   Úsemantic_logitsÚsemantic_probsÚsemantic_segmentationÚidxÚresized_probsÚsemantic_mapr   s              r   Ú"post_process_semantic_segmentationz9Sam3ImageProcessorFast.post_process_semantic_segmentation;   sZ  € ð, "×.Ñ.ˆàÐ"ÜðRóð ð )×0Ñ0Ó2ˆð Ð#Ü?Ó#¤s¨<Ó'8Ò8Ü Øjóð ð %'Ð!äœS Ó1Ó2ò 	;Ü %§¡× 3Ñ 3× ?Ñ ?Ø" 3Ñ'×1Ñ1°aÐ1Ó8Ø% cÑ*Ø#Ø"'ð	 !@ó !ð !.¨dÑ 3°iÑ ?×CÑCÄEÇJÁJÓOØ%×,Ñ,¨\Õ:ð	;ð %Ð$ð &4²A°q°DÑ%9¸IÑ%E×$IÑ$IÌ%Ï*É*Ó$UÐ!ÜGLÐMb×MhÑMhÐijÑMkÓGlÖ$mÀ!Ð%:¸1Ó%=Ð$mÐ!Ð$mà$Ð$ùò %ns   Ä'D7c                 ó‚  — |j                   }|j                  }|j                  }|j                  d   }|t	        |«      |k7  rt        d«      ‚|j                  «       }||j                  «       }	||	z  }|}
|t        |
|«      }
g }t        ||
«      D ](  \  }}||kD  }||   }||   }|j                  ||dœ«       Œ* |S )aD  
        Converts the raw output of [`Sam3Model`] into final bounding boxes in (top_left_x, top_left_y,
        bottom_right_x, bottom_right_y) format.

        Args:
            outputs ([`Sam3ImageSegmentationOutput`]):
                Raw outputs of the model containing pred_boxes, pred_logits, and optionally presence_logits.
            threshold (`float`, *optional*, defaults to 0.3):
                Score threshold to keep object detection predictions.
            target_sizes (`list[tuple[int, int]]`, *optional*):
                List of tuples (`tuple[int, int]`) containing the target size `(height, width)` of each image in the
                batch. If unset, predictions will not be resized.

        Returns:
            `list[dict]`: A list of dictionaries, each dictionary containing the following keys:
                - **scores** (`torch.Tensor`): The confidence scores for each predicted box on the image.
                - **boxes** (`torch.Tensor`): Image bounding boxes in (top_left_x, top_left_y, bottom_right_x,
                  bottom_right_y) format.
        r   ú9Make sure that you pass in as many target sizes as images)Úscoresr   )
Úpred_logitsÚ
pred_boxesÚpresence_logitsr5   r.   r,   r-   r   Úzipr4   )r6   r7   r$   r   rB   rC   rD   Ú
batch_sizeÚbatch_scoresÚpresence_scoresÚbatch_boxesÚresultsrA   r   Úkeeps                  r   Úpost_process_object_detectionz4Sam3ImageProcessorFast.post_process_object_detectionv   sï   € ð( ×)Ñ)ˆØ×'Ñ'ˆ
Ø!×1Ñ1ˆà ×&Ñ& qÑ)ˆ
àÐ#¬¨LÓ(9¸ZÒ(GÜÐXÓYÐYð #×*Ñ*Ó,ˆØÐ&Ø-×5Ñ5Ó7ˆOØ'¨/Ñ9ˆLð !ˆð Ð#Ü& {°LÓAˆKàˆÜ  ¨{Ó;ò 	?‰MˆFEØ˜IÑ%ˆDØ˜D‘\ˆFØ˜$‘KˆEØN‰N f°uÑ=Õ>ð		?ð ˆó    Úmask_thresholdc                 óæ  — |j                   }|j                  }|j                  }|j                  }|j                  d   }	|t        |«      |	k7  rt        d«      ‚|j                  «       }
||j                  «       }|
|z  }
|j                  «       }|}|t        ||«      }g }t        t        |
||«      «      D ]´  \  }\  }}}||kD  }||   }||   }||   }|^||   }t        |«      dkD  rKt        j                  j                  j                  |j                  d«      |dd¬«      j!                  d«      }||kD  j#                  t        j$                  «      }|j'                  |||dœ«       Œ¶ |S )aQ  
        Converts the raw output of [`Sam3Model`] into instance segmentation predictions with bounding boxes and masks.

        Args:
            outputs ([`Sam3ImageSegmentationOutput`]):
                Raw outputs of the model containing pred_boxes, pred_logits, pred_masks, and optionally
                presence_logits.
            threshold (`float`, *optional*, defaults to 0.3):
                Score threshold to keep instance predictions.
            mask_threshold (`float`, *optional*, defaults to 0.5):
                Threshold for binarizing the predicted masks.
            target_sizes (`list[tuple[int, int]]`, *optional*):
                List of tuples (`tuple[int, int]`) containing the target size `(height, width)` of each image in the
                batch. If unset, predictions will not be resized.

        Returns:
            `list[dict]`: A list of dictionaries, each dictionary containing the following keys:
                - **scores** (`torch.Tensor`): The confidence scores for each predicted instance on the image.
                - **boxes** (`torch.Tensor`): Image bounding boxes in (top_left_x, top_left_y, bottom_right_x,
                  bottom_right_y) format.
                - **masks** (`torch.Tensor`): Binary segmentation masks for each instance, shape (num_instances,
                  height, width).
        r   r@   r&   Fr'   )rA   r   Úmasks)rB   rC   Ú
pred_masksrD   r5   r.   r,   r-   r   Ú	enumeraterE   r   r0   r1   r2   r   Úsqueezer   r3   r4   )r6   r7   r$   rN   r   rB   rC   rQ   rD   rF   rG   rH   Úbatch_masksrI   rJ   r;   rA   r   rP   rK   Útarget_sizes                        r   Ú"post_process_instance_segmentationz9Sam3ImageProcessorFast.post_process_instance_segmentation©   s¡  € ð< ×)Ñ)ˆØ×'Ñ'ˆ
Ø×'Ñ'ˆ
Ø!×1Ñ1ˆà ×&Ñ& qÑ)ˆ
àÐ#¬¨LÓ(9¸ZÒ(GÜÐXÓYÐYð #×*Ñ*Ó,ˆØÐ&Ø-×5Ñ5Ó7ˆOØ'¨/Ñ9ˆLð !×(Ñ(Ó*ˆð !ˆð Ð#Ü& {°LÓAˆKàˆÜ+4´S¸À{ÐT_Ó5`Ó+aò 	OÑ'ˆCÑ'&˜% à˜IÑ%ˆDØ˜D‘\ˆFØ˜$‘KˆEØ˜$‘KˆEð Ð'Ø*¨3Ñ/Üu“: ’>Ü!ŸH™H×/Ñ/×;Ñ;ØŸ™¨Ó*Ø(Ø'Ø&+ð	 <ó ÷
 ‘g˜a“jð ð ˜^Ñ+×/Ñ/´·
±
Ó;ˆEàN‰N f°uÀuÑMÕNð+	Oð. ˆrM   )Nç      à?)ç333333Ó?N)rX   rW   N)Ú__name__Ú
__module__Ú__qualname__r   Ú
image_meanr   Ú	image_stdr(   Ú	mask_sizer   r   Úfloatr>   rL   rV   © rM   r   r!   r!   5   s¢   „ Ø'€JØ%€IØ TÑ*€DØ¨Ñ-€Ið TWñ9%Ø%)¨%¡[°4Ñ%7ð9%ØKPó9%ñv1Àð 1Ð[_Ð`eÑ[fÐimÑ[mó 1ðl Ø #Ø+/ñPð ðPð ð	Pð
 ˜5‘k DÑ(ôPrM   r!   )	r   Úimage_utilsr   r   Úsam2.image_processing_sam2_fastr   r   r!   Ú__all__r`   rM   r   ú<module>rd      s4   ðó  ÷õ Eòô8DÐ3ô DðN $Ð
$rM   