
    qi܌                       d dl Z d dlmZmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlm
c mZ d dlZd dlmZ d dlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z) ddl*m+Z+m,Z,m-Z- ddl.m/Z/ ddl0m1Z1 ddl2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9  e)jt                  e;      Z<ee' G d de                    Z=ee' G d de                    Z>ee' G d de                    Z?ee' G d de                    Z@ee' G d de                    ZAee' G d  d!e                    ZBdqd"ej                  d#eCd$ej                  fd%ZDdrd&eEfd'ZFd( ZG G d) d*e
j                        ZI	 	 dsd+e
j                  d,ej                  d-ej                  d.ej                  d/ej                  dz  d0eCdz  d1eCd2e#e+   fd3ZJ G d4 d5e
j                        ZK G d6 d7e
j                        ZLd8 ZMd9ej                  d:ej                  d;ej                  d<ej                  d$eNej                  ej                  f   f
d=ZO G d> d?e
j                        ZP G d@ dAe
j                        ZQ G dB dCe
j                        ZRdD ZSdE ZT G dF dGe
j                        ZU G dH dIe      ZVe' G dJ dKe!             ZWe' G dL dMeW             ZX G dN dOe
j                        ZY G dP dQe
j                        ZZ G dR dSe
j                        Z[ e'dTU       G dV dWeW             Z\ G dX dYe
j                        Z] G dZ d[e
j                        Z^ G d\ d]e
j                        Z_ G d^ d_eW      Z` G d` dae
j                        Za G db dce
j                        Zb G dd deeW      Zc G df dge
j                        Zd G dh die
j                        Ze G dj dke
j                        Zf G dl dmeW      Zg G dn doeW      Zhg dpZiy)t    N)CallableIterable)	dataclass)Tensor)CLIPTextModelWithProjection   )initialization)ACT2FN)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)#compile_compatible_method_lru_cache)auto_docstringcan_return_tuplelogging)TransformersKwargsis_flash_attention_requestedmerge_with_config_defaults)capture_outputs   )	AutoModel   )
Sam3ConfigSam3DETRDecoderConfigSam3DETREncoderConfigSam3GeometryEncoderConfigSam3MaskDecoderConfigSam3VisionConfigSam3ViTConfigc                   j    e Zd ZU dZdZeej                  df   ed<   dZ	eej                  df   ed<   y)Sam3VisionEncoderOutputz
    fpn_hidden_states (`tuple[torch.FloatTensor]`):
        Tuple of multi-level FPN feature maps.
    fpn_position_encoding (`tuple[torch.FloatTensor]`):
        Tuple of position encodings for each FPN level.
    N.fpn_hidden_statesfpn_position_encoding)
__name__
__module____qualname____doc__r'   tupletorchFloatTensor__annotations__r(        X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/sam3/modeling_sam3.pyr&   r&   >   s?     8<uU..34;;?5!2!2C!78?r2   r&   c                   \    e Zd ZU dZdZej                  ed<   dZej                  dz  ed<   y)Sam3GeometryEncoderOutputa^  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_prompts, hidden_size)`):
        Encoded geometry prompt features (boxes).
    attention_mask (`torch.BoolTensor` of shape `(batch_size, num_prompts)`, *optional*):
        Attention mask for geometry prompts where True indicates valid positions and False indicates padding.
    Nlast_hidden_stateattention_mask)
r)   r*   r+   r,   r6   r.   r/   r0   r7   
BoolTensorr1   r2   r3   r5   r5   L   s/     ,0u((/.2NE$$t+2r2   r5   c                      e Zd ZU dZdZej                  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   y)	Sam3DETREncoderOutputa  
    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Encoded vision features (flattened from multi-level features).
    pos_embeds_flattened (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Flattened position embeddings for the vision features.
    text_features (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`, *optional*):
        Text features (may be pooled after encoder processing).
    spatial_shapes (`torch.LongTensor` of shape `(num_levels, 2)`, *optional*):
        Spatial shapes (height, width) for each feature pyramid level.
    hidden_states (`tuple[torch.FloatTensor]`, *optional*):
        Tuple of hidden states from all encoder layers.
    attentions (`tuple[torch.FloatTensor]`, *optional*):
        Tuple of attention weights from all encoder layers.
    Nr6   pos_embeds_flattenedtext_featuresspatial_shapeshidden_states
attentions)r)   r*   r+   r,   r6   r.   r/   r0   r;   r<   r=   
LongTensorr>   r-   r?   r1   r2   r3   r:   r:   Z   s     ,0u((/59%++d29.2M5$$t+2.2NE$$t+259M5**+d2926Je''(4/6r2   r:   c                       e Zd ZU dZdZej                  ed<   dZej                  ed<   dZ	ej                  ed<   dZ
eej                     dz  ed<   dZeej                     dz  ed<   y)Sam3DETRDecoderOutputa  
    intermediate_hidden_states (`torch.FloatTensor` of shape `(num_layers, batch_size, num_queries, hidden_size)`):
        Decoder hidden states from all layers.
    reference_boxes (`torch.FloatTensor` of shape `(num_layers, batch_size, num_queries, 4)`):
        Predicted reference boxes from all decoder layers in (cx, cy, w, h) format.
    presence_logits (`torch.FloatTensor` of shape `(num_layers, batch_size, 1)`):
        Presence logits from all decoder layers indicating object presence confidence.
    hidden_states (`tuple[torch.FloatTensor]`, *optional*):
        Tuple of hidden states from all decoder layers.
    attentions (`tuple[torch.FloatTensor]`, *optional*):
        Tuple of attention weights from all decoder layers (self-attention and cross-attention).
    Nintermediate_hidden_statesreference_boxespresence_logitsr>   r?   )r)   r*   r+   r,   rC   r.   r/   r0   rD   rE   r>   r-   r?   r1   r2   r3   rB   rB   t   sn     59 1 18)-OU&&-)-OU&&-59M5**+d2926Je''(4/6r2   rB   c                       e Zd ZU dZdZej                  ed<   dZej                  dz  ed<   dZ	e
ej                     dz  ed<   y)Sam3MaskDecoderOutputa  
    pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height, width)`):
        Predicted segmentation masks for each query.
    semantic_seg (`torch.FloatTensor` of shape `(batch_size, 1, height, width)`, *optional*):
        Semantic segmentation output.
    attentions (`tuple[torch.FloatTensor]`, *optional*):
        Tuple of attention weights from mask decoder cross-attention layers.
    N
pred_maskssemantic_segr?   )r)   r*   r+   r,   rH   r.   r/   r0   rI   r?   r-   r1   r2   r3   rG   rG      sH     %)J!!(-1L%##d*126Je''(4/6r2   rG   c                   8   e Zd ZU dZdZej                  ed<   dZej                  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed	<   dZeej                     dz  ed
<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   y)Sam3ImageSegmentationOutputa}  
    pred_masks (`torch.FloatTensor` of shape `(batch_size, num_queries, height, width)`):
        Predicted segmentation masks for each query.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
        Predicted bounding boxes in (x1, y1, x2, y2) format.
    pred_logits (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
        Classification confidence scores for each query, computed via dot product between
        decoder query features and text features.
    presence_logits (`torch.FloatTensor` of shape `(batch_size, 1)`, *optional*):
        Presence logits from the DETR decoder presence token (last layer only). These indicate whether objects
        are present in the scene. Can be used to compute final scores by multiplying with pred_logits:
        `final_scores = pred_logits.sigmoid() * presence_logits.sigmoid()`.
    semantic_seg (`torch.FloatTensor` of shape `(batch_size, 1, height, width)`, *optional*):
        Semantic segmentation output.
    decoder_hidden_states (`tuple[torch.FloatTensor]`, *optional*):
        Tuple of hidden states from all DETR decoder layers. Each tensor has shape `(batch_size, num_queries, hidden_size)`.
    decoder_reference_boxes (`torch.FloatTensor` of shape `(num_layers, batch_size, num_queries, 4)`, *optional*):
        Reference boxes from all DETR decoder layers.
    encoder_hidden_states (`tuple[torch.FloatTensor]`, *optional*):
        Tuple of hidden states from all DETR encoder layers.
    vision_hidden_states (`tuple[torch.FloatTensor]`, *optional*):
        Tuple of hidden states from all vision encoder (ViT) layers.
    vision_attentions (`tuple[torch.FloatTensor]`, *optional*):
        Attention weights from vision encoder (ViT) layers.
    detr_encoder_attentions (`tuple[torch.FloatTensor]`, *optional*):
        Attention weights from DETR encoder layers.
    detr_decoder_attentions (`tuple[torch.FloatTensor]`, *optional*):
        Attention weights from DETR decoder layers (self-attention and cross-attention).
    mask_decoder_attentions (`tuple[torch.FloatTensor]`, *optional*):
        Attention weights from mask decoder layers.
    NrH   
pred_boxespred_logitsrE   rI   decoder_hidden_statesdecoder_reference_boxesencoder_hidden_statesvision_hidden_statesvision_attentionsdetr_encoder_attentionsdetr_decoder_attentionsmask_decoder_attentions)r)   r*   r+   r,   rH   r.   r/   r0   rL   rM   rE   rI   rN   r-   rO   rP   rQ   rR   rS   rT   rU   r1   r2   r3   rK   rK      s<   @ %)J!!($(J!!(,0K""T)004OU&&-4-1L%##d*1=A5!2!23d:A8<U..5<=A5!2!23d:A<@% 1 12T9@9=uU../$6=?CU5#4#45<C?CU5#4#45<C?CU5#4#45<Cr2   rK   xepsreturnc                     | j                  dd      } | j                  |      }d| z
  j                  |      }t        j                  ||z        S )z5The inverse function for sigmoid activation function.r   r   minmaxr[   )clampr.   log)rV   rW   x1x2s       r3   inverse_sigmoidrb      sK    	A1A	
S	B
a%3	B99R"Wr2   return_indexc                 T   | j                   \  }}}|j                   \  }}	}
||cxk(  r*|j                  d      cxk(  r|j                  d      k(  sJ  J ||
k(  sJ ||j                  d      k(  sJ |	|j                  d      k(  sJ |j                  d      }|j                  d      }||z   }||	z   }t        j                  ||j
                        d   j                  |d      |dddf   k  }t        j                  |||f|j
                  |j                        }| |ddd|ddf<   t        j                  |	|j
                        d   j                  |d      }||dddf   z   }|j                  d|dddddf   j                  dd|      |      }|r|||fS ||fS )a  
    Concatenates two right-padded sequences, such that the resulting sequence
    is contiguous and also right-padded.

    Tensors are batch-first, masks are batch-first with True=valid, False=padding.

    Args:
        seq1: A tensor of shape (batch_size, seq1_length, hidden_size).
        mask1: A tensor of shape (batch_size, seq1_length) with True=valid, False=padding.
        seq2: A tensor of shape (batch_size, seq2_length, hidden_size).
        mask2: A tensor of shape (batch_size, seq2_length) with True=valid, False=padding.
        return_index: If True, also returns the index of the ids of the element of seq2
            in the concatenated sequence. This can be used to retrieve the elements of seq2.

    Returns:
        A tuple (concatenated_sequence, concatenated_mask) if return_index is False,
        otherwise (concatenated_sequence, concatenated_mask, index).
        The concatenated_mask uses True=valid, False=padding convention.
    r   r   dim)deviceNrh   dtype)shapesizesumr.   arangerh   repeatzerosrj   scatterexpand)seq1mask1seq2mask2rc   
batch_sizeseq1_lengthhidden_sizebatch_size2seq2_lengthhidden_size2actual_seq1_lengthsactual_seq2_lengthsfinal_lengths
max_lengthconcatenated_maskconcatenated_sequenceindexs                     r3   concat_padded_sequencesr      s   ( ,0::(J[-1ZZ*KlF

1FAFFFFF,&&&%**Q-'''%**Q-''')))+)))+'*==M{*J 	Z4T:AA*aPS`abdhahSii  "KKZ(MVZVaVaimisist04!\k\1,- LLT[[9$?FFzSTUE'400E 299!U1a:=N=U=UVXZ\^i=jlpq$&7>> "333r2   c                     | j                  d      \  }}}}|d|z  z
  |d|z  z
  |d|z  z   |d|z  z   g}t        j                  |d      S )zDConvert boxes from (cx, cy, w, h) format to (x1, y1, x2, y2) format.re         ?rf   )unbindr.   stack)rV   x_cy_cwhbs         r3   box_cxcywh_to_xyxyr     sV    XXb\NCa
a-3q=C#'MS37]LA;;qb!!r2   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Sam3MLPconfigc                 d   t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        t        j                  |j                        | _        y N)super__init__r   r
   
hidden_actactivation_fnnnLinearry   intermediate_sizefc1fc2Dropouthidden_dropoutdropoutselfr   	__class__s     r3   r   zSam3MLP.__init__  sz    #F$5$5699V//1I1IJ99V55v7I7IJzz&"7"78r2   r>   rX   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   r   )r   r>   s     r3   forwardzSam3MLP.forward  sB    /]3**=9/r2   )	r)   r*   r+   r$   r   r.   r   r   __classcell__r   s   @r3   r   r     s*    9} 9U\\ ell r2   r   modulequerykeyvaluer7   scalingr   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nre         r   r   rf   )ptrainingr   )
rl   r.   matmul	transposer   
functionalsoftmaxr   r   
contiguous)
r   r   r   r   r7   r   r   r   attn_weightsattn_outputs
             r3   eager_attention_forwardr   $  s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r2   c                        e Zd ZdZ fdZ	 ddej                  dej                  dej                  dej                  dz  dee   d	e	ej                  ej                  f   fd
Z
 xZS )Sam3Attentionz`
    Multi-head attention.
    Handles standard [batch_size, seq_len, hidden_size] tensors.
    c                 ^   t         |           || _        |j                  | _        |j                  | _        | j                  |j                  z  | _        | j
                  dz  | _        d| _        t        j                  | j                  | j                        | _
        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y Nr   F)r   r   r   ry   num_attention_headshead_dimr   	is_causalr   r   q_projk_projv_projo_projr   s     r3   r   zSam3Attention.__init__F  s    !--#)#=#= ((F,F,FF}}d*ii 0 0$2B2BCii 0 0$2B2BCii 0 0$2B2BCii 0 0$2B2BCr2   Nr   r   r   r7   r   rX   c                    |j                   d   }|j                   d   }|j                   d   }| j                  |      j                  ||| j                  | j                        j                  dd      }| j                  |      j                  ||| j                  | j                        j                  dd      }| j                  |      j                  ||| j                  | j                        j                  dd      }t        j                  | j                  j                  t              }	t        | j                        r=|;|j                  t        j                   k7  rt        d   }	t"        j%                  d        |	| |||f|d| j&                  | j(                  d|\  }
}|
j+                  ||| j                  | j                  z        j-                         }
| j/                  |
      }
|
|fS )a  
        Args:
            query: [batch_size, query_len, hidden_size]
            key: [batch_size, key_len, hidden_size]
            value: [batch_size, value_len, hidden_size]
            attention_mask: [batch_size, num_heads, query_len, key_len] or broadcastable

        Returns:
            Tuple of (output, attention_weights)
                output: [batch_size, query_len, hidden_size]
                attention_weights: [batch_size, num_heads, query_len, key_len]
        r   r   r   sdpazSam3Attention: falling back to SDPA for relative-position cross-attention because Flash Attention does not support additive bias masks.        r7   r   r   r   )rk   r   viewr   r   r   r   r   r   get_interfacer   _attn_implementationr   r   rj   r.   boolloggerwarning_oncer   r   reshaper   r   )r   r   r   r   r7   r   rw   	query_lenkey_lenattention_interfacer   r   s               r3   r   zSam3Attention.forwardT  s   ( [[^
KKN	))A,E"''
It?W?WY]YfYfgqqrsuvwkk###J9Q9QSWS`S`akklmopqE"''
GT=U=UW[WdWdeoopqstu(?(M(MKK,,.E)

 )5*$$

2 #:&"AH
 %8	
%

 *LLnn
%
 
%
!\ "))*iAYAY\`\i\iAijuuwkk+.L((r2   r   )r)   r*   r+   r,   r   r.   r   r   r   r-   r   r   r   s   @r3   r   r   @  s    
D& /3<)||<) \\<) ||	<)
 t+<) +,<) 
u||U\\)	*<)r2   r   c            	            e Zd ZdZd	dedededef fdZ ej                         de
ej                  ej                  f   fd       Z xZS )
Sam3ViTRotaryEmbeddingz
    Vision Rotary Position Embedding for SAM3, following transformers library standards.
    Supports 2D (axial) rotary embeddings for spatial dimensions.
    r   end_xend_yscalec                 X   t         |           |j                  |j                  z  }|dz  dk7  rt	        d      ||c| _        | _        || _        |j                  | _        || _	        d|j                  t        j                  d|d      d |dz   j                         |z  z  z  }t        j                  ||z  t        j                        }||z  |z  }t        j                  ||d      |z  }	t        j                  ||      j                         }
t        j                  |	|      j                         }t        j                   |
|gd	      }|j#                  d
d	      }| j%                  d|j'                         d       | j%                  d|j)                         d       y )N   r   z/Dimension must be divisible by 4 for axial RoPE      ?rj   floorrounding_modere   rf   r   rope_embeddings_cosF)
persistentrope_embeddings_sin)r   r   ry   r   
ValueErrorr   r   rg   
rope_thetar   r.   rn   floatlongdivoutercatrepeat_interleaveregister_buffercossin)r   r   r   r   r   rg   freqsflattened_indicesx_positionsy_positionsfreqs_xfreqs_yinv_freqr   s                r3   r   zSam3ViTRotaryEmbedding.__init__  sw     F$>$>>7a<NOO!&
DJ ++
v((U\\!S!-D\q-R-X-X-Z]`-`ab!LLejjI(50E9ii 15PSXX++k51779++k5177999gw/R8--aR-82HLLNuU2HLLNuUr2   rX   c                 2    | j                   | j                  fS r   )r   r   r   s    r3   r   zSam3ViTRotaryEmbedding.forward  s     '')A)AAAr2   )r   )r)   r*   r+   r,   r$   intr   r   r.   no_gradr-   r   r   r   r   s   @r3   r   r     sf    
V} VS V VU V. U]]_Bu||U\\9: B Br2   r   c                      | j                   g | j                  dd dd } | j                  d      \  }}t        j                  | |fd      } | j                  d      S )ax  
    pairwise rotation of the hidden dims of the input. Differerent from Llama Half-Tensor Rotation.

    This is an optimized version of the following more explicit implementation:
    ```python
    x_rotated = torch.zeros_like(x, dtype=x.dtype, device=x.device)
    x_rotated[..., ::2] = -x[..., 1::2]
    x_rotated[..., 1::2] = x[..., ::2]
    return x_rotated
    ```
    Nre   r   rf   )	start_dim)r   rk   r   r.   r   flatten)rV   r`   ra   s      r3   rotate_pairwiser     sf     	$$b$!$AXX"XFBbS"I2&A99r9""r2   qkr   r   c                     | j                         }||z  t        |      |z  z   }|j                         }||z  t        |      |z  z   }|j                  |       |j                  |      fS )a  
    Apply rotary position embedding to query and key tensors for self-attention.

    Args:
        q: Query tensor of shape (batch_size, num_windows, seq_len, num_heads, head_dim)
        k: Key tensor of shape (batch_size, num_windows, seq_len, num_heads, head_dim)
        cos: Cosine position embedding of shape (seq_len, head_dim)
        sin: Sine position embedding of shape (seq_len, head_dim)

    Returns:
        Rotated (q, k) tensors
    )r   r   type_as)r   r   r   r   q_embedk_embeds         r3   apply_rotary_pos_emb_2dr    sg    $ ggiG}!9C!?@GggiG}!9C!?@G??1wq111r2   c                        e Zd ZdZdef fdZdej                  deej                  ej                  f   de	e
   defdZ xZS )	Sam3ViTRoPEAttentionz-Self-attention with rotary position encoding.r   c                    t         |           || _        |j                  | _        |j                  | _        | j                  |j                  z  | _        | j
                  dz  | _        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y r   )r   r   r   ry   r   r   r   attention_dropoutr   r   r   r   r   r   r   r   s     r3   r   zSam3ViTRoPEAttention.__init__  s    !--#)#=#= ((F,F,FF}}d*!'!9!9ii 0 0$2B2BCii 0 0$2B2BCii 0 0$2B2BCii 0 0$2B2BCr2   r>   position_embeddingsr   rX   c                    |j                   \  }}}}||z  }||| j                  | j                  f}	 | j                  |      j                  |	 j                  dd      }
 | j                  |      j                  |	 j                  dd      } | j                  |      j                  |	 j                  dd      }|\  }}t        |
|||      \  }
}t        j                  | j                  j                  t              } || |
||fd | j                  sdn| j                  | j                   | j"                  d|\  }}|j%                  |||d      j'                         }| j)                  |      }||fS )Nr   r   )r   r   r   r   re   )rk   r   r   r   r   r   r   r   r  r   r   r   r   r   r   r  r   r   r   r   r   )r   r>   r  r   rw   heightwidth_seq_len	new_shaper   r   r   r   r   r   r   r   s                     r3   r   zSam3ViTRoPEAttention.forward  sy    (5':':$
FE15.$*B*BDMMR	/M*//;EEaK-dkk-(--y9CCAqI/M*//;EEaK&S,UCScJ
s(?(M(MKK,,.E)
 %8	
%

  #}}C$2H2HLLnn
%
 
%
!\ "))*feRHSSUkk+.L((r2   )r)   r*   r+   r,   r$   r   r.   r   r-   r   r   r   r   r   s   @r3   r  r    s[    7D} D )|| ) #5<<#=> ) +,	 )
 
 )r2   r  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )Sam3ViTPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    r   c                 ~   t         |           |j                  |j                  }}|j                  |j
                  }}t        |t              r|n||f}t        |t              r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _	        t        j                  ||||d      | _        y )Nr   r   F)kernel_sizestridebias)r   r   pretrain_image_size
patch_sizenum_channelsry   
isinstancer   
image_sizenum_patchesr   Conv2d
projection)r   r   r  r  r  ry   r  r   s          r3   r   zSam3ViTPatchEmbeddings.__init__  s    !'!;!;V=N=NJ
$*$7$79K9Kk#-j(#CZ*V`Ia
#-j(#CZ*V`Ia
!!}
15*Q-:VW=:XY$$(&))L+:^hotur2   pixel_valuesrX   c                     | j                  |j                  | j                   j                  j                              j	                  d      j                  dd      }|S )Nr   r   )r  toweightrj   r   r   )r   r  
embeddingss      r3   r   zSam3ViTPatchEmbeddings.forward.  sJ    __\__T__5K5K5Q5Q%RS[[\]^hhijlmn
r2   )
r)   r*   r+   r,   r$   r   r.   r   r   r   r   s   @r3   r  r    s1    v} vELL U\\ r2   r  c                        e Zd ZdZdef fdZdej                  dededej                  fdZ		 dd	ej                  d
e
dej                  fdZ xZS )Sam3ViTEmbeddingsz
    Construct the patch embeddings and position embeddings for SAM3 ViT.

    Position embeddings are tiled (not interpolated) when resizing to match different input sizes.
    r   c                 J   t         |           t        |      | _        | j                  j                  }t        j                  t        j                  d||j                              | _
        t        j                  |j                        | _        |j                  | _        y )Nr   )r   r   r  patch_embeddingsr  r   	Parameterr.   randnry   r  r   r   r   r  )r   r   r  r   s      r3   r   zSam3ViTEmbeddings.__init__:  sx     6v >++77#%<<KK;(:(:;$
  zz&"7"78 ++r2   r  r	  r
  rX   c                    t        |j                  d   dz        }t        j                  j	                         s ||k(  r||k(  r|j                  d||z  d      S |j                  d   }|j                  d|||      j                  dddd      }||z  dz   }||z  dz   }|j                  dd||g      ddddd|d|f   }|j                  dddd      j                  d||z  |      S )aG  
        Tile position embeddings to match target spatial dimensions.
        Args:
            position_embeddings: Shape [1, num_pretrain_patches, hidden_size]
            height: Target height in patches
            width: Target width in patches

        Returns:
            Shape [1, height * width, hidden_size]
        r   r   re   r   r   r   N)r   rk   r.   jit
is_tracingr   permutetile)	r   r  r	  r
  pretrain_sizery   	pos_embedrepeat_hrepeat_ws	            r3   _tile_position_embeddingsz+Sam3ViTEmbeddings._tile_position_embeddingsF  s     /55a8C?@ yy##%-6*AmW\F\&..q&5."EE *//3'//=-Q\]eefgijlmopq	]*Q.M)A-NNAq(H#=>q!WfWfuf?TU	  Aq!,44QTTr2   r  interpolate_pos_encodingc                     |j                   dd  \  }}| j                  |      }|| j                  z  }|| j                  z  }| j                  | j                  ||      }||z   }| j                  |      }|S )Nr   )rk   r$  r  r0  r  r   )	r   r  r1  r	  r
  r   height_patcheswidth_patchesr  s	            r3   r   zSam3ViTEmbeddings.forwardd  s    
 %**23/**<8
  4??20"<<$$

  "55
\\*-
r2   F)r)   r*   r+   r,   r$   r   r.   r   r   r0  r   r   r   r   s   @r3   r"  r"  3  s{    
,} 
,U"\\U U 	U
 
UB */ll #' 
	r2   r"  c           	      `   | j                   \  }}}}|||z  z
  |z  }|||z  z
  |z  }t        j                  j                  | ddd|d|f      } ||z   ||z   }	}| j	                  |||z  ||	|z  ||      } | j                  dddddd      j                         j	                  d|||      }
|
||	ffS )a  
    Partition into non-overlapping windows with padding if needed.

    Args:
        hidden_state (`torch.Tensor`):
            Input tokens with [batch_size, height, width, num_channels].
        window_size (`int`):
            Window size.

    Returns:
        `tuple(torch.FloatTensor)` comprising various elements:
        - windows: windows after partition with [batch_size * num_windows, window_size, window_size, num_channels].
        - (padded_height, padded_width): padded height and width before partition
    r   r   r   r   r      re   )rk   r   r   padr   r*  r   )hidden_statewindow_sizerw   r	  r
  r  
pad_height	pad_widthpadded_heightpadded_widthwindowss              r3   window_partitionr@  {  s     /;.@.@+J| 44CJu{22kAI ==$$\Aq!Y:3VWL"(:"5uy7H<M$$M[0+|{?Z\giuL ""1aAq!4??AFFr;XceqrG]L111r2   c                 6   |\  }}|\  }}| j                   d   ||z  |z  |z  z  }| j                  |||z  ||z  ||d      }	|	j                  dddddd      j                         }	|	j                  |||d      }	|	ddd|d|ddf   j                         }	|	S )	aB  
    Window unpartition into original sequences and removing padding.

    Args:
        windows (`torch.Tensor`):
            Input tokens with [batch_size * num_windows, window_size, window_size, num_channels].
        window_size (`int`):
            Window size.
        pad_height_width (`tuple[int]`):
            Padded height and width (padded_height, padded_width).
        height_width (`tuple[int]`):
            Original height and width before padding.

    Returns:
        hidden_state: unpartitioned sequences with [batch_size, height, width, num_channels].
    r   re   r   r   r   r   r7  N)rk   r   r*  r   )
r?  r:  pad_height_widthheight_widthr=  r>  r	  r
  rw   r9  s
             r3   window_unpartitionrD    s    " #3M< MFEq!ml&Bk&QU`&`aJ<<M[0,+2M{\gikL  ''1aAq9DDFL$$ZbQL  7F7FUFA 56AACLr2   c                   X     e Zd Zd fdZdej
                  dej
                  fdZ xZS )Sam3ViTLayerScalerX   c                     t         |           t        j                  |j                  t        j                  |j                        z        | _        y r   )	r   r   r   r%  layer_scale_init_valuer.   onesry   lambda1r   s     r3   r   zSam3ViTLayerScale.__init__  s8    ||F$A$AEJJvOaOaDb$bcr2   r9  c                      || j                   z  S r   )rJ  )r   r9  s     r3   r   zSam3ViTLayerScale.forward  s    dll**r2   )rX   N)r)   r*   r+   r   r.   r   r   r   r   s   @r3   rF  rF    s$    d+ELL +U\\ +r2   rF  c                   t     e Zd ZdZd
dededdf fdZdej                  de	e
   dej                  fd	Z xZS )Sam3ViTLayerzYVision Transformer layer with rotary position embeddings and optional windowed attention.r   r:  rX   Nc                    t         	|           |j                  }|j                  }t	        |t
        t        f      r|n||f}|j                  }t	        |t
        t        f      r|n||f}|d   |d   z  |d   |d   z  f}t        j                  ||j                        | _        |dk(  r|n||f}|j                  |d   z  }t        ||d   |d   |      | _        t        |      | _        t        j                  ||j                        | _        t%        |      | _        t        j(                  |j*                        | _        || _        y )Nr   r   rW   )r   r   r   )r   r   ry   r  r  listr-   r  r   	LayerNormlayer_norm_epslayer_norm1r:  r   
rotary_embr  	attentionlayer_norm2r   mlpr   r   r   )
r   r   r:  ry   r  r  
input_sizerotary_input_sizerotary_scaler   s
            r3   r   zSam3ViTLayer.__init__  s=   ((&&
#-j4-#HZz[eNf
&&
#-j4-#HZz[eNf
 mz!}4jmzRS}6TU
<<9N9NO*5*:Jk@Z)),=a,@@0+A.6G6JR^
 .f5<<9N9NO6?zz&"7"78&r2   r>   r   c                    |}| j                  |      }| j                  dkD  r7|j                  d   |j                  d   }}t        || j                        \  }}| j	                         } | j
                  ||fi |\  }}| j                  dkD  rt        || j                  f      }||z   }|}| j                  |      }| j                  |      }|| j                  |      z   }|S )Nr   r   r   )
rS  r:  rk   r@  rT  rU  rD  rV  rW  r   )	r   r>   r   residualr	  r
  rB  r  r  s	            r3   r   zSam3ViTLayer.forward  s    
 !((7a)//2M4G4G4JEF.>}dN^N^._+M+"oo/)4>>-9LWPVWqa.}d>N>NP`cikpbqrM =0 ((7/ 4<<#>>r2   )r   )r)   r*   r+   r,   r$   r   r   r.   r   r   r   r   r   r   s   @r3   rM  rM    sO    c'} '3 't '0|| +, 
	r2   rM  c                   B     e Zd ZeZdZdZddgZdZdZ	dZ
dZ fdZ xZS )Sam3PreTrainedModelsam3r  imagetextTc                    t         |   |       t        |t              r7t	        j
                  |j                  d| j                  j                         y t        |t              r|j                  |j                  }}|j                  }d|j                  t        j                  d|d      d |dz   j!                         |z  z  z  }t        j                  ||z  t        j"                        }||z  |j$                  z  }t        j&                  ||d      |j$                  z  }t        j(                  ||      j!                         }	t        j(                  ||      j!                         }
t        j*                  |	|
gd	
      }|j-                  dd	
      }t	        j.                  |j0                  |j3                                t	        j.                  |j4                  |j7                                y y )Nr   )meanstdr   r   r   r   r   r   re   rf   r   )r   _init_weightsr  r"  initnormal_r  r   initializer_ranger   r   r   rg   r   r.   rn   r   r   r   r   r   r   r   copy_r   r   r   r   )r   r   r   r   rg   r   r   r   r   r   r   r   r   s               r3   re  z!Sam3PreTrainedModel._init_weights  s}   f%f/0LL33#4;;C`C`a 67!<<5E**C6,,aa1HCSTH1V1\1\1^ad1defE %UU]%** M,u4DK))$5uGTW]WcWccKkk+u5;;=Gkk+u5;;=Gyy'7!3<H11!1<HJJv118<<>BJJv118<<>B 8r2   )r)   r*   r+   r   config_classbase_model_prefixmain_input_nameinput_modalities_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendre  r   r   s   @r3   r^  r^    sA    L$O(N"&C Cr2   r^  c            	            e Zd ZeedZdef fdZdefdZ	e
 ed      edej                  d	ee   defd
                     Z xZS )Sam3ViTModelr>   r?   r   c           
         t         |   |       || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  t        |j                        D cg c])  }t        |||j                  vr|j                  nd      + c}      | _        | j#                          y c c}w )NrO  r   )r:  )r   r   r   r"  r   r   rQ  ry   rR  
layer_norm
ModuleListrangenum_hidden_layersrM  global_attn_indexesr:  layers	post_init)r   r   ir   s      r3   r   zSam3ViTModel.__init__   s     +F3,,v'9'9v?T?TUmm v778 VqPVPjPjGj1C1Cpqr
 	s   >.C	rX   c                 .    | j                   j                  S r   )r   r$  r   s    r3   get_input_embeddingsz!Sam3ViTModel.get_input_embeddings-  s    ///r2   F)tie_last_hidden_statesr  r   c                    | j                  |      }|j                  d   }|j                  d   | j                  j                  z  }|j                  d   | j                  j                  z  }|j                  d   }|j	                  ||||      }| j                  |      }| j                  D ]  } ||fi |} |j	                  |||z  |      }t        |      S )Nr   r   re   )r6   )r   rk   r   r  r   rv  r{  r   )	r   r  r   r>   rw   r	  r
  ry   layers	            r3   r   zSam3ViTModel.forward0  s     5"((+
##B'4;;+A+AA""2&$++*@*@@#))"- &**:vukR6[[ 	;E!-:6:M	; &**:v~{S??r2   )r)   r*   r+   rM  r  _can_record_outputsr$   r   r  r  r   r   r   r.   r   r   r   r   r   r   r   s   @r3   rs  rs    s     &*
} 0&< 0  E2@ll@ +,@ 
	@  3  @r2   rs  c                   d    e Zd ZdZ	 ddededededz  f fdZdej                  d	ej                  d
e
ej                  ej                  f   fdZdej                  d
ej                  fdZ ed      	 ddej                  dej                  ez  dej"                  de	dz  d
e	f
d       Z xZS )Sam3SinePositionEmbeddingz
    This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
    need paper, generalized to work on images.
    Nnum_pos_featstemperature	normalizer   c                     t         |           ||du rt        d      || _        || _        || _        |dt        j                  z  | _        y || _        y )NFz+normalize should be True if scale is passedr   )	r   r   r   r  r  r  mathpir   )r   r  r  r  r   r   s        r3   r   z"Sam3SinePositionEmbedding.__init__R  sY     	e!3JKK*&"$)MQ[
u
r2   rV   yrX   c                    || j                   z  }|| j                   z  }t        j                  | j                  t        j                  |j
                        j                  |j                        }| j                  d|dz  z  | j                  z  z  }|dddf   |z  }|dddf   |z  }t        j                  |dddddf   j                         |dddddf   j                         fd      j                  d      }t        j                  |dddddf   j                         |dddddf   j                         fd      j                  d      }||fS )a  
        Encode 1D coordinate pairs using sine/cosine positional embeddings.

        Args:
            x: 1D tensor of x coordinates (flattened)
            y: 1D tensor of y coordinates (flattened)

        Returns:
            Tuple of (pos_x, pos_y) positional embeddings
        rj   rh   r   Nr   r   rf   )r   r.   rn   r  int64rh   r  rj   r  r   r   r   r   )r   rV   r  x_embedy_embeddim_tpos_xpos_ys           r3   encode_1d_positionsz-Sam3SinePositionEmbedding.encode_1d_positions]  sB    djj.djj.T//u{{188TWWXYX_X_`  Q%1*%58J8J%JK4 5(4 5(U1add7^//15ADqD>3E3E3GHaPXXYZ[U1add7^//15ADqD>3E3E3GHaPXXYZ[e|r2   boxesc           	         |j                  d      dk(  sJ d|j                          t        j                  | j                  t        j
                  |j                        j                  |j                        }| j                  dt        j                  |dd      z  | j                  z  z  }|ddddd	f   | j                  z  }|ddddd
f   | j                  z  }|dddddf   | j                  z  }|dddddf   | j                  z  }|dddddf   |z  }|dddddf   |z  }|dddddf   |z  }	|dddddf   |z  }
t        j                  |ddddd	ddf   j                         |ddddd
ddf   j                         fd      j                  d      }t        j                  |ddddd	ddf   j                         |ddddd
ddf   j                         fd      j                  d      }t        j                  |	ddddd	ddf   j                         |	ddddd
ddf   j                         fd      j                  d      }	t        j                  |
ddddd	ddf   j                         |
ddddd
ddf   j                         fd      j                  d      }
t        j                   |||	|
fd      }|S )a2  
        Encode 4D box coordinates (x, y, w, h) for decoder conditioning using sine/cosine embeddings.

        Args:
            boxes: Box coordinates [batch_size, num_queries, 4] in (x, y, w, h) format

        Returns:
            Position embeddings [batch_size, num_queries, num_pos_feats*4]
        re   r   z4Expected 4D box coordinates (x, y, w, h), got shape r  r   r   r   Nr   r   r   rf   )rl   rk   r.   rn   r  r  rh   r  rj   r  r   r   r   r   r   r   r   )r   r  r  r  r  w_embedh_embedr  r  pos_wpos_hposs               r3   encode_boxesz&Sam3SinePositionEmbedding.encode_boxest  s    zz"~"h&Z[`[f[fZg$hh"T//u{{5<<X[[\a\g\gh  Q5!7)S%SVZVhVh%hi1a.4::-1a.4::-1a.4::-1a.4::-1d
#e+1d
#e+1d
#e+1d
#e+U1aA:.224eAq!$Q$J6G6K6K6MNTUV^^_`aU1aA:.224eAq!$Q$J6G6K6K6MNTUV^^_`aU1aA:.224eAq!$Q$J6G6K6K6MNTUV^^_`aU1aA:.224eAq!$Q$J6G6K6K6MNTUV^^_`aiiue4!<
r2   r   maxsizerk   rh   rj   maskc           
         |2t        j                  |d   |d   |d   f|t         j                        }| j                  |      }|j	                  d      }|j	                  d      }| j
                  rDd}||d d dd d d f   |z   z  | j                  z  }||d d d d dd f   |z   z  | j                  z  }t        j                  | j                  t         j                  |      j                  |      }	| j                  dt        j                  |	dd	
      z  | j                  z  z  }	|d d d d d d d f   |	z  }
|d d d d d d d f   |	z  }t        j                  |
d d d d d d dd df   j                         |
d d d d d d dd df   j                         fd      j                  d      }
t        j                  |d d d d d d dd df   j                         |d d d d d d dd df   j                         fd      j                  d      }t        j                   ||
fd      j#                  dddd      }|S )Nr   r   r   ri   r   gư>re   r  r   r   r   rf   )r.   rp   r   r  cumsumr  r   rn   r  r  r  r   r   r   r   r   r   r*  )r   rk   rh   rj   r  not_maskr  r  rW   r  r  r  r  s                r3   r   z!Sam3SinePositionEmbedding.forward  s    <;;a%(E!H=fTYT^T^_DE::e$//!$//!$>>CBC!3c!9:TZZGGArs!3c!9:TZZGGT//u{{6RUUV[\  Q5!7)S%SVZVhVh%hi1a&.1a&.U1aADqD=1557q!Q1}9M9Q9Q9STZ[\ddefgU1aADqD=1557q!Q1}9M9Q9Q9STZ[\ddefgiiA.66q!QB
r2   )@   i'  FNr   )r)   r*   r+   r,   r   r   r   r   r.   r   r-   r  r  r   Sizerh   strrj   r   r   r   s   @r3   r  r  L  s     qu	= 	=47	=LP	=afimam	=U\\ ell uU\\[`[g[gMgGh .%,, 5<< B )3 #zz s" {{	
 tm 
 4r2   r  c                   d     e Zd Zdededef fdZdej                  dej                  fdZ xZ	S )Sam3FPNLayerin_channelsfpn_dimscale_factorc                 @   t         |           || _        t        j                         | _        |dk(  r| j
                  j                  t        j                  ||dz  dd             | j
                  j                  t        j                                | j
                  j                  t        j                  |dz  |dz  dd             |dz  }n|dk(  r;| j
                  j                  t        j                  ||dz  dd             |dz  }nO|dk(  r|}nG|dk(  r3| j
                  j                  t        j                  dd             |}nt        d| d	      t        j                  ||d
      | _        t        j                  ||dd
      | _        y )Ng      @r   )r  r  r   g       @r   r   zscale_factor=z is not supported yet.r   )r  out_channelsr  r   )r  r  r  padding)r   r   r  r   rw  scale_layersappendConvTranspose2dGELU	MaxPool2dNotImplementedErrorr  proj1proj2)r   r  r  r  intermediate_channelsr   s        r3   r   zSam3FPNLayer.__init__  sp   ( MMO3$$R%7%7[TUEUcdmn%op$$RWWY/$$R%7%7q8H+YZJZhirs%tu$/1$4!S $$R%7%7[TUEUcdmn%op$/1$4!S $/!S $$R\\a%JK$/!%l^CY&Z[[YY+@wdef
YY7VWabc
r2   r>   rX   c                     |j                  | j                  j                  j                        }| j                  D ]
  } ||      } | j                  |      }| j                  |      }|S r   )r  r  r  rj   r  r  )r   r>   r  s      r3   r   zSam3FPNLayer.forward  se    %(():):)@)@A&& 	1E!-0M	1 

=1

=1r2   )
r)   r*   r+   r   r   r   r.   r   r   r   r   s   @r3   r  r    s<    dC d# dU d4U\\ ell r2   r  c                        e Zd Zdef fdZdej                  deeej                  df   eej                  df   f   fdZ xZ	S )Sam3VisionNeckr   c           
      0   t         |           || _        t        |j                  dz  d      | _        t        j                  |j                  D cg c].  }t        |j                  j                  |j                  |      0 c}      | _        y c c}w )Nr   Tr  r  )r  r  r  )r   r   r   r  fpn_hidden_sizeposition_encodingr   rw  scale_factorsr  backbone_configry   
fpn_layers)r   r   r   r   s      r3   r   zSam3VisionNeck.__init__  s    !:I_I_cdIdpt!u --
 $11	   & 6 6 B BFLbLbqv
s   3Br>   rX   .c                     d}d}| j                   D ]G  } ||      }||fz  }| j                  |j                  |j                  |j                        }||fz  }I ||fS )Nr1   )r  r  rk   rh   rj   )r   r>   r'   r(   	fpn_layer
fpn_outputpos_encs          r3   r   zSam3VisionNeck.forward  sz     " 	0I"=1J*.,,Z-=-=z?P?PR\RbRbcG!gZ/!	0 !"777r2   )
r)   r*   r+   r#   r   r.   r   r-   r   r   r   s   @r3   r  r    sS    
/ 
 8U\\ 8eE%,,PSBS<TV[\a\h\hjm\mVn<n6o 8r2   r  zJ
    The vision model from Sam without any head or projection on top.
    )custom_introc            	       z     e Zd ZeZdZdef fdZd Ze	 d	de	j                  dz  dee   deez  fd       Z xZS )
Sam3VisionModelr  r   c                     t         |   |       || _        t        j                  |j
                        | _        t        |      | _        | j                          y r   )
r   r   r   r   from_configr  backboner  neckr|  r   s     r3   r   zSam3VisionModel.__init__   sE     !--f.D.DE"6*	r2   c                 6    | j                   j                         S r   )r  r  r   s    r3   r  z$Sam3VisionModel.get_input_embeddings  s    }}1133r2   Nr   rX   c                    |t        d       | j                  |fi |}|j                  }|j                  d   }|j                  d   | j                  j
                  j                  z  }|j                  d   | j                  j
                  j                  z  }|j                  |||d      j                  dddd      }| j                  |      \  }	}
t        ||	|
|j                  |j                        S )	Nz You have to specify pixel_valuesr   r   re   r   r   r   )r6   r'   r(   r>   r?   )r   r  r6   rk   r   r  r  r   r*  r  r&   r>   r?   )r   r  r   backbone_outputr>   rw   r	  r
  hidden_states_spatialr'   r(   s              r3   r   zSam3VisionModel.forward  s     ?@@'$--??'99 #((+
##B'4;;+F+F+Q+QQ""2&$++*E*E*P*PP - 2 2:vub Q Y YZ[]^`acd e3799=R3S00&+/"7)77&11
 	
r2   r   )r)   r*   r+   r#   rj  rl  r   r  r   r.   r/   r   r   r-   r&   r   r   r   s   @r3   r  r    sm     $L$O/ 4  26
''$.
 +,
 
(	(	
 
r2   r  c                   F     e Zd Zdef fdZdededededee   f
dZ xZ	S )	Sam3GeometryEncoderLayerr   c                    t         |           t        j                  |j                        | _        t        |      | _        t        j                  |j                        | _	        t        |      | _
        t        j                  |j                        | _        t        |      | _        t        j                  |j                        | _        y r   )r   r   r   rQ  ry   rS  r   	self_attnr   r   
cross_attnrV  r   rW  layer_norm3r   s     r3   r   z!Sam3GeometryEncoderLayer.__init__(  s    <<(:(:;&v.zz&..1'/<<(:(:;6?<<(:(:;r2   prompt_featsvision_featsvision_pos_encodingprompt_maskr   c                    |}| j                  |      } | j                  d||||d|\  }}| j                  |      |z   }|}| j                  |      }||z   }	 | j                  d||	|d|\  }}| j                  |      |z   }|}| j                  |      }| j                  |      }| j                  |      |z   }|S )Nr   r   r   r7   r   r   r   r1   rS  r  r   rV  r  r  rW  )
r   r  r  r  r  r   r\  r>   r  r   s
             r3   r   z Sam3GeometryEncoderLayer.forward4  s      ((6)4>> 
]-Xc
gm
q ]3h> ((700*4??fC|f_efq]3h> ((7/]3h>r2   )
r)   r*   r+   r!   r   r   r   r   r   r   r   s   @r3   r  r  '  sK    
<8 
<  $	
  +,r2   r  c                   L    e Zd ZdZdef fdZdej                  dej                  dej                  dej                  dej                  f
d	Zd
 Z		 ddej                  dej                  dej                  de
ej                  df   de
ej                  df   dz  f
dZ xZS )Sam3GeometryEncodera  
    Encoder for geometric prompts (boxes).

    Boxes are encoded using three approaches:
     - Direct projection: linear projection from coordinate space to hidden_size
     - Pooling: pool features from the backbone at the specified location (ROI align for boxes)
     - Position encoding: use position encoding of the box center

    These encodings are combined additively and further processed with transformer layers.
    r   c                 0   t         |           || _        |j                  | _        |j                  | _        t        |j                  dz  d      | _        t        j                  d| j                        | _	        t        j                  d| j                        | _
        t        j                  d| j                        | _        t        j                  | j                  | j                  | j                        | _        t        j                  | j                  dz   | j                        | _        t        j                   | j                        | _        t        j                  | j                  | j                        | _        t        j                   | j                        | _        t        j(                  t+        |j,                        D cg c]  }t/        |       c}      | _        t        j                   | j                        | _        y c c}w )Nr   Tr  r   r   )r   r   r   ry   roi_sizer  r  r   	Embeddinglabel_embed	cls_embedr   boxes_direct_projectr  boxes_pool_projectboxes_pos_enc_projectrQ  vision_layer_norm
final_projprompt_layer_normrw  rx  
num_layersr  r{  output_layer_normr   r   r  r   s      r3   r   zSam3GeometryEncoder.__init__[  s~   !--!:I[I[_`I`lp!q<<4+;+;<a)9)9: %'IIa1A1A$B!"$))D,<,<d>N>NPTP]P]"^%'YYt/?/?!/CTEUEU%V" "$d.>.>!? ))D$4$4d6F6FG!#d.>.>!? mmuU[UfUfOg$h!%=f%E$hi!#d.>.>!? %is   Hcenter_xcenter_yr
  r	  rX   c                     | j                   j                  ||      \  }}t        j                  |||dddf   |dddf   fd      }|S )a  
        Encode box coordinates by combining position-encoded centers with raw width/height.

        Args:
            center_x: 1D tensor of box center x coordinates
            center_y: 1D tensor of box center y coordinates
            width: 1D tensor of box widths
            height: 1D tensor of box heights

        Returns:
            Encoded box coordinates [N, embedding_dim]
        Nr   rf   )r  r  r.   r   )r   r  r  r
  r	  r  r  r  s           r3   _encode_box_coordinatesz+Sam3GeometryEncoder._encode_box_coordinatesu  sN     --AA(HUuiivagagGQO
r2   c                 H   |j                   dd \  }}|j                   dd \  }}| j                  |      }	t        |      }
t        j                  ||||g|
j
                  |
j                        }|j                  ddd      }|
|z  }
|j
                  t        j                  k(  rt        j                  n|j
                  }t        j                  j                  |j                  |      |
j                  |      j                  d      | j                        j                  |j
                        }| j!                  |      }|j                  ||| j"                        }|	|z   }	|j                  d      \  }}}}| j%                  |j'                         |j'                         |j'                         |j'                               }|j                  |||j                   d         }| j)                  |      }|	|z   }	| j+                  |j-                               }||	z   |fS )	z?Encode box prompts. Mask convention: True=valid, False=padding.Nr   r   r  r   r   r   re   )rk   r  r   r.   tensorrj   rh   r   bfloat16float16torchvisionops	roi_alignr  r   r  r  ry   r  r   r  r  r   )r   r  
boxes_maskboxes_labelsvision_featuresrw   	num_boxesr	  r
  boxes_embed
boxes_xyxyr   rj   sampled_featurespooled_projectionr  r  	box_width
box_heightr  pos_projectionr  s                         r3   _encode_boxesz!Sam3GeometryEncoder._encode_boxes  s    %BQ
I'--bc2//6 (.
eVUF;:CSCS\f\m\mn

1a#%'
 "1!6!6%..!HoNcNc&??44u%z}}U';'B'B1'Et}}

"_""
# 	 !334DE-22:y$JZJZ[!$55 5:LL4D1(Iz.. 0 0 2I4E4E4GI[I[I]
 ,,z9gmmB6GH33G<!N2 &&|'8'8':;[(*44r2   Nbox_embeddingsbox_mask
box_labels	img_feats.img_pos_embedsc                    |j                   d   }|d   }||d   nt        j                  |      }|j                  d      j	                  dd      }	|j                  d      j	                  dd      }
|d   }|j                  dddd      }| j                  |      }|j                  dddd      }| j                  ||||      \  }}| j                  j                  j                  d| j                        j                  d      j                  |dd      }t        j                  |d|j                  |j                         }t#        ||||      \  }}| j%                  | j'                  |            }d}|t)        | j*                  ||      }| j,                  D ]  } |||	|
|	      } | j/                  |      }t1        ||
      S )a8  
        Forward pass for encoding geometric prompts.

        Args:
            box_embeddings: Box coordinates in CxCyWH format [batch_size, num_boxes, 4]
            box_mask: Attention mask for boxes [batch_size, num_boxes]
            box_labels: Labels for boxes (positive/negative) [batch_size, num_boxes]
            img_feats: Image features from vision encoder
            img_pos_embeds: Optional position embeddings for image features

        Returns:
            Sam3GeometryEncoderOutput containing encoded geometry features and attention mask.
        r   re   Nr   r   r   r  )r   inputs_embedsr7   )r  r  r  r  )r6   r7   )rk   r.   
zeros_liker   r   r*  r  r  r  r  r   ry   	unsqueezerr   rI  rj   rh   r   r  r  r   r   r{  r  r5   )r   r  r  r  r	  r
  rw   r  vision_pos_embedsvision_feats_flatvision_pos_embeds_flatimg_feats_lastnormalized_img_featsprompt_embedsr  r  cls_maskprompt_attention_maskr  s                      r3   r   zSam3GeometryEncoder.forward  s   * $))!,
 !}2@2LN2.RWRbRbcoRp(003==aC!2!:!:1!=!G!G1!M #2'//1a;#55nE3;;Aq!QG%)%7%7R\^r%s"{ NN))..q$2B2BCMMaPWWXbdfhjk	::j!;3D3D[M_M_`%<]KYbdl%m"{..t}/MN !%"$={{+*%! [[ 	E!*.$:1	M	 ..}=(+&
 	
r2   r   )r)   r*   r+   r,   r!   r   r.   r   r  r  r-   r   r   r   s   @r3   r  r  O  s    	@8 @405EJ\\[`[g[g	&"5T ;?D
D
 ,,D
 LL	D

 s*+D
 ellC/047D
r2   r  c                   T     e Zd ZdZdef fdZ	 ddededededz  d	ee   f
d
Z	 xZ
S )Sam3DetrEncoderLayerz;DETR encoder layer with self-attention and cross-attention.r   c                    t         |           || _        t        j                  |j
                        | _        t        |      | _        t        j                  |j                        | _
        t        |      | _        t        j                  |j
                        | _        t        |      | _        t        j                  |j
                        | _        y r   )r   r   r   r   rQ  ry   rS  r   r  r   r   r  rV  r   rW  r  r   s     r3   r   zSam3DetrEncoderLayer.__init__  s    <<(:(:;&v.zz&..1'/<<(:(:;6?<<(:(:;r2   Nr  r  r  prompt_cross_attn_maskr   c                    |}| j                  |      }||z   } | j                  d|||d|\  }}	| j                  |      |z   }|}| j                  |      } | j                  d||||d|\  }}	| j                  |      |z   }|}| j                  |      }| j                  |      }| j                  |      |z   }|S )a
  
        Forward pass for DETR encoder layer.

        Args:
            vision_feats: Vision features [batch_size, vision_len, hidden_size] (main hidden states)
            prompt_feats: Text prompt features [batch_size, text_len, hidden_size]
            vision_pos_encoding: Position encoding for vision [batch_size, vision_len, hidden_size]
            prompt_cross_attn_mask: Cross-attention mask for prompt features

        Returns:
            Updated vision features [batch_size, vision_len, hidden_size]
        r  r  r1   r  )
r   r  r  r  r  r   r\  r>   hidden_states_with_posr  s
             r3   r   zSam3DetrEncoderLayer.forward  s   *  ((6!.1D!D)4>> 
(&
 	
q ]3h> !((7*4?? 
1	

 
q ]3h> !((7/]3h>r2   r   )r)   r*   r+   r,   r    r   r   r   r   r   r   r   s   @r3   r  r    sW    E<4 <$ 1533 3 $	3
 !'3 +,3r2   r  c                   @    e Zd ZdZeedZdef fdZde	e
j                     de	e
j                     fdZee	 	 	 dde	e
j                     d	e
j                  de	e
j                     dz  d
e
j                  dz  de	eeef      dz  dee   deez  fd              Z xZS )Sam3DetrEncodera  
    DETR-style encoder that processes multi-level vision features with text fusion.

    This encoder processes vision features from multiple levels (e.g., FPN features at different
    resolutions) and fuses them with text prompts through a stack of transformer encoder layers.
    rt  r   c                    t         |   |       || _        |j                  | _        t	        j
                  t        |j                        D cg c]  }t        |       c}      | _	        | j                          y c c}w r   )r   r   r   ry   r   rw  rx  r  r  r{  r|  r  s      r3   r   zSam3DetrEncoder.__init__F  sb     !--mm5QWQbQbKc$da%9&%A$de %es   A>r  r  c                    g }g }g }t        ||      D ]  \  }}|j                  dd \  }}	|j                  ||	f       |j                  d      j	                  dd      }|j                  d      j	                  dd      }|j                  |       |j                  |        t        j                  |d      }t        j                  |d      }t        j                  |t
        j                  |j                        }|||fS )a  
        Prepare multi-level vision features by flattening spatial dimensions and adding level embeddings.

        Args:
            vision_features: List of vision features at different levels [batch_size, channels, height, width]
            vision_pos_embeds: List of position embeddings for each level [batch_size, channels, height, width]

        Returns:
            Tuple containing flattened features, position embeddings, and spatial metadata
        r   Nr   r   rf   r  )
ziprk   r  r   r   r.   r   r  r   rh   )
r   r  r  features_flattenedr;   r=   featuresr-  r	  r
  s
             r3   _prepare_multilevel_featuresz,Sam3DetrEncoder._prepare_multilevel_featuresO  s     !#&8I#J 		3Hi$NN23/MFE!!65/2  ''*44Q:H!))!,66q!<I%%h/ ''	2		3 #YY'9qA$yy)=1EnEJJOaOhOhi  
 	
r2   Nr<   	text_maskspatial_sizesr   rX   c                    |d   j                         dk(  r|d   j                  d   n|d   j                  d   }|nt        |      D ]`  \  }\  }	}
||   j                  |	|
|d      j	                  dddd      ||<   ||   j                  |	|
|d      j	                  dddd      ||<   b | j                  ||      \  }}}d}|t        | j                  |||      }|}| j                  D ]  } ||f|||d	|} t        ||||
      S )a)  
        Forward pass for the DETR encoder.

        Args:
            vision_features: List of vision features at different levels
            text_features: Text prompt features [batch_size, seq_len, hidden_size]
            vision_pos_embeds: Optional list of position embeddings for each level
            text_mask: Optional text padding mask [batch_size, seq_len]
            spatial_sizes: Optional list of (height, width) tuples for reshaping

        Returns:
            Sam3DETREncoderOutput containing encoded features and metadata.
        r   r   r   Nre   r   r   r   r  r7   rP   )r  r  r  )r6   r;   r<   r=   )
rg   rk   	enumerater   r*  r$  r   r   r{  r:   )r   r  r<   r  r%  r&  r   rw   r}  r	  r
  r"  r;   r=   r  r>   r  s                    r3   r   zSam3DetrEncoder.forwardy  s~   0 5DA4F4J4J4LPQ4Q_Q'--a0WfghWiWoWopqWr
 $&/&> w"?FE%4Q%7%?%?z[]%^%f%fghjkmnpq%r"'8';'C'CFES]_a'b'j'jklnoqrtu'v!!$w --o?PQ		
  "& %>{{0(&3	&" +[[ 	E!*$8'=	
 M	 %+!5')	
 	
r2   )NNN)r)   r*   r+   r,   r  r   r  r    r   rP  r.   r   r$  r   r   r-   r   r   r   r:   r   r   r   s   @r3   r  r  9  s     .#
4 (
ell+(
  -(
T  
 8<)-6:=
ell+=
 ||=
  -4	=

 <<$&=
 E#s(O,t3=
 +,=
 
&	&=
   =
r2   r  c            	       n     e Zd ZdZd
dedededef fdZdej                  dej                  fd	Z xZ	S )Sam3DecoderMLPz/Simple 2 or 3-layer MLP for decoder components.	input_dim
hidden_dim
output_dimr  c                 p   t         |           |dk(  r>t        j                  ||      | _        t        j                  ||      | _        d | _        y |dk(  rRt        j                  ||      | _        t        j                  ||      | _        t        j                  ||      | _        y t        d|       )Nr   r   z"Only 2 or 3 layers supported, got )r   r   r   r   layer1layer2layer3r   )r   r,  r-  r.  r  r   s        r3   r   zSam3DecoderMLP.__init__  s    ?))Iz:DK))J
;DKDK1_))Iz:DK))J
;DK))J
;DKA*NOOr2   rV   rX   c                     t        j                  | j                  |            }| j                  7t        j                  | j	                  |            }| j                  |      }|S | j	                  |      }|S r   )Frelur0  r2  r1  )r   rV   s     r3   r   zSam3DecoderMLP.forward  s`    FF4;;q>";;"t{{1~&AAA  AAr2   )r   )
r)   r*   r+   r,   r   r   r.   r   r   r   r   s   @r3   r+  r+    sH    9P# P3 PC PUX P %,, r2   r+  c                       e Zd ZdZdef fdZ	 	 ddej                  dej                  dej                  dej                  d	ej                  d
ej                  dz  dej                  dz  dee	   dej                  fdZ
 xZS )Sam3DetrDecoderLayerzYDETR decoder layer with self-attention, text cross-attention, and vision cross-attention.r   c                    t         |           || _        t        |      | _        t        j                  |j                        | _        t        j                  |j                        | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                        | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                        | _        t%        |      | _        t        j                  |j                        | _        t        j                  |j                        | _        y r   )r   r   r   r   r  r   r   r   self_attn_dropoutrQ  ry   self_attn_layer_normtext_cross_attntext_cross_attn_dropouttext_cross_attn_layer_normvision_cross_attnvision_cross_attn_dropoutvision_cross_attn_layer_normr   rW  mlp_layer_normmlp_dropoutr   s     r3   r   zSam3DetrDecoderLayer.__init__  s    &v.!#FNN!;$&LL1C1C$D!,V4')zz&..'A$*,,,v7I7I*J'!.v!6)+FNN)C&,.LL9K9K,L)6? ll6+=+=>::fnn5r2   Nr>   	query_posr<   r  r  text_cross_attn_maskvision_cross_attn_maskr   rX   c                 V   t        j                  |ddd      }|}	||z   }
 | j                  d|
|
|dd|\  }}|	| j                  |      z   }| j	                  |      }|}	||z   }
 | j
                  d|
|||d|\  }}|	| j                  |      z   }| j                  |      }|}	||z   }
||z   } | j                  d|
|||d|\  }}|	| j                  |      z   }| j                  |      }|}	| j                  |      }|	| j                  |      z   }| j                  |      }|S )a  
        Forward pass for decoder layer.

        Args:
            hidden_states: Query features [batch_size, num_queries + 1, hidden_size] (includes presence token at position 0)
            query_pos: Query position embeddings [batch_size, num_queries, hidden_size]
            text_features: Text features [batch_size, seq_len, hidden_size]
            vision_features: Vision features [batch_size, height*width, hidden_size]
            vision_pos_encoding: Vision position encoding [batch_size, height*width, hidden_size]
            text_cross_attn_mask: Text cross-attention mask
            vision_cross_attn_mask: Vision cross-attention mask, already expanded for presence token

        Returns:
            Updated hidden states (including presence token at position 0)
        r   r   r   r   constantr   moder   Nr  r1   )r4  r8  r  r9  r:  r;  r<  r=  r>  r?  r@  rW  rB  rA  )r   r>   rC  r<   r  r  rD  rE  r   r\  query_with_posr   r  key_with_poss                 r3   r   zSam3DetrDecoderLayer.forward  s   6 EE)\
!L	 !&2' 
 	

 
Q !4#9#9+#FF11-@ !&2--- 
 /	

 
Q !4#?#?#LL77F !&2&)<</// 
 !1	

 
Q !4#A#A+#NN99-H !/ 4#3#3M#BB++M:r2   NN)r)   r*   r+   r,   r   r   r.   r   r   r   r   r   r   s   @r3   r7  r7    s    c64 64 596:L||L <<L ||	L
 L #\\L $llT1L !&t 3L +,L 
Lr2   r7  c                       e Zd ZdZeedZdef fdZ e	d      de
j                  de
j                  d	e
j                  d
e
j                  dee
j                  e
j                  f   f
d       Zde
j                  dee
j                  e
j                  f   de
j                  fdZee	 	 dde
j                  de
j                  de
j                  de
j                  dz  de
j                  dz  dee   deez  fd              Z xZS )Sam3DetrDecodera  
    DETR-style decoder with box refinement and presence token.

    Simplified version that assumes:
    - Box refinement is always enabled
    - Intermediate outputs are always returned
    - BoxRPB (relative position bias) with log-scale encoding
    - Presence token is used
    rt  r   c                 t   t         |   |       || _        |j                  | _        t	        j
                  t        |j                        D cg c]  }t        |       c}      | _	        t	        j                  |j                        | _        t        |j                  |j                  dd      | _        t	        j                  |j                  |j                        | _        t	        j                  |j                  d      | _        t	        j                  d|j                        | _        t        |j                  |j                  dd      | _        t	        j                  |j                        | _        d| _        t        d|j                  z  |j                  |j                  d      | _        t        d|j                  |j.                  d      | _        t        d|j                  |j.                  d      | _        t5        |j                  dz  d      | _        | j9                          y c c}w )Nr   r   r   g      $@r   Fr  )r   r   r   ry   r   rw  rx  r  r7  r{  rQ  r  r+  box_headr  num_queriesquery_embedreference_pointspresence_tokenpresence_headpresence_layer_normclamp_presence_logit_max_valref_point_headr   box_rpb_embed_xbox_rpb_embed_yr  r  r|  r  s      r3   r   zSam3DetrDecoder.__init__J  s    	 !--mm5QWQbQbKc$da%9&%A$de!#f.@.@!A&v'9'96;M;MqRST<<(:(:F<N<NO "V-?-? C ll1f.@.@A+F,>,>@R@RTUWXY#%<<0B0B#C ,0),Q1C1C-CVEWEWY_YkYkmno-a1C1CVE_E_abc-a1C1CVE_E_abc!:I[I[_`I`lq!r+ %es   H5r   r  r	  r
  rj   rh   rX   c                 z    t        j                  d|||      |z  }t        j                  d|||      |z  }||fS )z%Generate normalized coordinate grids.r   ri   )r.   rn   )r   r	  r
  rj   rh   coords_hcoords_ws          r3   _get_coordszSam3DetrDecoder._get_coordsi  sA    
 <<6&FO<<5uEM!!r2   rD   spatial_shapec                     |\  }}t        |      }|j                  \  }}}| j                  |||j                  |j                        \  }	}
|	j                  ddd      |j                  ddd      dddddddf   z
  }|j                  ||dd      }|
j                  ddd      |j                  ddd      dddddddf   z
  }|j                  ||dd      }|d	z  }t        j                  |      t        j                  t        j                  |      d
z         z  t        j                  d	      z  }|d	z  }t        j                  |      t        j                  t        j                  |      d
z         z  t        j                  d	      z  }| j                  |      }| j                  |      }|j                  d      |j                  d      z   }|j                  dd      }|j!                  dddd      j#                         }|S )a  
        Compute box relative position bias (RPB) matrix using log-scale encoding.
        RPB helps the decoder attend to relevant spatial locations based on predicted box positions.

        Args:
            reference_boxes: Reference boxes [batch_size, num_queries, 4] in sigmoid space
            spatial_shape: (height, width) of the vision features as tensors

        Returns:
            RPB matrix [batch_size, num_heads, num_queries, height*width]
        r  r   re   r   Nr   r   r      r   )r   rk   r_  rj   rh   r   r   r.   signlog2absr  rZ  r[  r  r   r*  r   )r   rD   r`  r	  r
  r  rw   rR  r  r]  r^  deltas_ydeltas_xdeltas_x_logdeltas_y_log
rpb_matrixs                   r3   _get_rpb_matrixzSam3DetrDecoder._get_rpb_matrixr  s
    &'8
%/%5%5"
K "--E!6!6?U?U . 
(
 ==B*Z-?-?Aq-I!QPQRSTUPU+-VV==["a@==B*Z-?-?Aq-I!QPQRSTUPU+-VV==["a@  !|zz,/%**UYY|=TWZ=Z2[[^b^g^ghi^jj!|zz,/%**UYY|=TWZ=Z2[[^b^g^ghi^jj ''5''5 ''*X-?-?.
 

  ''1-
''1a3>>@
r2   Nr  r<   r  r%  r=   r   c                    |j                   d   }| j                  j                  j                  d      j	                  |dd      }| j
                  j                  j                  d      j	                  |dd      }	|	j                         }	| j                  j                  j                  d      j	                  |dd      }
t        j                  |
|gd      }d}|t        | j                  |||      }g }|	g}g }| j                  D ]  }|	j                  d      }| j                  j                  |dddddddf         }| j                  |      }d}|G|j                   d   dk(  r5|d   |d	   f}| j!                  |	|      }t#        j$                  |d
dd      } ||f||||||d|}|ddddf   }t'        |	      }| j)                  | j+                  |            }||z   j                         }|j-                         }	|j/                  | j+                  |             |j/                  |       |ddddf   }| j1                  | j3                  |            j5                  d      }|j7                  | j8                   | j8                        }|j/                  |        t        j:                  |      }t        j:                  |dd       }t        j:                  |      }t=        |||      S )a@  
        Forward pass for the DETR decoder.

        Args:
            vision_features: Vision features [batch_size, height*width, hidden_size]
            text_features: Text features [batch_size, seq_len, hidden_size]
            vision_pos_encoding: Vision position encoding [batch_size, height*width, hidden_size]
            text_mask: Text padding mask [batch_size, seq_len] where True=valid, False=padding
            spatial_shapes: Spatial shapes [num_levels, 2]

        Returns:
            Sam3DETRDecoderOutput containing decoder outputs from all layers.
        r   re   r   rf   Nr(  r   )r   r   )r   r   rG  rH  rI  )rC  r<   r  r  rD  rE  rZ   )rC   rD   rE   )rk   rS  r  r  rr   rT  sigmoidrU  r.   r   r   r   r{  r  r  rY  rk  r4  r8  rb   rQ  r  detachr  rV  rW  squeezer^   rX  r   rB   )r   r  r<   r  r%  r=   r   rw   query_embedsrD   rU  r>   rD  intermediate_outputsintermediate_boxesintermediate_presence_logitsr  reference_points_inputquery_sine_embedrC  rE  r`  rj  query_hidden_statesreference_boxes_before_sigmoiddelta_boxesnew_reference_boxespresence_hiddenrE   s                                r3   r   zSam3DetrDecoder.forward  s3   0 %**1-
''..88;BB:rSUV//66@@CJJ:WY[]^)113,,33==a@GG
TVXZ[ 		><"@aH# #<{{+(&3	$   "-.')$[[ +	AE%4%>%>q%A"#55BBCYZ[]^`acdZdCef++,<=I &*")n.B.B1.E.J!/!5~d7K L!11/=Q
)*z<j`a)b&!	#+ /$7%9'=	 	M #012"6 .=_-M*--(>(>?R(STK#.1O#O"X"X"Z188:O ''(>(>?R(ST%%&9: ,ArrE2O"001I1I/1Z[ccdfgO-33666D<]<] 4 O )//@W+	A\  %{{+?@"[[);CR)@A',{{3O'P$$';.8
 	
r2   rM  )r)   r*   r+   r,   r7  r   r  r   r   r   r.   r   rj   rh   r-   r_  rk  r   r   r   r   rB   r   r   r   s   @r3   rO  rO  :  s`    .#
%> )3"ll"+0<<"@E"UZUaUa"	u||U\\)	*" 4"-$||-<A%,,PUP\P\B\<]-	-^   *..2c
c
 ||c
 #\\	c

 <<$&c
 t+c
 +,c
 
&	&c
   c
r2   rO  c            	            e Zd ZdZdef fdZdej                  dej                  dz  dej                  fdZ	 dd	ej                  dej                  dej                  dz  dej                  fd
Z	 xZ
S )Sam3DotProductScoringz
    Computes classification scores by computing dot product between projected decoder queries and pooled text features.
    This is used to determine confidence/presence scores for each query.
    r   c                 <   t         |           || _        |j                  j                  }|j                  j                  }t        ||j                  j                  |d      | _        t        j                  |j                  j                        | _        t        j                  |      | _        t        j                  ||      | _        t        j                  ||      | _        t#        dt%        j&                  |      z        | _        d| _        d| _        y )Nr   )r,  r-  r.  r  r   Tg      (@)r   r   r   detr_decoder_configry   r+  r   text_mlpr   r   r   text_mlp_dropoutrQ  text_mlp_out_normr   	text_proj
query_projr   npsqrtr   clamp_logitsclamp_max_val)r   r   ry   projection_dimr   s       r3   r   zSam3DotProductScoring.__init__  s    00<<33??&!11CC"	
 !#

6+E+E+M+M N!#k!: ;?))K@ 3!889
 !!r2   r<   r%  NrX   c                     ||j                  d      S |j                  |j                        j                  d      }|j	                  d      j                  d      }||z  j	                  d      |z  }|S )a<  
        Mean pool text features, accounting for padding.

        Args:
            text_features: [batch_size, seq_len, hidden_size]
            text_mask: [batch_size, seq_len] where True indicates valid tokens, False indicates padding

        Returns:
            pooled_text: [batch_size, hidden_size]
        r   rf   re   r   r]   )rc  r  rj   r  rm   r^   )r   r<   r%  is_valid	num_validpooled_texts         r3   _pool_text_featuresz)Sam3DotProductScoring._pool_text_features)  s      %%!%,,<< 3 34>>rB LLQL'--#-6	 %x/444;iGr2   rN   c                    |}| j                  |      }| j                  |      }||z   }| j                  |      }| j                  ||      }| j	                  |      }| j                  |      }|j                  d      }t        j                  ||j                  d            }|| j                  z  }| j                  r(|j                  | j                   | j                        }|S )a  
        Compute classification scores via dot product.

        Args:
            decoder_hidden_states: [num_layers, batch_size, num_queries, hidden_size]
            text_features: [batch_size, seq_len, hidden_size]
            text_mask: [batch_size, seq_len] where True=valid, False=padding

        Returns:
            scores: [num_layers, batch_size, num_queries, 1]
        re   r   rZ   )r  r  r  r  r  r  r  r.   r   r   r  r^   r  )	r   rN   r<   r%  orig_text_featuresr  	proj_textproj_queriesscoress	            r3   r   zSam3DotProductScoring.forwardB  s    " +m4--m<%(::..}=..}iHNN;/	'<=''+	lI,?,?,BC$**$\\t'9'9&9t?Q?Q\RFr2   r   )r)   r*   r+   r,   r   r   r.   r   r  r   r   r   s   @r3   r|  r|  	  s    
"z "4 %,,Y]J] bgbnbn : *.	"$||" ||" <<$&	"
 
"r2   r|  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )Sam3MaskEmbedderzh
    MLP that embeds object queries for mask prediction.
    Similar to MaskFormer's mask embedder.
    r   c                 ,   t         |           || _        |j                  }t	        j
                  t	        j                  ||      t	        j                  ||      t	        j                  ||      g      | _        t	        j                         | _	        y r   )
r   r   r   ry   r   rw  r   r{  ReLU
activationr   r   ry   r   s      r3   r   zSam3MaskEmbedder.__init__m  sn    ((mm		+{3		+{3		+{3
 '')r2   queriesrX   c                     |}t        | j                        D ]:  \  }} ||      }|t        | j                        dz
  k  s*| j                  |      }< |S )z
        Args:
            queries: Query embeddings [batch_size, num_queries, hidden_size]

        Returns:
            Mask embeddings [batch_size, num_queries, hidden_size]
        r   )r)  r{  lenr  )r   r  r>   r}  r  s        r3   r   zSam3MaskEmbedder.forward{  s[      !$++. 	?HAu!-0M3t{{#a'' $ >	? r2   )
r)   r*   r+   r,   r"   r   r.   r   r   r   r   s   @r3   r  r  g  s0    
$4 $u||  r2   r  c                   f     e Zd ZdZdef fdZdeej                     dej                  fdZ	 xZ
S )Sam3PixelDecoderz
    Feature Pyramid Network (FPN) decoder that generates pixel-level features.
    Inspired by MaskFormer's pixel decoder.
    r   c                    t         |           || _        |j                  }|j                  }t        j                  t        |      D cg c]  }t        j                  ||ddd       c}      | _	        t        j                  t        |      D cg c]  }t        j                  d|       c}      | _        || _        y c c}w c c}w )Nr   r   )r  r  r  rb  )r   r   r   ry   num_upsampling_stagesr   rw  rx  r  conv_layers	GroupNormnormsr  )r   r   ry   r  r  r   s        r3   r   zSam3PixelDecoder.__init__  s    (( & < < == 45 		+{!UVW
 ]]%PeJf#gQBLLK$@#gh
'
 $hs   !CC
backbone_featuresrX   c                     |d   }t        t        |dd             D ]n  \  }}t        j                  ||j                  dd d      }||z   } | j
                  |   |      } | j                  |   |      }t        j                  |      }p |S )aA  
        Args:
            backbone_features: List of backbone features [batch_size, hidden_size, H_i, W_i]
                              from low to high resolution (assumes already projected to hidden_size)

        Returns:
            Pixel embeddings [batch_size, hidden_size, H, W] at the finest resolution
        re   Nr   nearest)rl   rJ  )r)  reversedr4  interpolaterk   r  r  r5  )r   r  prev_fpn	layer_idxbackbone_feats        r3   r   zSam3PixelDecoder.forward  s     %R((1(;LSb;Q2R(S 
	($I}}}XM4G4G4LS\]H  -/H 3t''	28<H,tzz),X6Hvvh'H
	( r2   )r)   r*   r+   r,   r"   r   rP  r.   r   r   r   r   s   @r3   r  r    s5    
(4 ("ell);  r2   r  c                   F    e Zd ZdZdeiZdef fdZee		 	 dde
j                  dee
j                     de
j                  d	e
j                  dz  d
e
j                  dz  dee   deez  fd              Zdee
j                     de
j                  de
j                  fdZ xZS )Sam3MaskDecoderz
    Mask decoder that combines object queries with pixel-level features to predict instance masks.
    Also produces a semantic segmentation output and supports cross-attention to prompts.
    r?   r   c                 
   t         |   |       || _        |j                  }t	        |      | _        t        |      | _        t        j                  | j
                  j                  |d      | _        t        j                  | j
                  j                  dd      | _        t        |      | _        t        j                  |      | _        t        j"                  |j$                        | _        | j)                          y )Nr   )r  )r   r   r   ry   r  pixel_decoderr  mask_embedderr   r  r  instance_projectionsemantic_projectionr   prompt_cross_attnrQ  prompt_cross_attn_normr   r   prompt_cross_attn_dropoutr|  r  s      r3   r   zSam3MaskDecoder.__init__  s     (( .f5 .f5 $&99T-?-?-L-Lkgh#i  $&99T-?-?-L-La]^#_ !.v!6&(ll;&?#)+FNN)C&r2   Ndecoder_queriesr  rP   prompt_featuresr  r   rX   c                    |^|}| j                  |      }d}	|t        | j                  |||      }	 | j                  d||||	d|\  }
}|| j	                  |
      z   }| j                  ||      }| j                  |      }| j                  |      }t        j                  d||      }| j                  |      }t        ||      S )aZ  
        Args:
            decoder_queries: Decoder output queries [batch_size, num_queries, hidden_size]
            backbone_features: List of backbone features to process through FPN
            encoder_hidden_states: Encoder outputs [batch_size, seq_len, hidden_size]
            prompt_features: Prompt features (text + geometry) for cross-attention [batch_size, prompt_len, hidden_size]
            prompt_mask: Padding mask [batch_size, prompt_len] where True=valid, False=padding

        Returns:
            Sam3MaskDecoderOutput containing predicted masks and semantic segmentation.
        N)r   r  rP   r7   r  )r  rP   zbqc,bchw->bqhw)rH   rI   r1   )r  r   r   r  r  _embed_pixelsr  r  r.   einsumr  rG   )r   r  r  rP   r  r  r   r\  normed_hidden_statescross_attn_maskr   r  pixel_embedinstance_embedsmask_embeddingsrH   rI   s                    r3   r   zSam3MaskDecoder.forward  s   , &,H#'#>#>?T#U "O&";;;"6*9#.	# 4T33 *#%.	
 NK %-t/M/Mk/Z$Z! ((/"7 ) 
 22;?,,_=\\"2O_U
 //<$!%
 	
r2   c                 n   |D cg c]  }|j                          }}|d   j                  d   |d   j                  d   z  }|ddd|ddf   }|j                  \  }}}	|d   j                  dd \  }
}|j                  dd      j                  ||	|
|      }||d<   | j	                  |      }|S c c}w )a  
        Embed pixels by combining backbone FPN features with encoder vision features.
        The encoder vision features replace the finest-resolution backbone feature.

        Args:
            backbone_features: List of backbone features [batch_size, C, H_i, W_i]
            encoder_hidden_states: Encoder outputs [batch_size, seq_len, hidden_size]

        Returns:
            Pixel embeddings [batch_size, hidden_size, H, W]
        re   r   Nr   r   )clonerk   r   r   r  )r   r  rP   featbackbone_visual_featsspatial_dimencoder_visual_embedrw   r  ry   r	  r
  r  s                r3   r  zSam3MaskDecoder._embed_pixels  s      ;L L$ L L (+11"58I"8M8S8STV8WW4Qa5GH%9%?%?"
A{)"-33BC83==aCKKJXcekmrs %9b! (()>? !Ms   B2rM  )r)   r*   r+   r,   r   r  r"   r   r   r   r.   r   rP  r   r   r-   rG   r   r  r   r   s   @r3   r  r    s     	m4 .   04+/<
<
  -<
  %||	<

 ,<
 \\D(<
 +,<
 
&	&<
   <
|-  %|| 
	r2   r  c                       e Zd ZddgZddiZddgZdef fdZee		 dd
e
j                  de
j                  d	z  dee   deez  fd              Ze	de
j$                  dee   defd       Zee		 	 	 	 	 	 	 dde
j$                  d	z  ded	z  d
e
j                  d	z  de
j                  d	z  de
j$                  d	z  de
j$                  d	z  de
j                  d	z  dee   defd              Z xZS )	Sam3Modelr`  ra  zdetector_model.(.+)z\1z^tracker_model.z^tracker_neck.r   c                    t        |d      r5|j                  )|j                  }t        |t              rt	        di |}|}t
        |   |       t        |j                        | _	        t        |j                        | _        |j                  j                  | _        t        j                  |j                  j                   |j"                  j                         | _        |j&                  |j(                  _        |j&                  |j"                  _        |j&                  |j*                  _        |j&                  |j,                  _        t/        |j(                        | _        t3        |j"                        | _        t7        |j*                        | _        t;        |j,                        | _        t?        |      | _         | jC                          y )Ndetector_configr1   )"hasattrr  r  dictr   r   r   r  vision_configvision_encoderr   text_configtext_encoder
vocab_sizer   r   ry   detr_encoder_configtext_projectionr   geometry_encoder_configr~  mask_decoder_configr  geometry_encoderr  detr_encoderrO  detr_decoderr  mask_decoderr|  dot_product_scoringr|  )r   r   r  r   s      r3   r   zSam3Model.__init__J  sk   6,-&2H2H2T$44O/40","?"?$F -f.B.BC78J8JK ,,77  "yy););)G)GIcIcIoIop ?E>Y>Y&&;:@:U:U""7:@:U:U""7:@:U:U""7 3F4R4R S+F,F,FG+F,F,FG+F,F,FG $9#@ r2   N	input_idsr7   r   rX   c                 v     | j                   d||dd|}|j                  }| j                  |      |_        |S )a  
        Example:

        ```python
        >>> from transformers import Sam3Model, Sam3Processor
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> model = Sam3Model.from_pretrained("facebook/sam3")
        >>> processor = Sam3Processor.from_pretrained("facebook/sam3")

        >>> # Pre-compute text embeddings
        >>> text_inputs = processor(text="cat", return_tensors="pt")
        >>> text_embeds = model.get_text_features(**text_inputs).pooler_output

        >>> # Reuse text embeddings for multiple images
        >>> url = "http://images.cocodataset.org/val2017/000000077595.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> img_inputs = processor(images=image, return_tensors="pt")
        >>> outputs = model(pixel_values=img_inputs.pixel_values, text_embeds=text_embeds)
        ```
        Tr  r7   return_dictr1   )r  r6   r  pooler_output)r   r  r7   r   text_outputsr6   s         r3   get_text_featureszSam3Model.get_text_featuresj  sS    @ )t(( 
D
TZ
 )::%)%9%9:K%L"r2   r  c                 ,     | j                   |fi |}|S )a  
        Example:

        ```python
        >>> from transformers import Sam3Model, Sam3Processor
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> model = Sam3Model.from_pretrained("facebook/sam3")
        >>> processor = Sam3Processor.from_pretrained("facebook/sam3")

        >>> # Pre-compute vision embeddings
        >>> url = "http://images.cocodataset.org/val2017/000000077595.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> img_inputs = processor(images=image, return_tensors="pt")
        >>> vision_embeds = model.get_vision_features(pixel_values=img_inputs.pixel_values)

        >>> # Reuse vision embeddings for multiple text prompts
        >>> text_inputs = processor(text="cat", return_tensors="pt")
        >>> outputs = model(vision_embeds=vision_embeds, input_ids=text_inputs.input_ids)
        ```
        )r  )r   r  r   vision_outputss       r3   get_vision_featureszSam3Model.get_vision_features  s!    < -,,\DVDr2   vision_embedstext_embedsinput_boxesinput_boxes_labelsc                 
   |du |du k(  rt        d      |du |du k(  rt        d      ||j                  d   }	|j                  }
n5|j                  d   j                  d   }	|j                  d   j                  }
| | j                  |fi |}n|}|j                  dd }|j
                  dd }|| j                  ||d      j                  }n|}||j                         nd}|duxr |j                         dkD  }d}d}|r9||j                         dkD  r|}||n't        j                  |d   t        j                  	      }||d
k7  n3t        j                  |	|j                  d   t        j                  |
      }t        j                  |d
k(  d|      }nrt        j                  |	dd|j                   |
      }t        j                  |	dt        j                  |
      }t        j                  |	dt        j                  |
      }| j#                  |||||      }|j$                  }|j&                  }|c|j                  d   dk(  r2|j                  d   dkD  r |j)                  |j                  d   dd      }t        j*                  ||gd      }|C|j                  d   dk(  r1|j                  d   dkD  r|j)                  |j                  d   d      }||t        j*                  ||gd      }n|Nt        j                  |	|j                  d   t        j                  |
      }t        j*                  ||gd      }nW|Nt        j                  |	|j                  d   t        j                  |
      }t        j*                  ||gd      }nd}n|}|} | j,                  d|d   g||d   g|d|} | j.                  d|j$                  |j0                  |j2                  ||j4                  d|}| j.                  j7                  |j8                        }t;        |j<                        }||z   j?                         }tA        |      } | jC                  |j8                  |j0                  |      jE                  d      }!|!d   }"| d   }#|j8                  d   }$|jF                  d   }% | jH                  d|$tK        |      |j$                  ||d|}&tM        |&jN                  |#|"|%|&jP                  |jR                  |j<                  |jR                  |jR                  |jT                  |jT                  |jT                  |&jT                        S )a  
        vision_embeds (`Sam3VisionEncoderOutput`, *optional*):
            Pre-computed vision embeddings. Can be used to easily reuse vision embeddings. If provided, `pixel_values`
            should not be passed. Mutually exclusive with `pixel_values`.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Pre-computed text embeddings. Can be used to easily reuse text embeddings. If provided, `input_ids`
            should not be passed. Mutually exclusive with `input_ids`.
        input_boxes (`torch.FloatTensor` of shape `(batch_size, num_boxes, 4)`, *optional*):
            Normalized box coordinates in [0, 1] range, in (cx, cy, w, h) format.
        input_boxes_labels (`torch.LongTensor` of shape `(batch_size, num_boxes)`, *optional*):
            Labels for boxes: 1 (positive), 0 (negative).

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoModel, AutoProcessor

        >>> model = AutoModel.from_pretrained("facebook/sam3")
        >>> processor = AutoProcessor.from_pretrained("facebook/sam3")

        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-car.png"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read())).convert("RGB")
        >>> text = "car"
        >>> inputs = processor(images=image, text=text, return_tensors="pt")

        >>> # Get segmentation output
        >>> outputs = model(**inputs)
        >>> pred_masks = outputs.pred_masks
        >>> pred_boxes = outputs.pred_boxes
        ```
        Nz=You must specify exactly one of pixel_values or vision_embedsz8You must specify exactly one of input_ids or text_embedsr   re   Tr  ).r   r   ir   r  r   )r  r  r  r	  r
  rf   )r  r<   r  r%  )r  r<   r  r%  r=   )rN   r<   r%  )r  r  rP   r  r  )rH   rL   rM   rE   rI   rN   rO   rP   rQ   rR   rS   rT   rU   r1   )+r   rk   rh   r'   r  r(   r  r  r   numelr.   	ones_liker   rI  whererp   rj   r  r6   r7   ro   r   r  r  r<   r;   r=   rQ  rC   rb   rD   rm  r   r  ro  rE   r  rP  rK   rH   rI   r>   r?   )'r   r  r  r  r7   r  r  r  r   rw   rh   r  r'   r(   r<   r%  has_geometry_promptsgeometry_prompt_featuresgeometry_prompt_maskr  r  r  geometry_outputscombined_prompt_featurescombined_prompt_maskgeo_valid_masktext_valid_maskencoder_outputsdecoder_outputsall_box_offsetsreference_boxes_inv_sigall_pred_boxes_cxcywhall_pred_boxesall_pred_logitsrM   rL   rN   rE   mask_outputss'                                          r3   r   zSam3Model.forward  s   ` D mt&;<\]];$#67WXX#%++A.J!((F&88;AA!DJ"44Q7>>F 0T00HHN*N*<<SbA . D DSb I 22#NPT 3 m  (M-;-GN'')T	*$6R;;L;L;NQR;R#' #&;+<+<+>+B!, *5 ')?uzzR  *5 (3.J0A0A!0DEJJ_ef 
 #[[s):AzJ
!&ZA]EXEXag!h"[[QejjQWX
 ;;z1EJJvV#44-!%+4  5   (8'I'I$#3#B#B #/""1%*/G/M/Ma/PST/T - 4 45M5S5STU5VXY[\ ]',yy-AY1Z`a'b$$);q)@EYE_E_`aEbefEf%,,-A-G-G-JAN	$)=)I',yy)=Q1RXY'Z$&!& 8 > >q A\b" (-yy)^1LRS'T$%1"'**Z9L9LQ9OW\WaWajp"q',yy/CW1X^_'`$'+$'4$#, +$++ 
.r2324R89*	

 
 ,$++ 
+==)77 / D D**99
 
 ++44_5_5_`"1/2Q2Q"R!8?!J S S U+,AB22"1"L"L)77* 3 
 '"+	 	 &b)#B'
 / J J2 N)99"=(t(( 
1"#45"1"C"C4,
 
 +#..!#+%22"1"?"?$3$C$C"1"?"?!/!=!=,77$3$>$>$3$>$>$0$;$;
 	
r2   r   )NNNNNNN)r)   r*   r+   rm  _checkpoint_conversion_mapping"_keys_to_ignore_on_load_unexpectedr   r   r   r   r.   r@   r   r   r   r-   r   r  r/   r&   r  rK   r   r   r   s   @r3   r  r  @  s   (&" 	*&
z @  /3$##$ t+$ +,	$
 
+	+$  $L '' +, 
!	 @  268<-1.204046:
''$.
 /5
 ##d*	

 t+
 &&-
 &&-
 ",,t3
 +,
 
%
  
r2   r  )r  r  rs  r^  )gMbP?r5  )Nr   )jr  collections.abcr   r   dataclassesr   numpyr  r.   torch.nnr   torch.nn.functionalr   r4  r  r   transformersr    r	   rf  activationsr
   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   utils.genericr   r   r   utils.output_capturingr   autor   configuration_sam3r   r   r    r!   r"   r#   r$   
get_loggerr)   r   r&   r5   r:   rB   rG   rK   r   rb   r   r   r   Moduler   r   r   r   r   r-   r  r  r  r"  r@  rD  rF  rM  r^  rs  r  r  r  r  r  r  r  r  r+  r7  rO  r|  r  r  r  r  __all__r1   r2   r3   <module>r     sk    . !        4 & ! 6 9 
 G & @ > > 
 6    
		H	% 	@8 	@  	@ 	3 	3  	3 7K 7  70 7K 7  7* 7K 7  7 -D+ -D  -D`u|| % 5<< 34D 34l"bii . !%II%<<% 
% <<	%
 LL4'% T\% % '(%8P)BII P)f BRYY  BF#$2||2||2 
2 
	2
 5<<%&262)299 2)jRYY 6E		 EP2>>+		 +6- 6r C/ C C: /@& /@ /@dc		 cL#299 #L8RYY 8> 
(
) (

(
V%ryy %Pa
")) a
HC299 CL
) 
DRYY 4b299 bJL
) L
^[BII [|!ryy !H/ryy /d@) @Ft
# t
n	 Rr2   