
    qiS                         d Z ddlmZ ddlmZ ddlmZmZ  G d de      Z G d	 d
e      Z	 G d de      Z
 G d de      Z G d de      Z G d de      Z G d de      Zg dZy)zSAM3 model configuration    )CLIPTextConfig   )PreTrainedConfig   )CONFIG_MAPPING
AutoConfigc                   N     e Zd ZdZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Sam3ViTConfiga  
    Configuration class for SAM3 Vision Encoder (ViT backbone).

    Instantiating a configuration defaults will yield a similar configuration to that of SAM 3
    [facebook/sam3](https://huggingface.co/facebook/sam3) architecture.

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers.
        intermediate_size (`int`, *optional*, defaults to 4736):
            Dimensionality of the feedforward (MLP) layers.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer.
        num_channels (`int`, *optional*, defaults to 3):
            Number of input image channels.
        image_size (`int`, *optional*, defaults to 1008):
            Expected input image size.
        patch_size (`int`, *optional*, defaults to 14):
            Size of image patches.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for attention probabilities.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            Base frequency for RoPE.
        window_size (`int`, *optional*, defaults to 24):
            Window size for windowed attention.
        global_attn_indexes (`list[int]`, *optional*, defaults to `[7, 15, 23, 31]`):
            Indexes of layers with global attention.
        layer_scale_init_value (`float`, *optional*):
            Initial value for layer scale. None means no layer scale.
        pretrain_image_size (`int`, *optional*, defaults to 336):
            Pretrained model image size for position embedding initialization.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for hidden states.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing weight matrices.
    backbone_configsam3_vit_modelc                    t        |   di | |g d}|| _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        |
| _        || _        || _        || _        || _        || _        || _        || _        y )N)             )super__init__hidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_channels
image_size
patch_size
hidden_actlayer_norm_epsattention_dropout
rope_thetawindow_sizeglobal_attn_indexeslayer_scale_init_valuepretrain_image_sizehidden_dropoutinitializer_range)selfr   r   r   r   r   r   r   r   r   r   r   r    r!   r"   r#   r$   r%   kwargs	__class__s                      ]/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/sam3/configuration_sam3.pyr   zSam3ViTConfig.__init__E   s    * 	"6"&"1&!2!2#6 ($$$,!2$&#6 &<##6 ,!2    )   i         r   i     geluư>        g     @   NNiP  r1   {Gz?)__name__
__module____qualname____doc__base_config_key
model_typer   __classcell__r(   s   @r)   r
   r
      sT    )V (O!J  #%)3 )3r*   r
   c                   v     e Zd ZdZdZdZdeiZ	 	 	 	 	 	 	 d fd	Ze	d        Z
e
j                  d        Z
 xZS )	Sam3VisionConfiga  
    This is the configuration class to store the configuration of a [`Sam3VisionModel`]. It is used to instantiate a SAM
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    defaults will yield a similar configuration to that of SAM 3
    [facebook/sam3](https://huggingface.co/facebook/sam3) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `Sam3ViTConfig()`):
            Configuration for the vision backbone. This is used to instantiate the backbone using
            `AutoModel.from_config`.
        fpn_hidden_size (`int`, *optional*, defaults to 256):
            The hidden dimension of the FPN.
        backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[288, 288], [144, 144], [72, 72]]`):
            The spatial sizes (height, width) of the feature maps from the backbone at different scales.
        scale_factors (`list[float]`, *optional*, defaults to `[4.0, 2.0, 1.0, 0.5]`):
            Scale factors for FPN multi-scale features. List of scaling factors for each FPN level.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the neck.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon for the layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

    vision_configsam3_vision_modelr   c                 <   |g dn|}|ddgddgddgg}t        |t              r'|j                  dd      |d<   t        |d      di |}n|t        d          }|| _        || _        || _        || _        || _        || _	        || _
        t        	| 0  di | y )N)g      @g       @g      ?g      ?i      H   r9   r   r   )
isinstancedictgetr   r   fpn_hidden_sizescale_factorsbackbone_feature_sizesr   r   r%   r   r   )
r&   r   rF   rH   rG   r   r   r%   r'   r(   s
            r)   r   zSam3VisionConfig.__init__   s     1>0E,=!)'*Cj3*r2h%G"ot,,;,?,?N^,_OL),_\-JK^o^O$,-=>@O.  /*&<#$,!2"6"r*   c                 .    | j                   j                  S )z"Image size for the vision encoder.r   r   r&   s    r)   r   zSam3VisionConfig.image_size   s     ##...r*   c                 &    || j                   _        y)z-Set the image size and propagate to backbone.NrJ   r&   values     r)   r   zSam3VisionConfig.image_size   s     +0'r*   )N   NNr/   r0   r3   )r4   r5   r6   r7   r8   r9   r   sub_configsr   propertyr   setterr:   r;   s   @r)   r=   r=   q   sk    8 &O$J:K ##B / / 0 0r*   r=   c                   <     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Sam3GeometryEncoderConfiga  
    Configuration class for SAM3 Geometry Encoder.

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the encoder layers.
        num_layers (`int`, *optional*, defaults to 3):
            Number of transformer encoder layers for processing geometry prompts.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads in the geometry encoder.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the feedforward layers.
        dropout (`float`, *optional*, defaults to 0.1):
            Dropout probability.
        hidden_act (`str`, *optional*, defaults to `"relu"`):
            Activation function in FFN.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for hidden states.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            Epsilon for layer normalization.
        roi_size (`int`, *optional*, defaults to 7):
            ROI size for box pooling operations.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing weight matrices.
    sam3_geometry_encoderc                     t        |   di | || _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        |
| _        y Nr   )r   r   r   
num_layersr   r   dropoutr   r$   r   roi_sizer%   )r&   r   rX   r   r   rY   r   r$   r   rZ   r%   r'   r(   s               r)   r   z"Sam3GeometryEncoderConfig.__init__   sa     	"6"&$#6 !2$,, !2r*   )
rO   r         皙?relur1   r0   r   r3   r4   r5   r6   r7   r9   r   r:   r;   s   @r)   rT   rT      s9    4 )J 3 3r*   rT   c                   :     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 d fd	Z xZS )Sam3DETREncoderConfigaN  
    Configuration class for SAM3 DETR Encoder (vision-text fusion encoder).

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the encoder layers.
        num_layers (`int`, *optional*, defaults to 6):
            Number of encoder layers.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the feedforward layers.
        dropout (`float`, *optional*, defaults to 0.1):
            Dropout probability.
        hidden_act (`str`, *optional*, defaults to `"relu"`):
            Activation function in FFN.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for hidden states.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            Epsilon for layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing weight matrices.
    sam3_detr_encoderc
                     t        |   di |
 || _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        y rW   )r   r   r   rX   r   r   rY   r   r$   r   r%   )r&   r   rX   r   r   rY   r   r$   r   r%   r'   r(   s              r)   r   zSam3DETREncoderConfig.__init__  sZ     	"6"&$#6 !2$,,!2r*   )	rO      r[   r\   r]   r^   r1   r0   r3   r_   r;   s   @r)   ra   ra      s6    0 %J 3 3r*   ra   c                   <     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )Sam3DETRDecoderConfiga  
    Configuration class for SAM3 DETR Decoder (object query decoder).

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the decoder layers.
        num_layers (`int`, *optional*, defaults to 6):
            Number of decoder layers.
        num_queries (`int`, *optional*, defaults to 200):
            Number of object queries.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the feedforward layers.
        dropout (`float`, *optional*, defaults to 0.1):
            Dropout probability.
        hidden_act (`str`, *optional*, defaults to `"relu"`):
            Activation function in FFN.
        hidden_dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for hidden states.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            Epsilon for layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing weight matrices.
    sam3_detr_decoderc                     t        |   di | || _        || _        || _        || _        || _        || _        || _        || _	        |	| _
        |
| _        y rW   )r   r   r   rX   num_queriesr   r   rY   r   r$   r   r%   )r&   r   rX   ri   r   r   rY   r   r$   r   r%   r'   r(   s               r)   r   zSam3DETRDecoderConfig.__init__I  sb     	"6"&$&#6 !2$,,!2r*   )
rO   rd      r[   r\   r]   r^   r1   r0   r3   r_   r;   s   @r)   rf   rf   ,  s9    4 %J 3 3r*   rf   c                   4     e Zd ZdZdZ	 	 	 	 	 	 d fd	Z xZS )Sam3MaskDecoderConfiga]  
    Configuration class for SAM3 Mask Decoder (pixel-level mask prediction).

    Args:
        hidden_size (`int`, *optional*, defaults to 256):
            Dimensionality of the mask decoder.
        num_upsampling_stages (`int`, *optional*, defaults to 3):
            Number of upsampling stages in the pixel decoder (FPN).
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            Epsilon for layer normalization.
        dropout (`float`, *optional*, defaults to 0.0):
            Dropout probability for prompt cross-attention.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for prompt cross-attention.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing weight matrices.
    sam3_mask_decoderc                 x    t        |   di | || _        || _        || _        || _        || _        || _        y rW   )r   r   r   num_upsampling_stagesr   rY   r   r%   )	r&   r   ro   r   rY   r   r%   r'   r(   s	           r)   r   zSam3MaskDecoderConfig.__init__y  sD     	"6"&%:",#6 !2r*   )rO   r   r0   r1   r[   r3   r_   r;   s   @r)   rl   rl   d  s-    $ %J 3 3r*   rl   c                        e Zd ZdZdZdZeeee	e
edZ	 	 	 	 	 	 	 d fd	Zed        Zej                   d        Z xZS )	
Sam3Configa  
    Configuration class to store the configuration of a [`Sam3Model`].

    Instantiating a configuration defaults will yield a similar configuration to that of SAM 3
    [facebook/sam3](https://huggingface.co/facebook/sam3) architecture.

    This is the main configuration class that combines all sub-configurations for the SAM3 model.

    <Tip>

    SAM3 checkpoints with `model_type="sam3_video"` are compatible with `Sam3Model` since the video variant weights
    are a superset of the image-only model weights. You may see a warning about model type mismatch when loading
    such checkpoints, which can be safely ignored in this case.

    </Tip>

    Args:
        vision_config (`dict` or `Sam3VisionConfig`, *optional*):
            Configuration for the vision encoder.
        text_config (`dict` or `Sam3TextConfig`, *optional*):
            Configuration for the text encoder.
        geometry_encoder_config (`dict` or `Sam3GeometryEncoderConfig`, *optional*):
            Configuration for the geometry encoder.
        detr_encoder_config (`dict` or `Sam3DETREncoderConfig`, *optional*):
            Configuration for the DETR encoder.
        detr_decoder_config (`dict` or `Sam3DETRDecoderConfig`, *optional*):
            Configuration for the DETR decoder.
        mask_decoder_config (`dict` or `Sam3MaskDecoderConfig`, *optional*):
            Configuration for the mask decoder.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing weight matrices.

    Example:
    ```python
    >>> from transformers import Sam3Config, Sam3Model

    >>> # Initializing a SAM3 configuration
    >>> configuration = Sam3Config()

    >>> # Initializing a model from the configuration
    >>> model = Sam3Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```
    sam3T)r>   text_configgeometry_encoder_configdetr_encoder_configdetr_decoder_configmask_decoder_configc           	      T   |i }t        |t              rt        d
i || _        n|| _        |ddddddddd	}t        |t              rt	        d
i || _        n|| _        |i }t        |t              rt        d
i || _        n|| _        |i }t        |t              rt        d
i || _	        n|| _	        |i }t        |t              rt        d
i || _        n|| _        |i }t        |t              rt        d
i || _        n|| _        || _        t        	| @  d
i | y )Ni   r+   i   i   r2   r-   r,   r/   )
vocab_sizer   r   projection_dimr   r   max_position_embeddingsr   r   )rC   rD   r=   r>   r   rs   rT   rt   ra   ru   rf   rv   rl   rw   r%   r   r   )
r&   r>   rs   rt   ru   rv   rw   r%   r'   r(   s
            r)   r   zSam3Config.__init__  s]     MmT*!1!BM!BD!.D ##%)"%%'')+-$	K k4(-<<D*D #*&(#-t4+D+_G^+_D(+BD( &"$)40'<'S?R'SD$':D$ &"$)40'<'S?R'SD$':D$ &"$)40'<'S?R'SD$':D$!2"6"r*   c                 .    | j                   j                  S )zImage size for the SAM3 model.r>   r   rK   s    r)   r   zSam3Config.image_size  s     !!,,,r*   c                 &    || j                   _        y)z2Set the image size and propagate to vision config.Nr}   rM   s     r)   r   zSam3Config.image_size  s     ).%r*   )NNNNNNr3   )r4   r5   r6   r7   r9   is_compositionr=   r   rT   ra   rf   rl   rP   r   rQ   r   rR   r:   r;   s   @r)   rq   rq     sz    -^ JN)%#<444K  $   E#N - - . .r*   rq   )rq   r
   r=   rT   ra   rf   rl   N)r7   transformersr   configuration_utilsr   autor   r   r
   r=   rT   ra   rf   rl   rq   __all__r   r*   r)   <module>r      s     ' 3 -X3$ X3vL0' L0^53 0 53p13, 13h53, 53p%3, %3PJ.! J.Zr*   