"""PyTorch SAM 2 model."""

import torch

from ... import initialization as init
from ...configuration_utils import PreTrainedConfig
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring
from ...utils.generic import TransformersKwargs, merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from ..auto import CONFIG_MAPPING, AutoConfig
from ..sam2.configuration_sam2 import Sam2Config, Sam2MaskDecoderConfig, Sam2PromptEncoderConfig
from ..sam2.modeling_sam2 import (
    Sam2Attention,
    Sam2FeedForward,
    Sam2LayerNorm,
    Sam2Model,
    Sam2PreTrainedModel,
    Sam2TwoWayAttentionBlock,
    Sam2VisionEncoderOutput,
    Sam2VisionModel,
)


class EdgeTamVisionConfig(PreTrainedConfig):
    r"""
    This is the configuration class to store the configuration of an [`EdgeTamVisionModel`]. It is used to instantiate an
    EdgeTAM vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a configuration similar to that of the EdgeTAM
    [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `timm/repvit_m1.dist_in1k`):
            Configuration for the vision backbone. This is used to instantiate the backbone using
            `AutoModel.from_config`.
        backbone_channel_list (`List[int]`, *optional*, defaults to `[384, 192, 96, 48]`):
            The list of channel dimensions for the backbone.
        backbone_feature_sizes (`List[List[int]]`, *optional*, defaults to `[[256, 256], [128, 128], [64, 64]]`):
            The spatial sizes of the feature maps from the backbone.
        fpn_hidden_size (`int`, *optional*, defaults to 256):
            The hidden dimension of the FPN.
        fpn_kernel_size (`int`, *optional*, defaults to 1):
            The kernel size for the convolutions in the neck.
        fpn_stride (`int`, *optional*, defaults to 1):
            The stride for the convolutions in the neck.
        fpn_padding (`int`, *optional*, defaults to 0):
            The padding for the convolutions in the neck.
        fpn_top_down_levels (`List[int]`, *optional*, defaults to `[2, 3]`):
            The levels for the top-down FPN connections.
        num_feature_levels (`int`, *optional*, defaults to 3):
            The number of feature levels from the FPN to use.
        hidden_act (`str`, *optional*, defaults to `"gelu"`):
            The non-linear activation function in the neck.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon for the layer normalization.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

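    Example:

    ```python
    >>> from transformers import AutoConfig, EdgeTamVisionConfig, EdgeTamVisionModel

    >>> # Initializing a configuration with the defaults (a sketch: resolving the default
    >>> # backbone configuration fetches `timm/repvit_m1.dist_in1k` and requires `timm`)
    >>> configuration = EdgeTamVisionConfig()

    >>> # Initializing an EdgeTamVisionModel (with random weights) from the configuration
    >>> model = EdgeTamVisionModel(configuration)

    >>> # A different timm feature-extraction backbone can also be passed in explicitly;
    >>> # `backbone_channel_list` and `backbone_feature_sizes` must then match that
    >>> # backbone (illustrated here with the default checkpoint)
    >>> backbone = AutoConfig.from_pretrained(
    ...     "timm/repvit_m1.dist_in1k",
    ...     model_args={"in_chans": 3, "features_only": True, "out_indices": (1, 2, 3, 4)},
    ... )
    >>> custom_configuration = EdgeTamVisionConfig(backbone_config=backbone)
    ```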
    """

    base_config_key = "vision_config"
    model_type = "edgetam_vision_model"
    sub_configs = {"backbone_config": AutoConfig}

    def __init__(
        self,
        backbone_config=None,
        backbone_channel_list=None,
        backbone_feature_sizes=None,
        fpn_hidden_size=256,
        fpn_kernel_size=1,
        fpn_stride=1,
        fpn_padding=0,
        fpn_top_down_levels=None,
        num_feature_levels=3,
        hidden_act="gelu",
        layer_norm_eps=1e-06,
        initializer_range=0.02,
        **kwargs,
    ):
        backbone_channel_list = [384, 192, 96, 48] if backbone_channel_list is None else backbone_channel_list
        backbone_feature_sizes = (
            [[256, 256], [128, 128], [64, 64]] if backbone_feature_sizes is None else backbone_feature_sizes
        )
        fpn_top_down_levels = [2, 3] if fpn_top_down_levels is None else fpn_top_down_levels

        # Resolve the backbone config: a plain dict defaults to the timm wrapper, and `None`
        # falls back to the default RepViT backbone in feature-extraction mode.
        if isinstance(backbone_config, dict):
            backbone_config["model_type"] = backbone_config.get("model_type", "timm_wrapper")
            backbone_config = CONFIG_MAPPING[backbone_config["model_type"]](**backbone_config)
        elif backbone_config is None:
            backbone_config = AutoConfig.from_pretrained(
                "timm/repvit_m1.dist_in1k",
                model_args={"in_chans": 3, "features_only": True, "out_indices": (1, 2, 3, 4)},
            )
        self.backbone_config = backbone_config

        self.backbone_channel_list = backbone_channel_list
        self.backbone_feature_sizes = backbone_feature_sizes
        self.fpn_hidden_size = fpn_hidden_size
        self.fpn_kernel_size = fpn_kernel_size
        self.fpn_stride = fpn_stride
        self.fpn_padding = fpn_padding
        self.fpn_top_down_levels = fpn_top_down_levels
        self.num_feature_levels = num_feature_levels
        self.hidden_act = hidden_act
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
        super().__init__(**kwargs)


class EdgeTamPromptEncoderConfig(Sam2PromptEncoderConfig):
    pass


class EdgeTamMaskDecoderConfig(Sam2MaskDecoderConfig):
    pass


class EdgeTamConfig(Sam2Config):
    r"""
    [`EdgeTamConfig`] is the configuration class to store the configuration of an [`EdgeTamModel`]. It is used to
    instantiate an EdgeTAM model according to the specified arguments, defining the vision, prompt encoder, and mask
    decoder configs. Instantiating a configuration with the defaults will yield a configuration similar to that of the
    EdgeTAM [facebook/EdgeTAM](https://huggingface.co/facebook/EdgeTAM) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    <Tip>

    EdgeTAM checkpoints with `model_type="edgetam_video"` are compatible with `EdgeTamModel` since the video variant
    weights are a superset of the image-only model weights. You may see a warning about model type mismatch when
    loading such checkpoints, which can be safely ignored in this case.
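
    ```python
    >>> from transformers import EdgeTamModel

    >>> # A sketch of the case above (assuming the "facebook/EdgeTAM" checkpoint is such a
    >>> # video-variant export; the extra memory-related weights are simply skipped)
    >>> model = EdgeTamModel.from_pretrained("facebook/EdgeTAM")
    ```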

    </Tip>

    Args:
        vision_config (Union[`dict`, `EdgeTamVisionConfig`], *optional*):
            Dictionary of configuration options used to initialize [`EdgeTamVisionConfig`].
        prompt_encoder_config (Union[`dict`, `EdgeTamPromptEncoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`EdgeTamPromptEncoderConfig`].
        mask_decoder_config (Union[`dict`, `EdgeTamMaskDecoderConfig`], *optional*):
            Dictionary of configuration options used to initialize [`EdgeTamMaskDecoderConfig`].
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation for parameter initialization.

    Example:

    ```python
    >>> from transformers import (
    ...     EdgeTamVisionConfig,
    ...     EdgeTamPromptEncoderConfig,
    ...     EdgeTamMaskDecoderConfig,
    ...     EdgeTamModel,
    ... )

    >>> # Initializing an EdgeTamConfig with `"facebook/EdgeTAM"` style configuration
    >>> configuration = EdgeTamConfig()

    >>> # Initializing an EdgeTamModel (with random weights) from the `"facebook/EdgeTAM"` style configuration
    >>> model = EdgeTamModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize an EdgeTamConfig from an EdgeTamVisionConfig, EdgeTamPromptEncoderConfig, and EdgeTamMaskDecoderConfig
    >>> # Initializing EdgeTAM vision encoder, prompt encoder, and mask decoder configurations
    >>> vision_config = EdgeTamVisionConfig()
    >>> prompt_encoder_config = EdgeTamPromptEncoderConfig()
    >>> mask_decoder_config = EdgeTamMaskDecoderConfig()

    >>> config = EdgeTamConfig(vision_config, prompt_encoder_config, mask_decoder_config)
    ```
    N)rE   rF   rG   rH   r-   rC   rB   rS   rS      s    6p 	rC   rS   c                       e Zd Zy)EdgeTamLayerNormNrN   r-   rC   rB   rU   rU      rO   rC   rU   c                       e Zd Zy)EdgeTamVisionEncoderOutputNrN   r-   rC   rB   rW   rW      rO   rC   rW   c                       e Zd Zy)EdgeTamAttentionNrN   r-   rC   rB   rY   rY      rO   rC   rY   c                       e Zd Zy)EdgeTamTwoWayAttentionBlockNrN   r-   rC   rB   r[   r[      rO   rC   r[   c                       e Zd Zy)EdgeTamFeedForwardNrN   r-   rC   rB   r]   r]      rO   rC   r]   c                   >    e Zd ZdZ ej
                         d        Zy)EdgeTamPreTrainedModelNc                    t        j                  | |       t        |t              r-|j                   t        j                  |j                         y y t        |d      r,t        j                  |j                  |j                         y y )Npositional_embedding)std)r   _init_weightsr.   EdgeTamModelno_memory_embeddinginitzeros_hasattrnormal_ra   scale)r?   modules     rB   rc   z$EdgeTamPreTrainedModel._init_weights   sg    %%dF3fl+))5F667 6V34LL44&,,G 5rC   )rE   rF   rG   "_keys_to_ignore_on_load_unexpectedtorchno_gradrc   r-   rC   rB   r_   r_      s$    )-&U]]_H HrC   r_   zN
    The vision model from EdgeTAM without any head or projection on top.
    )custom_introc            
       p    e Zd ZeZdZi Zd Zee		 dde
j                  dz  dee   deez  fd              Zy)EdgeTamVisionModelpixel_valuesc                     t        d      Nz2Can't get input embeddings from timm wrapper modelNotImplementedErrorr?   s    rB   get_input_embeddingsz'EdgeTamVisionModel.get_input_embeddings       !"VWWrC   Nr@   returnc           	      ^   |t        d       | j                  |fi |}|j                  }|D cg c]  }|j                  dddd       }}| j	                  |      \  }}|| j
                   d  d d d   }|| j
                   d  d d d   }t        |d   |||j                        S c c}w )Nz You have to specify pixel_valuesr   r   r   r(   )last_hidden_statefpn_hidden_statesfpn_position_encodinghidden_states)
ValueErrorbackboner}   permuteneckr9   rW   r   )r?   rr   r@   backbone_outputintermediate_hidden_stateshidden_stater~   r   s           rB   forwardzEdgeTamVisionModel.forward   s     ?@@ ($--??%4%F%F"[u%v<l&:&:1aA&F%v"%v3799=W3X00-t/F/F.F.HI$B$O 5t7N7N6N6P QRVTVRV W)8</"7)77	
 	
 &ws   B*)N)rE   rF   rG   r   config_classmain_input_name_can_record_outputsrx   r
   r   rm   FloatTensorr   r	   tuplerW   r   r-   rC   rB   rq   rq      sp     'L$O X   26
''$.
 +,
 
+	+	
   
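

# A minimal forward-pass sketch for `EdgeTamVisionModel` (illustrative only; it assumes
# random weights from the default config, whose backbone resolution requires `timm`, and
# a 1024x1024 input so the backbone strides line up with the default
# `backbone_feature_sizes`):
#
#     import torch
#     from transformers import EdgeTamVisionConfig, EdgeTamVisionModel
#
#     model = EdgeTamVisionModel(EdgeTamVisionConfig()).eval()
#     with torch.no_grad():
#         outputs = model(pixel_values=torch.randn(1, 3, 1024, 1024))
#     # Three FPN levels by default (`num_feature_levels=3`)
#     print([tuple(level.shape) for level in outputs.fpn_hidden_states])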
rC   rq   c                       e Zd Zg dZd Zy)rd   )z
^memory_.*z^mask_downsample.*zspatial_perceiver.*z^object_pointer_proj.*z0^temporal_positional_encoding_projection_layer.*no_memory_positional_encodingno_object_pointer%occlusion_spatial_embedding_parameterc                     t        d      rt   ru   rw   s    rB   rx   z!EdgeTamModel.get_input_embeddings$  ry   rC   N)rE   rF   rG   rl   rx   r-   rC   rB   rd   rd     s    	*&XrC   rd   )rd   rq   r_   rS   r   rM   rQ   )/rH   rm    r   rf   configuration_utilsr   modeling_utilsr   processing_utilsr   utilsr   utils.genericr	   r
   utils.output_capturingr   autor   r   sam2.configuration_sam2r   r   r   sam2.modeling_sam2r   r   r   r   r   r   r   r   r   rM   rQ   rS   rU   rW   rY   r[   r]   r_   rq   rd   __all__r-   rC   rB   <module>r      s     & 3 - & L 5 - ` `	 	 	[#* [#|	!8 		4 	9	J 9	x	} 		!8 		} 		": 		 	 
H0 
H 
H 
#
 #

#
LX9 X rC   