
    qi-                     x    d Z ddlmZ ddlmZ ddlmZ ddlmZ  ej                  e
      Z G d de      ZdgZy	)
zDPT model configuration   )%consolidate_backbone_kwargs_to_config)PreTrainedConfig)logging   )
AutoConfigc                         e Zd ZdZdZdeiZdddddddd	d
dddddg ddg dg ddddddddddg dddgdddf  fd	Z xZS )	DPTConfiga  
    This is the configuration class to store the configuration of a [`DPTModel`]. It is used to instantiate an DPT
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the DPT
    [Intel/dpt-large](https://huggingface.co/Intel/dpt-large) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.


    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        image_size (`int`, *optional*, defaults to 384):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        is_hybrid (`bool`, *optional*, defaults to `False`):
            Whether to use a hybrid backbone. Useful in the context of loading DPT-Hybrid models.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the queries, keys and values.
        backbone_out_indices (`list[int]`, *optional*, defaults to `[2, 5, 8, 11]`):
            Indices of the intermediate hidden states to use from backbone.
        readout_type (`str`, *optional*, defaults to `"project"`):
            The readout type to use when processing the readout token (CLS token) of the intermediate hidden states of
            the ViT backbone. Can be one of [`"ignore"`, `"add"`, `"project"`].

            - "ignore" simply ignores the CLS token.
            - "add" passes the information from the CLS token to all other tokens by adding the representations.
            - "project" passes information to the other tokens by concatenating the readout to all other tokens before
              projecting the
            representation to the original feature dimension D using a linear layer followed by a GELU non-linearity.
        reassemble_factors (`list[int]`, *optional*, defaults to `[4, 2, 1, 0.5]`):
            The up/downsampling factors of the reassemble layers.
        neck_hidden_sizes (`list[str]`, *optional*, defaults to `[96, 192, 384, 768]`):
            The hidden sizes to project to for the feature maps of the backbone.
        fusion_hidden_size (`int`, *optional*, defaults to 256):
            The number of channels before fusion.
        head_in_index (`int`, *optional*, defaults to -1):
            The index of the features to use in the heads.
        use_batch_norm_in_fusion_residual (`bool`, *optional*, defaults to `False`):
            Whether to use batch normalization in the pre-activate residual units of the fusion blocks.
        use_bias_in_fusion_residual (`bool`, *optional*, defaults to `True`):
            Whether to use bias in the pre-activate residual units of the fusion blocks.
        add_projection (`bool`, *optional*, defaults to `False`):
            Whether to add a projection layer before the depth estimation head.
        use_auxiliary_head (`bool`, *optional*, defaults to `True`):
            Whether to use an auxiliary head during training.
        auxiliary_loss_weight (`float`, *optional*, defaults to 0.4):
            Weight of the cross-entropy loss of the auxiliary head.
        semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
            The index that is ignored by the loss function of the semantic segmentation model.
        semantic_classifier_dropout (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the semantic classification head.
        backbone_featmap_shape (`list[int]`, *optional*, defaults to `[1, 1024, 24, 24]`):
            Used only for the `hybrid` embedding type. The shape of the feature maps of the backbone.
        neck_ignore_stages (`list[int]`, *optional*, defaults to `[0, 1]`):
            Used only for the `hybrid` embedding type. The stages of the readout layers to ignore.
        backbone_config (`Union[dict, "PreTrainedConfig"]`, *optional*, defaults to `BitConfig()`):
            The configuration of the backbone model. Only used in case `is_hybrid` is `True` or in case you want to
            leverage the [`AutoBackbone`] API.
        pooler_output_size (`int`, *optional*):
           Dimensionality of the pooler layer. If None, defaults to `hidden_size`.
        pooler_act (`str`, *optional*, defaults to `"tanh"`):
           The activation function to be used by the pooler.

    Example:

    ```python
    >>> from transformers import DPTModel, DPTConfig

    >>> # Initializing a DPT dpt-large style configuration
    >>> configuration = DPTConfig()

    >>> # Initializing a model from the dpt-large style configuration
    >>> model = DPTModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```dptbackbone_config      i   gelug        g{Gz?g-q=     r   FT)r            project)   r      g      ?)`      r   r      Ng?   g?)r   i      r       r   tanhc!                    || _         || _        |dvrt        d      | j                  rNt        |t              r|j                  dd       t        d|dddg dg dd	d
d|!\  }}!|dk7  r0t        d      |!j                  d      |t        dd|i|!\  }}!d }|| _        || _	        || _
        || _        || _        || _        |	| _        |
| _        || _        || _        || _        || _        |r|nd | _        |r|ng | _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _        || _         || _!        || _"        || _#        |r|n|| _$        | | _%        tM        "|   di |! y )N)ignoreaddr   z8Readout_type must be one of ['ignore', 'add', 'project']
model_typebitsame
bottleneck)r   r   	   )stage1stage2stage3T)global_padding
layer_typedepthsout_featuresembedding_dynamic_padding)r   default_config_typedefault_config_kwargsr   z<Readout type must be 'project' when using `DPT-hybrid` mode.backboner    )(hidden_size	is_hybrid
ValueError
isinstancedict
setdefaultr   getr   num_hidden_layersnum_attention_headsintermediate_sizehidden_dropout_probattention_probs_dropout_problayer_norm_eps
image_size
patch_sizenum_channelsqkv_biasbackbone_out_indicesbackbone_featmap_shapeneck_ignore_stages
hidden_actinitializer_rangereadout_typereassemble_factorsneck_hidden_sizesfusion_hidden_sizehead_in_index!use_batch_norm_in_fusion_residualuse_bias_in_fusion_residualadd_projectionuse_auxiliary_headauxiliary_loss_weightsemantic_loss_ignore_indexsemantic_classifier_dropoutpooler_output_size
pooler_actsuper__init__)#selfr3   r:   r;   r<   rG   r=   r>   rH   r?   r@   rA   rB   r4   rC   rD   rI   rJ   rK   rL   rM   rN   rO   rP   rQ   rR   rS   rT   rE   rF   r   rU   rV   kwargs	__class__s#                                     [/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/dpt/configuration_dpt.pyrX   zDPTConfig.__init__   s   H '";;WXX>>/40**<?&K ' /$)&,".'$B15'' '#OV y( !_``ZZ
#/?3N&K ' /''#OV $( . "3#6 !2#6 ,H),$$( $8!@I&<t#8A"4r$!2("4!2"4*1R.+F(, #5%:"*D'+F(8J"4P[$"6"    )	__name__
__module____qualname____doc__r"   r   sub_configsrX   __classcell__)r[   s   @r\   r	   r	      s    bH J$j1K %(*)-*/$(!#&$'0q6Cf# f#r]   r	   N)ra   backbone_utilsr   configuration_utilsr   utilsr   auto.configuration_autor   
get_loggerr^   loggerr	   __all__r2   r]   r\   <module>rk      sC     C 3  0 
		H	%N#  N#b -r]   