
    qidA                        d Z ddlZddlmZ ddlmZ ddlmZmZmZ ddl	m
Z
mZmZ ddlmZ dd	lmZ d
dlmZ d
dlmZmZmZ d
dlmZmZmZ  ej6                  e      Z G d de      Z G d de      Z G d dej@                        Z! G d de      Z" G d de      Z# G d de      Z$ G d de      Z% G d dej@                        Z& G d de      Z'e
 G d  d!e'             Z( e
d"#       G d$ d%e             Z)g d&Z*y)'zPyTorch Pixio model.    N)nn   )GradientCheckpointingLayer)BackboneOutputBaseModelOutputBaseModelOutputWithPooling)auto_docstring
is_tracinglogging)merge_with_config_defaults)capture_outputs   )Dinov2Config)Dinov2BackboneDinov2DropPath	Dinov2MLP)ViTAttentionViTPatchEmbeddingsViTPreTrainedModelc                   N     e Zd ZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )PixioConfiga  
    This is the configuration class to store the configuration of a [`PixioModel`]. It is used to instantiate a
    Pixio model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the ViT
    [facebook/pixio-huge](https://huggingface.co/facebook/pixio-huge) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1280):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        mlp_ratio (`int`, *optional*, defaults to 4):
            Ratio of the hidden size of the MLPs relative to the `hidden_size`.
        n_cls_tokens (`int`, *optional*, defaults to 8):
            Number of class tokens in the Transformer encoder.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` are supported.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the layer normalization layers.
        image_size (`int`, *optional*, defaults to 256):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 16):
            The size (resolution) of each patch.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether to add a bias to the queries, keys and values.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate per sample (when applied in the main path of residual layers).
        out_features (`list[str]`, *optional*):
            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
            same order as defined in the `stage_names` attribute.
        out_indices (`list[int]`, *optional*):
            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
            If unset and `out_features` is unset, will default to the last stage. Must be in the
            same order as defined in the `stage_names` attribute.
        apply_layernorm (`bool`, *optional*, defaults to `True`):
            Whether to apply layer normalization to the feature maps in case the model is used as backbone.
        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
            Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
            case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
            seq_len, hidden_size)`.

    Example:

    ```python
    >>> from transformers import PixioConfig, PixioModel

    >>> # Initializing a Pixio pixio-huge style configuration
    >>> configuration = PixioConfig()

    >>> # Initializing a model (with random weights) from the pixio-huge style configuration
    >>> model = PixioModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```pixioc                 ^    t         |   ||||||||	|
|||||||       || _        | `| `| `y )N)hidden_sizenum_hidden_layersnum_attention_heads	mlp_ratio
hidden_acthidden_dropout_probattention_probs_dropout_probinitializer_rangelayer_norm_eps
image_size
patch_sizenum_channelsqkv_biasdrop_path_rateapply_layernormreshape_hidden_states)super__init__n_cls_tokenslayerscale_valueuse_swiglu_ffnuse_mask_token)selfr   r   r   r   r,   r   r   r    r!   r"   r#   r$   r%   r&   r'   out_featuresout_indicesr(   r)   kwargs	__class__s                        Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/pixio/modular_pixio.pyr+   zPixioConfig.__init__p   sc    . 	#/ 3! 3)E/)!!%)+"7! 	 	
& )!    )i                gelu        r<   g{Gz?gư>   r8   r   Tr<   NNTT)__name__
__module____qualname____doc__
model_typer+   __classcell__r4   s   @r5   r   r   $   sV    GR J %(").  . r6   r   c                       e Zd Zy)PixioPatchEmbeddingsNr>   r?   r@    r6   r5   rF   rF          r6   rF   c                        e Zd ZdZdeddf fdZdej                  dededej                  fd	Z	d
ej                  dej                  fdZ
 xZS )PixioEmbeddingszB
    Construct the CLS tokens, position and patch embeddings.
    configreturnNc                 (   t         |           t        j                  t	        j
                  d|j                  |j                              | _        d | _	        t        |      | _        | j                  j                  }t        j                  t	        j
                  d||j                  z   |j                              | _        t        j                  |j                        | _        |j                  | _        |j"                  | _        || _        y )N   )r*   r+   r   	Parametertorchrandnr,   r   	cls_token
mask_tokenrF   patch_embeddingsnum_patchesposition_embeddingsDropoutr   dropoutr$   rL   )r0   rL   rV   r4   s      r5   r+   zPixioEmbeddings.__init__   s    ekk!V5H5H&J\J\&]^ 4V <++77#%<<A{VM`M`?`bhbtbt0u#v zz&"<"<="// ++r6   
embeddingsheightwidthc                 @   |j                   d   | j                  z
  }| j                  j                   d   | j                  z
  }t               s||k(  r||k(  r| j                  S | j                  ddd| j                  f   }| j                  dd| j                  df   }|j                   d   }|| j                  z  }	|| j                  z  }
t        |dz        }|j                  d|||      }|j                  dddd      }|j                  }t        j                  j                  |j                  t        j                        |	|
fdd	
      j                  |      }|j                  dddd      j                  dd|      }t        j                   ||fd      S )a#  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support tracing and interpolation at torch.float32 precision.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        rO   Ng      ?r   r   r   bicubicF)sizemodealign_cornersdtypedim)shaper,   rW   r
   r$   intreshapepermuterd   r   
functionalinterpolatetorQ   float32viewcat)r0   rZ   r[   r\   rV   num_positionsclass_pos_embedpatch_pos_embedrf   
new_height	new_widthsqrt_num_positionstarget_dtypes                r5   interpolate_pos_encodingz(PixioEmbeddings.interpolate_pos_encoding   s    !&&q)D,=,==0066q9D<M<MM|} <5+++2216I8I8I6I3IJ221d6G6G6I3IJr"t.
T__,	 !34)11!5GI[]`a)11!Q1=&,,--33u}}-i(	 4 

 "<"
  	 *11!Q1=BB1b#Nyy/?;CCr6   pixel_valuesc                 x   |j                   \  }}}}| j                  j                  j                  j                  }| j                  |j                  |            }| j                  j                  |dd      }t        j                  ||fd      }|| j                  |||      z   }| j                  |      }|S )Nrc   r^   rO   re   )rg   rU   
projectionweightrd   rm   rS   expandrQ   rp   rx   rY   )	r0   ry   
batch_size_r[   r\   rw   rZ   
cls_tokenss	            r5   forwardzPixioEmbeddings.forward   s    '3'9'9$
Avu,,77>>DD**<???+NO
^^**:r2>
YY
J7Q?
$"?"?
FTY"ZZ
\\*-
r6   )r>   r?   r@   rA   r   r+   rQ   Tensorrh   rx   r   rC   rD   s   @r5   rK   rK      si    { t $D5<< $D $DUX $D]b]i]i $DLELL U\\ r6   rK   c                       e Zd Zy)PixioAttentionNrG   rH   r6   r5   r   r      rI   r6   r   c                       e Zd Zy)PixioDropPathNrG   rH   r6   r5   r   r      rI   r6   r   c                       e Zd Zy)PixioMLPNrG   rH   r6   r5   r   r      rI   r6   r   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )
PixioLayerrL   rM   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |      | _        |j                  dkD  rt        |j                        nt        j                         | _        t        j                  |j                  |j
                        | _        t        |      | _        y )Nepsr<   )r*   r+   r   	LayerNormr   r"   norm1r   	attentionr'   r   Identity	drop_pathnorm2r   mlpr0   rL   r4   s     r5   r+   zPixioLayer.__init__   s    \\&"4"4&:O:OP
'/AGAVAVY\A\v'<'<=bdbmbmbo\\&"4"4&:O:OP
F#r6   hidden_statesc                     | j                  |      }| j                  |      }| j                  |      |z   }| j                  |      }| j	                  |      }| j                  |      |z   }|S N)r   r   r   r   r   )r0   r   hidden_states_normself_attention_outputlayer_outputs        r5   r   zPixioLayer.forward  sj    !ZZ6 $/A B'<=Mzz-0xx-~~l3mCr6   )	r>   r?   r@   r   r+   rQ   r   r   rC   rD   s   @r5   r   r      s1    ${ $t $U\\ ell r6   r   c                   N     e Zd Zdef fdZddej                  dedefdZ	 xZ
S )PixioEncoderrL   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r*   r+   rL   r   
ModuleListranger   r   layergradient_checkpointing)r0   rL   r   r4   s      r5   r+   zPixioEncoder.__init__  sN    ]]fF^F^@_#`1Jv$6#`a
&+# $as   A#r   output_hidden_statesrM   c                     |r|gnd }t        | j                        D ]!  \  }} ||      }|s|j                  |       # t        ||rt	        |            S d       S )N)last_hidden_stater   )	enumerater   appendr   tuple)r0   r   r   all_hidden_statesilayer_modules         r5   r   zPixioEncoder.forward  so    /C]O(4 	8OA|(7M !((7	8
 +6G% 12
 	
MQ
 	
r6   )F)r>   r?   r@   r   r+   rQ   r   boolr   r   rC   rD   s   @r5   r   r     s.    ,{ ,

U\\ 

 

Zi 

r6   r   c                       e Zd Zy)PixioPreTrainedModelNrG   rH   r6   r5   r   r   %  rI   r6   r   c            
            e Zd Zdef fdZdefdZe ed      e		 	 dde
j                  dz  d	edz  defd
                     Z xZS )
PixioModelrL   c                     t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y )Nr   )r*   r+   rL   rK   rZ   r   encoderr   r   r   r"   	layernorm	post_initr   s     r5   r+   zPixioModel.__init__+  sW     )&1#F+f&8&8f>S>STr6   rM   c                 .    | j                   j                  S r   )rZ   rU   )r0   s    r5   get_input_embeddingszPixioModel.get_input_embeddings6  s    ///r6   F)tie_last_hidden_statesNry   r   c                 b   || j                   j                  }|t        d      | j                  |      }| j	                  ||      }|j
                  }| j                  |      }|d d d | j                  j                  d d f   j                  d      }t        |||j                        S )Nz You have to specify pixel_valuesr   rO   re   )r   pooler_outputr   )rL   r   
ValueErrorrZ   r   r   r   r,   meanr   r   )r0   ry   r   r3   embedding_outputencoder_outputssequence_outputpooled_outputs           r5   r   zPixioModel.forward9  s      '#';;#C#C ?@@??<8+/<<8H_s<+t);;..9'+IT__-I-I+I1(LMRRWXRY)-')77
 	
r6   )NN)r>   r?   r@   r   r+   rF   r   r   r   r	   rQ   r   r   r   r   rC   rD   s   @r5   r   r   )  sv    	{ 	0&: 0  E2 -1,0
llT)
 #Tk

 
$
  3  
r6   r   zN
    Pixio backbone, to be used with frameworks like DETR and MaskFormer.
    )custom_introc            
       \    e Zd Zeee	 ddej                  dedz  de	fd                     Z
y)PixioBackboneNry   r   rM   c                    || j                   j                  }| j                  |      }| j                  |d      }|j                  }g }t        | j                  |      D ]  \  }}	|| j                  v s| j                   j                  r| j                  |	      }	| j                   j                  r|	dd| j                  j                  df   }	|j                  \  }
}}}| j                   j                  }|	j                  |
||z  ||z  d      }	|	j                  dddd      j!                         }	|j#                  |	        t%        t'        |      |r|	      S d	      S )
aw  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("facebook/pixio-huge")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/pixio-huge", out_features=["stage7", "stage15", "stage23", "stage31"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 1280, 16, 16]
        ```NTr   r^   r   r   rO   r   )feature_mapsr   )rL   r   rZ   r   r   zipstage_namesr1   r(   r   r)   r,   rg   r$   ri   rj   
contiguousr   r   r   )r0   ry   r   r3   r   outputr   r   stagehidden_stater~   r   r[   r\   r$   s                  r5   r   zPixioBackbone.forward\  s]   @  '#';;#C#C ??<8"&,,/?VZ,"[,,#&t'7'7#G 
	2E<)));;..#'>>,#?L;;44#/4??3O3O3Q0Q#RL3?3E3E0J65!%!7!7J#/#7#7
FjDXZ_cmZmoq#rL#/#7#71a#C#N#N#PL##L1
	2 |,+?-
 	
EI
 	
r6   r   )r>   r?   r@   r   r   r	   rQ   r   r   r   r   rH   r6   r5   r   r   V  sI      NR4
!LL4
@Dt4
	4
    4
r6   r   )r   r   r   r   )+rA   rQ   r   modeling_layersr   modeling_outputsr   r   r   utilsr	   r
   r   utils.genericr   utils.output_capturingr   dinov2.configuration_dinov2r   dinov2.modeling_dinov2r   r   r   vit.modeling_vitr   r   r   
get_loggerr>   loggerr   rF   ModulerK   r   r   r   r   r   r   r   r   __all__rH   r6   r5   <module>r      s      9 [ [ 8 8 7 5 6 
 T S 
		H	%z , z z	- 	Dbii DN	\ 		N 		y 	+ 2
299 
(	- 	 )
% )
 )
X 
8
N 8

8
v Qr6   