
    qiR                        d dl Zd dl mZ d dlZd dlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZ ddlm Z   G d dejB                        Z" G d dejB                        Z#	 	 d:dejB                  dejH                  dejH                  dejH                  dejH                  dz  de%dz  de%dee   fdZ& G d dejB                        Z' G d  d!ejB                        Z( G d" d#ejB                        Z)d;d$ejH                  d%e%d&e*d'ejH                  fd(Z+ G d) d*ejB                        Z, G d+ d,ejB                        Z- G d- d.e      Z. G d/ d0ejB                        Z/e G d1 d2e             Z0e G d3 d4e0             Z1 ed56       G d7 d8ee0             Z2g d9Z3y)<    N)Callable)nn   )initialization)ACT2FN)BackboneMixin)GradientCheckpointingLayer)BackboneOutputBaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring
is_tracing)merge_with_config_defaults)capture_outputs   )PixioConfigc                   f     e Zd ZdZdef fdZddej                  dedej                  fdZ	 xZ
S )	PixioPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    configc                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        || _
        t        j                  ||||      | _        y )Nr   r   )kernel_sizestride)super__init__
image_size
patch_sizenum_channelshidden_size
isinstancecollectionsabcIterablenum_patchesr   Conv2d
projection)selfr   r   r    r!   r"   r'   	__class__s          Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/pixio/modeling_pixio.pyr   zPixioPatchEmbeddings.__init__/   s    !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&))L+:^hi    pixel_valuesinterpolate_pos_encodingreturnc                    |j                   \  }}}}|| j                  k7  rt        d| j                   d| d      |sV|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d		      | j	                  |      j                  d
      j                  dd
      }|S )NzoMake sure that the channel dimension of the pixel values match with the one set in the configuration. Expected z	 but got .r   r   zInput image size (*z) doesn't match model (z).   )shaper!   
ValueErrorr   r)   flatten	transpose)r*   r.   r/   
batch_sizer!   heightwidth
embeddingss           r,   forwardzPixioPatchEmbeddings.forward>   s    2>2D2D/
L&%4,,,!../yaI  (++u8J/J (% 9+,Adooa.@-AE  __\2::1=GG1M
r-   F)__name__
__module____qualname____doc__r   r   torchTensorboolr=   __classcell__r+   s   @r,   r   r   (   s;    j{ jELL D ]b]i]i r-   r   c                        e Zd ZdZdeddf fdZdej                  dededej                  fd	Z	d
ej                  dej                  fdZ
 xZS )PixioEmbeddingszB
    Construct the CLS tokens, position and patch embeddings.
    r   r0   Nc                 (   t         |           t        j                  t	        j
                  d|j                  |j                              | _        d | _	        t        |      | _        | j                  j                  }t        j                  t	        j
                  d||j                  z   |j                              | _        t        j                  |j                        | _        |j                  | _        |j"                  | _        || _        y )Nr   )r   r   r   	ParameterrC   randnn_cls_tokensr"   	cls_token
mask_tokenr   patch_embeddingsr'   position_embeddingsDropouthidden_dropout_probdropoutr    r   )r*   r   r'   r+   s      r,   r   zPixioEmbeddings.__init__T   s    ekk!V5H5H&J\J\&]^ 4V <++77#%<<A{VM`M`?`bhbtbt0u#v zz&"<"<="// ++r-   r<   r:   r;   c                 @   |j                   d   | j                  z
  }| j                  j                   d   | j                  z
  }t               s||k(  r||k(  r| j                  S | j                  ddd| j                  f   }| j                  dd| j                  df   }|j                   d   }|| j                  z  }	|| j                  z  }
t        |dz        }|j                  d|||      }|j                  dddd      }|j                  }t        j                  j                  |j                  t        j                        |	|
fdd	
      j                  |      }|j                  dddd      j                  dd|      }t        j                   ||fd      S )a#  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support tracing and interpolation at torch.float32 precision.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r   r4   bicubicF)sizemodealign_cornersdtypedim)r5   rM   rQ   r   r    intreshapepermuter\   r   
functionalinterpolatetorC   float32viewcat)r*   r<   r:   r;   r'   num_positionsclass_pos_embedpatch_pos_embedr^   
new_height	new_widthsqrt_num_positionstarget_dtypes                r,   r/   z(PixioEmbeddings.interpolate_pos_encodinga   s    !&&q)D,=,==0066q9D<M<MM|} <5+++2216I8I8I6I3IJ221d6G6G6I3IJr"t.
T__,	 !34)11!5GI[]`a)11!Q1=&,,--33u}}-i(	 4 

 "<"
  	 *11!Q1=BB1b#Nyy/?;CCr-   r.   c                 x   |j                   \  }}}}| j                  j                  j                  j                  }| j                  |j                  |            }| j                  j                  |dd      }t        j                  ||fd      }|| j                  |||      z   }| j                  |      }|S )Nr[   rV   r   r]   )r5   rP   r)   weightr\   rd   rN   expandrC   rg   r/   rT   )	r*   r.   r9   _r:   r;   rn   r<   
cls_tokenss	            r,   r=   zPixioEmbeddings.forward   s    '3'9'9$
Avu,,77>>DD**<???+NO
^^**:r2>
YY
J7Q?
$"?"?
FTY"ZZ
\\*-
r-   )r?   r@   rA   rB   r   r   rC   rD   r_   r/   r=   rF   rG   s   @r,   rI   rI   O   si    { t $D5<< $D $DUX $D]b]i]i $DLELL U\\ r-   rI   modulequerykeyvalueattention_maskscalingrT   kwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrV         r4   r   r]   )ptrainingr   )
rX   rC   matmulr8   r   rb   softmaxrT   r~   
contiguous)
rt   ru   rv   rw   rx   ry   rT   rz   attn_weightsattn_outputs
             r,   eager_attention_forwardr      s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r-   c                   z     e Zd Zdef fdZdej                  deej                  ej                  f   fdZ xZ	S )PixioSelfAttentionr   c                 2   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        |j                  | _        | j                  dz  | _        d| _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        t        j                  |j                  | j                  |j                         | _        y )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r2   r|   Fbias)r   r   r"   num_attention_headshasattrr6   r   r_   attention_head_sizeall_head_sizeattention_probs_dropout_probdropout_probry   	is_causalr   Linearqkv_biasru   rv   rw   r*   r   r+   s     r,   r   zPixioSelfAttention.__init__   sF    : ::a?PVXhHi"6#5#5"6 7334A7 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PP"??//5YYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
r-   hidden_statesr0   c           
         |j                   d   }|d| j                  | j                  f} | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      } | j                  |      j                  | j                  dd      }t        j                  | j                  j                  t              } || |||d | j                  | j                  | j                  sdn| j                         \  }}	|j#                         d d | j$                  fz   }
|j'                  |
      }||	fS )Nr   rV   r   r4           )r   ry   rT   )r5   r   r   rv   rf   r8   rw   ru   r   get_interfacer   _attn_implementationr   r   ry   r~   r   rX   r   r`   )r*   r   r9   	new_shape	key_layervalue_layerquery_layerattention_interfacecontext_layerattention_probsnew_context_layer_shapes              r,   r=   zPixioSelfAttention.forward   sF   "((+
D$<$<d>V>VV	0DHH]+00)<FFq!L	4djj/44i@JJ1aP4djj/44i@JJ1aP(?(M(MKK,,.E)
 *=nnLL#}}C$2C2C	*
& #0"4"4"6s";t?Q?Q>S"S%--.EFo--r-   )
r?   r@   rA   r   r   rC   rD   tupler=   rF   rG   s   @r,   r   r      s:    ]{ ](.U\\ .eELL%,,<V6W .r-   r   c                   x     e Zd ZdZdef fdZdej                  dej                  dej                  fdZ xZ	S )PixioSelfOutputz
    The residual connection is defined in PixioLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    r   c                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y N)	r   r   r   r   r"   denserR   rS   rT   r   s     r,   r   zPixioSelfOutput.__init__   sB    YYv1163E3EF
zz&"<"<=r-   r   input_tensorr0   c                 J    | j                  |      }| j                  |      }|S r   )r   rT   )r*   r   r   s      r,   r=   zPixioSelfOutput.forward   s$    

=1]3r-   )
r?   r@   rA   rB   r   r   rC   rD   r=   rF   rG   s   @r,   r   r      s=    
>{ >
U\\  RWR^R^ r-   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )PixioAttentionr   c                 b    t         |           t        |      | _        t	        |      | _        y r   )r   r   r   	attentionr   outputr   s     r,   r   zPixioAttention.__init__   s&    +F3%f-r-   r   r0   c                 R    | j                  |      \  }}| j                  ||      }|S r   )r   r   )r*   r   self_attn_outputrr   r   s        r,   r=   zPixioAttention.forward   s,    "nn];!-}=r-   	r?   r@   rA   r   r   rC   rD   r=   rF   rG   s   @r,   r   r      s*    .{ .
U\\ ell r-   r   input	drop_probr~   r0   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    r   r   r   )r   )r\   device)r5   ndimrC   randr\   r   floor_div)r   r   r~   	keep_probr5   random_tensorr   s          r,   	drop_pathr     s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr-   c                   x     e Zd ZdZd	dedz  ddf fdZdej                  dej                  fdZde	fdZ
 xZS )
PixioDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r0   c                 0    t         |           || _        y r   )r   r   r   )r*   r   r+   s     r,   r   zPixioDropPath.__init__  s    "r-   r   c                 D    t        || j                  | j                        S r   )r   r   r~   )r*   r   s     r,   r=   zPixioDropPath.forward  s    FFr-   c                      d| j                    S )Nzp=)r   r*   s    r,   
extra_reprzPixioDropPath.extra_repr  s    DNN#$$r-   r   )r?   r@   rA   rB   floatr   rC   rD   r=   strr   rF   rG   s   @r,   r   r     sG    b#%$, #$ #GU\\ Gell G%C %r-   r   c                   X     e Zd Zd fdZdej
                  dej
                  fdZ xZS )PixioMLPr0   c                 ~   t         |           |j                  x}}t        |j                  |j                  z        }t        j                  ||d      | _        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  ||d      | _        y )NTr   )r   r   r"   r_   	mlp_ratior   r   fc1r#   
hidden_actr   r   
activationfc2)r*   r   in_featuresout_featureshidden_featuresr+   s        r,   r   zPixioMLP.__init__  s    %+%7%77lf0063C3CCD99[/Ef''-$V%6%67DO$//DO99_lFr-   hidden_statec                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )r*   r   s     r,   r=   zPixioMLP.forward*  s2    xx-|4xx-r-   )r0   N)r?   r@   rA   r   rC   rD   r=   rF   rG   s   @r,   r   r     s$    	GELL U\\ r-   r   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )
PixioLayerr   r0   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |      | _        |j                  dkD  rt        |j                        nt        j                         | _        t        j                  |j                  |j
                        | _        t        |      | _        y )Nepsr   )r   r   r   	LayerNormr"   layer_norm_epsnorm1r   r   drop_path_rater   Identityr   norm2r   mlpr   s     r,   r   zPixioLayer.__init__2  s    \\&"4"4&:O:OP
'/AGAVAVY\A\v'<'<=bdbmbmbo\\&"4"4&:O:OP
F#r-   r   c                     | j                  |      }| j                  |      }| j                  |      |z   }| j                  |      }| j	                  |      }| j                  |      |z   }|S r   )r   r   r   r   r   )r*   r   hidden_states_normself_attention_outputlayer_outputs        r,   r=   zPixioLayer.forward<  sj    !ZZ6 $/A B'<=Mzz-0xx-~~l3mCr-   r   rG   s   @r,   r   r   1  s1    ${ $t $U\\ ell r-   r   c                   N     e Zd Zdef fdZddej                  dedefdZ	 xZ
S )PixioEncoderr   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r   r   r   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointingr*   r   rr   r+   s      r,   r   zPixioEncoder.__init__K  sN    ]]fF^F^@_#`1Jv$6#`a
&+# $as   A#r   output_hidden_statesr0   c                     |r|gnd }t        | j                        D ]!  \  }} ||      }|s|j                  |       # t        ||rt	        |            S d       S )N)last_hidden_stater   )	enumerater   appendr   r   )r*   r   r   all_hidden_statesilayer_modules         r,   r=   zPixioEncoder.forwardQ  so    /C]O(4 	8OA|(7M !((7	8
 +6G% 12
 	
MQ
 	
r-   r>   )r?   r@   rA   r   r   rC   rD   rE   r   r=   rF   rG   s   @r,   r   r   J  s.    ,{ ,

U\\ 

 

Zi 

r-   r   c                       e Zd ZU eed<   dZdZdZdZddgZ	dZ
dZdZdZeedZ ej$                         d	ej(                  ej*                  z  ej,                  z  fd
       Zy)PixioPreTrainedModelr   pixior.   )imageTrI   r   )r   
attentionsrt   c                 "   t        |t        j                  t        j                  f      rct	        j
                  |j                  d| j                  j                         |j                   t	        j                  |j                         yyt        |t        j                        r?t	        j                  |j                         t	        j                  |j                         yt        |t              rt	        j
                  |j                  d| j                  j                         t	        j
                  |j                  d| j                  j                         |j                    t	        j                  |j                          yyy)zInitialize the weightsr   )meanstdN)r#   r   r   r(   inittrunc_normal_rp   r   initializer_ranger   zeros_r   ones_rI   rQ   rN   rO   )r*   rt   s     r,   _init_weightsz"PixioPreTrainedModel._init_weightso  s     fryy"))45v}}3DKK<Y<YZ{{&FKK( '-KK$JJv}}%0v99IfIfgv//ct{{?\?\]  ,F--. - 1r-   N)r?   r@   rA   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsrC   no_gradr   r   r(   r   r   r-   r,   r   r   ^  s    $O!&*#*L9N"&#(
 U]]_/BII		$9BLL$H / /r-   r   c            
            e Zd Zdef fdZdefdZe ed      e		 	 dde
j                  dz  d	edz  defd
                     Z xZS )
PixioModelr   c                     t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y )Nr   )r   r   r   rI   r<   r   encoderr   r   r"   r   	layernorm	post_initr   s     r,   r   zPixioModel.__init__  sW     )&1#F+f&8&8f>S>STr-   r0   c                 .    | j                   j                  S r   r<   rP   r   s    r,   get_input_embeddingszPixioModel.get_input_embeddings      ///r-   F)tie_last_hidden_statesNr.   r   c                 b   || j                   j                  }|t        d      | j                  |      }| j	                  ||      }|j
                  }| j                  |      }|d d d | j                  j                  d d f   j                  d      }t        |||j                        S )Nz You have to specify pixel_valuesr   r   r]   )r   pooler_outputr   )r   r   r6   r<   r  r   r  rM   r   r   r   )r*   r.   r   rz   embedding_outputencoder_outputssequence_outputpooled_outputs           r,   r=   zPixioModel.forward  s      '#';;#C#C ?@@??<8+/<<8H_s<+t);;..9'+IT__-I-I+I1(LMRRWXRY)-')77
 	
r-   )NN)r?   r@   rA   r   r   r   r  r   r   r   rC   rD   rE   r   r=   rF   rG   s   @r,   r  r    sv    	{ 	0&: 0  E2 -1,0
llT)
 #Tk

 
$
  3  
r-   r  zN
    Pixio backbone, to be used with frameworks like DETR and MaskFormer.
    )custom_introc            
       z     e Zd Z fdZdefdZeee	 dde	j                  dedz  defd                     Z xZS )	PixioBackbonec                 X   t         |   |       t        |j                  dz         D cg c]  }|j                   c}| _        t        |      | _        t        |      | _	        t        j                  |j                  |j                        | _        | j                          y c c}w )Nr   r   )r   r   r   r   r"   num_featuresrI   r<   r   r  r   r   r   r  r  r   s      r,   r   zPixioBackbone.__init__  s     9>v?W?WZ[?[9\]AV//])&1#F+f&8&8f>S>ST 	 ^s   B'r0   c                 .    | j                   j                  S r   r  r   s    r,   r  z"PixioBackbone.get_input_embeddings  r  r-   Nr.   r   c                    || j                   j                  }| j                  |      }| j                  |d      }|j                  }g }t        | j                  |      D ]  \  }}	|| j                  v s| j                   j                  r| j                  |	      }	| j                   j                  r|	dd| j                  j                  df   }	|j                  \  }
}}}| j                   j                  }|	j                  |
||z  ||z  d      }	|	j                  dddd      j!                         }	|j#                  |	        t%        t'        |      |r|	      S d	      S )
aw  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("facebook/pixio-huge")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/pixio-huge", out_features=["stage7", "stage15", "stage23", "stage31"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 1280, 16, 16]
        ```NTr  rV   r   r   r   r4   )feature_mapsr   )r   r   r<   r  r   zipstage_namesr   apply_layernormr  reshape_hidden_statesrM   r5   r    r`   ra   r   r   r
   r   )r*   r.   r   rz   r  r   r   r)  stager   r9   rr   r:   r;   r    s                  r,   r=   zPixioBackbone.forward  s]   @  '#';;#C#C ??<8"&,,/?VZ,"[,,#&t'7'7#G 
	2E<)));;..#'>>,#?L;;44#/4??3O3O3Q0Q#RL3?3E3E0J65!%!7!7J#/#7#7
FjDXZ_cmZmoq#rL#/#7#71a#C#N#N#PL##L1
	2 |,+?-
 	
EI
 	
r-   r   )r?   r@   rA   r   r   r  r   r   r   rC   rD   rE   r
   r=   rF   rG   s   @r,   r$  r$    s\    
0&: 0  NR4
!LL4
@Dt4
	4
    4
r-   r$  )r  r   r$  )Nr   )r   F)4collections.abcr$   r   rC   r    r   r   activationsr   backbone_utilsr   modeling_layersr	   modeling_outputsr
   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   configuration_pixior   Moduler   rI   rD   r   r   r   r   r   rE   r   r   r   r   r   r   r  r$  __all__r  r-   r,   <module>r=     s  *  $   & ! + 9 [ [ F & C C 7 5 ,$299 $NDbii DZ !%II%<<% 
% <<	%
 LL4'% T\% % '(%8/. /.dbii "	RYY 	U\\ e T V[VbVb %BII %ryy &+ 2
299 
( /? / /B )
% )
 )
X 
G
M#7 G

G
T Br-   