
    qiD              	          d Z ddlZddlZddlmZ ddlmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ ddlmZ ddlmZmZmZ ddl m!Z!  ejD                  e#      Z$e ed       G d de                    Z%dIdejL                  de'de(dejL                  fdZ) G d de	jT                        Z+ G d de	jT                        Z, G d de	jT                        Z- G d  d!e	jT                        Z. G d" d#e.      Z/ G d$ d%e	jT                        Z0e.e/d&Z1 G d' d(e	jT                        Z2 G d) d*e	jT                        Z3 G d+ d,e	jT                        Z4 G d- d.e      Z5 G d/ d0e	jT                        Z6 G d1 d2e	jT                        Z7e G d3 d4e             Z8e G d5 d6e8             Z9 G d7 d8e	jT                        Z: ed9       G d: d;e8             Z; G d< d=e	jT                        Z< G d> d?e	jT                        Z= G d@ dAe	jT                        Z> G dB dCe	jT                        Z? G dD dEe	jT                        Z@e G dF dGe8             ZAg dHZBy)JzPyTorch Data2VecVision model.    N)	dataclass)Optional)nn)CrossEntropyLoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputSemanticSegmenterOutput)PreTrainedModel)#compile_compatible_method_lru_cache)auto_docstringlogging	torch_int   )Data2VecVisionConfigz7
    Class for outputs of [`Data2VecVisionModel`].
    )custom_introc                       e Zd ZdZy)$Data2VecVisionModelOutputWithPoolingaF  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    N)__name__
__module____qualname____doc__     g/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/data2vec/modeling_data2vec_vision.pyr   r   +   s    r   r   input	drop_probtrainingreturnc                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

            r   r   )r   )dtypedevice)shapendimtorchrandr&   r'   floor_div)r    r!   r"   	keep_probr(   random_tensoroutputs          r   	drop_pathr1   <   s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr   c                   x     e Zd ZdZd	dedz  ddf fdZdej                  dej                  fdZde	fdZ
 xZS )
Data2VecVisionDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr!   r#   c                 0    t         |           || _        y N)super__init__r!   )selfr!   	__class__s     r   r7   zData2VecVisionDropPath.__init__O   s    "r   hidden_statesc                 D    t        || j                  | j                        S r5   )r1   r!   r"   r8   r:   s     r   forwardzData2VecVisionDropPath.forwardS   s    FFr   c                      d| j                    S )Nzp=)r!   r8   s    r   
extra_reprz!Data2VecVisionDropPath.extra_reprV   s    DNN#$$r   r5   )r   r   r   r   floatr7   r*   Tensorr=   strr@   __classcell__r9   s   @r   r3   r3   L   sG    b#%$, #$ #GU\\ Gell G%C %r   r3   c                        e Zd ZdZdeddf fdZdej                  dededej                  fd	Z		 dd
ej                  dej                  dz  dej                  fdZ xZS )Data2VecVisionEmbeddingszc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    configr#   Nc                 2   t         |           t        j                  t	        j
                  dd|j                              | _        |j                  r:t        j                  t	        j
                  dd|j                              | _	        nd | _	        t        |      | _        |j                  | _        t        |j                  t        j                   j"                        r|j                  n|j                  |j                  f| _        | j                  j$                  }|j&                  r=t        j                  t	        j
                  d|dz   |j                              | _        nd | _        t        j*                  |j,                        | _        y )Nr   )r6   r7   r   	Parameterr*   zeroshidden_size	cls_tokenuse_mask_token
mask_tokenData2VecVisionPatchEmbeddingspatch_embeddings
patch_size
isinstance
image_sizecollectionsabcIterablenum_patches use_absolute_position_embeddingsposition_embeddingsDropouthidden_dropout_probdropout)r8   rH   rX   r9   s      r   r7   z!Data2VecVisionEmbeddings.__init__a   s$   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO =f E ++ &++[__-E-EF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r   
embeddingsheightwidthc                    |j                   d   dz
  }| j                  j                   d   dz
  }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  ddddf   }| j                  ddddf   }|j                   d   }|| j
                  z  }	|| j
                  z  }
t        |dz        }|j                  d|||      }|j                  dddd      }t        j                  j                  ||	|
fdd	
      }|j                  dddd      j                  dd|      }t        j                  ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r      bicubicFsizemodealign_cornersdim)r(   rZ   r*   jit
is_tracingrR   r   reshapepermuter   
functionalinterpolateviewcat)r8   r^   r_   r`   rX   num_positionsclass_pos_embedpatch_pos_embedrj   
new_height	new_widthsqrt_num_positionss               r   interpolate_pos_encodingz1Data2VecVisionEmbeddings.interpolate_pos_encodingx   s`    !&&q)A-0066q9A= yy##%+*F6UZ?+++221bqb59221ab59r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr   pixel_valuesbool_masked_posc                    |j                   \  }}}}| j                  |      \  }\  }}|j                         \  }	}
}|K| j                  j	                  |	|
d      }|j                  d      j                  |      }|d|z
  z  ||z  z   }| j                  j	                  |	dd      }t        j                  ||fd      }| j                  || j                  |||      z   }| j                  |      }|||ffS Nrb   r   ri   )r(   rQ   rf   rO   expand	unsqueezetype_asrM   r*   rr   rZ   ry   r]   )r8   rz   r{   _r_   r`   r^   patch_heightpatch_width
batch_sizeseq_lenmask_tokensw
cls_tokenss                 r   r=   z Data2VecVisionEmbeddings.forward   s   
 +001fe262G2G2U/
/\;!+!2
GQ&//00WbIK))"-55kBA#q1u-a?J^^**:r2>
YY
J7Q?
##/#d&C&CJPVX]&^^J\\*-
L+666r   r5   )r   r   r   r   r   r7   r*   rB   intry   
BoolTensorr=   rD   rE   s   @r   rG   rG   [   s    
>3 > >.&D5<< &D &DUX &D]b]i]i &DV 487ll7 ))D07 
	7r   rG   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )rP   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|d   |d   z  |d   |d   z  f}|| _        || _        || _        || _
        || _        t        j                  ||||      | _        y )Nr   r   kernel_sizestride)r6   r7   rT   rR   num_channelsrL   rS   rU   rV   rW   rX   patch_shaper   Conv2d
projection)	r8   rH   rT   rR   r   rL   rX   r   r9   s	           r   r7   z&Data2VecVisionPatchEmbeddings.__init__   s   !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L+:^hir   rz   r#   c                 ^   |j                   \  }}}}|| j                  k7  rt        d      | j                  |j	                  | j                  j
                  j                              }|j                   d   |j                   d   }}|j                  d      j                  dd      }|||ffS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.rc   r   r   )	r(   r   
ValueErrorr   toweightr&   flatten	transpose)	r8   rz   r   r   r_   r`   r^   r   r   s	            r   r=   z%Data2VecVisionPatchEmbeddings.forward   s    2>2D2D/
L&%4,,,w  __\__T__5K5K5Q5Q%RS
$.$4$4Q$79I9I!9Lk''*44Q:
L+666r   )	r   r   r   r   r7   r*   rB   r=   rD   rE   s   @r   rP   rP      s)    j"7ELL 7U\\ 7r   rP   c                        e Zd Zddededz  ddf fdZ	 	 	 	 ddej                  dedej                  dz  d	ed
ee	   dz  deej                     eej                  ej                  f   z  fdZ
 xZS )Data2VecVisionSelfAttentionNrH   window_sizer#   c                 <   t         |           || _        |j                  |j                  z  dk7  r2t        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                  d      | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        t%        |      | _        | j&                  rt)        ||      | _        y y )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .F)biasr   )r6   r7   rH   rL   num_attention_headshasattrr   r   attention_head_sizeall_head_sizer   Linearquerykeyvaluer[   attention_probs_dropout_probr]   boolhas_relative_position_bias"Data2VecVisionRelativePositionBiasrelative_position_biasr8   rH   r   r9   s      r   r7   z$Data2VecVisionSelfAttention.__init__   sP    : ::a?PVXhHi"6#5#5"6 7334A7 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1C%PYYv1143E3EF
zz&"E"EF*.{*;'***LVal*mD' +r   r:   output_attentionsr   ry   
resolutionc                    |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }	| j                  |      j                  |d| j                  | j                        j                  dd      }
| j                  |      j                  |d| j                  | j                        j                  dd      }t        j                  |	|
j                  dd            }|t        j                  | j                        z  }| j                  r[|\  }}|| j                  j                  z  || j                  j                  z  f}|| j                  |||j                   d         z   }|||z   }t         j"                  j%                  |d      }| j'                  |      }t        j                  ||      }|j)                  dddd      j+                         }|j-                         d d | j.                  fz   } |j                  | }|r||f}|S |f}|S )	Nrb   r   rc   dim_sizeri   r   r   )r(   r   rq   r   r   r   r   r   r*   matmulmathsqrtr   rH   rR   r   r   ro   softmaxr]   rn   
contiguousrf   r   )r8   r:   r   r   ry   r   r   
seq_lengthr   query_layer	key_layervalue_layerattention_scoresr_   r`   r   attention_probscontext_layernew_context_layer_shapeoutputss                       r   r=   z#Data2VecVisionSelfAttention.forward   s2    %2$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 !<<Y5H5HR5PQ+dii8P8P.QQ **&MFE!T[[%;%;;UdkkF\F\=\]K/$2M2M5@S@STU@V 3N 3  
 "-/2HH --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD6G=/2 O\M]r   r5   FNFN)r   r   r   r   tupler7   r*   rB   r   r   r=   rD   rE   s   @r   r   r      s    n3 n%$, nZ^ n4 #(6:).(,9||9  9 !&t 3	9
 #'9 #J%9 
u||	uU\\5<<%?@	@9r   r   c                       e Zd Z	 	 	 	 d	dej                  dedej                  dz  dedee   dz  deej                     eej                  ej                  f   z  fdZy)
Data2VecVisionSdpaSelfAttentionNr:   r   r   ry   r   r#   c           	         |r,t         j                  | j                  j                   d       |j                  \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }	| j                  |      j                  |d| j                  | j                        j                  dd      }
| j                  |      j                  |d| j                  | j                        j                  dd      }d }| j                  rX|\  }}|| j                  j                  z  || j                  j                  z  f}| j                  |||j                  d         }|
||}n||z  }dt!        j"                  | j                        z  }t$        j&                  j(                  j+                  |	|
||| j,                  r| j                  j.                  ndd|      }|j1                  d	ddd
      j3                         }|j5                         d d | j6                  fz   } |j                  | }|d fS )Nz does not support `output_attentions=True`. The returned attention weights will be `None`. If you want to get attention weights, please set `attn_implementation='eager'` when loading the model.rb   r   rc   r   r%   F)	attn_mask	dropout_p	is_causalscaler   r   r   )loggerwarning_oncer9   r   r(   r   rq   r   r   r   r   r   r   rH   rR   r   r   r   r*   r   ro   scaled_dot_product_attentionr"   r   rn   r   rf   r   )r8   r:   r   r   ry   r   r   r   r   r   r   r   	attn_biasr_   r`   r   scalingr   r   s                      r   r=   z'Data2VecVisionSdpaSelfAttention.forward8  s=    >>**+ ,D D %2$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 	**&MFE!T[[%;%;;UdkkF\F\=\]K335@S@STU@V 4 I
 "- 2	33	dii 8 899++HHBF--dkk>>UX I 
 &--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CDd""r   r   )	r   r   r   r*   rB   r   r   r   r=   r   r   r   r   r   7  s     #(6:).(,:#||:#  :# !&t 3	:#
 #':# #J%:# 
u||	uU\\5<<%?@	@:#r   r   c                   ~     e Zd ZdZdeddf fdZd	dej                  dej                  dej                  fdZ xZ	S )
Data2VecVisionSelfOutputz
    The residual connection is defined in Data2VecVisionLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    rH   r#   Nc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y r5   )	r6   r7   r   r   rL   denser[   r\   r]   r8   rH   r9   s     r   r7   z!Data2VecVisionSelfOutput.__init__|  sB    YYv1163E3EF
zz&"<"<=r   r:   input_tensorc                 J    | j                  |      }| j                  |      }|S r5   r   r]   )r8   r:   r   gammas       r   r=   z Data2VecVisionSelfOutput.forward  $    

=1]3r   r5   )
r   r   r   r   r   r7   r*   rB   r=   rD   rE   s   @r   r   r   v  sE    
>3 > >
U\\  ^c^j^j r   r   )eagersdpac                        e Zd Zddededz  ddf fdZ	 	 	 	 ddej                  dede	d	   d
edee
   dz  deej                     eej                  ej                  f   z  fdZ xZS )Data2VecVisionAttentionNrH   r   r#   c                     t         |           t        |j                     ||      | _        t        |      | _        y )Nr   )r6   r7   &DATA2VEC_VISION_SELF_ATTENTION_CLASSES_attn_implementation	attentionr   r0   r   s      r   r7   z Data2VecVisionAttention.__init__  s8    ?@[@[\
 /v6r   r:   r   r   r   ry   r   c                 l    | j                  |||||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )r   r0   )	r8   r:   r   r   ry   r   self_outputsattention_outputr   s	            r   r=   zData2VecVisionAttention.forward  sQ     ~~,.DF^`j
  ;;|AF#%QR(88r   r5   r   )r   r   r   r   r   r7   r*   rB   r   r   r   r=   rD   rE   s   @r   r   r     s    73 7%$, 7Z^ 7 #(QU).(,||   !))M N	
 #' #J% 
u||	uU\\5<<%?@	@r   r   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )Data2VecVisionIntermediaterH   r#   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r5   )r6   r7   r   r   rL   intermediate_sizer   rS   
hidden_actrC   r	   intermediate_act_fnr   s     r   r7   z#Data2VecVisionIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r   r:   c                 J    | j                  |      }| j                  |      }|S r5   )r   r   r<   s     r   r=   z"Data2VecVisionIntermediate.forward  s&    

=100?r   	r   r   r   r   r7   r*   rB   r=   rD   rE   s   @r   r   r     s2    93 9 9U\\ ell r   r   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )Data2VecVisionOutputrH   r#   Nc                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r5   )
r6   r7   r   r   r   rL   r   r[   r\   r]   r   s     r   r7   zData2VecVisionOutput.__init__  sB    YYv779K9KL
zz&"<"<=r   r:   c                 J    | j                  |      }| j                  |      }|S r5   r   r<   s     r   r=   zData2VecVisionOutput.forward  r   r   r   rE   s   @r   r   r     s2    >3 > >
U\\ ell r   r   c                        e Zd ZdZ	 ddededz  deddf fdZ	 	 	 	 ddej                  d	e
d
ej                  dz  de
deeef   dz  deej                     eej                  ej                  f   z  fdZ xZS )Data2VecVisionLayerz?This corresponds to the Block class in the timm implementation.NrH   r   drop_path_rater#   c                    t         |           |j                  | _        d| _        t	        ||      | _        t        |      | _        t        |      | _	        t        j                  |j                  |j                        | _        |dkD  rt        |      nt        j                          | _        t        j                  |j                  |j                        | _        |j&                  }|dkD  ryt        j(                  |t+        j,                  |j                        z  d      | _        t        j(                  |t+        j,                  |j                        z  d      | _        y d\  | _        | _        y )	Nr   r   epsr%   r   T)requires_grad)NN)r6   r7   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r0   r   	LayerNormrL   layer_norm_epslayernorm_beforer3   Identityr1   layernorm_afterlayer_scale_init_valuerJ   r*   oneslambda_1lambda_2)r8   rH   r   r   init_valuesr9   s        r   r7   zData2VecVisionLayer.__init__  s    	'-'E'E$0[Q6v>*62 "V-?-?VEZEZ [CQTWCW/?]_]h]h]j!||F,>,>FDYDYZ33?LLuzz&BTBT7U)UeijDMLLuzz&BTBT7U)UeijDM+5(DM4=r   r:   r   r   ry   r   c                    | j                  | j                  |      ||||      }|d   }|dd  }| j                  | j                  |z  }| j                  |      |z   }| j	                  |      }	| j                  |	      }	| j                  |	      }	| j                  | j                  |	z  }	| j                  |	      |z   }	|	f|z   }|S )Nr   r   ry   r   r   r   )r   r   r  r1   r  r   r0   r  )
r8   r:   r   r   ry   r   self_attention_outputsr   r   layer_outputs
             r   r=   zData2VecVisionLayer.forward  s     "&!!-0/#9%=! "0 "
 2!4(, ==$#}}/?? '78=H ++M:((6{{<0==$==<7L ~~l3mC/G+r   )Nr%   r   )r   r   r   r   r   r   rA   r7   r*   rB   r   r   r=   rD   rE   s   @r   r   r     s    I gj6*69>6^c6	6. #(6:).-1'||'  ' !&t 3	'
 #'' #s(Od*' 
u||	uU\\5<<%?@	@'r   r   c                        e Zd Zdededdf fdZ ed      deeef   dej                  fd       Z
dd	edej                  fd
Z xZS )r   rH   r   r#   Nc                     t         |           || _        d|d   z  dz
  d|d   z  dz
  z  dz   | _        t	        j
                  t        j                  | j                  |j                              | _	        y )Nrc   r   r   r   )
r6   r7   r   num_relative_distancer   rJ   r*   rK   r   relative_position_bias_tabler   s      r   r7   z+Data2VecVisionRelativePositionBias.__init__  sr    &&'+a.&81&<[QR^ASVWAW%X[\%\",.LLKK22F4N4NO-
)r   
   )maxsizec                    d|d   z  dz
  d|d   z  dz
  z  dz   }|d   |d   z  }t        j                  t        j                  |d         t        j                  |d         d      }t        j                  |      }t        j                  |d      }|dddddf   |dddddf   z
  }|j                  ddd      j                         }|dddddfxx   |d   dz
  z  cc<   |dddddfxx   |d   dz
  z  cc<   |dddddfxx   d|d   z  dz
  z  cc<   t        j                  |dz   fdz  |j                        }|j                  d	      |ddddf<   |dz
  |dddf<   |dz
  |dddf<   |dz
  |d
<   |S )z
        This method creates the relative position index, modified to support arbitrary window sizes,
        as introduced in [MiDaS v3.1](https://huggingface.co/papers/2307.14460).
        rc   r   r   r   ij)indexingN)rf   r&   rb   )r   r   )
r*   meshgridarangestackr   rn   r   rK   r&   sum)	r8   r   r  window_areagridcoordscoords_flattenrelative_coordsrelative_position_indexs	            r    generate_relative_position_indexzCData2VecVisionRelativePositionBias.generate_relative_position_index  s    "#[^!3a!7AA<NQR<R SVW W "!n{1~5~~ell;q>:ELLUV<XcghT"vq1(At4~aqj7QQ)11!Q:EEG1a KNQ$66 1a KNQ$66 1a AA$6$:: "'++K!O3E3IQ`QfQf"g*9*=*=b*AAB')>)B12&)>)BA&(=(A%&&r   ry   c                    d| j                   d   z  dz
  }d| j                   d   z  dz
  }d|d   z  dz
  }d|d   z  dz
  }| j                  }| j                  }	||z  dz   }
|d|	dz
   }|j                  d||d      j	                  dddd      }t
        j                  j                  |t        |      t        |      fd      }|j	                  dddd      j                  |
dz
  d      }t        j                  |||	dz
  d g      }| j                  |      }||j                  d         }|j                  |d   |d   z  dz   |d   |d   z  dz   d      }|j	                  ddd      j                         }|rCt
        j                  j                  |j                  d      ||fdd	
      j                  d      }|j                  d      S )zu
        Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
        rc   r   r   r   Nrb   bilinear)rf   rg   Fre   )r   r  r  rm   rn   r   ro   rp   r   r*   rr   r  rq   r   r   squeeze)r8   r   ry   r   
old_height	old_widthrv   rw    old_relative_position_bias_tableold_num_relative_distancenew_num_relative_distanceold_sub_tablenew_sub_table new_relative_position_bias_tabler  r   s                   r   r=   z*Data2VecVisionRelativePositionBias.forward.  s-    ))!,,q0
((++a/	Q'!+
A&*	+/+L+L($($>$>!$.$:Q$>!89X;TWX;XY%--aJKSSTUWXZ[]^_11:!6	)8L MT^ 2 
 &--aAq9AAB[^_B_acd+099<=VYZ=Z=\]^,
( #'"G"G"T!ABYB^B^_aBb!c "8!<!<N[^+a/Q+a.1PST1TVX"
 "8!?!?1a!H!S!S!U#%']]%>%>&003)#	 &? &
 gaj # &//22r   )FN)r   r   r   r   r   r7   r   r   r*   rB   r  r   r=   rD   rE   s   @r   r   r     sn    
3 
% 
D 
 )4'E#s(O 'PUP\P\ ' 5'0-3T -3]b]i]i -3r   r   c                        e Zd Zddededz  ddf fdZ	 	 	 	 	 ddej                  deded	ed
ee	e	f   dz  dedee
z  fdZ xZS )Data2VecVisionEncoderNrH   r   r#   c                    t         |           || _        |j                  | _        | j                  rt        ||      | _        t        j                  d|j                  |j                  d      D cg c]  }|j                          }}t        j                  t        |j                        D cg c]!  }t        ||j                   r|nd ||         # c}      | _        d| _        y c c}w c c}w )Nr   r   cpu)r'   )r   r   F)r6   r7   rH   !use_shared_relative_position_biasr   r   r   r*   linspacer   num_hidden_layersitemr   
ModuleListranger   use_relative_position_biaslayergradient_checkpointing)r8   rH   r   xdprir9   s         r   r7   zData2VecVisionEncoder.__init__`  s    *0*R*R'***LVal*mD' "'63H3H&JbJbkp!qrAqvvxrr]] v778  $/5/P/PVZ#&q6	

 ',# ss   5C.4&C3r:   r   output_hidden_statesry   r   return_dictc                    |rdnd }|rdnd }t        | j                        D ]  \  }	}
|r||fz   }| j                  rY|\  }}|| j                  j                  z  || j                  j                  z  f}| j                  |||j                  d         }nd } |
|||||      }|d   }|s||d   fz   } |r||fz   }|st        d |||fD              S t        |||      S )Nr   r   )ry   r   r  r   c              3   &   K   | ]	  }||  y wr5   r   ).0vs     r   	<genexpr>z0Data2VecVisionEncoder.forward.<locals>.<genexpr>  s     mq_`_lms   )last_hidden_stater:   
attentions)		enumerater5  r   rH   rR   r   r(   r   r   )r8   r:   r   r:  ry   r   r;  all_hidden_statesall_self_attentionsr9  layer_moduler_   r`   r   r   layer_outputss                   r   r=   zData2VecVisionEncoder.forwardu  s6    #7BD$5b4(4 	POA|#$58H$H!.. *%)?)??$++J`J`A`a)-)D)D:R]j]p]pqr]s *E *& *.&("3'=)A%M *!,M &9]1=M<O&O#1	P4   1]4D Dm]4EGZ$[mmm++*
 	
r   r5   )FFFNT)r   r   r   r   r   r7   r*   rB   r   r   r   r=   rD   rE   s   @r   r+  r+  _  s    ,3 ,%$, ,Z^ ,0 #(%*).-1 /
||/
  /
 #	/

 #'/
 #s(Od*/
 /
 
	 /
r   r+  c                   r     e Zd ZU eed<   dZdZdZdZdgZ	dgZ
dZ ej                          fd       Z xZS )	Data2VecVisionPreTrainedModelrH   data2vec_vision)imagerz   Tr   z.*relative_position_index.*c                    t         |   |       t        |t              rwt	        j
                  |j                         |j                  t	        j
                  |j                         |j                   t	        j
                  |j                         yyt        |t              r t	        j
                  |j                         yt        |t              rv|j                  it	        j                  |j                  | j                  j                         t	        j                  |j                   | j                  j                         yyy)zInitialize the weightsN)r6   _init_weightsrS   rG   initzeros_rM   rO   rZ   r   r  r   r  	constant_rH   r  r  )r8   moduler9   s     r   rM  z+Data2VecVisionPreTrainedModel._init_weights  s     	f%f67KK(()  ,F--.))5F667 6 BCKK;;< 34*v0R0RSv0R0RS + 5r   )r   r   r   r   __annotations__base_model_prefixinput_modalitiesmain_input_namesupports_gradient_checkpointing_no_split_modules"_keys_to_ignore_on_load_unexpected_supports_sdpar*   no_gradrM  rD   rE   s   @r   rI  rI    sU     ! )!$O&*#./*H)I&NU]]_T Tr   rI  c                        e Zd Zddededdf fdZd Ze	 	 	 	 	 ddej                  dej                  dz  d	edz  d
edz  dededz  deez  fd       Z xZS )Data2VecVisionModelrH   add_pooling_layerr#   Nc                    t         |   |       || _        t        |      | _        t        || j                  j                  j                        | _        |j                  rt        j                         n*t        j                  |j                  |j                        | _        |rt!        |      nd| _        | j%                          y)zw
        add_pooling_layer (bool, *optional*, defaults to `False`):
            Whether to add a pooling layer
        r   r   N)r6   r7   rH   rG   r^   r+  rQ   r   encoderuse_mean_poolingr   r   r   rL   r   	layernormData2VecVisionPoolerpooler	post_init)r8   rH   r]  r9   s      r   r7   zData2VecVisionModel.__init__  s    
 	 26:,VAaAaAmAmn $44BKKM",,vGYGY_e_t_t:u 	 7H*62T 	r   c                 .    | j                   j                  S r5   )r^   rQ   r?   s    r   get_input_embeddingsz(Data2VecVisionModel.get_input_embeddings  s    ///r   rz   r{   r   r:  ry   r;  c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||      \  }}	|j
                  dd }
| j                  ||||
||      }|d   }| j                  |      }| j                  | j                  |      nd}|s|||fn|f}||dd z   S t        |||j                  |j                        S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        N)r{   rc   )r   r:  r   r;  ry   r   r   )rA  pooler_outputr:   rB  )rH   r   r:  use_return_dictr^   r(   r_  ra  rc  r   r:   rB  )r8   rz   r{   r   r:  ry   r;  kwargsembedding_outputr   r   encoder_outputssequence_outputpooled_outputhead_outputss                  r   r=   zData2VecVisionModel.forward  s$    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]"oolOo\!!''+
,,/!5!#%= ' 
 *!,..98<8OO4UY?L?XO];_n^pL/!""5553-')77&11	
 	
r   )F)NNNFN)r   r   r   r   r   r7   rf  r   r*   rB   r   r   r   r=   rD   rE   s   @r   r\  r\    s    3  Y] &0  48)-,0).#',
ll,
 ))D0,
  $;	,

 #Tk,
 #',
 D[,
 
5	5,
 ,
r   r\  c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )rb  rH   r#   Nc                     t         |           |j                  r1t        j                  |j
                  |j                        | _        y d | _        y )Nr   )r6   r7   r`  r   r   rL   r   ra  r   s     r   r7   zData2VecVisionPooler.__init__  sA    KQKbKbBLL++1F1FG 	hl 	r   r:   c                     | j                   0|d d dd d d f   }| j                  |j                  d            }|S |d d df   }|S )Nr   r   )ra  mean)r8   r:   patch_tokensrn  s       r   r=   zData2VecVisionPooler.forward  sU    >>%(AB2L NN<+<+<Q+?@M
  *!Q$/Mr   r   rE   s   @r   rb  rb    s2    
3 
 
	U\\ 	ell 	r   rb  z
    Data2VecVision Model transformer with an image classification head on top (a linear layer on top of the average of
    the final hidden states of the patch tokens) e.g. for ImageNet.
    c                        e Zd Zdeddf fdZe	 	 	 	 	 	 ddej                  dz  dej                  dz  dedz  dedz  d	ed
edz  de	e
z  fd       Z xZS )$Data2VecVisionForImageClassificationrH   r#   Nc                 .   t         |   |       |j                  | _        t        |d      | _        |j                  dkD  r*t        j                  |j                  |j                        nt        j                         | _	        | j                          y )NTr]  r   )r6   r7   
num_labelsr\  rJ  r   r   rL   r   
classifierrd  r   s     r   r7   z-Data2VecVisionForImageClassification.__init__*  st      ++26TR OUN_N_bcNc"))F$6$68I8IJikititiv 	r   rz   labelsr   r:  ry   r;  c                 \   ||n| j                   j                  }| j                  |||||      }|r|j                  n|d   }	| j	                  |	      }
d}|| j                  ||
| j                         }|s|
f|dd z   }||f|z   S |S t        ||
|j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r:  ry   r;  r   rc   losslogitsr:   rB  )	rH   ri  rJ  rh  rz  loss_functionr   r:   rB  )r8   rz   r{  r   r:  ry   r;  rj  r   rn  r  r  r0   s                r   r=   z,Data2VecVisionForImageClassification.forward6  s    " &1%<k$++B]B]&&/!5%=# ' 
 2=--'!*/%%ffdkkBDY,F)-)9TGf$EvE$!//))	
 	
r   NNNNFN)r   r   r   r   r7   r   r*   rB   r   r   r   r=   rD   rE   s   @r   rv  rv  "  s    
3 
 
  -1&*)-,0).#'*
llT)*
 t#*
  $;	*

 #Tk*
 #'*
 D[*
 
&	&*
 *
r   rv  c                        e Zd ZdZ	 	 	 ddededeeeef   z  deeeef   z  ez  dedeeeef   z  dd	f fd
Zde	j                  de	j                  fdZ xZS )Data2VecVisionConvModuleaD  
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    in_channelsout_channelsr   paddingr   dilationr#   Nc                     t         |           t        j                  ||||||      | _        t        j
                  |      | _        t        j                         | _        y )N)r  r  r   r  r   r  )	r6   r7   r   r   convBatchNorm2dbnReLU
activation)r8   r  r  r   r  r   r  r9   s          r   r7   z!Data2VecVisionConvModule.__init__m  sQ     	II#%#
	 ...'')r   r    c                 l    | j                  |      }| j                  |      }| j                  |      }|S r5   )r  r  r  )r8   r    r0   s      r   r=   z Data2VecVisionConvModule.forward  s0    5!(r   )r   Fr   )r   r   r   r   r   r   rC   r   r7   r*   rB   r=   rD   rE   s   @r   r  r  e  s     01*+$$ $ 5c?*	$
 uS#X&,$ $ c3h'$ 
$*U\\ ell r   r  c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZS )	!Data2VecVisionPyramidPoolingBlock
pool_scaler  channelsr#   Nc                     t         |           t        j                  |      t	        ||d      g| _        t        | j
                        D ]   \  }}| j                  t        |      |       " y )Nr   r   )	r6   r7   r   AdaptiveAvgPool2dr  layersrC  
add_modulerC   )r8   r  r  r  r9  r5  r9   s         r   r7   z*Data2VecVisionPyramidPoolingBlock.__init__  sa      ,$[(J
 "$++. 	+HAuOOCFE*	+r   r    c                 <    |}| j                   D ]
  } ||      } |S r5   )r  )r8   r    hidden_stater5  s       r   r=   z)Data2VecVisionPyramidPoolingBlock.forward  s*    [[ 	/E .L	/r   )	r   r   r   r   r7   r*   rB   r=   rD   rE   s   @r   r  r    s?    +3 +S +C +D +U\\ ell r   r  c            
            e Zd ZdZdeedf   dedededdf
 fd	Zd
ej                  de
ej                     fdZ xZS )"Data2VecVisionPyramidPoolingModulea  
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        align_corners (bool): align_corners argument of F.interpolate.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    pool_scales.r  r  rh   r#   Nc                    t         |           || _        || _        || _        || _        g | _        t        |      D ]I  \  }}t        |||      }| j                  j                  |       | j                  t        |      |       K y )N)r  r  r  )r6   r7   r  rh   r  r  blocksrC  r  appendr  rC   )	r8   r  r  r  rh   r9  r  blockr9   s	           r   r7   z+Data2VecVisionPyramidPoolingModule.__init__  s    &*& &{3 	+MAz5%;E KKu%OOCFE*	+r   r7  c                     g }| j                   D ]Y  } ||      }t        j                  j                  ||j	                         dd  d| j
                        }|j                  |       [ |S )Nrc   r   re   )r  r   ro   rp   rf   rh   r  )r8   r7  ppm_outsppmppm_outupsampled_ppm_outs         r   r=   z*Data2VecVisionPyramidPoolingModule.forward  sn    ;; 	/C!fG " 9 9affhqrl4K]K] !: ! OO-.	/ r   )r   r   r   r   r   r   r   r7   r*   rB   listr=   rD   rE   s   @r   r  r    s[    +E#s(O +# +QT +ei +nr + $u||*< r   r  c                   j     e Zd ZdZdeddf fdZd Zdej                  dej                  fdZ	 xZ
S )	Data2VecVisionUperHeadz
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://huggingface.co/papers/1807.10221).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    rH   r#   Nc                    t         |           |j                  | _        |j                  gdz  | _        |j                  | _        d| _        t        j                  | j
                  |j                  d      | _
        t        | j                  | j                  d   | j
                  | j                        | _        t        | j                  d   t        | j                        | j
                  z  z   | j
                  dd      | _        t        j                          | _        t        j                          | _        | j                  d d D ]s  }t        || j
                  d      }t        | j
                  | j
                  dd      }| j"                  j'                  |       | j$                  j'                  |       u t        t        | j                        | j
                  z  | j
                  dd      | _        y )	N   Fr   r  rb   )rh   r   r   r  )r6   r7   r  rL   r  r  rh   r   r   ry  rz  r  psp_modulesr  len
bottleneckr2  lateral_convs	fpn_convsr  fpn_bottleneck)r8   rH   r  l_convfpn_convr9   s        r   r7   zData2VecVisionUperHead.__init__  s   !--"../!3**"))DMM63D3DRST >R MM,,	
 3R 3t'7'7#84==#HHMM	
  ]]_++CR0 	,K-k4==VWXF/t}}Z[efgH%%f-NN!!(+		, 7  !DMM1MM	
r   c                     |d   }|g}|j                  | j                  |             t        j                  |d      }| j	                  |      }|S r}   )extendr  r*   rr   r  )r8   inputsr7  psp_outsr0   s        r   psp_forwardz"Data2VecVisionUperHead.psp_forward  sL    2J3((+,99X1-*r   encoder_hidden_statesc                 P   t        | j                        D cg c]  \  }} |||          }}}|j                  | j                  |             t	        |      }t        |dz
  dd      D ]V  }||dz
     j                  dd  }||dz
     t        j                  j                  ||   |d| j                        z   ||dz
  <   X t        |dz
        D cg c]  } | j                  |   ||          }}|j                  |d          t        |dz
  dd      D ]E  }t        j                  j                  ||   |d   j                  dd  d| j                        ||<   G t        j                  |d      }| j                  |      }| j                  |      }|S c c}}w c c}w )Nr   r   rb   rc   r   re   ri   )rC  r  r  r  r  r3  r(   r   ro   rp   rh   r  r*   rr   r  rz  )	r8   r  r9  lateral_convlateralsused_backbone_levels
prev_shapefpn_outsr0   s	            r   r=   zData2VecVisionUperHead.forward  s   R[\`\n\nRopq,L!6q!9:pp(()>?@  #8}+a/B7 	A!!a%..qr2J&q1uo0I0I*:TM_M_ 1J 1 HQUO	 =BBVYZBZ<[\q%DNN1%hqk2\\%+a/B7 	A--33(1+"3"3AB"7jX\XjXj 4 HQK	 99X1-$$X.(3 q ]s   FF#)r   r   r   r   r   r7   r  r*   rB   r=   rD   rE   s   @r   r  r    s=    $
3 $
 $
LU\\ ell r   r  c                        e Zd ZdZ	 	 	 ddedededeeeef   z  ddf
 fdZd	ej                  dej                  fd
Z
 xZS )Data2VecVisionFCNHeada  
    Fully Convolution Networks for Semantic Segmentation. This head is implemented of
    [FCNNet](https://huggingface.co/papers/1411.4038>).

    Args:
        config (Data2VecVisionConfig): Configuration.
        in_channels
        kernel_size (int): The kernel size for convs in the head. Default: 3.
        dilation (int): The dilation rate for convs in the head. Default: 1.


    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    rH   in_indexr   r  r#   Nc           
      <   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _	        || _
        |dz  |z  }g }|j                  t        | j                  | j
                  |||             t        | j                  dz
        D ]5  }|j                  t        | j
                  | j
                  |||             7 | j                  dk(  rt        j                         | _        nt        j"                  | | _        | j                  r8t        | j                  | j
                  z   | j
                  ||dz        | _        t        j&                  | j
                  |j(                  d      | _        y )Nrc   )r   r  r  r   r   r  r  )r6   r7   rL   r  auxiliary_channelsr  auxiliary_num_convs	num_convsauxiliary_concat_inputconcat_inputr  r  r  r3  r   r   convs
Sequentialconv_catr   ry  rz  )	r8   rH   r  r   r  conv_paddingr  r9  r9   s	           r   r7   zData2VecVisionFCNHead.__init__*  sX    	!--1133"99 #q(H4$  $--[R^iq	

 t~~)* 	ALL(MM4==kS_jr	 >>QDJ.DJ4  4==0$--[bmqrbrDM ))DMM63D3DRSTr   r  c                     || j                      }| j                  |      }| j                  r(| j                  t	        j
                  ||gd            }| j                  |      }|S )Nr   ri   )r  r  r  r  r*   rr   rz  )r8   r  r:   r0   s       r   r=   zData2VecVisionFCNHead.forwardP  sX    -dmm<M*]]599mV-D!#LMF(r   )rc   r   r   )r   r   r   r   r   r   r   r7   r*   rB   r=   rD   rE   s   @r   r  r    sw    " *+$U$$U $U 	$U
 c3h'$U 
$ULU\\ ell r   r  c                        e Zd Zdeddf fdZd Ze	 	 	 	 	 	 ddej                  dz  dej                  dz  de	dz  d	e	dz  d
e	de	dz  de
ez  fd       Z xZS )%Data2VecVisionForSemanticSegmentationrH   r#   Nc                 x   t         |   |       |j                  | _        t        |d      | _        t        | j                  j                        dk7  rt        d      t        j                  t        j                  |j                  |j                  dd      t        j                  |j                        t        j                         t        j                  |j                  |j                  dd            | _        t        j                  t        j                  |j                  |j                  dd            | _        t        j"                         | _        t        j&                  dd      | _        t+        |      | _        |j.                  rt1        |      nd | _        | j5                          y )NFrx  r  zData2VecVisionForSemanticSegmentation requires config.out_indices to be a list of 4 integers, specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture.rc   r   )r6   r7   ry  r\  rJ  r  rH   out_indicesr   r   r  ConvTranspose2drL   r  GELUfpn1fpn2r   fpn3	MaxPool2dfpn4r  decode_headuse_auxiliary_headr  auxiliary_headrd  r   s     r   r7   z.Data2VecVisionForSemanticSegmentation.__init__]  sQ     ++26US t{{&&'1,- 
 MMv1163E3EST]^_NN6--.GGIv1163E3EST]^_	
	 MMv1163E3EST]^_
	 KKM	LLQq9	 2&9?E?X?X3F;^b 	r   c                 n   t         j                  j                  ||j                  dd  dd      }|0t         j                  j                  ||j                  dd  dd      }t	        | j
                  j                        } |||      }|}|% ||      }	|| j
                  j                  |	z  z  }|S )Nr   r   Fre   )ignore_index)r   ro   rp   r(   r   rH   semantic_loss_ignore_indexauxiliary_loss_weight)
r8   r  auxiliary_logitsr{  upsampled_logitsupsampled_auxiliary_logitsloss_fct	main_lossr  auxiliary_losss
             r   compute_lossz2Data2VecVisionForSemanticSegmentation.compute_loss}  s    ==44bc*5 5 
 ')+)B)B v||BC'8zY^ *C *& $1W1WX-v6	'%&@&INDKK55FFDr   rz   r{  r   r:  ry   r;  c           	      R   ||n| j                   j                  }||n| j                   j                  }|$| j                   j                  dk(  rt	        d      | j                  ||d||      }|r|j                  n|d   }	t        |	      D 
cg c]#  \  }
}|
dz   | j                   j                  v s"|% }}
}|j                  d   }| j                   j                  | j                   j                  z  }|D cg c]3  }|ddddddf   j                  ddd      j                  |d||      5 }}| j                  | j                  | j                   | j"                  g}t%        t'        |            D ]  } ||   ||         ||<    | j)                  |      }d}| j*                  | j+                  |      }d}|| j-                  |||      }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t/        |||r|j                  nd|j0                  	      S c c}}
w c c}w )
a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, Data2VecVisionForSemanticSegmentation
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/data2vec-vision-base")
        >>> model = Data2VecVisionForSemanticSegmentation.from_pretrained("facebook/data2vec-vision-base")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr}  r   rc   rb   r~  )rH   ri  r:  ry  r   rJ  r:   rC  r  r(   rT   rR   rn   rm   r  r  r  r  r3  r  r  r  r  r   rB  )r8   rz   r{  r   r:  ry   r;  rj  r   r  idxfeaturefeaturesr   patch_resolutionr7  opsr9  r  r  r  r0   s                         r   r=   z-Data2VecVisionForSemanticSegmentation.forward  sa   H &1%<k$++B]B]$8$D $++JjJj 	 $++"8"8A"=NOO&&/!%%=# ' 
 :E 5 5'RS* 1::O0PwWTWZ[T[_c_j_j_v_vTvGww!''*
;;11T[[5K5KKnv
ijAaQhK1a(00RAQScd
 

 yy$))TYY		:s8}% 	.A #a&!-HQK	. !!(+*#228<$$V-=vFD# WQR[0 WQR[0)-)9TGf$EvE&3G'//T))	
 	
; x
s   #H6H=8H$r  )r   r   r   r   r7   r  r   r*   rB   r   r   r   r=   rD   rE   s   @r   r  r  Z  s    3  @&  -1&*)-,0).#'Y
llT)Y
 t#Y
  $;	Y

 #TkY
 #'Y
 D[Y
 
(	(Y
 Y
r   r  )rv  r  r\  rI  )r%   F)Cr   collections.abcrU   r   dataclassesr   typingr   r*   r   torch.nnr    r   rN  activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   configuration_data2vec_visionr   
get_loggerr   r   r   rB   rA   r   r1   Moduler3   rG   rP   r   r   r   r   r   r   r   r   r   r+  rI  r\  rb  rv  r  r  r  r  r  r  __all__r   r   r   <module>r     s|   $   !    % & ! 9  . @ 7 7 ? 
		H	% +E  U\\ e T V[VbVb  %RYY %\7ryy \7@#7BII #7NQ")) Qj;#&A ;#~ryy & )+* &bii 6 "
299 
>4 >DP3 P3hE
BII E
P TO T T8 D
7 D
 D
P299 & 8
+H 8
8
x"ryy "L		 $$ $PRRYY Rl<BII <~ N
,I N
 N
br   