
    qij              	       8   d Z ddlZddlZddlmZ ddlZddlmZmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$  e!jJ                  e&      Z'e e d       G d de                    Z(dOdej                  de)de*dej                  fdZ+ G d dejX                        Z- G d dejX                        Z. G d dejX                        Z/ G d  d!ejX                        Z0 G d" d#e0      Z1 G d$ d%ejX                        Z2e0e1d&Z3 G d' d(ejX                        Z4 G d) d*ejX                        Z5 G d+ d,ejX                        Z6 G d- d.e      Z7 G d/ d0ejX                        Z8 G d1 d2ejX                        Z9e  G d3 d4e             Z:e  G d5 d6e:             Z; G d7 d8ejX                        Z< e d9       G d: d;e:             Z= e d<       G d= d>e:             Z> G d? d@ejX                        Z? G dA dBejX                        Z@ G dC dDejX                        ZA G dE dFejX                        ZB G dG dHejX                        ZCe  G dI dJe:             ZD e dK       G dL dMee:             ZEg dNZFy)PzPyTorch BEiT model.    N)	dataclass)Tensornn)CrossEntropyLoss   )initialization)ACT2FN)BackboneMixin)GradientCheckpointingLayer)BackboneOutputBaseModelOutputBaseModelOutputWithPoolingImageClassifierOutputMaskedLMOutputSemanticSegmenterOutput)PreTrainedModel)#compile_compatible_method_lru_cache)auto_docstringlogging	torch_int   )
BeitConfigz-
    Class for outputs of [`BeitModel`].
    )custom_introc                       e Zd ZdZy)BeitModelOutputWithPoolingaF  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    N)__name__
__module____qualname____doc__     X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/beit/modeling_beit.pyr   r   -   s    r!   r   input	drop_probtrainingreturnc                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

            r   r   )r   )dtypedevice)shapendimtorchrandr)   r*   floor_div)r#   r$   r%   	keep_probr+   random_tensoroutputs          r"   	drop_pathr4   <   s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr!   c                   x     e Zd ZdZd	dedz  ddf fdZdej                  dej                  fdZde	fdZ
 xZS )
BeitDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr$   r&   c                 0    t         |           || _        y N)super__init__r$   )selfr$   	__class__s     r"   r:   zBeitDropPath.__init__N   s    "r!   hidden_statesc                 D    t        || j                  | j                        S r8   )r4   r$   r%   r;   r=   s     r"   forwardzBeitDropPath.forwardR   s    FFr!   c                      d| j                    S )Nzp=)r$   r;   s    r"   
extra_reprzBeitDropPath.extra_reprU   s    DNN#$$r!   r8   )r   r   r   r   floatr:   r-   r   r@   strrC   __classcell__r<   s   @r"   r6   r6   K   sG    b#%$, #$ #GU\\ Gell G%C %r!   r6   c                        e Zd ZdZdeddf fdZdej                  dededej                  fd	Z		 dd
ej                  dej                  dz  dej                  fdZ xZS )BeitEmbeddingszc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    configr&   Nc                 2   t         |           t        j                  t	        j
                  dd|j                              | _        |j                  r:t        j                  t	        j
                  dd|j                              | _	        nd | _	        t        |      | _        |j                  | _        t        |j                  t        j                   j"                        r|j                  n|j                  |j                  f| _        | j                  j$                  }|j&                  r=t        j                  t	        j
                  d|dz   |j                              | _        nd | _        t        j*                  |j,                        | _        y )Nr   )r9   r:   r   	Parameterr-   zeroshidden_size	cls_tokenuse_mask_token
mask_tokenBeitPatchEmbeddingspatch_embeddings
patch_size
isinstance
image_sizecollectionsabcIterablenum_patches use_absolute_position_embeddingsposition_embeddingsDropouthidden_dropout_probdropout)r;   rJ   rZ   r<   s      r"   r:   zBeitEmbeddings.__init__a   s$   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO 3F ; ++ &++[__-E-EF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r!   
embeddingsheightwidthc                    |j                   d   dz
  }| j                  j                   d   dz
  }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  ddddf   }| j                  ddddf   }|j                   d   }|| j
                  z  }	|| j
                  z  }
t        |dz        }|j                  d|||      }|j                  dddd      }t        j                  j                  ||	|
fdd	
      }|j                  dddd      j                  dd|      }t        j                  ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r      bicubicFsizemodealign_cornersdim)r+   r\   r-   jit
is_tracingrT   r   reshapepermuter   
functionalinterpolateviewcat)r;   r`   ra   rb   rZ   num_positionsclass_pos_embedpatch_pos_embedrl   
new_height	new_widthsqrt_num_positionss               r"   interpolate_pos_encodingz'BeitEmbeddings.interpolate_pos_encodingx   s`    !&&q)A-0066q9A= yy##%+*F6UZ?+++221bqb59221ab59r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr!   pixel_valuesbool_masked_posc                    |j                   \  }}}}| j                  |      \  }\  }}|j                         \  }	}
}|K| j                  j	                  |	|
d      }|j                  d      j                  |      }|d|z
  z  ||z  z   }| j                  j	                  |	dd      }t        j                  ||fd      }| j                  || j                  |||      z   }| j                  |      }|||ffS Nrd   r   rk   )r+   rS   rh   rQ   expand	unsqueezetype_asrO   r-   rt   r\   r{   r_   )r;   r|   r}   _ra   rb   r`   patch_heightpatch_width
batch_sizeseq_lenmask_tokensw
cls_tokenss                 r"   r@   zBeitEmbeddings.forward   s   
 +001fe262G2G2U/
/\;!+!2
GQ&//00WbIK))"-55kBA#q1u-a?J^^**:r2>
YY
J7Q?
##/#d&C&CJPVX]&^^J\\*-
L+666r!   r8   )r   r   r   r   r   r:   r-   r   intr{   
BoolTensorr@   rF   rG   s   @r"   rI   rI   [   s    
>z >d >.&D5<< &D &DUX &D]b]i]i &DV 487ll7 ))D07 
	7r!   rI   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )rR   z
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                    t         |           |j                  |j                  }}|j                  |j
                  }}t        |t        j                  j                        r|n||f}t        |t        j                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|d   |d   z  |d   |d   z  f}|| _        || _        || _        || _
        || _        t        j                  ||||      | _        y )Nr   r   kernel_sizestride)r9   r:   rV   rT   num_channelsrN   rU   rW   rX   rY   rZ   patch_shaper   Conv2d
projection)	r;   rJ   rV   rT   r   rN   rZ   r   r<   s	           r"   r:   zBeitPatchEmbeddings.__init__   s   !'!2!2F4E4EJ
$*$7$79K9Kk#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L+:^hir!   r|   r&   c                 ^   |j                   \  }}}}|| j                  k7  rt        d      | j                  |j	                  | j                  j
                  j                              }|j                   d   |j                   d   }}|j                  d      j                  dd      }|||ffS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.re   r   r   )	r+   r   
ValueErrorr   toweightr)   flatten	transpose)	r;   r|   r   r   ra   rb   r`   r   r   s	            r"   r@   zBeitPatchEmbeddings.forward   s    2>2D2D/
L&%4,,,w  __\__T__5K5K5Q5Q%RS
$.$4$4Q$79I9I!9Lk''*44Q:
L+666r!   )	r   r   r   r   r:   r-   r   r@   rF   rG   s   @r"   rR   rR      s)    j"7ELL 7U\\ 7r!   rR   c                        e Zd Zddededz  ddf fdZ	 	 	 	 ddej                  dedej                  dz  d	ed
ee	   dz  deej                     eej                  ej                  f   z  fdZ
 xZS )BeitSelfAttentionNrJ   window_sizer&   c                 <   t         |           || _        |j                  |j                  z  dk7  r2t        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                  d      | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        t%        |      | _        | j&                  rt)        ||      | _        y y )	Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .F)biasr   )r9   r:   rJ   rN   num_attention_headshasattrr   r   attention_head_sizeall_head_sizer   Linearquerykeyvaluer]   attention_probs_dropout_probr_   boolhas_relative_position_biasBeitRelativePositionBiasrelative_position_biasr;   rJ   r   r<   s      r"   r:   zBeitSelfAttention.__init__   sP    : ::a?PVXhHi"6#5#5"6 7334A7 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1C%PYYv1143E3EF
zz&"E"EF*.{*;'***B6Wb*cD' +r!   r=   output_attentionsr   r{   
resolutionc                    |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }	| j                  |      j                  |d| j                  | j                        j                  dd      }
| j                  |      j                  |d| j                  | j                        j                  dd      }t        j                  |	|
j                  dd            }|t        j                  | j                        z  }| j                  r[|\  }}|| j                  j                  z  || j                  j                  z  f}|| j                  |||j                   d         z   }|||z   }t         j"                  j%                  |d      }| j'                  |      }t        j                  ||      }|j)                  dddd      j+                         }|j-                         d d | j.                  fz   } |j                  | }|r||f}|S |f}|S )	Nrd   r   re   dim_sizerk   r   r   )r+   r   rs   r   r   r   r   r   r-   matmulmathsqrtr   rJ   rT   r   r   rq   softmaxr_   rp   
contiguousrh   r   )r;   r=   r   r   r{   r   r   
seq_lengthr   query_layer	key_layervalue_layerattention_scoresra   rb   r   attention_probscontext_layernew_context_layer_shapeoutputss                       r"   r@   zBeitSelfAttention.forward   s2    %2$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 !<<Y5H5HR5PQ+dii8P8P.QQ **&MFE!T[[%;%;;UdkkF\F\=\]K/$2M2M5@S@STU@V 3N 3  
 "-/2HH --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD6G=/2 O\M]r!   r8   FNFNr   r   r   r   tupler:   r-   r   r   r   r@   rF   rG   s   @r"   r   r      s    dz d dPT d4 #(6:).(,9||9  9 !&t 3	9
 #'9 #J%9 
u||	uU\\5<<%?@	@9r!   r   c                       e Zd Z	 	 	 	 d	dej                  dedej                  dz  dedee   dz  deej                     eej                  ej                  f   z  fdZy)
BeitSdpaSelfAttentionNr=   r   r   r{   r   r&   c           	         |r,t         j                  | j                  j                   d       |j                  \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }	| j                  |      j                  |d| j                  | j                        j                  dd      }
| j                  |      j                  |d| j                  | j                        j                  dd      }d }| j                  rX|\  }}|| j                  j                  z  || j                  j                  z  f}| j                  |||j                  d         }|
||}n||z  }dt!        j"                  | j                        z  }t$        j&                  j(                  j+                  |	|
||| j,                  r| j                  j.                  ndd|      }|j1                  d	ddd
      j3                         }|j5                         d d | j6                  fz   } |j                  | }|d fS )Nz does not support `output_attentions=True`. The returned attention weights will be `None`. If you want to get attention weights, please set `attn_implementation='eager'` when loading the model.rd   r   re   r   r(   F)	attn_mask	dropout_p	is_causalscaler   r   r   )loggerwarning_oncer<   r   r+   r   rs   r   r   r   r   r   r   rJ   rT   r   r   r   r-   r   rq   scaled_dot_product_attentionr%   r   rp   r   rh   r   )r;   r=   r   r   r{   r   r   r   r   r   r   r   	attn_biasra   rb   r   scalingr   r   s                      r"   r@   zBeitSdpaSelfAttention.forward5  s=    >>**+ ,D D %2$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 	**&MFE!T[[%;%;;UdkkF\F\=\]K335@S@STU@V 4 I
 "- 2	33	dii 8 899++HHBF--dkk>>UX I 
 &--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CDd""r!   r   )	r   r   r   r-   r   r   r   r   r@   r    r!   r"   r   r   4  s     #(6:).(,:#||:#  :# !&t 3	:#
 #':# #J%:# 
u||	uU\\5<<%?@	@:#r!   r   c                   ~     e Zd ZdZdeddf fdZd	dej                  dej                  dej                  fdZ xZ	S )
BeitSelfOutputz
    The residual connection is defined in BeitLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
    rJ   r&   Nc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y r8   )	r9   r:   r   r   rN   denser]   r^   r_   r;   rJ   r<   s     r"   r:   zBeitSelfOutput.__init__x  sB    YYv1163E3EF
zz&"<"<=r!   r=   input_tensorc                 J    | j                  |      }| j                  |      }|S r8   r   r_   )r;   r=   r   gammas       r"   r@   zBeitSelfOutput.forward}  $    

=1]3r!   r8   )
r   r   r   r   r   r:   r-   r   r@   rF   rG   s   @r"   r   r   r  sD    
>z >d >
U\\  ^c^j^j r!   r   )eagersdpac                        e Zd Zddededz  ddf fdZ	 	 	 	 ddej                  dedej                  dz  d	ed
ee	   dz  deej                     eej                  ej                  f   z  fdZ
 xZS )BeitAttentionNrJ   r   r&   c                     t         |           t        |j                     ||      | _        t        |      | _        y )Nr   )r9   r:   BEIT_SELF_ATTENTION_CLASSES_attn_implementation	attentionr   r3   r   s      r"   r:   zBeitAttention.__init__  s5    4V5P5PQRXfqr$V,r!   r=   r   r   r{   r   c                 l    | j                  |||||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )r   r3   )	r;   r=   r   r   r{   r   self_outputsattention_outputr   s	            r"   r@   zBeitAttention.forward  sQ     ~~,.DF^`j
  ;;|AF#%QR(88r!   r8   r   r   rG   s   @r"   r   r     s    -z - -PT - #(6:).(,||   !&t 3	
 #' #J% 
u||	uU\\5<<%?@	@r!   r   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )BeitIntermediaterJ   r&   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r8   )r9   r:   r   r   rN   intermediate_sizer   rU   
hidden_actrE   r	   intermediate_act_fnr   s     r"   r:   zBeitIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r!   r=   c                 J    | j                  |      }| j                  |      }|S r8   )r   r   r?   s     r"   r@   zBeitIntermediate.forward  s&    

=100?r!   	r   r   r   r   r:   r-   r   r@   rF   rG   s   @r"   r   r     s1    9z 9d 9U\\ ell r!   r   c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )
BeitOutputrJ   r&   Nc                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r8   )
r9   r:   r   r   r   rN   r   r]   r^   r_   r   s     r"   r:   zBeitOutput.__init__  sB    YYv779K9KL
zz&"<"<=r!   r=   c                 J    | j                  |      }| j                  |      }|S r8   r   r?   s     r"   r@   zBeitOutput.forward  r   r!   r   rG   s   @r"   r   r     s1    >z >d >
U\\ ell r!   r   c                        e Zd ZdZddededz  deddf fdZ	 	 	 	 ddej                  d	e
d
ej                  dz  de
deeef   dz  deej                     eej                  ej                  f   z  fdZ xZS )	BeitLayerz?This corresponds to the Block class in the timm implementation.NrJ   r   drop_path_rater&   c                    t         |           |j                  | _        d| _        t	        ||      | _        t        |      | _        t        |      | _	        t        j                  |j                  |j                        | _        |dkD  rt        |      nt        j                          | _        t        j                  |j                  |j                        | _        |j&                  }|dkD  ryt        j(                  |t+        j,                  |j                        z  d      | _        t        j(                  |t+        j,                  |j                        z  d      | _        y d\  | _        | _        y )	Nr   r   epsr(   r   T)requires_grad)NN)r9   r:   chunk_size_feed_forwardseq_len_dimr   r   r   intermediater   r3   r   	LayerNormrN   layer_norm_epslayernorm_beforer6   Identityr4   layernorm_afterlayer_scale_init_valuerL   r-   oneslambda_1lambda_2)r;   rJ   r   r   init_valuesr<   s        r"   r:   zBeitLayer.__init__  s   '-'E'E$&v;G,V4 ( "V-?-?VEZEZ [9G#9Mn5SUS^S^S`!||F,>,>FDYDYZ33?LLuzz&BTBT7U)UeijDMLLuzz&BTBT7U)UeijDM+5(DM4=r!   r=   r   r   r{   r   c                    | j                  | j                  |      ||||      }|d   }|dd  }| j                  | j                  |z  }| j                  |      |z   }| j	                  |      }	| j                  |	      }	| j                  |	      }	| j                  | j                  |	z  }	| j                  |	      |z   }	|	f|z   }|S )Nr   r   r{   r   r   r   )r   r  r  r4   r  r   r3   r  )
r;   r=   r   r   r{   r   self_attention_outputsr   r   layer_outputs
             r"   r@   zBeitLayer.forward  s     "&!!-0/#9%=! "0 "
 2!4(, ==$#}}/?? '78=H ++M:((6{{<0==$==<7L ~~l3mC/G+r!   )Nr(   r   )r   r   r   r   r   r   rD   r:   r-   r   r   r   r@   rF   rG   s   @r"   r   r     s    I6z 6 6]b 6mq 6* #(6:).-1'||'  ' !&t 3	'
 #'' #s(Od*' 
u||	uU\\5<<%?@	@'r!   r   c                        e Zd Zdededdf fdZ ed      deeef   dej                  fd       Z
dd	edej                  fd
Z xZS )r   rJ   r   r&   Nc                     t         |           || _        d|d   z  dz
  d|d   z  dz
  z  dz   | _        t	        j
                  t        j                  | j                  |j                              | _	        y )Nre   r   r   r   )
r9   r:   r   num_relative_distancer   rL   r-   rM   r   relative_position_bias_tabler   s      r"   r:   z!BeitRelativePositionBias.__init__  sr    &&'+a.&81&<[QR^ASVWAW%X[\%\",.LLKK22F4N4NO-
)r!   
   )maxsizec                    d|d   z  dz
  d|d   z  dz
  z  dz   }|d   |d   z  }t        j                  t        j                  |d         t        j                  |d         d      }t        j                  |      }t        j                  |d      }|dddddf   |dddddf   z
  }|j                  ddd      j                         }|dddddfxx   |d   dz
  z  cc<   |dddddfxx   |d   dz
  z  cc<   |dddddfxx   d|d   z  dz
  z  cc<   t        j                  |dz   fdz  |j                        }|j                  d	      |ddddf<   |dz
  |dddf<   |dz
  |dddf<   |dz
  |d
<   |S )z
        This method creates the relative position index, modified to support arbitrary window sizes,
        as introduced in [MiDaS v3.1](https://huggingface.co/papers/2307.14460).
        re   r   r   r   ij)indexingN)rh   r)   rd   )r   r   )
r-   meshgridarangestackr   rp   r   rM   r)   sum)	r;   r   r  window_areagridcoordscoords_flattenrelative_coordsrelative_position_indexs	            r"    generate_relative_position_indexz9BeitRelativePositionBias.generate_relative_position_index  s    "#[^!3a!7AA<NQR<R SVW W "!n{1~5~~ell;q>:ELLUV<XcghT"vq1(At4~aqj7QQ)11!Q:EEG1a KNQ$66 1a KNQ$66 1a AA$6$:: "'++K!O3E3IQ`QfQf"g*9*=*=b*AAB')>)B12&)>)BA&(=(A%&&r!   r{   c                    d| j                   d   z  dz
  }d| j                   d   z  dz
  }d|d   z  dz
  }d|d   z  dz
  }| j                  }| j                  }	||z  dz   }
|d|	dz
   }|j                  d||d      j	                  dddd      }t
        j                  j                  |t        |      t        |      fd      }|j	                  dddd      j                  |
dz
  d      }t        j                  |||	dz
  d g      }| j                  |      }||j                  d         }|j                  |d   |d   z  dz   |d   |d   z  dz   d      }|j	                  ddd      j                         }|rCt
        j                  j                  |j                  d      ||fdd	
      j                  d      }|j                  d      S )zu
        Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes.
        re   r   r   r   Nrd   bilinear)rh   ri   Frg   )r   r  r  ro   rp   r   rq   rr   r   r-   rt   r!  rs   r   r   squeeze)r;   r   r{   r   
old_height	old_widthrx   ry    old_relative_position_bias_tableold_num_relative_distancenew_num_relative_distanceold_sub_tablenew_sub_table new_relative_position_bias_tabler   r   s                   r"   r@   z BeitRelativePositionBias.forward!  s-    ))!,,q0
((++a/	Q'!+
A&*	+/+L+L($($>$>!$.$:Q$>!89X;TWX;XY%--aJKSSTUWXZ[]^_11:!6	)8L MT^ 2 
 &--aAq9AAB[^_B_acd+099<=VYZ=Z=\]^,
( #'"G"G"T!ABYB^B^_aBb!c "8!<!<N[^+a/Q+a.1PST1TVX"
 "8!?!?1a!H!S!S!U#%']]%>%>&003)#	 &? &
 gaj # &//22r!   )FN)r   r   r   r   r   r:   r   r   r-   r   r!  r   r@   rF   rG   s   @r"   r   r     sm    
z 
 
$ 
 )4'E#s(O 'PUP\P\ ' 5'0-3T -3]b]i]i -3r!   r   c                        e Zd Zddededz  ddf fdZ	 	 	 	 	 ddej                  deded	ed
ee	e	f   dz  dedee
z  fdZ xZS )BeitEncoderNrJ   r   r&   c                    t         |           || _        |j                  | _        | j                  rt        ||      | _        t        j                  d|j                  |j                  d      D cg c]  }|j                          }}t        j                  t        |j                        D cg c]!  }t        ||j                   r|nd ||         # c}      | _        d| _        y c c}w c c}w )Nr   r   cpu)r*   )r   r   F)r9   r:   rJ   !use_shared_relative_position_biasr   r   r   r-   linspacer   num_hidden_layersitemr   
ModuleListranger   use_relative_position_biaslayergradient_checkpointing)r;   rJ   r   xdprir<   s         r"   r:   zBeitEncoder.__init__R  s    *0*R*R'***B6Wb*cD' "'63H3H&JbJbkp!qrAqvvxrr]] v778  /5/P/PVZ#&q6	

 ',# ss   5C.4&C3r=   r   output_hidden_statesr{   r   return_dictc                    |rdnd }|rdnd }t        | j                        D ]  \  }	}
|r||fz   }| j                  rY|\  }}|| j                  j                  z  || j                  j                  z  f}| j                  |||j                  d         }nd } |
|||||      }|d   }|s||d   fz   } |r||fz   }|st        d |||fD              S t        |||      S )Nr    r   )r{   r   r  r   c              3   &   K   | ]	  }||  y wr8   r    ).0vs     r"   	<genexpr>z&BeitEncoder.forward.<locals>.<genexpr>  s     mq_`_lms   )last_hidden_stater=   
attentions)		enumerater8  r   rJ   rT   r   r+   r   r   )r;   r=   r   r=  r{   r   r>  all_hidden_statesall_self_attentionsr<  layer_modulera   rb   r   r   layer_outputss                   r"   r@   zBeitEncoder.forwardg  s6    #7BD$5b4(4 	POA|#$58H$H!.. *%)?)??$++J`J`A`a)-)D)D:R]j]p]pqr]s *E *& *.&("3'=)A%M *!,M &9]1=M<O&O#1	P4   1]4D Dm]4EGZ$[mmm++*
 	
r!   r8   )FFFNT)r   r   r   r   r   r:   r-   r   r   r   r   r@   rF   rG   s   @r"   r.  r.  Q  s    ,z , ,PT ,0 #(%*).-1 /
||/
  /
 #	/

 #'/
 #s(Od*/
 /
 
	 /
r!   r.  c                   r     e Zd ZU eed<   dZdZdZdZdgZ	dgZ
dZ ej                          fd       Z xZS )	BeitPreTrainedModelrJ   beit)imager|   Tr   z.*relative_position_index.*c                    t         |   |       t        |t              rwt	        j
                  |j                         |j                  t	        j
                  |j                         |j                   t	        j
                  |j                         yyt        |t              r t	        j
                  |j                         yt        |t              rv|j                  it	        j                  |j                  | j                  j                         t	        j                  |j                   | j                  j                         yyy)zInitialize the weightsN)r9   _init_weightsrU   rI   initzeros_rO   rQ   r\   r   r  r   r  	constant_rJ   r  r  )r;   moduler<   s     r"   rP  z!BeitPreTrainedModel._init_weights  s     	f%fn-KK(()  ,F--.))5F667 6 89KK;;<	**v0R0RSv0R0RS + +r!   )r   r   r   r   __annotations__base_model_prefixinput_modalitiesmain_input_namesupports_gradient_checkpointing_no_split_modules"_keys_to_ignore_on_load_unexpected_supports_sdpar-   no_gradrP  rF   rG   s   @r"   rL  rL    sR    !$O&*#$*H)I&NU]]_T Tr!   rL  c                        e Zd Zddededdf fdZd Ze	 	 	 	 	 ddej                  dej                  dz  d	edz  d
edz  dededz  deez  fd       Z xZS )	BeitModelrJ   add_pooling_layerr&   Nc                    t         |   |       || _        t        |      | _        t        || j                  j                  j                        | _        |j                  rt        j                         n*t        j                  |j                  |j                        | _        |rt!        |      nd| _        | j%                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   r   N)r9   r:   rJ   rI   r`   r.  rS   r   encoderuse_mean_poolingr   r  r   rN   r  	layernorm
BeitPoolerpooler	post_init)r;   rJ   r`  r<   s      r"   r:   zBeitModel.__init__  s    
 	 (0"6t7W7W7c7cd $44BKKM",,vGYGY_e_t_t:u 	 ->j(4 	r!   c                 .    | j                   j                  S r8   r`   rS   rB   s    r"   get_input_embeddingszBeitModel.get_input_embeddings      ///r!   r|   r}   r   r=  r{   r>  c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||      \  }}	|j
                  dd }
| j                  ||||
||      }|d   }| j                  |      }| j                  | j                  |      nd}|s|||fn|f}||dd z   S t        |||j                  |j                        S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        N)r}   re   )r   r=  r   r>  r{   r   r   )rD  pooler_outputr=   rE  )rJ   r   r=  use_return_dictr`   r+   rb  rd  rf  r   r=   rE  )r;   r|   r}   r   r=  r{   r>  kwargsembedding_outputr   r   encoder_outputssequence_outputpooled_outputhead_outputss                  r"   r@   zBeitModel.forward  s$    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]"oolOo\!!''+
,,/!5!#%= ' 
 *!,..98<8OO4UY?L?XO];_n^pL/!""555)-')77&11	
 	
r!   )T)NNNFN)r   r   r   r   r   r:   rj  r   r-   r   r   r   r   r@   rF   rG   s   @r"   r_  r_    s    z d d &0  48)-,0).#',
ll,
 ))D0,
  $;	,

 #Tk,
 #',
 D[,
 
+	+,
 ,
r!   r_  c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )re  rJ   r&   Nc                     t         |           |j                  r1t        j                  |j
                  |j                        | _        y d | _        y )Nr   )r9   r:   rc  r   r   rN   r  rd  r   s     r"   r:   zBeitPooler.__init__  sA    KQKbKbBLL++1F1FG 	hl 	r!   r=   c                     | j                   0|d d dd d d f   }| j                  |j                  d            }|S |d d df   }|S )Nr   r   )rd  mean)r;   r=   patch_tokensrs  s       r"   r@   zBeitPooler.forward  sU    >>%(AB2L NN<+<+<Q+?@M
  *!Q$/Mr!   r   rG   s   @r"   re  re    s1    
z 
d 
	U\\ 	ell 	r!   re  a  
    Beit Model transformer with a 'language' modeling head on top. BEiT does masked image modeling by predicting
    visual tokens of a Vector-Quantize Variational Autoencoder (VQ-VAE), whereas other vision models like ViT and DeiT
    predict RGB pixel values. As a result, this class is incompatible with [`AutoModelForMaskedImageModeling`], so you
    will need to use [`BeitForMaskedImageModeling`] directly if you wish to do masked image modeling with BEiT.
    c                        e Zd Zdeddf fdZd Ze	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	e
dz  d
e
dz  de
de
dz  deez  fd       Z xZS )BeitForMaskedImageModelingrJ   r&   Nc                 H   t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _	        t        j                  |j                  |j                        | _        | j                          y )NFr`  r   )r9   r:   
num_labelsr_  rM  r   r   rN   r  rd  r   
vocab_sizelm_headrg  r   s     r"   r:   z#BeitForMaskedImageModeling.__init__  su      ++f>	 f&8&8f>S>STyy!3!3V5F5FG 	r!   c                      y r8   r    rB   s    r"   get_output_embeddingsz0BeitForMaskedImageModeling.get_output_embeddings'  s    r!   r|   r}   labelsr   r=  r{   r>  c                 h   ||n| j                   j                  }| j                  ||||||      }	|	d   }
| j                  |
      }
| j	                  |
ddddf         }d}|t               } |||   |      }|s|f|	dd z   }||f|z   S |S t        |||	j                  |	j                        S )a  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k")
        >>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k")

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, logits = outputs.loss, outputs.logits
        >>> list(logits.shape)
        [1, 196, 8192]
        ```N)r}   r   r=  r{   r>  r   r   losslogitsr=   rE  )	rJ   rn  rM  rd  r  r   r   r=   rE  )r;   r|   r}   r  r   r=  r{   r>  ro  r   rr  prediction_scoresmasked_lm_lossloss_fctr3   s                  r"   r@   z"BeitForMaskedImageModeling.forward*  s    \ &1%<k$++B]B]))+/!5%=#  
 "!*..9 LLAB)?@')H%&7&H&QN')GABK7F3A3M^%.YSYY$!//))	
 	
r!   )NNNNNFN)r   r   r   r   r:   r  r   r-   r   r   r   r   r   r@   rF   rG   s   @r"   r{  r{    s    z d   -137&*)-,0).#'J
llT)J
 ))D0J
 t#	J

  $;J
 #TkJ
 #'J
 D[J
 
	J
 J
r!   r{  z
    Beit Model transformer with an image classification head on top (a linear layer on top of the average of the final
    hidden states of the patch tokens) e.g. for ImageNet.
    c                        e Zd Zdeddf fdZe	 	 	 	 	 	 ddej                  dz  dej                  dz  dedz  dedz  d	ed
edz  de	e
z  fd       Z xZS )BeitForImageClassificationrJ   r&   Nc                 .   t         |   |       |j                  | _        t        |d      | _        |j                  dkD  r*t        j                  |j                  |j                        nt        j                         | _	        | j                          y )NTr}  r   )r9   r:   r~  r_  rM  r   r   rN   r  
classifierrg  r   s     r"   r:   z#BeitForImageClassification.__init__  ss      ++f=	 OUN_N_bcNc"))F$6$68I8IJikititiv 	r!   r|   r  r   r=  r{   r>  c                 \   ||n| j                   j                  }| j                  |||||      }|r|j                  n|d   }	| j	                  |	      }
d}|| j                  ||
| j                         }|s|
f|dd z   }||f|z   S |S t        ||
|j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r=  r{   r>  r   re   r  )	rJ   rn  rM  rm  r  loss_functionr   r=   rE  )r;   r|   r  r   r=  r{   r>  ro  r   rs  r  r  r3   s                r"   r@   z"BeitForImageClassification.forward  s    " &1%<k$++B]B]))/!5%=#  
 2=--'!*/%%ffdkkBDY,F)-)9TGf$EvE$!//))	
 	
r!   NNNNFN)r   r   r   r   r:   r   r-   r   r   r   r   r@   rF   rG   s   @r"   r  r  x  s    
z 
d 
  -1&*)-,0).#'*
llT)*
 t#*
  $;	*

 #Tk*
 #'*
 D[*
 
&	&*
 *
r!   r  c                        e Zd ZdZ	 	 	 ddededeeeef   z  deeeef   z  ez  dedeeeef   z  dd	f fd
Zde	j                  de	j                  fdZ xZS )BeitConvModuleaD  
    A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution
    layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    in_channelsout_channelsr   paddingr   dilationr&   Nc                     t         |           t        j                  ||||||      | _        t        j
                  |      | _        t        j                         | _        y )N)r  r  r   r  r   r  )	r9   r:   r   r   convBatchNorm2dbnReLU
activation)r;   r  r  r   r  r   r  r<   s          r"   r:   zBeitConvModule.__init__  sQ     	II#%#
	 ...'')r!   r#   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r8   )r  r  r  )r;   r#   r3   s      r"   r@   zBeitConvModule.forward  s0    5!(r!   )r   Fr   )r   r   r   r   r   r   rE   r   r:   r-   r   r@   rF   rG   s   @r"   r  r    s     01*+$$ $ 5c?*	$
 uS#X&,$ $ c3h'$ 
$*U\\ ell r!   r  c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZS )	BeitPyramidPoolingBlock
pool_scaler  channelsr&   Nc                     t         |           t        j                  |      t	        ||d      g| _        t        | j
                        D ]   \  }}| j                  t        |      |       " y )Nr   r   )	r9   r:   r   AdaptiveAvgPool2dr  layersrF  
add_modulerE   )r;   r  r  r  r<  r8  r<   s         r"   r:   z BeitPyramidPoolingBlock.__init__  sa      ,;a@
 "$++. 	+HAuOOCFE*	+r!   r#   c                 <    |}| j                   D ]
  } ||      } |S r8   )r  )r;   r#   hidden_stater8  s       r"   r@   zBeitPyramidPoolingBlock.forward  s*    [[ 	/E .L	/r!   )	r   r   r   r   r:   r-   r   r@   rF   rG   s   @r"   r  r    s?    +3 +S +C +D +U\\ ell r!   r  c            
            e Zd ZdZdeedf   dedededdf
 fd	Zd
ej                  de
ej                     fdZ xZS )BeitPyramidPoolingModulea  
    Pyramid Pooling Module (PPM) used in PSPNet.

    Args:
        pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid
            Module.
        in_channels (int): Input channels.
        channels (int): Channels after modules, before conv_seg.
        align_corners (bool): align_corners argument of F.interpolate.

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    pool_scales.r  r  rj   r&   Nc                    t         |           || _        || _        || _        || _        g | _        t        |      D ]I  \  }}t        |||      }| j                  j                  |       | j                  t        |      |       K y )N)r  r  r  )r9   r:   r  rj   r  r  blocksrF  r  appendr  rE   )	r;   r  r  r  rj   r<  r  blockr<   s	           r"   r:   z!BeitPyramidPoolingModule.__init__  s    &*& &{3 	+MAz+z{emnEKKu%OOCFE*	+r!   r:  c                     g }| j                   D ]Y  } ||      }t        j                  j                  ||j	                         dd  d| j
                        }|j                  |       [ |S )Nre   r#  rg   )r  r   rq   rr   rh   rj   r  )r;   r:  ppm_outsppmppm_outupsampled_ppm_outs         r"   r@   z BeitPyramidPoolingModule.forward	  sn    ;; 	/C!fG " 9 9affhqrl4K]K] !: ! OO-.	/ r!   )r   r   r   r   r   r   r   r:   r-   r   listr@   rF   rG   s   @r"   r  r    s[    
+E#s(O 
+# 
+QT 
+ei 
+nr 
+ $u||*< r!   r  c                   j     e Zd ZdZdeddf fdZd Zdej                  dej                  fdZ	 xZ
S )	BeitUperHeadz
    Unified Perceptual Parsing for Scene Understanding. This head is the implementation of
    [UPerNet](https://huggingface.co/papers/1807.10221).

    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    rJ   r&   Nc                    t         |           |j                  | _        |j                  gdz  | _        |j                  | _        d| _        t        j                  | j
                  |j                  d      | _
        t        | j                  | j                  d   | j
                  | j                        | _        t        | j                  d   t        | j                        | j
                  z  z   | j
                  dd      | _        t        j                          | _        t        j                          | _        | j                  d d D ]s  }t        || j
                  d      }t        | j
                  | j
                  dd      }| j"                  j'                  |       | j$                  j'                  |       u t        t        | j                        | j
                  z  | j
                  dd      | _        y )	N   Fr   r  rd   )rj   r   r   r  )r9   r:   r  rN   r  r  rj   r   r   r~  r  r  psp_modulesr  len
bottleneckr5  lateral_convs	fpn_convsr  fpn_bottleneck)r;   rJ   r  l_convfpn_convr<   s        r"   r:   zBeitUperHead.__init__  s   !--"../!3**"))DMM63D3DRST 4R MM,,	
 )R 3t'7'7#84==#HHMM	
  ]]_++CR0 	,K#KANF%dmmT]]PQ[\]H%%f-NN!!(+		, -  !DMM1MM	
r!   c                     |d   }|g}|j                  | j                  |             t        j                  |d      }| j	                  |      }|S r   )extendr  r-   rt   r  )r;   inputsr:  psp_outsr3   s        r"   psp_forwardzBeitUperHead.psp_forwardB  sL    2J3((+,99X1-*r!   encoder_hidden_statesc                 P   t        | j                        D cg c]  \  }} |||          }}}|j                  | j                  |             t	        |      }t        |dz
  dd      D ]V  }||dz
     j                  dd  }||dz
     t        j                  j                  ||   |d| j                        z   ||dz
  <   X t        |dz
        D cg c]  } | j                  |   ||          }}|j                  |d          t        |dz
  dd      D ]E  }t        j                  j                  ||   |d   j                  dd  d| j                        ||<   G t        j                  |d      }| j                  |      }| j                  |      }|S c c}}w c c}w )Nr   r   rd   re   r#  rg   rk   )rF  r  r  r  r  r6  r+   r   rq   rr   rj   r  r-   rt   r  r  )	r;   r  r<  lateral_convlateralsused_backbone_levels
prev_shapefpn_outsr3   s	            r"   r@   zBeitUperHead.forwardK  s   R[\`\n\nRopq,L!6q!9:pp(()>?@  #8}+a/B7 	A!!a%..qr2J&q1uo0I0I*:TM_M_ 1J 1 HQUO	 =BBVYZBZ<[\q%DNN1%hqk2\\%+a/B7 	A--33(1+"3"3AB"7jX\XjXj 4 HQK	 99X1-$$X.(3 q ]s   FF#)r   r   r   r   r   r:   r  r-   r   r@   rF   rG   s   @r"   r  r    s<    $
z $
d $
LU\\ ell r!   r  c                        e Zd ZdZ	 ddedededeeeef   z  ddf
 fdZd	ej                  dej                  fd
Z
 xZS )BeitFCNHeada  
    Fully Convolution Networks for Semantic Segmentation. This head is implemented of
    [FCNNet](https://huggingface.co/papers/1411.4038>).

    Args:
        config (BeitConfig): Configuration.
        in_channels
        kernel_size (int): The kernel size for convs in the head. Default: 3.
        dilation (int): The dilation rate for convs in the head. Default: 1.


    Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation.
    rJ   in_indexr   r  r&   Nc           
      <   t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _	        || _
        |dz  |z  }g }|j                  t        | j                  | j
                  |||             t        | j                  dz
        D ]5  }|j                  t        | j
                  | j
                  |||             7 | j                  dk(  rt        j                         | _        nt        j"                  | | _        | j                  r8t        | j                  | j
                  z   | j
                  ||dz        | _        t        j&                  | j
                  |j(                  d      | _        y )Nre   )r   r  r  r   r   r  r  )r9   r:   rN   r  auxiliary_channelsr  auxiliary_num_convs	num_convsauxiliary_concat_inputconcat_inputr  r  r  r6  r   r  convs
Sequentialconv_catr   r~  r  )	r;   rJ   r  r   r  conv_paddingr  r<  r<   s	           r"   r:   zBeitFCNHead.__init__x  sX    	!--1133"99 #q(H4  $--[R^iq	

 t~~)* 	ALLMM4==kS_jr	 >>QDJ.DJ*  4==0$--[bmqrbrDM ))DMM63D3DRSTr!   r  c                     || j                      }| j                  |      }| j                  r(| j                  t	        j
                  ||gd            }| j                  |      }|S )Nr   rk   )r  r  r  r  r-   rt   r  )r;   r  r=   r3   s       r"   r@   zBeitFCNHead.forward  sX    -dmm<M*]]599mV-D!#LMF(r!   )re   r   r   )r   r   r   r   r   r   r   r:   r-   r   r@   rF   rG   s   @r"   r  r  i  sp     no U  U,/ UBE UUX[`adfiai[jUj U	 UDU\\ ell r!   r  c                        e Zd Zdeddf fdZd Ze	 	 	 	 	 	 ddej                  dz  dej                  dz  de	dz  d	e	dz  d
e	de	dz  de
ez  fd       Z xZS )BeitForSemanticSegmentationrJ   r&   Nc                 x   t         |   |       |j                  | _        t        |d      | _        t        | j                  j                        dk7  rt        d      t        j                  t        j                  |j                  |j                  dd      t        j                  |j                        t        j                         t        j                  |j                  |j                  dd            | _        t        j                  t        j                  |j                  |j                  dd            | _        t        j"                         | _        t        j&                  dd      | _        t+        |      | _        |j.                  rt1        |      nd | _        | j5                          y )NFr}  r  zBeitForSemanticSegmentation requires config.out_indices to be a list of 4 integers, specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture.re   r   )r9   r:   r~  r_  rM  r  rJ   out_indicesr   r   r  ConvTranspose2drN   r  GELUfpn1fpn2r  fpn3	MaxPool2dfpn4r  decode_headuse_auxiliary_headr  auxiliary_headrg  r   s     r"   r:   z$BeitForSemanticSegmentation.__init__  sO     ++f>	 t{{&&'1,- 
 MMv1163E3EST]^_NN6--.GGIv1163E3EST]^_	
	 MMv1163E3EST]^_
	 KKM	LLQq9	 (/5;5N5Nk&1TX 	r!   c                 n   t         j                  j                  ||j                  dd  dd      }|0t         j                  j                  ||j                  dd  dd      }t	        | j
                  j                        } |||      }|}|% ||      }	|| j
                  j                  |	z  z  }|S )Nr   r#  Frg   )ignore_index)r   rq   rr   r+   r   rJ   semantic_loss_ignore_indexauxiliary_loss_weight)
r;   r  auxiliary_logitsr  upsampled_logitsupsampled_auxiliary_logitsr  	main_lossr  auxiliary_losss
             r"   compute_lossz(BeitForSemanticSegmentation.compute_loss  s    ==44bc*5 5 
 ')+)B)B v||BC'8zY^ *C *& $1W1WX-v6	'%&@&INDKK55FFDr!   r|   r  r   r=  r{   r>  c           	      R   ||n| j                   j                  }||n| j                   j                  }|$| j                   j                  dk(  rt	        d      | j                  ||d||      }|r|j                  n|d   }	t        |	      D 
cg c]#  \  }
}|
dz   | j                   j                  v s"|% }}
}|j                  d   }| j                   j                  | j                   j                  z  }|D cg c]3  }|ddddddf   j                  ddd      j                  |d||      5 }}| j                  | j                  | j                   | j"                  g}t%        t'        |            D ]  } ||   ||         ||<    | j)                  |      }d}| j*                  | j+                  |      }d}|| j-                  |||      }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t/        |||r|j                  nd|j0                  	      S c c}}
w c c}w )
a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, BeitForSemanticSegmentation
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")
        >>> model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr  r   re   rd   r  )rJ   rn  r=  r~  r   rM  r=   rF  r  r+   rV   rT   rp   ro   r  r  r  r  r6  r  r  r  r  r   rE  )r;   r|   r  r   r=  r{   r>  ro  r   r  idxfeaturefeaturesr   patch_resolutionr:  opsr<  r  r  r  r3   s                         r"   r@   z#BeitForSemanticSegmentation.forward  s_   H &1%<k$++B]B]$8$D $++JjJj 	 $++"8"8A"=NOO))/!%%=#  
 :E 5 5'RS* 1::O0PwWTWZ[T[_c_j_j_v_vTvGww!''*
;;11T[[5K5KKnv
ijAaQhK1a(00RAQScd
 

 yy$))TYY		:s8}% 	.A #a&!-HQK	. !!(+*#228<$$V-=vFD# WQR[0 WQR[0)-)9TGf$EvE&3G'//T))	
 	
; x
s   #H6H=8H$r  )r   r   r   r   r:   r  r   r-   r   r   r   r   r@   rF   rG   s   @r"   r  r    s    z d @&  -1&*)-,0).#'Y
llT)Y
 t#Y
  $;	Y

 #TkY
 #'Y
 D[Y
 
(	(Y
 Y
r!   r  zM
    BEiT backbone, to be used with frameworks like DETR and MaskFormer.
    c                   d     e Zd Z fdZd Ze	 	 	 d
dededz  dedz  dedz  def
d	       Z	 xZ
S )BeitBackbonec                    t         |   |       t        |j                  dz         D cg c]  }|j                   c}| _        t        |      | _        t        || j                  j                  j                        | _        |j                  rt        | j                  j                        dk7  rt!        d      |j                  }t#        j$                  t#        j&                  ||dd      t#        j(                  ||j*                        t#        j,                         t#        j&                  ||dd            | _        t#        j$                  t#        j&                  ||dd            | _        t#        j2                         | _        t#        j6                  dd      | _        | j;                          y c c}w )Nr   r   r  zBeitBackbone requires config.out_indices to be a list of 4 integers, specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of a base-sized architecture.re   r   r   )r9   r:   r6  r3  rN   num_featuresrI   r`   r.  rS   r   rb  add_fpnr  rJ   r  r   r   r  r  r  batch_norm_epsr  r  r  r  r  r  r  rg  )r;   rJ   r   rN   r<   s       r"   r:   zBeitBackbone.__init__<  sN    9>v?W?WZ[?[9\]AV//](0"6t7W7W7c7cd>>4;;**+q0 1 
 !,,K"";STU{0E0EF	"";STU	DI b&8&8k_`ij&klDIDI1=DI 	1 ^s   F?c                 .    | j                   j                  S r8   ri  rB   s    r"   rj  z!BeitBackbone.get_input_embeddingsY  rk  r!   Nr|   r=  r   r>  r&   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|j                  d   }| j                  |      \  }\  }}	|j                  dd }
| j                  |d||
|      }|r|j                  n|d   }d}t        | j                  |      D ]e  \  }}|| j                  v s| j                   j                  r5|ddddddf   }|j                  ddd      }|j                  |d||	      }||fz  }g | j                   j                  rY| j                  |d         | j!                  |d         | j#                  |d         | j%                  |d	         g}t'        |      }|s|r|f|dd z   }|S |f|dd z   }|S t)        ||r|j                  nd|j*                  
      S )a  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
        >>> model = AutoBackbone.from_pretrained(
        ...     "microsoft/beit-base-patch16-224", out_features=["stage1", "stage2", "stage3", "stage4"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 14, 14]
        ```Nr   re   T)r=  r   r   r>  r   r    rd   r   )feature_mapsr=   rE  )rJ   rn  r=  r   r+   r`   rb  r=   zipstage_namesout_featuresreshape_hidden_statesrp   ro   r  r  r  r  r  r   r   rE  )r;   r|   r=  r   r>  ro  r   rp  r   r   r   r   r=   r  stager  r3   s                    r"   r@   zBeitBackbone.forward\  s   F &1%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq!''*
8<8U55<!''+
,,!%/!#  
 2=--'!*#&t'7'7#G 	0E<)));;44#/12q#9L#/#7#71a#@L#/#7#7
BVa#bL/	0 ;;		,q/*		,q/*		,q/*		,q/*	L !.L#&712;6 M '712;6M%3G'//T))
 	
r!   )NNN)r   r   r   r:   rj  r   r   r   r   r@   rF   rG   s   @r"   r  r  6  so    :0  -1)-#'T
T
 #TkT
  $;	T

 D[T
 
T
 T
r!   r  )r  r{  r  r_  rL  r  )r(   F)Gr   collections.abcrW   r   dataclassesr   r-   r   r   torch.nnr    r   rQ  activationsr	   backbone_utilsr
   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   configuration_beitr   
get_loggerr   r   r   rD   r   r4   Moduler6   rI   rR   r   r   r   r   r   r   r   r   r   r.  rL  r_  re  r{  r  r  r  r  r  r  r  r  __all__r    r!   r"   <module>r     s      !   % & ! + 9  . @ 7 7 * 
		H	% 
!;  U\\ e T V[VbVb %299 % \7RYY \7~#7")) #7LQ		 Qh;#- ;#|RYY & ! BII 0ryy  
 
<* <~P3ryy P3fE
")) E
P T/ T T8 D
# D
 D
N & \
!4 \
\
~ 8
!4 8
8
v"RYY "Jbii ""ryy "JR299 Rj8")) 8v N
"5 N
 N
b 
v
="5 v

v
rr!   