
    qiJ              	          d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZmZmZ ddlmZ  ej4                  e      Zd=dedededz  defdZ G d dej>                        Z  G d dej>                        Z! G d dej>                        Z" G d dej>                        Z# G d dej>                        Z$ G d dej>                        Z% G d d ej>                        Z& G d! d"ej>                        Z' G d# d$ej>                        Z( G d% d&ej>                        Z) G d' d(e      Z* G d) d*ej>                        Z+e G d+ d,e             Z,e G d- d.e,             Z- ed/0       G d1 d2e,             Z. G d3 d4ej>                        Z/ G d5 d6ej>                        Z0 G d7 d8ej>                        Z1 ed90       G d: d;e,             Z2g d<Z3y)>zPyTorch MobileViT model.    N)nn)CrossEntropyLoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel)auto_docstringlogging	torch_int   )MobileViTConfigvaluedivisor	min_valuereturnc                 |    ||}t        |t        | |dz  z         |z  |z        }|d| z  k  r||z  }t        |      S )zU
    Ensure that all layers have a channel count that is divisible by `divisor`.
       g?)maxint)r   r   r   	new_values       b/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisibler   )   sS     	Is57Q;#677BWLMI3;W	y>    c                        e Zd Z	 	 	 	 	 	 ddedededededededed	ed
eez  ddf fdZdej                  dej                  fdZ
 xZS )MobileViTConvLayerconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 $   t         |           t        |dz
  dz        |z  }||z  dk7  rt        d| d| d      ||z  dk7  rt        d| d| d      t	        j
                  ||||||||d		      | _        |	r t	        j                  |d
ddd      | _        nd | _        |
rdt        |
t              rt        |
   | _        y t        |j                  t              rt        |j                     | _        y |j                  | _        y d | _        y )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r"   r#   r$   r%   paddingr(   r&   r'   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r-   	__class__s               r   r5   zMobileViTConvLayer.__init__7   s*    	{Q!+,x71$/}<STZS[[cdee& A%0>UV\U]]efgg99#%# 

 !#)$("D "&D.#."("8F--s3"():):";"("3"3"DOr   featuresc                     | j                  |      }| j                  | j                  |      }| j                  | j                  |      }|S N)r8   r:   r=   )r?   rA   s     r   forwardzMobileViTConvLayer.forwardm   sK    ##H-)))(3H??&x0Hr   )r   r   Fr   TT)__name__
__module____qualname__r   r   boolr<   r5   torchTensorrD   __classcell__r@   s   @r   r    r    6   s     "&%)4#4# 4# 	4#
 4# 4# 4# 4# 4#  4# s
4# 
4#l  r   r    c                   x     e Zd ZdZ	 ddedededededdf fd	Zd
ej                  dej                  fdZ	 xZ
S )MobileViTInvertedResidualzY
    Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
    r!   r"   r#   r%   r(   r   Nc           	      @   t         |           t        t        t	        ||j
                  z              d      }|dvrt        d| d      |dk(  xr ||k(  | _        t        |||d      | _	        t        |||d|||      | _
        t        |||dd	
      | _        y )N   )r   r   zInvalid stride .r   r"   r#   r$   r   )r"   r#   r$   r%   r&   r(   Fr"   r#   r$   r*   )r4   r5   r   r   roundexpand_ratior6   use_residualr    
expand_1x1conv_3x3
reduce_1x1)r?   r!   r"   r#   r%   r(   expanded_channelsr@   s          r   r5   z"MobileViTInvertedResidual.__init__{   s     	*3u[6CVCV5V/W+XZ[\vha899#q[K{l/J,:KYZ
 +)*$
 -)% 
r   rA   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  r||z   S |S rC   )rW   rX   rY   rV   )r?   rA   residuals      r   rD   z!MobileViTInvertedResidual.forward   sI    ??8,==*??8,&*&7&7x("EXEr   r   )rE   rF   rG   __doc__r   r   r5   rI   rJ   rD   rK   rL   s   @r   rN   rN   v   sc    
 jk
%
47
GJ
TW
cf
	
BF F Fr   rN   c                   t     e Zd Z	 ddedededededdf fdZd	ej                  dej                  fd
Z xZ	S )MobileViTMobileNetLayerr!   r"   r#   r%   
num_stagesr   Nc                     t         |           t        j                         | _        t        |      D ]5  }t        ||||dk(  r|nd      }| j                  j                  |       |}7 y )Nr   r   )r"   r#   r%   )r4   r5   r   
ModuleListlayerrangerN   append)	r?   r!   r"   r#   r%   ra   ird   r@   s	           r   r5   z MobileViTMobileNetLayer.__init__   sh     	]]_
z" 	'A-')!"avQ	E JJe$&K	'r   rA   c                 8    | j                   D ]
  } ||      } |S rC   rd   )r?   rA   layer_modules      r   rD   zMobileViTMobileNetLayer.forward   s$     JJ 	.L#H-H	.r   )r   r   
rE   rF   rG   r   r   r5   rI   rJ   rD   rK   rL   s   @r   r`   r`      sV    op'%'47'GJ'TW'il'	'   r   r`   c                   d     e Zd Zdededdf fdZdej                  dej                  fdZ xZ	S )MobileViTSelfAttentionr!   hidden_sizer   Nc                    t         |           ||j                  z  dk7  rt        d| d|j                   d      |j                  | _        t	        ||j                  z        | _        | j                  | j
                  z  | _        t        j                  || j                  |j                        | _
        t        j                  || j                  |j                        | _        t        j                  || j                  |j                        | _        t        j                  |j                        | _        y )Nr   zThe hidden size z4 is not a multiple of the number of attention heads rQ   )r'   )r4   r5   num_attention_headsr6   r   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyr   Dropoutattention_probs_dropout_probdropoutr?   r!   rn   r@   s      r   r5   zMobileViTSelfAttention.__init__   s    333q8";- 0334A7 
 $*#=#= #&{V5O5O'O#P !558P8PPYY{D,>,>V__U
99[$*<*<6??SYY{D,>,>V__U
zz&"E"EFr   hidden_statesc                    |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }t        j                  ||j                  dd            }|t        j                  | j                        z  }t        j                  j                  |d      }	| j                  |	      }	t        j                  |	|      }
|
j!                  dddd      j#                         }
|
j%                         d d | j&                  fz   } |
j                  | }
|
S )Nr   r   dimr   r   )shaperu   viewrp   rq   	transposerv   r   rI   matmulmathsqrtr   
functionalsoftmaxry   permute
contiguoussizerr   )r?   r{   
batch_size
seq_length_query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapes               r   rD   zMobileViTSelfAttention.forward   s   $1$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 !<<Y5H5HR5PQ+dii8P8P.QQ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CDr   rk   rL   s   @r   rm   rm      s<    G GS GT G&"U\\ "ell "r   rm   c                   d     e Zd Zdededdf fdZdej                  dej                  fdZ xZ	S )MobileViTSelfOutputr!   rn   r   Nc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y rC   r4   r5   r   rs   denserw   hidden_dropout_probry   rz   s      r   r5   zMobileViTSelfOutput.__init__   s6    YY{K8
zz&"<"<=r   r{   c                 J    | j                  |      }| j                  |      }|S rC   r   ry   r?   r{   s     r   rD   zMobileViTSelfOutput.forward   s$    

=1]3r   rk   rL   s   @r   r   r      s8    > >S >T >
U\\ ell r   r   c                   d     e Zd Zdededdf fdZdej                  dej                  fdZ xZ	S )MobileViTAttentionr!   rn   r   Nc                 f    t         |           t        ||      | _        t	        ||      | _        y rC   )r4   r5   rm   	attentionr   outputrz   s      r   r5   zMobileViTAttention.__init__  s*    /D)&+>r   r{   c                 J    | j                  |      }| j                  |      }|S rC   )r   r   )r?   r{   self_outputsattention_outputs       r   rD   zMobileViTAttention.forward  s%    ~~m4;;|4r   rk   rL   s   @r   r   r     s8    ? ?S ?T ?
 U\\  ell  r   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTIntermediater!   rn   intermediate_sizer   Nc                     t         |           t        j                  ||      | _        t        |j                  t              rt        |j                     | _	        y |j                  | _	        y rC   )
r4   r5   r   rs   r   r;   r>   r<   r   intermediate_act_fnr?   r!   rn   r   r@   s       r   r5   zMobileViTIntermediate.__init__  sR    YY{,=>
f''-'-f.?.?'@D$'-'8'8D$r   r{   c                 J    | j                  |      }| j                  |      }|S rC   )r   r   r   s     r   rD   zMobileViTIntermediate.forward  s&    

=100?r   rk   rL   s   @r   r   r     sA    9 9S 9UX 9]a 9U\\ ell r   r   c                        e Zd Zdedededdf fdZdej                  dej                  dej                  fd	Z xZ	S )
MobileViTOutputr!   rn   r   r   Nc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y rC   r   r   s       r   r5   zMobileViTOutput.__init__  s7    YY0+>
zz&"<"<=r   r{   input_tensorc                 T    | j                  |      }| j                  |      }||z   }|S rC   r   )r?   r{   r   s      r   rD   zMobileViTOutput.forward#  s.    

=1]3%4r   rk   rL   s   @r   r   r     sO    > >S >UX >]a >
U\\  RWR^R^ r   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTTransformerLayerr!   rn   r   r   Nc                 $   t         |           t        ||      | _        t	        |||      | _        t        |||      | _        t        j                  ||j                        | _        t        j                  ||j                        | _        y )Nr0   )r4   r5   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   s       r   r5   z"MobileViTTransformerLayer.__init__+  sq    +FK@1&+GXY%fk;LM "[f>S>S T!||KV=R=RSr   r{   c                     | j                  | j                  |            }||z   }| j                  |      }| j                  |      }| j	                  ||      }|S rC   )r   r   r   r   r   )r?   r{   r   layer_outputs       r   rD   z!MobileViTTransformerLayer.forward3  s\    >>$*?*?*NO(=8++M:((6{{<?r   rk   rL   s   @r   r   r   *  sF    T TS TUX T]a TU\\ ell r   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTTransformerr!   rn   ra   r   Nc           	          t         |           t        j                         | _        t        |      D ]A  }t        ||t        ||j                  z              }| j                  j                  |       C y )N)rn   r   )
r4   r5   r   rc   rd   re   r   r   	mlp_ratiorf   )r?   r!   rn   ra   r   transformer_layerr@   s         r   r5   zMobileViTTransformer.__init__>  sh    ]]_
z" 	1A 9'"%kF4D4D&D"E!
 JJ/0	1r   r{   c                 8    | j                   D ]
  } ||      } |S rC   ri   )r?   r{   rj   s      r   rD   zMobileViTTransformer.forwardJ  s%     JJ 	8L(7M	8r   rk   rL   s   @r   r   r   =  s@    
1 
1S 
1c 
1VZ 
1U\\ ell r   r   c                        e Zd ZdZ	 ddededededededed	d
f fdZdej                  d	e	ej                  e
f   fdZdej                  de
d	ej                  fdZdej                  d	ej                  fdZ xZS )MobileViTLayerzC
    MobileViT block: https://huggingface.co/papers/2110.02178
    r!   r"   r#   r%   rn   ra   r(   r   Nc                    t         |           |j                  | _        |j                  | _        |dk(  r)t        ||||dk(  r|nd|dkD  r|dz  nd      | _        |}nd | _        t        ||||j                        | _	        t        |||ddd      | _
        t        |||      | _        t        j                  ||j                        | _        t        |||d      | _        t        |d|z  ||j                        | _        y )	Nr   r   )r"   r#   r%   r(   rR   F)r"   r#   r$   r)   r*   )rn   ra   r   )r4   r5   
patch_sizepatch_widthpatch_heightrN   downsampling_layerr    conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)	r?   r!   r"   r#   r%   rn   ra   r(   r@   s	           r   r5   zMobileViTLayer.__init__U  s    	!,,"--Q;&?')!)QvA*2Q,QA'D# 'K&*D#*#$//	
 +#$# 
 0#!
 kv7L7LM1+ST 
 )KkW]WnWn
r   rA   c                 |   | j                   | j                  }}t        ||z        }|j                  \  }}}}t        j
                  j                         r$t        t	        j                  ||z        |z        n#t        t        j                  ||z        |z        }	t        j
                  j                         r$t        t	        j                  ||z        |z        n#t        t        j                  ||z        |z        }
d}|
|k7  s|	|k7  r't        j                  j                  ||	|
fdd      }d}|
|z  }|	|z  }||z  }|j                  ||z  |z  |||      }|j                  dd      }|j                  ||||      }|j                  dd      }|j                  ||z  |d      }||f||||||d	}||fS )
NFbilinearr   modealign_cornersTr   r   r   r}   )	orig_sizer   channelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r   r   rI   jit
is_tracingr   ceilr   r   r   r   reshaper   )r?   rA   r   r   
patch_arear   r   orig_height
orig_width
new_height	new_widthr   num_patch_widthnum_patch_heightr   patches	info_dicts                    r   	unfoldingzMobileViTLayer.unfolding  s   $($4$4d6G6G\|34
8@5
Hk: yy##% ejj|!;<|KLTYY{\9:\IJ 	 yy##% ejjk!9:[HITYYzK78;FG 	 
"jK&?}}00
I6ZW\ 1 H K ${2%5&8 ""!$44lOU`
 ##Aq)//*hZP##Aq)//*z"9;K &z2$ &&!0"2
	 	!!r   r   r   c                    | j                   | j                  }}t        ||z        }|d   }|d   }|d   }|d   }	|d   }
|j                         j	                  |||d      }|j                  dd      }|j                  ||z  |	z  |
||      }|j                  dd	      }|j                  |||	|z  |
|z        }|d
   r&t        j                  j                  ||d   dd      }|S )Nr   r   r   r   r   r}   r   r   r   r   r   r   Fr   )
r   r   r   r   r   r   r   r   r   r   )r?   r   r   r   r   r   r   r   r   r   r   rA   s               r   foldingzMobileViTLayer.folding  s&   $($4$4d6G6G\|34
|,
Z(.$%9:#$78 %%',,Z[RTU%%a+##!$44o|U`
 %%a+##"2\"A?U`C`
 ]#}}00y5JV[ 1 H r   c                    | j                   r| j                  |      }|}| j                  |      }| j                  |      }| j                  |      \  }}| j	                  |      }| j                  |      }| j                  ||      }| j                  |      }| j                  t        j                  ||fd            }|S Nr   r   )r   r   r   r   r   r   r   r   r   rI   cat)r?   rA   r\   r   r   s        r   rD   zMobileViTLayer.forward  s    ""..x8H ==*==* "^^H5 ""7+..) <<3''1;;uyy(H)=1EFr   r]   )rE   rF   rG   r^   r   r   r5   rI   rJ   tupledictr   r   rD   rK   rL   s   @r   r   r   P  s     8
8
 8
 	8

 8
 8
 8
 8
 
8
t1"%,, 1"5t9K3L 1"fu||   :  r   r   c            
       `     e Zd Zdeddf fdZ	 	 d	dej                  dededee	z  fdZ
 xZS )
MobileViTEncoderr!   r   Nc           	         t         
|           || _        t        j                         | _        d| _        dx}}|j                  dk(  rd}d}n|j                  dk(  rd}d}t        ||j                  d   |j                  d   dd      }| j
                  j                  |       t        ||j                  d   |j                  d   dd	      }| j
                  j                  |       t        ||j                  d   |j                  d	   d|j                  d   d
      }| j
                  j                  |       |r|dz  }t        ||j                  d	   |j                  d   d|j                  d   d|      }| j
                  j                  |       |r|dz  }t        ||j                  d   |j                  d   d|j                  d   d	|      }	| j
                  j                  |	       y )NFrP   T   r   r   )r"   r#   r%   ra   r   r   )r"   r#   r%   rn   ra      )r"   r#   r%   rn   ra   r(      )r4   r5   r!   r   rc   rd   gradient_checkpointingoutput_strider`   neck_hidden_sizesrf   r   hidden_sizes)r?   r!   dilate_layer_4dilate_layer_5r(   layer_1layer_2layer_3layer_4layer_5r@   s             r   r5   zMobileViTEncoder.__init__  s   ]]_
&+# +0/1$!N!N!!R'!N)00311!4
 	

'")00311!4
 	

'" 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"r   r{   output_hidden_statesreturn_dictc                     |rdnd }t        | j                        D ]  \  }} ||      }|s||fz   } |st        d ||fD              S t        ||      S )N c              3   &   K   | ]	  }||  y wrC   r  ).0vs     r   	<genexpr>z+MobileViTEncoder.forward.<locals>.<genexpr>S  s     Xq!-Xs   )last_hidden_stater{   )	enumeraterd   r   r	   )r?   r{   r  r  all_hidden_statesrg   rj   s          r   rD   zMobileViTEncoder.forwardD  sq     #7BD(4 	IOA|(7M#$58H$H!		I X]4E$FXXX-]noor   )FT)rE   rF   rG   r   r5   rI   rJ   rH   r   r	   rD   rK   rL   s   @r   r   r     s\    H# H#4 H#Z &+ 	p||p #p 	p
 
/	/pr   r   c                   z    e Zd ZU eed<   dZdZdZdZdgZ	 e
j                         dej                  dd	fd
       Zy	)MobileViTPreTrainedModelr!   	mobilevitpixel_values)imageTr   moduler   Nc                    t        |t        j                  t        j                  t        j                  f      rt        j                  |j                  d| j                  j                         |j                  t        j                  |j                         t        |dd      ^t        j                  |j                         t        j                  |j                         t        j                  |j                          yyt        |t        j"                        r?t        j                  |j                         t        j                  |j                         yy)zInitialize the weightsg        )meanstdNrunning_mean)r;   r   rs   r7   r9   initnormal_weightr!   initializer_ranger'   zeros_getattrr  ones_running_varnum_batches_trackedr   )r?   r  s     r   _init_weightsz&MobileViTPreTrainedModel._init_weightsa  s     fryy"))R^^DELLSdkk6S6ST{{&FKK(v~t4@F//0

6--.F667 A -KK$JJv}}% .r   )rE   rF   rG   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modulesrI   no_gradr   Moduler!  r  r   r   r  r  X  sT    #$O!&*#)*U]]_&BII &$ & &r   r  c                        e Zd Zd
dedef fdZe	 	 	 ddej                  dz  dedz  dedz  de	e
z  fd	       Z xZS )MobileViTModelr!   expand_outputc                 L   t         |   |       || _        || _        t	        ||j
                  |j                  d   dd      | _        t        |      | _	        | j                  r.t	        ||j                  d   |j                  d   d      | _
        | j                          y	)
aE  
        expand_output (`bool`, *optional*, defaults to `True`):
            Whether to expand the output of the model using a 1x1 convolution. If `True`, the model will apply an additional
            1x1 convolution to expand the output channels from `config.neck_hidden_sizes[5]` to `config.neck_hidden_sizes[6]`.
        r   r   r   )r"   r#   r$   r%   r      r   rR   N)r4   r5   r!   r,  r    num_channelsr   	conv_stemr   encoderconv_1x1_exp	post_init)r?   r!   r,  r@   s      r   r5   zMobileViTModel.__init__s  s     	 *+++11!4
 (/ 2"44Q7#55a8	!D 	r   Nr  r  r  r   c                    ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |      }| j                  |||      }| j                  r/| j                  |d         }t        j                  |ddgd      }n|d   }d }|s|||fn|f}	|	|dd  z   S t        |||j                  	      S )
Nz You have to specify pixel_valuesr  r  r   r~   r}   F)r   keepdimr   )r  pooler_outputr{   )r!   r  use_return_dictr6   r0  r1  r,  r2  rI   r  r
   r{   )
r?   r  r  r  kwargsembedding_outputencoder_outputsr  pooled_outputr   s
             r   rD   zMobileViTModel.forward  s    %9$D $++JjJj 	 &1%<k$++B]B]?@@>>,7,,!5# ' 
  $ 1 1/!2D E "JJ'8r2hPUVM / 2 M;H;T'7[lZnFOAB///7/')77
 	
r   )T)NNN)rE   rF   rG   r   rH   r5   r   rI   rJ   r   r
   rD   rK   rL   s   @r   r+  r+  q  sr     t >  -1,0#'	(
llT)(
 #Tk(
 D[	(
 
9	9(
 (
r   r+  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                        e Zd Zdeddf fdZe	 	 	 	 d
dej                  dz  dedz  dej                  dz  dedz  de	e
z  f
d	       Z xZS )MobileViTForImageClassificationr!   r   Nc                 |   t         |   |       |j                  | _        t        |      | _        t        j                  |j                  d      | _        |j                  dkD  r-t        j                  |j                  d   |j                        nt        j                         | _        | j                          y )NT)inplacer   r}   )r4   r5   
num_labelsr+  r  r   rw   classifier_dropout_probry   rs   r   Identity
classifierr3  r?   r!   r@   s     r   r5   z(MobileViTForImageClassification.__init__  s      ++'/ zz&"@"@$OJPJ[J[^_J_BIIf..r2F4E4EFegepeper 	
 	r   r  r  labelsr  c                 `   ||n| j                   j                  }| j                  |||      }|r|j                  n|d   }| j	                  | j                  |            }d}	|| j                  ||| j                         }	|s|f|dd z   }
|	|	f|
z   S |
S t        |	||j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr5  r   r   )losslogitsr{   )	r!   r8  r  r7  rE  ry   loss_functionr   r{   )r?   r  r  rG  r  r9  outputsr<  rJ  rI  r   s              r   rD   z'MobileViTForImageClassification.forward  s     &1%<k$++B]B]..DXfq.r1<--'!*m!<=%%ffdkkBDY,F)-)9TGf$EvE3!//
 	
r   NNNN)rE   rF   rG   r   r5   r   rI   rJ   rH   r   r   rD   rK   rL   s   @r   r?  r?    s     4   -1,0&*#'"
llT)"
 #Tk"
 t#	"

 D["
 
5	5"
 "
r   r?  c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTASPPPoolingr!   r"   r#   r   Nc           	          t         |           t        j                  d      | _        t        |||dddd      | _        y )Nr   )output_sizeTrelu)r"   r#   r$   r%   r)   r*   )r4   r5   r   AdaptiveAvgPool2dglobal_poolr    r   )r?   r!   r"   r#   r@   s       r   r5   zMobileViTASPPPooling.__init__  sB    //A>*#%"!
r   rA   c                     |j                   dd  }| j                  |      }| j                  |      }t        j                  j                  ||dd      }|S )Nr~   r   Fr   )r   rT  r   r   r   r   )r?   rA   spatial_sizes      r   rD   zMobileViTASPPPooling.forward
  sS    ~~bc*##H-==*==,,XLzin,or   rk   rL   s   @r   rO  rO    sA    
 
S 
PS 
X\ 
  r   rO  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTASPPz
    ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
    r!   r   Nc                 ~   t         |           |j                  d   }|j                  }t	        |j
                        dk7  rt        d      t        j                         | _	        t        |||dd      }| j                  j                  |       | j                  j                  |j
                  D cg c]  }t        |||d|d       c}       t        |||      }| j                  j                  |       t        |d|z  |dd      | _        t        j                  |j                   	      | _        y c c}w )
Nr~   r   z"Expected 3 values for atrous_ratesr   rR  rS   )r"   r#   r$   r(   r*   r   )p)r4   r5   r   aspp_out_channelslenatrous_ratesr6   r   rc   convsr    rf   extendrO  projectrw   aspp_dropout_probry   )r?   r!   r"   r#   in_projectionrate
pool_layerr@   s          r   r5   zMobileViTASPP.__init__  s(   ..r2//v""#q(ABB]]_
*#%!
 	

-(

 #//
  # +!- !!#)
	
 *&+|L


*%)L 0|YZkq
 zzF$<$<=)
s   5D:rA   c                     g }| j                   D ]  }|j                   ||              t        j                  |d      }| j	                  |      }| j                  |      }|S r   )r^  rf   rI   r   r`  ry   )r?   rA   pyramidconvpooled_featuress        r   rD   zMobileViTASPP.forwardB  s\    JJ 	+DNN4>*	+))G+,,w/,,7r   
rE   rF   rG   r^   r   r5   rI   rJ   rD   rK   rL   s   @r   rX  rX    s7    )> )>4 )>V  r   rX  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTDeepLabV3zJ
    DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
    r!   r   Nc           	          t         |           t        |      | _        t	        j
                  |j                        | _        t        ||j                  |j                  dddd      | _        y )Nr   FT)r"   r#   r$   r)   r*   r'   )r4   r5   rX  asppr   	Dropout2drC  ry   r    r[  rB  rE  rF  s     r   r5   zMobileViTDeepLabV3.__init__R  s]    !&)	||F$B$BC,00**# 
r   r{   c                 r    | j                  |d         }| j                  |      }| j                  |      }|S )Nr}   )rm  ry   rE  )r?   r{   rA   s      r   rD   zMobileViTDeepLabV3.forwardb  s6    99]2./<<)??8,r   ri  rL   s   @r   rk  rk  M  s6    
 
4 
 U\\ ell r   rk  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                        e Zd Zdeddf fdZe	 	 	 	 d
dej                  dz  dej                  dz  dedz  dedz  de	e
z  f
d	       Z xZS ) MobileViTForSemanticSegmentationr!   r   Nc                     t         |   |       |j                  | _        t        |d      | _        t        |      | _        | j                          y )NF)r,  )r4   r5   rB  r+  r  rk  segmentation_headr3  rF  s     r   r5   z)MobileViTForSemanticSegmentation.__init__o  sD      ++'eD!3F!; 	r   r  rG  r  r  c                 h   ||n| j                   j                  }||n| j                   j                  }|$| j                   j                  dk(  rt	        d      | j                  |d|      }|r|j                  n|d   }| j                  |      }d}	|Yt        j                  j                  ||j                  dd dd	      }
t        | j                   j                  
      } ||
|      }	|s|r
|f|dd z   }n	|f|dd z   }|	|	f|z   S |S t        |	||r|j                  d      S dd      S )a  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
        >>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTr5  r~   r   Fr   )ignore_indexr   )rI  rJ  r{   
attentions)r!   r  r8  rB  r6   r  r{   rs  r   r   r   r   r   semantic_loss_ignore_indexr   )r?   r  rG  r  r  r9  rL  encoder_hidden_statesrJ  rI  upsampled_logitsloss_fctr   s                r   rD   z(MobileViTForSemanticSegmentation.forwardy  sq   N %9$D $++JjJj 	 &1%<k$++B]B]$++"8"8A"=NOO..!%# ! 
 :E 5 5'RS*''(=>!}}88V\\"#.Zu  9   (T[[5[5[\H,f5D# WQR[0 WQR[0)-)9TGf$EvE&3G'//	
 	
 NR	
 	
r   rM  )rE   rF   rG   r   r5   r   rI   rJ   rH   r   r   rD   rK   rL   s   @r   rq  rq  i  s     4   -1&*,0#'L
llT)L
 t#L
 #Tk	L

 D[L
 
(	(L
 L
r   rq  )r?  rq  r+  r  )rP   N)4r^   r   rI   r   torch.nnr    r   r  activationsr   modeling_layersr   modeling_outputsr	   r
   r   r   modeling_utilsr   utilsr   r   r   configuration_mobilevitr   
get_loggerrE   loggerr   r   r)  r    rN   r`   rm   r   r   r   r   r   r   r   r   r  r+  r?  rO  rX  rk  rq  __all__r  r   r   <module>r     s        % & ! 9  . 7 7 4 
		H	%
# 
 
C$J 
RU 
= =@-F		 -F`bii .6RYY 6r	")) 		  	 BII 
bii 
		 &299 &f/ fR\pryy \p~ & & &0 I
- I
 I
X 3
&> 3
3
l299 08BII 8v 8 
X
'? X

X
vr   