
    qiW              	          d Z ddlZddlZddlmZ ddlZddlmc mZ	 ddlmZ ddl
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ  ej2                  e      Zd+dej8                  dededej8                  fdZ G d dej@                        Z! G d dej@                        Z" G d dej@                        Z# G d dej@                        Z$ G d dej@                        Z% G d dej@                        Z& G d dej@                        Z' G d  d!ej@                        Z(e G d" d#e             Z)e G d$ d%e)             Z* ed&'       G d( d)e)             Z+g d*Z,y),zPyTorch PVT model.    N)Iterable)nn   )initialization)ACT2FN)BaseModelOutputImageClassifierOutput)PreTrainedModel)auto_docstringlogging   )	PvtConfiginput	drop_probtrainingreturnc                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

            r   r   )r   )dtypedevice)shapendimtorchrandr   r   floor_div)r   r   r   	keep_probr   random_tensoroutputs          V/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/pvt/modeling_pvt.py	drop_pathr!   &   s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FM    c                   x     e Zd ZdZd	dedz  ddf fdZdej                  dej                  fdZde	fdZ
 xZS )
PvtDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   r   c                 0    t         |           || _        y N)super__init__r   )selfr   	__class__s     r    r(   zPvtDropPath.__init__9   s    "r"   hidden_statesc                 D    t        || j                  | j                        S r&   )r!   r   r   r)   r+   s     r    forwardzPvtDropPath.forward=   s    FFr"   c                      d| j                    S )Nzp=)r   )r)   s    r    
extra_reprzPvtDropPath.extra_repr@   s    DNN#$$r"   r&   )__name__
__module____qualname____doc__floatr(   r   Tensorr.   strr0   __classcell__r*   s   @r    r$   r$   6   sG    b#%$, #$ #GU\\ Gell G%C %r"   r$   c                        e Zd ZdZ	 ddedeee   z  deee   z  dedededef fd	Zd
e	j                  dedede	j                  fdZde	j                  dee	j                  eef   fdZ xZS )PvtPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    config
image_size
patch_sizestridenum_channelshidden_size	cls_tokenc                    t         	|           || _        t        |t        j
                  j                        r|n||f}t        |t        j
                  j                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _	        || _
        t        j                  t        j                  d|r|dz   n||            | _        |r*t        j                  t        j                   dd|            nd | _        t        j$                  ||||      | _        t        j(                  ||j*                        | _        t        j.                  |j0                        | _        y )Nr   r   kernel_sizer?   eps)p)r'   r(   r<   
isinstancecollectionsabcr   r=   r>   r@   num_patchesr   	Parameterr   randnposition_embeddingszerosrB   Conv2d
projection	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropout)
r)   r<   r=   r>   r?   r@   rA   rB   rL   r*   s
            r    r(   zPvtPatchEmbeddings.__init__K   s0    	#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&#%<<KKi;?[+V$
  JSekk!Q&DEX\))L+6Zde,,{8M8MNzzF$>$>?r"   
embeddingsheightwidthr   c                    ||z  }t         j                  j                         s<|| j                  j                  | j                  j                  z  k(  r| j
                  S |j                  d||d      j                  dddd      }t        j                  |||fd      }|j                  dd||z        j                  ddd      }|S )Nr   r   r      bilinear)sizemode)
r   jit
is_tracingr<   r=   rO   reshapepermuteFinterpolate)r)   rY   rZ   r[   rL   interpolated_embeddingss         r    interpolate_pos_encodingz+PvtPatchEmbeddings.interpolate_pos_encodingg   s    un yy##%+9O9ORVR]R]RhRh9h*h+++''65"=EEaAqQ
"#--
&%Wa"b"9"A"A!RRW"X"`"`abdegh"i&&r"   pixel_valuesc                    |j                   \  }}}}|| j                  k7  rt        d      | j                  |      }|j                   ^ }}}|j	                  d      j                  dd      }| j                  |      }| j                  | j                  j                  |dd      }	t        j                  |	|fd      }| j                  | j                  d d dd f   ||      }
t        j                  | j                  d d d df   |
fd      }
n| j                  | j                  ||      }
| j                  ||
z         }|||fS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r^   r   r]   dim)r   r@   
ValueErrorrR   flatten	transposerU   rB   expandr   catri   rO   rX   )r)   rj   
batch_sizer@   rZ   r[   patch_embed_rY   rB   rO   s              r    r.   zPvtPatchEmbeddings.forwardr   sM   2>2D2D/
L&%4,,,w  ool3'--FE!))!,66q!<__[1
>>%--j"bAIIz#:BJ"&"?"?@X@XYZ\]\^Y^@_agin"o"'))T-E-Ea!e-LNa,bhi"j"&"?"?@X@XZ`bg"h\\*/B"BC
65((r"   F)r1   r2   r3   r4   r   intr   boolr(   r   r6   ri   tupler.   r8   r9   s   @r    r;   r;   D   s      @@ (3-'@ (3-'	@
 @ @ @ @8	'5<< 	' 	'UX 	']b]i]i 	')ELL )U5<<c;Q5R )r"   r;   c                   `     e Zd Zdedef fdZdej                  dej                  fdZ xZ	S )PvtSelfOutputr<   rA   c                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y r&   )r'   r(   r   LineardenserV   rW   rX   )r)   r<   rA   r*   s      r    r(   zPvtSelfOutput.__init__   s6    YY{K8
zz&"<"<=r"   r+   r   c                 J    | j                  |      }| j                  |      }|S r&   )r~   rX   r-   s     r    r.   zPvtSelfOutput.forward   s$    

=1]3r"   
r1   r2   r3   r   rw   r(   r   r6   r.   r8   r9   s   @r    r{   r{      s1    >y >s >
U\\ ell r"   r{   c                        e Zd ZdZdedededef fdZdedej                  fd	Z
	 ddej                  d
edededeej                     f
dZ xZS )PvtEfficientSelfAttentionzxEfficient self-attention mechanism with reduction of the sequence [PvT paper](https://huggingface.co/papers/2102.12122).r<   rA   num_attention_headssequences_reduction_ratioc                    t         |           || _        || _        | j                  | j                  z  dk7  r&t	        d| j                   d| j                   d      t        | j                  | j                  z        | _        | j                  | j                  z  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  |j                        | _        || _        |dkD  rEt        j$                  ||||      | _        t        j(                  ||j*                        | _        y y )	Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ())biasr   rD   rF   )r'   r(   rA   r   rn   rw   attention_head_sizeall_head_sizer   r}   qkv_biasquerykeyvaluerV   attention_probs_dropout_probrX   r   rQ   sequence_reductionrS   rT   rU   r)   r<   rA   r   r   r*   s        r    r(   z"PvtEfficientSelfAttention.__init__   sr    	&#6 d666!;#D$4$4#5 622316 
 $'t'7'7$:R:R'R#S !558P8PPYYt//1C1C&//Z
99T--t/A/AXYYt//1C1C&//Z
zz&"E"EF)B&$q(&(ii[6OXq'D# !ll;F<Q<QRDO	 )r"   r+   r   c                     |j                         d d | j                  | j                  fz   }|j                  |      }|j	                  dddd      S )Nr]   r   r^   r   r   )r`   r   r   viewre   )r)   r+   	new_shapes      r    transpose_for_scoresz.PvtEfficientSelfAttention.transpose_for_scores   sT    !&&("-1I1I4KcKc0dd	%**95$$Q1a00r"   rZ   r[   output_attentionsc                    | j                  | j                  |            }| j                  dkD  r{|j                  \  }}}|j	                  ddd      j                  ||||      }| j                  |      }|j                  ||d      j	                  ddd      }| j                  |      }| j                  | j                  |            }	| j                  | j                  |            }
t        j                  ||	j                  dd            }|t        j                  | j                        z  }t         j"                  j%                  |d      }| j'                  |      }t        j                  ||
      }|j	                  dddd      j)                         }|j+                         d d | j,                  fz   }|j/                  |      }|r||f}|S |f}|S )Nr   r   r^   r]   rl   r   )r   r   r   r   re   rd   r   rU   r   r   r   matmulrp   mathsqrtr   r   
functionalsoftmaxrX   
contiguousr`   r   r   )r)   r+   rZ   r[   r   query_layerrs   seq_lenr@   	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                   r    r.   z!PvtEfficientSelfAttention.forward   s    //

=0IJ))A-0=0C0C-J)11!Q:BB:|]cejkM 33MBM)11*lBOWWXY[\^_`M OOM:M--dhh}.EF	//

=0IJ !<<Y5H5HR5PQ+dii8P8P.QQ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r"   rv   )r1   r2   r3   r4   r   rw   r5   r(   r   r6   r   rx   ry   r.   r8   r9   s   @r    r   r      s     CSS.1SHKShmS:1# 1%,, 1 #(*||* * 	*
  * 
u||	*r"   r   c                   ~     e Zd Zdedededef fdZ	 ddej                  deded	e	d
e
ej                     f
dZ xZS )PvtAttentionr<   rA   r   r   c                 n    t         |           t        ||||      | _        t	        ||      | _        y )N)rA   r   r   )rA   )r'   r(   r   r)   r{   r   r   s        r    r(   zPvtAttention.__init__   s8     	-# 3&?	
	 $FDr"   r+   rZ   r[   r   r   c                 h    | j                  ||||      }| j                  |d         }|f|dd  z   }|S )Nr   r   )r)   r   )r)   r+   rZ   r[   r   self_outputsattention_outputr   s           r    r.   zPvtAttention.forward   sE     yy?PQ;;|A7#%QR(88r"   rv   )r1   r2   r3   r   rw   r5   r(   r   r6   rx   ry   r.   r8   r9   s   @r    r   r      sn    
E
E.1
EHK
Ehm
E _d"\\36?BW[	u||	r"   r   c            
       z     e Zd Z	 	 d
dedededz  dedz  f fdZdej                  dej                  fd	Z xZ	S )PvtFFNNr<   in_featureshidden_featuresout_featuresc                 j   t         |           ||n|}t        j                  ||      | _        t        |j                  t              rt        |j                     | _	        n|j                  | _	        t        j                  ||      | _
        t        j                  |j                        | _        y r&   )r'   r(   r   r}   dense1rI   
hidden_actr7   r   intermediate_act_fndense2rV   rW   rX   )r)   r<   r   r   r   r*   s        r    r(   zPvtFFN.__init__   s     	'3'?|[ii_=f''-'-f.?.?'@D$'-'8'8D$ii>zz&"<"<=r"   r+   r   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r&   )r   r   rX   r   r-   s     r    r.   zPvtFFN.forward  sP    M200?]3M2]3r"   )NNr   r9   s   @r    r   r      sY    
 '+#'>> > t	>
 Dj>"U\\ ell r"   r   c                   f     e Zd Zdedededededef fdZddej                  d	ed
ede	fdZ
 xZS )PvtLayerr<   rA   r   r!   r   	mlp_ratioc                 v   t         |           t        j                  ||j                        | _        t        ||||      | _        |dkD  rt        |      nt        j                         | _
        t        j                  ||j                        | _        t        ||z        }t        |||      | _        y )NrF   )r<   rA   r   r   r   )r<   r   r   )r'   r(   r   rS   rT   layer_norm_1r   	attentionr$   Identityr!   layer_norm_2rw   r   mlp)	r)   r<   rA   r   r!   r   r   mlp_hidden_sizer*   s	           r    r(   zPvtLayer.__init__  s     	LL&:O:OP%# 3&?	
 4=s?Y/LL&:O:OPkI56[Rabr"   r+   rZ   r[   r   c                    | j                  | j                  |      |||      }|d   }|dd  }| j                  |      }||z   }| j                  | j	                  |            }| j                  |      }||z   }	|	f|z   }|S )N)r+   rZ   r[   r   r   r   )r   r   r!   r   r   )
r)   r+   rZ   r[   r   self_attention_outputsr   r   
mlp_outputlayer_outputs
             r    r.   zPvtLayer.forward/  s    !%++M:/	 "0 "
 2!4(,>>*:;(=8XXd//>?
^^J/
$z1/G+r"   rv   )r1   r2   r3   r   rw   r5   r(   r   r6   rx   r.   r8   r9   s   @r    r   r     so    cc c !	c
 c $)c c,U\\ 3 s _c r"   r   c                   t     e Zd Zdef fdZ	 	 	 d
dej                  dedz  dedz  dedz  dee	z  f
d	Z
 xZS )
PvtEncoderr<   c                    t         	|           || _        t        j                  d|j
                  t        |j                        d      j                         }g }t        |j                        D ]  }|j                  t        ||dk(  r|j                  n| j                  j                  d|dz   z  z  |j                  |   |j                  |   |dk(  r|j                   n|j"                  |dz
     |j"                  |   ||j                  dz
  k(                t%        j&                  |      | _        g }d}t        |j                        D ]  }g }|dk7  r||j                  |dz
     z  }t        |j                  |         D ]\  }|j                  t+        ||j"                  |   |j,                  |   |||z      |j.                  |   |j0                  |                ^ |j                  t%        j&                  |              t%        j&                  |      | _        t%        j4                  |j"                  d   |j6                  	      | _        y )
Nr   cpu)r   r^   r   )r<   r=   r>   r?   r@   rA   rB   )r<   rA   r   r!   r   r   r]   rF   )r'   r(   r<   r   linspacedrop_path_ratesumdepthstolistrangenum_encoder_blocksappendr;   r=   patch_sizesstridesr@   hidden_sizesr   
ModuleListpatch_embeddingsr   r   sequence_reduction_ratios
mlp_ratiosblockrS   rT   rU   )
r)   r<   drop_path_decaysrY   iblockscurlayersjr*   s
            r    r(   zPvtEncoder.__init__G  s-    !>>!V-B-BCDV_delln 
v001 	A"!45Fv00@V@V[\abefaf[g@h%11!4!>>!,89Q!4!4FDWDWXY\]X]D^ & 3 3A 66#<#<q#@@
	 !#j 9 v001 	1AFAvv}}QU++6==+, 
%$*$7$7$:,2,F,Fq,I"237";282R2RST2U"("3"3A"6	
 MM"--/0!	1$ ]]6*
 ,,v':':2'>FDYDYZr"   rj   r   Noutput_hidden_statesreturn_dictr   c                 2   |rdnd }|rdnd }|j                   d   }t        | j                        }|}	t        t	        | j
                  | j                              D ]|  \  }
\  }} ||	      \  }	}}|D ]&  } ||	|||      }|d   }	|r	||d   fz   }|s!||	fz   }( |
|dz
  k7  sI|	j                  |||d      j                  dddd      j                         }	~ | j                  |	      }	|r||	fz   }|st        d |	||fD              S t        |	||      S )	N r   r   r]   r   r^   c              3   &   K   | ]	  }||  y wr&   r   ).0vs     r    	<genexpr>z%PvtEncoder.forward.<locals>.<genexpr>  s     mq_`_lms   last_hidden_stater+   
attentions)r   lenr   	enumeratezipr   rd   re   r   rU   ry   r   )r)   rj   r   r   r   all_hidden_statesall_self_attentionsrs   
num_blocksr+   idxembedding_layerblock_layerrZ   r[   r   layer_outputss                    r    r.   zPvtEncoder.forwardy  si    #7BD$5b4!''*
_
$3<SAVAVX\XbXb=c3d 	v/C//;+:=+I(M65$ M %mVUDU V -a 0$*=qAQ@S*S''(9]<L(L%M j1n$ - 5 5j&%QS T \ \]^`acdfg h s s u	v 6 1]4D Dm]4EGZ$[mmm++*
 	
r"   )FFT)r1   r2   r3   r   r(   r   FloatTensorrx   ry   r   r.   r8   r9   s   @r    r   r   F  si    0[y 0[j */,1#'#
''#
  $;#
 #Tk	#

 D[#
 
	 #
r"   r   c                   t    e Zd ZU eed<   dZdZdZg Z e	j                         dej                  ddfd       Zy)	PvtPreTrainedModelr<   pvtrj   )imagemoduler   Nc                    | j                   j                  }t        |t        j                  t        j
                  f      rOt        j                  |j                  d|       |j                   t        j                  |j                         yyt        |t        j                        r?t        j                  |j                         t        j                  |j                         yt        |t              rRt        j                  |j                  d|       |j                  #t        j                  |j                  d|       yyy)zInitialize the weightsr   )meanstdN)r<   initializer_rangerI   r   r}   rQ   inittrunc_normal_weightr   zeros_rS   ones_r;   rO   rB   )r)   r   r   s      r    _init_weightsz PvtPreTrainedModel._init_weights  s     kk++fryy"))45v}}3C@{{&FKK( '-KK$JJv}}% 23v99M+""6#3#3#3G , 4r"   )r1   r2   r3   r   __annotations__base_model_prefixmain_input_nameinput_modalities_no_split_modulesr   no_gradr   Moduler  r   r"   r    r   r     sO    $O!U]]_HBII H$ H Hr"   r   c                   ~     e Zd Zdef fdZe	 	 	 d
dej                  dedz  dedz  dedz  de	e
z  f
d	       Z xZS )PvtModelr<   c                 r    t         |   |       || _        t        |      | _        | j                          y r&   )r'   r(   r<   r   encoder	post_initr)   r<   r*   s     r    r(   zPvtModel.__init__  s1      "&) 	r"   Nrj   r   r   r   r   c                 ,   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||||      }|d   }|s	|f|dd  z   S t        ||j                  |j                        S )Nrj   r   r   r   r   r   r   )r<   r   r   use_return_dictr  r   r+   r   )r)   rj   r   r   r   kwargsencoder_outputssequence_outputs           r    r.   zPvtModel.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B],,%/!5#	 ' 
 *!,#%(;;;-)77&11
 	
r"   )NNN)r1   r2   r3   r   r(   r   r   r   rx   ry   r   r.   r8   r9   s   @r    r  r    su    y   *.,0#'
''
  $;
 #Tk	

 D[
 
	 
 
r"   r  z
    Pvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
    the [CLS] token) e.g. for ImageNet.
    )custom_introc                        e Zd Zdeddf fdZe	 	 	 	 ddej                  dz  dej                  dz  dedz  dedz  d	edz  de	e
z  fd
       Z xZS )PvtForImageClassificationr<   r   Nc                 0   t         |   |       |j                  | _        t        |      | _        |j                  dkD  r-t        j                  |j                  d   |j                        nt        j                         | _	        | j                          y )Nr   r]   )r'   r(   
num_labelsr  r   r   r}   r   r   
classifierr  r  s     r    r(   z"PvtForImageClassification.__init__  sy      ++F# FLEVEVYZEZBIIf))"-v/@/@A`b`k`k`m 	
 	r"   rj   labelsr   r   r   c                 R   ||n| j                   j                  }| j                  ||||      }|d   }| j                  |dddddf         }	d}
|| j	                  ||	| j                         }
|s|	f|dd z   }|
|
f|z   S |S t        |
|	|j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   )losslogitsr+   r   )r<   r  r   r  loss_functionr	   r+   r   )r)   rj   r  r   r   r   r  r   r  r!  r   r   s               r    r.   z!PvtForImageClassification.forward  s      &1%<k$++B]B]((%/!5#	  
 "!*Aq!9:%%ffdkkBDY,F)-)9TGf$EvE$!//))	
 	
r"   )NNNN)r1   r2   r3   r   r(   r   r   r6   rx   ry   r	   r.   r8   r9   s   @r    r  r    s    y T   '+)-,0#')
llT))
 t#)
  $;	)

 #Tk)
 D[)
 
&	&)
 )
r"   r  )r  r  r   )r   F)-r4   rJ   r   collections.abcr   r   torch.nn.functionalr   r   rf    r   r   activationsr   modeling_outputsr   r	   modeling_utilsr
   utilsr   r   configuration_pvtr   
get_loggerr1   loggerr6   r5   rx   r!   r  r$   r;   r{   r   r   r   r   r   r   r  r  __all__r   r"   r    <module>r.     se       $     & ! F - , ( 
		H	%U\\ e T V[VbVb  %")) %A) A)H	BII 	O		 Od299 .RYY 6+ryy +\V
 V
r H H H0 )
! )
 )
X 9
 2 9
9
x Jr"   