
    qi                        d dl mZ d dlmZ d dlZd dlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZmZ dd
lmZmZ ddlmZmZmZmZ ddlmZ  ej4                  e      Ze ed       G d de                    Ze ed       G d de                    Z G d dej>                        Z  G d dej>                        Z!	 dJdej>                  dejD                  dejD                  dejD                  dejD                  dz  de#de#fd Z$d! Z% G d" d#ej>                        Z&dKd$ejD                  d%e#d&e'd'ejD                  fd(Z( G d) d*ej>                        Z) G d+ d,ej>                        Z* G d- d.e      Z+ G d/ d0ej>                        Z,d1ejD                  d2e-ejD                     d'ejD                  fd3Z. G d4 d5ej>                        Z/ G d6 d7ej>                        Z0 G d8 d9ej>                        Z1 G d: d;ej>                        Z2 G d< d=e      Z3 G d> d?e      Z4 G d@ dAej>                        Z5e G dB dCe             Z6e G dD dEe6             Z7 edF       G dG dHe6             Z8g dIZ9y)L    )Callable)	dataclassN)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)ModelOutputauto_docstringcan_return_tuplelogging   )VJEPA2ConfigzO
    VJEPA Predictor outputs that also contains the masked encoder outputs
    )custom_introc                       e Zd ZU dZej
                  ed<   dZej
                  dz  ed<   dZe	ej
                  df   dz  ed<   dZ
e	ej
                  df   dz  ed<   dZej
                  dz  ed<   y)	$VJEPA2WithMaskedInputPredictorOutputa  
    masked_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `context_mask` is provided which is applied on VJEPA2Encoder outputs):
        The masked hidden state of the model.
    target_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `target_mask` is provided which is applied on VJEPA2Encoder outputs):
        The target hidden state of the model.
    last_hidden_stateNmasked_hidden_state.hidden_states
attentionstarget_hidden_state)__name__
__module____qualname____doc__torchFloatTensor__annotations__r   r   tupler   r        \/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/vjepa2/modeling_vjepa2.pyr   r       s     (((48**T18:>M5**C/047>7;Je'',-4;48**T18r%   r   zs
    VJEPA outputs that also contains the masked encoder outputs
    Optionally contains the predictor outputs
    c                        e Zd ZU dZej
                  ed<   dZej
                  dz  ed<   dZe	ej
                  df   dz  ed<   dZ
e	ej
                  df   dz  ed<   dZedz  ed<    fd	Z xZS )
 VJEPA2WithMaskedInputModelOutputaq  
    masked_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*, returned when `context_mask` is provided which is applied on VJEPA2Encoder outputs):
        The masked hidden state of the model.
    predictor_output (`VJEPA2WithMaskedInputPredictorOutput`, *optional*):
        The output from the Predictor module.
    r   Nr   .r   r   predictor_outputc                     t        t        | 	               }t        |d   t              r|d   j                         |d<   t        |      S )N)listsuperto_tuple
isinstancer   r#   )selfoutput	__class__s     r&   r.   z)VJEPA2WithMaskedInputModelOutput.to_tupleJ   sD    eg&()fRj"FG,,.F2JV}r%   )r   r   r   r   r    r!   r"   r   r   r#   r   r)   r   r.   __classcell__r2   s   @r&   r(   r(   5   s     (((48**T18:>M5**C/047>7;Je'',-4;DH:TAH r%   r(   c                   x     e Zd ZdZ	 d	dedef fdZed        Zde	j                  de	j                  fdZ xZS )
VJEPA2PatchEmbeddings3Dz"
    Image to Patch Embedding
    confighidden_sizec                 H   t         |           |j                  | _        |j                  | _        || _        t        j                  |j                  ||j                  |j                  |j                  f|j                  |j                  |j                  f      | _        y )N)in_channelsout_channelskernel_sizestride)	r-   __init__
patch_sizetubelet_sizer8   r   Conv3din_chansprojr0   r7   r8   r2   s      r&   r>   z VJEPA2PatchEmbeddings3D.__init__V   s    
 	 ++"//&II$,,f.?.?ARARS''):):F<M<MN	
	r%   c                     | j                   | j                  z  | j                  | j                  z  z  | j                  | j                  z  z  S Nframes_per_clipr@   	crop_sizer?   r7   s    r&   num_patchesz#VJEPA2PatchEmbeddings3D.num_patchesg   sO     ##v':'::6#4#4466#4#446	
r%   pixel_values_videosreturnc                 f    | j                  |      j                  d      j                  dd      }|S )N   r   )rC   flatten	transpose)r0   rL   xs      r&   forwardzVJEPA2PatchEmbeddings3D.forwardo   s.    II)*2215??1Er%      )r   r   r   r   r   intr>   staticmethodrK   r    TensorrS   r3   r4   s   @r&   r6   r6   Q   sS      

 
" 
 
5<< ELL r%   r6   c                   f     e Zd ZdZddedef fdZdej                  dej                  fdZ	 xZ
S )	VJEPA2Embeddings>
    Construct mask token, position and patch embeddings.
    r7   r8   c                     t         |           || _        || _        t	        ||      | _        | j
                  j                  | _        |j                  | _        y )Nr8   )r-   r>   r7   r8   r6   patch_embeddingsrK   r?   rD   s      r&   r>   zVJEPA2Embeddings.__init__y   sM    & 7K X00<< ++r%   rL   rM   c                 l   |j                   d   }|j                  ddddd      }|| j                  j                  k  r)|j	                  dd| j                  j                  dd      }| j
                  j                  j                  j                  }|j                  |      }| j                  |      }|S )Nr   r   rO   r      )dtype)
shapepermuter7   r@   repeatr^   rC   weightra   to)r0   rL   
num_framestarget_dtype
embeddingss        r&   rS   zVJEPA2Embeddings.forward   s    (..q1
 299!Q1aH 000"5"<"<Q4;;C[C[]^`a"b,,1188>>144<4H**+>?
r%   rT   )r   r   r   r   r   rV   r>   r    rX   rS   r3   r4   s   @r&   rZ   rZ   t   s6    ,| ,# ,5<< ELL r%   rZ   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr+   )dimra   )ptrainingr   rO   )r    matmulrQ   r   
functionalsoftmaxfloat32rf   ra   rp   ru   
contiguous)
rj   rk   rl   rm   rn   ro   rp   kwargsattn_weightsattn_outputs
             r&   eager_attention_forwardr~      s     <<s}}R'<=GL ==((2U]](SVVW\WbWbcL ==((6??([L,,|U3K''1-88:K$$r%   c                    | j                         \  }}}}t        j                  |dz  | j                  | j                        }||dz  z  }dd|z  z  }|j                  d      |z  }|j                         }|j                         }	|j                  dddd      }|	j                  dddd      }	| j                  dd      }
|
j                  d	      \  }}t        j                  | |fd	      }
|
j                  d
      }
| |	z  |
|z  z   S )NrO   ra   deviceg       @g      ?i'  r+   r   )r+   rO   rs   rr   )sizer    arangera   r   	unsqueezesincosrd   	unflattenunbindstackrP   )rR   posB	num_headsNDomegafreqemb_sinemb_cosyy1y2s                r&   rotate_queries_or_keysr      s   Ay!Q
 LLaqwwqxx@E	QWE%,E==u$D hhjGhhjGnnQ1a(GnnQ1a(G 	
B AXX"XFBbS"I2&A			"AKAK((r%   c            	            e Zd Z	 	 ddededef fdZd Zd ZddZd	 Z		 	 dd
e
j                  dz  dedee
j                  e
j                  f   ee
j                     z  fdZ xZS )VJEPA2RopeAttentionr7   r8   num_attention_headsc                 z   t         |           || _        || _        || _        ||z  dk7  rt        d|f d| d      t        ||z        | _        | j                  | j                  z  | _        t        j                  || j                  |j                        | _        t        j                  || j                  |j                        | _        t        j                  || j                  |j                        | _        t        j                  ||      | _        |j                   | _        t        j$                  | j"                        | _        | j                  j(                  | j                  j*                  z  | _        | j                  j.                  | j                  j0                  z  | _        t        d| j                  dz  dz  z        | _        t        d| j                  dz  dz  z        | _        t        d| j                  dz  dz  z        | _        | j                  dz  | _        d	| _        y )
Nr   zThe hidden size z4 is not a multiple of the number of attention heads .biasrO   r         F)r-   r>   r7   r8   r   
ValueErrorrV   attention_head_sizeall_head_sizer   Linearqkv_biasrk   rl   rm   rC   attention_probs_dropout_probdropout_probDropoutrp   rI   r?   	grid_sizerH   r@   
grid_depthd_dimh_dimw_dimro   	is_causal)r0   r7   r8   r   r2   s       r&   r>   zVJEPA2RopeAttention.__init__   s    	&#6 ,,1"K>"2 3,-Q0 
 $'{5H'H#I !558P8PPYY{D,>,>V__U
99[$*<*<6??SYY{D,>,>V__U
IIk;7	"??zz$"3"34..$++2H2HH++559Q9QQt771<BCD
t771<BCD
t771<BCD
//5r%   c                 P    t        | j                  | j                  z        }||z  S rF   )rV   r   )r0   idstokens_per_frames      r&   _get_frame_posz"VJEPA2RopeAttention._get_frame_pos   s&    t~~>?&&&r%   c                     t        | j                  | j                  z        }| j                  |      }|||z  z
  }| j                  }||z  S rF   )rV   r   r   )r0   r   r   	frame_idstokens_per_rows        r&   _get_height_posz#VJEPA2RopeAttention._get_height_pos   sN    t~~>?'',	$y00n$$r%   Nc                    |j                   }|j                  d      }|-|j                  d      j                  d| j                  d      }nt        j                  ||      }t        | j                  | j                  z        }| j                  |      }| j                  }| j                  |      }	|||z  z
  ||	z  z
  }
||	|
fS )Nr   r   )r   r   r   rd   r   r    r   rV   r   r   r   )r0   rR   masksr   
token_sizer   r   r   r   
height_ids	width_idss              r&   get_position_idsz$VJEPA2RopeAttention.get_position_ids   s    VVAY
 //!$++At/G/GKC,,z&9Ct~~>?'',	))#.
 +i77>J;VV	*i//r%   c                    |\  }}}d}t        |d||| j                  z   f   |      }|| j                  z  }t        |d||| j                  z   f   |      }|| j                  z  }t        |d||| j                  z   f   |      }	|| j                  z  }|| j                  k  r&|d|d f   }
t        j                  |||	|
gd      }|S t        j                  |||	gd      }|S )Nr   .)r   r+   r   )r   r   r   r   r   r    cat)r0   qkpos_idsd_maskh_maskw_masksqkdqkhqkwqkrs              r&   apply_rotary_embeddingsz+VJEPA2RopeAttention.apply_rotary_embeddings  s    !($RQTZZ-?(?%@fM	TZZ$RQTZZ-?(?%@fM	TZZ$RQTZZ-?(?%@fM	TZZt'''S!"W+CCc3/R8B 	 Cc?3B	r%   position_maskoutput_attentionsrM   c           
         |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }	| j                  ||      }
| j                  ||
      }| j                  ||
      }t        j                  | j                  j                  t              } || |||	d | j                  | j                   | j"                  sdn| j$                        \  }}|j'                         d d | j(                  fz   }| j+                  |j-                  |            }|r||f}|S |f}|S )Nr+   r   rO   )r           r   ro   rp   rr   )rb   rk   viewr   r   rQ   rl   rm   r   r   r   get_interfacer7   _attn_implementationr~   r   ro   ru   r   r   r   rC   reshape)r0   r   r   r   
batch_size
seq_length_query_layer	key_layervalue_layerr   attention_interfacecontext_layerattention_probsnew_context_layer_shapeoutputss                   r&   rS   zVJEPA2RopeAttention.forward%  s    %2$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 '']'K00GD	22;H(?(M(MKK,,.E)
 *=nnLL#}}C$2C2C	*
& #0"4"4"6s";t?Q?Q>S"S		-"7"78O"PQ6G=/2 O\M]r%   )rU      rF   NF)r   r   r   r   rV   r>   r   r   r   r   r    rX   boolr#   rS   r3   r4   s   @r&   r   r      s      #%	## # !	#J'%0*( .2"'	/ ||d*/  	/
 
u||U\\)	*U5<<-@	@/r%   r   input	drop_probru   rM   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )zc
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    r   r   r   r   r   )rb   ndimr    randra   r   floor_div)r   r   ru   	keep_probrb   random_tensorr1   s          r&   	drop_pathr   X  s    
 CxII[[^

Q 77E

5ELL YYMYYy!M1FMr%   c                   t     e Zd ZdZd	dedz  f fdZdej                  dej                  fdZde	fdZ
 xZS )
VJEPA2DropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   c                 0    t         |           || _        y rF   )r-   r>   r   )r0   r   r2   s     r&   r>   zVJEPA2DropPath.__init__k  s    "r%   r   rM   c                 D    t        || j                  | j                        S rF   )r   r   ru   )r0   r   s     r&   rS   zVJEPA2DropPath.forwardo  s    FFr%   c                      d| j                    S )Nzp=)r   r0   s    r&   
extra_reprzVJEPA2DropPath.extra_reprr  s    DNN#$$r%   rF   )r   r   r   r   floatr>   r    rX   rS   strr   r3   r4   s   @r&   r   r   h  s@    b#%$, #GU\\ Gell G%C %r%   r   c                   f     e Zd Zddededef fdZdej                  dej                  fdZ	 xZ
S )		VJEPA2MLPr7   r8   	mlp_ratioc                     t         |           |x}}t        ||z        }t        j                  ||d      | _        t        |j                     | _        t        j                  ||d      | _	        y NTr   )
r-   r>   rV   r   r   fc1r   
hidden_act
activationfc2)r0   r7   r8   r   in_featuresout_featureshidden_featuresr2   s          r&   r>   zVJEPA2MLP.__init__w  sa    %00lkI5699[/E !2!2399_lFr%   hidden_staterM   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rF   )r   r   r   )r0   r  s     r&   rS   zVJEPA2MLP.forward  s2    xx-|4xx-r%   )rU         @)r   r   r   r   rV   r   r>   r    rX   rS   r3   r4   s   @r&   r   r   v  s=    G| G# GQV GELL U\\ r%   r   c                        e Zd ZdZ	 	 	 	 ddededededef
 fdZ	 	 dd	ej                  d
ej                  dz  de
deej                  df   fdZ xZS )VJEPA2LayerzCThis corresponds to the Block class in the original implementation.r7   drop_path_rater8   r   r   c                    t         |           || _        || _        || _        || _        t        j                  ||j                        | _	        t        |||      | _        |j                  dkD  rt        |      nt        j                         | _        t        j                  ||j                        | _        t#        |||      | _        y )Nepsr   )r8   r   )r-   r>   r7   r8   r   r   r   	LayerNormlayer_norm_epsnorm1r   	attentionr  r   Identityr   norm2r   mlp)r0   r7   r  r8   r   r   r2   s         r&   r>   zVJEPA2Layer.__init__  s     	&#6 "\\+63H3HI
,V[BUV;A;P;PSV;V7\^\g\g\i\\+63H3HI
V	Rr%   Nr   r   r   rM   .c                    |}| j                  |      }| j                  |||      }|d   }| j                  |      |z   }|}| j                  |      }| j	                  |      }| j                  |      |z   }|dd  }|f|z   }|S )N)r   r   r   r   )r  r  r   r  r  )r0   r   r   r   residualself_attention_outputsattention_outputr   s           r&   rS   zVJEPA2Layer.forward  s     !

=1!%'/ "0 "

 2!4'788C !

=1/}5@ ), "W,r%   )r   rU   r   r  r   )r   r   r   r   r   r   rV   r>   r    rX   r   r#   rS   r3   r4   s   @r&   r  r    s    M
 !$#%SS S 	S
 !S S. .2"'	|| ||d*  	
 
u||S 	!r%   r  c            
       h     e Zd Zdef fdZe	 	 	 d	dej                  dz  dedede	fd       Z
 xZS )
VJEPA2Encoderr7   c                 ^   t         |           || _        t        ||j                        | _        t        |j                        D cg c]2  }|j                  dkD  r|j                  |z  |j                  dz
  z  nd4 }}t        j                  t        |j                        D cg c]3  }t        |||   |j                  |j                  |j                        5 c}      | _        t        j                  |j                  |j                         | _        d| _        y c c}w c c}w )Nr]   r   r   r  r8   r   r   r  F)r-   r>   r7   rZ   r8   ri   rangenum_hidden_layersr  r   
ModuleListr  r   r   layerr
  r  	layernormgradient_checkpointingr0   r7   idrop_path_ratesr2   s       r&   r>   zVJEPA2Encoder.__init__  s   *6v?Q?QR 6334
 LRKcKcfgKgV""Q&&*B*BQ*FGmpp
 
 ]] v778	  #21#5 & 2 2(.(B(B$..	

 f&8&8f>S>ST&+##

	s   
7D%)8D*NrL   r   output_hidden_statesrM   c                 
   |rdnd }|rdnd }| j                  |      }t        | j                        D ](  \  }}	|r||fz   } |	|d |      }
|
d   }|s ||
d   fz   }* | j                  |      }|r||fz   }t	        |||      S )Nr$   r   r   r   r   r   )ri   	enumerater  r  r
   )r0   rL   r   r"  r{   all_hidden_statesall_self_attentionsr   r   layer_modulelayer_outputss              r&   rS   zVJEPA2Encoder.forward  s     #7BD$5b4(;<(4 	POA|#$58H$H!(>OPM)!,M &9]1=M<O&O#	P }5 1]4D D++*
 	
r%   )NFF)r   r   r   r   r>   r   r    rX   r   r
   rS   r3   r4   s   @r&   r  r    s]    ,| ,0  48"'%*	
"\\D0
  
 #	
 

 
r%   r  tensorr   c                    g }|D ]j  }|j                  | j                        }|j                  d      j                  dd| j	                  d            }|t        j                  | d|      gz  }l t        j                  |d      S )z
    Args:
        tensor (`torch.Tensor`):
            Tensor of shape [batch_size, num_patches, feature_dim]
        masks (`List[torch.Tensor]`):
            List of tensors of shape [batch_size, num_patches] containing indices of patches to keep
    r+   r   rs   indexr   r   )rf   r   r   rd   r   r    gatherr   )r*  r   all_masked_tensorsmask	mask_keeps        r&   apply_masksr2    s      Mwwv}}%NN2&--aFKKOD	u||FKLLM
 99'Q//r%   c                        e Zd ZdZdef fdZed        Z	 ddej                  de
ej                     de
ej                     ded	eej                  ej                  f   f
d
Z xZS )VJEPA2PredictorEmbeddingsr[   r7   c                    t         |           || _        t        j                  |j
                  |j                        | _        d| _        |j                  | _
        |j                  | _        t        j                  t        j                  | j                  dd|j                              | _        |j                   | _        || _        y )Nr   r   )r-   r>   r7   r   r   r8   pred_hidden_sizepredictor_embeddingsnum_mask_tokenspred_zero_init_mask_tokenszero_init_mask_tokenspred_num_mask_tokens	Parameterr    zerosmask_tokensr?   r0   r7   r2   s     r&   r>   z"VJEPA2PredictorEmbeddings.__init__  s    $&IIf.@.@&BYBY$Z! %+%F%F"%::<<D4H4H!QPVPgPg(hi ++r%   c                     | j                   dkD  rM| j                   | j                  z  | j                  | j                  z  z  | j                  | j                  z  z  S | j                  | j                  z  | j                  | j                  z  z  S Nr   rG   rJ   s    r&   rK   z%VJEPA2PredictorEmbeddings.num_patches  s    !!A%''6+>+>>##v'8'88:##v'8'88: $$(9(99f>N>NRXRcRc>cddr%   r   context_masktarget_mask
mask_indexrM   c                    |j                  d      }| j                  |      }|| j                  z  }| j                  |   }|d   j	                         dz   }|j                  ||d      }t        ||      }|j                  t        |      dd      }t        j                  ||gd      }	t        j                  |d      }
t        j                  |d      }t        j                  |
|gd      }|	|fS )z
        hidden_states : encoder outputs (context)
        context_mask: tokens of the context (outputs from the encoder)
        target_mask: tokens to predict
        mask_index: index of the target mask to choose (useful for multiclip?)
        r   r   r   )
r   r7  r8  r>  maxrd   r2  lenr    r   )r0   r   rB  rC  rD  r   contexttargetmax_patch_numri   cmtmr   s                r&   rS   z!VJEPA2PredictorEmbeddings.forward%  s     q!++M:  $"6"66
!!*- $A**,q0q-3V[1 ..\!2Aq9YY0a8
 YY|+YY{*		2r(*5  r%   r   )r   r   r   r   r   r>   rW   rK   r    rX   r,   rV   r#   rS   r3   r4   s   @r&   r4  r4    s    |  e e &!||&! 5<<(&! %,,'	&!
 &! 
u||U\\)	*&!r%   r4  c                        e Zd Zdef fdZd Zd Ze	 	 ddej                  de
ej                     de
ej                     ded	ed
efd       Z xZS )VJEPA2Predictorr7   c                    t         |           || _        d| _        t	        |      | _        t        |j                        D cg c]2  }|j                  dkD  r|j                  |z  |j                  dz
  z  nd4 }}t        j                  t        |j                        D cg c]3  }t        |||   |j                  |j                  |j                        5 c}      | _        t        j                   |j                  |j"                        | _        t        j&                  |j                  |j(                  d      | _        y c c}w c c}w )NFr   r   r  r  Tr   )r-   r>   r7   r  r4  ri   r  pred_num_hidden_layersr  r   r  r  r6  pred_num_attention_headspred_mlp_ratior  r
  r  r  r   r8   rC   r  s       r&   r>   zVJEPA2Predictor.__init__O  s5   &+#3F; 6889
  0014 %%)V-J-JQ-NO
 
 ]] v<<=	  #21#5 & 7 7(.(G(G$33	

 f&=&=6CXCXYIIf55v7I7IPTU	+
	s   7E
$8Ec                 8   |j                  |j                        }t        j                  |d|      }|j                  |j                        }|j	                  d      j                  dd|j                  d            }t        j                  |d|      }||fS )Nr   r,  r+   )rf   r   r    r.  r   expandr   )r0   r   position_masksargsorthidden_states_argsorts        r&   sort_tokenszVJEPA2Predictor.sort_tokensk  s    **^223n!7K **]112 ' 1 1" 5 < <R]EWEWXZE[ \]AVWn,,r%   c                     |j                  |j                        }t        j                  |d      }|j	                  d      j                  dd|j                  d            }t        j                  |d|      }|S )Nr   r   r+   r,  )rf   r   r    rV  r   rT  r   r.  )r0   r   rV  reverse_argsorts       r&   unsort_tokenszVJEPA2Predictor.unsort_tokensw  si    **]112--Q7)33B7>>r2}GYGYZ\G]^]Qr%   encoder_hidden_statesrB  rC  r   r"  rM   c                    |rdnd }|rdnd }t        ||      }|j                  \  }	}
}| j                  |||      \  }}t        j                  |d      }| j                  |||      \  }}t        | j                        D ](  \  }}|r||fz   } ||||      }|d   }|s ||d   fz   }* |r||fz   }| j                  |      }| j                  ||      }|d d |
d f   }| j                  |      }t        |||      S )Nr$   r   r   r   r$  )r2  rb   ri   r    rV  rX  r%  r  r  r[  rC   r
   )r0   r\  rB  rC  r   r"  r{   r&  r'  r   N_ctxtr   r   rU  rV  r   r(  r)  s                     r&   rS   zVJEPA2Predictor.forward~  sC    #7BD$5b4 !,,A< P,2261(,8M|]h(i%~ --A6(,(8(8X_(`%~(4 	POA|#$58H$H!(HYZM)!,M &9]1=M<O&O#	P   1]4D D}5**='B%aj1		-0++*
 	
r%   )FF)r   r   r   r   r>   rX  r[  r   r    rX   r,   r   r
   rS   r3   r4   s   @r&   rN  rN  N  s    V| V8
-  #(%*.
$||.
 5<<(.
 %,,'	.

  .
 #.
 
.
 .
r%   rN  c                        e Zd ZdZdef fdZ	 	 d
dej                  dej                  dz  dedz  de	ej                  ej                  dz  f   fd	Z
 xZS )VJEPA2PoolerSelfAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr7   c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   F)r-   r>   r7   r8   	embed_dimr   r   head_dimr   scaleattention_dropoutrp   r   r   r   k_projv_projq_projout_projr?  s     r&   r>   z"VJEPA2PoolerSelfAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar%   Nr   rn   r   rM   c           
      &   |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|| j                  | j                  | j                  sdn| j                         \  }}|j#                  |||      j%                         }| j'                  |      }|sd}||fS z#Input shape: Batch x Time x Channelr   rO   r   r   N)rb   ri  rg  rh  r   r   rd  rQ   r   r   r7   r   r~   r   re  ru   rp   r   rz   rj  )r0   r   rn   r   r   r   rc  querieskeysvaluesr   r}   r|   s                r&   rS   z!VJEPA2PoolerSelfAttention.forward  s_    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0 LL((r%   r   r   r   r   r   r   r>   r    rX   r   r#   rS   r3   r4   s   @r&   r`  r`    so    GB| B. /3).	')||') t+')  $;	')
 
u||U\\D00	1')r%   r`  c                        e Zd ZdZdef fdZ	 	 ddej                  dej                  dej                  dej                  dz  d	edz  d
e	ej                  ej                  dz  f   fdZ
 xZS )VJEPA2PoolerCrossAttentionz_It's different from other cross-attention layers, doesn't have output projection layer (o_proj)r7   c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y rb  )r-   r>   r7   r8   rc  r   r   rd  r   re  rf  rp   r   r   r   rg  rh  ri  r?  s     r&   r>   z#VJEPA2PoolerCrossAttention.__init__  s    ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?r%   Nrm  rn  ro  rn   r   rM   c           
      "   |j                   \  }}}|j                   d   }	| j                  |      }| j                  |      }| j                  |      }|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||	| j
                  | j                        j                  dd      }|j	                  ||	| j
                  | j                        j                  dd      }t        j                  | j                  j                  t              }
 |
| ||||| j                  | j                  | j                  sdn| j                         \  }}|j#                  |||      j%                         }|sd}||fS rl  )rb   ri  rg  rh  r   r   rd  rQ   r   r   r7   r   r~   r   re  ru   rp   r   rz   )r0   rm  rn  ro  rn   r   r   q_seq_lengthrc  kv_seq_lengthr   r}   r|   s                r&   rS   z"VJEPA2PoolerCrossAttention.forward	  s\    /6mm+
L)

1++g&{{4 V$,,z<Waabcefgyy]DNNDMMR\\]^`abZV``abdef(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
!\ "))*lINYY[ LL((r%   r   rp  r4   s   @r&   rr  rr    s    i@| @0 /3).)))) ll)) 	))
 t+))  $;)) 
u||U\\D00	1))r%   rr  c                        e Zd Zdef fdZ	 d
dej                  dej                  dedz  deej                  df   fd	Z	 xZ
S )VJEPA2PoolerSelfAttentionLayerr7   c                 :   t         |           t        j                  |j                  |j
                        | _        t        |      | _        t        j                  |j                  |j
                        | _	        t        ||j                        | _        y Nr  r]   )r-   r>   r   r
  r8   r  layer_norm1r`  	self_attnlayer_norm2r   r  r?  s     r&   r>   z'VJEPA2PoolerSelfAttentionLayer.__init__7  sl    <<(:(:@U@UV26:<<(:(:@U@UVV1C1CDr%   r   rn   r   NrM   .c                     |}| j                  |      }| j                  |||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )a=  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   rn   r   )r{  r|  r}  r  )r0   r   rn   r   r  r|   r   s          r&   rS   z&VJEPA2PoolerSelfAttentionLayer.forward>  s      !((7&*nn')/ '5 '
#|
 !=0 ((7/ =0 "&Gr%   )Fr   r   r   r   r>   r    rX   r   r#   rS   r3   r4   s   @r&   rx  rx  6  s[    E| E */	#||# #  $;	#
 
u||S 	!#r%   rx  c                        e Zd Zdef fdZ	 	 ddej                  dej                  dej                  dz  dedeej                  d	f   f
d
Z	 xZ
S )VJEPA2PoolerCrossAttentionLayerr7   c                 :   t         |           t        j                  |j                  |j
                        | _        t        |      | _        t        j                  |j                  |j
                        | _	        t        ||j                        | _        y rz  )r-   r>   r   r
  r8   r  r{  rr  
cross_attnr}  r   r  r?  s     r&   r>   z(VJEPA2PoolerCrossAttentionLayer.__init__e  sl    <<(:(:@U@UV4V<<<(:(:@U@UVV1C1CDr%   Nrm  r  rn   r   rM   .c                     |}| j                  |      }| j                  |||||      ^}}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r|t	        |      z  }|S )N)rn   r   )r{  r  r}  r  r#   )r0   rm  r  rn   r   r  r|   r   s           r&   rS   z'VJEPA2PoolerCrossAttentionLayer.forwardl  s     ''5&*oo)/ '6 '
#|  ,.  ''5xx-,./u\**Gr%   r   r  r4   s   @r&   r  r  d  sm    E| E /3"' ll t+	
   
u||S 	!r%   r  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )VJEPA2AttentivePoolerzAttentive Poolerr7   c                 F   t         |           t        j                  t	        j
                  dd|j                              | _        t        |      | _	        t        j                  t        |j                        D cg c]  }t        |       c}      | _        y c c}w rA  )r-   r>   r   r<  r    r=  r8   query_tokensr  cross_attention_layerr  r  num_pooler_layersrx  self_attention_layers)r0   r7   r   r2   s      r&   r>   zVJEPA2AttentivePooler.__init__  sr    LLQ6;M;M)NO%DV%L"%']]=B6C[C[=\]+F3]&
"]s   ?Br  rM   c                     | j                   D ]  } ||d       d   } | j                  j                  |j                  d   dd      }| j	                  ||      d   }|j                  d      S )N)rn   r   r   )r  r  rd   rb   r  squeeze)r0   r  r  rm  s       r&   rS   zVJEPA2AttentivePooler.forward  sw    // 	GE dCAFL	G##**<+=+=a+@!QG11'<HK##A&&r%   )
r   r   r   r   r   r>   r    rX   rS   r3   r4   s   @r&   r  r    s-    
| 
'ELL 'U\\ 'r%   r  c                   f    e Zd ZU eed<   dZdZdZdZg dZ	dZ
dZ ej                         d        Zy)	VJEPA2PreTrainedModelr7   vjepa2rL   videoT)r  rx  r  r4  c                    | j                   j                  }t        |t              rt	        j
                  |j                  |       t        |j                  d      D ]w  \  }}||dz  z  }t	        j
                  |j                  j                  j                  |       t	        j
                  |j                  j                  j                  |       y |t        |j                        dz   dz  z  }t	        j
                  |j                  j                  j                  j                  |       yt        |t               rN|j"                  r t	        j$                  |j&                         yt	        j
                  |j&                  |       yt        |t(        j*                  t(        j,                  t(        j.                  f      rNt	        j
                  |j                  |       |j0                   t	        j$                  |j0                         yyt        |t(        j2                        r?t	        j$                  |j0                         t	        j4                  |j                         yy)zInitialize the weights)stdr   g      ?N)r7   initializer_ranger/   r  inittrunc_normal_r  r%  r  r|  rj  re   r  r   rG  r  r4  r:  zeros_r>  r   r   Conv2drA   r   r
  ones_)r0   rj   init_stdr   r  r  s         r&   _init_weightsz#VJEPA2PreTrainedModel._init_weights  s    ;;00f34v22A%f&B&BAF B5!S&)""5??#;#;#B#BL""599==#7#7SAB c&">">?!CKKCv;;??CCJJPST 9:++F../""6#5#58DBIIryy ABv}}(;{{&FKK( '-KK$JJv}}% .r%   N)r   r   r   r   r"   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attnr    no_gradr  r$   r%   r&   r  r    sN     +O&*# NU]]_& &r%   r  c                        e Zd Zdef fdZdefdZee	 	 	 	 	 dde	j                  dee	j                     dz  dee	j                     dz  d	ed
edz  dedz  defd              Zde	j                  fdZ xZS )VJEPA2Modelr7   c                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y rF   )r-   r>   r7   r  encoderrN  	predictor	post_initr?  s     r&   r>   zVJEPA2Model.__init__  s;     $V,(0 	r%   rM   c                 B    | j                   j                  j                  S rF   )r  ri   r^   r   s    r&   get_input_embeddingsz VJEPA2Model.get_input_embeddings  s    ||&&777r%   NrL   rB  rC  skip_predictorr   r"  c                    ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |||      }|j
                  }	|||j                  d      }
|	j                  d      }t        j                  ||j                        j                  d      j                  |
df      g}t        j                  ||j                        j                  d      j                  |
df      g}|sN| j                  |	||||      }t        |j
                  t        |	|      |j                  |j                         }nd}t#        |	t        |	|      |j                  |j                   |	      }|S )
az  
        context_mask (`torch.Tensor` with shape `[batch_size, patch_size, 1]`, *optional*):
            The mask position ids indicating which encoder output patches are going to be exposed to the predictor.
            By default, this mask is created as torch.arange(N).unsqueeze(0).repeat(B,1), indicating full context
            available to the predictor.
        target_mask (`torch.Tensor` with shape `[batch_size, patch_size, 1]`, *optional*):
            The mask position ids indicating which encoder output patches are going to be used as a prediction target
            for the predictor. By default, this mask is created as torch.arange(N).unsqueeze(0).repeat(B,1), indicating
            that the predictor should predict all encoder patches.
        skip_predictor (bool):
            flag to skip the predictor forward, useful if you just need the encoder outputs
        Nz'You have to specify pixel_values_videos)rL   r   r"  r   r   r   )r\  rB  rC  r   r"  )r   r   r   r   )r   r   r   r   r)   )r7   r   r"  r   r  r   r   r    r   r   r   rd   r  r   r2  r   r   r(   )r0   rL   rB  rC  r  r   r"  r{   encoder_outputssequence_outputr   r   predictor_outputsr)   encoder_outputs                  r&   rS   zVJEPA2Model.forward  s   0 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &FGG+/<< 3/!5 ,8 ,

 *;;K$7#((+A$$Q'A!LL3F3M3MNXXYZ[bbdeghcijkL <<2E2L2LMWWXYZaacdfgbhijK15&5)'"3%9 2@ 2  D"3"E"E$/$M/==,77	   $9- +O\ J)77&11-
 r%   c                 @    | j                  |d      }|j                  S )NT)r  )rS   r   )r0   rL   r  s      r&   get_vision_featureszVJEPA2Model.get_vision_features!  s!    &9$O///r%   )NNFNN)r   r   r   r   r>   r6   r  r   r   r    rX   r,   r   r(   rS   r  r3   r4   s   @r&   r  r    s    | 8&= 8  3715$)-,0D"\\D 5<<(4/D %,,'$.	D
 D  $;D #TkD 
*D  DL0%,, 0r%   r  z}
    V-JEPA 2 Model transformer with a video classification head on top (a linear layer on top of the attentive pooler).
    c                        e Zd Zdef fdZee	 	 	 d
dej                  dej                  dz  de	dz  de	dz  de
ez  f
d	              Z xZS )VJEPA2ForVideoClassificationr7   c                    t         |   |       |j                  | _        t        |      | _        t        |      | _        t        j                  |j                  |j                  d      | _
        | j                          y r   )r-   r>   
num_labelsr  r  r  poolerr   r   r8   
classifierr  r?  s     r&   r>   z%VJEPA2ForVideoClassification.__init__,  sd      ++!&) ,F3))F$6$68I8IPTU 	r%   NrL   labelsr   r"  rM   c                    | j                  |d||      }|j                  }| j                  |      }| j                  |      }	d}
|| j	                  |	|| j
                        }
t        |
|	|j                  |j                        S )ag  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import torch
        >>> import numpy as np
        >>> from transformers import AutoVideoProcessor, VJEPA2ForVideoClassification

        >>> device = "cuda"

        >>> video_processor = AutoVideoProcessor.from_pretrained("facebook/vjepa2-vitl-fpc16-256-ssv2")
        >>> model = VJEPA2ForVideoClassification.from_pretrained("facebook/vjepa2-vitl-fpc16-256-ssv2").to(device)

        >>> video = np.ones((64, 256, 256, 3))  # 64 frames, 256x256 RGB
        >>> inputs = video_processor(video, return_tensors="pt").to(device)

        >>> # For inference
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> logits = outputs.logits

        >>> predicted_label = logits.argmax(-1).item()
        >>> print(model.config.id2label[predicted_label])

        >>> # For training
        >>> labels = torch.ones(1, dtype=torch.long, device=device)
        >>> loss = model(**inputs, labels=labels).loss

        ```T)rL   r  r   r"  N)pooled_logitsr  r7   )losslogitsr   r   )	r  r   r  r  loss_functionr7   r   r   r   )r0   rL   r  r   r"  r{   r   r   pooler_outputr  r  s              r&   rS   z$VJEPA2ForVideoClassification.forward9  s    Z ++ 3/!5	  
 $55$56/%%F6RVR]R]%^D$!//))	
 	
r%   )NNN)r   r   r   r   r>   r   r   r    rX   r   r#   r   rS   r3   r4   s   @r&   r  r  &  s    |   '+)-,0?
"\\?
 t#?
  $;	?

 #Tk?
 
&	&?
  ?
r%   r  )r  r  r  )r   )r   F):collections.abcr   dataclassesr   r    r    r   r  activationsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   r   utilsr   r   r   r   configuration_vjepa2r   
get_loggerr   loggerr   r(   Moduler6   rZ   rX   r   r~   r   r   r   r   r   r   r  r  r,   r2  r4  rN  r`  rr  rx  r  r  r  r  r  __all__r$   r%   r&   <module>r     s   % !   & ! 9 F F K K . 
		H	% 
9; 9 9 {  * bii  Fryy T %II%<<% 
% <<	%
 LL4'% % %4)6H")) HXU\\ e T V[VbVb  %RYY %		  2, 2j9
BII 9
x0 0T%,,-? 0ELL 0"C!		 C!L_
bii _
D>)		 >)BA) A)J+%? +\%&@ %P'BII '& '&O '& '&T X0' X0 X0v 
O
#8 O

O
d Sr%   