
    qi                         d dl Z d dlmZ d dlmZ ddlmZ ddlm	Z	m
Z
 ddlmZ ddlmZmZmZ dd	lmZmZmZmZ  G d
 de      Ze G d de             Z G d dee      Z ed       G d dee             Zg dZy)    N)IJepaConfig   )initialization)BaseModelOutputWithPoolingImageClassifierOutput)Unpack)TransformersKwargsauto_docstring	torch_int   )ViTEmbeddingsViTForImageClassificationViTModelViTPreTrainedModelc            	            e Zd Zddededdf fdZdej                  dededej                  fd	Z		 	 dd
ej                  dej                  dz  dedej                  fdZ xZS )IJepaEmbeddingsconfiguse_mask_tokenreturnNc                     t         |   ||       | `| j                  j                  }t        j                  t        j                  d||j                              | _
        y )N   )super__init__	cls_tokenpatch_embeddingsnum_patchesnn	Parametertorchrandnhidden_sizeposition_embeddings)selfr   r   r   	__class__s       Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/ijepa/modular_ijepa.pyr   zIJepaEmbeddings.__init__   sL    0N++77#%<<A{FL^L^0_#`     
embeddingsheightwidthc                 0   |j                   d   }| j                  j                   d   }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  }|j                   d   }|| j
                  z  }|| j
                  z  }	t        |dz        }
|j                  d|
|
|      }|j                  dddd      }t        j                  j                  |||	fdd	      }|j                  dddd      j                  dd|      }|S )
a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   g      ?r   r   r   bicubicF)sizemodealign_corners)shaper"   r   jit
is_tracing
patch_sizer   reshapepermuter   
functionalinterpolateview)r#   r'   r(   r)   r   num_positionspatch_pos_embeddim
new_height	new_widthsqrt_num_positionss              r%   interpolate_pos_encodingz(IJepaEmbeddings.interpolate_pos_encoding   s#    !&&q)0066q9 yy##%+*F6UZ?+++22r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nr&   pixel_valuesbool_masked_posr?   c                 x   |j                   \  }}}}| j                  ||      }|Z|j                   d   }	| j                  j                  ||	d      }
|j	                  d      j                  |
      }|d|z
  z  |
|z  z   }|r|| j                  |||      z   }n|| j                  z   }| j                  |      }|S )N)r?   r   r+   g      ?)	r0   r   
mask_tokenexpand	unsqueezetype_asr?   r"   dropout)r#   r@   rA   r?   
batch_size_r(   r)   r'   
seq_lengthmask_tokensmasks               r%   forwardzIJepaEmbeddings.forward<   s     (4'9'9$
Avu**<Rj*k
&#))!,J//00ZLK",,R088ED#sTz2[45GGJ $#d&C&CJPVX]&^^J#d&>&>>J\\*-
r&   )F)NF)__name__
__module____qualname__r   boolr   r   Tensorintr?   
BoolTensorrM   __classcell__r$   s   @r%   r   r      s    a{ aD aT a%5<< % %UX %]b]i]i %T 48).	ll ))D0 #'	
 
r&   r   c                       e Zd Z ej                         dej                  ej                  z  ej                  z  ddfd       Z	y)IJepaPreTrainedModelmoduler   Nc                    t        |t        j                  t        j                  f      rct	        j
                  |j                  d| j                  j                         |j                   t	        j                  |j                         yyt        |t        j                        r?t	        j                  |j                         t	        j                  |j                         yt        |t              rct	        j
                  |j                  d| j                  j                         |j                   t	        j                  |j                         yyy)zInitialize the weightsg        )meanstdN)
isinstancer   LinearConv2dinittrunc_normal_weightr   initializer_rangebiaszeros_	LayerNormones_r   r"   rC   )r#   rY   s     r%   _init_weightsz"IJepaPreTrainedModel._init_weightsY   s     fryy"))45v}}3DKK<Y<YZ{{&FKK( '-KK$JJv}}%0v99IfIfg  ,F--. - 1r&   )
rN   rO   rP   r   no_gradr   r^   r_   rf   rh    r&   r%   rX   rX   W   s@    U]]_/BII		$9BLL$H /T / /r&   rX   c                   .     e Zd Zddededef fdZ xZS )
IJepaModelr   add_pooling_layerr   c                 V    t         |   |       || _        t        ||      | _        y)z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        )r   N)r   r   r   r   r'   )r#   r   rm   r   r$   s       r%   r   zIJepaModel.__init__j   s'     	 )&Pr&   )FF)rN   rO   rP   r   rQ   r   rU   rV   s   @r%   rl   rl   i   s(    	Q{ 	Qt 	Q]a 	Q 	Qr&   rl   a  
    IJepa Model transformer with an image classification head on top (a linear layer on top of the final hidden states)
    e.g. for ImageNet.

    <Tip>

        Note that it's possible to fine-tune IJepa on higher resolution images than the ones it has been trained on, by
        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
        position embeddings to the higher resolution.

    </Tip>
    )custom_introc                        e Zd Zdef fdZ	 	 	 d
dej                  dz  dej                  dz  dedz  dee	   de
f
d	Z xZS )IJepaForImageClassificationr   c                 h    t         |   |       t        |d      | _        | j	                          y )NF)rm   )r   r   rl   ijepa	post_init)r#   r   r$   s     r%   r   z$IJepaForImageClassification.__init__   s(     %@
r&   Nr@   labelsr?   kwargsr   c                     | j                   |fd|i|}|j                  }| j                  |j                  d            }d}| | j                  ||| j
                  fi |}t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        r?   r   )r;   N)losslogitshidden_states
attentions)	rs   last_hidden_state
classifierr[   loss_functionr   r   rz   r{   )	r#   r@   ru   r?   rv   outputssequence_outputry   rx   s	            r%   rM   z#IJepaForImageClassification.forward   s     /9djj/
%=/
 /

 "33!5!5!!5!<=%4%%ffdkkLVLD$!//))	
 	
r&   )NNN)rN   rO   rP   r   r   r   rR   rQ   r   r	   r   rM   rU   rV   s   @r%   rq   rq   v   sk    {  -1&*04	
llT)
 t#
 #'+	

 +,
 

r&   rq   )rX   rl   rq   )r   torch.nnr   -transformers.models.ijepa.configuration_ijepar    r   r`   modeling_outputsr   r   processing_utilsr   utilsr	   r
   r   vit.modeling_vitr   r   r   r   r   rX   rl   rq   __all__rj   r&   r%   <module>r      s      E & Q & B B e eGm GT /- / /"
Q%x 
Q %
"68Q %
%
Pr&   