
    qi:r                        d dl Z d dlmZ d dlmZ d dlmZ d dlZd dlm	c m
Z d dlm	Z	 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ ddlm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+ ee" G d de                     Z, ed       G d de	jZ                               Z. G d de	jZ                        Z/ G d de	jZ                        Z0 G d de	jZ                        Z1	 d>de	jZ                  d ejd                  d!ejd                  d"ejd                  d#ejd                  dz  d$e3d%e3fd&Z4 G d' d(e	jZ                        Z5 G d) d*e      Z6 G d+ d,e	jZ                        Z7 G d- d.e	jZ                        Z8e" G d/ d0e             Z9 e"d12       G d3 d4e9             Z: e"d52       G d6 d7e9             Z;d8ejd                  d9ejd                  fd:Z<e" G d; d<e9             Z=g d=Z>y)?    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)use_kernel_forward_from_hub)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )Aimv2ConfigAimv2TextConfigAimv2VisionConfigc                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)Aimv2Outputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Aimv2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`Aimv2VisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Aimv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Aimv2VisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputreturnc                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r#   r$   N)getattrto_tuple).0kselfs     Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/aimv2/modeling_aimv2.py	<genexpr>z'Aimv2Output.to_tuple.<locals>.<genexpr>L   s=      
  LLDGRYZ^`aRbRkRkRmm
s   -0)tuplekeysr,   s   `r-   r)   zAimv2Output.to_tupleK   s#     
YY[
 
 	
    )__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r    r!   r"   r#   r   r$   r/   r   r)    r2   r-   r   r   -   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:
%* 
r2   r   RMSNormc                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	Aimv2RMSNormepsr%   Nc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        Aimv2RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parameterr7   onesweightvariance_epsilon)r,   hidden_sizer>   	__class__s      r-   rA   zAimv2RMSNorm.__init__T   s1     	ll5::k#:; #r2   hidden_statesc                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )N   T)keepdim)	dtypetor7   float32powmeanrsqrtrE   rD   )r,   rH   input_dtypevariances       r-   forwardzAimv2RMSNorm.forward\   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r2   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)r/   rD   shaperE   r1   s    r-   
extra_reprzAimv2RMSNorm.extra_reprc   s*    ))*+6$2G2G1HIIr2   )gư>)
r3   r4   r5   floatrA   r7   TensorrU   rX   __classcell__rG   s   @r-   r=   r=   R   s7    $ $$ $;U\\ ;ell ;Jr2   r=   c                   $     e Zd Z fdZd Z xZS )Aimv2MLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y )Nbias)r@   rA   configrF   intermediate_sizer   Linearmlp_bias	gate_projup_proj	down_projr	   
hidden_actact_fnr,   rb   rG   s     r-   rA   zAimv2MLP.__init__h   s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r2   c                     | j                  | j                  | j                  |            | j                  |      z        }|S N)rh   rj   rf   rg   )r,   xrh   s      r-   rU   zAimv2MLP.forwardr   s6    NN4;;t~~a/@#ADLLQRO#ST	r2   )r3   r4   r5   rA   rU   r[   r\   s   @r-   r^   r^   g   s    0r2   r^   c                        e Zd Zdef fdZedddej                  fdej                  fd       Z	dej                  dej                  fd	Z
 xZS )
Aimv2VisionEmbeddingsrb   c                 B   t         |           || _        |j                  | _        t	        j
                  |j                  |j                  |j                  |j                        | _        t        |j                  |j                        | _        |j                  |j                  z  dz  }| j                  j                  s%t	        j                  ||j                        | _        | j!                  dt#        j$                  |      j'                  d      d       y )N)kernel_sizestriderJ   position_idsr   rK   F
persistent)r@   rA   rb   
patch_sizer   Conv2dnum_channelsrF   patch_embedr=   rms_norm_epsrms_norm
image_size	is_native	Embeddingposition_embeddingregister_bufferr7   arangeexpand)r,   rb   num_patchesrG   s      r-   rA   zAimv2VisionEmbeddings.__init__x   s     ++99!3!3ARAR[a[l[l
 %V%7%79L9LM((F,=,==!C{{$$&(ll;@R@R&SD#^U\\+-F-M-Mg-Vchir2      g     @cpur%   c                 :   t        j                  t        |      ||      }t        j                  t        |       ||      }t        j                  ||d      \  }}|dz  }t        j                  |||      |z  }	d||	z  z  }	|j	                         d   |	d d d f   z  }
|j	                         d   |	d d d f   z  }t        j
                  |
j                         |
j                         |j                         |j                         gd      d d d d d f   S )	NrM   devicexy)indexing   g      ?).Nr   dim)r7   r   intmeshgridflattenconcatsincos)heightwidth	embed_dimtemperaturer   rM   grid_wgrid_hpos_dimomegaout_hout_ws               r-   "build_2d_sincos_position_embeddingz8Aimv2VisionEmbeddings.build_2d_sincos_position_embedding   s     c%jfEc&kvFFq.WE&AGK{E)* +eD!Gn< +eD!Gn<||UYY[%))+uyy{EIIKPVWXY]_`bcYcddr2   pixel_valuesc                    |j                         \  }}}}| j                  |      j                  d      j                  dd      }| j	                  |      }| j
                  j                  rY| j                  || j                  z  || j                  z  | j
                  j                  |j                  |j                        }n| j                  | j                        }||z   }|S )NrJ   r   )r   r   rM   )sizer{   r   	transposer}   rb   r   r   rx   rF   r   rM   r   rt   )r,   r   _r   r   rH   	pos_embeds          r-   rU   zAimv2VisionEmbeddings.forward   s    *//11fe((6>>qAKKAqQm4;;  ??$//)(++11$++#)) @ I //0A0ABI%	1r2   )r3   r4   r5   r   rA   staticmethodr7   rO   rZ   r   rU   r[   r\   s   @r-   rp   rp   w   s]    j0 j !$'%u}}e	e e ELL U\\ r2   rp   c            	            e Zd Zdef fdZ	 	 	 d	dej                  dz  dej                  dz  dej                  dz  dej                  fdZ	 xZ
S )
Aimv2TextEmbeddingsrb   c                 N   t         |           |j                  }t        j                  |j
                  |      | _        t        j                  |j                  |      | _        | j                  dt        j                  |j                        j                  d      d       y )Nrt   ru   Frv   )r@   rA   rF   r   r   
vocab_sizetoken_embeddingmax_position_embeddingsr   r   r7   r   r   )r,   rb   r   rG   s      r-   rA   zAimv2TextEmbeddings.__init__   s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r2   N	input_idsrt   inputs_embedsr%   c                 8   ||j                   d   n|j                   d   }| j                  j                  j                   d   }||kD  rt        d| d|       || j                  d d d |f   }|| j                  |      }| j                  |      }||z   }|S )NrK   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rW   r   rD   
ValueErrorrt   r   )r,   r   rt   r   
seq_lengthmax_position_embeddingposition_embeddings
embeddingss           r-   rU   zAimv2TextEmbeddings.forward   s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H..d,<=S<TV 
 ,,Q^<L  00;M"55lC"%88
r2   NNN)r3   r4   r5   r   rA   r7   
LongTensorr8   rZ   rU   r[   r\   s   @r-   r   r      sj    

 

 .20426	##d* &&- ((4/	
 
r2   r   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrK   r   )r   rM   )ptrainingr   rJ   )r7   matmulr   r   
functionalsoftmaxrO   rN   rM   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r-   eager_attention_forwardr      s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r2   c            
            e Zd ZdZ fdZ	 ddej                  dej                  dz  deej                  ej                  dz  f   fdZ xZ	S )	Aimv2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperc                 x   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fr`   )r@   rA   rb   rF   r   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   rd   qkv_biask_projv_projq_projout_projrk   s     r-   rA   zAimv2Attention.__init__   s2   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//iiV__UiiV__UiiV__U		$..$..vWr2   NrH   r   r%   c           
         |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|| j                  | j                  | j                  sdn| j                         \  }}|j#                  |||      j%                         }| j'                  |      }||fS )z#Input shape: Batch x Time x Channelr   rJ           )r   r   r   )rW   r   r   r   viewr   r   r   r   get_interfacerb   _attn_implementationr   r   r   r   r   reshaper   r   )r,   rH   r   r   
batch_sizer   r   queriesr0   valuesattention_interfacer   r   s                r-   rU   zAimv2Attention.forward   sW    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0L((r2   rm   )
r3   r4   r5   r6   rA   r7   rZ   r/   rU   r[   r\   s   @r-   r   r      sV    GX, /3$)||$) t+$)
 
u||U\\D00	1$)r2   r   c            	            e Zd Zdef fdZ	 d	dej                  dej                  dz  dee   dej                  fdZ	 xZ
S )
Aimv2EncoderLayerrb   c                     t         |           t        |      | _        t	        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y rm   )r@   rA   r   	attentionr^   ffnr=   rF   r|   	rms_norm1	rms_norm2rk   s     r-   rA   zAimv2EncoderLayer.__init__(  sZ    '/F#%f&8&8&:M:MN%f&8&8&:M:MNr2   NrH   r   r   r%   c                     | j                  |      } | j                  d||d|\  }}||z   }| j                  |      }| j                  |      }||z   }|S )N)rH   r   r:   )r   r   r   r   )r,   rH   r   r   norm_hidden_statesr   r   
mlp_outputs           r-   rU   zAimv2EncoderLayer.forward/  sl     "^^M:'r6HYgrkqrQ%3!^^M:XX01
%
2r2   rm   )r3   r4   r5   r   rA   r7   rZ   r   r   rU   r[   r\   s   @r-   r   r   '  sY    O0 O /3|| t+ +,	
 
r2   r   c                   j     e Zd ZdZdef fdZe	 d	dej                  dz  de	e
   defd       Z xZS )
Aimv2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Aimv2EncoderLayer`].

    Args:
        config: Aimv2Config
    rb   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r@   rA   rb   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)r,   rb   r   rG   s      r-   rA   zAimv2Encoder.__init__I  sO    mmfNfNfHg$h1%6v%>$hi&+# %is   A#Nr   r   r%   c                 T    |}| j                   D ]  } |||fi |} t        |      S )N)last_hidden_state)r   r   )r,   r   r   r   rH   encoder_layers         r-   rU   zAimv2Encoder.forwardP  sC     &![[ 	M) M	 ??r2   rm   )r3   r4   r5   r6   r   rA   r   r7   rZ   r   r   r   rU   r[   r\   s   @r-   r   r   @  s_    ,{ ,  /3@ t+@ +,	@
 
@ @r2   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Aimv2AttentionPoolingHeadrb   c                 &   t         |           |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  t        j                  dd| j                              | _        t        j                  | j                  | j                  d      | _        y )Nr`   r   T)r@   rA   rF   r   r   r   rd   r   r   r   rB   r7   zeros	cls_tokenoutput_projrk   s     r-   rA   z"Aimv2AttentionPoolingHead.__init__c  s    !--33ii 0 0$2B2BYii 0 0$2B2BYekk!Q8H8H&IJ99T%5%5t7G7GdSr2   rH   r%   c                    |j                   \  }}}| j                  j                  |dd      }| j                  |      j	                  ||| j
                  || j
                  z        }| j                  |      j	                  ||| j
                  || j
                  z        }|j	                  |d| j
                  || j
                  z        }|j                  dddd      }|j                  dddd      }|j                  dddd      }t        j                  |||      }	|	j                  dd      j	                  |d|      }	|	j                  d      }	| j                  |	      }
|
S )NrK   r   r   rJ   r   r   )rW   r   r   r   r   r   r   permuteFscaled_dot_product_attentionr   rQ   r   )r,   rH   r   seq_len
hidden_dimr   r   r   r   r   outputs              r-   rU   z!Aimv2AttentionPoolingHead.forwardn  sH   *7*=*='
GZNN))*b"=	kk-(00WdnnV`dhdrdrVrsM*22:wXbfjftftXtu!!*at~~A]^kk!Q1%aAq)aAq)44UCG!++Aq199*aT!&&1&-!!+.r2   )	r3   r4   r5   r   rA   r7   rZ   rU   r[   r\   s   @r-   r   r   b  s-    	T0 	TU\\ ell r2   r   c                   v     e Zd ZU dZeed<   dZdZdZg dZ	dZ
dZdZ ej                          fd       Z xZS )Aimv2PreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    rb   aimv2)imageT)r   r   rp   r   c                 $   t         |   |       t        |d      rYt        |j                  t
        j                        r4t        j                  |j                  t        j                  d             y y t        |t              r7t        j                  |j                  d| j                  j                         y t        |t               rZt        j"                  |j$                  t'        j(                  |j$                  j*                  d         j-                  d             y t        |t.              rZt        j"                  |j$                  t'        j(                  |j$                  j*                  d         j-                  d             y y )Nlogit_scaleg$I$I,@r   )rQ   stdrK   ru   )r@   _init_weightshasattr
isinstancer
  r   rB   init	constant_mathlogr   normal_r   rb   initializer_rangerp   copy_rt   r7   r   rW   r   r   )r,   r   rG   s     r-   r  z"Aimv2PreTrainedModel._init_weights  s   f%6=)&,,bll;v11488H3EF < 9:LL))9V9VW 56JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 34JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5r2   )r3   r4   r5   r6   r   r9   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attnr7   no_gradr  r[   r\   s   @r-   r  r    sY    
 !&*# NU]]_
i 
ir2   r  zL
    The Vision model from AIMv2 without any head or projection on top.
    )custom_introc                        e Zd ZU eed<   dZeedZdef fdZ	de
j                  fdZe ed      ed	ee   defd
                     Z xZS )Aimv2VisionModelrb   r   rH   
attentionsc                 6   t         |   |       || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        |j                  | _        | j                  rt        |      | _        | j                          y rm   )r@   rA   rb   rp   r   r   encoderr=   rF   r|   r}   use_headr   head	post_initrk   s     r-   rA   zAimv2VisionModel.__init__  sq     /7#F+$V%7%79L9LM==1&9DIr2   r%   c                 .    | j                   j                  S rm   )r   r{   r1   s    r-   get_input_embeddingsz%Aimv2VisionModel.get_input_embeddings  s    ***r2   Ftie_last_hidden_statesr   c                     | j                  |      } | j                  dd|i|}|j                  }| j                  |      }| j                  r| j                  |      nd}t        ||      S )a3  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Siglip2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```r   Nr   pooler_outputr:   )r   r$  r   r}   r%  r&  r   )r,   r   r   rH   encoder_outputsr   r.  s          r-   rU   zAimv2VisionModel.forward  sz    < 5+74<< ,
',
,

 ,== MM*;<8<		"344)/'
 	
r2   )r3   r4   r5   r   r9   main_input_namer   r   _can_record_outputsrA   r   Moduler)  r   r   r   r   r   r   rU   r[   r\   s   @r-   r   r     s~     $O*$
0 +bii +  E2*
 +,*
 
$	*
  3  *
r2   r   zJ
    The text model from AIMv2 without any head or projection on top.
    c            
            e Zd ZdZeedZdef fdZde	j                  fdZd Ze ed	      e	 ddej"                  d
z  dee   defd                     Z xZS )Aimv2TextModelr   r!  rb   c                     t         |   |       || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        |j                  | _        | j                          y rm   )r@   rA   rb   r   r   r   r$  r=   rF   r|   r}   eos_token_idr'  rk   s     r-   rA   zAimv2TextModel.__init__  sa     -f5#F+$V%7%79L9LM"//r2   r%   c                 .    | j                   j                  S rm   r   r   r1   s    r-   r)  z#Aimv2TextModel.get_input_embeddings  s    ...r2   c                 &    || j                   _        y rm   r8  )r,   r   s     r-   set_input_embeddingsz#Aimv2TextModel.set_input_embeddings  s    */'r2   Fr*  Nr   r   c                    | j                  |      }|j                  \  }}}t        j                  |t        j                  |j
                        }|j                  d      j                  |d      }	|t        | j                  ||	||d       } | j                  d	||d|}
|
j                  }| j                  |      }|t        j                  |j                  d   |j
                        |j                  t        j                  |j
                        | j                  k(  j                         j!                  d      f   }t#        ||      S )
Nr   r   rK   )rb   r   rt   r   cache_positionpast_key_values)r   r   )r   r   r-  r:   )r   rW   r7   r   longr   	unsqueezer   r   rb   r$  r   r}   rN   r   r6  argmaxr   )r,   r   r   r   rH   r   r  r   r<  rt   r/  r   pooled_outputs                r-   rU   zAimv2TextModel.forward  sN    	2!.!4!4
GQgUZZH\H\]%//299*bI%/{{+)-- $N '$,, 
')
 
 ,== MM*;< *LL*003<M<T<TU\\		2C2J2J\KtO`O``eegnnsunvx

 */'
 	
r2   rm   )r3   r4   r5   r0  r   r   r1  r   rA   r   r2  r)  r:  r   r   r   r7   rZ   r   r   r   rU   r[   r\   s   @r-   r4  r4    s     "O +$
	 	/bii /0  E2 /3'
 t+'
 +,	'

 
$'
  3  '
r2   r4  tensorr%   c                     t        j                  | d      }t        j                  |dd      }t        j                  |d      }|S )z
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
    rJ   rK   T)r   rL   g      ?)r7   rP   sum)rB  square_tensor
sum_tensornormed_tensors       r-   _get_vector_normrH  @  s<    
 IIfa(M=b$?JIIj#.Mr2   c                       e Zd ZU eed<   g dZdZdef fdZee		 	 dde
j                  de
j                  dz  de
j                  dz  d	ee   d
eez  f
d              Zee		 dde
j"                  ded	ee   d
eez  fd              Ze	e	 	 	 dde
j(                  dz  de
j"                  dz  de
j                  dz  d	ee   d
ef
d              Z xZS )
Aimv2Modelrb   )r   r   rp   Tc                    t         |   |       |j                  | _        |j                  j                  | _        |j                  j                  | _        t        j                  |j                        | _
        t        j                  |j                        | _        t        j                  | j
                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j"                  t%        j&                  | j(                  j*                              | _        t/        j0                  |j2                        | _        | j7                          y )NFr`   )r@   rA   projection_dimvision_configrF   vision_embed_dimtext_configtext_embed_dimr   _from_configvision_modelr4  
text_modelr   rd   visual_projectiontext_projectionrB   r7   rB  rb   logit_scale_init_valuer
  r  r  max_logit_scalemax_log_logit_scaler'  rk   s     r-   rA   zAimv2Model.__init__Q  s     $33 & 4 4 @ @$00<<,99&:N:NO(55f6H6HI!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY#'88F,B,B#C r2   Nr   r   rt   r   r%   c                 x     | j                   d|||dd|}|j                  }| j                  |      |_        |S )a
  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("openai/aimv2-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/aimv2-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r   r   rt   return_dictr:   )rS  r.  rU  )r,   r   r   rt   r   text_outputsrA  s          r-   get_text_featureszAimv2Model.get_text_featuresc  sV    0 4C4?? 4
)%	4

 4
 %22%)%9%9-%H"r2   r   interpolate_pos_encodingc                 v     | j                   d||dd|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, Aimv2Model
        >>> from transformers.image_utils import load_image

        >>> model = Aimv2Model.from_pretrained("openai/aimv2-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/aimv2-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```T)r   r]  rZ  r:   )rR  r.  rT  )r,   r   r]  r   vision_outputsrA  s         r-   get_image_featureszAimv2Model.get_image_features  sU    6 6GT5F5F 6
%%=6
 	6
 '44'+'='=m'L$r2   c                     | j                   dd|i|} | j                  d||d|}|j                  }| j                  |      }|j                  }| j	                  |      }|t        |      z  }|t        |      z  }| j                  j                  d| j                        j                         j                  |j                        }	|	|z  |j                         z  }
|
j                         }t        ||
||||      S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```r   )r   r   r   )r   r    r!   r"   r#   r$   r:   )rR  rS  r.  rT  rU  rH  r
  clamprX  exprN   r   tr   )r,   r   r   r   r   r_  r[  r"   r!   r
  r    r   s               r-   rU   zAimv2Model.forward  s'   B 6GT5F5F 6
%6
6

 4C4?? 4
)4
 4
 &33--l;"00**;7 $&6|&DD!$4[$AA&&,,S$2J2JKOOQTTU`UgUgh&48HH*,,.-+#%* .
 	
r2   )NN)Fr   )r3   r4   r5   r   r9   r  r  rA   r   r   r7   rZ   r   r   r/   r   r\  r8   boolr`  r   r   rU   r[   r\   s   @r-   rJ  rJ  K  sn   ]{ $  /3,0	 <<  t+  llT)	 
 +,  
+	+    D  */"''" #'" +,	"
 
+	+"  "H  .215.2	?
##d*?
 ''$.?
 t+	?

 +,?
 
?
  ?
r2   rJ  )r   rJ  r  r4  )r   )?r  collections.abcr   dataclassesr   typingr   r7   torch.nn.functionalr   r   r    r   r  activationsr	   integrationsr
   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   configuration_aimv2r   r   r   r   r2  r=   r^   rp   r   rZ   rY   r   r   r   r   r   r  r   r4  rH  rJ  __all__r:   r2   r-   <module>rw     s#  ,  $ !      & ! 7 / 9 K F & V V 7 5 P P  
+  
   
F Y'J299 J (J(ryy  1BII 1h%")) %^ %II%<<% 
% <<	%
 LL4'% % %.:)RYY :)z2 2@299 @D		 D i? i iD 
F
+ F

F
R 
C
) C

C
LU\\ ell  b
% b
 b
J Wr2   