
    qian                        d Z ddlZddlZddlmc mZ ddlmZ ddlmZ	 ddl
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z%m&Z&m'Z' ddl(m)Z)m*Z*m+Z+  G d de'      Z, G d de&      Z- G d de%      Z. G d de+      Z/ G d de#      Z0 G d de"      Z1 G d d ejd                        Z3 G d! d"e      Z4 G d# d$e)      Z5 G d% d&e      Z6 G d' d(e*      Z7 G d) d*ejd                        Z8e G d+ d,e             Z9 ed-.       G d/ d0e9             Z: ed1.       G d2 d3e9             Z;e G d4 d5e             Z<g d6Z=y)7z%Pytorch implementation of AIMv2 Model    N)nn   )initialization)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )	CLIPModelCLIPTextEmbeddings_get_vector_norm)LlamaMLPLlamaRMSNorm)SiglipConfigSiglipTextConfigSiglipVisionConfig)SiglipAttentionSiglipEncoderSiglipOutputc                        e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddededededededed	ed
ededededededef fdZ xZ	S )Aimv2VisionConfiga  
    This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a
    AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
    [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2816):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the Linear layers or Not.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the for initializing all weight matrices.
        use_head (`str`, *optional*, defaults to `True`):
            Whether to use Attention Pooling Head or Not.
        is_native (`str`, *optional*, defaults to `False`):
            Whether to use ckpt trained for image native resolution or not.
    Example:

    ```python
    >>> from transformers import SiglipVisionConfig, SiglipVisionModel

    >>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
    >>> configuration = Aimv2VisionConfig()

    >>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
    >>> model = Aimv2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```hidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_channels
image_size
patch_sizerms_norm_epsattention_dropoutqkv_biasmlp_bias
hidden_actinitializer_rangeuse_head	is_nativec                     t        |   d|||||||||
d	| || _        || _        |	| _        || _        |
| _        || _        || _        | `	y )N)	r   r    r!   r"   r*   r#   r$   r%   r(    )
super__init__r,   r+   r'   r)   r(   r&   r-   layer_norm_eps)selfr   r    r!   r"   r#   r$   r%   r&   r'   r(   r)   r*   r+   r,   r-   kwargs	__class__s                    Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/aimv2/modular_aimv2.pyr1   zAimv2VisionConfig.__init__c   sx    & 	 	
#// 3!%!!	
 	
 !!2!2  ("    )i   i         r         h㈵>        FFsilu{Gz?TF
__name__
__module____qualname____doc__intfloatboolstrr1   __classcell__r5   s   @r6   r   r   *   s    6t  !%!##$"#& #'!( (  (  	( 
 !(  (  (  (  (  !(  (  (  (  !(  (   !(  ( r7   r   c                   t     e Zd ZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 ddededededededed	ed
ededededef fdZ xZ	S )Aimv2TextConfiga
  
    This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a
    AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
    [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`Aimv2Model`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in the Transformer encoder.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the Linear layers or Not.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        eos_token_id (`int`, *optional*, defaults to 49407):
            The id of the end-of-sequence token in the vocabulary.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the for initializing all weight matrices.
    
vocab_sizer   r    r!   r"   r&   r'   r(   r)   r*   eos_token_idmax_position_embeddingsr+   c                     t        |   d||||||
||d| || _        || _        |	| _        || _        || _        | `| `| `	| `
y )N)rM   r   r    r!   r"   r*   rO   rN   r/   )r0   r1   r+   r'   r)   r(   r&   bos_token_idpad_token_idprojection_sizer2   )r3   rM   r   r    r!   r"   r&   r'   r(   r)   r*   rN   rO   r+   r4   r5   s                  r6   r1   zAimv2TextConfig.__init__   sy    " 	 
	
!#// 3!$;%
	
 
	
 "3!2  ( r7   )i   i   i         r<   r=   FFr>   i  M   r?   r@   rJ   s   @r6   rL   rL      s    'V  !%!##$"#& !')"&& &  &  	& 
 &  !&  &  !&  &  &  &  &  "%&   &  & r7   rL   c                   &     e Zd ZdZ	 d fd	Z xZS )Aimv2Configa@  
    [`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to
    instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2
    [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

    Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PreTrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`Aimv2TextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`Aimv2VisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import Aimv2Config, Aimv2Model

    >>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
    >>> configuration = Aimv2Config()

    >>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
    >>> model = Aimv2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig
    >>> from transformers import Aimv2TextConfig, Aimv2VisionConfig

    >>> # Initializing a AIMv2Text and AIMv2Vision configuration
    >>> config_text = Aimv2TextConfig()
    >>> config_vision = Aimv2VisionConfig()

    >>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
    ```c                 V    || _         || _        d| _        t        |   ||fi | | `y )Ng      Y@)projection_dimlogit_scale_init_valuemax_logit_scaler0   r1   initializer_factor)r3   text_configvision_configrZ   r[   r4   r5   s         r6   r1   zAimv2Config.__init__  s7     -&<#$m>v>#r7   )NNi   g/L
F@)rA   rB   rC   rD   r1   rI   rJ   s   @r6   rX   rX      s    +\ `f$ $r7   rX   c                       e Zd Zy)Aimv2OutputNrA   rB   rC   r/   r7   r6   ra   ra         r7   ra   c                       e Zd Zy)Aimv2RMSNormNrb   r/   r7   r6   re   re     rc   r7   re   c                       e Zd Zy)Aimv2MLPNrb   r/   r7   r6   rg   rg   "  rc   r7   rg   c                        e Zd Zdef fdZedddej                  fdej                  fd       Z	dej                  dej                  fd	Z
 xZS )
Aimv2VisionEmbeddingsconfigc                 B   t         |           || _        |j                  | _        t	        j
                  |j                  |j                  |j                  |j                        | _        t        |j                  |j                        | _        |j                  |j                  z  dz  }| j                  j                  s%t	        j                  ||j                        | _        | j!                  dt#        j$                  |      j'                  d      d       y )N)kernel_sizestrider   position_ids   F)
persistent)r0   r1   rj   r%   r   Conv2dr#   r   patch_embedre   r&   rms_normr$   r-   	Embeddingposition_embeddingregister_buffertorcharangeexpand)r3   rj   num_patchesr5   s      r6   r1   zAimv2VisionEmbeddings.__init__'  s     ++99!3!3ARAR[a[l[l
 %V%7%79L9LM((F,=,==!C{{$$&(ll;@R@R&SD#^U\\+-F-M-Mg-Vchir7      g     @cpureturnc                 :   t        j                  t        |      ||      }t        j                  t        |       ||      }t        j                  ||d      \  }}|dz  }t        j                  |||      |z  }	d||	z  z  }	|j	                         d   |	d d d f   z  }
|j	                         d   |	d d d f   z  }t        j
                  |
j                         |
j                         |j                         |j                         gd      d d d d d f   S )	Ndtypedevicexy)indexing   g      ?).Nrp   dim)ry   rz   rE   meshgridflattenconcatsincos)heightwidth	embed_dimtemperaturer   r   grid_wgrid_hpos_dimomegaout_hout_ws               r6   "build_2d_sincos_position_embeddingz8Aimv2VisionEmbeddings.build_2d_sincos_position_embedding5  s     c%jfEc&kvFFq.WE&AGK{E)* +eD!Gn< +eD!Gn<||UYY[%))+uyy{EIIKPVWXY]_`bcYcddr7   pixel_valuesc                    |j                         \  }}}}| j                  |      j                  d      j                  dd      }| j	                  |      }| j
                  j                  rY| j                  || j                  z  || j                  z  | j
                  j                  |j                  |j                        }n| j                  | j                        }||z   }|S )Nr   rp   )r   r   r   )sizert   r   	transposeru   rj   r-   r   r%   r   r   r   rw   rn   )r3   r   _r   r   hidden_states	pos_embeds          r6   forwardzAimv2VisionEmbeddings.forwardF  s    *//11fe((6>>qAKKAqQm4;;  ??$//)(++11$++#)) @ I //0A0ABI%	1r7   )rA   rB   rC   r   r1   staticmethodry   float32Tensorr   r   rI   rJ   s   @r6   ri   ri   &  s]    j0 j !$'%u}}e	e e ELL U\\ r7   ri   c                       e Zd Zy)Aimv2TextEmbeddingsNrb   r/   r7   r6   r   r   Z  rc   r7   r   c                        e Zd Z fdZ xZS )Aimv2Attentionc                    t         |   |       t        j                  | j                  | j                  |j
                        | _        t        j                  | j                  | j                  |j
                        | _        t        j                  | j                  | j                  |j
                        | _        t        j                  | j                  | j                  |j
                        | _	        y )Nbias)
r0   r1   r   Linearr   r(   k_projv_projq_projout_projr3   rj   r5   s     r6   r1   zAimv2Attention.__init___  s     iiV__UiiV__UiiV__U		$..$..vWr7   )rA   rB   rC   r1   rI   rJ   s   @r6   r   r   ^  s    X Xr7   r   c            	            e Zd Zdef fdZ	 d	dej                  dej                  dz  dee   dej                  fdZ	 xZ
S )
Aimv2EncoderLayerrj   c                     t         |           t        |      | _        t	        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y N)r0   r1   r   	attentionrg   ffnre   r   r&   	rms_norm1	rms_norm2r   s     r6   r1   zAimv2EncoderLayer.__init__h  sZ    '/F#%f&8&8&:M:MN%f&8&8&:M:MNr7   Nr   attention_maskr4   r   c                     | j                  |      } | j                  d||d|\  }}||z   }| j                  |      }| j                  |      }||z   }|S )N)r   r   r/   )r   r   r   r   )r3   r   r   r4   norm_hidden_statesattn_outputr   
mlp_outputs           r6   r   zAimv2EncoderLayer.forwardo  sl     "^^M:'r6HYgrkqrQ%3!^^M:XX01
%
2r7   r   )rA   rB   rC   r   r1   ry   r   r   r   r   rI   rJ   s   @r6   r   r   g  sY    O0 O /3|| t+ +,	
 
r7   r   c                       e Zd Zy)Aimv2EncoderNrb   r/   r7   r6   r   r     rc   r7   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Aimv2AttentionPoolingHeadrj   c                 &   t         |           |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  t        j                  dd| j                              | _        t        j                  | j                  | j                  d      | _        y )Nr   rp   T)r0   r1   r   r"   	num_headsr   r   r(   r   r   	Parameterry   zeros	cls_tokenoutput_projr   s     r6   r1   z"Aimv2AttentionPoolingHead.__init__  s    !--33ii 0 0$2B2BYii 0 0$2B2BYekk!Q8H8H&IJ99T%5%5t7G7GdSr7   r   r   c                    |j                   \  }}}| j                  j                  |dd      }| j                  |      j	                  ||| j
                  || j
                  z        }| j                  |      j	                  ||| j
                  || j
                  z        }|j	                  |d| j
                  || j
                  z        }|j                  dddd      }|j                  dddd      }|j                  dddd      }t        j                  |||      }	|	j                  dd      j	                  |d|      }	|	j                  d      }	| j                  |	      }
|
S )Nrq   rp   r   r   r   r   )shaper   r{   r   reshaper   r   permuteFscaled_dot_product_attentionr   meanr   )r3   r   
batch_sizeseq_len
hidden_dimr   keyvaluequeryr   outputs              r6   r   z!Aimv2AttentionPoolingHead.forward  sH   *7*=*='
GZNN))*b"=	kk-(00WdnnV`dhdrdrVrsM*22:wXbfjftftXtu!!*at~~A]^kk!Q1%aAq)aAq)44UCG!++Aq199*aT!&&1&-!!+.r7   )	rA   rB   rC   r   r1   ry   r   r   rI   rJ   s   @r6   r   r     s-    	T0 	TU\\ ell r7   r   c                   v     e Zd ZU dZeed<   dZdZdZg dZ	dZ
dZdZ ej                          fd       Z xZS )Aimv2PreTrainedModelz
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    rj   aimv2)imageT)r   r   ri   r   c                 $   t         |   |       t        |d      rYt        |j                  t
        j                        r4t        j                  |j                  t        j                  d             y y t        |t              r7t        j                  |j                  d| j                  j                         y t        |t               rZt        j"                  |j$                  t'        j(                  |j$                  j*                  d         j-                  d             y t        |t.              rZt        j"                  |j$                  t'        j(                  |j$                  j*                  d         j-                  d             y y )Nlogit_scaleg$I$I,@r=   )r   stdrq   ro   )r0   _init_weightshasattr
isinstancer   r   r   init	constant_mathlogr   normal_r   rj   r+   ri   copy_rn   ry   rz   r   r{   r   )r3   moduler5   s     r6   r   z"Aimv2PreTrainedModel._init_weights  s   f%6=)&,,bll;v11488H3EF < 9:LL))9V9VW 56JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 34JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5r7   )rA   rB   rC   rD   rX   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attnry   no_gradr   rI   rJ   s   @r6   r   r     sY    
 !&*# NU]]_
i 
ir7   r   zL
    The Vision model from AIMv2 without any head or projection on top.
    )custom_introc                        e Zd ZU eed<   dZeedZdef fdZ	de
j                  fdZe ed      ed	ee   defd
                     Z xZS )Aimv2VisionModelrj   r   r   
attentionsc                 6   t         |   |       || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        |j                  | _        | j                  rt        |      | _        | j                          y r   )r0   r1   rj   ri   
embeddingsr   encoderre   r   r&   ru   r,   r   head	post_initr   s     r6   r1   zAimv2VisionModel.__init__  sq     /7#F+$V%7%79L9LM==1&9DIr7   r   c                 .    | j                   j                  S r   )r   rt   r3   s    r6   get_input_embeddingsz%Aimv2VisionModel.get_input_embeddings  s    ***r7   Ftie_last_hidden_statesr4   c                     | j                  |      } | j                  dd|i|}|j                  }| j                  |      }| j                  r| j                  |      nd}t        ||      S )a3  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Siglip2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```inputs_embedsNlast_hidden_statepooler_outputr/   )r   r   r  ru   r,   r   r	   )r3   r   r4   r   encoder_outputsr  r  s          r6   r   zAimv2VisionModel.forward  sz    < 5+74<< ,
',
,

 ,== MM*;<8<		"344)/'
 	
r7   )rA   rB   rC   r   r   main_input_namer   r   _can_record_outputsr1   r   Moduler   r   r   r   r   r   r	   r   rI   rJ   s   @r6   r   r     s~     $O*$
0 +bii +  E2*
 +,*
 
$	*
  3  *
r7   r   zJ
    The text model from AIMv2 without any head or projection on top.
    c            
            e Zd ZdZeedZdef fdZde	j                  fdZd Ze ed	      e	 ddej"                  d
z  dee   defd                     Z xZS )Aimv2TextModel	input_idsr   rj   c                     t         |   |       || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        |j                  | _        | j                          y r   )r0   r1   rj   r   r   r   r   re   r   r&   ru   rN   r   r   s     r6   r1   zAimv2TextModel.__init__$  sa     -f5#F+$V%7%79L9LM"//r7   r   c                 .    | j                   j                  S r   r   token_embeddingr   s    r6   r   z#Aimv2TextModel.get_input_embeddings/  s    ...r7   c                 &    || j                   _        y r   r  )r3   r   s     r6   set_input_embeddingsz#Aimv2TextModel.set_input_embeddings2  s    */'r7   Fr   Nr   r4   c                    | j                  |      }|j                  \  }}}t        j                  |t        j                  |j
                        }|j                  d      j                  |d      }	|t        | j                  ||	||d       } | j                  d	||d|}
|
j                  }| j                  |      }|t        j                  |j                  d   |j
                        |j                  t        j                  |j
                        | j                  k(  j                         j!                  d      f   }t#        ||      S )
Nr   r   rq   )rj   r   rn   r   cache_positionpast_key_values)r   r   )r   r   r  r/   )r   r   ry   rz   longr   	unsqueezer{   r   rj   r   r  ru   torE   rN   argmaxr	   )r3   r
  r   r4   r   r   r   r   r  rn   r  r  pooled_outputs                r6   r   zAimv2TextModel.forward5  sN    	2!.!4!4
GQgUZZH\H\]%//299*bI%/{{+)-- $N '$,, 
')
 
 ,== MM*;< *LL*003<M<T<TU\\		2C2J2J\KtO`O``eegnnsunvx

 */'
 	
r7   r   )rA   rB   rC   r  r   r   r  rL   r1   r   r  r   r  r   r   r   ry   r   r   r   r	   r   rI   rJ   s   @r6   r	  r	    s     "O +$
	 	/bii /0  E2 /3'
 t+'
 +,	'

 
$'
  3  '
r7   r	  c                       e Zd ZdZdefdZee	 	 	 ddej                  dz  dej                  dz  dej                  dz  dee   d	ef
d
              Zy)
Aimv2ModelTrj   c                    t        j                  | |       |j                  | _        |j                  j                  | _        |j                  j                  | _        t        j                  |j                        | _
        t        j                  |j                        | _        t        j                  | j
                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j"                  t%        j&                  | j(                  j*                              | _        t/        j0                  |j2                        | _        | j7                          y )NFr   )r
   r1   rZ   r_   r   vision_embed_dimr^   text_embed_dimr   _from_configvision_modelr	  
text_modelr   r   visual_projectiontext_projectionr   ry   tensorrj   r[   r   r   r   r\   max_log_logit_scaler   )r3   rj   s     r6   r1   zAimv2Model.__init__f  s     v.$33 & 4 4 @ @$00<<,99&:N:NO(55f6H6HI!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY#'88F,B,B#C r7   Nr
  r   r   r4   r   c                     | j                   dd|i|} | j                  d||d|}|j                  }| j                  |      }|j                  }| j	                  |      }|t        |      z  }|t        |      z  }| j                  j                  d| j                        j                         j                  |j                        }	|	|z  |j                         z  }
|
j                         }t        ||
||||      S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```r   )r
  r   r=   )logits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr/   )r  r   r  r!  r"  r   r   clampr$  expr  r   tra   )r3   r
  r   r   r4   vision_outputstext_outputsr)  r(  r   r'  r&  s               r6   r   zAimv2Model.forwardx  s'   B 6GT5F5F 6
%6
6

 4C4?? 4
)4
 4
 &33--l;"00**;7 $&6|&DD!$4[$AA&&,,S$2J2JKOOQTTU`UgUgh&48HH*,,.-+#%* .
 	
r7   )NNN)rA   rB   rC   r   rX   r1   r   r   ry   
LongTensorFloatTensorr   r   r   ra   r   r/   r7   r6   r  r  b  s    { $  .215.2	?
##d*?
 ''$.?
 t+	?

 +,?
 
?
  ?
r7   r  )rX   r   rL   r   r  r   r	  )>rD   r   ry   torch.nn.functionalr   
functionalr    r   r   masking_utilsr   modeling_layersr   modeling_outputsr   r	   modeling_utilsr
   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   clip.modeling_clipr   r   r   llama.modeling_llamar   r   siglip.configuration_siglipr   r   r   siglip.modeling_siglipr   r   r   r   rL   rX   ra   re   rg   r  ri   r   r   r   r   r   r   r   r	  r  __all__r/   r7   r6   <module>rC     s   ,      & / 9 K - & 
 8 5 P P 9 \ \ Q Qa * a HP & P f6$, 6$r	, 		< 		x 	1BII 1h	, 	X_ X2 2	= 			 D i? i iD 
F
+ F

F
R 
C
) C

C
L V
 V
 V
rr7   