
    qi}                     :   d dl mZ d dlmZ d dlmZ d dlZd dlZd dl	m
Z
 d dlm
c mZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZmZ ddlmZ ddl m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z- e e#d       G d de!                    Z.e e#d       G d de!                    Z/ee# G d de!                    Z0 G d de
jb                        Z2 G d de
jb                        Z3	 dCde
jb                  d ejh                  d!ejh                  d"ejh                  d#ejh                  dz  d$e5d%e5fd&Z6 G d' d(e
jb                        Z7 G d) d*e
jb                        Z8 G d+ d,e      Z9e# G d- d.e             Z: G d/ d0e
jb                        Z; G d1 d2e:      Z< G d3 d4e:      Z= e#d5       G d6 d7e:             Z> G d8 d9e
jb                        Z? e#d:       G d; d<e:             Z@e# G d= d>e:             ZA e#d?       G d@ dAe:             ZBg dBZCy)D    )Callable)	dataclass)AnyN   )initialization)ACT2FN)create_bidirectional_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)merge_with_config_defaults)capture_outputs   )Siglip2ConfigSiglip2TextConfigSiglip2VisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)Siglip2VisionOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r    r!   tupler"        ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/siglip2/modeling_siglip2.pyr   r   +   sr    
 .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r,   r   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)Siglip2TextOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsr    .r!   r"   )r#   r$   r%   r&   r0   r'   r(   r)   r    r!   r*   r"   r+   r,   r-   r/   r/   =   sr    
 -1K""T)026u((4/6:>M5**C/047>7;Je'',-4;r,   r/   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)Siglip2Outputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Siglip2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`Siglip2VisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Siglip2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Siglip2VisionModel`].
    Nlosslogits_per_imagelogits_per_textr0   r   text_model_outputvision_model_outputreturnc                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r6   r7   N)getattrto_tuple).0kselfs     r-   	<genexpr>z)Siglip2Output.to_tuple.<locals>.<genexpr>n   s=      
  LLDGRYZ^`aRbRkRkRmm
s   -0)r*   keysr?   s   `r-   r<   zSiglip2Output.to_tuplem   s#     
YY[
 
 	
r,   )r#   r$   r%   r&   r3   r'   r(   r)   r4   r5   r0   r   r6   r   r7   r*   r   r<   r+   r,   r-   r2   r2   O   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:
%* 
r,   r2   c            	            e Zd Zdef fdZedej                  dej                  de	dej                  fd       Z
dej                  dej                  dej                  fd	Z xZS )
Siglip2VisionEmbeddingsconfigc                    t         |           || _        |j                  | _        |j
                  | _        t        j                  |j                  | j
                  z  | j
                  z  | j                        | _	        |j                  | _
        t        | j                  dz        | _        t        j                  | j                  | j                        | _        y )N)in_featuresout_featuresg      ?)super__init__rE   hidden_size	embed_dim
patch_sizennLinearnum_channelspatch_embeddingnum_patchesintposition_embedding_size	Embeddingposition_embeddingr?   rE   	__class__s     r-   rJ   z Siglip2VisionEmbeddings.__init__u   s    ++ ++!yy++doo=O 

 "--'*4+;+;S+@'A$"$,,t/?/?"Pr,   positional_embeddingsspatial_shapes
max_lengthr8   c                    |j                   d   }| j                   d   }| j                  }t        j                  |||f| j                  |      }| j                  ddd      j                  d      } | j                  j                  dk(  r| j                  t        j                        } t        |      D ]  }||   j                         \  }}	t        |	dkD  d       t        |dkD  d       t        ||	z  |k  d	       t        j                  | ||	fd
dd      }
|
j                  |||	z        j!                  dd      }
|
j                  |      }
|
||d||	z  f<   |
d   ||||	z  df<    |S )ac  
        Resize positional embeddings to image-specific size and pad to a fixed size.

        Args:
            positional_embeddings (`torch.Tensor`):
                Position embeddings of shape (height, width, embed_dim)
            spatial_shapes (`torch.LongTensor`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
            max_length (`int`):
                Maximum length of the positional embeddings to pad resized positional embeddings to

        Returns:
            `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
        r   devicedtype   r   cpuz8Width of resized positional embeddings must be positive.z9Height of resized positional embeddings must be positive.z0Resized positional embeddings exceed max_length.bilinearFT)sizemodealign_corners	antialiasN)shaper`   r'   emptyr_   permute	unsqueezetypetofloat32rangetolistr   Finterpolatereshape	transpose)rY   rZ   r[   
batch_sizerL   source_dtyperesulted_positional_embeddingsiheightwidthresized_embeddingss              r-   resize_positional_embeddingsz4Siglip2VisionEmbeddings.resize_positional_embeddings   s   ( $))!,
)//3	,22).Y/(//*
& !6 = =aA F P PQR S !'',,5$9$<$<U]]$K!z" 	XA*1-446MFE"EAI0jk"FQJ1lm"FUNz#ACuv!"%e_#" "4!;!;IvPU~!V!`!`abde!f "4!6!6|!DBT*1.>.>+>?BTUVBW*1fun.>+>?+	X. .-r,   pixel_valuesc                 J   | j                   j                  j                  }| j                  |j                  |            }| j                  j                  j                  | j                  | j                  d      }| j                  |||j                  d         }||z   }|S )aH  
        Args:
            pixel_values (`torch.FloatTensor`):
                Pixel values of shape (batch_size, max_num_patches, num_channels * patch_size * patch_size)
            spatial_shapes (`list[tuple[int, int]]`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
        )r`   r]   r   )r[   )	rQ   weightr`   rm   rV   rs   rT   r|   rh   )r?   r}   rZ   target_dtypepatch_embedsrY   resized_positional_embeddings
embeddingss           r-   forwardzSiglip2VisionEmbeddings.forward   s     ++2288++LOO,O,OP !% 7 7 > > F F(($*F*F!
 )-(I(I!>l>P>PQR>S )J )
%
 "$AA
r,   )r#   r$   r%   r   rJ   staticmethodr'   Tensor
LongTensorrS   r|   r(   r   __classcell__rX   s   @r-   rD   rD   t   s    Q2 Q ;.$||;.((;. ;. 
	;. ;.zE$5$5 uGWGW \a\h\h r,   rD   c            	            e Zd Zdef fdZ	 	 	 d	dej                  dz  dej                  dz  dej                  dz  dej                  fdZ	 xZ
S )
Siglip2TextEmbeddingsrE   c                 N   t         |           |j                  }t        j                  |j
                  |      | _        t        j                  |j                  |      | _        | j                  dt        j                  |j                        j                  d      d       y )Nposition_idsr   r]   F)
persistent)rI   rJ   rK   rN   rU   
vocab_sizetoken_embeddingmax_position_embeddingsrV   register_bufferr'   arangeexpandr?   rE   rL   rX   s      r-   rJ   zSiglip2TextEmbeddings.__init__   s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r,   N	input_idsr   inputs_embedsr8   c                 8   ||j                   d   n|j                   d   }| j                  j                  j                   d   }||kD  rt        d| d|       || j                  d d d |f   }|| j                  |      }| j                  |      }||z   }|S )Nr]   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rh   rV   r   
ValueErrorr   r   )r?   r   r   r   
seq_lengthmax_position_embeddingposition_embeddingsr   s           r-   r   zSiglip2TextEmbeddings.forward   s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H..d,<=S<TV 
 ,,Q^<L  00;M"55lC"%88
r,   NNN)r#   r$   r%   r   rJ   r'   r   r(   r   r   r   r   s   @r-   r   r      sk    

0 

 .20426	##d* &&- ((4/	
 
r,   r   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr]   r   )dimr`   )ptrainingr   ra   )r'   matmulrt   rN   
functionalsoftmaxrn   rm   r`   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r-   eager_attention_forwardr     s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r,   c            
            e Zd ZdZ fdZ	 ddej                  dej                  dz  deej                  ej                  dz  f   fdZ xZ	S )	Siglip2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      F)rI   rJ   rE   rK   rL   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalrN   rO   k_projv_projq_projout_projrW   s     r-   rJ   zSiglip2Attention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar,   Nr!   r   r8   c           
         |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|| j                  | j                  | j                  sdn| j                         \  }}|j#                  |||      j%                         }| j'                  |      }||fS )z#Input shape: Batch x Time x Channelr   ra           )r   r   r   )rh   r   r   r   viewr   r   rt   r   get_interfacerE   _attn_implementationr   r   r   r   r   rs   r   r   )r?   r!   r   r   ru   r   rL   queriesrA   valuesattention_interfacer   r   s                r-   r   zSiglip2Attention.forward2  sW    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0L((r,   N)
r#   r$   r%   r&   rJ   r'   r   r*   r   r   r   s   @r-   r   r     sV    GB. /3$)||$) t+$)
 
u||U\\D00	1$)r,   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
Siglip2MLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )rI   rJ   rE   r   
hidden_actactivation_fnrN   rO   rK   intermediate_sizefc1fc2rW   s     r-   rJ   zSiglip2MLP.__init__Z  sd    #F$5$5699V//1I1IJ99V55v7I7IJr,   r!   r8   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )r?   r!   s     r-   r   zSiglip2MLP.forwarda  s4    /**=9/r,   )r#   r$   r%   rJ   r'   r   r   r   r   s   @r-   r   r   Y  s$    KU\\ ell r,   r   c            	            e Zd Zdeez  f fdZedej                  dej                  de	e
   dej                  fd       Z xZS )Siglip2EncoderLayerrE   c                 D   t         |           |j                  | _        t	        j
                  | j                  |j                        | _        t        |      | _	        t	        j
                  | j                  |j                        | _
        t        |      | _        y Neps)rI   rJ   rK   rL   rN   	LayerNormlayer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlprW   s     r-   rJ   zSiglip2EncoderLayer.__init__i  sm    ++<<F<Q<QR)&1<<F<Q<QRf%r,   r!   r   r   r8   c                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r!   r   r+   )r   r   r   r   )r?   r!   r   r   residual_s         r-   r   zSiglip2EncoderLayer.forwardq  s     !((7)4>> 
')
 
q
 !=0 ((7/ =0r,   )r#   r$   r%   r   r   rJ   r   r'   r   r   r   r(   r   r   r   s   @r-   r   r   h  sd    &25FF & ||  +,	
 
		 r,   r   c                   t    e Zd ZU eed<   dZdZdZg dZdZ	dZ
dZdZeedZ ej"                         d        Zy	)
Siglip2PreTrainedModelrE   siglip2)imagetextT)r   rD   r   $Siglip2MultiheadAttentionPoolingHeadF)r!   r"   c                    t        |t              rt        | j                  t              r | j                  j                  j
                  n| j                  j
                  }t        j                  |j                  j                  dt        j                  |      z         t        |d      rZt        j                  |j                  t        j                   |j                  j"                  d         j%                  d             yyt        |t&        j(                        r t        j*                  |j                         yt        |t,              rIt        j.                  |j0                  j                         t        j.                  |j2                  j                         t        j.                  |j4                  j                         t        j.                  |j6                  j                         t        j8                  |j0                  j:                         t        j8                  |j2                  j:                         t        j8                  |j4                  j:                         t        j8                  |j6                  j:                         yt        |t<              rt        j.                  |j>                  j                         t        j.                  |j@                  j                         t        j                  |j>                  j:                  d       t        j                  |j@                  j:                  d       yt        |tB              rrt        j.                  |jD                         t        j.                  |jF                  jH                         t        j8                  |jF                  jJ                         yt        |tL              r?t        j8                  |jN                         t        j8                  |jP                         yt        |tR              rdt        j                  |jT                  j                  | j                  j                  j
                  dz  | j                  jV                  z         yt        |t&        jX                  t&        jZ                  f      rLt        j\                  |j                         |j:                   t        j8                  |j:                         yyt        |t&        j^                        r?t        j8                  |j:                         t        j`                  |j                         yt        |tb              rZt        j                  |j                  t        j                   |j                  j"                  d         j%                  d             yy)	zInitialize the weightsr   )stdr   r]   r   gư>r   N)2
isinstancerD   rE   r   vision_configrK   initnormal_rV   r   npsqrthasattrcopy_r   r'   r   rh   r   rN   rU   default_flax_embed_init_r   xavier_uniform_r   r   r   r   zeros_biasr   r   r   r   probe	attentionin_proj_weightin_proj_biasSiglip2Modellogit_scale
logit_biasSiglip2ForImageClassification
classifierinitializer_factorrO   Conv2dlecun_normal_r   ones_r   )r?   r   rz   s      r-   _init_weightsz$Siglip2PreTrainedModel._init_weights  s    f56 dkk=9 ))55[[,, 
 LL2299q2775>?QRv~.

6..V=P=P=V=VWY=Z0[0b0bcj0kl /-))&--8 01  !5!56  !5!56  !5!56  !7!78KK**+KK**+KK**+KK,,-
+  !2!23  !2!23LLd3LLd3 DE  .  !1!1!@!@AKK((556-KK**+KK))* =>LL!!((KK--994?$++B`B`` BII 67v}}-{{&FKK( '-KK$JJv}}% 56JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 7r,   N)r#   r$   r%   r   r)   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr'   no_gradr  r+   r,   r-   r   r     si    !(&*# !N"& -&
 U]]_/i /ir,   r   c                   j     e Zd ZdZdef fdZe	 d	dej                  dz  de	e
   defd       Z xZS )
Siglip2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Siglip2EncoderLayer`].

    Args:
        config: Siglip2Config
    rE   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
rI   rJ   rE   rN   
ModuleListro   num_hidden_layersr   layersgradient_checkpointing)r?   rE   r   rX   s      r-   rJ   zSiglip2Encoder.__init__  sP    mm%PVPhPhJi$jQ%8%@$jk&+# %ks   A#Nr   r   r8   c                 T    |}| j                   D ]  } |||fi |} t        |      S )N)r    )r  r   )r?   r   r   r   r!   encoder_layers         r-   r   zSiglip2Encoder.forward  sC     &![[ 	M) M	 ??r,   r   )r#   r$   r%   r&   r   rJ   r   r'   r   r   r   r   r   r   r   s   @r-   r  r    s_    ,} ,  /3@ t+@ +,	@
 
@ @r,   r  c                        e Zd ZdZdef fdZe	 	 ddej                  dej                  dej                  dedz  d	edz  d
efd       Z xZS )Siglip2VisionTransformerrQ   rE   c                 l   t         |   |       || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        t        |d      sdn|j                  | _        | j                  rt        |      | _        | j#                          y )Nr   vision_use_headT)rI   rJ   rE   rK   rD   r   r  encoderrN   r   r   post_layernormr   r  use_headr   head	post_initr   s      r-   rJ   z!Siglip2VisionTransformer.__init__  s     &&	1&9%f- ll9&:O:OP$+F4E$FFLbLb==<VDDIr,   Nr}   r   rZ   output_attentionsoutput_hidden_statesr8   c                    ||n| j                   j                  }||n| j                   j                  }| j                  ||      }t	        | j                   ||      }| j                  ||||      }	|	j                  }
| j                  |
      }
| j                  r| j                  |
|      nd}t        |
||	j                  |	j                        S )z
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        NrE   r   r   )r   r   r   r!  )r    pooler_outputr!   r"   )rE   r   r!  r   r	   r  r    r  r  r  r   r!   r"   )r?   r}   r   rZ   r   r!  r   r!   encoder_attention_maskencoder_outputsr    r$  s               r-   r   z Siglip2VisionTransformer.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 nE!:;;')"
 ,0<<'1/!5	 ,8 ,
 ,== //0ABHL		"3^D[_)/')77&11	
 	
r,   NN)r#   r$   r%   _input_embed_layerr   rJ   r   r'   r(   r   r   boolr   r   r   r   s   @r-   r  r    s    *2   *.,0+
''+
 +
 ((	+

  $;+
 #Tk+
 
$+
 +
r,   r  c                        e Zd ZdZdef fdZee	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
e   d	ef
d
              Z xZS )Siglip2TextTransformerr   rE   c                 @   t         |   |       || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        t        j                  ||j                        | _        | j                          y r   )rI   rJ   rE   rK   r   r   r  r  rN   r   r   final_layer_normrO   projection_sizer  r  r   s      r-   rJ   zSiglip2TextTransformer.__init__:  su     &&	/7%f- "YF<Q<Q RIIi)?)?@	r,   Nr   r   r   r   r8   c                 t   |t        d      |j                         }|j                  d|d         }| j                  ||      }t	        | j
                  ||      } | j                  d||d|}|j                  }| j                  |      }|d d dd d f   }	| j                  |	      }	t        ||	      S )NzYou have to specify input_idsr]   )r   r   r#  )r   r   )r    r$  r+   )r   rd   r   r   r	   rE   r  r    r-  r  r   )
r?   r   r   r   r   input_shaper!   r&  r    pooled_outputs
             r-   r   zSiglip2TextTransformer.forwardE  s     <==nn&NN2{27	),W 3;;')
 ,84<< ,
'),
 ,
 ,== 112CD *!R(3		-0)/'
 	
r,   r   )r#   r$   r%   r(  r   rJ   r   r   r'   r   r   r   r   r   r   r   s   @r-   r+  r+  7  s    *	0 	  *..2,0	&
<<$&&
 t+&
 llT)	&

 +,&
 
$&
  &
r,   r+  zL
    The text model from Siglip2 without any head or projection on top.
    c                        e Zd ZU eed<   dZdef fdZdej                  fdZ	d Z
e ed      e	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  dee   def
d                     Z xZS )Siglip2TextModelrE   )r   c                 d    t         |   |       t        |      | _        | j	                          y r   )rI   rJ   r+  
text_modelr  rW   s     r-   rJ   zSiglip2TextModel.__init__y  s&     08r,   r8   c                 B    | j                   j                  j                  S r   r5  r   r   rB   s    r-   get_input_embeddingsz%Siglip2TextModel.get_input_embeddings      ))999r,   c                 :    || j                   j                  _        y r   r7  r?   r   s     r-   set_input_embeddingsz%Siglip2TextModel.set_input_embeddings      5:""2r,   Ftie_last_hidden_statesNr   r   r   r   c                 .     | j                   d|||d|S )a  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, Siglip2TextModel

        >>> model = Siglip2TextModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip2-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   r   r+   r5  r?   r   r   r   r   s        r-   r   zSiglip2TextModel.forward  s/    6 t 
)%
 	
 	
r,   r   )r#   r$   r%   r   r)   r  rJ   rN   Moduler8  r<  r   r   r   r'   r   r   r   r   r   r   r   s   @r-   r3  r3  p  s      0 :bii :;  E2 *..2,0	
<<$&
 t+
 llT)	

 +,
 
$
  3  
r,   r3  c                        e Zd ZdZdef fdZd	dej                  dej                  dz  dej                  fdZ xZ	S )
r   zMultihead Attention Pooling.rE   c                    t         |           t        j                  t	        j
                  dd|j                              | _        t        j                  j                  |j                  |j                  d      | _
        t        j                  |j                  |j                        | _        t        |      | _        || _        |j                  | _        y )Nr   T)batch_firstr   )rI   rJ   rN   	Parameterr'   randnrK   r   MultiheadAttentionr   r   r   r   	layernormr   r   rE   r   rW   s     r-   rJ   z-Siglip2MultiheadAttentionPoolingHead.__init__  s    \\%++aF4F4F"GH
44V5G5GIcIcqu4vf&8&8f>S>STf%33r,   Nhidden_stater   r8   c                    |j                   d   }| j                  j                  |dd      }||j                   d   |j                   d   }}t        | j                  |||      }||j                  d| j
                  |d      }|j                  d||      }|j                  t        j                  k(  rht        j                  |t        j                  d|j                  |j                        t        j                  |j                        j                        }| j                  ||||      d   }|}| j!                  |      }|| j#                  |      z   }|d d df   S )Nr   r   )rE   r   r   encoder_hidden_statesr]   r   r^   )	attn_mask)rh   r   repeatr	   rE   r   rs   r`   r'   r)  wheretensorr_   finfominr   rK  r   )r?   rL  r   ru   r   
target_len
source_lenr   s           r-   r   z,Siglip2MultiheadAttentionPoolingHead.forward  sF   !''*


!!*a3%%*[[^\5G5G5J
J6{{#-&2	N )!/!6!6q$..*VW!X!/!7!7J
!S "''5::5%*[[&S1F1FekkZEKK044&N ~~e\<Sa~bcde~~l3$((<"88AqD!!r,   r   )
r#   r$   r%   r&   r   rJ   r'   r   r   r   r   s   @r-   r   r     sB    &	42 	4"ELL "%,,QUBU "afamam "r,   r   zN
    The vision model from Siglip2 without any head or projection on top.
    c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
e ed      edej                  d	ej                   d
ej"                  dee   def
d                     Z xZS )Siglip2VisionModelrE   r}   r   c                 d    t         |   |       t        |      | _        | j	                          y r   )rI   rJ   r  vision_modelr  rW   s     r-   rJ   zSiglip2VisionModel.__init__  s)     4V< 	r,   r8   c                 B    | j                   j                  j                  S r   r[  r   rQ   rB   s    r-   r8  z'Siglip2VisionModel.get_input_embeddings        ++;;;r,   Fr>  pixel_attention_maskrZ   r   c                 .     | j                   d|||d|S )a  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Siglip2VisionModel

        >>> model = Siglip2VisionModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```r}   r   rZ   r+   r[  r?   r}   r_  rZ   r   s        r-   r   zSiglip2VisionModel.forward  s2    J !t   
%/)
 	
 	
r,   )r#   r$   r%   r   r)   main_input_namer  rJ   rN   rD  r8  r   r   r   r'   r(   r   r   r   r   r   r   r   r   s   @r-   rX  rX    s      $O!2 <bii <  E2'
'''
 $ll'
 ((	'

 +,'
 
$'
  3  '
r,   rX  c                       e Zd ZU eed<   def fdZdej                  fdZdej                  fdZ	e
e	 	 ddej                  d	ej                  dz  d
ej                  dz  dee   deez  f
d              Ze
e	 	 	 ddej&                  dz  dej                  dz  dej(                  dz  dee   deez  f
d              Ze
e	 	 	 	 	 	 	 	 	 ddej(                  dz  dej&                  dz  dej                  dz  dej(                  dz  d	ej                  dz  d
ej(                  dz  dedz  dedz  dedz  defd              Z xZS )r   rE   c                    t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }t        j                  |      }t        j                  |      }|j                  | _        |j                  | _        t        j                  t!        j"                  d            | _        t        j                  t!        j"                  d            | _        | j)                          y )NzNconfig.text_config is expected to be of type Siglip2TextConfig but is of type .zRconfig.vision_config is expected to be of type Siglip2VisionConfig but is of type r   )rI   rJ   r   text_configr   	TypeErrorrl   r   r   r3  _from_configrX  r5  r[  rN   rH  r'   rI  r   r   r  )r?   rE   rh  r   r5  r[  rX   s         r-   rJ   zSiglip2Model.__init__  s    &,,.?@++,-Q0 
 &..0CD--./q2 
 ((,, &22;?
)66}E %//(55<<A7,,u{{1~6 	r,   r8   c                 B    | j                   j                  j                  S r   r7  rB   s    r-   r8  z!Siglip2Model.get_input_embeddings=  r9  r,   r   c                 :    || j                   j                  _        y r   r7  r;  s     r-   r<  z!Siglip2Model.set_input_embeddings@  r=  r,   Nr   r   r   r   c                 .     | j                   d|||d|S )ao  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip2-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
        >>> with torch.no_grad():
        ...     text_features = model.get_text_features(**inputs)
        ```rA  r+   rB  rC  s        r-   get_text_featureszSiglip2Model.get_text_featuresC  s/    0 t 
)%
 	
 	
r,   r}   r_  rZ   c                 .     | j                   d|||d|S )a  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModel
        >>> from transformers.image_utils import load_image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     image_features = model.get_image_features(**inputs)
        ```
        ra  r+   rb  rc  s        r-   get_image_featureszSiglip2Model.get_image_featuresb  s2    D !t   
%/)
 	
 	
r,   return_lossr   r!  c
           	         ||n| j                   j                  }|	|	n| j                   j                  }	| j                  |||||	      }| j	                  |||||	      }|j
                  }|j
                  }||j                  ddd      z  }||j                  ddd      z  }t        j                  ||j                         j                  |j                              }| j                  j                  |j                        | j                  j                  |j                        }}||j                         z  |z   }|j                         }d}|rt        j                  |j!                  d      |j                  	      }t        j"                  |       d|z  z   }t        j$                  j&                  j)                  ||z        }t        j*                  |d
       }|j-                         }t/        |||||||      S )ae  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
        >>> # important: we pass `padding=max_length` since the model was trained with this
        >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> logits_per_image = outputs.logits_per_image
        >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
        >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
        31.9% that image 0 is 'a photo of 2 cats'
        ```
        N)r}   r   rZ   r   r!  )r   r   r   r   r!  ra   r]   T)r   r   keepdimr   )r_   r   )r3   r4   r5   r0   r   r6   r7   )rE   r   r!  r[  r5  r$  normr'   r   trm   r_   r   r   expeyerd   	ones_likerN   r   
logsigmoidsummeanr2   )r?   r   r}   r_  rZ   r   r   rq  r   r!  r   vision_outputstext_outputsr   r0   r5   r   r   r4   r3   rx  m1_diag1logliknlls                           r-   r   zSiglip2Model.forward  s    j 2C1N-TXT_T_TqTq$8$D $++JjJj 	 6:5F5F%/)/!5 6G 6
 48??)%/!5 4C 4
 &33"00 $l&7&7!T&7&RR!K$4$4qb$$4$OO  ,,{LNN4D4G4GHZHZ4[\"&"2"2"5"5k6H6H"I4??K]K]^i^p^pKqZ)KOO,==
J*,,.))O003O<R<RSC881s7BHXX((33H4NOF99V,,C88:D-+#%* .
 	
r,   r'  r   )	NNNNNNNNN)r#   r$   r%   r   r)   rJ   rN   rD  r8  r<  r   r   r'   r   r   r   r*   r   rn  r(   r   rp  r)  r2   r   r   r   s   @r-   r   r     s    } @:bii :;")) ;  /3,0	
<<
 t+
 llT)	

 +,
 
+	+
  
:  264826	%
''$.%
 $llT1%
 ((4/	%

 +,%
 
+	+%
  %
P  .2154826.204#')-,0h
##d*h
 ''$.h
 $llT1	h

 ((4/h
 t+h
 &&-h
 D[h
  $;h
 #Tkh
 
h
  h
r,   r   z
    Siglip2 vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    c                   4    e Zd ZdZdZdeddf fdZdej                  fdZ	dej                  fd	Z
eee	 	 	 	 	 	 ddej                  dz  d
ej                  dz  dej                   dz  dej                  dz  dedz  dedz  defd                     Z xZS )r   r}   rY  rE   r8   Nc                 ~   t         |   |       |j                  | _        t        j	                  |j
                        }|j                  | _        |j                  dkD  r4t        j                  |j
                  j                  |j                        nt        j                         | _        | j                          y )Nr   )rI   rJ   
num_labelsrX  rj  r   r[  rN   rO   rK   Identityr   r  )r?   rE   r[  rX   s      r-   rJ   z&Siglip2ForImageClassification.__init__  s      ++ *66v7K7KL(55 OUN_N_bcNcBIIf**668I8IJikititiv 	
 	r,   c                 B    | j                   j                  j                  S r   r]  rB   s    r-   r8  z2Siglip2ForImageClassification.get_input_embeddings  r^  r,   r   c                 :    || j                   j                  _        y r   r]  r;  s     r-   r<  z2Siglip2ForImageClassification.set_input_embeddings  s    7<$$4r,   r_  rZ   labelsr   r!  c                 ,   ||n| j                   j                  }||n| j                   j                  }| j                  |||||      }|j                  }	|Q|d   j                  |	j                        }
t        j                  |	|
z  d      t        j                  |
d      z  }	nt        j                  |	d      }	| j                  |	      }d}|| j                  ||| j                         }t        |||j                  |j                        S )a  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, Siglip2ForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> # note: we are loading a `Siglip2Model` from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip2-base-patch16-224")
        >>> model = Siglip2ForImageClassification.from_pretrained("google/siglip2-base-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the two classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: LABEL_1
        ```
        N)r   rZ   r   r!  ).Nr   rt  )r3   logitsr!   r"   )rE   r   r!  r[  r    rm   r_   r'   r{  r|  r   loss_functionr   r!   r"   )r?   r}   r_  rZ   r  r   r!  r   outputssequence_output	pool_maskr  r3   s                r-   r   z%Siglip2ForImageClassification.forward  s!   f 2C1N-TXT_T_TqTq$8$D $++JjJj 	 /3.?.?/)/!5 /@ /
 "33  +,Y7::?;Q;QRI#ii)(CKeiiXaghNiiO#jja@O 1%%ffdkkBD$!//))	
 	
r,   )NNNNNN)r#   r$   r%   rd  r  r   rJ   rN   rD  r8  r<  r   r   r   r'   r   r   r)  r   r   r   r   s   @r-   r   r     s     %O!}  $<bii <=")) =   -14826&*)-,0R
llT)R
 $llT1R
 ((4/	R

 t#R
  $;R
 #TkR
 
R
    R
r,   r   )r   r   r3  rX  r   )r   )Dcollections.abcr   dataclassesr   typingr   numpyr   r'   torch.nnrN   torch.nn.functionalr   rq    r   r   activationsr   masking_utilsr	   modeling_layersr
   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.output_capturingr   configuration_siglip2r   r   r   r   r/   r2   rD  rD   r   r   floatr   r   r   r   r   r  r  r+  r3  r   rX  r   r   __all__r+   r,   r-   <module>r     s  * % !       & ! 6 9 b b F & n n 7 5 X X 
	<+ 	< 	< 
	< 	< 	<  
K  
   
Febii eP%BII %^ %II%<<% 
% <<	%
 LL4'% % %.;)ryy ;)| 4 D Gi_ Gi GiT@RYY @D=
5 =
@6
3 6
r 
0
- 0

0
f,"299 ,"^ 
:
/ :

:
z \
) \
 \
~ q
$: q
q
hr,   