# coding=utf-8
# Copyright 2024 Google AI and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""PyTorch Siglip model."""

from collections.abc import Callable
from dataclasses import dataclass
from typing import Any

import numpy as np
import torch
from torch import nn

from ... import initialization as init
from ...activations import ACT2FN
from ...masking_utils import create_bidirectional_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, torch_int
from ...utils.generic import merge_with_config_defaults
from ...utils.output_capturing import capture_outputs
from .configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    """
)
class SiglipVisionModelOutput(ModelOutput):
    r"""
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    """

    image_embeds: torch.FloatTensor | None = None
    last_hidden_state: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
class SiglipTextModelOutput(ModelOutput):
    r"""
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    """

    text_embeds: torch.FloatTensor | None = None
    last_hidden_state: torch.FloatTensor | None = None
    hidden_states: tuple[torch.FloatTensor, ...] | None = None
    attentions: tuple[torch.FloatTensor, ...] | None = None


@dataclass
@auto_docstring
class SiglipOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`SiglipTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`SiglipVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`SiglipTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`SiglipVisionModel`].
    """

    loss: torch.FloatTensor | None = None
    logits_per_image: torch.FloatTensor | None = None
    logits_per_text: torch.FloatTensor | None = None
    text_embeds: torch.FloatTensor | None = None
    image_embeds: torch.FloatTensor | None = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )
dej                  dej                  fd	Z
 xZS )SiglipVisionEmbeddingsconfigc                 f   t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  |j                  | j                  | j                  | j                  d      | _
        | j
                  | j                  z  dz  | _        | j                  | _        t        j                  | j                  | j                        | _        | j                  dt!        j"                  | j                        j%                  d      d       y )Nvalid)in_channelsout_channelskernel_sizestridepadding   position_idsr   F
persistent)super__init__rF   hidden_size	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr(   arangeexpandr@   rF   	__class__s     r.   rU   zSiglipVisionEmbeddings.__init__x   s    ++ ++ ++!yy++?? 
 !OOt>1D!--"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr-   
embeddingsheightwidthr9   c                    |j                   d   }| j                  j                  j                   d   }t        j                  j                         s%||k(  r ||k(  r| j                  | j                        S | j                  j                  j                  d      }|j                   d   }|| j                  z  }|| j                  z  }	t        |dz        }
|j                  d|
|
|      }|j                  dddd      }t        j                  j                  |||	fdd	      }|j                  dddd      j                  dd|      }|S )
a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   rQ   g      ?r   rN   bicubicF)sizemodealign_corners)shaper`   weightr(   jit
is_tracingrO   	unsqueezerY   r   reshapepermuter   
functionalinterpolateview)r@   rf   rg   rh   r]   r^   patch_pos_embeddim
new_height	new_widthsqrt_num_positionss              r.   interpolate_pos_encodingz/SiglipVisionEmbeddings.interpolate_pos_encoding   sE    !&&q)//66<<Q? yy##%+*F6UZ?**4+<+<==1188BB1Er"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nr-   pixel_valuesc                 \   |j                   \  }}}}| j                  j                  j                  }| j                  |j	                  |            }|j                  d      j                  dd      }|r|| j                  |||      z   }|S || j                  | j                        z   }|S )N)dtyperN   r   )
rn   r\   ro   r   toflatten	transposer}   r`   rO   )	r@   r~   r}   _rg   rh   target_dtypepatch_embedsrf   s	            r.   forwardzSiglipVisionEmbeddings.forward   s    *001fe++2288++LOO,O,OP!))!,66q!<
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr-   F)r$   r%   r&   r   rU   r(   Tensorintr}   r)   r   __classcell__re   s   @r.   rE   rE   w   s`    q1 q($5<< $ $UX $]b]i]i $L
E$5$5 
Z_ZfZf 
r-   rE   c            	            e Zd Zdef fdZ	 	 	 d	dej                  dz  dej                  dz  dej                  dz  dej                  fdZ	 xZ
S )


class SiglipTextEmbeddings(nn.Module):
    def __init__(self, config: SiglipTextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        position_ids: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding}"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None,
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
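

# Minimal shape check for `eager_attention_forward` (a sketch with made-up sizes,
# not part of the original module):
#
#     >>> q = k = v = torch.randn(2, 8, 10, 64)   # (batch, heads, seq, head_dim)
#     >>> host = nn.Module()                      # only `.training` is consulted
#     >>> out, w = eager_attention_forward(host, q, k, v, None, scaling=64**-0.5)
#     >>> out.shape                               # heads moved back next to head_dim
#     torch.Size([2, 10, 8, 64])
#     >>> w.shape
#     torch.Size([2, 8, 10, 10])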


class SiglipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        **kwargs,
    ) -> tuple[torch.Tensor, torch.Tensor | None]:
        """Input shape: Batch x Time x Channel"""
        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        # Resolve the configured attention implementation, falling back to the eager path.
        attention_interface: Callable = ALL_ATTENTION_FUNCTIONS.get_interface(
            self.config._attn_implementation, eager_attention_forward
        )

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)
        return attn_output, attn_weights
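

# End-to-end shape contract of `SiglipAttention` (a sketch; the tiny config below is
# hypothetical, and the weight shapes assume the eager attention path is resolved
# for a bare config):
#
#     >>> cfg = SiglipVisionConfig(hidden_size=64, num_attention_heads=4)
#     >>> attn = SiglipAttention(cfg)
#     >>> out, weights = attn(torch.randn(1, 10, 64))
#     >>> out.shape, weights.shape
#     (torch.Size([1, 10, 64]), torch.Size([1, 4, 10, 10]))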


class SiglipMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class SiglipEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: SiglipTextConfig | SiglipVisionConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.self_attn = SiglipAttention(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        residual = hidden_states
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


@auto_docstring
class SiglipPreTrainedModel(PreTrainedModel):
    config: SiglipConfig
    base_model_prefix = "siglip"
    input_modalities = ("image", "text")
    supports_gradient_checkpointing = True

    _no_split_modules = [
        "SiglipTextEmbeddings",
        "SiglipVisionEmbeddings",
        "SiglipEncoderLayer",
        "SiglipMultiheadAttentionPoolingHead",
    ]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_attention_backend = True

    _can_record_outputs = {
        "hidden_states": SiglipEncoderLayer,
        "attentions": SiglipAttention,
    }

    @torch.no_grad()
    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, SiglipVisionEmbeddings):
            width = (
                self.config.vision_config.hidden_size
                if isinstance(self.config, SiglipConfig)
                else self.config.hidden_size
            )
            init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
            if hasattr(module, "position_ids"):
                init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
        elif isinstance(module, nn.Embedding):
            init.default_flax_embed_init_(module.weight)
        elif isinstance(module, SiglipAttention):
            init.xavier_uniform_(module.q_proj.weight)
            init.xavier_uniform_(module.k_proj.weight)
            init.xavier_uniform_(module.v_proj.weight)
            init.xavier_uniform_(module.out_proj.weight)
            init.zeros_(module.q_proj.bias)
            init.zeros_(module.k_proj.bias)
            init.zeros_(module.v_proj.bias)
            init.zeros_(module.out_proj.bias)
        elif isinstance(module, SiglipMLP):
            init.xavier_uniform_(module.fc1.weight)
            init.xavier_uniform_(module.fc2.weight)
            init.normal_(module.fc1.bias, std=1e-6)
            init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, SiglipMultiheadAttentionPoolingHead):
            init.xavier_uniform_(module.probe)
            init.xavier_uniform_(module.attention.in_proj_weight)
            init.zeros_(module.attention.in_proj_bias)
        elif isinstance(module, SiglipModel):
            init.zeros_(module.logit_scale)
            init.zeros_(module.logit_bias)
        elif isinstance(module, SiglipForImageClassification):
            init.normal_(
                module.classifier.weight,
                std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            init.lecun_normal_(module.weight)
            if module.bias is not None:
                init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            init.zeros_(module.bias)
            init.ones_(module.weight)
        elif isinstance(module, SiglipTextEmbeddings):
            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))


class SiglipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`SiglipEncoderLayer`].

    Args:
        config: SiglipConfig
    """

    def __init__(self, config: SiglipConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(hidden_states, attention_mask, **kwargs)

        return BaseModelOutput(last_hidden_state=hidden_states)


class SiglipTextTransformer(SiglipPreTrainedModel):
    _input_embed_layer = "token_embedding"

    def __init__(self, config: SiglipTextConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = SiglipTextEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.head = nn.Linear(embed_dim, config.projection_size)

        self.post_init()

    @capture_outputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        attention_mask = create_bidirectional_mask(
            config=self.config,
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
        )

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # Assuming "sticky" EOS tokenization, last token is always EOS.
        pooled_output = last_hidden_state[:, -1, :]
        pooled_output = self.head(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )


@auto_docstring(
    custom_intro="""
    The text model from SigLIP without any head or projection on top.
    """
)
class SiglipTextModel(SiglipPreTrainedModel):
    config: SiglipTextConfig
    input_modalities = ("text",)

    def __init__(self, config: SiglipTextConfig):
        super().__init__(config)
        self.text_model = SiglipTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @merge_with_config_defaults
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, SiglipTextModel

        >>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **kwargs,
        )


class SiglipVisionTransformer(SiglipPreTrainedModel):
    main_input_name = "pixel_values"

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = SiglipVisionEmbeddings(config)
        self.encoder = SiglipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head
        if self.use_head:
            self.head = SiglipMultiheadAttentionPoolingHead(config)

        self.post_init()

    @capture_outputs
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        encoder_outputs: BaseModelOutput = self.encoder(
            inputs_embeds=hidden_states,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
        )


class SiglipMultiheadAttentionPoolingHead(nn.Module):
    """Multihead Attention Pooling."""

    def __init__(self, config: SiglipVisionConfig):
        super().__init__()

        self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = SiglipMLP(config)

    def forward(self, hidden_state):
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)

        hidden_state = self.attention(probe, hidden_state, hidden_state)[0]

        residual = hidden_state
        hidden_state = self.layernorm(hidden_state)
        hidden_state = residual + self.mlp(hidden_state)

        return hidden_state[:, 0]
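

# How the probe pooling above collapses a patch sequence (sketch, with hypothetical
# sizes): for hidden states of shape (B, 576, D), the learned probe (1, 1, D) is
# repeated to (B, 1, D) and used as the attention query over all 576 patch tokens,
# so the MultiheadAttention call returns (B, 1, D); after the residual MLP,
# `hidden_state[:, 0]` yields a single (B, D) pooled vector per image.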


@auto_docstring(
    custom_intro="""
    The vision model from SigLIP without any head or projection on top.
    """
)
class SiglipVisionModel(SiglipPreTrainedModel):
    config: SiglipVisionConfig
    main_input_name = "pixel_values"
    input_modalities = ("image",)

    def __init__(self, config: SiglipVisionConfig):
        super().__init__(config)

        self.vision_model = SiglipVisionTransformer(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @merge_with_config_defaults
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, SiglipVisionModel

        >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )


@auto_docstring
class SiglipModel(SiglipPreTrainedModel):
    config: SiglipConfig

    def __init__(self, config: SiglipConfig):
        super().__init__(config)

        if not isinstance(config.text_config, SiglipTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type SiglipTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, SiglipVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type SiglipVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        # First, initialize the text and vision models with the proper attention implementation
        text_model = SiglipTextModel._from_config(text_config)
        vision_model = SiglipVisionModel._from_config(vision_config)

        self.text_model = text_model.text_model
        self.vision_model = vision_model.vision_model

        self.logit_scale = nn.Parameter(torch.randn(1))
        self.logit_bias = nn.Parameter(torch.randn(1))

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value: nn.Module):
        self.text_model.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def get_text_features(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.Tensor | None = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
        >>> with torch.no_grad():
        ...     text_features = model.get_text_features(**inputs)
        ```"""
        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **kwargs,
        )
        return text_outputs.pooler_output

    @can_return_tuple
    @auto_docstring
    def get_image_features(
        self,
        pixel_values: torch.FloatTensor | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        r"""
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModel
        >>> from transformers.image_utils import load_image

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     image_features = model.get_image_features(**inputs)
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )
        return vision_outputs.pooler_output

    @merge_with_config_defaults
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        attention_mask: torch.Tensor | None = None,
        position_ids: torch.LongTensor | None = None,
        return_loss: bool | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> SiglipOutput:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
        >>> # important: we pass `padding=max_length` since the model was trained with this
        >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> logits_per_image = outputs.logits_per_image
        >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
        >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
        31.9% that image 0 is 'a photo of 2 cats'
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            **kwargs,
        )

        image_embeds = vision_outputs.pooler_output
        text_embeds = text_outputs.pooler_output

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))

        logit_scale, logit_bias = self.logit_scale.to(text_embeds.device), self.logit_bias.to(text_embeds.device)
        logits_per_text = logits_per_text * logit_scale.exp() + logit_bias

        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            # Sigmoid loss, adapted from the open-source big_vision SigLIP trainer:
            # +1 targets on the diagonal (matching pairs), -1 everywhere else.
            eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
            m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
            loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
            nll = -torch.sum(loglik, dim=-1)
            loss = nll.mean()

        return SiglipOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
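

# The sigmoid loss computed in `SiglipModel.forward`, restated as a standalone
# sketch (illustrative only; `z_img`/`z_txt` stand in for the pooled, normalized
# embeddings, and 10.0 / -10.0 for the learned logit scale and bias):
#
#     >>> z_img = nn.functional.normalize(torch.randn(4, 16), dim=-1)
#     >>> z_txt = nn.functional.normalize(torch.randn(4, 16), dim=-1)
#     >>> logits = z_txt @ z_img.t() * torch.tensor(10.0).exp() + (-10.0)
#     >>> signs = -torch.ones_like(logits) + 2 * torch.eye(4)  # +1 diagonal, -1 off-diagonal
#     >>> loss = -nn.functional.logsigmoid(signs * logits).sum(dim=-1).mean()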


@auto_docstring(
    custom_intro="""
    SigLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    """
)
class SiglipForImageClassification(SiglipPreTrainedModel):
    main_input_name = "pixel_values"
    input_modalities = ("image",)

    def __init__(self, config: SiglipConfig) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels

        # Create the vision backbone with the proper attention implementation, keeping only
        # the inner transformer.
        vision_model = SiglipVisionModel._from_config(config.vision_config)
        self.vision_model = vision_model.vision_model

        # Classifier head
        self.classifier = (
            nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    def set_input_embeddings(self, value: nn.Module):
        self.vision_model.embeddings.patch_embedding = value

    @merge_with_config_defaults
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> ImageClassifierOutput:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, SiglipForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> # note: we are loading a `SiglipModel` from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip-base-patch16-224")
        >>> model = SiglipForImageClassification.from_pretrained("google/siglip-base-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the two classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: LABEL_1
        ```"""
        outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )

        sequence_output = outputs.last_hidden_state

        # average pool the patch tokens
        sequence_output = torch.mean(sequence_output, dim=1)
        # apply classifier
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss = self.loss_function(labels, logits, self.config, **kwargs)

        return ImageClassifierOutput(loss=loss, logits=logits)


__all__ = [
    "SiglipModel",
    "SiglipPreTrainedModel",
    "SiglipTextModel",
    "SiglipVisionModel",
    "SiglipForImageClassification",
]