
    qi                        d Z ddlmZ ddlmZ ddlmZ ddlZddlmZ ddl	m
Z dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZm Z  ddl!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)  ejT                  e+      Z,dejZ                  dejZ                  fdZ.dejZ                  dejZ                  fdZ/dejZ                  dejZ                  fdZ0e ed       G d de                    Z1e ed       G d  d!e                    Z2ee G d" d#e                    Z3 G d$ d%ejh                        Z5 G d& d'ejh                        Z6	 dOd(ejh                  d)ejZ                  d*ejZ                  d+ejZ                  d,ejZ                  dz  d-e7d.e7d/ee   fd0Z8 G d1 d2ejh                        Z9 G d3 d4ejh                        Z: G d5 d6e      Z;e G d7 d8e             Z< G d9 d:ejh                        Z= G d; d<e<      Z> ed=       G d> d?e<             Z? G d@ dAe<      Z@ edB       G dC dDe<             ZAe G dE dFe<             ZBe G dG dHe<             ZCe G dI dJe<             ZD edK       G dL dMe<             ZEg dNZFy)PzPyTorch CLIP model.    )Callable)	dataclass)AnyN)nn   )initialization)ACT2FN)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingImageClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )
CLIPConfigCLIPTextConfigCLIPVisionConfiglogitsreturnc                     t         j                  j                  | t        j                  t        |       | j                              S )Ndevice)r   
functionalcross_entropytorcharangelenr"   )r   s    X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/clip/modeling_clip.pycontrastive_lossr)   /   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)r)   t)r+   caption_loss
image_losss      r(   	clip_lossr0   3   s,    #J/L!*,,.1J:%,,r*   tensorc                     t        j                  | d      }t        j                  |dd      }t        j                  |d      }|S )z
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
       T)dimkeepdim      ?)r%   powsum)r1   square_tensor
sum_tensornormed_tensors       r(   _get_vector_normr=   9   s<    
 IIfa(M=b$?JIIj#.Mr*   z}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)CLIPVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__rA   r%   FloatTensor__annotations__rB   rC   tuplerD    r*   r(   r@   r@   D   sr    
 .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r*   r@   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)CLIPTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsrB   .rC   rD   )rE   rF   rG   rH   rO   r%   rI   rJ   rB   rC   rK   rD   rL   r*   r(   rN   rN   V   sr    
 -1K""T)026u((4/6:>M5**C/047>7;Je'',-4;r*   rN   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)
CLIPOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPVisionModel`].
    Nlosslogits_per_imagelogits_per_textrO   rA   text_model_outputvision_model_outputr   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))rU   rV   N)getattrto_tuple).0kselfs     r(   	<genexpr>z&CLIPOutput.to_tuple.<locals>.<genexpr>   s=      
  LLDGRYZ^`aRbRkRkRmm
s   -0)rK   keysr]   s   `r(   rZ   zCLIPOutput.to_tuple   s#     
YY[
 
 	
r*   )rE   rF   rG   rH   rR   r%   rI   rJ   rS   rT   rO   rA   rU   r   rV   rK   r   rZ   rL   r*   r(   rQ   rQ   h   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:
%* 
r*   rQ   c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )CLIPVisionEmbeddingsconfigc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebiasr3   r   position_idsr   r4   
persistent)super__init__rc   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr%   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr&   expandr]   rc   	__class__s     r(   ro   zCLIPVisionEmbeddings.__init__   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr*   
embeddingsheightwidthr   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr4   r7   r   r3   bicubicF)sizemodealign_cornersr5   )shaper}   weight	unsqueezer%   jit
is_tracingrj   rs   r   reshapepermuter   r#   interpolateviewcat)r]   r   r   r   rz   r}   r{   class_pos_embedpatch_pos_embedr5   
new_height	new_widthsqrt_num_positionss                r(   interpolate_pos_encodingz-CLIPVisionEmbeddings.interpolate_pos_encoding   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr*   pixel_valuesc                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model ().)dtyper3   r   r4   r   )r   rr   
ValueErrorry   r   r   toflatten	transposerv   r   r%   r   r   r}   rj   )r]   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   s              r(   forwardzCLIPVisionEmbeddings.forward   s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr*   F)rE   rF   rG   r   ro   r%   Tensorintr   rI   r   __classcell__r   s   @r(   rb   rb      se    q/ q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r*   rb   c            	            e Zd Zdef fdZ	 	 	 d	dej                  dz  dej                  dz  dej                  dz  dej                  fdZ	 xZ
S )
CLIPTextEmbeddingsrc   c                 N   t         |           |j                  }t        j                  |j
                  |      | _        t        j                  |j                  |      | _        | j                  dt        j                  |j                        j                  d      d       y )Nrj   rk   Frl   )rn   ro   rp   r   r|   
vocab_sizetoken_embeddingmax_position_embeddingsr}   r~   r%   r&   r   r]   rc   rq   r   s      r(   ro   zCLIPTextEmbeddings.__init__   s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r*   N	input_idsrj   inputs_embedsr   c                 8   ||j                   d   n|j                   d   }| j                  j                  j                   d   }||kD  rt        d| d|       || j                  d d d |f   }|| j                  |      }| j                  |      }||z   }|S )Nr4   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   r}   r   r   rj   r   )r]   r   rj   r   
seq_lengthmax_position_embeddingposition_embeddingsr   s           r(   r   zCLIPTextEmbeddings.forward   s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H..d,<=S<TV 
 ,,Q^<L  00;M"55lC"%88
r*   NNN)rE   rF   rG   r   ro   r%   
LongTensorrI   r   r   r   r   s   @r(   r   r      sj    

~ 

 .20426	##d* &&- ((4/	
 
r*   r   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr4   r   )r5   r   )ptrainingr   r3   )r%   matmulr   r   r#   softmaxfloat32r   r   r   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r(   eager_attention_forwardr     s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r*   c                        e Zd ZdZdeez  f fdZ	 d
dej                  dej                  dz  de	e
   deej                  ej                  dz  f   fd	Z xZS )CLIPAttentionz=Multi-headed attention from 'Attention Is All You Need' paperrc   c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         F)rn   ro   rc   rp   rq   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   s     r(   ro   zCLIPAttention.__init__   s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar*   NrC   r   r   r   c                    |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||d| j
                        j                  dd      }|j	                  ||d| j
                        j                  dd      }|	j	                  ||d| j
                        j                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|f| j                  | j                  sdn| j                  d|\  }}|j                  ||d      j!                         }| j#                  |      }||fS )z#Input shape: Batch x Time x Channelr4   r   r3           )r   r   )r   r   r   r   r   r   r   r   get_interfacerc   _attn_implementationr   r   r   r   r   r   r   )r]   rC   r   r   r   r   rq   queriesr_   valuesattention_interfacer   r   s                r(   r   zCLIPAttention.forward4  sT    -:,?,?)
J	++m,{{=)]+,,z:r4==ISSTUWXYyyZT]]CMMaQRSZRGQQRSUVW(?(M(MKK,,.E)
 %8	%
 JJ#}}C$,,	%
 	%
!\ "))*j"EPPRmmK0L((r*   N)rE   rF   rG   rH   r   r   ro   r%   r   r   r   rK   r   r   r   s   @r(   r   r     st    GB/.@ B. /3$)||$) t+$) +,	$)
 
u||U\\D00	1$)r*   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )CLIPMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )rn   ro   rc   r	   
hidden_actactivation_fnr   r   rp   intermediate_sizefc1fc2r   s     r(   ro   zCLIPMLP.__init__\  sd    #F$5$5699V//1I1IJ99V55v7I7IJr*   rC   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )r]   rC   s     r(   r   zCLIPMLP.forwardc  s4    /**=9/r*   )rE   rF   rG   ro   r%   r   r   r   r   s   @r(   r   r   [  s$    KU\\ ell r*   r   c                        e Zd Zdeez  f fdZdej                  dej                  dee	   dej                  fdZ xZS )CLIPEncoderLayerrc   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y N)eps)rn   ro   rp   rq   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   s     r(   ro   zCLIPEncoderLayer.__init__k  sl    ++&v.<<F<Q<QR6?<<F<Q<QRr*   rC   r   r   r   c                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)rC   r   rL   )r   r   r   r   )r]   rC   r   r   residualr   s         r(   r   zCLIPEncoderLayer.forwards  s     !((7)4>> 
')
 
q
 !=0 ((7/ =0r*   )rE   rF   rG   r   r   ro   r%   r   r   r   rI   r   r   r   s   @r(   r   r   j  sV    S/.@ S||  +,	
 
		r*   r   c                   l    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZeedZ ej                          d        Zy)CLIPPreTrainedModelrc   clip)imagetextT)rC   rD   c                    | j                   j                  }t        |t              rt	        j
                  |j                  j                  d|dz         t	        j
                  |j                  j                  d|dz         t	        j                  |j                  t        j                  |j                  j                  d         j                  d             nt        |t              r| j                   j                  }t	        j
                  |j                   d|j"                  dz  |z         t	        j
                  |j$                  j                  |j                   j&                  |z         t	        j
                  |j                  j                  |j                   j&                  |z         t	        j                  |j                  t        j                  |j(                        j                  d             nt        |t*              r| j                   j                  }|j"                  dz  d|j                   j,                  z  dz  z  |z  }|j"                  dz  |z  }t	        j
                  |j.                  j                  |       t	        j
                  |j0                  j                  |       t	        j
                  |j2                  j                  |       t	        j
                  |j4                  j                  |       nt        |t6              r| j                   j                  }|j                   j8                  dz  d|j                   j,                  z  dz  z  |z  }d|j                   j8                  z  dz  |z  }t	        j
                  |j:                  j                  |       t	        j
                  |j<                  j                  |       nt        |t>              rt	        j
                  |j@                  j                  |jB                  dz  | j                   j                  z         t	        j
                  |jD                  j                  |jF                  dz  | j                   j                  z         nGt        |tH              rZt	        j
                  |jD                  j                  | j                   j8                  dz  | j                   j                  z         nt        |tJ              rZt	        j
                  |j@                  j                  | j                   j8                  dz  | j                   j                  z         nst        |tL              rct	        j
                  |jN                  j                  | j                   jP                  j8                  dz  | j                   j                  z         t        |tR        jT                        r>t	        jV                  |jX                         t	        jZ                  |j                         t        |tR        j\                        r-|jX                   t	        jV                  |jX                         y	y	y	)
zInitialize the weightsr   g{Gz?)meanstdr4   rk   r   )r  r3   N)/rc   initializer_factor
isinstancer   initnormal_r   r   r}   copy_rj   r%   r&   r   r   rb   rv   rq   ry   initializer_ranger{   r   num_hidden_layersr   r   r   r   r   rp   r   r   	CLIPModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimCLIPVisionModelWithProjectionCLIPTextModelWithProjectionCLIPForImageClassification
classifiervision_configr   r   zeros_ri   ones_r   )r]   r   factorin_proj_stdout_proj_stdfc_stds         r(   _init_weightsz!CLIPPreTrainedModel._init_weights  su    //f01LL//66SftmTLL2299RVWJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 45[[33FLL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9M9M,N,U,UV],^_.[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B([[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**<	*LL&&--))4/$++2P2PP LL((//++T1DKK4R4RR  =>LL((//KK++T1DKK4R4RR  ;<LL&&--KK++T1DKK4R4RR  :;LL!!((KK--994?$++B`B``
 fbll+KK$JJv}}%fbii(V[[-DKK$ .E(r*   N)rE   rF   rG   r   rJ   base_model_prefixinput_modalitiessupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsr%   no_gradr  rL   r*   r(   r   r     s[    (&*#N"&)#
 U]]_8% 8%r*   r   c                   `     e Zd ZdZdef fdZ	 d	dej                  dz  dee	   de
fdZ xZS )
CLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPEncoderLayer`].

    Args:
        config: CLIPConfig
    rc   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
rn   ro   rc   r   
ModuleListranger  r   layersgradient_checkpointing)r]   rc   r   r   s      r(   ro   zCLIPEncoder.__init__  sO    mmuVMeMeGf$g!%5f%=$gh&+# %hs   A#Nr   r   r   c                 T    |}| j                   D ]  } |||fi |} t        |      S )a7  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
        )rB   )r-  r   )r]   r   r   r   rC   encoder_layers         r(   r   zCLIPEncoder.forward  sH    ( &![[ 	M) M	 +
 	
r*   r   )rE   rF   rG   rH   r   ro   r%   r   r   r   r   r   r   r   s   @r(   r(  r(    sK    ,z , /3
 t+
 +,	

 

r*   r(  c                        e Zd ZU eed<   dZddgZdef fdZe e	d      e
	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dee   def
d                     Z xZS )CLIPTextTransformerrc   r  r   r   c                    t         |   |       || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        |j                  | _        | j                          y r   )rn   ro   rc   rp   r   r   r(  encoderr   r   r   final_layer_normeos_token_id	post_initr   s      r(   ro   zCLIPTextTransformer.__init__  sm     &&	,V4"6* "YF<Q<Q R #//r*   Ftie_last_hidden_statesNr   r   rj   r   r   c           	         |t        d      |j                         }|j                  d|d         }| j                  ||      }t	        | j
                  ||t        j                  |j                  d   |j                        d       }|j                  dd         | j                  d||dd	|}|j                  }| j                  |      }| j                  d
k(  rm|t        j                  |j                  d   |j                        |j                  t        j                   |j                        j#                  d      f   }	n|t        j                  |j                  d   |j                        |j                  t        j                   |j                        | j                  k(  j!                         j#                  d      f   }	t%        ||	      S )NzYou have to specify input_idsr4   )r   rj   r   r!   )rc   r   r   cache_positionpast_key_valuesr   T)r   r   r   r3   r   )r   r"   r   rB   pooler_outputrL   )r   r   r   r   r
   rc   r%   r&   r   r"   popr5  rB   r6  r7  r   r   argmaxr   )
r]   r   r   rj   r   input_shaperC   encoder_outputsrB   pooled_outputs
             r(   r   zCLIPTextTransformer.forward  s    <==nn&NN2{27	),W+;;') <<(;(;A(>}G[G[\ 
 	

;%+74<< ,
'),
 	,
 ,== 112CD! ..44Q7@Q@X@XY5995F5M5MNUUZ\U]_M ..44Q7@Q@X@XY EII6G6N6NOSWSdSddB!M */'
 	
r*   r   )rE   rF   rG   r   rJ   r  _no_split_modulesro   r   r   r   r%   r   r   r   r   r   r   r   s   @r(   r2  r2    s     -/AB
~ 
  E2 *..2,0	;
<<$&;
 t+;
 llT)	;

 +,;
 
$;
  3  ;
r*   r2  zI
    The text model from CLIP without any head or projection on top.
    c                        e Zd ZU eed<   dZddgZdef fdZdej                  fdZ
d Ze	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  dee   def
d       Z xZS )CLIPTextModelrc   r3  r   r   c                 d    t         |   |       t        |      | _        | j	                          y r   )rn   ro   r2  
text_modelr8  r   s     r(   ro   zCLIPTextModel.__init__d  s&     -f5r*   r   c                 B    | j                   j                  j                  S r   rI  r   r   r`   s    r(   get_input_embeddingsz"CLIPTextModel.get_input_embeddingsj      ))999r*   c                 :    || j                   j                  _        y r   rK  r]   r   s     r(   set_input_embeddingsz"CLIPTextModel.set_input_embeddingsm      5:""2r*   Nr   r   rj   r   c                 .     | j                   d|||d|S )a9  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPTextModel

        >>> model = CLIPTextModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   rj   rL   )rI  )r]   r   r   rj   r   s        r(   r   zCLIPTextModel.forwardp  s/    0 t 
)%
 	
 	
r*   r   )rE   rF   rG   r   rJ   r  rE  ro   r   ModulerL  rP  r   r%   r   r   r   r   r   r   r   s   @r(   rG  rG  Y  s      -/AB~ :bii :;  *..2,0	
<<$&
 t+
 llT)	

 +,
 
$
 
r*   rG  c                        e Zd ZU eed<   dZdZdgZdef fdZe	 e
d      e	 	 ddej                  dz  d	edz  d
ee   defd                     Z xZS )CLIPVisionTransformerrc   r   r  r   c                 B   t         |   |       || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        | j                          y r   )rn   ro   rc   rp   rb   r   r   r   r   pre_layrnormr(  r5  post_layernormr8  r   s      r(   ro   zCLIPVisionTransformer.__init__  sv     &&	.v6LL8M8MN"6* ll9&:O:OPr*   Fr9  Nr   r   r   c                     |t        d      | j                  ||      }| j                  |      } | j                  dd|i|}|j                  }|d d dd d f   }| j                  |      }t        ||      S )Nz You have to specify pixel_values)r   r   r   r>  rL   )r   r   rY  r5  rB   rZ  r   )r]   r   r   r   rC   rC  rB   rD  s           r(   r   zCLIPVisionTransformer.forward  s     ?@@Ogh))-8+74<< ,
',
,

 ,==)!Q'2++M:)/'
 	
r*   r*  )rE   rF   rG   r   rJ   main_input_namer  rE  ro   r   r   r   r%   rI   boolr   r   r   r   r   r   s   @r(   rV  rV    s    $O!+,	/ 	  E2 2605
''$.
 #'+
 +,	

 
$
  3  
r*   rV  zK
    The vision model from CLIP without any head or projection on top.
    c            
            e Zd ZU eed<   dZdZdgZdef fdZde	j                  fdZe	 	 ddej                  dz  d	ed
ee   defd       Z xZS )CLIPVisionModelrc   r   rW  r   c                 d    t         |   |       t        |      | _        | j	                          y r   )rn   ro   rV  vision_modelr8  r   s     r(   ro   zCLIPVisionModel.__init__  s'     1&9r*   r   c                 B    | j                   j                  j                  S r   ra  r   ry   r`   s    r(   rL  z$CLIPVisionModel.get_input_embeddings        ++;;;r*   Nr   r   c                 ,     | j                   d||d|S )a(  
        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, CLIPVisionModel

        >>> model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   rL   )ra  )r]   r   r   r   s       r(   r   zCLIPVisionModel.forward  s.    < !t   
%%=
 
 	
r*   r*  )rE   rF   rG   r   rJ   r\  r  rE  ro   r   rT  rL  r   r%   rI   r]  r   r   r   r   r   r   s   @r(   r_  r_    s     $O!+,/ <bii <  26).!
''$.!
 #'!
 +,	!

 
$!
 !
r*   r_  c                       e Zd ZU eed<   g dZdef fdZee	 	 dde	j                  de	j                  dz  de	j                  dz  dee   d	eez  f
d
              Zee	 dde	j                   dedee   d	eez  fd              Zee	 	 	 	 	 	 dde	j&                  dz  de	j                   dz  de	j                  dz  de	j&                  dz  dedz  dedee   d	efd              Z xZS )r  rc   )r   r   rb   c                    t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  | _	        |j                  | _        |j                  | _        t        j                  |      }|j                  | _        t         j                  |      }|j"                  | _        t%        j&                  | j                  | j                  d      | _        t%        j&                  | j                  | j                  d      | _        t%        j,                  t/        j0                  | j2                  j4                              | _        | j9                          y )NzKconfig.text_config is expected to be of type CLIPTextConfig but is of type .zOconfig.vision_config is expected to be of type CLIPVisionConfig but is of type Fri   )rn   ro   r  text_configr   	TypeErrortyper  r   projection_dimrp   r  r  rG  _from_configrI  r_  ra  r   r   r  r  rt   r%   r1   rc   logit_scale_init_valuelogit_scaler8  )r]   rc   rk  r  rI  ra  r   s         r(   ro   zCLIPModel.__init__  sx    &,,n=++,-Q0 
 &..0@A--./q2 
 ((,,$33)55 - 9 9"//<
$//&33MB(55!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY 	r*   Nr   r   rj   r   r   c                 x     | j                   d|||dd|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPModel

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r   r   rj   return_dictrL   )rI  r?  r  )r]   r   r   rj   r   text_outputsrD  s          r(   get_text_featureszCLIPModel.get_text_features   sV    0 4C4?? 4
)%	4

 4
 %22%)%9%9-%H"r*   r   r   c                 v     | j                   d||dd|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```T)r   r   rs  rL   )ra  r?  r  )r]   r   r   r   vision_outputsrD  s         r(   get_image_featureszCLIPModel.get_image_featuresD  sU    6 6GT5F5F 6
%%=6
 	6
 '44'+'='=m'L$r*   return_lossc           	      L    | j                   d||d|} | j                  d|||d|}	|j                  }
| j                  |
      }
|	j                  }| j	                  |      }|
t        |
      z  }
|t        |      z  }t        j                  ||
j                         j                  |j                              }|| j                  j                         j                  |j                        z  }|j                         }d}|rt        |      }t        |||||
|	|      S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```rf  rS  N)rR   rS   rT   rO   rA   rU   rV   rL   )ra  rI  r?  r  r  r=   r%   r   r-   r   r"   rq  expr0   rQ   )r]   r   r   r   rj   ry  r   r   rw  rt  rA   rO   rT   rS   rR   s                  r(   r   zCLIPModel.forwardj  sI   L 6GT5F5F 6
%%=6
 6
 4C4?? 4
)%4
 	4
 &33--l;"00**;7 $&6|&DD!$4[$AA  ,,{LNN4D4G4GHZHZ4[\)D,<,<,@,@,B,E,EkFXFX,YY*,,._-D-+#%* .
 	
r*   NNr   )NNNNNF)rE   rF   rG   r   rJ   rE  ro   r   r   r%   r   r   r   rK   r   ru  rI   r]  rx  r   rQ   r   r   r   s   @r(   r  r    s   Z!z !F  /3,0	 <<  t+  llT)	 
 +,  
+	+    D  */"''" #'" +,	"
 
+	+"  "H  .215.204#').M
##d*M
 ''$.M
 t+	M

 &&-M
 D[M
 #'M
 +,M
 
M
  M
r*   r  c                        e Zd ZU eed<   dZddgZdef fdZdej                  fdZ
d Zee	 	 	 dd
ej                  d	z  dej                  d	z  dej                  d	z  dee   def
d              Z xZS )r  rc   r3  r   r   c                     t         |   |       t        j                  |      }|j                  | _        t        j                  |j                  |j                  d      | _	        | j                          y NFrj  )rn   ro   rG  ro  rI  r   r   rp   rn  r  r8  )r]   rc   rI  r   s      r(   ro   z$CLIPTextModelWithProjection.__init__  s[     "//7
$//!yy););V=R=RY^_ 	r*   r   c                 B    | j                   j                  j                  S r   rK  r`   s    r(   rL  z0CLIPTextModelWithProjection.get_input_embeddings  rM  r*   c                 :    || j                   j                  _        y r   rK  rO  s     r(   rP  z0CLIPTextModelWithProjection.set_input_embeddings  rQ  r*   Nr   r   rj   r   c                      | j                   d|||d|}|j                  }| j                  |      }t        ||j                  |j
                  |j                        S )a@  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPTextModelWithProjection

        >>> model = CLIPTextModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-base-patch32")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```rS  )rO   rB   rC   rD   rL   )rI  r?  r  rN   rB   rC   rD   )r]   r   r   rj   r   rt  rD  rO   s           r(   r   z#CLIPTextModelWithProjection.forward  su    4 4C4?? 4
)%4
 	4
 %22**=9"#*<<&44#..	
 	
r*   r   )rE   rF   rG   r   rJ   r  rE  ro   r   rT  rL  rP  r   r   r%   r   r   r   rN   r   r   r   s   @r(   r  r    s     -/AB	~ 	:bii :;  *..2,0	&
<<$&&
 t+&
 llT)	&

 +,&
 
&
  &
r*   r  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
ee	 	 ddej                  dz  ded	ee   defd
              Z xZS )r  rc   r   rW  c                     t         |   |       t        j                  |      }|j                  | _        t        j                  |j                  |j                  d      | _	        | j                          y r  )rn   ro   r_  ro  ra  r   r   rp   rn  r  r8  r]   rc   ra  r   s      r(   ro   z&CLIPVisionModelWithProjection.__init__  s\     &33F;(55!#6+=+=v?T?T[`!a 	r*   r   c                 B    | j                   j                  j                  S r   rc  r`   s    r(   rL  z2CLIPVisionModelWithProjection.get_input_embeddings  rd  r*   Nr   r   c                      | j                   d||d|}|j                  }| j                  |      }t        ||j                  |j
                  |j                        S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPVisionModelWithProjection
        >>> from transformers.image_utils import load_image

        >>> model = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-base-patch32")
        >>> processor = AutoProcessor.from_pretrained("openai/clip-vit-base-patch32")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> image_embeds = outputs.image_embeds
        ```rf  )rA   rB   rC   rD   rL   )ra  r?  r  r@   rB   rC   rD   )r]   r   r   r   rw  rD  rA   s          r(   r   z%CLIPVisionModelWithProjection.forward  st    : 6GT5F5F 6
%%=6
 6

 '44--m<$%,>>(66%00	
 	
r*   r*  )rE   rF   rG   r   rJ   r\  r  ro   r   rT  rL  r   r   r%   rI   r]  r   r   r@   r   r   r   s   @r(   r  r    s    $O!	/ 	<bii <  26).(
''$.(
 #'(
 +,	(

 
(
  (
r*   r  z
    CLIP vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    c                        e Zd ZdZdZdeddf fdZee	 	 d
de	j                  dz  de	j                  dz  dee   defd	              Z xZS )r  r   rW  rc   r   Nc                 ~   t         |   |       |j                  | _        t        j	                  |j
                        }|j                  | _        |j                  dkD  r4t        j                  |j
                  j                  |j                        nt        j                         | _        | j                          y )Nr   )rn   ro   
num_labelsr_  ro  r  ra  r   r   rp   Identityr  r8  r  s      r(   ro   z#CLIPForImageClassification.__init__J  s      ++&33F4H4HI(55 OUN_N_bcNcBIIf**668I8IJikititiv 	
 	r*   labelsr   c                 0    | j                   |fi |}|j                  }t        j                  |ddddddf   d      }| j	                  |      }d}|| j                  ||| j                        }t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr   r   )rR   r   rC   rD   )
ra  rB   r%   r  r  loss_functionrc   r   rC   rD   )r]   r   r  r   outputssequence_outputr   rR   s           r(   r   z"CLIPForImageClassification.forwardY  s     /@d.?.?/
/

 "33**_QAX%>AF1%%ffdkkBD$!//))	
 	
r*   r|  )rE   rF   rG   r\  r  r   ro   r   r   r%   r   r   r   r   r   r   r   s   @r(   r  r  @  s     %O!z d   -1&*
llT)
 t#
 +,	

 

  
r*   r  )r  r   rG  r  r_  r  r  )r   )GrH   collections.abcr   dataclassesr   typingr   r%   r    r   r  activationsr	   masking_utilsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_clipr   r   r   
get_loggerrE   loggerr   r)   r0   r=   r@   rN   rQ   rT  rb   r   floatr   r   r   r   r   r(  r2  rG  rV  r_  r  r  r  r  __all__rL   r*   r(   <module>r     s    $ !    & ! / 9 b b F &  J 5 L L 
		H	%
`U\\ `ell `-%,, -5<< -U\\ ell  
	<K 	< 	< 
	<+ 	< 	<  
  
   
FP299 Pf% %^ %II%<<% 
% <<	%
 LL4'% % % '(%*;)BII ;)|bii 1 B G%/ G% G%T-
")) -
`P
- P
f 
/
' /

/
d,
/ ,
^ 
1
) 1

1
h @
# @
 @
F ?
"5 ?
 ?
D =
$7 =
 =
@ 4
!4 4
4
nr*   