
    qi                        d Z ddlZddlmZ ddlmZ ddlmZ ddlZddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z(  e#jR                  e*      Z+dejX                  dejX                  fdZ-dejX                  dejX                  fdZ.ee! G d de                    Z/ G d de
j`                        Z1 G d de
j`                        Z2 G d de
j`                        Z3de2iZ4 G d d e
j`                        Z5 G d! d"e
j`                        Z6 G d# d$e
j`                        Z7 G d% d&e      Z8 G d' d(e
j`                        Z9 G d) d*e
j`                        Z:	 dLd+e
j`                  d,ejX                  d-ejX                  d.ejX                  d/ejX                  dz  d0e;d1e;fd2Z< G d3 d4e
j`                        Z= G d5 d6e
j`                        Z> G d7 d8e      Z? G d9 d:e
j`                        Z@ G d; d<e
j`                        ZAe! G d= d>e             ZB G d? d@e
j`                        ZC G dA dBeB      ZD e!dCD       G dE dFeB             ZE G dG dHeB      ZF G dI dJeB      ZGg dKZHy)MzPyTorch AltCLIP model.    N)Callable)	dataclass)Any   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions'BaseModelOutputWithPoolingAndProjection)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int   )AltCLIPConfigAltCLIPTextConfigAltCLIPVisionConfiglogitsreturnc                     t         j                  j                  | t        j                  t        |       | j                              S )Ndevice)nn
functionalcross_entropytorcharangelenr    )r   s    ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/altclip/modeling_altclip.pycontrastive_lossr(   -   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)r(   t)r*   caption_loss
image_losss      r'   	clip_lossr/   1   s,    #J/L!*,,.1J:%,,r)   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)AltCLIPOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r7   r8   N)getattrto_tuple).0kselfs     r'   	<genexpr>z)AltCLIPOutput.to_tuple.<locals>.<genexpr>W   s=      
  LLDGRYZ^`aRbRkRkRmm
s   -0)tuplekeysr?   s   `r'   r<   zAltCLIPOutput.to_tupleV   s#     
YY[
 
 	
r)   )__name__
__module____qualname____doc__r2   r$   FloatTensor__annotations__r3   r4   r5   r6   r7   r   r8   rA   r   r<    r)   r'   r1   r1   7   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:
%* 
r)   r1   c                        e Zd ZdZ fdZ	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  ded	ej                  fd
Z
ed        Zedd       Z xZS )AltRobertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 T   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j
                  |j                        | _
        t        j                  |j                        | _        | j                  dt!        j"                  |j$                        j'                  d      d       | j                  dt!        j(                  | j*                  j-                         t         j.                        d       |j                  | _        t        j                  |j$                  |j
                  | j0                        | _        y )	N)padding_idxepsposition_idsr   F
persistenttoken_type_idsdtype)super__init__r!   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr$   r%   max_position_embeddingsexpandzerosrQ   sizelongrN   position_embeddingsr?   config	__class__s     r'   rZ   zAltRobertaEmbeddings.__init__a   s4   !||F,=,=v?Q?Q_e_r_rs%'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
 "..#%<<**F,>,>DL\L\$
 r)   N	input_idsrV   rQ   inputs_embedspast_key_values_lengthr   c                    |<|| j                  || j                  |      }n| j                  || j                        }||j                         }n|j                         d d }|\  }}|t	        | d      rT| j
                  j                  |j                  d   d      }	t        j                  |	d|      }	|	j                  ||      }n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j!                  |      }||z   }| j#                  |      }| j%                  |      }|S )NrS   rV   r   r   )dimindexrX   r    )"create_position_ids_from_input_idsrN   &create_position_ids_from_inputs_embedsrk   hasattrrV   ri   shaper$   gatherrj   rl   rQ   r    r_   ra   rm   rb   rf   )r?   rq   rV   rQ   rr   rs   input_shape
batch_size
seq_lengthbuffered_token_type_idsra   
embeddingsrm   s                r'   forwardzAltRobertaEmbeddings.forwardu   sn    $#FFt//1G   $JJ=Z^ZjZjk #..*K',,.s3K!,
J
 !t-.*.*=*=*D*D\EWEWXYEZ\^*_'*/,,7NTU]i*j'!8!?!?
J!W!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D"55
^^J/
\\*-
r)   c                     | j                         dd }|d   }t        j                  |dz   ||z   dz   t        j                  | j                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        NrS   r   rw   r   )rk   r$   r%   rl   r    	unsqueezeri   )rr   rN   r}   sequence_lengthrQ   s        r'   ry   z;AltRobertaEmbeddings.create_position_ids_from_inputs_embeds   sp     $((*3B/%a.||!O_{:Q>ejjYfYmYm
 %%a(//<<r)   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
        are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:

        Returns: torch.Tensor
        r   ru   )neintr$   cumsumtype_asrl   )rq   rN   rs   maskincremental_indicess        r'   rx   z7AltRobertaEmbeddings.create_position_ids_from_input_ids   sW     ||K(,,.$||Da8@@FI__cgg"'')K77r)   )NNNNr   )r   )rD   rE   rF   rG   rZ   r$   
LongTensorrH   r   Tensorr   staticmethodry   rx   __classcell__rp   s   @r'   rL   rL   ^   s    Q
, .2260426&'.##d*. ((4/. &&-	.
 ((4/. !$. 
.` = =" 8 8r)   rL   c            
            e Zd Z fdZ	 	 ddej
                  dej                  dz  dedz  deej
                     fdZ	 xZ
S )	AltRobertaSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ())rY   rZ   r]   num_attention_headsrz   
ValueErrorr   attention_head_sizeall_head_sizer!   Linearquerykeyvaluerd   attention_probs_dropout_probrf   rn   s     r'   rZ   z AltRobertaSelfAttention.__init__   s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EFr)   Nhidden_statesattention_maskoutput_attentionsr   c                 V   |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }t        j                  ||j	                  dd            }	|	t        j                  | j                        z  }	||	|z   }	t        j                  j                  |	d      }
| j                  |
      }
t        j                  |
|      }|j                  dddd      j!                         }|j#                         d d | j$                  fz   }|j                  |      }|r||
f}|S |f}|S )NrS   r      r   r   r   )r{   r   r   view	transposer   r   r$   matmulmathsqrtr!   r"   softmaxrf   permute
contiguousrk   r   )r?   r   r   r   r}   hidden_shapequery_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                 r'   r   zAltRobertaSelfAttention.forward   s    $))#2.CCbC$*B*BCjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<Y5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r)   NFrD   rE   rF   rZ   r$   r   rH   boolrA   r   r   r   s   @r'   r   r      sX    G* 48).	$||$ ))D0$  $;	$
 
u||	$r)   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AltRobertaSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y NrO   )rY   rZ   r!   r   r]   denserb   rc   rd   re   rf   rn   s     r'   rZ   zAltRobertaSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r)   r   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S Nr   rf   rb   r?   r   r   s      r'   r   zAltRobertaSelfOutput.forward
  7    

=1]3}|'CDr)   rD   rE   rF   rZ   r$   r   r   r   r   s   @r'   r   r     1    >U\\  RWR^R^ r)   r   eagerc            
            e Zd Z fdZ	 	 ddej
                  dej                  dz  dedz  deej
                     fdZ	 xZ
S )	AltRobertaAttentionc                 |    t         |           t        |j                     |      | _        t        |      | _        y r   )rY   rZ   "ALT_ROBERTA_SELF_ATTENTION_CLASSES_attn_implementationr?   r   outputrn   s     r'   rZ   zAltRobertaAttention.__init__  s2    6v7R7RSTZ[	*62r)   Nr   r   r   r   c                 j    | j                  |||      }| j                  |d   |      }|f|dd  z   }|S N)r   r   r   r   )r?   r   )r?   r   r   r   self_outputsattention_outputr   s          r'   r   zAltRobertaAttention.forward  sP     yy)/ ! 

  ;;|AF#%QR(88r)   r   r   r   s   @r'   r   r     sW    3 48).	|| ))D0  $;	
 
u||	r)   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AltRobertaIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )rY   rZ   r!   r   r]   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnrn   s     r'   rZ   zAltRobertaIntermediate.__init__.  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r)   r   r   c                 J    | j                  |      }| j                  |      }|S r   )r   r   r?   r   s     r'   r   zAltRobertaIntermediate.forward6  s&    

=100?r)   r   r   s   @r'   r   r   -  s#    9U\\ ell r)   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AltRobertaOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )rY   rZ   r!   r   r   r]   r   rb   rc   rd   re   rf   rn   s     r'   rZ   zAltRobertaOutput.__init__>  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r)   r   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r'   r   zAltRobertaOutput.forwardD  r   r)   r   r   s   @r'   r   r   =  r   r)   r   c                        e Zd Z fdZ	 	 d
dej
                  dej                  dz  dedz  dee	   de
ej
                     f
dZd	 Z xZS )AltRobertaLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y )Nr   )
rY   rZ   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   rn   s     r'   rZ   zAltRobertaLayer.__init__M  sI    '-'E'E$,V426:&v.r)   Nr   r   r   kwargsr   c                      | j                   |f||d|}|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S r   )r   r   feed_forward_chunkr   r   )	r?   r   r   r   r   self_attention_outputsr   r   layer_outputs	            r'   r   zAltRobertaLayer.forwardU  s     "0"
)/"
 	"
 2!4(,0##T%A%A4CSCSUe
  /G+r)   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )r?   r   intermediate_outputr   s       r'   r   z"AltRobertaLayer.feed_forward_chunkl  s,    "//0@A{{#68HIr)   r   )rD   rE   rF   rZ   r$   r   rH   r   r   r   rA   r   r   r   r   s   @r'   r   r   L  sl    / 48).	|| ))D0  $;	
 +, 
u||	.r)   r   c                        e Zd Z fdZe	 	 	 	 ddej                  dej                  dz  dedz  dedz  dedz  de	e
   d	eej                     ez  fd
       Z xZS )AltRobertaEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
rY   rZ   ro   r!   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r?   ro   irp   s      r'   rZ   zAltRobertaEncoder.__init__t  sN    ]]U6KcKcEd#eOF$;#ef
&+# $f   A#Nr   r   r   output_hidden_statesreturn_dictr   r   c                     |rdnd }|rdnd }t        | j                        D ])  \  }	}
|r||fz   } |
|||fi |}|d   }|s!||d   fz   }+ |r||fz   }t        |||      S )NrJ   r   r   last_hidden_stater   
attentions)	enumerater   r
   )r?   r   r   r   r   r   r   all_hidden_statesall_self_attentionsr   layer_modulelayer_outputss               r'   r   zAltRobertaEncoder.forwardz  s     #7BD$5b4(4 	POA|#$58H$H!(! 	M *!,M &9]1=M<O&O#	P   1]4D D++*
 	
r)   )NFFT)rD   rE   rF   rZ   r   r$   r   rH   r   r   r   rA   r
   r   r   r   s   @r'   r   r   s  s    ,  48).,1#'"
||"
 ))D0"
  $;	"

 #Tk"
 D["
 +,"
 
u||		."
 "
r)   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AltRobertaPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )rY   rZ   r!   r   r]   r   Tanh
activationrn   s     r'   rZ   zAltRobertaPooler.__init__  s9    YYv1163E3EF
'')r)   r   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r  )r?   r   first_token_tensorpooled_outputs       r'   r   zAltRobertaPooler.forward  s6     +1a40

#566r)   r   r   s   @r'   r  r    s#    $
U\\ ell r)   r  moduler   r   r   r   scalingrf   c                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrS   r   )ru   rX   )ptrainingr   r   )r$   r   r   r!   r"   r   float32torX   rf   r  r   )
r  r   r   r   r   r  rf   r   attn_weightsattn_outputs
             r'   eager_attention_forwardr    s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r)   c                        e Zd ZdZ fdZ	 	 d
dej                  dej                  dz  dedz  dee	   de
ej                  ej                  dz  f   f
d	Z xZS )AltCLIPAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).      F)rY   rZ   ro   r]   	embed_dimr   	num_headshead_dimr   scaleattention_dropoutrf   	is_causalr!   r   k_projv_projq_projout_projrn   s     r'   rZ   zAltCLIPAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar)   Nr   r   r   r   r   c                    |j                   \  }}}| j                  |      }| j                  |      }	| j                  |      }
|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	|
j	                  ||| j
                  | j                        j                  dd      }
t        j                  | j                  j                  t              } || ||	|
|f| j                  | j                  sdn| j                  d|\  }}|j!                  |||      j#                         }| j%                  |      }|sd}||fS )z#Input shape: Batch x Time x Channelr   r           )r  rf   N)r{   r#  r!  r"  r   r  r  r   r   get_interfacero   r   r  r  r  rf   reshaper   r$  )r?   r   r   r   r   r~   r   r  queriesrB   valuesattention_interfacer  r  s                 r'   r   zAltCLIPAttention.forward  si    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc(?(M(MKK,,.E)
 %8	%
 JJ#}}C$,,	%
 	%
!\ "))*j)LWWYmmK0 LL((r)   r   )rD   rE   rF   rG   rZ   r$   r   r   r   r   rA   r   r   r   s   @r'   r  r    sw    GB. /3).	&)||&) t+&)  $;	&)
 +,&) 
u||U\\D00	1&)r)   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
AltCLIPMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )rY   rZ   ro   r   r   activation_fnr!   r   r]   r   fc1fc2rn   s     r'   rZ   zAltCLIPMLP.__init__
  sd    #F$5$5699V//1I1IJ99V55v7I7IJr)   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r0  r/  r1  r   s     r'   r   zAltCLIPMLP.forward  s4    /**=9/r)   r   r   s   @r'   r-  r-  	  s$    KU\\ ell r)   r-  c                        e Zd Zdef fdZ	 d
dej                  dej                  dedz  dee	   de
ej                     f
d	Z xZS )AltCLIPEncoderLayerro   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y r   )rY   rZ   r]   r  r  	self_attnr!   rb   rc   layer_norm1r-  mlplayer_norm2rn   s     r'   rZ   zAltCLIPEncoderLayer.__init__  sm    ++)&1<<F<Q<QRf%<<F<Q<QRr)   r   r   r   Nr   r   c                     |}| j                  |      } | j                  d|||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r   r   rJ   )r7  r6  r9  r8  )r?   r   r   r   r   residualr  r   s           r'   r   zAltCLIPEncoderLayer.forward!  s    " !((7&4dnn '
')/'
 	'
#| !=0 ((7/ =0 "&Gr)   F)rD   rE   rF   r   rZ   r$   r   r   r   r   rA   rH   r   r   r   s   @r'   r4  r4    sh    S} S */	&||& &  $;	&
 +,& 
u  	!&r)   r4  c                        e Zd ZdZdef fdZe	 	 	 	 ddej                  dz  de	dz  de	dz  de	dz  d	e
e   d
eez  fd       Z xZS )AltCLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`AltCLIPEncoderLayer`].

    Args:
        config: AltCLIPConfig
    ro   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
rY   rZ   ro   r!   r   r   r   r4  layersr   )r?   ro   _rp   s      r'   rZ   zAltCLIPEncoder.__init__S  sP    mm%PVPhPhJi$jQ%8%@$jk&+# %kr   Nr   r   r   r   r   r   c                 j   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	t	        | j
                        D ]*  \  }
}|r||	fz   } ||	|fd|i|}|d   }	|s"||d   fz   }, |r||	fz   }t        |	||      S )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrJ   r   r   r   r   )ro   r   r   use_return_dictr   r@  r
   )r?   rr   r   r   r   r   r   encoder_statesall_attentionsr   idxencoder_layerr  s                r'   r   zAltCLIPEncoder.forwardY  s   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8 	FC#!/=2B!B) #4 	M *!,M !/=3C2E!E	F  +}.>>N+>Vd
 	
r)   )NNNN)rD   rE   rF   rG   r   rZ   r   r$   r   r   r   r   rA   r
   r   r   r   s   @r'   r>  r>  J  s    ,} ,  /3)-,0#'=
 t+=
  $;	=

 #Tk=
 D[=
 +,=
 
	 =
 =
r)   r>  c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )AltCLIPVisionEmbeddingsro   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebiasr   r   rQ   rR   rT   )rY   rZ   ro   r]   r  
image_size
patch_sizer!   	Parameterr$   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positionsr[   position_embeddingrg   r%   ri   rn   s     r'   rZ   z AltCLIPVisionEmbeddings.__init__  s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr)   r   heightwidthr   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   NrS   g      ?r   r   bicubicF)rk   modealign_cornersr   )r{   rZ  weightr   r$   jit
is_tracingrQ   rQ  r   r(  r   r!   r"   interpolater   cat)r?   r   r[  r\  rX  rZ  rY  class_pos_embedpatch_pos_embedru   
new_height	new_widthsqrt_num_positionss                r'   interpolate_pos_encodingz0AltCLIPVisionEmbeddings.interpolate_pos_encoding  sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr)   pixel_valuesc                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model (r  rW   r   r   rS   r   )r{   rP  r   rW  ra  rX   r  flattenr   rT  ri   r$   re  rk  rZ  rQ   )r?   rl  rk  r~   rA  r[  r\  target_dtypepatch_embedsclass_embedsr   s              r'   r   zAltCLIPVisionEmbeddings.forward  s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr)   r<  )rD   rE   rF   r   rZ   r$   r   r   rk  rH   r   r   r   s   @r'   rI  rI    se    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r)   rI  c                   V    e Zd ZU eed<   dZdZdZg Z e	j                         d        Zy)AltCLIPPreTrainedModelro   altclip)imagetextTc                    | j                   j                  }t        |t              r| j                   j                  }t	        j
                  |j                  d|j                  dz  |z         t	        j
                  |j                  j                  |j                   j                  |z         t	        j
                  |j                  j                  |j                   j                  |z         t	        j                  |j                  t        j                  |j                         j#                  d             yt        |t$              r| j                   j                  }|j                  dz  d|j                   j&                  z  dz  z  |z  }|j                  dz  |z  }t	        j
                  |j(                  j                  |       t	        j
                  |j*                  j                  |       t	        j
                  |j,                  j                  |       t	        j
                  |j.                  j                  |       yt        |t0              r| j                   j                  }|j                   j2                  dz  d|j                   j&                  z  dz  z  |z  }d|j                   j2                  z  dz  |z  }t	        j
                  |j4                  j                  |       t	        j
                  |j6                  j                  |       yt        |t8              rt	        j
                  |j:                  j                  |j<                  dz  | j                   j                  z         t	        j
                  |j>                  j                  |j@                  dz  | j                   j                  z         yt        |tB        jD                        r?t	        jF                  |jH                         t	        jJ                  |j                         yt        |tB        jL                        rct	        j
                  |j                  d| j                   j                         |jH                   t	        jF                  |jH                         yyt        |tB        jN                        rt	        j
                  |j                  d| j                   j                         |jP                  EtS        |j                  dd	      s-t	        jF                  |j                  |jP                            yyyt        |tT              ryt	        j                  |j                  t        j                  |j                  jV                  d
         j#                  d             t	        jF                  |jX                         yy)zInitialize the weightsr&  r  )meanstd)rz  rR   r   N_is_hf_initializedFrS   )-ro   initializer_factorr   rI  initnormal_rT  r  rW  ra  initializer_rangerZ  copy_rQ   r$   r%   rY  ri   r  r   r#  r!  r"  r$  r-  r]   r0  r1  AltCLIPModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr!   rb   zeros_rO  ones_r   r[   rN   r;   rL   r{   rV   )r?   r  factorin_proj_stdout_proj_stdfc_stds         r'   _init_weightsz$AltCLIPPreTrainedModel._init_weights  s    //f56[[33FLL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9M9M,N,U,UV],^_ 01[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B
+[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**<-LL&&--))4/$++2P2PP LL((//++T1DKK4R4RR -KK$JJv}}%		*LLSdkk6T6TU{{&FKK( '-LLSdkk6T6TU!!-gfmmMach6iFMM&*<*<=> 7j- 45JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 6r)   N)rD   rE   rF   r   rI   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_moduler$   no_gradr  rJ   r)   r'   rt  rt    s:    !(&*#U]]_./ ./r)   rt  c                        e Zd Zdef fdZee	 	 	 	 	 ddej                  dz  de	dz  de	dz  de	dz  de	dz  d	e
ez  fd
              Z xZS )AltCLIPVisionTransformerro   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r   )rY   rZ   ro   r]   rI  r   r!   rb   rc   pre_layrnormr>  encoderpost_layernorm)r?   ro   r  rp   s      r'   rZ   z!AltCLIPVisionTransformer.__init__)  sj    &&	1&9LL8M8MN%f- ll9&:O:OPr)   Nrl  r   r   r   rk  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  ||      }| j                  |      }| j                  |||d      }|d   }|d d dd d f   }	| j                  |	      }	t        ||	|j                  |j                        S )Nz You have to specify pixel_values)rk  T)rr   r   r   r   r   r   pooler_outputr   r   )ro   r   r   rC  r   r   r  r  r  r   r   r   )
r?   rl  r   r   r   rk  r   encoder_outputsr   r  s
             r'   r   z AltCLIPVisionTransformer.forward3  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@Ogh))-8,,'/!5	 ' 
 ,A.)!Q'2++M:)/')77&11	
 	
r)   )NNNNF)rD   rE   rF   r   rZ   r   r   r$   rH   r   rA   r   r   r   r   s   @r'   r  r  (  s    Q2 Q  26)-,0#'05$
''$.$
  $;$
 #Tk	$

 D[$
 #'+$
 
+	+$
  $
r)   r  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
e	 	 	 	 	 ddej                  dz  dedz  d	edz  d
ededz  dee   deez  fd       Z xZS )AltCLIPVisionModelro   rl  )rv  c                 d    t         |   |       t        |      | _        | j	                          y r   )rY   rZ   r  vision_model	post_initrn   s     r'   rZ   zAltCLIPVisionModel.__init__a  s'     4V<r)   r   c                 B    | j                   j                  j                  S r   )r  r   rW  rC   s    r'   get_input_embeddingsz'AltCLIPVisionModel.get_input_embeddingsg  s      ++;;;r)   Nr   r   rk  r   r   c                 b    ||n| j                   j                  }| j                  |||||      S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, AltCLIPVisionModel

        >>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```rl  r   r   rk  r   )ro   rC  r  )r?   rl  r   r   rk  r   r   s          r'   r   zAltCLIPVisionModel.forwardj  sB    @ &1%<k$++B]B]  %/!5%=# ! 
 	
r)   )NNNFN)rD   rE   rF   r   rI   main_input_namer  rZ   r!   Moduler  r   r$   rH   r   r   r   rA   r   r   r   r   s   @r'   r  r  \  s    $O!2 <bii <  26)-,0).#''
''$.'
  $;'
 #Tk	'

 #''
 D['
 +,'
 
+	+'
 '
r)   r  aE  
    The model behaves as an encoder following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    )custom_introc                   4    e Zd ZU eed<   d fd	Zd Zd Ze	 	 	 	 	 	 	 	 dde	j                  dz  de	j                  dz  de	j                  dz  d	e	j                  dz  d
e	j                  dz  dedz  dedz  dedz  dee	j                     ez  fd       Z xZS )AltRobertaModelro   c                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rY   rZ   ro   rL   r   r   r  r  poolerr  )r?   ro   add_pooling_layerrp   s      r'   rZ   zAltRobertaModel.__init__  sN    
 	 .v6(02C&v. 	r)   c                 .    | j                   j                  S r   r   r_   rC   s    r'   r  z$AltRobertaModel.get_input_embeddings  s    ...r)   c                 &    || j                   _        y r   r  r?   r   s     r'   set_input_embeddingsz$AltRobertaModel.set_input_embeddings  s    */'r)   Nrq   r   rV   rQ   rr   r   r   r   r   c	                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }
n!||j                         d d }
nt	        d      |
\  }}||j                  n|j                  }|t        j                  ||f|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  ||      }|}n&t        j                  |
t        j                  |      }| j!                  ||
      }| j                  ||||      }| j#                  ||||d	      }|d
   }| j$                  | j%                  |      nd }t'        |||j(                  |j*                        S )NzDYou cannot specify both input_ids and inputs_embeds at the same timerS   z5You have to specify either input_ids or inputs_embedsr   rV   rw   )rq   rQ   rV   rr   T)r   r   r   r   r   r  )ro   r   r   rC  r   %warn_if_padding_and_no_attention_maskrk   r    r$   onesrz   r   rV   ri   rj   rl   get_extended_attention_maskr  r  r   r   r   )r?   rq   r   rV   rQ   rr   r   r   r   r   r}   r~   r   r    r    buffered_token_type_ids_expandedextended_attention_maskembedding_outputr  sequence_outputr  s                        r'   r   zAltRobertaModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m??%)'	 + 
 ,,2/!5 ' 
 *!,8<8OO4UY)-')77&11	
 	
r)   )TNNNNNNNN)rD   rE   rF   r   rI   rZ   r  r  r   r$   r   r   rA   r   r   r   r   s   @r'   r  r    s      /0  *..2.2,0-1)-,0#'C
<<$&C
 t+C
 t+	C

 llT)C
 ||d*C
  $;C
 #TkC
 D[C
 
u||	K	KC
 C
r)   r  c                       e Zd ZU eed<   dZ fdZdej                  fdZ	dej                  ddfdZdd	edz  dej                  f fd
Zee	 	 	 	 	 	 	 	 ddej"                  dz  dej"                  dz  dej"                  dz  dej"                  dz  dej"                  dz  dedz  dedz  dedz  dee   deez  fd              Z xZS )AltCLIPTextModelro   )rw  c                 &   t         |   |       t        |d      | _        t	        j
                  |j                  |j                        | _        t	        j                  |j                  |j                        | _        | j                          y )NF)r  rO   )rY   rZ   r  robertar!   r   r]   project_dimtransformationrb   rc   pre_LNr  rn   s     r'   rZ   zAltCLIPTextModel.__init__  se     &vG ii(:(:F<N<NOll6#5#56;P;PQr)   r   c                 B    | j                   j                  j                  S r   r  r   r_   rC   s    r'   r  z%AltCLIPTextModel.get_input_embeddings  s    ||&&666r)   r   Nc                 :    || j                   j                  _        y r   r  r  s     r'   r  z%AltCLIPTextModel.set_input_embeddings  s    27/r)   new_num_tokensc                 "    t         |   |      S r   )rY   resize_token_embeddings)r?   r  rp   s     r'   r  z(AltCLIPTextModel.resize_token_embeddings  s    w.~>>r)   rq   r   rV   rQ   rr   r   r   r   r   c	           
         ||n| j                   j                  }| j                  |||||||d      }
|
d   }| j                  |      }| j	                  |      }|dddf   }t        |||
j                  |
j                        S )a+  
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AltCLIPTextModel

        >>> model = AltCLIPTextModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> texts = ["it's a cat", "it's a dog"]

        >>> inputs = processor(text=texts, padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```NT)rq   r   rV   rQ   rr   r   r   r   r   r  )ro   rC  r  r  r  r   r   r   )r?   rq   r   rV   rQ   rr   r   r   r   r   r   r  projection_stater  s                 r'   r   zAltCLIPTextModel.forward  s    @ &1%<k$++B]B],,))%'/!5  	
 "!* ++o6  ..?(A.6.'!//))	
 	
r)   r   r  )rD   rE   rF   r   rI   r  rZ   r!   r  r  r[   r  r   r  r   r   r$   r   r   r   r   rA   r   r   r   r   s   @r'   r  r     s3    7bii 78",, 84 8?cDj ?BLL ?  *..2.2,0-1)-#',0:
<<$&:
 t+:
 t+	:

 llT):
 ||d*:
  $;:
 D[:
 #Tk:
 +,:
 
8	8:
  :
r)   r  c                   (    e Zd ZU eed<   def fdZee	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  de
e   d	eez  fd
              Zee	 ddej                  dede
e   d	eez  fd              Ze	 	 	 	 	 	 	 	 	 	 ddej$                  dz  dej                  dz  dej                  dz  dej$                  dz  dej                  dz  dedz  dedz  dedz  dededz  de
e   d	eez  fd       Z xZS )r  ro   c                 r   t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  |_	        |j                  | _
        |j                  | _        |j                  | _        t        |      | _        t#        |      | _        t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        t'        j.                  t1        j2                  | j4                  j6                              | _        | j;                          y )NzRconfig.vision_config is expected to be of type AltCLIPVisionConfig but is of type .zNconfig.text_config is expected to be of type AltCLIPTextConfig but is of type F)rO  )rY   rZ   r   vision_configr   	TypeErrortypetext_configr   r   projection_dimr  r  r]   r  r  
text_modelr  r  r!   r   r  r  rR  r$   tensorro   logit_scale_init_valuelogit_scaler  )r?   ro   r  r  rp   s       r'   rZ   zAltCLIPModel.__init__V  se    &..0CD--./q2  &,,.?@++,-Q0 
 ((,,-3-H-H*$33)55 - 9 9*;74]C!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY 	r)   Nrq   r   rQ   rV   r   r   c           	      z     | j                   d||||dd|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)rq   r   rQ   rV   r   rJ   )r  r  r  )r?   rq   r   rQ   rV   r   text_outputsr  s           r'   get_text_featureszAltCLIPModel.get_text_featuresw  s]    0 AP A
)%)A
 A
 %22%)%9%9-%H"r)   rl  rk  c                 v     | j                   d||dd|}|j                  }| j                  |      |_        |S )ao  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AltCLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```T)rl  rk  r   rJ   )r  r  r  )r?   rl  rk  r   vision_outputsr  s         r'   get_image_featureszAltCLIPModel.get_image_features  sT    4 +** 
%%=
 	
 '44'+'='=m'L$r)   return_lossr   r   r   c           	         ||n| j                   j                  }||n| j                   j                  }|
|
n| j                   j                  }
| j	                  |||||||
      }| j                  ||||	|
      }|d   }| j                  |      }|d   }| j                  |      }||j                  ddd      z  }||j                  ddd      z  }| j                  j                         }t        j                  ||j                               |z  }|j                  }d}|rt        |      }|
s||||||f}||f|z   S |S t!        |||||||	      S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```N)rq   r   rV   rQ   r   r   r   r  r   r   rS   T)r  ru   keepdim)r2   r3   r4   r5   r6   r7   r8   )ro   r   r   rC  r  r  r  r  normr  expr$   r   r,   Tr/   r1   )r?   rq   rl  r   rQ   rV   r  r   r   rk  r   r   r  r  r6   r5   r  r4   r3   r2   r   s                        r'   r   zAltCLIPModel.forward  s   P 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]))%/!5# ' 
 **%/!5%=# + 
 &a(--l;"1o**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO &&**,,,{LNN4DES*,,_-D&lT`bpqF)-)9TGf$EvE-+#%* .
 	
r)   )NNNr<  )
NNNNNNNNFN)rD   rE   rF   r   rI   rZ   r   r   r$   r   r   r   rA   r   r  rH   r   r  r   r1   r   r   r   s   @r'   r  r  S  s   } B  /3,0.2!<<! t+! llT)	!
 t+! +,! 
+	+!  !F  */!''! #'! +,	!
 
+	+!  !F  .215.204.2#')-,0).#'^
##d*^
 ''$.^
 t+	^

 &&-^
 t+^
 D[^
  $;^
 #Tk^
 #'^
 D[^
 +,^
 
	^
 ^
r)   r  )rt  r  r  r  )r&  )IrG   r   collections.abcr   dataclassesr   typingr   r$   torch.nnr!    r   r}  activationsr   modeling_layersr	   modeling_outputsr
   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   configuration_altclipr   r   r   
get_loggerrD   loggerr   r(   r/   r1   r  rL   r   r   r   r   r   r   r   r   r  floatr  r  r-  r4  r>  rI  rt  r  r  r  r  r  __all__rJ   r)   r'   <module>r     s     $ !    & ! 9  G & 6 j j X X 
		H	%
`U\\ `ell `-%,, -5<< -  
K  
   
Hg8299 g8T7bii 7v299  $& "
")) .RYY  ryy #0 #N*
		 *
\ryy . %II%<<% 
% <<	%
 LL4'% % %.=)ryy =)B /4 /dM
RYY M
bPbii Pf 6/_ 6/ 6/r1
ryy 1
h6
/ 6
r _
, _
_
DP
- P
fM
) M
` _r)   