"""PyTorch ViLT model."""

import collections.abc
import math
from dataclasses import dataclass

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from ... import initialization as init
from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    MaskedLMOutput,
    ModelOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_vilt import ViltConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Class for outputs of [`ViltForImagesAndTextClassification`].
    """
)
class ViltForImagesAndTextClassificationOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Classification (or regression if config.num_labels==1) loss.
    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
        Classification (or regression if config.num_labels==1) scores (before SoftMax).
    hidden_states (`list[tuple(torch.FloatTensor)]`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
        List of tuples of `torch.FloatTensor` (one for each image-text pair, each tuple containing the output of
        the embeddings + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
    """

    loss: torch.FloatTensor | None = None
    logits: torch.FloatTensor | None = None
    hidden_states: list[tuple[torch.FloatTensor]] | None = None
    attentions: list[tuple[torch.FloatTensor]] | None = None


class ViltEmbeddings(nn.Module):
    """
    Construct the text and patch embeddings.

    Text embeddings are equivalent to BERT embeddings.

    Patch embeddings are equivalent to ViT embeddings.
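
    The two streams are concatenated along the sequence dimension, so the encoder sees one joint
    sequence. A rough shape sketch (illustrative bookkeeping only, not a public API guarantee):

    ```python
    >>> # text tokens (incl. [CLS]/[SEP]) + image [CLS] + sampled patches
    >>> text_len, kept_patches = 11, 144
    >>> text_len + 1 + kept_patches  # joint sequence length seen by the encoder
    156
    ```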
    """

    def __init__(self, config):
        super().__init__()
        # text embeddings
        self.text_embeddings = TextEmbeddings(config)
        # patch embeddings
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.patch_embeddings = ViltPatchEmbeddings(config)
        num_patches = self.patch_embeddings.num_patches
        self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        # modality type (text/patch) embeddings
        self.token_type_embeddings = nn.Embedding(config.modality_type_vocab_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.config = config

    def visual_embed(self, pixel_values, pixel_mask, max_image_length=200):
        _, _, ph, pw = self.patch_embeddings.projection.weight.shape

        x = self.patch_embeddings(pixel_values)
        x_mask = pixel_mask[:, None, :, :].float()
        x_mask = nn.functional.interpolate(x_mask, size=(x.shape[2], x.shape[3])).long()
        x_h = x_mask[:, 0].sum(dim=1)[:, 0]
        x_w = x_mask[:, 0].sum(dim=2)[:, 0]

        batch_size, num_channels, height, width = x.shape
        patch_dim = self.config.image_size // self.config.patch_size
        spatial_pos = self.position_embeddings[:, 1:, :].transpose(1, 2).view(1, num_channels, patch_dim, patch_dim)
        # interpolate the position embeddings to each image's (unpadded) resolution, then pad back
        pos_embed = torch.cat(
            [
                nn.functional.pad(
                    nn.functional.interpolate(
                        spatial_pos,
                        size=(h, w),
                        mode="bilinear",
                        align_corners=True,
                    ),
                    (0, width - w, 0, height - h),
                )
                for h, w in zip(x_h, x_w)
            ],
            dim=0,
        )

        pos_embed = pos_embed.flatten(2).transpose(1, 2)
        x = x.flatten(2).transpose(1, 2)
        # set `device` here, otherwise `patch_index` will always be on CPU and fail near the end for torch>=1.13
        patch_index = torch.stack(
            torch.meshgrid(torch.arange(x_mask.shape[-2]), torch.arange(x_mask.shape[-1]), indexing="ij"), dim=-1
        ).to(device=x_mask.device)
        patch_index = patch_index[None, None, :, :, :]
        patch_index = patch_index.expand(x_mask.shape[0], x_mask.shape[1], -1, -1, -1)
        patch_index = patch_index.flatten(1, 3)
        x_mask = x_mask.flatten(1)

        if max_image_length < 0 or max_image_length is None or not isinstance(max_image_length, int):
            # no budget given: keep every valid (non-padded) patch of the largest image in the batch
            effective_resolution = x_h * x_w
            max_image_length = effective_resolution.max()
        else:
            effective_resolution = x_h * x_w
            max_image_length = min(effective_resolution.max(), max_image_length)

        valid_idx = x_mask.nonzero(as_tuple=False)
        non_valid_idx = (1 - x_mask).nonzero(as_tuple=False)
        unique_rows = valid_idx[:, 0].unique()
        valid_row_idx = [valid_idx[valid_idx[:, 0] == u] for u in unique_rows]
        non_valid_row_idx = [non_valid_idx[non_valid_idx[:, 0] == u] for u in unique_rows]

        valid_nums = [v.size(0) for v in valid_row_idx]
        non_valid_nums = [v.size(0) for v in non_valid_row_idx]
        pad_nums = [max_image_length - v for v in valid_nums]

        select = []
        for i, (v, nv, p) in enumerate(zip(valid_nums, non_valid_nums, pad_nums)):
            if p <= 0:
                # image has more valid patches than the budget: subsample them
                valid_choice = torch.multinomial(torch.ones(v).float(), max_image_length)
                select.append(valid_row_idx[i][valid_choice])
            else:
                # image is smaller than the budget: pad with (repeated) masked-out positions
                pad_choice = torch.multinomial(torch.ones(nv).float(), p, replacement=True)
                select.append(torch.cat([valid_row_idx[i], non_valid_row_idx[i][pad_choice]], dim=0))

        select = torch.cat(select, dim=0)
        x = x[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)
        x_mask = x_mask[select[:, 0], select[:, 1]].view(batch_size, -1)
        patch_index = patch_index[select[:, 0], select[:, 1]].view(batch_size, -1, 2)
        pos_embed = pos_embed[select[:, 0], select[:, 1]].view(batch_size, -1, num_channels)

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        pos_embed = torch.cat(
            (self.position_embeddings[:, 0, :][:, None, :].expand(batch_size, -1, -1), pos_embed), dim=1
        )
        x = x + pos_embed
        x = self.dropout(x)

        x_mask = torch.cat([torch.ones(x_mask.shape[0], 1).to(x_mask), x_mask], dim=1)

        return x, x_mask, (patch_index, (height, width))

    def forward(
        self,
        input_ids,
        attention_mask,
        token_type_ids,
        pixel_values,
        pixel_mask,
        inputs_embeds,
        image_embeds,
        image_token_type_idx=1,
    ):
        # PART 1: text embeddings
        text_embeds = self.text_embeddings(
            input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
        )

        # PART 2: patch embeddings (with interpolated position encodings)
        if image_embeds is None:
            image_embeds, image_masks, patch_index = self.visual_embed(
                pixel_values, pixel_mask, max_image_length=self.config.max_image_length
            )
        else:
            image_masks = pixel_mask.flatten(1)

        # PART 3: add modality type embeddings
        # 0 indicates text, 1 indicates image, 2 is optionally used when a second image is provided (NLVR2)
        if image_token_type_idx is None:
            image_token_type_idx = 1
        text_embeds = text_embeds + self.token_type_embeddings(
            torch.zeros_like(attention_mask, dtype=torch.long, device=text_embeds.device)
        )
        image_embeds = image_embeds + self.token_type_embeddings(
            torch.full_like(image_masks, image_token_type_idx, dtype=torch.long, device=text_embeds.device)
        )

        # PART 4: concatenate text and image along the sequence dimension
        embeddings = torch.cat([text_embeds, image_embeds], dim=1)
        masks = torch.cat([attention_mask, image_masks], dim=1)

        return embeddings, masks


class TextEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # position_ids (1, len position emb) is contiguous in memory
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        # default token_type_ids to the registered all-zeros buffer, which helps users
        # trace the model without passing token_type_ids explicitly
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        position_embeddings = self.position_embeddings(position_ids)
        embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class ViltPatchEmbeddings(nn.Module):
    """
    Image to Patch Embedding.
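
    A minimal sketch of the projection geometry (illustrative; the released checkpoints use
    `image_size=384`, `patch_size=32` and `hidden_size=768`):

    ```python
    >>> import torch
    >>> from torch import nn

    >>> projection = nn.Conv2d(3, 768, kernel_size=32, stride=32)
    >>> projection(torch.randn(1, 3, 384, 384)).shape  # 12 x 12 = 144 patches
    torch.Size([1, 768, 12, 12])
    ```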
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values):
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )
        target_dtype = self.projection.weight.dtype
        x = self.projection(pixel_values.to(dtype=target_dtype))
        return x


class ViltSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states, attention_mask=None, output_attentions=False):
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = (
            self.query(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        key_layer = (
            self.key(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )
        value_layer = (
            self.value(hidden_states)
            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
        )

        # take the dot product between "query" and "key" to get the raw attention scores
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        if attention_mask is not None:
            # apply the attention mask (precomputed for all layers in the ViltModel forward)
            attention_scores = attention_scores + attention_mask

        # normalize the attention scores to probabilities
        attention_probs = nn.Softmax(dim=-1)(attention_scores)

        # this is actually dropping out entire tokens to attend to, which might seem a bit
        # unusual, but is taken from the original Transformer paper
        attention_probs = self.dropout(attention_probs)

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class ViltSelfOutput(nn.Module):
    """
    The residual connection is defined in ViltLayer instead of here (as is the case with other models), due to the
    layernorm applied before each block.
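
    (The residual addition itself happens in [`ViltLayer.forward`]; this module only applies the
    dense projection and dropout to the attention output.)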
    r>   c                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y N)	r,   r-   r   r   r2   denser;   r<   r=   r   s     r'   r-   zViltSelfOutput.__init__w  sB    YYv1163E3EF
zz&"<"<=r&   r   input_tensorreturnc                 J    | j                  |      }| j                  |      }|S r   r   r=   r?   r   r   s      r'   r   zViltSelfOutput.forward|  s$    

=1]3r&   )
r   r   r   r   r   r-   r    Tensorr   r   r   s   @r'   r   r   q  s=    
>z >
U\\  RWR^R^ r&   r   c                   &     e Zd Z fdZddZ xZS )ViltAttentionc                 b    t         |           t        |      | _        t	        |      | _        y r   )r,   r-   r   	attentionr   outputr   s     r'   r-   zViltAttention.__init__  s&    *62$V,r&   c                 h    | j                  |||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )r   r   )r?   r   r   r   self_outputsattention_outputr   s          r'   r   zViltAttention.forward  sC    ~~m^EVW;;|AF#%QR(88r&   r   r   r   s   @r'   r   r     s    -
r&   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )ViltIntermediater>   c                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r,   r-   r   r   r2   intermediate_sizer   rf   
hidden_actstrr   intermediate_act_fnr   s     r'   r-   zViltIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r&   r   r   c                 J    | j                  |      }| j                  |      }|S r   )r   r  r?   r   s     r'   r   zViltIntermediate.forward  s&    

=100?r&   	r   r   r   r   r-   r    r   r   r   r   s   @r'   r  r    s*    9z 9U\\ ell r&   r  c                   t     e Zd Zdef fdZdej                  dej                  dej                  fdZ xZS )
ViltOutputr>   c                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r   )
r,   r-   r   r   r  r2   r   r;   r<   r=   r   s     r'   r-   zViltOutput.__init__  sB    YYv779K9KL
zz&"<"<=r&   r   r   r   c                 T    | j                  |      }| j                  |      }||z   }|S r   r   r   s      r'   r   zViltOutput.forward  s.    

=1]3%4r&   r  r   s   @r'   r  r    s8    >z >
U\\  RWR^R^ r&   r  c                   *     e Zd ZdZ fdZddZ xZS )	ViltLayerz?This corresponds to the Block class in the timm implementation.c                 r   t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y )Nr   r   )r,   r-   chunk_size_feed_forwardseq_len_dimr   r   r  intermediater  r   r   r   r2   r   layernorm_beforelayernorm_afterr   s     r'   r-   zViltLayer.__init__  s    '-'E'E$&v.,V4 ( "V-?-?VEZEZ [!||F,>,>FDYDYZr&   c                    | j                  | j                  |      ||      }|d   }|dd  }||j                  |j                        z   }| j	                  |      }| j                  |      }| j                  ||      }|f|z   }|S )N)r   r   r   )r   r  rd   rN   r  r  r   )r?   r   r   r   self_attention_outputsr  r   layer_outputs           r'   r   zViltLayer.forward  s    !%!!-0/ "0 "

 2!4(, )=+;+;<L<S<S+TT ++M:((6 {{<?/G+r&   r   r   r   s   @r'   r  r    s    I[r&   r  c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )ViltEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
r,   r-   r>   r   
ModuleListrangenum_hidden_layersr  layergradient_checkpointing)r?   r>   rs   r@   s      r'   r-   zViltEncoder.__init__  sN    ]]uVE]E]?^#_!If$5#_`
&+# $`s   A#c                     |rdnd }|rdnd }t        | j                        D ](  \  }}	|r||fz   } |	|||      }
|
d   }|s ||
d   fz   }* |r||fz   }|st        d |||fD              S t        |||      S )Nr%   r   r   c              3   &   K   | ]	  }||  y wr   r%   ).0r   s     r'   	<genexpr>z&ViltEncoder.forward.<locals>.<genexpr>  s     mq_`_lms   )last_hidden_stater   r   )rl   r!  r$   r
   )r?   r   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsr   layer_modulelayer_outputss              r'   r   zViltEncoder.forward  s     #7BD$5b4(4 		POA|#$58H$H!(HYZM)!,M &9]1=M<O&O#		P   1]4D Dm]4EGZ$[mmm++*
 	
r&   )NFFTr   r   s   @r'   r  r    s    , "
r&   r  c                   >     e Zd ZU eed<   dZdZdZddgZ fdZ	 xZ
S )ViltPreTrainedModelr>   vilt)imagetextTr)   r   c                 6   t         |   |       t        |t              ryt	        j
                  |j                  t        j                  |j                  j                  d         j                  d             t	        j                  |j                         y y )NrJ   r   )r,   _init_weightsrf   r.   initcopy_r   r    rc   rS   re   zeros_r   )r?   moduler@   s     r'   r4  z!ViltPreTrainedModel._init_weights  sl    f%fn-JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. .r&   )r   r   r   r   r"   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modulesr4  r   r   s   @r'   r/  r/    s1    (&*#)+>?/ /r&   r/  c                   t    e Zd Zd fd	Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  de
dz  dedz  dedz  dedz  deeej                     z  fd       Z xZS )	ViltModelc                    t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                        | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)r,   r-   r>   r)   r   r  encoderr   r   r2   r   	layernorm
ViltPoolerpooler	post_init)r?   r>   add_pooling_layerr@   s      r'   r-   zViltModel.__init__  sk    
 	 (0"6*f&8&8f>S>ST,=j(4 	r&   c                 B    | j                   j                  j                  S r   r   r/   r   r?   s    r'   get_input_embeddingszViltModel.get_input_embeddings  s    ..>>>r&   c                 :    || j                   j                  _        y r   rG  )r?   r   s     r'   set_input_embeddingszViltModel.set_input_embeddings"  s    :?''7r&   Nr   r   r   rp   rq   r   r   r   r   r(  r)  r   c           
      0   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }n!||j                         dd }nt	        d      |\  }}||j                  n|j                  }|t        j                  ||f|      }||t	        d      ||t	        d      ||j                  d   n|j                  d   }||k7  rt	        d	      |Bt        j                  || j                   j                  | j                   j                  f|      }| j                  ||||||||
      \  }}| j                  ||      }| j                  |||	|
|      }|d   }| j                  |      }| j                   | j!                  |      nd}|s
||f|dd z   S t#        |||j$                  |j&                        S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        image_token_type_idx (`int`, *optional*):
            - The token type ids for images.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltModel
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> # prepare image and text
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text = "hello world"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltModel.from_pretrained("dandelin/vilt-b32-mlm")

        >>> inputs = processor(image, text, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        text_batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((text_batch_size, seq_length), device=device)

        if pixel_values is not None and image_embeds is not None:
            raise ValueError("You cannot specify both pixel_values and image_embeds at the same time")
        elif pixel_values is None and image_embeds is None:
            raise ValueError("You have to specify either pixel_values or image_embeds")

        image_batch_size = pixel_values.shape[0] if pixel_values is not None else image_embeds.shape[0]
        if image_batch_size != text_batch_size:
            raise ValueError("The text inputs and image inputs need to have the same batch size")
        if pixel_mask is None:
            pixel_mask = torch.ones((image_batch_size, self.config.image_size, self.config.image_size), device=device)

        embedding_output, attention_mask = self.embeddings(
            input_ids,
            attention_mask,
            token_type_ids,
            pixel_values,
            pixel_mask,
            inputs_embeds,
            image_embeds,
            image_token_type_idx=image_token_type_idx,
        )

        # make the attention mask broadcastable to all heads
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class ViltPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # "pool" the model by simply taking the hidden state of the first token
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring(
    custom_intro="""
    ViLT Model with a language modeling head on top as done during pretraining.
    """
)
class ViltForMaskedLM(ViltPreTrainedModel):
    _tied_weights_keys = {"mlm_score.decoder.weight": "vilt.embeddings.text_embeddings.word_embeddings.weight"}

    def __init__(self, config):
        super().__init__(config)

        self.vilt = ViltModel(config)
        self.mlm_score = ViltMLMHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_output_embeddings(self):
        return self.mlm_score.decoder

    def set_output_embeddings(self, new_embeddings):
        self.mlm_score.decoder = new_embeddings

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        pixel_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        image_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> MaskedLMOutput | tuple[torch.FloatTensor]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in *[-100, 0, ...,
            config.vocab_size]* (see *input_ids* docstring) Tokens with indices set to *-100* are ignored (masked), the
            loss is only computed for the tokens with labels in *[0, ..., config.vocab_size]*

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForMaskedLM
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> import re
        >>> import torch

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text = "a bunch of [MASK] laying on a [MASK]."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltForMaskedLM.from_pretrained("dandelin/vilt-b32-mlm")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)

        >>> tl = len(re.findall("\[MASK\]", text))
        >>> inferred_token = [text]

        >>> # gradually fill in the MASK tokens, one by one
        >>> with torch.no_grad():
        ...     for i in range(tl):
        ...         encoded = processor.tokenizer(inferred_token)
        ...         input_ids = torch.tensor(encoded.input_ids)
        ...         encoded = encoded["input_ids"][0][1:-1]
        ...         outputs = model(input_ids=input_ids, pixel_values=encoding.pixel_values)
        ...         mlm_logits = outputs.logits[0]  # shape (seq_len, vocab_size)
        ...         # only take into account text features (minus CLS and SEP token)
        ...         mlm_logits = mlm_logits[1 : input_ids.shape[1] - 1, :]
        ...         mlm_values, mlm_ids = mlm_logits.softmax(dim=-1).max(dim=-1)
        ...         # only take into account text
        ...         mlm_values[torch.tensor(encoded) != 103] = 0
        ...         select = mlm_values.argmax().item()
        ...         encoded[select] = mlm_ids[select].item()
        ...         inferred_token = [processor.decode(encoded)]

        >>> selected_token = ""
        >>> encoded = processor.tokenizer(inferred_token)
        >>> output = processor.decode(encoded.input_ids[0], skip_special_tokens=True)
        >>> print(output)
        a bunch of cats laying on a couch.
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output, pooled_output = outputs[:2]
        # split up final hidden states into text and image features
        text_seq_len = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
        text_features, _ = (sequence_output[:, :text_seq_len], sequence_output[:, text_seq_len:])

        mlm_logits = self.mlm_score(text_features)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()  # -100 index = padding token
            # move labels to the correct device to enable PP
            labels = labels.to(mlm_logits.device)
            masked_lm_loss = loss_fct(mlm_logits.view(-1, self.config.vocab_size), labels.view(-1))

        if not return_dict:
            output = (mlm_logits,) + outputs[2:]
            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=mlm_logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class ViltPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states


class ViltMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transform = ViltPredictionHeadTransform(config)
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size)

    def forward(self, x):
        x = self.transform(x)
        x = self.decoder(x)
        return x


@auto_docstring(
    custom_intro="""
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for visual question answering, e.g. for VQAv2.
    """
)
class ViltForQuestionAnswering(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config)

        # Classifier head
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, config.hidden_size * 2),
            nn.LayerNorm(config.hidden_size * 2),
            nn.GELU(),
            nn.Linear(config.hidden_size * 2, config.num_labels),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        pixel_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        image_embeds: torch.FloatTensor | None = None,
        labels: torch.FloatTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> SequenceClassifierOutput | tuple[torch.FloatTensor]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, num_labels)`, *optional*):
            Labels for computing the visual question answering loss. This tensor must be either a one-hot encoding of
            all answers that are applicable for a given example in the batch, or a soft encoding indicating which
            answers are applicable, where 1.0 is the highest score.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForQuestionAnswering
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text = "How many cats are there?"

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
        >>> model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: 2
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooler_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.classifier(pooler_output)

        loss = None
        if labels is not None:
            # move labels to the correct device to enable PP
            labels = labels.to(logits.device)
            loss = nn.functional.binary_cross_entropy_with_logits(logits, labels) * labels.shape[1]

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Vilt Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the [CLS]
    token) for image-to-text or text-to-image retrieval, e.g. MSCOCO and F30K.
    """
)
class ViltForImageAndTextRetrieval(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.vilt = ViltModel(config)

        # Classifier head
        self.rank_output = nn.Linear(config.hidden_size, 1)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        pixel_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        image_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> SequenceClassifierOutput | tuple[torch.FloatTensor]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels are currently not supported.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImageAndTextRetrieval
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-coco")
        >>> model = ViltForImageAndTextRetrieval.from_pretrained("dandelin/vilt-b32-finetuned-coco")

        >>> # forward pass
        >>> scores = dict()
        >>> for text in texts:
        ...     # prepare inputs
        ...     encoding = processor(image, text, return_tensors="pt")
        ...     outputs = model(**encoding)
        ...     scores[text] = outputs.logits[0, :].item()
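        >>> # (illustrative) the text with the highest score is the best match for the image
        >>> best_match = max(scores, key=scores.get)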
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        loss = None
        if labels is not None:
            raise NotImplementedError("Training is not yet supported.")

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooler_output = outputs.pooler_output if return_dict else outputs[1]

        logits = self.rank_output(pooler_output)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    Vilt Model transformer with a classifier head on top for natural language visual reasoning, e.g. NLVR2.
    """
)
class ViltForImagesAndTextClassification(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config)

        # Classifier head
        num_images = config.num_images
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size * num_images, config.hidden_size * num_images),
            nn.LayerNorm(config.hidden_size * num_images),
            nn.GELU(),
            nn.Linear(config.hidden_size * num_images, config.num_labels),
        )

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        pixel_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        image_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> ViltForImagesAndTextClassificationOutput | tuple[torch.FloatTensor]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Binary classification labels.

        Examples:

        ```python
        >>> from transformers import ViltProcessor, ViltForImagesAndTextClassification
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> url_1 = "https://lil.nlp.cornell.edu/nlvr/exs/ex0_0.jpg"
        >>> with httpx.stream("GET", url_1) as response:
        ...     image_1 = Image.open(BytesIO(response.read()))

        >>> url_2 = "https://lil.nlp.cornell.edu/nlvr/exs/ex0_1.jpg"
        >>> with httpx.stream("GET", url_2) as response:
        ...     image_2 = Image.open(BytesIO(response.read()))

        >>> text = "The left image contains twice the number of dogs as the right image."

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")
        >>> model = ViltForImagesAndTextClassification.from_pretrained("dandelin/vilt-b32-finetuned-nlvr2")

        >>> # prepare inputs
        >>> encoding = processor([image_1, image_2], text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(input_ids=encoding.input_ids, pixel_values=encoding.pixel_values.unsqueeze(0))
        >>> logits = outputs.logits
        >>> idx = logits.argmax(-1).item()
        >>> print("Predicted answer:", model.config.id2label[idx])
        Predicted answer: True
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is not None and pixel_values.ndim == 4:
            # add dummy num_images dimension
            pixel_values = pixel_values.unsqueeze(1)

        if image_embeds is not None and image_embeds.ndim == 3:
            # add dummy num_images dimension
            image_embeds = image_embeds.unsqueeze(1)

        num_images = pixel_values.shape[1] if pixel_values is not None else None
        if num_images is None:
            num_images = image_embeds.shape[1] if image_embeds is not None else None
        if num_images != self.config.num_images:
            raise ValueError(
                "Make sure to match the number of images in the model with the number of images in the input."
            )
        pooler_outputs = []
        hidden_states = [] if output_hidden_states else None
        attentions = [] if output_attentions else None
        for i in range(num_images):
            # forward every image through the model
            outputs = self.vilt(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                pixel_values=pixel_values[:, i, :, :, :] if pixel_values is not None else None,
                pixel_mask=pixel_mask[:, i, :, :] if pixel_mask is not None else None,
                inputs_embeds=inputs_embeds,
                image_embeds=image_embeds[:, i, :, :] if image_embeds is not None else None,
                image_token_type_idx=i + 1,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
            pooler_output = outputs.pooler_output if return_dict else outputs[1]
            pooler_outputs.append(pooler_output)
            if output_hidden_states:
                hidden_states.append(outputs.hidden_states)
            if output_attentions:
                attentions.append(outputs.attentions)

        pooled_output = torch.cat(pooler_outputs, dim=-1)
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # move labels to the correct device to enable PP
            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits, hidden_states, attentions)
            return ((loss,) + output) if loss is not None else output

        return ViltForImagesAndTextClassificationOutput(
            loss=loss,
            logits=logits,
            hidden_states=hidden_states,
            attentions=attentions,
        )


@auto_docstring
class ViltForTokenClassification(ViltPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)

        self.num_labels = config.num_labels
        self.vilt = ViltModel(config, add_pooling_layer=False)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor | None = None,
        attention_mask: torch.FloatTensor | None = None,
        token_type_ids: torch.LongTensor | None = None,
        pixel_values: torch.FloatTensor | None = None,
        pixel_mask: torch.LongTensor | None = None,
        inputs_embeds: torch.FloatTensor | None = None,
        image_embeds: torch.FloatTensor | None = None,
        labels: torch.LongTensor | None = None,
        output_attentions: bool | None = None,
        output_hidden_states: bool | None = None,
        return_dict: bool | None = None,
    ) -> TokenClassifierOutput | tuple[torch.FloatTensor]:
        r"""
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
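
        Example (an illustrative sketch; `dandelin/vilt-b32-mlm` only provides backbone weights, so the
        token classification head below is randomly initialized):

        ```python
        >>> from transformers import ViltProcessor, ViltForTokenClassification
        >>> from PIL import Image
        >>> import torch

        >>> processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-mlm")
        >>> model = ViltForTokenClassification.from_pretrained("dandelin/vilt-b32-mlm", num_labels=2)

        >>> image = Image.new("RGB", (384, 384))
        >>> inputs = processor(image, "a short caption", return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)
        >>> # logits cover only the text tokens: (batch_size, text_sequence_length, num_labels)
        >>> outputs.logits.shape[-1]
        2
        ```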
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.vilt(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            pixel_values=pixel_values,
            pixel_mask=pixel_mask,
            inputs_embeds=inputs_embeds,
            image_embeds=image_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]

        text_input_size = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]

        sequence_output = self.dropout(sequence_output)
        # classify only the text part of the joint sequence
        logits = self.classifier(sequence_output[:, :text_input_size])

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # move labels to the correct device to enable PP
            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "ViltForImageAndTextRetrieval",
    "ViltForImagesAndTextClassification",
    "ViltForTokenClassification",
    "ViltForMaskedLM",
    "ViltForQuestionAnswering",
    "ViltModel",
    "ViltPreTrainedModel",
]