
    qi]                       d Z ddlZddlZddlmZ ddlmZ ddlmZ ddlZddlm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ ddlm Z m!Z!m"Z"m#Z#m$Z$  ejJ                  e&      Z'dZ(dZ)dZ*e$e"z  e#z  Z+e ed       G d de                    Z,e ed       G d de                    Z-e ed       G d de                    Z. G d de	j^                        Z0 G d d e	j^                        Z1 G d! d"e	j^                        Z2 G d# d$e	j^                        Z3 G d% d&e	j^                        Z4 G d' d(e	j^                        Z5 G d) d*e	j^                        Z6 G d+ d,e	j^                        Z7 G d- d.e      Z8 G d/ d0e	j^                        Z9 G d1 d2e	j^                        Z:e G d3 d4e             Z;e G d5 d6e;             Z<e G d7 d8e;             Z=e G d9 d:e;             Z>e G d; d<e;             Z? G d= d>e	j^                        Z@ G d? d@e	j^                        ZA G dA dBe	j^                        ZB edC       G dD dEe;             ZC G dF dGe	j^                        ZD G dH dIe	j^                        ZE G dJ dKe	j^                        ZF G dL dMe	j^                        ZG edN       G dO dPe;             ZHg dQZIy)RzPyTorch FLAVA model.    N)OrderedDict)	dataclass)Any)nn   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int   )FlavaConfigFlavaImageCodebookConfigFlavaImageConfigFlavaMultimodalConfigFlavaTextConfigzfacebook/flava-image-codebookg$(~k@a  
    Output from FlavaModel containing embeddings and outputs from individual encoders.

    Note that `image_embeddings` and `text_embeddigns` returned are similar to pooled output returned from a
    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZe	dz  ed<   dZ
ej                  dz  ed<   dZe	dz  ed<   dZej                  dz  ed<   dZe	dz  ed<   d	ee   fd
Zy)FlavaModelOutputa  
    image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
        The image embeddings which are basically the pooled output of [`FlavaImageModel`].
    image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
        The output of the [`FlavaImageModel`].
    text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
    text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
        The output of the [`FlavaTextModel`].
    multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
        The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
    multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_multimodal_encoder` is `None` or `False`):
        The output of the [`FlavaMultimodalModel`].
    Nimage_embeddingsimage_outputtext_embeddingstext_outputmultimodal_embeddingsmultimodal_outputreturnc                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r!   r   r#   Ngetattrto_tuple).0kselfs     Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/flava/modeling_flava.py	<genexpr>z,FlavaModelOutput.to_tuple.<locals>.<genexpr>U   s=      
  TTDGZabfhiZjZsZsZuu
   -0tuplekeysr,   s   `r-   r)   zFlavaModelOutput.to_tupleT   s#     
YY[
 
 	
    )__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   r    r!   r"   r#   r1   r   r)    r4   r-   r   r   3   s     26e''$.56:L,t3:04OU&&-459K+d296:5,,t3:;?1D8?
%* 
r4   r   z@
    Class representing pretraining losses from FLAVA model
    c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   d	efd
Zy)FlavaLossesa  
    mim (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels` and `pixel_values` are present, `input_ids_masked` is absent and `mim_weight` > 0.):
        Masked Image Modeling loss as used in BeIT calculated only for unimodal image data.
    mlm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels` and `input_ids_masked` are present, `pixel_values` is absent and `mlm_weight` > 0.):
        Masked Language Modeling loss as used in BERT calculated only for unimodal text data.
    itm (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `itm_labels`, `input_ids_masked`, `pixel_values` are present and `itm_weight` > 0.):
        Image Text Matching (ITM) loss calculated for paired image-text data. Note that ITM loss is calculated on
        masked pairs in FLAVA.
    global_contrastive (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `input_ids` and `pixel_values` are present and `global_contrastive_weight` > 0.):
        Contrastive loss for image-text similarity similar to CLIP but calculated globally for paired image-text
        data. This is calculated on unmasked images and texts.
    mmm_image (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mim_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_image_weight` > 0.):
        Masked Multimodal Modeling loss's image component calculated on paired image-text data.
    mmm_text (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `mlm_labels`, `pixel_values` and `input_ids_masked` are present and `mmm_text_weight` > 0.):
        Masked Multimodal Modeling loss's text component calculated on paired image-text data.
    Nmimmlmitmglobal_contrastive	mmm_imagemmm_textr$   c                 B    d}| j                         D ]	  }|d} |S  |S )NTF)values)r,   all_nonevs      r-   rG   zFlavaLosses.all_nonez   s5     	A} 		 r4   )r5   r6   r7   r8   r?   r9   r:   r;   r@   rA   rB   rC   rD   boolrG   r<   r4   r-   r>   r>   [   s    " %)C		T	!($(C		T	!($(C		T	!(37))D07*.Iu  4'.)-He$&-$ r4   r>   a  
    Output from FlavaForPreTraining containing embeddings, and outputs from individual encoders.

    Note that `image_embeddings` and `text_embeddings` returned are similar to pooled output returned from a
    transformer. If you want embeddings for contrastive loss or retrieval use a FLAVA model's `image_projection` and
    `text_projection` layers on `image_embeddings` and `text_embeddings` respectively.
    c                      e Zd ZU dZdZej                  dz  ed<   dZe	ed<   dZ
ej                  dz  ed<   dZedz  ed<   dZej                  dz  ed<   dZedz  ed<   dZej                  dz  ed	<   dZedz  ed
<   dZej                  dz  ed<   dZedz  ed<   dZej                  dz  ed<   dZedz  ed<   dZej                  dz  ed<   dZedz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   dee   fdZ y)FlavaForPreTrainingOutputay  
    loss (`torch.FloatTensor`, *optional*, returned when `return_loss` is True):
        Total loss calculated for this model.
    loss_info (`FlavaLosses`):
        Detailed info for FLAVA Pretraining losses. Check `FlavaLosses` class description for the information on
        the keys.
    image_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
        The image embeddings which are basically the pooled output of [`FlavaImageModel`].
    image_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
        The output of the [`FlavaImageModel`].
    text_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` are present):
        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
    text_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids` are present):
        The output of the [`FlavaTextModel`].
    multimodal_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
        The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
    multimodal_output (`BaseModelOutputWithPooling`, returned when `input_ids` and `pixel_values` are present and `skip_unmasked_multimodal_encoder` is `None` or `False`):
        The output of the [`FlavaMultimodalModel`].
    image_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `pixel_values` are present):
        The image embeddings which are basically the pooled output of [`FlavaImageModel`]. Uses `bool_masked_pos`
        to create masked images.
    image_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `pixel_values` are present):
        The output of the [`FlavaImageModel`]. Uses `bool_masked_pos` to create masked images.
    text_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids_masked` are present):
        The text embeddings which are basically the pooled output of [`FlavaTextModel`].
    text_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` are present):
        The output of the [`FlavaTextModel`].
    multimodal_masked_embeddings (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when `input_ids` and `pixel_values` are present):
        The multimodal embeddings which are basically the pooled output of [`FlavaTextModel`].
    multimodal_masked_output (`BaseModelOutputWithPooling`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
        The output of the [`FlavaMultimodalModel`].
    mim_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape `(total_masked_patches, image_vocab_size)` , *optional*, returned when `pixel_values` are present and `input_ids_masked` are not):
        The logits for MIM unimodal loss. Uses `book_masked_pos` to get masked patches. The flattened output is
            returned when `bool_masked_pos` has some of the patches masked.
    mlm_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(total_masked_seq_length, text_vocab_size)`, *optional*, returned when `input_ids_masked` are present and `pixel_values` are not):
        The logits for MLM unimodal loss. The flattened output is returned when `input_ids_masked` has some of
            the tokens masked.
    itm_logits (`torch.FloatTensor` of shape `(batch_size, 2)`, *optional*, returned when `input_ids_masked` and `pixel_values` are present):
        The logits for ITM loss. Note that ITM loss is calculated on masked pairs in FLAVA.
    contrastive_logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeddings` and `text_embeddings` but passed through FLAVA's
        `image_projection` and `text_projection` layers respectively. This represents the image-text similarity
        scores. This is calculated on unmasked images and texts.
    contrastive_logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeddings` and `image_embeddings` but passed through FLAVA's
        `text_projection` and `image_projection` layers respectively. This is calculated on unmasked images and
        texts.
    mmm_image_logits (`torch.FloatTensor` of shape `(batch_size, num_image_patches, image_vocab_size)` or of shape`(total_masked_patches, image_vocab_size)`, *optional*, returned when `pixel_values` and `input_ids_masked` are present):
        The logits for MMM image multimodal loss. Uses `book_masked_pos` to get masked patches. The flattened
            output is returned when `bool_masked_pos` has some of the patches masked.
    mmm_text_logits (`torch.FloatTensor` of shape `(batch_size, text_seq_length, text_vocab_size)` or of shape `(`(total_masked_seq_length, text_vocab_size)`), *optional*, returned when `pixel_values` and `input_ids_masked` are present):
        The logits for MMM text multimodal loss. The flattened output is returned when `input_ids_masked` has
            some of the tokens masked.
    Nloss	loss_infor   r   r    r!   r"   r#   image_masked_embeddingsimage_masked_outputtext_masked_embeddingstext_masked_outputmultimodal_masked_embeddingsmultimodal_masked_output
mim_logits
mlm_logits
itm_logitscontrastive_logits_per_imagecontrastive_logits_per_textmmm_image_logitsmmm_text_logitsr$   c                 T     g dt         fd j                         D              S )N)r!   r   r#   rQ   rO   rS   c              3   d   K   | ]'  }|vr|   nt        |      j                          ) y wNr'   )r*   r+   r,   transformer_outputss     r-   r.   z5FlavaForPreTrainingOutput.to_tuple.<locals>.<genexpr>   s4     sbc)< <T!W'$PQBRB[B[B]]sr/   r0   )r,   r^   s   `@r-   r)   z"FlavaForPreTrainingOutput.to_tuple   s(    
 sgkgpgpgrsssr4   )!r5   r6   r7   r8   rL   r9   r:   r;   rM   r>   r   r   r   r    r!   r"   r#   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r1   r   r)   r<   r4   r-   rK   rK      s   5n &*D%

d
")!I{!15e''$.56:L,t3:04OU&&-459K+d296:5,,t3:;?1D8?8<U..5<=A3d:A7;E--4;<@2T9@=A %"3"3d":ABF84?F+/J!!D(/+/J!!D(/+/J!!D(/=A %"3"3d":A<@!2!2T!9@15e''$.504OU&&-4	t%* 	tr4   rK   c            	            e Zd ZdZddededdf fdZdej                  de	d	e	dej                  fd
Z
	 	 ddej                  dej                  dz  dedej                  fdZ xZS )FlavaImageEmbeddingszb
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
    configuse_mask_tokenr$   Nc                    t         |           |xs |j                  }t        j                  t        j                  dd|j                              | _        |r4t        j                  t        j                  dd|j                              nd | _        t        |j                  |j                  |j                  |j                        | _        | j                  j                  }t        j                  t        j                  d|dz   |j                              | _        t        j                   |j"                        | _        |j                  | _        || _        y )Nr   )
image_size
patch_sizenum_channels	embed_dim)super__init__
mask_tokenr   	Parameterr9   zeroshidden_size	cls_tokenPatchEmbeddingsrd   re   rf   patch_embeddingsnum_patchesposition_embeddingsDropouthidden_dropout_probdropoutra   )r,   ra   rb   rq   	__class__s       r-   ri   zFlavaImageEmbeddings.__init__   s    '<6+<+<ekk!Q8J8J&KLQ_",,u{{1a9K9K'LMei /((((,,((	!
 ++77#%<<A{QPVPbPb0c#d zz&"<"<= ++r4   
embeddingsheightwidthc                    |j                   d   dz
  }| j                  j                   d   dz
  }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  ddddf   }| j                  ddddf   }|j                   d   }|| j
                  z  }	|| j
                  z  }
t        |dz        }|j                  d|||      }|j                  dddd      }t        j                  j                  ||	|
fdd	
      }|j                  dddd      j                  dd|      }t        j                  ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Ng      ?r   r      bicubicF)sizemodealign_cornersdim)shaperr   r9   jit
is_tracingre   r   reshapepermuter   
functionalinterpolateviewcat)r,   rw   rx   ry   rq   num_positionsclass_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss               r-   interpolate_pos_encodingz-FlavaImageEmbeddings.interpolate_pos_encoding  s`    !&&q)A-0066q9A= yy##%+*F6UZ?+++221bqb59221ab59r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr4   pixel_valuesbool_masked_posr   c                 V   |j                   \  }}}}| j                  ||      }|j                         \  }}	}
|| j                  j	                  ||	d      }|j                         dk(  r!|j                  |j                  d      d      }|j                  d      j                  |      }|d|z
  z  ||z  z   }| j                  j	                  |dd      }t        j                  ||fd      }|r|| j                  |||      z   }n|| j                  z   }| j                  |      }|S )N)r   r{   r   r   g      ?r   r   )r   rp   r~   rj   expandr   r   	unsqueezetype_asrn   r9   r   r   rr   ru   )r,   r   r   r   
batch_sizerf   rx   ry   rw   seq_len_mask_tokensmask
cls_tokenss                 r-   forwardzFlavaImageEmbeddings.forward)  s4    3?2D2D/
L&%**<Rj*k
!+!2
GQ&//00WbIK""$)"1"6"67K7KA7NPR"S",,R088ED#sTz2[45GGJ ^^**:r2>
YY
J7Q?
 $#d&C&CJPVX]&^^J#d&>&>>J\\*-
r4   FNF)r5   r6   r7   r8   r   rI   ri   r9   Tensorintr   
BoolTensorr   __classcell__rv   s   @r-   r`   r`      s    /  RV &&D5<< &D &DUX &D]b]i]i &DV 48).	ll ))D0 #'	
 
r4   r`   c            	            e Zd ZdZ	 	 	 	 ddedeeeef   z  dedef fdZddej                  de	d	ej                  fd
Z
 xZS )ro   z#
    Image to Patch Embedding.
    rd   re   rf   rg   c                 V   t         |           t        |t        j                  j
                        s||f}t        |t        j                  j
                        s||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        t        j                  ||||      | _        y )Nr   r   )kernel_sizestride)rh   ri   
isinstancecollectionsabcIterablerd   re   rq   r   Conv2d
projection)r,   rd   re   rf   rg   rq   rv   s         r-   ri   zPatchEmbeddings.__init__R  s     	*koo&>&>?$j1J*koo&>&>?$j1J!!}
15*Q-:VW=:XY$$&))L)\fgr4   r   r   r$   c                 8   |j                   \  }}}}|sV|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j                  |      j	                  d      j                  dd      }|S )Nr   r   zInput image size (*z) doesn't match model (z).r|   )r   rd   
ValueErrorr   flatten	transpose)r,   r   r   r   rf   rx   ry   xs           r-   r   zPatchEmbeddings.forwarde  s    2>2D2D/
L&%'++u8J/J (% 9+,Adooa.@-AE  OOL)11!4>>q!Dr4   )      r   i   r   )r5   r6   r7   r8   r   r1   ri   r9   r   rI   r   r   r   s   @r-   ro   ro   M  sx     ,.hh %S/)h 	h
 h&	ELL 	D 	]b]i]i 	r4   ro   c                        e Zd ZdZ fdZ	 	 	 ddej                  dz  dej                  dz  dej                  dz  fdZ xZS )	FlavaTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       | j#                  dt%        j*                  | j,                  j/                         t$        j0                        d       y )	N)padding_idxepsposition_idsr   r{   F)
persistenttoken_type_ids)dtype)rh   ri   r   	Embedding
vocab_sizerm   pad_token_idword_embeddingsmax_position_embeddingsrr   type_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsrs   rt   ru   register_bufferr9   aranger   rl   r   r~   longr,   ra   rv   s     r-   ri   zFlavaTextEmbeddings.__init__t  s   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
r4   N	input_idsr   r   c                    |j                         }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }| j                  |      }| j                  |      }	||	z   }
| j                  |      }|
|z  }
| j                  |
      }
| j                  |
      }
|
S )Nr   r   r   )r   device)r~   r   hasattrr   r   r9   rl   r   r   r   r   rr   r   ru   )r,   r   r   r   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedinputs_embedsr   rw   rr   s               r-   r   zFlavaTextEmbeddings.forward  s     nn& ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l,,Y7 $ : :> J"%::
"66|D))
^^J/
\\*-
r4   NNN)	r5   r6   r7   r8   ri   r9   r   r   r   r   s   @r-   r   r   q  sR    Q
$ *..2,0	 <<$&  t+  llT)	 r4   r   c                        e Zd Zdeddf fdZ	 	 d	dej                  dej                  dz  dedeej                  ej                  f   eej                     z  fdZ	 xZ
S )
FlavaSelfAttentionra   r$   Nc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                  |j                        | _        t        j                  |j                  | j                  |j                        | _        t        j                  |j                  | j                  |j                        | _        t        j                  |j                         | _        y )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads .bias)rh   ri   rm   num_attention_headsr   r   r   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyvaluers   attention_probs_dropout_probru   r   s     r-   ri   zFlavaSelfAttention.__init__  s.    : ::a?PVXhHi"6#5#5"6 7334A7 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EFOO\
99V//1C1C&//ZYYv1143E3EFOO\
zz&"E"EFr4   hidden_statesattention_maskoutput_attentionsc                    |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }	t        j                  ||j                  dd            }
|
t        j                  | j                        z  }
||
|z   }
t        j                  j                  |
d      }| j                  |      }t        j                  ||	      }|j!                  dddd      j#                         }|j%                         d d | j&                  fz   } |j                  | }|r||f}|S |f}|S )Nr{   r   r|   r   r   r   )r   r   r   r   r   r   r   r   r9   matmulmathsqrtr   r   softmaxru   r   
contiguousr~   r   )r,   r   r   r   r   r   r   query_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapeoutputss                  r-   r   zFlavaSelfAttention.forward  s    %2$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 !<<Y5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD6G=/2 O\M]r4   r   r5   r6   r7   FlavaPossibleConfigsri   r9   r   rI   r1   r   r   r   s   @r-   r   r     sz    G3 G G* /3"'	.||. t+.  	.
 
u||U\\)	*U5<<-@	@.r4   r   c                   |     e Zd ZdZdeddf fdZdej                  dej                  dej                  fdZ xZ	S )	FlavaSelfOutputz
    The residual connection is defined in FlavaLayer (same as ViTLayer) instead of here (as is the case with other
    models), due to the layernorm applied before each block.
    ra   r$   Nc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        y r]   )	rh   ri   r   r   rm   densers   rt   ru   r   s     r-   ri   zFlavaSelfOutput.__init__  sB    YYv1163E3EF
zz&"<"<=r4   r   input_tensorc                 J    | j                  |      }| j                  |      }|S r]   r  ru   r,   r   r  s      r-   r   zFlavaSelfOutput.forward  s$    

=1]3r4   )
r5   r6   r7   r8   r   ri   r9   r   r   r   r   s   @r-   r   r     sE    
>3 > >
U\\  RWR^R^ r4   r   c                        e Zd Zdeddf fdZ	 	 d	dej                  dej                  dz  dedeej                  ej                  f   eej                     z  fdZ	 xZ
S )
FlavaAttentionra   r$   Nc                 b    t         |           t        |      | _        t	        |      | _        y r]   )rh   ri   r   	attentionr   outputr   s     r-   ri   zFlavaAttention.__init__  s&    +F3%f-r4   r   r   r   c                 j    | j                  |||      }| j                  |d   |      }|f|dd  z   }|S N)r   r   r   r   )r
  r  )r,   r   r   r   self_outputsattention_outputr   s          r-   r   zFlavaAttention.forward  sO     ~~.L] & 
  ;;|AF#%QR(88r4   r   r   r   s   @r-   r  r    sw    .3 . . /3"'	|| t+  	
 
u||U\\)	*U5<<-@	@r4   r  c                   `     e Zd Zdeddf fdZdej                  dej                  fdZ xZS )FlavaIntermediatera   r$   Nc                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r]   )rh   ri   r   r   rm   intermediate_sizer  r   
hidden_actstrr	   intermediate_act_fnr   s     r-   ri   zFlavaIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r4   r   c                 J    | j                  |      }| j                  |      }|S r]   )r  r  r,   r   s     r-   r   zFlavaIntermediate.forward  s&    

=100?r4   	r5   r6   r7   r   ri   r9   r   r   r   r   s   @r-   r  r    s2    93 9 9U\\ ell r4   r  c                   x     e Zd Zdeddf fdZdej                  dej                  dej                  fdZ xZS )FlavaOutputra   r$   Nc                     t         |           t        j                  |j                  |j
                        | _        t        j                  |j                        | _	        y r]   )
rh   ri   r   r   r  rm   r  rs   rt   ru   r   s     r-   ri   zFlavaOutput.__init__%  sB    YYv779K9KL
zz&"<"<=r4   r   r  c                 T    | j                  |      }| j                  |      }||z   }|S r]   r  r  s      r-   r   zFlavaOutput.forward+  s.    

=1]3%4r4   r  r   s   @r-   r  r  $  s@    >3 > >U\\  RWR^R^ r4   r  c                        e Zd ZdZdeddf fdZ	 	 d
dej                  dej                  dz  dede	ej                  ej                  f   e	ej                     z  fd	Z
 xZS )
FlavaLayerz?This corresponds to the Block class in the timm implementation.ra   r$   Nc                 r   t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y )Nr   r   )rh   ri   chunk_size_feed_forwardseq_len_dimr  r
  r  intermediater  r  r   r   rm   r   layernorm_beforelayernorm_afterr   s     r-   ri   zFlavaLayer.__init__7  s    '-'E'E$'/-f5!&) !#V-?-?VEZEZ [!||F,>,>FDYDYZr4   r   r   r   c                     | j                  | j                  |      ||      }|d   }|dd  }||z   }| j                  |      }| j                  |      }| j	                  ||      }|f|z   }|S r  )r
  r$  r%  r#  r  )r,   r   r   r   self_attention_outputsr  r   layer_outputs           r-   r   zFlavaLayer.forwardC  s     "&!!-0)/ "0 "

 2!4(, )=8 ++M:((6 {{<?/G+r4   r   )r5   r6   r7   r8   r   ri   r9   r   rI   r1   r   r   r   s   @r-   r  r  4  s}    I
[3 
[ 
[ /3"'	|| t+  	
 
u||U\\)	*U5<<-@	@r4   r  c                        e Zd Zdeddf fdZ	 	 	 	 ddej                  dej                  dz  deded	edee	z  fd
Z
 xZS )FlavaEncoderra   r$   Nc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
rh   ri   ra   r   
ModuleListrangenum_hidden_layersr  layergradient_checkpointing)r,   ra   r   rv   s      r-   ri   zFlavaEncoder.__init__a  sN    ]]fF^F^@_#`1Jv$6#`a
&+# $as   A#r   r   r   output_hidden_statesreturn_dictc                     |rdnd }|rdnd }t        | j                        D ](  \  }}	|r||fz   } |	|||      }
|
d   }|s ||
d   fz   }* |r||fz   }|st        d |||fD              S t        |||      S )Nr<   r   r   c              3   &   K   | ]	  }||  y wr]   r<   )r*   rH   s     r-   r.   z'FlavaEncoder.forward.<locals>.<genexpr>  s     mq_`_lm   )last_hidden_stater   
attentions)	enumerater/  r1   r   )r,   r   r   r   r1  r2  all_hidden_statesall_self_attentionsilayer_modulelayer_outputss              r-   r   zFlavaEncoder.forwardg  s     #7BD$5b4(4 		POA|#$58H$H!(HYZM)!,M &9]1=M<O&O#		P   1]4D Dm]4EGZ$[mmm+;LYl
 	
r4   )NFFT)r5   r6   r7   r   ri   r9   r   rI   r1   r   r   r   r   s   @r-   r*  r*  `  su    ,{ ,t , /3"'%* 
||
 t+
  	

 #
 
 
	 
r4   r*  c                   D     e Zd Zdef fdZdej                  fdZ xZS )FlavaPoolerra   c                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r]   )rh   ri   r   r   rm   r  Tanh
activationr   s     r-   ri   zFlavaPooler.__init__  s9    YYv1163E3EF
'')r4   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S Nr   )r  rB  )r,   r   first_token_tensorpooled_outputs       r-   r   zFlavaPooler.forward  s6     +1a40

#566r4   r  r   s   @r-   r?  r?    s     $3 $
U\\ r4   r?  c                        e Zd ZU eed<   dZdZdZ ej                         de
j                  e
j                  z  e
j                  z  ddf fd       Z xZS )	FlavaPreTrainedModelra   flava)imagetextTmoduler$   Nc                    t         |   |       t        |t              r t	        j
                  |j                         yt        |t              rkt	        j
                  |j                         t	        j
                  |j                         |j                   t	        j
                  |j                         yyt        |t              ryt	        j                  |j                  t        j                  |j                  j                   d         j#                  d             t	        j
                  |j$                         yt        |t&              r-|j(                  r t	        j
                  |j                         yyt        |t*              r5t	        j,                  |j.                  | j0                  j2                         yy)zInitialize the weightsNr{   r   )rh   _init_weightsr   FlavaMaskedPredictionHeadinitzeros_r   r`   rn   rr   rj   r   copy_r   r9   r   r   r   r   FlavaMultimodalModeluse_cls_token
FlavaModel	constant_logit_scalera   logit_scale_init_value)r,   rL  rv   s     r-   rN  z"FlavaPreTrainedModel._init_weights  s/    	f%f78KK$ 45KK(()KK223  ,F--. - 34JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghKK--. 45##F,,- $
+NN6--t{{/Q/QR ,r4   )r5   r6   r7   r   r;   base_model_prefixinput_modalitiessupports_gradient_checkpointingr9   no_gradr   r   r   r   rN  r   r   s   @r-   rH  rH    s]    (&*#U]]_SBII		$9BLL$H ST S Sr4   rH  c                   0    e Zd ZU eed<   dZdZdZddedef fdZ	de
j                  fdZd	e
j                  fd
Ze	 	 	 	 	 	 	 ddej                   dz  dej"                  dz  dedz  dej                   dz  dedz  dedz  dedz  deez  fd       Z xZS )FlavaImageModelra   zflava.image_modelr   rJ  add_pooling_layerc                    t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                        | _        |rt        |      nd| _        | j                          yv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        r   N)rh   ri   ra   r`   rw   r*  encoderr   r   rm   r   	layernormr?  pooler	post_initr,   ra   r`  rv   s      r-   ri   zFlavaImageModel.__init__  si    
 	 .v6#F+f&8&8f>S>ST->k&)Dr4   r$   c                 .    | j                   j                  S r]   rw   rp   r3   s    r-   get_input_embeddingsz$FlavaImageModel.get_input_embeddings  s    ///r4   r   c                 &    || j                   _        y r]   rj  r,   r   s     r-   set_input_embeddingsz$FlavaImageModel.set_input_embeddings  s    +0(r4   Nr   r   r   r   r1  r2  c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  |||      }	| j                  |	||||      }
|
d   }| j                  |      }| j                  | j                  |      nd}|s
||f|
dd z   S t        |||
j                  |
j                        S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)r   r   r   r   r1  r2  r   r   r6  pooler_outputr   r7  )ra   r   r1  use_return_dictr   rw   rd  re  rf  r   r   r7  )r,   r   r   r   r   r   r1  r2  kwargsembedding_outputencoder_outputssequence_outputrF  s                r-   r   zFlavaImageModel.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@??/Tl + 
 ,,)/!5# ' 
 *!,..98<8OO4UY#]3oab6III)-')77&11	
 	
r4   TNNNNNNN)r5   r6   r7   r   r;   rY  main_input_namerZ  rI   ri   r   Modulerk  rn  r   r9   r   r   r1   r   r   r   r   s   @r-   r^  r^    s    +$O!/ D "0bii 01")) 1  -13704.2)-,0#'/
llT)/
 ))D0/
 #'+	/

 t+/
  $;/
 #Tk/
 D[/
 
+	+/
 /
r4   r^  c                   ,    e Zd ZU eed<   dZdZddedef fdZde	fdZ
dej                  fd	Ze	 	 	 	 	 	 	 ddej                   d
z  dej                   d
z  dej                   d
z  dej                   d
z  ded
z  ded
z  ded
z  deez  fd       Z xZS )FlavaTextModelra   zflava.text_model)rK  r`  c                    t         |   |       || _        t        |      | _        t        |      | _        t        j                  |j                  |j                        | _        |rt        |      nd| _        | j                          yrb  )rh   ri   ra   r   rw   r*  rd  r   r   rm   r   re  r?  rf  rg  rh  s      r-   ri   zFlavaTextModel.__init__  si    
 	 -f5#F+f&8&8f>S>ST->k&)Dr4   r$   c                 .    | j                   j                  S r]   rw   r   r3   s    r-   rk  z#FlavaTextModel.get_input_embeddings  s    ...r4   r   c                 &    || j                   _        y r]   r  rm  s     r-   rn  z#FlavaTextModel.set_input_embeddings  s    */'r4   Nr   r   r   r   r   r1  r2  c                 ^   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      |j                         }	|!t        j                  |	|j                        }| j                  ||	      }
| j                  |||      }| j                  ||
|||      }|d   }| j                  |      }| j                  | j                  |      nd}|s
||f|dd z   S t        |||j                  |j                         S )	a  
        input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
        NzYou have to specify input_idsr   )r   r   r   rp  r   r   rq  )ra   r   r1  rs  r   r~   r9   onesr   get_extended_attention_maskrw   rd  re  rf  r   r   r7  )r,   r   r   r   r   r   r1  r2  rt  r   extended_attention_maskru  rv  rw  rF  s                  r-   r   zFlavaTextModel.forward!  sY   0 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]<==nn&!"ZZI<L<LMN040P0P1

  ??)% + 
 ,,2/!5# ' 
 *!,..98<8OO4UY#]3oab6III)-')77&11	
 	
r4   rx  ry  )r5   r6   r7   r   r;   rY  rZ  rI   ri   ro   rk  r   r{  rn  r   r9   r   r1   r   r   r   r   s   @r-   r}  r}    s    *  4  /o /0")) 0  *..2.2,0)-,0#'C
<<$&C
 t+C
 t+	C

 llT)C
  $;C
 #TkC
 D[C
 
+	+C
 C
r4   r}  c                        e Zd ZU eed<   dZdZddef fdZe	 	 	 	 dde	j                  de	j                  dz  dedz  dedz  d	edz  d
eez  fd       Z xZS )rS  ra   zflava.multimodal_modelr   c                    t         |   |       || _        | j                  j                  | _        | j                  r9t	        j
                  t        j                  dd|j                              | _	        t        |      | _        t	        j                  |j                  |j                        | _        |rt        |      nd| _        | j#                          y)rc  r   r   N)rh   ri   ra   rT  r   rk   r9   rl   rm   rn   r*  rd  r   r   re  r?  rf  rg  rh  s      r-   ri   zFlavaMultimodalModel.__init__o  s    
 	 ![[66\\%++aF<N<N*OPDN#F+f&8&8f>S>ST->k&)Dr4   Nr   r   r1  r2  r$   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|j	                         \  }}}	| j
                  r;| j                  j                  |dd      }
t        j                  |
|fd      }|dz  }|#t        j                  ||f|j                        }| j                  |||f      }| j                  |||||      }|d   }| j                  |      }| j                  | j                  |      nd}|s
||f|dd z   S t!        |||j"                  |j$                        S )	z
        hidden_states (`torch.FloatTensor` of shape `(batch_size, image_num_patches + text_seq_len, hidden_size)`):
            The concatenated hidden states of unimodal encoders.
        Nr{   r   r   r  rp  r   rq  )ra   r   r1  rs  r~   rT  rn   r   r9   r   r  r   r  rd  re  rf  r   r   r7  )r,   r   r   r   r1  r2  rt  r   r   r   r   r  rv  rw  rF  s                  r-   r   zFlavaMultimodalModel.forward  sz    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$1$6$6$8!
J..z2rBJ!IIz=&AqIM!OJ!"ZZZ(@I]I]^N040P0P$1

 ,,2/!5# ' 
 *!,..98<8OO4UY#]3oab6III)-')77&11	
 	
r4   rx  )NNNN)r5   r6   r7   r   r;   rY  rz  ri   r   r9   r   rI   r1   r   r   r   r   s   @r-   rS  rS  h  s    !!0%O4 $  /3)-,0#'5
||5
 t+5
  $;	5

 #Tk5
 D[5
 
+	+5
 5
r4   rS  c                       e Zd ZU eed<   def fdZee	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  de
e   d	eez  fd
              Zee	 	 	 ddej                  dej                  dz  dedz  dej                  dz  de
e   d	eez  fd              Ze	 	 	 	 	 	 	 	 	 	 	 ddej$                  dz  dej&                  dz  dej                  dz  dej                  dz  dej                  dz  dej$                  dz  dej                  dz  dedz  dedz  dededz  d	eez  fd       Z xZS )rU  ra   c                    t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s%t        ddt        |j                         dz         |j                  }|j                  }|j                  }|j                  | _        |j                  | _        |j                  | _        |j                  | _        t!        |      | _        t%        |      | _        t)        |      | _        t-        j.                  | j                  | j                        | _        t-        j.                  | j                  | j                        | _        t-        j4                  t7        j8                  | j:                  j<                              | _        t-        j.                  | j                  | j                        | _         t-        j.                  | j                  | j                        | _!        | jE                          y )NzLconfig.text_config is expected to be of type FlavaTextConfig but is of type r   zNconfig.image_config is expected to be of type FlavaImageConfig but is of type zMconfig.multimodal_config is expected to be of type FlavaMultimodalConfig but zis of type )#rh   ri   r   text_configr   	TypeErrortypeimage_configr   multimodal_configr   projection_dimrm   text_hidden_sizeimage_hidden_sizemm_hidden_sizer}  
text_modelr^  image_modelrS  multimodal_modelr   r   image_projectiontext_projectionrk   r9   tensorra   rX  rW  image_to_mm_projectiontext_to_mm_projectionrg  )r,   ra   r  r  r  rv   s        r-   ri   zFlavaModel.__init__  s    &,,o>++,-Q0 
 &--/?@,,-.a1 
 &224IJ_V%=%= >?qAB 
 ((**"44$33 + 7 7!-!9!9/;;(5*<8 45F G "		$*@*@$BUBU V!yy)>)>@S@ST<<T[[5W5W(XY&(ii0F0FH[H[&\#%'YYt/D/DdFYFY%Z"r4   Nr   r   r   r   rt  r$   c           	      z     | j                   d||||dd|}|j                  }| j                  |      |_        |S )a	  
        input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, FlavaModel

        >>> model = FlavaModel.from_pretrained("{0}")
        >>> processor = AutoProcessor.from_pretrained("{0}")

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], max_length=77, padding="max_length", return_tensors="pt"
        ... )
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```
        T)r   r   r   r   r2  r<   )r  r6  r  rr  )r,   r   r   r   r   rt  text_outputsr6  s           r-   get_text_featureszFlavaModel.get_text_features  s\    L 4C4?? 4
))%4
 4
 )::%)%9%9:K%L"r4   r   r   r   c           	      z     | j                   d||||dd|}|j                  }| j                  |      |_        |S )a   
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, FlavaModel
        >>> from transformers.image_utils import load_image

        >>> model = FlavaModel.from_pretrained("{0}")
        >>> processor = AutoProcessor.from_pretrained("{0}")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```
        T)r   r   r   r   r2  r<   )r  r6  r  rr  )r,   r   r   r   r   rt  image_outputsr6  s           r-   get_image_featureszFlavaModel.get_image_features  s^    B 5ED4D4D 5
%+)%=5
 5
 *;;&*&;&;<M&N#r4   image_attention_maskskip_multimodal_encoderr   r1  r2  c           	         ||n| j                   j                  }|
st        d      d}d}d}d}|5| j                  ||||	|
|      }|d   |d   }}| j	                  |d         }d}d}d}d}|6| j                  |||||	|
|      }|d   |d   }}| j                  |d         }d}d}|||s|g|j                  \  }}}| j                  j                  r|dz  }t        j                  |||j                  	      }t        j                  ||gd
      }nd}t        j                  ||gd
      }| j                  |||      }|d   }|s||||||fS t        ||||||      S )a/
  
        input_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        image_attention_mask (`torch.Tensor` of shape `(batch_size, image_num_patches)`, *optional*):
            Mask to avoid performing attention on padding pixel values for image inputs. Mask values selected in `[0, 1]`:
            - 1 for pixel values that are real (i.e., **not masked**),
            - 0 for pixel values that are padding (i.e., **masked**).
        skip_multimodal_encoder (*bool*, *optional*):
            Skip any calculations for multimodal encoder. Useful if multimodal encoding is not going to be used.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, FlavaModel

        >>> model = FlavaModel.from_pretrained("facebook/flava-full")
        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(text=["a photo of a cat"], images=image, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)

        >>> image_embeddings = outputs.image_embeddings
        >>> text_embeddings = outputs.text_embeddings
        >>> multimodal_embeddings = outputs.multimodal_embeddings

        >>> outputs.image_embeddings.shape
        torch.Size([1, 197, 768])

        >>> text_embeddings.shape
        torch.Size([1, 7, 768])

        >>> multimodal_embeddings.shape
        torch.Size([1, 205, 768])
        ```
        NzRFLAVA model requires hidden states to work. Please set `output_hidden_states=True`)r   r   r   r   r1  r2  r   r|   r{   )r   r   r   r   r   r1  r2  r   r  r   )r   r2  )r   r   r    r!   r"   r#   )ra   r2  r   r  r  r  r  r   r  rT  r9   r  r   r   r   )r,   r   r   r   r   r   r   r  r  r   r1  r2  rt  r   image_statesimage_mm_projectionr   r    text_statestext_mm_projectionr!   r"   r#   r   r   r   attention_mask_imageattention_multimodalmultimodal_inputs                                r-   r   zFlavaModel.forwardJ  s
   L &1%<k$++BYBY#qrr"#++) /3"3%9' , L .:!_l1ol"&"="=l2>N"O! //#-)-"3%9' * K ,7q>;q>[O!%!;!;KO!L $ */A/MVm))<)B)B&
GQ((66qLG',zz*gNaNhNh'i$',yy2F1W]^'_$'+$$yy*=?Q)RXYZ $ 5 5 1ES^ !6 ! %6a$8! %!   -%+#"7/
 	
r4   r   )NNNNNNNNNTN)r5   r6   r7   r   r;   ri   r   r   r9   r   r   r   r1   r   r  r   rI   r  
LongTensorr:   r   r   r   r   s   @r-   rU  rU    s?   ){ )V  /3.2,0/<</ t+/ t+	/
 llT)/ +,/ 
+	+/  /b  4804.2*ll* ))D0* #'+	*
 t+* +,* 
+	+*  *X  .215.2.2/30448/3)-%)#'N
##d*N
 ''$.N
 t+	N

 t+N
 ,N
 &&-N
 $llT1N
 "&N
  $;N
 #N
 D[N
 
!	!N
 N
r4   rU  c                   `     e Zd Zdedef fdZdej                  dej                  fdZ xZS )FlavaImageCodebookResPathin_sizeout_sizec                    t         |           |dz  }t               }t        j                         |d<   t        j
                  ||dd      |d<   t        j                         |d<   t        j
                  ||dd      |d<   t        j                         |d	<   t        j
                  ||dd      |d
<   t        j                         |d<   t        j
                  ||dd      |d<   t        j                  |      | _        y )N   relu_1r   r   r   paddingconv_1relu_2conv_2relu_3conv_3relu_4r   conv_4)rh   ri   r   r   ReLUr   
Sequentialpath)r,   r  r  rt  hid_sizer  rv   s         r-   ri   z"FlavaImageCodebookResPath.__init__  s    q=}X7H!QOXX8X1aPXX8X1aPXX8X1aPXMM$'	r4   r   r$   c                 $    | j                  |      S r]   )r  r,   r   s     r-   r   z!FlavaImageCodebookResPath.forward  s    yy|r4   	r5   r6   r7   r   ri   r9   r   r   r   r   s   @r-   r  r    s1    ( (s (  %,, r4   r  c                   d     e Zd Zdededef fdZdej                  dej                  fdZ xZS )FlavaImageCodebookBlockr  r  
num_layersc                     t         |           d|dz  z  | _        ||k7  rt        j                  ||dd      | _        nt        j                         | _        t        ||      | _        y )Nr   r|   r   r  )	rh   ri   	post_gainr   r   id_pathIdentityr  res_path)r,   r  r  r  rt  rv   s        r-   ri   z FlavaImageCodebookBlock.__init__  sW    j!m,h99WhAqQDL;;=DL1'8Dr4   r   r$   c                 b    | j                  |      | j                  | j                  |      z  z   S r]   )r  r  r  r  s     r-   r   zFlavaImageCodebookBlock.forward  s'    ||A$--2B!BBBr4   r  r   s   @r-   r  r    s?    
E 
Es 
E 
EC C%,, Cr4   r  c                   n     e Zd Zd
dededededef
 fdZdej                  dej                  fd	Z xZ	S )FlavaImageCodebookLayerGroup
num_blocksr  r  r  use_poolc                 $   t         |           t               }t        |      D ]4  }|dk(  rt	        |||      |d|dz    <   t	        |||      |d|dz    <   6 |rt        j                  d      |d<   t        j                  |      | _        y )Nr   block_r   r|   )r   pool)	rh   ri   r   r-  r  r   	MaxPool2dr  group)	r,   r  r  r  r  r  blocksr;  rv   s	           r-   ri   z%FlavaImageCodebookLayerGroup.__init__  s    z" 	cAAv+B7HV`+aAw'(+B8XWa+bAw'(		c \\a8F6N]]6*
r4   r   r$   c                 $    | j                  |      S r]   )r  r  s     r-   r   z$FlavaImageCodebookLayerGroup.forward  s    zz!}r4   rx  )
r5   r6   r7   r   rI   ri   r9   r   r   r   r   s   @r-   r  r    sH    +3 +C +# +QT +`d + %,, r4   r  a"  
    The FLAVA's image codebook model inspired from DALL-E's original encoder. Outputs raw hidden states and can be used
    to generate image tokens for an image based on DALL-E's vocab. Used to generate labels for MIM. Use
    `get_codebook_indices` to get image tokens for an image.
    c                        e Zd ZU dZeed<   dZdZdZdede	f fdZ
dej                  dej                  fd	Zdej                  dej                  fd
Zdej                  dej                  fdZ xZS )FlavaImageCodebookmodelra   r   r_  Frt  c                    t         |   |       || _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        | j                  | j
                  z  }t               }t        j                         |d<   t        j                  d| j                  z  | j                  dd      |d<   t               }t        j                  | j                  d| j                  z  dd      |d	<   t        | j
                  |d| j                  z  d| j                  z        |d
<   t        | j
                  |d| j                  z  d| j                  z        |d<   t        | j
                  |d| j                  z  d| j                  z        |d<   t        | j
                  |d| j                  z  d| j                  z  d      |d<   t        j                  |      |d<   t        j                  |      | _        | j                          | j                  j                   r| j#                         D ]	  }d|_         y y )Nrelu   r   r   r  conv   r   inputgroup_1r|   group_2r  group_3F)r  group_4r  )rh   ri   ra   
num_groupsinput_channelsnum_blocks_per_grouprm   r   r   r   r  r   r  r  r  rg  freeze
parametersrequires_grad)r,   ra   rt  r  output_blocksr  paramrv   s          r-   ri   zFlavaImageCodebook.__init__$  s   
 	  ++$33$*$?$?!!-- ++__t'@'@@
# "	f "		!d.>.>*>]^hi jf))D$7$7T=M=M9M[\fghw8%%z1t7G7G3GTM]M]I]
y 9%%z1t7G7G3GTM]M]I]
y 9%%z1t7G7G3GTM]M]I]
y 9%%z1t7G7G3GTM]M]I]hm
y ==7xmmF+;;* ,&+#, r4   r$   c                 t    dt          dt          d | j                  |      }t        j                  |d      S )NaI  
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

        Examples:
        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoImageProcessor, FlavaImageCodebook

        >>> model = FlavaImageCodebook.from_pretrained("E")
        >>> image_processor = AutoImageProcessor.from_pretrained("a  ")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

        >>> outputs = model.get_codebook_indices(**inputs)
        ```
        r   )axis)_CHECKPOINT_FOR_CODEBOOK_DOCr  r9   argmaxr,   r   z_logitss      r-   get_codebook_indicesz'FlavaImageCodebook.get_codebook_indicesP  sI    9 :V8V WCC_B` a		4 ;;|,||H1--r4   c                 \    | j                  |      } t        j                  d      |      S )Nr   r   )r  r   Softmaxr  s      r-   get_codebook_probsz%FlavaImageCodebook.get_codebook_probsn  s&    ;;|, rzza **r4   c                 0   dt          dt          d t        |j                        dk7  rt        d|j                   d      |j                  d   | j                  k7  r(t        d|j                  d    d	| j                         | j                  |      S )
NaJ  
        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
                Pixel values. Codebook pixel values can be obtained using [`AutoImageProcessor`] by passing
                `return_codebook_pixels=True`. See [`FlavaImageProcessor.__call__`] for details.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoImageProcessor, FlavaImageCodebook

        >>> model = FlavaImageCodebook.from_pretrained("r  a  ")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = image_processor([image], return_codebook_pixels=True, return_tensors="pt")
        >>> inputs = dict(pixel_values=inputs.codebook_pixel_values)

        >>> outputs = model(**inputs)
        >>> print(outputs.shape)
        (1, 196)
        ```
        r  zinput shape z
 is not 4dr   z
input has z channels but model built for )r  lenr   r   r  r  )r,   r   rt  s      r-   r   zFlavaImageCodebook.forwardr  s    9 :V8V WCC_B` a		: |!!"a'|L,>,>+?zJKKa D$7$77z,*<*<Q*?)@@^_c_r_r^stuu{{<((r4   )r5   r6   r7   rY  r   r;   rz  rZ  r[  r   ri   r9   r   r  r  r:   r   r   r   s   @r-   r  r    s      $$$O!&+#*,(*, *,X. .%,, .<+u|| + +")E$5$5 ")ELL ")r4   r  c                   $     e Zd Z fdZd Z xZS )FlavaPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y )Nr   )rh   ri   r   r   rm   r  r   r  r  r	   transform_act_fnr   r   r   s     r-   ri   z%FlavaPredictionHeadTransform.__init__  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr4   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r]   )r  r  r   r  s     r-   r   z$FlavaPredictionHeadTransform.forward  s4    

=1--m<}5r4   r5   r6   r7   ri   r   r   r   s   @r-   r  r    s    Ur4   r  c                   &     e Zd Zd fd	Zd Z xZS )rO  c                 H   t         |           || _        t        |      | _        t        j                  |j                  |j                  d      | _	        t        j                  t        j                  |j                              | _        ||| j                  _        y y )NTr   )rh   ri   ra   r  	transformr   r   rm   r   decoderrk   r9   rl   r   weight)r,   ra   r  rv   s      r-   ri   z"FlavaMaskedPredictionHead.__init__  sw    5f=yy!3!3V5F5FTRLLV->->!?@	"(DLL r4   c                 J    | j                  |      }| j                  |      }|S r]   )r  r  r  s     r-   r   z!FlavaMaskedPredictionHead.forward  s"    NN1LLOr4   r]   r   r   s   @r-   rO  rO    s    )r4   rO  c                   $     e Zd Z fdZd Z xZS )FlavaITMHeadc                     t         |           || _        t        |      | _        t        j                  |j                  d      | _        y )Nr|   )	rh   ri   ra   r?  rf  r   r   rm   seq_relationshipr   s     r-   ri   zFlavaITMHead.__init__  s:    !&) "		&*<*<a @r4   c                 J    | j                  |      }| j                  |      }|S r]   )rf  r
  r  s     r-   r   zFlavaITMHead.forward  s$    KKN!!!$r4   r   r   s   @r-   r  r    s    Ar4   r  c                   $     e Zd Z fdZd Z xZS )FlavaGlobalContrastiveHeadc                 R    t         |           || _        |j                  | _        y r]   )rh   ri   ra   global_backprop_contrastiver   s     r-   ri   z#FlavaGlobalContrastiveHead.__init__  s#    +1+M+M(r4   c                     t        j                  |      }t         j                  j                         rt         j                  j	                         s8t        j
                  |j                  d      |j                        }|g}|g}n{|j                  d      }t         j                  j                         }	| j                  rgt         j                  j                  j                  j                  |      }t         j                  j                  j                  j                  |      }nt        |	      D 
cg c]  }
t        j                  |       }}
t        |	      D 
cg c]  }
t        j                  |       }}
t         j                  j                  ||       t         j                  j                  ||       |t         j                  j                         z  t        j
                  ||j                        z   }t        j                   |      }t        j                   |      }t        j"                  ||j%                  dd            |z  }t        j"                  ||j%                  dd            |z  }|||fS c c}
w c c}
w )Nr   r  r   )r9   expdistributedis_availableis_initializedr   r~   r   get_world_sizer  r   r   
all_gatherr-  
zeros_likeget_rankr   r   r   )r,   r   r    rW  temperaturelabelsimage_embeddings_alltext_embeddings_alllocal_batch_size
world_sizer   logits_per_imagelogits_per_texts                r-   r   z"FlavaGlobalContrastiveHead.forward  s   ii,  --/u7H7H7W7W7Y\\"2"7"7":CSCZCZ[F$4#5 #2"3/44Q7**99;J// (-'8'8';';'F'F'Q'QRb'c$&+&7&7&:&:&E&E&P&PQ`&a#SXYcSd'ea(8(8(I'e$'eSXYcSd&eau'7'78H'I&e#&e!!,,-ACST!!,,-@/R%(9(9(B(B(DDu|| )9)@)@H F  %yy)=>#ii(;< <<(8:M:W:WXY[\:]^all,,8L8V8VWXZ[8\]`kk&88 (f&es   9J$Jr   r   s   @r-   r  r    s    N
9r4   r  zk
    The FLAVA model for pretraining which outputs losses, embeddings, logits and transformer outputs.
    c            '       `    e Zd ZdddddZddedej                  dz  f fd	Zd
ej                  fdZ
e	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  dej                  dz  dedz  dededz  dedz  deej                     ez  f$d       Z xZS )!FlavaForPreTrainingzmmm_text_head.decoder.biaszmim_head.decoder.biaszmlm_head.decoder.biaszmmm_image_head.decoder.bias)zmmm_text_head.biaszmim_head.biaszmlm_head.biaszmmm_image_head.biasNra   image_codebookc                 b   t         |   |       t        |      | _        || _        | j                  &|j
                  rt        |j                        | _        t        |j                        | _
        t        |j                        | _        t        |      | _        t        |j                        | _        t        |j                        | _        t#        |      | _        |j                  j&                  | _        |j                  j&                  | _        |j,                  | _        |j.                  | _        |j0                  | _        |j2                  | _        |j4                  | _        |j6                  | _        |j8                  | _        |j:                  | _        | j=                          y)z
        image_codebook ([`nn.Module`]):
            If passed, the image codebook will be set to this. Otherwise, it will be initialized using the
            image_codebook_config defined in the config first as the first parameter.
        N)rh   ri   rU  rI  r#  init_codebookr  image_codebook_configrO  r  mim_headr  mlm_headr  itm_headmmm_image_headmmm_text_headr  global_contrastive_headr   image_vocab_sizetext_vocab_size
mlm_weight
mim_weightglobal_contrastive_weightce_ignore_index
itm_weightmmm_image_weightmmm_text_weight skip_unmasked_multimodal_encoderrg  )r,   ra   r#  rv   s      r-   ri   zFlavaForPreTraining.__init__  sQ    	 '
,&6+?+?"4V5Q5Q"RD 2&2E2EF1&2D2DE$V,78K8KL6v7I7IJ'A&'I$ & 3 3 > >%11<< ++ ++)/)I)I&%55 ++ & 7 7%55060W0W-r4   r   c                 n    |j                         dkD  r!|j                  |j                  d      d      }|S )Nr|   r   r{   )r   r   r~   r  s     r-   _resize_to_2dz!FlavaForPreTraining._resize_to_2d  s,    557Q;qvvay"%Ar4   r   input_ids_maskedr   codebook_pixel_valuesr   r   r   r   r  r6  
mlm_labels
mim_labels
itm_labelsr   r1  r2  return_lossr$   c                    ||n| j                   j                  }||n| j                   j                  }|
|
n| j                  }
||t        j                  d       |}| j                  ||||||	|
||d
      }| j                  |||||	|||d	      }d}|j                  }|j                  }|j                  }|j                  }|j                  }dx}x}x}x}x}x} }!dx}"x}#x}$}%dx}&x}'}(||C|A|r?| j                  t        d      |t        d      | j                  j                  |      }| j                  dkD  r|||})|| j                  |      }| j                  |      }| j                   ||j#                  d      <   |)dd|j%                  d	       dddf   })|j#                  | j                         }*||*   }+|)|*ddf   })| j'                  |)      }"|rjt(        j*                  j-                  |"j/                  d
| j0                        |+j/                  d
            }|| j                  z  }n| j'                  |)      }"| j2                  dkD  r|||},|| j                  |      }|,dd|j%                  d	       dddf   },|j#                  | j                         }*||*   }-|,|*ddf   },| j5                  |,      }#|rjt(        j*                  j-                  |#j/                  d
| j6                        |-j/                  d
            }|| j2                  z  }n| j5                  |,      }#| j8                  dkD  r|| j;                  |      }&||j#                  d      }.t=        j>                  |.jA                         |.|.jC                  dg            }|r/t(        j*                  j-                  |&|      }!|!| j8                  z  }!|||   }|||   }|
||   }||   }|| jD                  dkD  r|})|j%                  d	      d	z
  }/|)dddd|/z   ddf   })|| j                  |      }| j                  |      }| j                   ||j#                  d      <   |j#                  | j                         }*||*   }+|)|*ddf   })| jG                  |)      }%|rjt(        j*                  j-                  |%j/                  d
| j0                        |+j/                  d
            }|| jD                  z  }n| jG                  |)      }%|| jH                  dkD  r|},|,dd|j%                  d	       dddf   },|| j                  |      }|j#                  | j                         }*||*   }-|,|*ddf   },| jK                  |,      }$|rjt(        j*                  j-                  |$j/                  d
| j6                        |-j/                  d
            }|| jH                  z  }n| jK                  |,      }$|x|u| jL                  dkD  re| j                  jO                  |dddddf         }0t(        j*                  jQ                  |0d
      }0| j                  jS                  |dddddf         }1t(        j*                  jQ                  |1d
      }1| jT                  r8| j                  jV                  jX                  j[                  t\        t^               | ja                  |1|0| j                  jV                        \  }'}(}2||'|   }'|(|   }(|2|   }2|rWt(        j*                  j-                  |'|2      }3t(        j*                  j-                  |(|2      }4|3|4z   dz  } | | jL                  z  } tc        |||!| ||      }5|r0|5je                         s tg        d |5ji                         D              }|s.||jj                  |jj                  jm                         nd||jn                  |jn                  jm                         nd|j                  |jp                  |jp                  jm                         nd||jj                  |jj                  jm                         nd||jn                  |jn                  jm                         nd||jp                  |jp                  jm                         nd|"|#|&|'|'|%|$f}6|r|5je                         s||5f|6z   }6ts        d |6D              S tu        d%i d|d|5d|d|jj                  d|d|jn                  d|j                  d|jp                  d|d|jj                  d|d|jn                  d|d|jp                  d|"d|#d |&d!|'d"|(d#|%d$|$S )&a  
        input_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        input_ids_masked (`torch.LongTensor` of shape `(batch_size, text_seq_len)`):
            Indices of input sequence tokens in the vocabulary. These ones are the masked version of the original task
            to be used with MLM. Indices can be obtained using [`AutoTokenizer`] along with
            [`DataCollatorForMaskedLanguageModeling`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids)
        codebook_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_image_patches, patch_size, patch_size, 3)`, *optional*):
            Pixel values for image patches that are used to compute the image codebook labels for masked image modeling.
        token_type_ids (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:
            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.
            [What are token type IDs?](../glossary#token-type-ids)
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, image_num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        image_attention_mask (`torch.FloatTensor` of shape `(batch_size, image_num_patches)`, *optional*):
            Mask to avoid performing attention on padding token indices specifically for images. Mask values selected
            in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        skip_unmasked_multimodal_encoder (*bool*, *optional*):
            Skip any calculations for multimodal encoder for unmasked inputs. FLAVA pretraining doesn't need unmasked
            multimodal embeddings or outputs as of now.
        mlm_labels (`torch.LongTensor` of shape `(batch_size, text_seq_len)`, *optional*):
            Labels for computing the left-to-right language and multimodal masked modeling loss (next word prediction).
            Indices should be in `[-100, 0, ..., text_config.vocab_size - 1]` (see `input_ids` docstring). Tokens with
            indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0,
            ..., text_config.vocab_size - 1]`.
        mim_labels (`torch.LongTensor` of shape `(batch_size, image_num_patches)`, *optional*):
            Labels for computing the image and multimodal masked modeling loss. Indices should be in `[-100, 0, ...,
            image_config.vocab_size - 1]`. Tokens with indices set to `-100` are ignored (masked), the loss is only
            computed for the tokens with labels in `[0, ..., image_config.vocab_size - 1]`. If not passed, they are
            generated automatically using the image codebook assigned to the model. By default, it uses
            [`FlavaImageCodebook`]. See [`FlavaImageCodebook`] to understand how to generate mim_labels.
        itm_labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
            Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
            The pairs with 0 will be skipped for calculation of MMM and global contrastive losses as well.
        return_loss (`bool`, *optional*, default to None):
            Whether to return calculated loss or not.

        Examples:
        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import FlavaForPreTraining, AutoProcessor

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> model = FlavaForPreTraining.from_pretrained("facebook/flava-full")
        >>> processor = AutoProcessor.from_pretrained("facebook/flava-full")

        >>> text = ["a photo of a cat"]

        >>> inputs = processor(
        ...     images=[image],
        ...     text=text,
        ...     return_masks=True,
        ...     return_codebook_pixels=True,
        ...     padding=True,
        ...     max_length=77,
        ...     return_tensors="pt",
        ... )


        >>> output = model(**inputs)
        ```
        Nz`input_ids_masked` isn't passed which means MLM loss won't be calculated correctlySetting it to `input_ids` so that model can work. Please pass it if this is unintentional. This is usually OKAY if you are doing inference on unmasked text...T)
r   r   r   r   r   r  r  r   r1  r2  )	r   r   r   r   r  r   r   r1  r2  z`return_loss` is set to True but the image codebook is not initialized and no `mim_labels`  have been passed. Reinstantiate the model with `init_codebook` set to True or pass in your custom `mim_labels`z`codebook_pixel_value` are required to generate `mim_labels` if loss is expected. Call `AutoProcessor` with `return_codebook_pixels` set to Truer   r   r{   r|   r   )r?   r@   rA   rB   rC   rD   c              3   (   K   | ]
  }||nd  y wrD  r<   )r*   rL   s     r-   r.   z.FlavaForPreTraining.forward.<locals>.<genexpr>Z  s     _T%5T1<_s   c              3   &   K   | ]	  }||  y wr]   r<   )r*   r   s     r-   r.   z.FlavaForPreTraining.forward.<locals>.<genexpr>{  s     8qai8r5  rL   rM   r   r   r    r!   r"   r#   rN   rO   rP   rQ   rR   rS   rT   rU   rV   rW   rX   rY   rZ   r<   );ra   rs  r>  r6  loggerwarningrI  r   r    r"   r#  RuntimeErrorr   r  r0  r8  r2  ner~   r'  r   r   cross_entropyr   r-  r/  r(  r.  r3  r)  r9   whereanynewr4  r*  r5  r+  r1  r  	normalizer  trainingrW  dataclamp_LOGIT_SCALE_CLAMP_MINLOGIT_SCALE_CLAMP_MAXr,  r>   rG   sumrF   r   r)   r!   r#   r1   rK   )7r,   r   r9  r   r:  r   r   r   r   r  r6  r;  r<  r=  r   r1  r2  r>  rt  flava_outputflava_masked_outputpos_maskr   r    rN   rP   rR   
total_lossmim_lossmlm_lossmmm_text_lossmmm_image_lossgc_lossitm_lossrT   rU   rZ   rY   rV   r  r   sequence_for_imagemasked_tokensmim_labels_filteredsequence_for_textmlm_labels_filtered	pos_pairs	end_indextext_embeddingimage_embedding	gc_labelsgc_loss_imagegc_loss_textflava_lossesr  s7                                                          r-   r   zFlavaForPreTraining.forward#  s
   D &1%<k$++B]B]%0%<k$++BYBY 0; -66 	) #	(=NN?
  )zz%))%!5 %E/!5 " 
  #jj&%))!5+/!5 ) 

 '88&66"5"F"F!4!D!D':'P'P$aee
eXee=e>eGV^GKK
KZK/4D:>>
>% #.2N2Z!k&&.&; 
 )0$Y  "00EEF[\
 ??Q#:#FKgKo!8%!//
;
"&"4"4_"E7;7K7K
?--d34%7JOOA<N;N;PRS8S%T" *d.B.B C&0&?#%7q8H%I"!]]+=>
!}}::"D,A,ABDWD\D\]_D` H /H!]]+=>
 ??Q#9#EJfJn 6%!//
;
$5a*//!:L9L9NPQ6Q$R! *d.B.B C&0&?#$5mQ6F$G!!]]+<=
!}}::"D,@,@ACVC[C[\^C_ H /H!]]+<=
 ??Q#?#K'CDJ%&MM!,	 ;;y}}	9==RVQWCXY!}}:::zRH/H/;3OPX3Y0)!+H!5J)!+H!5J&5h&?O (38M8MPQ8Q!=/44Q7!;I!3Aq1y=7H!4K!L%!//
;
"&"4"4_"E7;7K7K
?--d34 *d.B.B C&0&?#%7q8H%I"#'#6#67I#J %']]%@%@(--b$2G2GHJ]JbJbceJf&N #d&;&;;N#'#6#67I#J  (38L8Lq8P < 1!6L6Q6QRS6T5T5VXY2Y Z%!//
;
 *d.B.B C&0&?#$5mQ6F$G!"&"4"45F"G$&MM$?$?',,R1E1EFH[H`H`acHd%M "T%9%99M"&"4"45F"G 'O,GDLjLjmnLn!ZZ771a8PQN]]44^4LN"jj99:J1aQR7:STO mm55o25NO}}

&&++223HJ_`;?;W;W1G1G<8oy
 ##3H#= "1(";%h/	 " ; ;<Li X!}}::?IV(<71<4999"&$"
 |446_I\I\I^__J 8D8Q8Q8]))224cg7C7O7O7[((113ae22=I=[=[=g..779mq'?R?_?_?k#0099;qu&>Q>]>]>i#//88:os,&88D $55>>@   +F. <#8#8#:   8F888( 

"
 .
 &22	

 ,
 %00
 #/"D"D
 +<<
 %<
 !4 @ @
 $:
  3>>
 *F
 &9%J%J
 "
  "!
" "#
$ *:%
& )8'
( .)
* ,+
 	
r4   r]   )NNNNNNNNNNNNNNTNN)r5   r6   r7   _tied_weights_keysr   r   r{  ri   r9   r   r8  r   r  r:   rI   r1   rK   r   r   r   s   @r-   r"  r"    s    ;00<	!{ !BII<L !Fu|| 
  .24815:>.2.2/304488<*.*.*.)-%)#'#'%o
##d*o
  **T1o
 ''$.	o

  %0047o
 t+o
 t+o
 ,o
 &&-o
 $llT1o
 +/+o
 LL4'o
 LL4'o
 LL4'o
  $;o
  #!o
" D[#o
$ D[%o
( 
u||	8	8)o
 o
r4   r"  )r"  r  r^  rU  rS  rH  r}  )Jr8   r   r   r   dataclassesr   typingr   r9   r    r   rP  activationsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   configuration_flavar   r   r   r   r   
get_loggerr5   rB  r  rN  rO  r   r   r>   rK   r{  r`   ro   r   r   r   r  r  r  r  r*  r?  rH  r^  r}  rS  rU  r  r  r  r  r  rO  r  r  r"  __all__r<   r4   r-   <module>ru     s+      # !    & ! 9 K - & j j  
		H	%>   &)99<QQ  
{ 
 
< 
+  D Wt Wt Wtx_299 _H!bii !H3")) 3lA AHbii $RYY ,		 ""))  )+ )X$
299 $
N"))  S? S S6 N
* N
 N
b `
) `
 `
F N
/ N
 N
b ^
% ^
 ^
B			 *Cbii C"299 ( w)- w)w)t299 "		  
299 
%9 %9P 
a
. a

a
Hr4   