
    qi9                       d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z	ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0  e)jb                  e2      Z3dVde	jh                  de	jj                  de6dz  fdZ7	 dWde	jp                  de	jj                  de	jr                  de6fdZ:ee' G d de                    Z;e e'd        G d! d"e%                    Z<e e'd#        G d$ d%e%                    Z= G d& d'e
j|                        Z?	 dXd(e
j|                  d)e	jh                  d*e	jh                  d+e	jh                  d,e	jh                  dz  d-e@d.e@fd/ZA G d0 d1e
j|                        ZB G d2 d3e
j|                        ZC G d4 d5e      ZD G d6 d7e
j|                        ZE G d8 d9e
j|                        ZF G d: d;e
j|                        ZG G d< d=e
j|                        ZH G d> d?e
j|                        ZI G d@ dAe      ZJ G dB dCe
j|                        ZKe' G dD dEe!             ZL G dF dGeL      ZM G dH dIeL      ZN e'dJ        G dK dLeLe             ZO G dM dNe
j|                        ZP e'dO        G dP dQeL             ZQ e'dR        G dS dTeLe             ZRg dUZSy)YzPyTorch KOSMOS-2 model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)is_flash_attention_requested   )Kosmos2ConfigKosmos2TextConfigKosmos2VisionConfigmaskdtypetgt_lenc                 2   | j                         \  }}||n|}| ddddddf   j                  |d||      j                  |      }d|z
  }|j                  |j                  t        j
                        t	        j                  |      j                        S )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    Nr         ?)sizeexpandtomasked_filltorchboolfinfomin)r"   r#   r$   bszsrc_lenexpanded_maskinverted_masks          ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/kosmos2/modeling_kosmos2.py_expand_maskr4   /   s     99;LC ,g'GD$)*11#q'7KNNuUM-'M$$]%5%5ejj%A5;;uCUCYCYZZ    input_ids_shapedevicepast_key_values_lengthc                    | \  }}t        j                  ||ft        j                  |      j                  |      }t        j                  |j                  d      |      }|j                  ||dz   j                  |j                  d      d      k  d       |j                  |      }|dkD  r0t        j                  t        j                  ||||      |gd      }|ddddddf   j                  |d|||z         S )zB
    Make causal mask used for bi-directional self-attention.
    )r7   r   r   r#   r7   dimN)r+   fullr-   r.   aranger'   masked_fill_viewr)   catzerosr(   )r6   r#   r7   r8   r/   r$   r"   	mask_conds           r3   _make_causal_maskrE   =   s     #LC::w(%++e*<*@*@PDTYYr]6:Ii9q="6"6tyy}a"HH!L775>D!yy%++g/EU[abdhioqrdAq !((a'DZ:Z[[r5   c                   @    e Zd ZU dZdZeej                     dz  ed<   y)'BaseModelOutputWithProjectionAttentionsaq  
    projection_attentions (`tuple(torch.FloatTensor)`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    Nprojection_attentions)	__name__
__module____qualname____doc__rH   tupler+   FloatTensor__annotations__ r5   r3   rG   rG   N   s%     >B5!2!23d:Ar5   rG   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZe	dz  ed<   dZ
eej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   dZeej                     dz  ed<   dZeed	<   d
ee   fdZy)Kosmos2ModelOutputa  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_embedsrH   vision_model_outputreturnc                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw)text_model_outputrY   Ngetattrto_tuple.0kselfs     r3   	<genexpr>z.Kosmos2ModelOutput.to_tuple.<locals>.<genexpr>   =      
  LLDGRYZ^`aRbRkRkRmm
   -0rM   keysre   s   `r3   ra   zKosmos2ModelOutput.to_tuple   #     
YY[
 
 	
r5   )rI   rJ   rK   rL   rT   r+   rN   rO   rU   r
   rV   rM   rW   rX   rH   rY   r   r   ra   rP   r5   r3   rS   rS   ]   s    & 37u((4/6$(OUT\(59M5**+d2926Je''(4/6-1L%##d*1=A5!2!23d:A6:3:
%* 
r5   rS   zC
    Model output class for `Kosmos2ForConditionalGeneration`.
    c                   H   e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   dZeej                     dz  ed	<   dZeed
<   dee   fdZy)*Kosmos2ForConditionalGenerationModelOutputa*  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    NlosslogitsrU   rV   rW   rX   rH   rY   rZ   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) ywr]   r_   rb   s     r3   rf   zFKosmos2ForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>   rg   rh   ri   rk   s   `r3   ra   z3Kosmos2ForConditionalGenerationModelOutput.to_tuple   rl   r5   )rI   rJ   rK   rL   ro   r+   rN   rO   rp   rU   r
   rV   rM   rW   rX   rH   rY   r   r   ra   rP   r5   r3   rn   rn      s    . &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/6-1L%##d*1=A5!2!23d:A6:3:
%* 
r5   rn   c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )Kosmos2VisionEmbeddingsconfigc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   r:   
persistent)super__init__ru   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr+   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr?   r(   re   ru   	__class__s     r3   r   z Kosmos2VisionEmbeddings.__init__   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr5   
embeddingsheightwidthrZ   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr:   g      ?r   r|   bicubicF)r'   modealign_cornersr<   )shaper   weight	unsqueezer+   jit
is_tracingr}   r   r   reshapepermuter   
functionalinterpolaterA   rB   )re   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr=   
new_height	new_widthsqrt_num_positionss                r3   interpolate_pos_encodingz0Kosmos2VisionEmbeddings.interpolate_pos_encoding   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr5   pixel_valuesc                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model ().r#   r|   r   r:   r<   )r   r   
ValueErrorr   r   r#   r)   flatten	transposer   r(   r+   rB   r   r   r}   )re   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   s              r3   forwardzKosmos2VisionEmbeddings.forward   s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr5   F)rI   rJ   rK   r!   r   r+   Tensorintr   rN   r   __classcell__r   s   @r3   rt   rt      se    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r5   rt   modulequerykeyvalueattention_maskscalingdropoutc                 p   t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |d      }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr:   r<   ptrainingr   r|   )	r+   matmulr   r   r   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r3   eager_attention_forwardr   	  s     <<s}}R'<=GL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r5   c                        e Zd ZdZ fdZ	 	 	 d
dej                  dej                  dz  dej                  dz  dedz  deej                  ej                  dz  f   f
d	Z	 xZ
S )Kosmos2VisionAttention=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: r         F)r   r   ru   r   r   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   s     r3   r   zKosmos2VisionAttention.__init__"  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar5   NrV   r   causal_attention_maskoutput_attentionsrZ   c           
         |j                   \  }}}| j                  |      }| j                  |      }	| j                  |      }
|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	|
j	                  ||| j
                  | j                        j                  dd      }
t        | j                        s||||z   }n||}n	|du| _
        t        j                  | j                  j                  t              } || ||	|
|| j                  | j                  | j                   sdn| j"                        \  }}|j%                  |||      j'                         }| j)                  |      }|sd}||fS )#Input shape: Batch x Time x Channelr   r|   N        )r   r   r   )r   r   r   r   rA   r   r   r   r   ru   r   r   get_interface_attn_implementationr   r   r   r   r   r   r   )re   rV   r   r   r   r   
seq_lengthr   queriesrj   valuesattention_interfacer   r   s                 r3   r   zKosmos2VisionAttention.forward6  s    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc ,DKK8).C.O!/2G!G&2!62$>DN(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0 LL((r5   )NNF)rI   rJ   rK   rL   r   r+   r   r,   rM   r   r   r   s   @r3   r   r     s}    GB. /359)./)||/) t+/)  %||d2	/)
  $;/) 
u||U\\D00	1/)r5   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Kosmos2VisionMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y N)r   r   ru   r	   
hidden_actactivation_fnr   r   r   intermediate_sizefc1fc2r   s     r3   r   zKosmos2VisionMLP.__init__j  sd    #F$5$5699V//1I1IJ99V55v7I7IJr5   rV   rZ   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   re   rV   s     r3   r   zKosmos2VisionMLP.forwardq  s4    /**=9/r5   )rI   rJ   rK   r   r+   r   r   r   r   s   @r3   r   r   i  s$    KU\\ ell r5   r   c                        e Zd Zdef fdZ	 d
dej                  dej                  dedz  dee	   de
ej                     f
d	Z xZS )Kosmos2VisionEncoderLayerru   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y Neps)r   r   r   r   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   s     r3   r   z"Kosmos2VisionEncoderLayer.__init__z  sm    ++/7<<F<Q<QR#F+<<F<Q<QRr5   rV   r   r   Nr   rZ   c                     |}| j                  |      } | j                  d|||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )rV   r   r   rP   )r  r   r  r  )re   rV   r   r   r   residualr   outputss           r3   r   z!Kosmos2VisionEncoderLayer.forward  s    " !((7&4dnn '
')/'
 	'
#| !=0 ((7/ =0 "&Gr5   r   )rI   rJ   rK   r!   r   r+   r   r,   r   r   rM   rN   r   r   r   s   @r3   r   r   y  si    S2 S */	&||& &  $;	&
 +,& 
u  	!&r5   r   c                        e Zd ZdZdef fdZe	 	 	 	 ddej                  dz  de	dz  de	dz  de	dz  d	e
ez  f
d
       Z xZS )Kosmos2VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Kosmos2VisionEncoderLayer`].

    Args:
        config: Kosmos2VisionConfig
    ru   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r   r   ru   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)re   ru   r   r   s      r3   r   zKosmos2VisionEncoder.__init__  sQ    mmPUV\VnVnPo$p1%>v%F$pq&+# %qs   A#Nr   r   output_hidden_statesreturn_dictrZ   c                 j   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	t	        | j
                        D ]*  \  }
}|r||	fz   } ||	|fd|i|}|d   }	|s"||d   fz   }, |r||	fz   }t        |	||      S )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrP   r   r   r   )rT   rV   rW   )ru   r   r  use_return_dict	enumerater  rG   )re   inputs_embedsr   r   r  r  r   encoder_statesall_attentionsrV   idxencoder_layerlayer_outputss                r3   r   zKosmos2VisionEncoder.forward  s   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8 	FC#!/=2B!B) #4 	M *!,M !/=3C2E!E	F  +}.>>N6+>Vd
 	
r5   )NNNN)rI   rJ   rK   rL   r!   r   r   r+   r   r,   rM   r   r   r   r   s   @r3   r  r    s    ,2 ,  /3)-,0#'=
 t+=
  $;	=

 #Tk=
 D[=
 
	 =
 =
r5   r  c                        e Zd Zdef fdZ	 	 	 	 	 ddej                  dz  dedz  dedz  dededz  d	ee	z  fd
Z
 xZS )Kosmos2VisionTransformerru   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r   )r   r   ru   r   rt   r   r   r   r   pre_layrnormr  encoderpost_layernorm)re   ru   r   r   s      r3   r   z!Kosmos2VisionTransformer.__init__  sj    &&	1&9LL8M8MN+F3 ll9&:O:OPr5   Nr   r   r  r   r  rZ   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  ||      }| j                  |      }| j                  ||||      }|d   }|d d dd d f   }	| j                  |	      }	|s
||	f|dd  z   S t        ||	|j                  |j                        S )Nz You have to specify pixel_values)r   )r  r   r  r  r   r   )rT   pooler_outputrV   rW   )ru   r   r  r  r   r   r  r  r  r   rV   rW   )
re   r   r   r  r   r  rV   encoder_outputsrT   pooled_outputs
             r3   r   z Kosmos2VisionTransformer.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@Ogh))-8,,'/!5#	 ' 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r5   NNNFN)rI   rJ   rK   r!   r   r+   rN   r,   rM   r   r   r   r   s   @r3   r  r    s    Q2 Q 26)-,0).#''
''$.'
  $;'
 #Tk	'

 #''
 D['
 
+	+'
r5   r  c                   4    e Zd ZdZddedededz  f fdZddedededz  fdZeddedededz  fd	       Z e	j                         	 	 	 	 dd
e	j                  dz  de	j                  dz  dede	j                  dz  fd       Zed        Zedd       Z xZS )(Kosmos2TextSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.Nr   embedding_dimpadding_idxc                     t         |           d| _        || _        || _        || _        | j                  || j                  z   ||       y )Nr|   )r   r   offsetr   r'  r(  make_weights)re   r   r'  r(  r   s       r3   r   z1Kosmos2TextSinusoidalPositionalEmbedding.__init__7  sH    **&-$++5}kRr5   num_embeddingsc                     | j                  |||      }t        | d      r;|j                  | j                  j                  | j                  j
                        }| j                  d|d       y )Nweightsr;   Fr   )get_embeddinghasattrr)   r.  r#   r7   r   )re   r,  r'  r(  emb_weightss        r3   r+  z5Kosmos2TextSinusoidalPositionalEmbedding.make_weights@  s[    ((T4#%..t||/A/A$,,J]J].^KYFr5   c                    |dz  }t        j                  d      |dz
  z  }t        j                  t        j                  |t        j
                        j                         | z        }t        j                  | t        j
                        j                         j                  d      |j                  d      z  }t        j                  t        j                  |      t        j                  |      gd      j                  | d      }|dz  dk(  r-t        j                  |t        j                  | d      gd      }|	d||ddf<   |j                  t        j                               S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        r|   i'  r   r   r   r<   r:   N)mathlogr+   expr?   int64floatr   rB   sincosrA   rC   r)   get_default_dtype)r,  r'  r(  half_dimembs        r3   r/  z6Kosmos2TextSinusoidalPositionalEmbedding.get_embeddingH  s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r5   	input_idsr  r8   r}   c                    |L|j                         \  }}|l| j                  || j                  |      j                  |j                        }n5|j                         d d \  }}|| j                  ||| j                        }| j                  dz   |z   |z   }|| j                  j                  d      kD  r4| j                  || j                  z   | j                  | j                         | j                  j                  d|j                  d            j                  ||| j                  j                  d         j                         S )Nr:   r   r   )r'   "create_position_ids_from_input_idsr(  r)   r7   &create_position_ids_from_inputs_embedsr.  r+  r*  r'  index_selectrA   r   detach)re   r=  r  r8   r}   r/   seq_lenmax_poss           r3   r   z0Kosmos2TextSinusoidalPositionalEmbedding.forward^  s8     $>>+LC##FFt//1G "Y%%&  )--/4LC##JJ!#94;K;K 
 ""Q&03IIT\\&&q))g3T5G5GIYIYZ||((L,=,=b,ABGGWVZVbVbVhVhikVlmttvvr5   c                    | j                         dd }|d   }t        j                  |dz   ||z   dz   t        j                  | j                        }|j                  d      j                  |      j                         |z   S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr:   r   r;   r   )r'   r+   r?   longr7   r   r(   r   )r  r8   r(  input_shapesequence_lengthr}   s         r3   r@  zOKosmos2TextSinusoidalPositionalEmbedding.create_position_ids_from_inputs_embeds{  s     $((*3B/%a.||!O_{:Q>ejjYfYmYm
 %%a(//<GGILbbbr5   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
        are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:

        Returns: torch.Tensor
        r   r<   )ner   r+   cumsumtype_asrF  )r=  r(  r8   r"   incremental_indicess        r3   r?  zKKosmos2TextSinusoidalPositionalEmbedding.create_position_ids_from_input_ids  sW     ||K(,,.$||Da8@@FI__cgg"'')K77r5   r   )NNr   Nr   )rI   rJ   rK   rL   r   r   r+  staticmethodr/  r+   no_gradr   r   r@  r?  r   r   s   @r3   r&  r&  3  s   NSc S# SCRVJ SG3 Gs GQTW[Q[ G 1c 1# 1CRVJ 1 1( U]]_ *.-1&',0w<<$&w ||d*w !$	w
 llT)w w8 c c" 8 8r5   r&  c                   <    e Zd ZdZ	 	 	 	 	 ddededededz  dedz  dedz  d	edz  f fd
Z	 	 	 	 	 ddej                  dej                  dz  de
dz  dej                  dz  dedej                  dz  deej                  ej                  dz  e
dz  f   fdZ xZS )KosmosTextAttentionr   Nr   r   r   
is_decoderadd_inner_attn_layernormr{   	layer_idxc	                 x   t         	|           || _        || _        || _        || _        ||z  | _        d| _        | j                  |z  | j                  k7  rt        d| j                   d| d      | j                  dz  | _	        || _
        || _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        d | _        |r't        j&                  ||j(                        | _        y y )NTr   r   r   r   )r{   r   )r   r   ru   r   r   r   r   r   r   r   rS  rU  r   r   r   r   r   r   inner_attn_lnr   r   )
re   ru   r   r   r   rS  rT  r{   rU  r   s
            r3   r   zKosmosTextAttention.__init__  s    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTB "#!#iV=R=R!SD $r5   rV   encoder_hidden_statesrU   r   r   cache_positionrZ   c                 P   |du}|j                   dd \  }	}
| j                  |      }|j                  |	|
| j                  | j                        j                  dd      }d}|St        |t              rA|j                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|rK|I|rGj                  | j                     j                  }|j                  | j                     j                  }n| j!                  |      }| j#                  |      }|j                  |	d| j                  | j                        j                  dd      }|j                  |	d| j                  | j                        j                  dd      }|T|s|nd}j%                  ||| j                  d|i      \  }}|r)t        |t              rd|j                  | j                  <   t'        j(                  | j*                  j,                  t.              } || ||||f| j0                  sdn| j2                  | j4                  d	|\  }}|j7                  |	|
d      j9                         }| j:                  | j;                  |      }| j=                  |      }||fS )
r   Nr|   r   Fr:   rY  Tr   )r   r   )r   r   rA   r   r   r   
isinstancer   
is_updatedgetrU  cross_attention_cacheself_attention_cacher  rj   r   r   r   updater   r   ru   r   r   r   r   r   r   r   rW  r   )re   rV   rX  rU   r   r   rY  r   is_cross_attentionr   r   query_statesr\  curr_past_key_valuescurrent_states
key_statesvalue_statesr   r   r   s                       r3   r   zKosmosTextAttention.forward  s    3$>!.!4!4Ra!8
J{{=1#((ZQUQ^Q^_iijkmno
&/+>?,77;;DNNK
%+:+P+P(+:+O+O('6$2D.-/"=*-44T^^DIIJ/66t~~FMML^4J;;~6L#RWaabcefgJ',,ZT^^T]][eefgijkL*7It+?+F+Fdnn?OQ_>`,(
L &*_FY*ZAEO..t~~>(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ "))*j"EPPR),,[9KmmK0L((r5   )r   FFTNr$  )rI   rJ   rK   rL   r   r7  r,   r   r+   r   r
   rM   r   r   r   s   @r3   rR  rR    s   G "'05 !%$T $T 	$T
 $T 4K$T #'+$T Tk$T $;$TR 6:(,.2"'.2J)||J)  %||d2J) 	J)
 t+J)  J) t+J) 
u||U\\D0%$,>	?J)r5   rR  c                   *     e Zd Zdef fdZd Z xZS )Kosmos2TextFFNru   c                    t         |           |j                  | _        t        |j                     | _        |j                  | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y r   )r   r   r   r	   activation_functionr   activation_dropoutr   r   r   ffn_dimr   r   r   r   ffn_layernormr   s     r3   r   zKosmos2TextFFN.__init__  s    ~~#F$>$>?"(";";99V--v~~>99V^^V-=-=>\\&..f>S>STr5   c                 b   | j                  | j                  |            }t        j                  j	                  || j
                  | j                        }| j                  |      }| j                  |      }t        j                  j	                  || j                  | j                        }|S )Nr   )	r   r   r   r   r   rk  r   rm  r   r   s     r3   r   zKosmos2TextFFN.forward$  s    **488M+BC--mt?V?Vaeanan-o**=9/--mt||VZVcVc-dr5   )rI   rJ   rK   r    r   r   r   r   s   @r3   rh  rh    s    
U0 
Ur5   rh  c                   F    e Zd Zddef fdZ	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  dedz  d	edz  d
edz  dej                  dz  de	ej                  e	ej                  ej                  f   dz  f   fdZ xZS )Kosmos2TextBlockNru   c           	         t         |           |j                  | _        t        || j                  |j                  |j
                  dd|      | _        |j                  | _        t        j                  | j                  |j                        | _        |j                  ret        || j                  |j                  |j
                  dd|      | _        t        j                  | j                  |j                        | _        t        |      | _        t        j                  | j                  |j                        | _        y )NT)r   r   r   rS  rT  rU  r   F)r   r   r   rR  attention_headsr   r   r   r   r   r   self_attn_layer_normadd_cross_attentionencoder_attnencoder_attn_layer_normrh  ffnfinal_layer_norm)re   ru   rU  r   s      r3   r   zKosmos2TextBlock.__init__/  s    )),nn,,,,%)
 ~~$&LLVEZEZ$[!%% 3.. 0000).#!D ,.<<FLaLa+bD(!&) "T^^AVAV Wr5   rV   r   rX  encoder_attention_maskrU   r   	use_cacherY  rZ   c	           
      T   |}
| j                  |      } | j                  d|||||d|	\  }}t        j                  j	                  || j                  | j
                        }|
|z   }d }|t        | d      st        d|  d      |}
| j                  |      } | j                  d||||||d|	\  }}t        j                  j	                  || j                  | j
                        }|
|z   }|}
| j                  |      }| j                  |      }|
|z   }|f}|r|||fz  }|S )N)rV   rU   r   r   rY  r   ru  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)rV   rX  r   rU   r   rY  rP   )rs  r   r   r   r   r   r0  r   rv  ru  rx  rw  )re   rV   r   rX  ry  rU   r   rz  rY  r   r  self_attn_weightscross_attn_weightsr  s                 r3   r   zKosmos2TextBlock.forwardN  s    !11-@+94>> ,
'+)/),
 ,
(( --mt||VZVcVc-d =0 " ,40 =dV DD D 
 %H 88GM0A0A0A 1+&;5 /"3-1 1-M- MM11-4<<Z^ZgZg1hM$}4M !--m< / =0 ")+=>>Gr5   r   )NNNNFTN)rI   rJ   rK   r    r   r+   r   r
   r,   rM   rN   r   r   r   s   @r3   rp  rp  .  s    X0 XD /3596:(,).!%.2?||? t+?  %||d2	?
 !&t 3? ?  $;? $;? t+? 
u  %(9(95;L;L(L"MPT"TT	U?r5   rp  c            "       8    e Zd ZdZdef fdZd Z	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	e	d
ej                  dz  f
dZ
	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  d
ej                  dz  dedz  dedz  dedz  dedz  dej                  dz  dee   deez  f dZ xZS )Kosmos2TextTransformerz
    Transformer decoder consisting of `config.layers` layers. Each layer is a [`Kosmos2TextBlock`].

    Args:
        config: Kosmos2TextConfig
    ru   c           	         t         |           || _        |j                  | _        |j                  | _        |j
                  rt        j                  |j                        nd| _	        t        j                  |j                  |j                  |j                        | _        t        |j                   |j                  |j                        | _        t        j$                  t'        |j(                        D cg c]  }t+        ||       c}      | _        t        j,                  |j                  |j.                        | _        d| _        y c c}w )Nr&   )r(  )r   r'  r(  )rU  F)r   r   ru   r   	layerdropscale_embeddingr3  sqrtr   embed_scaler   r   
vocab_sizepad_token_idembed_tokensr&  max_position_embeddingsembed_positionsr
  r  r  rp  r   r   
layer_normr  )re   ru   ir   s      r3   r   zKosmos2TextTransformer.__init__  s    ~~)):@:P:P499V%5%56VYLL):):F<L<LZ`ZmZmnG 88 **++ 
 mmTYZ`ZgZgTh$iq%5f%J$ij,,v'7'79N9NO&+# %js   =Ec                     d }|d   dkD  r#t        ||j                  |j                  |      }|=t        ||j                  |d         j	                  |j                        }||n||z   }|S )Nr:   r   )r7   r8   r$   )rE   r#   r7   r4   r)   )re   r   rG  r  r8   combined_attention_maskexpanded_attn_masks          r3   _prepare_decoder_attention_maskz6Kosmos2TextTransformer._prepare_decoder_attention_mask  s     #'r?Q&7##$++'=	'# %!-nm>Q>Q[fgi[j!k!n!n$$" '>&E"K]`wKw $ '&r5   Nr  rX   img_input_maskr8   r}   c                    || j                  |      }|[|j                  |j                        j                  d|j	                  d            ||j                  t
        j                        <   || j                  z  }| j                  ||||      }|j                  |j                        }||z   }t        j                  j                  || j                  | j                        }|S )Nr:   r   )r=  r  r8   r}   r   )r  r)   r7   rA   r'   r+   r,   r  r  r   r   r   r   )	re   r=  r  rX   r  r8   r}   	positionsrV   s	            r3   forward_embeddingz(Kosmos2TextTransformer.forward_embedding  s       --i8M#AMQ^QeQeAfAkAkL%%b)BM.++%**+=> &(8(88 (('#9%	 ) 
	 LL!5!56	%	1--mt||VZVcVc-dr5   r=  r   image_embeds_position_maskrX  ry  rU   rz  r   r  r  rY  r   rZ   c           
      0   ||n| j                   j                  }||n| j                   j                  }|
|
n| j                   j                  }
||t	        d      |"|j
                  }|j                  d|d         }n!||j                         d d }nt	        d      | j                  r%| j                  r|
rt        j                  d       d}
|
rd|b|| j                   j                  r4t        t        | j                         t        | j                               nt        | j                         }||j                         nd}|dkD  rd }d }| j!                  ||||||	      }| j#                  ||||      }||t%        ||j&                  |d   	      }t(        j*                  j-                  || j,                  | j                  
      }|rdnd }|rdnd }|r|dnd }t/        | j0                        D ]l  \  }}|r||fz  }| j                  r%t3        j4                  g       }|| j6                  k  r? ||||f||||
|d|}|d   }|sX||d   fz  }|d||d   fz  }n | j9                  |      }|r||fz  }t;        |||||      S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer:   z5You have to specify either input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)ru   r   )r=  r  rX   r  r8   r}   r  r   rP   )ry  rU   r   rz  rY  r   r|   )rT   rU   rV   rW   cross_attentions)ru   r   r  rz  r   r   rA   r'   r  r   loggerwarning_onceis_encoder_decoderr   r   get_seq_lengthr  r  r4   r#   r   r   r   r  r  r+   randr  r  r   )re   r=  r   rX   r  rX  ry  rU   r  r}   rz  r   r  r  rY  r   rG  r8   rV   all_hidden_statesall_self_attnsall_cross_attentionsr  decoder_layerdropout_probabilityr  s                             r3   r   zKosmos2TextTransformer.forward  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	 ]%>cdd"#//K!r;r?;I&',,.s3KTUU&&4==##p "	0 )48V8V $L$DlZ^ZeZeFfg!5  FUE`!?!?!Afg "A%L)-&..'%5#9% / 
 ==K8N

 !,1G1S%12H-J]J]grsugv%w"--mt||VZVcVc-d #7BD0d&7<Q<]rdh"+DKK"8 	@C#!m%55!}}&+jjn#&7)%
 (> /"3#-
 
M *!,M =#3"55(4(]1-=,??(5	@: 6  -!118+++%1
 	
r5   )NNNr   NNNNNNNNNNNNNNN)rI   rJ   rK   rL   r    r   r  r+   r   r   r  r
   r,   r   r   rM   r   r   r   r   s   @r3   r  r    s   ,0 ,('4 .2,0.2&',0! ||d*! llT)	!
 t+! !$! llT)!J *..2,0:>596:(,-1,0!%)-,0#'.2y
<<$&y
 t+y
 llT)	y

 %*LL4$7y
  %||d2y
 !&t 3y
 y
 ||d*y
 llT)y
 $;y
  $;y
 #Tky
 D[y
 t+y
  -.!y
" 
:	:#y
r5   r  c                   |    e Zd ZU eed<   dZdZddgZdZdZ	dZ
 ej                         dej                  fd       Zy	)
Kosmos2PreTrainedModelru   )imagetextTr   rp  Fr   c                    t        | t              r| j                  j                  }n6t        | t        t
        f      r | j                  j                  j                  }t        | t        t        f      r| j                  j                  }n6t        | t        t
        f      r | j                  j                  j                  }t        |t              rt        j                  |j                  d|j                  dz  z         t        j                  |j                   j"                  |j                  j$                  |z         t        j                  |j&                  j"                  |j                  j$                  |z         t        j(                  |j*                  t-        j.                  |j*                  j0                  d         j3                  d             nt        |t4              r|j                  dz  d|j                  j6                  z  dz  z  z  }|j                  dz  |z  }t        j                  |j8                  j"                  |       t        j                  |j:                  j"                  |       t        j                  |j<                  j"                  |       t        j                  |j>                  j"                  |       nt        |t@              r|j                  jB                  dz  d|j                  j6                  z  dz  z  z  }d|j                  jB                  z  dz  |z  }t        j                  |jD                  j"                  |       t        j                  |jF                  j"                  |       nt        |tH              rt        j                  |j8                  j"                         t        j                  |j:                  j"                  |       t        j                  |j<                  j"                  |       t        j                  |j>                  j"                  |       n\t        |tJ              rXt        j                  |jD                  j"                         t        j                  |jF                  j"                  |       nt        |t              r-t        j                  |jL                  j"                         nt        |tN              rLt        j                  |jP                  j"                         t        j                  |jR                         n[t        |tT              rt        j                  |jV                  j"                  d       |jV                  jX                  t        jZ                  |jV                  j"                  |jV                  jX                            nt        |t\        j^                        r?t        j`                  |j"                         t        jZ                  |jb                         nnt        |td              r^|jg                  |jh                  |jj                  z   |jl                  |jX                        }t        j(                  |jn                  |       t        |t\        jp                        r-|jb                   t        jZ                  |jb                         yyy)	zInitialize the weightsr   r   )meanstd)r  r:   r~   r|   N)9r[  Kosmos2VisionModelru   initializer_factorKosmos2ModelKosmos2ForConditionalGenerationvision_configKosmos2TextModelKosmos2TextForCausalLMinit_stdtext_configrt   initnormal_r   r   r   r   initializer_ranger   copy_r}   r+   r?   r   r(   r   r  r   r   r   r   r   r   r   r   rR  rh  lm_headKosmos2ImageToTextProjectiondenselatent_queryr  r  r(  zeros_r   r   ones_r{   r&  r/  r   r*  r'  r.  r   )re   r   factorr  in_proj_stdout_proj_stdfc_stdr1  s           r3   _init_weightsz$Kosmos2PreTrainedModel._init_weightsl  s    d./[[33F|-LMN[[..AAFd-/EFG++&&C|-LMN++))22Cf56LL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 67!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B 01!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**< 34LL--37LL--37LL--37LL//S9/LL**4LL**4 67LL..C8 <=LL,,#6LL,,- 67LL,,33#3G""..:F//66v7J7J7V7VWX-JJv}}%KK$ HI ..$$v}}4f6J6JFL^L^K JJv~~{3fbii(V[[-DKK$ .E(r5   N)rI   rJ   rK   r   rO   input_modalitiessupports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_supports_sdpar+   rP  r   Moduler  rP   r5   r3   r  r  b  sV    (&*#46HI"& NU]]_8%BII 8% 8%r5   r  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
e	 	 	 	 	 ddej                  dz  dedz  d	edz  d
ededz  deez  fd       Z xZS )r  ru   r   )r  c                 d    t         |   |       t        |      | _        | j	                          y r   )r   r   r  model	post_initr   s     r3   r   zKosmos2VisionModel.__init__  s&     -f5
r5   rZ   c                 B    | j                   j                  j                  S r   )r  r   r   rk   s    r3   get_input_embeddingsz'Kosmos2VisionModel.get_input_embeddings  s    zz$$444r5   Nr   r  r   r  c                 .    | j                  |||||      S )N)r   r   r  r   r  r  )re   r   r   r  r   r  r   s          r3   r   zKosmos2VisionModel.forward  s)     zz%/!5%=#  
 	
r5   r$  )rI   rJ   rK   r!   rO   main_input_namer  r   r   r  r  r   r+   rN   r,   rM   rG   r   r   r   s   @r3   r  r    s    $O!2 5bii 5  26)-,0).#'
''$.
  $;
 #Tk	

 #'
 D[
 
8	8
 
r5   r  c            $           e Zd ZU eed<   dZdef fdZdej                  fdZ	e
e	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  dee   deez  f d              Z xZS )r  ru   )r  c                 d    t         |   |       t        |      | _        | j	                          y r   )r   r   r  r  r  r   s     r3   r   zKosmos2TextModel.__init__  s&     +F3
r5   rZ   c                 .    | j                   j                  S r   r  r  rk   s    r3   r  z%Kosmos2TextModel.get_input_embeddings      zz&&&r5   Nr=  r   rX   r  rX  ry  rU   r  r}   rz  r   r  r  rY  r   c                 D     | j                   d|||||||||	|
||||d|S )aN  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        )r=  r   rX   r  rX  ry  rU   r  r}   rz  r   r  r  rY  rP   r  )re   r=  r   rX   r  rX  ry  rU   r  r}   rz  r   r  r  rY  r   s                   r3   r   zKosmos2TextModel.forward  sP    < tzz 
)%'A"7#9+'%/!5#)
 
 	
r5   r  )rI   rJ   rK   r    rO   r  r   r   r  r  r   r   r+   r   r
   r,   r   r   rM   r   r   r   r   s   @r3   r  r    s    0 'bii '  *..2,0:>596:(,-1,0!%)-,0#'.2,
<<$&,
 t+,
 llT)	,

 %*LL4$7,
  %||d2,
 !&t 3,
 ,
 ||d*,
 llT),
 $;,
  $;,
 #Tk,
 D[,
 t+,
  -.!,
" 
:	:#,
  ,
r5   r  z
    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c            &       V    e Zd ZU eed<   ddiZdef fdZdej                  fdZ	dej                  fdZ
ee	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  dej                   dz  dedz  dedz  dedz  dej                  dz  deej                  z  dee   deez  f"d              Z	 	 	 	 	 	 	 	 d fd	Z xZS )r  ru   zlm_head.weightzmodel.embed_tokens.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NF)in_featuresout_featuresr{   )
r   r   r  r  r   r   r   r  r  r  r   s     r3   r   zKosmos2TextForCausalLM.__init__  sI     +F3
yyV-=-=FL]L]dij 	r5   rZ   c                 .    | j                   j                  S r   r  rk   s    r3   r  z+Kosmos2TextForCausalLM.get_input_embeddings  r  r5   c                     | j                   S r   )r  rk   s    r3   get_output_embeddingsz,Kosmos2TextForCausalLM.get_output_embeddings  s    ||r5   Nr=  r   rX   r  rX  ry  rU   r  r}   labelsrz  r   r  rY  logits_to_keepr   c                    |
|rt         j                  d       d} | j                  d|||||||||	||||d|}|j                  }t	        |t
              rt        | d      n|}| j                  |dd|ddf         }d}|
* | j                  d||
| j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.F)r=  r   rX   r  rX  ry  rU   r  r}   rz  r   r  rY  )rp   r  r  )ro   rp   rU   rV   rW   r  rP   )r  warningr  rT   r[  r   slicer  loss_functionru   r  r   rU   rV   rW   r  )re   r=  r   rX   r  rX  ry  rU   r  r}   r  rz  r   r  rY  r  r   r  rV   slice_indicesrp   ro   s                         r3   r   zKosmos2TextForCausalLM.forward"  s   F klI=GTZZ >
)%'A"7#9+'%/!5)>
 >
"  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD0#33!//))$55
 	
r5   c
                    |	s|rd }d }n|||j                         d d n|j                         \  }}|j                         d   }t        j                  |t        j                  |||z
  ft        j                  |j
                        fd      }t        |   |f||||||||	d|
}|j                  dd        |S )Nr:   )r'   r#   r7   r   r<   )rU   r   rX   r  r  rz  rY  is_first_iterationr}   )	r'   r+   rB   rC   r,   r7   r   prepare_inputs_for_generationpop)re   r=  rX   r  rU   r   r  rz  rY  r  model_kwargsr   rC  mask_lenmodel_inputsr   s                  r3   r  z4Kosmos2TextForCausalLM.prepare_inputs_for_generationm  s    & "iL)-& (3?L?X-"4"4"6s";^l^q^q^sJ1668<H)..KKj'H2D%EUZZ`i`p`pq *& w<
+)%'A')1
 
 	.r5   )NNNNNNNNNNNNNNr   )NNNNNNNF)rI   rJ   rK   r    rO   _tied_weights_keysr   r   r  r  r  r   r   r+   r   r
   
LongTensorr,   r   r   r   rM   r   r   r  r   r   s   @r3   r  r  	  s    *,GH0 'bii 'ryy   *..2,0:>596:(,-1,0*.!%)-,0.2-.!G
<<$&G
 t+G
 llT)	G

 %*LL4$7G
  %||d2G
 !&t 3G
 G
 ||d*G
 llT)G
   4'G
 $;G
  $;G
 #TkG
 t+G
  ell*!G
" +,#G
$ 
2	2%G
  G
X #' 2 2r5   r  c                   .     e Zd ZdZdef fdZd Z xZS )r  zmThe layer that transforms the image model's output to part of the text model's input (namely, image features)ru   c                    t         |           t        j                  |j                  j
                  |j                  j                        | _        t        j                  t        j                  |j                  |j                  j                              | _        t        |j                  |j                  j                  |j                  j                  |j                  j                   dd      | _        y )NF)r   rS  rT  )r   r   r   r   r  r   r  r   r  r   r+   r   latent_query_numr  rR  rr  r   x_attnr   s     r3   r   z%Kosmos2ImageToTextProjection.__init__  s    YYv33??ASASA]A]^
LLV5L5LfN`N`NjNj)kl)((..&&88%*
r5   c                    | j                  |      }| j                  j                  d      j                  |j	                  d      dd      }t        j                  ||gd      }| j                  ||d d d       \  }}||fS )Nr   r:   r   r<   )rV   rX  rU   r   r   )r  r  r   r(   r'   r+   rB   r  )re   featuresrV   r  key_value_statesr   s         r3   r   z$Kosmos2ImageToTextProjection.forward  s    

8, ((2215<<]=O=OPQ=RTVXZ[ 99m\%BJ&*kk&"2 " '2 '
#| l**r5   )rI   rJ   rK   rL   r   r   r   r   r   s   @r3   r  r    s    w
} 
+r5   r  z}
    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
    c            "           e Zd ZU eed<   dZdef fdZdej                  fdZ	d Z
ee	 ddej                  dedz  d	ee   deez  fd
              Zee	 	 	 	 	 	 	 	 	 	 	 	 	 ddej*                  dz  dej*                  dz  dej*                  dz  dej*                  dz  dedz  dej*                  dz  dej*                  dz  dej*                  dz  dedz  dedz  dedz  dededz  d	ee   deez  fd              Z xZS )r  ru   r   c                     t         |   |       t        |j                        | _        t        |j                        | _        t        |      | _	        | j                          y r   )r   r   r  r  
text_modelr  r  vision_modelr  image_to_text_projectionr  r   s     r3   r   zKosmos2Model.__init__  sN     *6+=+=>.v/C/CD(DV(L% 	r5   rZ   c                 B    | j                   j                  j                  S r   r  r  r  rk   s    r3   r  z!Kosmos2Model.get_input_embeddings      $$111r5   c                 :    || j                   j                  _        y r   r  re   r   s     r3   set_input_embeddingsz!Kosmos2Model.set_input_embeddings      -2*r5   r   Nr   c                 h   d|v r,t        j                  dt               |j                  dd         | j                  d||dd|}| j                  j
                  j                  |d         }t        j                  j                  |d      }| j                  |      \  }}||_        ||_        |S )	Nreturn_attentionsz`return_attentions` is deprecated and will be removed in a future version. Please use `return_dict` and access `projection_attentions` from the returned `ModelOutput` instead.T)r   r   r  r   r:   r<   rP   )warningswarnFutureWarningr  r  r  r  r   r   	normalizer  r!  rH   )re   r   r   r   vision_outputrX   rH   s          r3   get_image_featureszKosmos2Model.get_image_features  s     &(MM_
 JJ*D1ARARAR B
%%=B
 	B
 ((..==mA>NO}}..|.D.2.K.KL.Y++&2#.C+r5   r=  r  r   rU   rX   r  r}   rz  r   r  r  c                    |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }d}d}|9|t	        d      | j                  ||d      }|j                  }|j                  } | j                  d||||||||	|
|dd|}t        |j                  |j                  |j                  |j                  |||      S )a  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Kosmos2Model

        >>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> text = (
        ...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
        ...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
        ...     "</object>"
        ... )

        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)

        >>> last_hidden_state = model(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ... ).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 91, 2048]
        ```N<You have to specify either `pixel_values` or `image_embeds`.T)r   r  )r=  r   rX   r  rU   r  r}   rz  r   r  r  )rT   rU   rV   rW   rX   rH   rY   rP   )ru   r   r  r  r   r	  r!  rH   r  rS   rT   rU   rV   rW   )re   r   r=  r  r   rU   rX   r  r}   rz  r   r  r   r  r   rY   rH   image_featuresr  s                      r3   r   zKosmos2Model.forward  s*   z 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]" $# !_``!447O]a 5 N *77L$2$H$H!!$// 
)%'A+'%/!5
 
 "%77#33!//))%"7 3
 	
r5   r   )NNNNNNNNNNNFN)rI   rJ   rK   r   rO   r  r   r   r  r  r   r   r   r+   rN   r,   r   r   rM   rG   r	  r   r
   r   rS   r   r   r   s   @r3   r  r    s    $O} 2bii 23  16'' #'+ +,	
 
8	8  <  -1)-:>.2(,,0-1,0!%)-,0).#'c
llT)c
 <<$&c
 %*LL4$7	c

 t+c
 c
 llT)c
 ||d*c
 llT)c
 $;c
  $;c
 #Tkc
 #'c
 D[c
 -.c
  
#	#!c
  c
r5   r  z
    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
    language model.
    c            "           e Zd ZU eed<   dZddiZdef fdZdej                  fdZ
d Zdej                  fd	Zd
 Zee	 	 	 	 	 	 	 	 	 	 	 	 	 ddej"                  dz  dej"                  dz  dej"                  dz  dej"                  dz  dedz  dej"                  dz  dej"                  dz  dej"                  dz  dej&                  dz  dedz  dedz  dedz  deej"                  z  dee   deez  fd              Z ej6                         	 	 	 	 	 	 ddej"                  dz  dej"                  dz  dej"                  dz  dej"                  dz  dej"                  dz  dej"                  dz  fd       Z xZS )r  ru   r   ztext_model.lm_head.weightz$text_model.model.embed_tokens.weightc                     t         |   |       t        |j                        | _        t        |j                        | _        t        |      | _	        | j                          y r   )r   r   r  r  r  r  r  r  r  r  r  r   s     r3   r   z(Kosmos2ForConditionalGeneration.__init__q  sN     01C1CD.v/C/CD(DV(L% 	r5   rZ   c                 B    | j                   j                  j                  S r   r  rk   s    r3   r  z4Kosmos2ForConditionalGeneration.get_input_embeddings|  r  r5   c                 :    || j                   j                  _        y r   r  r  s     r3   r   z4Kosmos2ForConditionalGeneration.set_input_embeddings  r  r5   c                 6    | j                   j                         S r   )r  r  rk   s    r3   r  z5Kosmos2ForConditionalGeneration.get_output_embeddings  s    4466r5   c                 :    | j                   j                  |       y r   )r  set_output_embeddings)re   new_embeddingss     r3   r  z5Kosmos2ForConditionalGeneration.set_output_embeddings  s    --n=r5   Nr=  r  r   rU   rX   r  r}   r  rz  r   r  r  r   c                 :   ||n| j                   j                  }||n| j                   j                  }d}d}|~|t        d      | j	                  |||      }| j                  j
                  j                  |d         }t        j                  j                  |d      }| j                  |      \  }} | j                  d	||||||||	|
|||d|}t        |j                  |j                  |j                  |j                   |j"                  |||      S )
a  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

        >>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> prompt = "<grounding> An image of"

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> generated_ids = model.generate(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds=None,
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ...     use_cache=True,
        ...     max_new_tokens=64,
        ... )
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
        >>> processed_text
        '<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'

        >>> caption, entities = processor.post_process_generation(generated_text)
        >>> caption
        'An image of a snowman warming himself by a fire.'

        >>> entities
        [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
        ```Nr  )r   r   r  r   r:   r<   )r=  r   rX   r  rU   r  r}   r  rz  r   r  r  )ro   rp   rU   rV   rW   rX   rH   rY   rP   )ru   r   r  r   r  r  r  r   r   r  r  r  rn   ro   rp   rU   rV   rW   )re   r   r=  r  r   rU   rX   r  r}   r  rz  r   r  r  r   rY   rH   
lm_outputss                     r3   r   z'Kosmos2ForConditionalGeneration.forward  s[   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 # $# !_``"&"3"3)"3%9 #4 #  ,,22AABUVWBXYL==22<R2HL262O2OP\2]/L/8G 9
)%'A+'%/!5)9
 9

  :$$&66$22!,,%"7 3	
 		
r5   c           	         |j                  dd       }||t        d| d      |||}|n| j                  |      }	| j                  j                  j	                  |	d         }t
        j                  j                  |d      }| j                  |      \  }}
 | j                  j                  d|||||d|}|S )	Ninputsz
`inputs`: zp were passed alongside `pixel_values` which is not allowed.Make sure to either pass `inputs` or pixel_values=...r   r:   r<   )r=  r   rX   r  r  rP   )r  r   r  r  r  r   r   r  r  r  generate)re   r   r  r=  r   rX   r  r   r  rY   rH   outputs               r3   r  z(Kosmos2ForConditionalGeneration.generate  s     Hd+#(:VH %H I  F$6!L"&"3"3L"A,,22AABUVWBXYL==22<R2HL262O2OP\2]/L/))) 
)%'A'
 
 r5   )NNNNNNNNNNNNr   )NNNNNN)rI   rJ   rK   r   rO   r  r  r   r   r  r  r   r  r  r   r   r+   r   r
   r  r,   r   r   r   rM   rn   r   rP  r  r   r   s   @r3   r  r  f  sL    $O57]^	} 	2bii 237ryy 7>  -1)-:>.2(,,0-1,0*.!%)-,0-.v
llT)v
 <<$&v
 %*LL4$7	v

 t+v
 v
 llT)v
 ||d*v
 llT)v
   4'v
 $;v
  $;v
 #Tkv
 ell*v
 +,v
  
;	;!v
  v
p U]]_ -1:>)-.2,0-1%llT)% %*LL4$7% <<$&	%
 t+% llT)% ||d*% %r5   r  )r  r  r  r   rN  )r   )TrL   r3  r  collections.abcr   dataclassesr   typingr   r+   r    r   r  activationsr	   cache_utilsr
   r   r   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   configuration_kosmos2r   r    r!   
get_loggerrI   r  r   r#   r   r4   Sizer7   rE   rG   rS   rn   r  rt   r7  r   r   r   r   r  r  r&  rR  rh  rp  r  r  r  r  r  r  r  r  __all__rP   r5   r3   <module>r-     sP      $ !    & ! C C ) B 9  G & j j 9 X X 
		H	%[u|| [EKK [#* [ jk\ZZ\(-\=B\\\cf\" 
B.H 
B  
B 
 
  
  
F 
%
 %
 %
RPbii Pv %II%<<% 
% <<	%
 LL4'% % %,F)RYY F)Tryy  / : /dM
299 M
b3
ryy 3
nj8ryy j8Zt)")) t)nRYY ._1 _DO
RYY O
d B%_ B% B%J 
/  
F;
- ;
| P3_ PPf +299  +F 
Y
) Y

Y
x |&<o ||~ Xr5   