
    qi~                     P   d dl Z d dlmZ d dlmZ d dlZd dlmZ ddlmZ	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+ ee  G d de                    Z,e e d       G d de                    Z-e e d       G d de                    Z. ed       G d  d!ej^                               Z0 G d" d#ej^                        Z1 G d$ d%ej^                        Z2	 dGd&ej^                  d'ejf                  d(ejf                  d)ejf                  d*ejf                  dz  d+e4d,e4fd-Z5 G d. d/ej^                        Z6 G d0 d1ej^                        Z7 G d2 d3e      Z8 G d4 d5ej^                        Z9 G d6 d7ej^                        Z: G d8 d9ejv                        Z< G d: d;e      Z=d<ejf                  d=e>fd>Z? G d? d@e=      Z@ e dA       G dB dCe=             ZAe  G dD dEe=e             ZBg dFZCy)H    N)Callable)	dataclass)nn   )initialization)ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupletorch_compilable_check)merge_with_config_defaults)capture_outputs   )	AutoModel   )Ovis2ConfigOvis2VisionConfigc                   :    e Zd ZU dZdZej                  dz  ed<   y)*BaseModelOutputWithVisualIndicatorFeaturesz
    visual_indicator_features (`torch.FloatTensor` of shape `(batch_size, visual_indicator_size)`):
        Visual indicator features extracted from the model, which can be used for auxiliary tasks or further processing.
    Nvisual_indicator_features)__name__
__module____qualname____doc__r!   torchFloatTensor__annotations__     Z/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/ovis2/modeling_ovis2.pyr    r    ,   s    
 ;?u0047>r*   r    zJ
    Base class for Llava outputs, with hidden states and attentions.
    custom_introc                   :    e Zd ZU dZdZej                  dz  ed<   y)Ovis2ModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)r"   r#   r$   r%   r0   r&   r'   r(   r)   r*   r+   r/   r/   7   s    	 59**T18r*   r/   zQ
    Base class for Ovis2 causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	Ovis2CausalLMOutputWithPastaA  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size (batch_size * num_patches, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr0   )r"   r#   r$   r%   r3   r&   r'   r(   r4   r5   r	   r6   tupler7   r0   r)   r*   r+   r2   r2   L   s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r*   r2   RMSNormc                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	Ovis2RMSNormepsreturnNc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z;
        Ovis2RMSNorm is equivalent to T5LayerNorm
        N)super__init__r   	Parameterr&   onesweightvariance_epsilon)selfhidden_sizer<   	__class__s      r+   r@   zOvis2RMSNorm.__init__l   s1     	ll5::k#:; #r*   r6   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr   Tkeepdim)	dtypetor&   float32powmeanrsqrtrD   rC   )rE   r6   input_dtypevariances       r+   forwardzOvis2RMSNorm.forwardt   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r*   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)r8   rC   shaperD   rE   s    r+   
extra_reprzOvis2RMSNorm.extra_repr{   s*    ))*+6$2G2G1HIIr*   )gư>)
r"   r#   r$   floatr@   r&   TensorrT   rX   __classcell__rG   s   @r+   r;   r;   j   s7    $ $$ $;U\\ ;ell ;Jr*   r;   c                   $     e Zd Z fdZd Z xZS )Ovis2VisionMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y Nbiasr?   r@   configrF   intermediate_sizer   Linearmlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnrE   rd   rG   s     r+   r@   zOvis2VisionMLP.__init__       !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r*   c                     | j                  | j                  | j                  |            | j                  |      z        }|S Nrj   rl   rh   ri   rE   xrj   s      r+   rT   zOvis2VisionMLP.forward   6    NN4;;t~~a/@#ADLLQRO#ST	r*   r"   r#   r$   r@   rT   r[   r\   s   @r+   r^   r^          0r*   r^   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZ	S )Ovis2VisionEmbeddingsrd   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  |j                  | j                  | j                  | j                  d      | _
        | j
                  | j                  z  dz  | _        | j                  | _        t        j                  | j                  | j                        | _        | j                  dt!        j"                  | j                        j%                  d      d       t'        |j                  |j(                        | _        y )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_idsr   rI   F)
persistent)r?   r@   rd   rF   	embed_dim
image_size
patch_sizer   Conv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr&   arangeexpandr;   rms_norm_epsrms_normrm   s     r+   r@   zOvis2VisionEmbeddings.__init__   s    ++ ++ ++!yy++?? 
 !OOt>1D!--"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jop$V%7%79L9LMr*   pixel_valuesr=   c                 (   | j                   j                  j                  }| j                  |j                  |            }|j	                  d      j                  dd      }| j                  |      }|| j                  | j                        z   }|S )NrL   r   r   )	r   rC   rL   rM   flatten	transposer   r   r   )rE   r   target_dtypepatch_embeds
embeddingss        r+   rT   zOvis2VisionEmbeddings.forward   s    ++2288++LOO,O,OP!))!,66q!<
]]:.
$"9"9$:K:K"LL
r*   )
r"   r#   r$   r   r@   r&   r'   rZ   rT   r[   r\   s   @r+   rx   rx      s/    N0 N*E$5$5 %,, r*   rx   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrI   )dimrL   )ptrainingr   r   )r&   matmulr   r   
functionalsoftmaxrN   rM   rL   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r+   eager_attention_forwardr      s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r*   c            
            e Zd ZdZ fdZ	 ddej                  dej                  dz  deej                  ej                  dz  f   fdZ xZ	S )	Ovis2VisionAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                 x   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fra   )r?   r@   rd   rF   r   num_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutr   	is_causalr   rf   qkv_biask_projv_projq_projout_projrm   s     r+   r@   zOvis2VisionAttention.__init__   s2   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//iiV__UiiV__UiiV__U		$..$..vWr*   Nr6   r   r=   c           
         |j                   \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|| j                  | j                  | j                  sdn| j                         \  }}|j#                  |||      j%                         }| j'                  |      }||fS )z#Input shape: Batch x Time x Channelr   r           )r   r   r   )rV   r   r   r   viewr   r   r   r   get_interfacerd   _attn_implementationr   r   r   r   r   reshaper   r   )rE   r6   r   r   
batch_size
seq_lengthr   querieskeysvaluesattention_interfacer   r   s                r+   rT   zOvis2VisionAttention.forward   sW    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc(?(M(MKK,,.E)
 %8nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0L((r*   rp   )
r"   r#   r$   r%   r@   r&   rZ   r8   rT   r[   r\   s   @r+   r   r      sV    GX, /3$)||$) t+$)
 
u||U\\D00	1$)r*   r   c                   $     e Zd Z fdZd Z xZS )Ovis2MLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _	        t        j                  | j                  | j                  |j                        | _
        t        |j                     | _        y r`   rc   rm   s     r+   r@   zOvis2MLP.__init__  rn   r*   c                     | j                  | j                  | j                  |            | j                  |      z        }|S rp   rq   rr   s      r+   rT   zOvis2MLP.forward  rt   r*   ru   r\   s   @r+   r   r     rv   r*   r   c            	            e Zd Zdef fdZ	 d	dej                  dej                  dz  dee   dej                  fdZ	 xZ
S )
Ovis2VisionEncoderLayerrd   c                     t         |           t        |      | _        t	        |      | _        t        |j                  |j                        | _	        t        |j                  |j                        | _
        y rp   )r?   r@   r   	attentionr   ffnr;   rF   r   	rms_norm1	rms_norm2rm   s     r+   r@   z Ovis2VisionEncoderLayer.__init__  sZ    -f5F#%f&8&8&:M:MN%f&8&8&:M:MNr*   Nr6   r   r   r=   c                     | j                  |      } | j                  d||d|\  }}||z   }| j                  |      }| j                  |      }||z   }|S )N)r6   r   r)   )r   r   r   r   )rE   r6   r   r   norm_hidden_statesr   _
mlp_outputs           r+   rT   zOvis2VisionEncoderLayer.forward  sl     "^^M:'r6HYgrkqrQ%3!^^M:XX01
%
2r*   rp   )r"   r#   r$   r   r@   r&   rZ   r   r   rT   r[   r\   s   @r+   r   r     sY    O0 O /3|| t+ +,	
 
r*   r   c            	       t     e Zd ZdZdef fdZee	 d	dej                  dz  de
e   defd              Z xZS )
Ovis2VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Ovis2VisionEncoderLayer`].

    Args:
        config: Ovis2VisionConfig
    rd   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
r?   r@   rd   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)rE   rd   r   rG   s      r+   r@   zOvis2VisionEncoder.__init__6  sP    mmeTZTlTlNm$n%<V%D$no&+# %os   A#Nr   r   r=   c                 T    |}| j                   D ]  } |||fi |} t        |      S )Nlast_hidden_state)r   r   )rE   inputs_embedsr   r   r6   encoder_layers         r+   rT   zOvis2VisionEncoder.forward=  s<     &![[ 	SM)-R6RM	S ??r*   rp   )r"   r#   r$   r%   r   r@   r   r   r&   rZ   r   r   r   rT   r[   r\   s   @r+   r   r   -  sh    ,0 ,  /3
@ t+
@ +,	
@
 

@  
@r*   r   c                   X     e Zd Zdef fdZe	 ddej                  dz  fd       Z xZ	S )Ovis2VisionTransformerrd   c                     t         |           || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        d| _        y r   )r?   r@   rd   rx   r   r   encoderr;   rF   r   r   r   rm   s     r+   r@   zOvis2VisionTransformer.__init__M  sO    /7)&1$V%7%79L9LM&+#r*   Nr   c                     | j                  |      } | j                  d||d|}|j                  }| j                  |      }t	        |      S )N)r   r   r   r)   )r   r   r   r   r   )rE   r   r   r   r6   encoder_outputsr   s          r+   rT   zOvis2VisionTransformer.forwardU  sa     5+74<< ,
'),
 ,
 ,== MM*;<1BCCr*   rp   )
r"   r#   r$   r   r@   r   r&   rZ   rT   r[   r\   s   @r+   r   r   L  s?    ,0 ,  /3D t+D Dr*   r   c                   P     e Zd Zdej                  dej                  f fdZ xZS )Ovis2VisualEmbeddingTablevisual_tokensr=   c                    |j                   t        j                  t        j                  t        j                  t        j
                  t        j                  fv rt        | !  |      S t        j                  || j                        S rp   )rL   r&   int8int16int32int64longr?   rT   r   rC   )rE   r   rG   s     r+   rT   z!Ovis2VisualEmbeddingTable.forwardk  sW    5::u{{EKKV[V`V`"aa7?=11||M4;;77r*   )r"   r#   r$   r&   rZ   rT   r[   r\   s   @r+   r   r   j  s#    8U\\ 8ell 8 8r*   r   c                   X     e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZdZdZdZdZ fdZ xZS )Ovis2PreTrainedModelrd   model)imagetextTr   r5   c                     t         |   |       t        |t              rZt	        j
                  |j                  t        j                  |j                  j                  d         j                  d             y y )NrI   r   )r?   _init_weights
isinstancerx   initcopy_r   r&   r   rV   r   )rE   r   rG   s     r+   r   z"Ovis2PreTrainedModel._init_weights  s[    f%f34JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5r*   )r"   r#   r$   r   r(   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_flash_attn_supports_flex_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr   r[   r\   s   @r+   r   r   q  sY    (&*#/0"3 N!"&i ir*   r   r4   r   c                     | j                  |      }|j                  |d      d   }t        j                  | t        j                        j                  ||d      }||j                         z
  |z   }|S )NTrJ   r   )memory_formatg      ?)r   maxr&   
zeros_likelegacy_contiguous_formatscatter_detach)r4   r   y_softindexy_hardrets         r+   hard_softmaxr    sk    ^^C FJJsDJ)!,EfE4R4RS\\]`bgilmF
6==?
"V
+CJr*   c            	            e Zd ZU eed<   eedZdef fdZe	e
dej                  dee   deez  fd              Z xZS )Ovis2VisionModelrd   )r6   r7   c                    t         |   |       || _        t        |      | _        |j
                  | _        |j                  | _        t        j                  |j                  |j                  z  |j                  z  | j                  | j
                  z
  d      | _        t        j                  | j                  | j
                  z
        | _        | j                          y NFra   )r?   r@   rd   r   transformernum_visual_indicator_tokens
vocab_sizer   rf   rF   hidden_stridehead_linear	LayerNorm	head_norm	post_initrm   s     r+   r@   zOvis2VisionModel.__init__  s     1&9+1+M+M( ++99!5!558L8LLOOd>>>

 doo8X8X&XYr*   r   r   r=   c           	          | j                   |fi |}|d   }| j                  j                  dkD  r|j                  \  }}}| j                  j                  }t	        t        j                  |            }	|	|	z  |k7  rt        d      ||	|z  z
  |z  }
t        j                  j                  |ddd|
d|
fdd      }|	|
z  }	|j                  ||	|z  ||	|z  ||      }|j                  dddddd      }|j                  |d	||z  |z        }| j                  |      }| j                  |      }| j                  j                  d
k(  r#t        j                  j!                  |d	d      }na| j                  j                  dk(  rt#        |d	      }n:| j                  j                  dk(  r!t        j                  j%                  |d	      }t'        |      S )Nr   r   z.Token sequence length must be a perfect squareconstantr   r         rI   gumbel_argmaxT)r   hard	st_argmaxr   r   )r   pooler_output)r  rd   r  rV   intmathsqrtr   r   r   padr   permuter  r   tokenize_functiongumbel_softmaxr  r   r    )rE   r   r   outputsr   
num_imagesseq_len
hidden_dimr  sqrt_lpad_sizer4   
prob_tokens                r+   rT   zOvis2VisionModel.forward  s   
 #$""<:6:#AJ;;$$q(.?.E.E+J KK55M7+,F') !QRR%-)?@MQH " 1 12CaAxYZ\dEegqst uhF 1 9 9Fm3]FmD[]jlv! !2 9 9!Q1a K 1 9 9B =
 J! !!"34';;((O;55f"45PJ[[**k9%f"5J[[**i7..v2.>J9/$
 	
r*   )r"   r#   r$   r   r(   r   r   _can_record_outputsr@   r   r   r&   r'   r   r   r8   r    rT   r[   r\   s   @r+   r  r    sj    0*
0   &
!--&
9?@R9S&
	;	;&
   &
r*   r  zu
    The Ovis2 model which consists of a vision backbone and a language model, without a language modeling head.
    c                    >    e Zd Zi Zdef fdZd Zd Ze e	d      de
j                  dee   d	eez  fd
              Zde
j"                  de
j                  de
j                  fdZee		 	 	 	 	 	 	 	 	 	 	 	 	 dde
j"                  dz  de
j                  dz  de
j&                  dz  de
j"                  dz  dedz  de
j                  dz  de
j"                  dz  dedz  dedz  dedz  dedz  de
j"                  dz  dee
j&                  z  d	eez  fd              Z xZS )
Ovis2Modelrd   c                    t         |   |       t        |j                        | _        t        j                  |j                        | _        t        |j                  j                  |j                        | _        |j                  j                  | _        |j                  | _
        |j                  | _        | j                          y rp   )r?   r@   r  vision_configvision_towerr   from_configtext_configlanguage_modelr   r  rF   visual_embeddings_tablevisual_vocab_sizevisual_indicator_token_idsr!  rm   s     r+   r@   zOvis2Model.__init__  s     ,V-A-AB'33F4F4FG'@AUAUA`A`bhbtbt'u$!'!5!5!@!@ ++*0*K*K'r*   c                 6    | j                   j                         S rp   )rA  get_input_embeddingsrW   s    r+   rF  zOvis2Model.get_input_embeddings  s    ""7799r*   c                 :    | j                   j                  |       y rp   )rA  set_input_embeddingsrE   r   s     r+   rH  zOvis2Model.set_input_embeddings  s    007r*   zWObtains image last hidden states from the vision tower and apply multimodal projection.r,   r   r   r=   c                 h    | j                   |fddi|}|j                  }|j                  \  }}}t        j                  ||| j                   j
                  f|j                  |j                  d|j                        }t        j                  ||gd      }| j                  |      }t        j                  | j                  | j                   j
                  z
  | j                  t        j                        j                  |j                        }	||_        | j                  |	      |_        |S )Nreturn_dictTF)rL   devicerequires_gradlayoutr   r)  r   )r>  r*  rV   r&   zerosr  rL   rL  rN  catrB  r   rC  r   rM   r!   )
rE   r   r   image_outputsimage_featuresr   img_seq_lenr   padding_tensorvisual_indicators
             r+   get_image_featureszOvis2Model.get_image_features  s    *)),SDSFS&44%3%9%9"
Kd&7&7&S&ST &&!((!((
 NN#CK55nE <<""T%6%6%R%RR""**
 "^""
#	 	
 '5#262N2NO_2`/r*   	input_idsr   rR  c                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        rL   rL  rI   r   r   z6Image features and image tokens do not match, tokens: z, features: )rF  r&   tensorrd   image_token_idr   rL  allsumrV   	unsqueeze	expand_asrM   r   numel)rE   rW  r   rR  special_image_maskn_image_tokensn_image_featuress          r+   get_placeholder_maskzOvis2Model.get_placeholder_mask
  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r*   Nr   r   r5   labels	use_cacheoutput_attentionsoutput_hidden_statesrK  cache_positionlogits_to_keepc                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
|d u |d uz  rt        d      | | j	                         |      }|6| j                  |d      }|j                  }|j                  }| j                  |||      }|j                  ||      }t        | j                        D ]  \  }}|Y| | j	                         t        j                  |t        j                  |j                              k(  }|j!                  d      }n||k(  j#                  |j                        }|j%                         s||   j'                  ||         j#                  |j                  |j(                        ||<     | j*                  d	||||||	|
d||d
|}t-        |j.                  |j0                  |j2                  |j4                  |      S d       S )
Nz:You must specify exactly one of input_ids or inputs_embedsT)r   rK  )r   rR  rY  rI   )
r   r   r5   r   rf  rg  rh  rK  ri  rj  )r   r5   r6   r7   r0   r)   )rd   rg  rh  r   rF  rV  r*  r!   rd  masked_scatter	enumeraterD  r&   rZ  r   rL  r\  rM   anyr_  rL   rA  r/   r   r5   r6   r7   )rE   rW  r   r   r   r5   r   re  rf  rg  rh  rK  ri  rj  r   rQ  rR  r!   ra  ivisual_indicator_idmaskr2  s                          r+   rT   zOvis2Model.forward"  s5   & 2C1N-TXT_T_TqTq$8$D $++JjJj 	 -t";<YZZ 7D557	BM# 33[_3`M*88N(5(O(O%!%!:!:+- "; "
 *889K^\M*3D4S4S*T &&$(,GD,E,E,G%8

S`SgSgh- D  88B<D%)<<@@AUAUVD88:1!4"=#67M00-2E2EF "$'  &$%% 
)%+'/!5))
 
 (%77#33!//))2>2J
 	

 QU
 	
r*   NNNNNNNNNNNNr   )r"   r#   r$   _checkpoint_conversion_mappingr   r@   rF  rH  r   r   r&   r'   r   r   r8   r    rV  
LongTensorrd  rZ   r	   boolr+  r/   rT   r[   r\   s   @r+   r;  r;    s    &("	{ 	:8 n'' +, 
;	;	 8"))":?:K:K"]b]n]n"0  .215.204(,26*.!%)-,0#'26-.L
##d*L
 ''$.L
 t+	L

 &&-L
 L
 ((4/L
   4'L
 $;L
  $;L
 #TkL
 D[L
 ((4/L
 ell*L
  
)	)!L
  L
r*   r;  c                        e Zd Zi ZddiZdef fdZd Zd Zde	j                  fdZed	ej                  d
ee   deez  fd       Zee	 	 	 	 	 	 	 	 	 	 	 	 	 ddej*                  dz  d	ej                  dz  dej,                  dz  dej*                  dz  dedz  dej                  dz  dej*                  dz  dedz  dedz  dedz  dedz  dej*                  dz  deej,                  z  deez  fd              Z	 	 	 	 	 	 	 d fd	Z xZS )Ovis2ForConditionalGenerationzlm_head.weightz(model.language_model.embed_tokens.weightrd   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y r  )
r?   r@   r;  r   r   rf   rF   r  lm_headr!  rm   s     r+   r@   z&Ovis2ForConditionalGeneration.__init__x  sF     '
yy!3!3V5F5FUSr*   c                 6    | j                   j                         S rp   )r   rF  rW   s    r+   rF  z2Ovis2ForConditionalGeneration.get_input_embeddings~  s    zz..00r*   c                 :    | j                   j                  |       y rp   )r   rH  rI  s     r+   rH  z2Ovis2ForConditionalGeneration.set_input_embeddings  s    

''.r*   r=   c                     | j                   S rp   )ry  rW   s    r+   get_output_embeddingsz3Ovis2ForConditionalGeneration.get_output_embeddings  s    ||r*   r   r   c                 >     | j                   j                  dd|i|S )Nr   r)   )r   rV  )rE   r   r   s      r+   rV  z0Ovis2ForConditionalGeneration.get_image_features  s#     -tzz,,Q,Q&QQr*   NrW  r   r   r5   r   re  rf  rg  rh  rK  ri  rj  c                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
 | j                  d||||||||	|
d|d|}|d   }t	        |t
              rt        | d      n|}| j                  |dd|ddf         }d}|4 | j                  d||| j                   j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Ovis2ForConditionalGeneration

        >>> model = Ovis2ForConditionalGeneration.from_pretrained("thisisiron/Ovis2-2B-hf")
        >>> processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf")

        >>> prompt = "<|im_start|>user\n<image>\nDescribe the image.<|im_end|>\n<|im_start|>assistant\n"
        >>> url = "http://images.cocodataset.org/val2014/COCO_val2014_000000537955.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
        "user\n\nDescribe the image.\nassistant\nThe image features a brown dog standing on a wooden floor, looking up with"
        ```NT)rW  r   r   r   r5   r   rf  rg  rh  rK  ri  r   )r4   re  r  )r3   r4   r5   r6   r7   r0   r)   )rd   rg  rh  r   r   r+  slicery  loss_functionr@  r  r2   r5   r6   r7   r0   )rE   rW  r   r   r   r5   r   re  rf  rg  rh  rK  ri  rj  r   r2  r6   slice_indicesr4   r3   s                       r+   rT   z%Ovis2ForConditionalGeneration.forward  s7   ` 2C1N-TXT_T_TqTq$8$D $++JjJj 	 $** 
%)%+'/!5)
 
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD +#33!//)) ' ; ;
 	
r*   c	           
      h    t        |   |f||||||d|	}
|s|	j                  dd      s||
d<   |
S )N)r5   r   r   ri  rj  is_first_iterationrf  Tr   )r?   prepare_inputs_for_generationget)rE   rW  r5   r   r   r   ri  rj  r  r   model_inputsrG   s              r+   r  z;Ovis2ForConditionalGeneration.prepare_inputs_for_generation  s\     w<	
+')))1	
 	
 VZZT%B
 ,8L(r*   rr  )NNNNNNF)r"   r#   r$   rs  _tied_weights_keysr   r@   rF  rH  r   Moduler}  r   r&   r'   r   r   r8   r    rV  r   rt  rZ   r	   ru  r+  r2   rT   r  r[   r\   s   @r+   rw  rw  s  s   %'"*,VW{ 1/ryy  R!--R9?@R9SR	;	;R R
  .215.204(,26*.!%)-,0#'26-.T
##d*T
 ''$.T
 t+	T

 &&-T
 T
 ((4/T
   4'T
 $;T
  $;T
 #TkT
 D[T
 ((4/T
 ell*T
  
,	,!T
  T
r     r*   rw  )r   r;  rw  )r   )Dr,  collections.abcr   dataclassesr   r&   r    r   r   activationsr   cache_utilsr	   
generationr
   integrationsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   utils.output_capturingr   autor   configuration_ovis2r   r   r    r/   r2   r  r;   r^   rx   rZ   rY   r   r   r   r   r   r   r   r   r   r+  r  r  r;  rw  __all__r)   r*   r+   <module>r     s[  *  $ !   & !   ) 7 9 d d F & n n 7 5  ? ?1K ?  ? 
96 9 9 
9+ 9 90 Y'J299 J (J(RYY  BII P %II%<<% 
% <<	%
 LL4'% % %.:)299 :)zryy  8 2@ @>DRYY D<8 8i? i* C >
+ >
B 
Z
% Z

Z
z Q$8/ Q Qh Rr*   