
    qiJ                        d dl Z d dlmZ d dlZd dlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ ddlmZmZ ddlmZ ddl m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z'm(Z( ddl)m*Z*m+Z+ ddl,m-Z-m.Z. dej^                  de0fdZ1ee G d de                    Z2 G d de(      Z3 G d de'      Z4 G d  d!e"      Z5 G d" d#e!      Z6 G d$ d%e+      Z7 G d& d'e      Z8 G d( d)e      Z9 G d* d+e*      Z: G d, d-ejv                        Z< G d. d/ejz                        Z> G d0 d1e      Z? G d2 d3e?      Z@ G d4 d5e%      ZAe G d6 d7e$e             ZBg d8ZCy)9    N)	dataclass)nn   )initialization)Cache)GenerationMixin)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)merge_with_config_defaults)capture_outputs   )Aimv2AttentionAimv2EncoderLayer)	AutoModel)LlamaMLPLlamaRMSNorm)LlavaForConditionalGeneration
LlavaModel)LlavaNextCausalLMOutputWithPastLlavaNextModelOutputWithPast)SiglipEncoderSiglipVisionEmbeddings   )Ovis2ConfigOvis2VisionConfiglogitsdimc                     | j                  |      }|j                  |d      d   }t        j                  | t        j                        j                  ||d      }||j                         z
  |z   }|S )NT)keepdimr   )memory_formatg      ?)softmaxmaxtorch
zeros_likelegacy_contiguous_formatscatter_detach)r!   r"   y_softindexy_hardrets         Y/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/ovis2/modular_ovis2.pyhard_softmaxr2   '   sk    ^^C FJJsDJ)!,EfE4R4RS\\]`bgilmF
6==?
"V
+CJ    c                   :    e Zd ZU dZdZej                  dz  ed<   y)*BaseModelOutputWithVisualIndicatorFeaturesz
    visual_indicator_features (`torch.FloatTensor` of shape `(batch_size, visual_indicator_size)`):
        Visual indicator features extracted from the model, which can be used for auxiliary tasks or further processing.
    Nvisual_indicator_features)__name__
__module____qualname____doc__r6   r(   FloatTensor__annotations__ r3   r1   r5   r5   1   s    
 ;?u0047>r3   r5   c                       e Zd Zy)Ovis2ModelOutputWithPastNr7   r8   r9   r=   r3   r1   r?   r?   <       r3   r?   c                       e Zd Zy)Ovis2CausalLMOutputWithPastNr@   r=   r3   r1   rC   rC   @   rA   r3   rC   c                       e Zd Zy)Ovis2RMSNormNr@   r=   r3   r1   rE   rE   D   rA   r3   rE   c                       e Zd Zy)Ovis2VisionMLPNr@   r=   r3   r1   rG   rG   H   rA   r3   rG   c                   b     e Zd Zdef fdZd Zdej                  dej                  fdZ	 xZ
S )Ovis2VisionEmbeddingsconfigc                 n    t         |   |       t        |j                  |j                        | _        y N)super__init__rE   hidden_sizerms_norm_epsrms_normselfrJ   	__class__s     r1   rN   zOvis2VisionEmbeddings.__init__M   s*     $V%7%79L9LMr3   c                     t        d      )NzNot needed for Ovis2)NotImplementedError)rS   s    r1   interpolate_pos_encodingz.Ovis2VisionEmbeddings.interpolate_pos_encodingQ   s    !"899r3   pixel_valuesreturnc                 (   | j                   j                  j                  }| j                  |j                  |            }|j	                  d      j                  dd      }| j                  |      }|| j                  | j                        z   }|S )Ndtyper   r   )	patch_embeddingweightr\   toflatten	transposerQ   position_embeddingposition_ids)rS   rX   target_dtypepatch_embeds
embeddingss        r1   forwardzOvis2VisionEmbeddings.forwardT   s    ++2288++LOO,O,OP!))!,66q!<
]]:.
$"9"9$:K:K"LL
r3   )r7   r8   r9   r    rN   rW   r(   r;   Tensorrg   __classcell__rT   s   @r1   rI   rI   L   s4    N0 N:E$5$5 %,, r3   rI   c                       e Zd Zy)Ovis2VisionAttentionNr@   r=   r3   r1   rl   rl   _   rA   r3   rl   c                   $     e Zd Zdef fdZ xZS )Ovis2VisionEncoderLayerrJ   c                 B    t         |           t        |      | _        y rL   )rM   rN   rl   	attentionrR   s     r1   rN   z Ovis2VisionEncoderLayer.__init__d   s    -f5r3   )r7   r8   r9   r    rN   ri   rj   s   @r1   rn   rn   c   s    60 6 6r3   rn   c            	       p     e Zd Zdef fdZee	 ddej                  dz  de	e
   defd              Z xZS )	Ovis2VisionEncoderrJ   c                     t         |   |       t        j                  t	        |j
                        D cg c]  }t        |       c}      | _        y c c}w rL   )rM   rN   r   
ModuleListrangenum_hidden_layersrn   layers)rS   rJ   _rT   s      r1   rN   zOvis2VisionEncoder.__init__j   s@     mmeTZTlTlNm$n%<V%D$no$ns   ANattention_maskkwargsrY   c                 T    |}| j                   D ]  } |||fi |} t        |      S )Nlast_hidden_state)rw   r	   )rS   inputs_embedsry   rz   hidden_statesencoder_layers         r1   rg   zOvis2VisionEncoder.forwardn   s<     &![[ 	SM)-R6RM	S ??r3   rL   )r7   r8   r9   r    rN   r   r   r(   rh   r   r   r	   rg   ri   rj   s   @r1   rr   rr   i   se    p0 p  /3
@ t+
@ +,	
@
 

@  
@r3   rr   c                   X     e Zd Zdef fdZe	 ddej                  dz  fd       Z xZ	S )Ovis2VisionTransformerrJ   c                     t         |           || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        d| _        y )NF)rM   rN   rJ   rI   rf   rr   encoderrE   rO   rP   rQ   gradient_checkpointingrR   s     r1   rN   zOvis2VisionTransformer.__init__~   sO    /7)&1$V%7%79L9LM&+#r3   Nry   c                     | j                  |      } | j                  d||d|}|j                  }| j                  |      }t	        |      S )N)r~   ry   r|   r=   )rf   r   r}   rQ   r	   )rS   rX   ry   rz   r   encoder_outputsr}   s          r1   rg   zOvis2VisionTransformer.forward   sa     5+74<< ,
'),
 ,
 ,== MM*;<1BCCr3   rL   )
r7   r8   r9   r    rN   r   r(   rh   rg   ri   rj   s   @r1   r   r   }   s?    ,0 ,  /3D t+D Dr3   r   c                   P     e Zd Zdej                  dej                  f fdZ xZS )Ovis2VisualEmbeddingTablevisual_tokensrY   c                    |j                   t        j                  t        j                  t        j                  t        j
                  t        j                  fv rt        | !  |      S t        j                  || j                        S rL   )r\   r(   int8int16int32int64longrM   rg   matmulr^   )rS   r   rT   s     r1   rg   z!Ovis2VisualEmbeddingTable.forward   sW    5::u{{EKKV[V`V`"aa7?=11||M4;;77r3   )r7   r8   r9   r(   rh   rg   ri   rj   s   @r1   r   r      s#    8U\\ 8ell 8 8r3   r   c                   X     e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZdZdZdZdZ fdZ xZS )Ovis2PreTrainedModelrJ   model)imagetextTrl   past_key_valuesc                     t         |   |       t        |t              rZt	        j
                  |j                  t        j                  |j                  j                  d         j                  d             y y )N)r   r   )rM   _init_weights
isinstancerI   initcopy_rc   r(   arangeshapeexpand)rS   modulerT   s     r1   r   z"Ovis2PreTrainedModel._init_weights   s[    f%f34JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5r3   )r7   r8   r9   r   r<   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_flash_attn_supports_flex_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendr   ri   rj   s   @r1   r   r      sY    (&*#/0"3 N!"&i ir3   r   c            	            e Zd ZU eed<   eedZdef fdZe	e
dej                  dee   deez  fd              Z xZS )Ovis2VisionModelrJ   )r   
attentionsc                    t         |   |       || _        t        |      | _        |j
                  | _        |j                  | _        t        j                  |j                  |j                  z  |j                  z  | j                  | j
                  z
  d      | _        t        j                  | j                  | j
                  z
        | _        | j                          y NF)bias)rM   rN   rJ   r   transformernum_visual_indicator_tokens
vocab_sizer   LinearrO   hidden_stridehead_linear	LayerNorm	head_norm	post_initrR   s     r1   rN   zOvis2VisionModel.__init__   s     1&9+1+M+M( ++99!5!558L8LLOOd>>>

 doo8X8X&XYr3   rX   rz   rY   c           	          | j                   |fi |}|d   }| j                  j                  dkD  r|j                  \  }}}| j                  j                  }t	        t        j                  |            }	|	|	z  |k7  rt        d      ||	|z  z
  |z  }
t        j                  j                  |ddd|
d|
fdd      }|	|
z  }	|j                  ||	|z  ||	|z  ||      }|j                  dddddd      }|j                  |d	||z  |z        }| j                  |      }| j                  |      }| j                  j                  d
k(  r#t        j                  j!                  |d	d      }na| j                  j                  dk(  rt#        |d	      }n:| j                  j                  dk(  r!t        j                  j%                  |d	      }t'        |      S )Nr   r   z.Token sequence length must be a perfect squareconstantr   r         r   gumbel_argmaxT)r"   hard	st_argmaxr"   r&   )r}   pooler_output)r   rJ   r   r   intmathsqrt
ValueErrorr   
functionalpadreshapepermuter   r   tokenize_functiongumbel_softmaxr2   r&   r5   )rS   rX   rz   outputsr}   
num_imagesseq_len
hidden_dimr   sqrt_lpad_sizer!   
prob_tokens                r1   rg   zOvis2VisionModel.forward   s   
 #$""<:6:#AJ;;$$q(.?.E.E+J KK55M7+,F') !QRR%-)?@MQH " 1 12CaAxYZ\dEegqst uhF 1 9 9Fm3]FmD[]jlv! !2 9 9!Q1a K 1 9 9B =
 J! !!"34';;((O;55f"45PJ[[**k9%f"5J[[**i7..v2.>J9/$
 	
r3   )r7   r8   r9   r    r<   rn   rl   _can_record_outputsrN   r   r   r(   r;   r   r   tupler5   rg   ri   rj   s   @r1   r   r      sj    0*
0   &
!--&
9?@R9S&
	;	;&
   &
r3   r   c                        e Zd Zi Zdef fdZe ed      dej                  de
e   deez  fd              Zee	 	 	 	 	 	 	 	 	 	 	 	 	 dd
ej                  d	z  dej                  d	z  dej                   d	z  dej                  d	z  ded	z  dej                  d	z  dej                  d	z  ded	z  ded	z  ded	z  ded	z  dej                  d	z  deej                   z  deez  fd              Z xZS )
Ovis2ModelrJ   c                 |   t         |   |       t        |j                        | _        t        |j                  j                  |j                        | _        |j                  j                  | _	        |j                  | _        |j                  | _
        t        j                  |j                        | _        | `y rL   )rM   rN   r   vision_configvision_towerr   r   rO   visual_embeddings_tablevisual_vocab_sizevisual_indicator_token_idsr   from_configtext_configlanguage_modelmulti_modal_projectorrR   s     r1   rN   zOvis2Model.__init__   s     ,V-A-AB'@AUAUA`A`bhbtbt'u$!'!5!5!@!@ ++*0*K*K''33F4F4FG&r3   zWObtains image last hidden states from the vision tower and apply multimodal projection.)custom_introrX   rz   rY   c                 h    | j                   |fddi|}|j                  }|j                  \  }}}t        j                  ||| j                   j
                  f|j                  |j                  d|j                        }t        j                  ||gd      }| j                  |      }t        j                  | j                  | j                   j
                  z
  | j                  t        j                        j                  |j                        }	||_        | j                  |	      |_        |S )Nreturn_dictTF)r\   devicerequires_gradlayoutr   r   r[   )r   r   r   r(   zerosr   r\   r   r   catr   r   r   r   r_   r6   )
rS   rX   rz   image_outputsimage_features
batch_sizeimg_seq_lenrx   padding_tensorvisual_indicators
             r1   get_image_featureszOvis2Model.get_image_features  s    *)),SDSFS&44%3%9%9"
Kd&7&7&S&ST &&!((!((
 NN#CK55nE <<""T%6%6%R%RR""**
 "^""
#	 	
 '5#262N2NO_2`/r3   N	input_idsry   rc   r   r~   labels	use_cacheoutput_attentionsoutput_hidden_statesr   cache_positionlogits_to_keepc                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
|d u |d uz  rt        d      | | j	                         |      }|6| j                  |d      }|j                  }|j                  }| j                  |||      }|j                  ||      }t        | j                        D ]  \  }}|Y| | j	                         t        j                  |t        j                  |j                              k(  }|j!                  d      }n||k(  j#                  |j                        }|j%                         s||   j'                  ||         j#                  |j                  |j(                        ||<     | j*                  d	||||||	|
d||d
|}t-        |j.                  |j0                  |j2                  |j4                  |      S d       S )
Nz:You must specify exactly one of input_ids or inputs_embedsT)rX   r   )r~   r   )r\   r   r   )
ry   rc   r   r~   r   r   r   r   r   r   )r}   r   r   r   image_hidden_statesr=   )rJ   r   r   r   get_input_embeddingsr   r   r6   get_placeholder_maskmasked_scatter	enumerater   r(   tensorr   r   allr_   any	expand_asr\   r   r?   r}   r   r   r   )rS   r   rX   ry   rc   r   r~   r   r   r   r   r   r   r   rz   r   r   r6   special_image_maskivisual_indicator_idmaskr   s                          r1   rg   zOvis2Model.forward&  s5   & 2C1N-TXT_T_TqTq$8$D $++JjJj 	 -t";<YZZ 7D557	BM# 33[_3`M*88N(5(O(O%!%!:!:+- "; "
 *889K^\M*3D4S4S*T &&$(,GD,E,E,G%8

S`SgSgh- D  88B<D%)<<@@AUAUVD88:1!4"=#67M00-2E2EF "$'  &$%% 
)%+'/!5))
 
 (%77#33!//))2>2J
 	

 QU
 	
r3   NNNNNNNNNNNNr   )r7   r8   r9   _checkpoint_conversion_mappingr   rN   r   r   r(   r;   r   r   r   r5   r   
LongTensorrh   r   boolr   r?   rg   ri   rj   s   @r1   r   r      s   %'"	'{ 	' n'' +, 
;	;	 8  .215.204(,26*.!%)-,0#'26-.L
##d*L
 ''$.L
 t+	L

 &&-L
 L
 ((4/L
   4'L
 $;L
  $;L
 #TkL
 D[L
 ((4/L
 ell*L
  
)	)!L
  L
r3   r   c                        e Zd Zi Zdef fdZedej                  de	e
   deez  fd       Zee	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                   dz  d
ej                  dz  dedz  dej                  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                   z  deez  fd              Z xZS )Ovis2ForConditionalGenerationrJ   c                     t         |   |       t        j                  |j                  |j
                  d      | _        y r   )rM   rN   r   r   rO   r   lm_headrR   s     r1   rN   z&Ovis2ForConditionalGeneration.__init__{  s0     yy!3!3V5F5FUSr3   rX   rz   rY   c                 >     | j                   j                  dd|i|S )NrX   r=   )r   r   )rS   rX   rz   s      r1   r   z0Ovis2ForConditionalGeneration.get_image_features  s#     -tzz,,Q,Q&QQr3   Nr   ry   rc   r   r~   r   r   r   r   r   r   r   c                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
 | j                  d||||||||	|
d|d|}|d   }t	        |t
              rt        | d      n|}| j                  |dd|ddf         }d}|4 | j                  d||| j                   j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Ovis2ForConditionalGeneration

        >>> model = Ovis2ForConditionalGeneration.from_pretrained("thisisiron/Ovis2-2B-hf")
        >>> processor = AutoProcessor.from_pretrained("thisisiron/Ovis2-2B-hf")

        >>> prompt = "<|im_start|>user\n<image>\nDescribe the image.<|im_end|>\n<|im_start|>assistant\n"
        >>> url = "http://images.cocodataset.org/val2014/COCO_val2014_000000537955.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True)[0]
        "user\n\nDescribe the image.\nassistant\nThe image features a brown dog standing on a wooden floor, looking up with"
        ```NT)r   rX   ry   rc   r   r~   r   r   r   r   r   r   )r!   r   r   )lossr!   r   r   r   r   r=   )rJ   r   r   r   r   r   slicer  loss_functionr   r   rC   r   r   r   r   )rS   r   rX   ry   rc   r   r~   r   r   r   r   r   r   r   rz   r   r   slice_indicesr!   r  s                       r1   rg   z%Ovis2ForConditionalGeneration.forward  s7   ` 2C1N-TXT_T_TqTq$8$D $++JjJj 	 $** 
%)%+'/!5)
 
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD +#33!//)) ' ; ;
 	
r3   r  )r7   r8   r9   r	  r   rN   r   r(   r;   r   r   r   r5   r   r   r
  rh   r   r  r   rC   rg   ri   rj   s   @r1   r  r  w  s   %'"T{ T R!--R9?@R9SR	;	;R R
  .215.204(,26*.!%)-,0#'26-.T
##d*T
 ''$.T
 t+	T

 &&-T
 T
 ((4/T
   4'T
 $;T
  $;T
 #TkT
 D[T
 ((4/T
 ell*T
  
,	,!T
  T
r3   r  )r   r   r  )Dr   dataclassesr   r(   r    r   r   cache_utilsr   
generationr   modeling_outputsr	   r
   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   utils.output_capturingr   aimv2.modeling_aimv2r   r   autor   llama.modeling_llamar   r   llava.modeling_llavar   r   llava_next.modeling_llava_nextr   r   siglip.modeling_siglipr   r   configuration_ovis2r   r    rh   r   r2   r5   r?   rC   rE   rG   rI   rl   rn   rr   Moduler   	Embeddingr   r   r   r   r  __all__r=   r3   r1   <module>r*     s^    !   &   ) K - & I I 7 5 D  9 L j J ? C  ?1K ?  ?	; 		"A 		< 		X 	2 &	> 	6/ 6@ @(DRYY D<8 8i? i*>
+ >
B|
 |
~ c
$A? c
 c
L Rr3   