
    qi3                     T   d dl Z d dl mZ d dlmZmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ  ej0                  e      Z G d de      Z G d de      Z G d dej:                        Z G d de      Z G d de      Z  G d de      Z!g dZ"y)    N)nn)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )ACT2FN)Cache)BaseModelOutputWithPooling)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuple   )VipLlavaConfigc                       e Zd Zy)VipLlavaModelOutputWithPastN__name__
__module____qualname__     _/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/vipllava/modular_vipllava.pyr   r   &       r   r   c                       e Zd Zy)VipLlavaCausalLMOutputWithPastNr   r   r   r   r   r   *   r   r   r   c                   *     e Zd Zdef fdZd Z xZS )VipLlavaMultiModalProjectorconfigc                 H   t         |           t        |j                  t              rdnt        |j                        }t        j                  ||j                  j                  z  |j                        | _        t        j                  ||j                  j                  z  |j                  j                  d      | _        t        |j                      | _        t        j                  |j                  j                  |j                  j                  d      | _        y )Nr   )epsT)bias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r
   projector_hidden_actactlinear_2)selfr"   num_feature_layers	__class__s      r   r'   z$VipLlavaMultiModalProjector.__init__/   s    ",V-I-I3"OQUXY_YuYuUv#%<<!5!5!A!AAvGeGe$
  		!5!5!A!AA**

 &556		&"4"4"@"@&BTBTB`B`gklr   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S N)r0   r3   r5   r6   )r7   hidden_statess     r   forwardz#VipLlavaMultiModalProjector.forward>   sB    00?m4/m4r   )r   r   r   r   r'   r=   __classcell__)r9   s   @r   r!   r!   .   s    m~ mr   r!   c                       e Zd Zy)VipLlavaPreTrainedModelNr   r   r   r   r@   r@   F   r   r   r@   c                      e Zd Ze ed      	 	 ddej                  deee   z  dz  de	dz  de
e   deez  f
d	              Ze	 	 	 	 	 	 	 	 	 	 	 	 dd
ej                  dz  dej                  dz  dej                   dz  dej                  dz  dedz  dej                  dz  deee   z  dz  de	dz  de	dz  de	dz  de	dz  dej                  dz  deez  fd       Zy)VipLlavaModelzWObtains image last hidden states from the vision tower and apply multimodal projection.)custom_introNpixel_valuesr)   output_hidden_stateskwargsreturnc                 j   ||n| j                   j                  } | j                  |fddd|}t        |t              r|j
                  |   ddddf   }n<|D cg c]  }|j
                  |   ddddf    }}t        j                  |d      }| j                  |      }||_	        |S c c}w )\  
        pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
            The tensors corresponding to the input images.
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        NT)rE   return_dictr   )dim)
r"   r)   vision_towerr(   r*   r<   torchcatmulti_modal_projectorpooler_output)r7   rD   r)   rE   rF   image_outputsimage_featuresindexs           r   get_image_featuresz VipLlavaModel.get_image_featuresK   s    & &;%F!DKKLmLm 	 *))
!%
 	
 +S1*889NOPQSTSUPUVN VkkEm99%@ABGkNk"YY~2>N33NC&4# ls   B0	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsrJ   cache_positionc                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|du |duz  rt        d      | | j                         |      }|j| j                  ||d      j                  }|j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d||||||	|
d|d	|}t        |j                   |j"                  |j$                  |j&                  |nd      }|r|S |j)                         S )	z
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        Nz:You must specify exactly one of input_ids or inputs_embedsT)rD   r)   rJ   )rZ   rS   )	rW   rX   rY   rZ   r[   r\   rE   rJ   r]   )last_hidden_staterY   r<   
attentionsimage_hidden_statesr   )r"   r\   rE   use_return_dictr)   
ValueErrorget_input_embeddingsrU   rQ   todevicedtypeget_placeholder_maskmasked_scatterlanguage_modelr   r_   rY   r<   r`   to_tuple)r7   rV   rD   rW   rX   rY   rZ   r)   r[   r\   rE   rJ   r]   	lm_kwargsrS   special_image_maskoutputsoutputs                     r   r=   zVipLlavaModel.forwardt   s   , 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%:%F!DKKLmLm 	 -t";<YZZ 7D557	BM#!44)AVdh 5 m  ,..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M%$%% 
)%+'/!5)
 
 -%77#33!//))2>2JPT
 %v;&//*;;r   )NN)NNNNNNNNNNNN)r   r   r   r   r   rN   FloatTensorr*   listboolr   r   tupler   rU   
LongTensorTensorr   r   r=   r   r   r   rB   rB   J   s   n 9=,0	#''#  #T#Y5# #Tk	#
 +,# 
+	+# #J  .215.204(,268<!%)-,0#'26B<##d*B< ''$.B< t+	B<
 &&-B< B< ((4/B<  #T#Y5B< $;B<  $;B< #TkB< D[B< ((4/B< 
,	,B< B<r   rB   c                       e Zd Ze	 ddej
                  deee   z  dz  dee	   de
ez  fd       Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej
                  dz  dej                  dz  d	ej                  dz  d
edz  dej
                  dz  deee   z  dz  dej                  dz  dedz  dedz  dedz  dedz  dej                  dz  deej                  z  de
ez  fdZy) VipLlavaForConditionalGenerationNrD   r)   rF   rG   c                 @     | j                   j                  d||d|S )rI   )rD   r)   r   )modelrU   )r7   rD   r)   rF   s       r   rU   z3VipLlavaForConditionalGeneration.get_image_features   s0     -tzz,, 
%=R
V\
 	
r   rV   rW   rX   rY   rZ   labelsr[   r\   rE   rJ   r]   logits_to_keepc                 l   |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||n| j                   j                  } | j
                  d|||||||	||
|d|d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|2| j                  ||| j                   j                  j                        }t        |||j                  |j                  |j                   |j"                        S )a  
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(text=text, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```NT)rV   rD   rW   rX   rY   rZ   r[   r)   r\   rE   rJ   r]   r   )logitsrz   
vocab_size)lossr}   rY   r<   r`   ra   r   )r"   r\   rE   rb   r)   ry   r(   r*   slicelm_headloss_functionr2   r~   r   rY   r<   r`   ra   )r7   rV   rD   rW   rX   rY   rZ   r)   rz   r[   r\   rE   rJ   r]   r{   rl   rn   r<   slice_indicesr}   r   s                        r   r=   z(VipLlavaForConditionalGeneration.forward   s[   l 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%:%F!DKKLmLm 	 $** 
%)%+'"7/!5)
 
   
8B>SV8W~ot4]kmA}a,?@A%%VFt{{OfOfOqOq%rD-#33!//)) ' ; ;
 	
r   r;   )NNNNNNNNNNNNNr   )r   r   r   r   rN   rp   r*   rq   r   r   rs   r   rU   rt   ru   r   rr   r   r=   r   r   r   rw   rw      s    9=
''
  #T#Y5
 +,	

 
+	+
 
& .215.204(,268<*.!%)-,0#'26-._
##d*_
 ''$._
 t+	_

 &&-_
 _
 ((4/_
  #T#Y5_
   4'_
 $;_
  $;_
 #Tk_
 D[_
 ((4/_
 ell*_
" 
/	/#_
r   rw   )rB   rw   r@   )#rN   r   (transformers.models.llava.modeling_llavar   r   r   r   r   activationsr
   cache_utilsr   modeling_outputsr   processing_utilsr   utilsr   r   r   utils.genericr   configuration_vipllavar   
get_loggerr   loggerr   r   Moduler!   r@   rB   rw   __all__r   r   r   <module>r      s       "   : & @ @ - 2 
		H	%	": 		%@ 	")) 0	2 	m<J m<`r
'D r
j [r   