
    qiD                     :   d Z ddlZddlmZ ddlmZ ddlmZ ddlmZm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZmZmZmZmZ ddlmZ  ej0                  e      Ze G d de             Z ed       G d de             Z ed       G d dee             Zg dZy)zPyTorch Fuyu model.    N)nn   )Cache)GenerationMixin)BaseModelOutputWithPoolingCausalLMOutputWithPast)PreTrainedModel)	AutoModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check   )
FuyuConfigc                   <    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZg ZdZy)FuyuPreTrainedModelconfigfuyu)imagetextTpast_key_valuesN)__name__
__module____qualname__r   __annotations__base_model_prefixinput_modalitiessupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modules_skip_keys_device_placement     X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/fuyu/modeling_fuyu.pyr   r       s=    (&*#"&N"3r(   r   zt
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    )custom_introc                   X    e Zd ZddiZdef fdZd Zd Zdej                  de
ej                     d	ej                  d
ej                  fdZeedej                  dee   d
eez  fd              Zdej(                  dej                  dej                  fdZe	 	 	 	 	 	 	 	 	 	 	 ddej(                  dz  dej                  dz  dej                  dz  dej                  dz  dej(                  dz  dedz  dej                  dz  dedz  dedz  dedz  dedz  d
eez  fd       Z xZS )	FuyuModelzlanguage_model.modellanguage_modelr   c                    t         |   |       |j                  | _        |j                  j
                  | _        t        j                  |j                        | _        t        j                  |j                  |j                  z  |j                  z  |j                        | _        d| _        | j!                          y )NF)super__init__pad_token_idpadding_idxtext_config
vocab_sizer
   from_configr-   r   Linear
patch_sizenum_channelshidden_sizevision_embed_tokensgradient_checkpointing	post_initselfr   	__class__s     r)   r0   zFuyuModel.__init__6   s     !.. ,,77'33F4F4FG#%99 1 11F4G4GGI[I[$
  ',#r(   c                 6    | j                   j                         S N)r-   get_input_embeddingsr>   s    r)   rB   zFuyuModel.get_input_embeddingsC   s    ""7799r(   c                 :    | j                   j                  |       y rA   )r-   set_input_embeddingsr>   values     r)   rE   zFuyuModel.set_input_embeddingsF   s    007r(   word_embeddingscontinuous_embeddingsimage_patch_input_indicesreturnc           
         |j                   d   t        |      k(  s't        dt        |      d|j                   d         |j                         }t	        |j                   d         D ]  }t        j                  ||   dk\  d      d   }||   |   }|j                   d   ||   j                   d   kD  r,t        d||   j                   d|j                   d| d	      ||   |   j                  |j                        |||f<    |S )
a  This function places the continuous_embeddings into the word_embeddings at the locations
        indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
        embeddings.

        Args:
            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings.
            continuous_embeddings (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
                Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
                [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
                indices in image_patch_input_indices for that batch element.
            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor.
        r   z7Batch sizes must match! Got len(continuous_embeddings)=z and word_embeddings.shape[0]=T)as_tuplezGNumber of continuous embeddings continuous_embeddings[batch_idx].shape=zA does not match number of continuous token ids src_indices.shape=z in batch element .)	shapelen
ValueErrorclonerangetorchnonzerotodevice)r>   rH   rI   rJ   output_embeddings	batch_idxdst_indicessrc_indicess           r)   gather_continuous_embeddingsz&FuyuModel.gather_continuous_embeddingsI   sZ   (  %%a(C0E,FFJs3H/I.KKjQ`QfQfghQiPkl  ,11344Q78 	I  --(A)(LPQ(Q\`abcdK 4I>{KK  #&;I&F&L&LQ&OO ^7LY7W7]7]6_ `I6A6G6G5II[\e[ffgi  9Ni8XYd8e8h8h!((9i45	  ! r(   pixel_valueskwargsc                 <    | j                  |      }t        |      S )z
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        )last_hidden_state)r:   r   )r>   r]   r^   patch_embeddingss       r)   get_image_featureszFuyuModel.get_image_featuresu   s!      33LA)<LMMr(   	input_idsinputs_embedsimage_featuresc                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        dtyperW   r   r   z6Image features and image tokens do not match, tokens: z, features: )rB   rT   tensorr   image_token_idlongrW   allsumrO   	unsqueeze	expand_asrV   r   numel)r>   rc   rd   re   special_image_maskn_image_tokensn_image_featuress          r)   get_placeholder_maskzFuyuModel.get_placeholder_mask   s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r(   Nimage_patchesimage_patches_indicesattention_maskposition_idsr   	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictc                 z   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||t        d      ||j                  \  }}n||j                  \  }}}nt        d      |i||j                  n|j                  }||j                         nd}t        j                  |||z   t        j                  |      }|j                  d      }|  | j                  j                         |      }|i| j                  |d      j                   }|j#                  |j                  |j$                        }| j'                  |||      }|j)                  ||      } | j                  d	|||||	|
||d|}|S )
a  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        zDYou cannot specify both input_ids and inputs_embeds at the same timez4You have to specify either input_is or inputs_embedsr   rg   T)r}   )rd   re   )rd   rx   ry   r   r{   r|   rz   r}   r'   )r   r{   r|   rz   use_return_dictrQ   rO   rW   get_seq_lengthrT   arangerl   ro   r-   rB   rb   r`   rV   rh   ru   masked_scatter)r>   rc   rv   rw   rx   ry   r   rd   rz   r{   r|   r}   r^   
batch_size
seq_length_rW   past_key_values_lengthra   rr   outputss                        r)   forwardzFuyuModel.forward   s   0 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"%.__"J
&(5(;(;%J
ASTT)2)>Y%%MDXDXFIXId_%C%C%Ejk" <<&
5K(KSXS]S]flL (11!4L FD//DDFyQM$#66}RV6Wii/22=3G3GI\I\]!%!:!:GW "; " *889KM]^M%$%% 

')%+/!5#

 

 r(   )NNNNNNNNNNN)r   r   r   _checkpoint_conversion_mappingr   r0   rB   rE   rT   Tensorlistr\   r   r   FloatTensorr   r   tupler   rb   
LongTensorru   r   boolr   r   __classcell__r?   s   @r)   r,   r,   .   s    '=>N%O"z :8*!*!  $ELL1*! $)<<	*!
 
*!X N!--N9?@R9SN	+	+N  N"))":?:K:K"]b]n]n"0  .2-159.204(,26!%)-,0#'G##d*G ||d*	G
  %||d2G t+G &&-G G ((4/G $;G  $;G #TkG D[G 
'	'G Gr(   r,   zz
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    c                        e Zd ZddddZddiZdef fdZd	 Zd
 Ze	e
	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  dedz  dej                  dz  dedz  dedz  dedz  dedz  deez  fd              Z	 	 	 	 	 	 	 d fd	Z xZS )FuyuForCausalLMzmodel.language_modelzmodel.vision_embed_tokenslm_head)z^language_model.modelz^vision_embed_tokensz^language_model.lm_headzlm_head.weightz(model.language_model.embed_tokens.weightr   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NF)bias)r/   r0   r,   modelr   r6   r3   r9   r4   r   r<   r=   s     r)   r0   zFuyuForCausalLM.__init__   sS     v&
yy!3!3!?!?ASASA^A^ejkr(   c                 6    | j                   j                         S rA   )r   rB   rC   s    r)   rB   z$FuyuForCausalLM.get_input_embeddings   s    zz..00r(   c                 :    | j                   j                  |       y rA   )r   rE   rF   s     r)   rE   z$FuyuForCausalLM.set_input_embeddings   s    

''.r(   Nrc   rv   rw   rx   ry   r   rd   rz   labelsr{   r|   r}   logits_to_keeprK   c                 T   |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                  ||||||||
||d      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|	4 | j                  d||	| j                   j                  j                  d|}t        |||j                  |j                  |j                         S )a  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```NT)rc   rv   rw   rd   rx   ry   r   r{   r|   rz   r}   r   )logitsr   r4   )lossr   r   hidden_states
attentionsr'   )r   r{   r|   rz   r   r   
isinstanceintslicer   loss_functionr3   r4   r   r   r   r   )r>   rc   rv   rw   rx   ry   r   rd   rz   r   r{   r|   r}   r   r^   r   r   slice_indicesr   r   s                       r)   r   zFuyuForCausalLM.forward   sN   p 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]**'"7')%+/!5  
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD &#33!//))
 	
r(   c	                 t    t        |   |f|||||||d|	}
|s|	j                  dd      r
d |
d<   d |
d<   |
S )N)r   rx   rd   rv   rw   cache_positionis_first_iterationrz   Trw   rv   )r/   prepare_inputs_for_generationget)r>   rc   r   rx   rd   rv   rw   r   r   r^   model_inputsr?   s              r)   r   z-FuyuForCausalLM.prepare_inputs_for_generation_  sh     w<

+)''"7)1

 

 "fjjd&C48L01,0L)r(   )NNNNNNNNNNNNr   )NNNNNNF)r   r   r   r   _tied_weights_keysr   r0   rB   rE   r   r   rT   r   r   r   r   r   r   r   r   r   r   r   r   s   @r)   r   r      s    "8 ;#,&"
 +,VWz 1/  .2-159.204(,26!%&*)-,0#'%&^
##d*^
 ||d*	^

  %||d2^
 t+^
 &&-^
 ^
 ((4/^
 $;^
 t#^
  $;^
 #Tk^
 D[^
 d
^
" 
'	'#^
  ^
F "  r(   r   )r   r   r,   )__doc__rT   r   cache_utilsr   
generationr   modeling_outputsr   r   modeling_utilsr	   models.auto.modeling_autor
   processing_utilsr   utilsr   r   r   r   r   configuration_fuyur   
get_loggerr   loggerr   r,   r   __all__r'   r(   r)   <module>r      s         ) R - 2 & j j * 
		H	% 
4/ 
4 
4 
n# n
nb 
U)? U
Up Br(   