
    qi[k                     j   d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$  ejJ                  e&      Z'e ed       G d de                    Z(e ed       G d de                    Z) G d dejT                        Z+dejX                  dz  dejX                  dz  d edz  fd!Z- e d"d#d$%      	 	 	 	 d7d&e
d$ejX                  d'ejX                  dz  d(ejX                  d)edz  d*ejX                  dz  dejX                  dz  d+ej\                  dz  d,e/dz  d-e/dz  d e0fd.       Z1e G d/ d0e             Z2 ed1       G d2 d3e2             Z3 ed1       G d4 d5e2e             Z4g d6Z5y)8zPyTorch PaliGemmamodel.    )Callable)	dataclassN)nn   )Cache)PreTrainedConfig)GenerationMixin)create_masks_for_generate)FlashAttentionKwargs)BaseModelOutputWithPastBaseModelOutputWithPooling)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)deprecate_kwarg   )	AutoModel   )PaliGemmaConfigzN
    Base class for Paligemma outputs, with hidden states and attentions.
    custom_introc                   :    e Zd ZU dZdZej                  dz  ed<   y)PaligemmaModelOutputWithPasta  
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__     b/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/paligemma/modeling_paligemma.pyr   r   .   s     59**T18r(   r   zU
    Base class for PaliGemma causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	PaliGemmaCausalLMOutputWithPasta8  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r    r!   r"   r#   r,   r$   r%   r&   r-   r.   r   r/   tupler0   r   r'   r(   r)   r+   r+   >   s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r(   r+   c                   *     e Zd Zdef fdZd Z xZS )PaliGemmaMultiModalProjectorconfigc                     t         |           t        j                  |j                  j
                  |j                  j                  d      | _        y )NTbias)super__init__r   Linearvision_confighidden_sizeprojection_dimlinearselfr4   	__class__s     r)   r9   z%PaliGemmaMultiModalProjector.__init__]   s;    ii 4 4 @ @&BVBVBeBelpqr(   c                 (    | j                  |      }|S N)r>   )r@   image_featuresr/   s      r)   forwardz$PaliGemmaMultiModalProjector.forwarda   s    N3r(   )r    r!   r"   r   r9   rE   __classcell__rA   s   @r)   r3   r3   \   s    r rr(   r3   token_type_idsimage_group_idsreturnc           
      Z      ydt         dt         dt         dt         dt        f
 fd}|S )z
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    N	batch_idxhead_idxq_idxkv_idxrJ   c                 :   t        j                  |j                  d   k  |d      }t        j                  |j                  d   k  |d      }| |f   }t        j                  |j                  d   k  |d      }| |f   }t        j                  |j                  d   k  |d      }| |f   }t        j                  |j                  d   k  |d      }| |f   }	t        j                  |j                  d   k  |	d      }	|dk(  |dk(  z  }
||	k(  }|
|z  S )Nr   r   )r$   whereshape)rL   rM   rN   rO   
safe_q_idxsafe_kv_idxtoken_type_ids_at_q_idxtoken_type_ids_at_kv_idximage_group_ids_at_q_idximage_group_ids_at_kv_idxis_image_blocksame_image_blockrI   rH   s               r)   
inner_maskz0token_type_ids_mask_function.<locals>.inner_masks   sM    [[)=)=a)@!@%K
kk&>+?+?+B"BFAN"0J1F"G"'++en6J6J16M.MOfhi"j#1)[2H#I #(;;v8L8LQ8O/OQikl#m #29j3H#I #(;;u7L7LQ7O/OQikm#n $3I{4J$K!$)KK9N9Nq9Q0QSlnp$q!1Q6;SWX;XY37PP  000r(   )intbool)rH   rI   r\   s   `` r)   token_type_ids_mask_functionr_   g   s>     1c 1S 1 1c 1d 12 r(   input_embeds5.6.0inputs_embedsversionnew_namer4   attention_maskcache_positionr.   position_idspixel_valuesis_trainingis_first_iterationc
                    |r|t        d      | j                         |||||d}|	r|	n|du xs |j                   xs |du}	|	s|
j                  dd      s<|d|z
  }n4t        j                  d       t        j                  |      dddddf   }||	r|dk(  j                  |j                        }t        j                  j                  |d	d
      ddddf   }|| z  }t        j                  |j                         d      dz
  }t        j                  ||t        j                   |d            }t#        |j                  |j                        |      |d<   t%        di |S )a"  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Paligemma uses a bidirectional mask on the prompt tokens.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz;`token_type_ids` is required as a model input when training)r4   rb   rf   rg   r.   rh   	use_cacheTr   zIt is a prefill stage but The `token_type_ids` is not provided. We recommend passing `token_type_ids` to the model to prevent bad attention masking.r   )r   r   )valuerQ   )dimor_mask_functionr'   )
ValueErrorget_text_configis_initializedgetloggerwarning_oncer$   	ones_liketodevicer   
functionalpadcumsumr]   rR   	full_liker_   r
   )r4   rb   rf   rg   r.   rh   rH   ri   rj   rk   kwargsmask_kwargsis_imageis_previous_imagenew_image_startrI   s                   r)   create_causal_mask_mappingr      s   ( ~-VWW ((*&((*$K  	%g_-K-K)Kg|cgOg  K!>% /NZ
 #__];Aq!GDN
 !&8 #a'++N,A,ABMM--ha-HCRCP"&7%77,,':':'<!DqH++hQ_acAde*Fn334o+
&' %3{33r(   c                   B    e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZdZdZdZy)	PaliGemmaPreTrainedModelr4   model)imagetextTr3   r.   FN)r    r!   r"   r   r&   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraph_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr'   r(   r)   r   r      sF    (&*#78"3"N"&r(   r   z|
    The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,
    c            "       P    e Zd ZddiZdZdef fdZd Zd Ze	 e
d	      d
ej                  dee   deez  fd              Zdej$                  dej                  dej                  fdZe	e
	 	 	 	 	 	 	 	 	 	 	 	 	 ddej$                  dz  d
ej                  dz  dej(                  dz  dej$                  dz  dedz  dej$                  dz  dej$                  dz  dej                  dz  dej$                  dz  dedz  dedz  dedz  dedz  dee   deez  fd              Z xZS )PaliGemmaModelzlanguage_model.modellanguage_modelFr4   c                    t         |   |       t        j                  |j                        | _        t        |      | _        |j                  j                  | _	        t        j                  |j                        }|| _
        | j                  j                         j                  xs | j                  | _        | j                          y )N)r4   )r8   r9   r   from_configr;   vision_towerr3   multi_modal_projectortext_config
vocab_sizer   r4   rr   dtypetext_config_dtype	post_init)r@   r4   r   rA   s      r)   r9   zPaliGemmaModel.__init__   s     %119M9MN%A&%I" ,,77"..f6H6HI,!%!<!<!>!D!D!R

r(   c                 6    | j                   j                         S rC   )r   get_input_embeddingsr@   s    r)   r   z#PaliGemmaModel.get_input_embeddings   s    ""7799r(   c                 :    | j                   j                  |       y rC   )r   set_input_embeddingsr@   rn   s     r)   r   z#PaliGemmaModel.set_input_embeddings  s    007r(   zWObtains image last hidden states from the vision tower and apply multimodal projection.r   ri   r~   rJ   c                      | j                   |fddi|}|j                  }| j                  |      }|| j                  j                  j
                  dz  z  }||_        |S )Nreturn_dictTg      ?)r   last_hidden_stater   r4   r   r<   pooler_output)r@   ri   r~   image_outputsselected_image_featurerD   s         r)   get_image_featuresz!PaliGemmaModel.get_image_features  sk     *)),SDSFS!.!@!@334JK'4;;+B+B+N+NPS+ST&4#r(   	input_idsrb   rD   c                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )r   ry   rQ   r   r   z6Image features and image tokens do not match, tokens: z, features: )r   r$   tensorr4   image_token_idlongry   allsumrS   	unsqueeze	expand_asrx   r   numel)r@   r   rb   rD   special_image_maskn_image_tokensn_image_featuress          r)   get_placeholder_maskz#PaliGemmaModel.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r(   Nrf   rh   r.   rH   rg   labelsrm   output_attentionsoutput_hidden_statesr   c                 b   |du |duz  rt        d      ||n| j                  j                  }||n| j                  j                  }||n| j                  j                  }|R| j                  j
                  | j                  k\  r/|| j                  j
                  k(  }|j                         }d||<   n|}| | j                         |      }|F||j                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      dz   }|i| j                  |d      j                   }|j#                  |j                  |j$                        }| j'                  |||      }|j)                  ||      }t+        |x}t,              s(t/        | j                  |||||||| j0                  		      } | j2                  d|||||
||d|d
	|}t5        |j6                  |j8                  |j:                  |j<                  |      S d      S )  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

        >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nz:You must specify exactly one of input_ids or inputs_embedsr   r   )ry   T)r   )rb   rD   )rj   )	rf   rh   r.   rb   rm   r   r   r   rg   )r   r.   r/   r0   r   r'   )rq   r4   r   r   use_return_dictr   r   cloner   get_seq_lengthr$   arangerS   ry   r   r   r   rx   r   r   masked_scatter
isinstancedictr   trainingr   r   r   r.   r/   r0   )r@   r   ri   rf   rh   r.   rH   rg   rb   r   rm   r   r   r   r~   r   llm_input_idspast_seen_tokensrD   causal_mask_mappingoutputss                        r)   rE   zPaliGemmaModel.forward.  s}   b -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]  T[[%?%?4??%R!*dkk.H.H!H%OO-M01M,-%M 7D557FM!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6:L #!44\t4TbbN+..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M ?-F"< MM
# &$%% 
.%+'/!5)
 
 ,%77#33!//))2>2J
 	

 QU
 	
r(   )NNNNNNNNNNNNN)r    r!   r"   _checkpoint_conversion_mappingaccepts_loss_kwargsr   r9   r   r   r   r   r$   r%   r   r   r1   r   r   
LongTensorr   Tensorr   r^   r   r   rE   rF   rG   s   @r)   r   r      s
    '=>N%O"
 
:8 n	!--	9?@R9S		+	+	 	"))":?:K:K"]b]n]n"0  .215.204(,262626*.!%)-,0#'v
##d*v
 ''$.v
 t+	v

 &&-v
 v
 ((4/v
 ((4/v
 ((4/v
   4'v
 $;v
  $;v
 #Tkv
 D[v
 -.v
  
-	-!v
  v
r(   r   c            $           e Zd ZdddddZddiZdef fd	Zd
 Zd Ze	de
j                  dee   fd       Zee		 	 	 	 	 	 	 	 	 	 	 	 	 	 d%de
j                   dz  de
j                  dz  de
j"                  dz  de
j                   dz  dedz  de
j                   dz  de
j                   dz  de
j                  dz  de
j                   dz  dedz  dedz  dedz  dedz  dee
j"                  z  dee   deez  f d              Z	 	 	 	 	 	 	 	 	 	 	 d& fd	Ze ed d!d"      	 	 d'dede
j"                  de
j"                  dz  de
j"                  dedz  de
j"                  dz  de
j"                  dz  d#edz  defd$              Z xZS )(!PaliGemmaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightz(model.language_model.embed_tokens.weightr4   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NFr6   )r8   r9   r   r   r   r:   r   r<   r   r   r   r?   s     r)   r9   z*PaliGemmaForConditionalGeneration.__init__  sS     #F+
yy!3!3!?!?ASASA^A^ejkr(   c                 6    | j                   j                         S rC   )r   r   r   s    r)   r   z6PaliGemmaForConditionalGeneration.get_input_embeddings  s    zz..00r(   c                 :    | j                   j                  |       y rC   )r   r   r   s     r)   r   z6PaliGemmaForConditionalGeneration.set_input_embeddings  s    

''.r(   ri   r~   c                 <     | j                   j                  |fi |S rC   )r   r   )r@   ri   r~   s      r)   r   z4PaliGemmaForConditionalGeneration.get_image_features  s    ,tzz,,\DVDDr(   Nr   rf   rh   r.   rH   rg   rb   r   rm   r   r   r   logits_to_keeprJ   c                 >   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  } | j                  d||||||||
|	||d|d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|	4 | j                  d||	| j                   j                  j                  d|}t        |||j                  |j                  |j                  |j                         S )r   NT)r   ri   rH   rf   rh   r.   rb   rm   r   r   r   r   rg   r   )r-   r   r   )r,   r-   r.   r/   r0   r   r'   )r4   r   r   r   r   r   r]   slicer   loss_functionr   r   r+   r.   r/   r0   r   )r@   r   ri   rf   rh   r.   rH   rg   rb   r   rm   r   r   r   r   r~   r   r/   slice_indicesr-   r,   s                        r)   rE   z)PaliGemmaForConditionalGeneration.forward  sS   b 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$** 
%))%+'/!5)
 
"  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD /#33!//)) ' ; ;
 	
r(   c                     t        |   |f||||||	|
||d	|}|j                  d      |d   dz   |d<   |s|	s||d<   |S )N)	r.   rb   rf   rh   rg   rm   r   rH   rk   rh   r   ri   )r8   prepare_inputs_for_generationrt   )r@   r   r.   rb   rg   rh   ri   rf   rH   rm   r   r   rk   r~   model_inputsrA   s                  r)   r   z?PaliGemmaForConditionalGeneration.prepare_inputs_for_generation#  s    " w<
+')%)))1
 
 N+7+7+G!+KL( Y+7L(r(   r`   ra   rc   rk   c           
          t        | ||||||fd|i|j                         D 	
ci c]  \  }	}
|	dk7  s|	|
 c}
}	S c c}
}	w )Nrk   ri   )r   items)r4   rb   rf   rg   r.   rh   rH   rk   r~   kvs              r)   r
   z;PaliGemmaForConditionalGeneration.create_masks_for_generateP  s`     *

  2

 !'F1!~2Eq!tF

 
	
 Gs   ==)NNNNNNNNNNNNNr   )NNNNNNNTNNF)NF)r    r!   r"   r   _tied_weights_keysr   r9   r   r   r   r$   r%   r   r   r   r   r   r   r   r^   r]   r1   r+   rE   r   staticmethodr   r   r   r
   rF   rG   s   @r)   r   r     s    "8-"?#,	&" +,VW 1/ Eu/@/@ EFSeLf E E  .215.204(,262626*.!%)-,0#'-.X
##d*X
 ''$.X
 t+	X

 &&-X
 X
 ((4/X
 ((4/X
 ((4/X
   4'X
 $;X
  $;X
 #TkX
 D[X
 ell*X
  +,!X
" 
0	0#X
  X
z  +Z ^WO /3*/
 
||
 t+
 	

 
 llT)
 t+
 !4K
 

 P 
r(   r   )r   r   r   )NNFN)6r#   collections.abcr   dataclassesr   r$   r   cache_utilsr   configuration_utilsr   
generationr	   masking_utilsr
   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   utils.deprecationr   autor   configuration_paligemmar   
get_loggerr    ru   r   r+   Moduler3   r   r_   r%   r^   r   r   r   r   r   __all__r'   r(   r)   <module>r      sO    $ !     3 ) 6 B S - &  1  4 
		H	% 
9#: 9 9 
9k 9 90299 %LL4'%\\D(% _%P ?K +/-1$&*G4G4<<G4 LL4'G4 LL	G4
 T\G4 ,,%G4 LL4'G4 ##d*G4 G4 tG4 
G4 LG4T ' ' ' 
x
- x

x
v 
z
(@/ z

z
z ^r(   