
    qi                     t   d dl Zd dl mZ d dlmZ d dlZd dlmZ ddlm	Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z-  ed       G d dej\                               Z/	 dFdej\                  dej`                  dej`                  dej`                  dej`                  dz  de1de1fdZ2 G d  d!ej\                        Z3e e!d"#       G d$ d%e                    Z4 G d& d'ej\                        Z5 G d( d)ej\                        Z6 G d* d+ej\                        Z7ejp                  e/d,Z9 G d- d.e      Z: G d/ d0ej\                        Z;e! G d1 d2e             Z<e! G d3 d4e<             Z=e! G d5 d6e             Z> G d7 d8ej\                        Z?e e!d9#       G d: d;e                    Z@ e!d<#       G d= d>e>             ZAe e!d?#       G d@ dAe                    ZB e!dB#       G dC dDe>e             ZCg dEZDy)G    N)Callable)	dataclass   )initialization)ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringtorch_compilable_check	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )	AutoModel   )InternVLConfigInternVLVisionConfigRMSNormc                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	InternVLVisionRMSNormepsreturnNc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zD
        InternVLVisionRMSNorm is equivalent to T5LayerNorm
        N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizer"   	__class__s      `/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/internvl/modeling_internvl.pyr&   zInternVLVisionRMSNorm.__init__/   s1     	ll5::k#:; #    hidden_statesc                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr   T)keepdim)	dtypetor)   float32powmeanrsqrtr,   r+   )r-   r2   input_dtypevariances       r0   forwardzInternVLVisionRMSNorm.forward7   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r1   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler+   shaper,   r-   s    r0   
extra_reprz InternVLVisionRMSNorm.extra_repr>   s*    ))*+6$2G2G1HIIr1   )gư>)
__name__
__module____qualname__floatr&   r)   Tensorr>   rC   __classcell__r/   s   @r0   r!   r!   -   s7    $ $$ $;U\\ ;ell ;Jr1   r!   modulequerykeyvalueattention_maskscalingdropoutc                 x   |}|}	t        j                  ||j                  dd            |z  }
||
|z   }
t        j                  j                  |
d      }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j                  dd      j                         }||
fS )Nr   r   r4   dim)ptrainingr   )	r)   matmul	transposer'   
functionalsoftmaxrQ   rV   
contiguous)rK   rL   rM   rN   rO   rP   rQ   kwargs
key_statesvalue_statesattn_weightsattn_outputs               r0   eager_attention_forwardra   B   s     JL<<z';';Aq'ABWLL!#n4 ==((2(>L==((6??([L,,|\:K''1-88:K$$r1   c                   t     e Zd ZdZdef fdZ	 d	dej                  dej                  dz  dee	   fdZ
 xZS )
InternVLVisionAttentionz+Attention Class for InternVL Vision Encoderconfigc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _
        |j                  }|j                  }d| _        t        j                  | j                  | j                  | j                  z  |j                         | _        t        j                  | j                  | j                  | j                  z  |j                         | _        t        j                  | j                  | j                  | j                  z  |j                         | _        t        j                  | j                  | j                        | _        |dkD  rt        j*                  |      nt        j,                         | _        |rt/        | j                        nt        j,                         | _        |rt/        | j                        | _        y t        j,                         | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fbiasr   )r%   r&   rd   r.   	embed_dimnum_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr'   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentityr!   q_normk_norm)r-   rd   proj_dropoutqk_normr/   s       r0   r&   z InternVLVisionAttention.__init___   s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta?F+DNN;BKKM?F+DNN;BKKMr1   Nr2   rO   r\   c                 r   |j                         \  }}}| j                  |      }| j                  |      }| j                  |      }	| j	                  |      }| j                  |      }|j                  ||| j                  | j                        j                  dd      }|j                  ||| j                  | j                        j                  dd      }|	j                  ||| j                  | j                        j                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|f| j                   sdn| j"                  | j$                  dd|\  }}|j                  ||| j&                        }| j)                  |      }| j+                  |      }||fS )Nr   r           F)rQ   rP   rq   )sizert   ru   rv   rz   r{   reshaperj   rk   rX   viewr   get_interfacerd   _attn_implementationra   rV   rn   rm   rh   rw   ro   )r-   r2   rO   r\   
batch_sizeseq_len_query_statesr]   r^   attention_interfacer`   r_   outputs                 r0   r>   zInternVLVisionAttention.forward{   s    "/!3!3!5
GQ{{=1[[/
{{=1{{<0[[,
#++JQUQ^Q^_iijkmno''
GT^^T]][eefgijk
#((Wdnndmm\ffghjkl(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HJJ
%
 
%
!\ "))*gt~~N&&{3((0|##r1   N)rD   rE   rF   __doc__r   r&   r)   rH   r   r   r>   rI   rJ   s   @r0   rc   rc   \   sN    5Z3 Z> /3'$||'$ t+'$ +,	'$r1   rc   z7
    Class for outputs of [`InternVLVisionModel`].
    custom_introc                       e Zd ZdZy)$InternVLVisionModelOutputWithPoolingaF  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    N)rD   rE   rF   r    r1   r0   r   r      s    r1   r   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )InternVLVisionPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                 ^   t         |           |j                  |j                  }}|j                  |j
                  }}|d   |d   z  |d   |d   z  z  }|d   |d   z  |d   |d   z  f}|| _        || _        || _        || _        || _        t        j                  ||||      | _
        y )Nr   r   )kernel_sizestride)r%   r&   
image_size
patch_sizenum_channelsr.   num_patchespatch_shaper'   Conv2d
projection)	r-   rd   r   r   r   r.   r   r   r/   s	           r0   r&   z&InternVLVisionPatchEmbeddings.__init__   s    !'!2!2F4E4EJ
$*$7$79K9Kk!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L+:^hir1   pixel_valuesr#   c                    |j                   \  }}}}|| j                  k7  rt        d      | j                  |j	                  | j                  j
                  j                              }|j                  d      j                  dd      }|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   )	rA   r   rl   r   r7   r+   r6   flattenrX   )r-   r   r   r   heightwidth
embeddingss          r0   r>   z%InternVLVisionPatchEmbeddings.forward   s    2>2D2D/
L&%4,,,w  __\__T__5K5K5Q5Q%RS
''*44Q:
r1   )	rD   rE   rF   r   r&   r)   rH   r>   rI   rJ   s   @r0   r   r      s)    j
ELL 
U\\ 
r1   r   c                        e Zd ZdZdeddf fdZdej                  dededej                  fd	Z		 dd
ej                  dej                  dz  dej                  fdZ xZS )InternVLVisionEmbeddingszc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    rd   r#   Nc                 2   t         |           t        j                  t	        j
                  dd|j                              | _        |j                  r:t        j                  t	        j
                  dd|j                              | _	        nd | _	        t        |      | _        |j                  | _        t        |j                  t        j                   j"                        r|j                  n|j                  |j                  f| _        | j                  j$                  }|j&                  r=t        j                  t	        j
                  d|dz   |j                              | _        nd | _        t        j*                  |j,                        | _        y )Nr   )r%   r&   r'   r(   r)   zerosr.   	cls_tokenuse_mask_token
mask_tokenr   patch_embeddingsr   
isinstancer   collectionsabcIterabler    use_absolute_position_embeddingsposition_embeddingsrx   hidden_dropout_probrQ   )r-   rd   r   r/   s      r0   r&   z!InternVLVisionEmbeddings.__init__   s$   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO =f E ++ &++[__-E-EF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r1   r   r   r   c                    |j                   d   dz
  }| j                  j                   d   dz
  }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  ddddf   }| j                  ddddf   }|j                   d   }|| j
                  d   z  }	|| j
                  d   z  }
t        |dz        }|j                  d|||      }|j                  dddd      }t        j                  j                  ||	|
fdd	
      }|j                  dddd      j                  dd|      }t        j                  ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr4   r         ?r   r   bicubicF)r   modealign_cornersrS   )rA   r   r)   jit
is_tracingr   r   r   permuter'   rY   interpolater   cat)r-   r   r   r   r   num_positionsclass_pos_embedpatch_pos_embedrT   
new_height	new_widthsqrt_num_positionss               r0   interpolate_pos_encodingz1InternVLVisionEmbeddings.interpolate_pos_encoding   sj    !&&q)A-0066q9A= yy##%+*F6UZ?+++221bqb59221ab59r"tq11
T__Q//	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr1   r   bool_masked_posc                    |j                   \  }}}}| j                  |      }|j                         \  }}}|K| j                  j	                  ||d      }	|j                  d      j                  |	      }
|d|
z
  z  |	|
z  z   }| j                  j	                  |dd      }t        j                  ||fd      }| j                  || j                  |||      z   }| j                  |      }|S )Nr4   r   rS   )rA   r   r   r   expand	unsqueezetype_asr   r)   r   r   r   rQ   )r-   r   r   r   r   r   r   r   r   mask_tokensw
cls_tokenss               r0   r>   z InternVLVisionEmbeddings.forward  s    
 +001fe**<8
!+!2
GQ&//00WbIK))"-55kBA#q1u-a?J^^**:r2>
YY
J7Q?
##/#d&C&CJPVX]&^^J\\*-
r1   r   )rD   rE   rF   r   r   r&   r)   rH   intr   
BoolTensorr>   rI   rJ   s   @r0   r   r      s    
>3 > >,&D5<< &D &DUX &D]b]i]i &DV 48ll ))D0 
	r1   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )InternVLVisionMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )r%   r&   rd   r   
hidden_actactivation_fnr'   rr   r.   intermediate_sizefc1fc2r-   rd   r/   s     r0   r&   zInternVLVisionMLP.__init__8  sd    #F$5$5699V//1I1IJ99V55v7I7IJr1   r2   r#   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )r-   r2   s     r0   r>   zInternVLVisionMLP.forward?  s4    /**=9/r1   )rD   rE   rF   r&   r)   rH   r>   rI   rJ   s   @r0   r   r   7  s$    KU\\ ell r1   r   )
layer_normrms_normc                        e Zd ZdZdeddf fdZdej                  deej                     eej                  ej                  f   z  fdZ	 xZ
S )InternVLVisionLayerz?This corresponds to the Block class in the timm implementation.rd   r#   Nc                    t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |j                     |j                  |j                        | _        t        |j                     |j                  |j                        | _        |j                  }t        j                   |t#        j$                  |j                        z  d      | _        t        j                   |t#        j$                  |j                        z  d      | _        t        j*                  |j,                        | _        y )Nr   r"   T)requires_grad)r%   r&   chunk_size_feed_forwardseq_len_dimrc   	attentionr   mlpNORM2FN	norm_typer.   layer_norm_epslayernorm_beforelayernorm_afterlayer_scale_init_valuer'   r(   r)   r*   lambda_1lambda_2rx   r   rQ   )r-   rd   init_valuesr/   s      r0   r&   zInternVLVisionLayer.__init__L  s    '-'E'E$08$V, '(8(8 9&:L:LRXRgRg h&v'7'789K9KQWQfQfg33[5::f>P>P3Q%Qaef[5::f>P>P3Q%Qaefzz&"<"<=r1   r2   c                    | j                  | j                  |            \  }}| j                  |z  }||z   }| j                  |      }| j	                  |      }| j                  |      }| j                  | j                  |z  }||z   }|S r   )r   r   r   r   r   rQ   r   )r-   r2   attention_outputr   layer_outputs        r0   r>   zInternVLVisionLayer.forward[  s     #nn!!-0
!  ==+;; )=8 ++M:xx-||L1==$==<7L $m3r1   )rD   rE   rF   r   r   r&   r)   rH   r@   r>   rI   rJ   s   @r0   r   r   I  sU    I>3 > >|| 
u||	uU\\5<<%?@	@r1   r   c                   R     e Zd Zdeddf fdZdej                  deez  fdZ	 xZ
S )InternVLVisionEncoderrd   r#   Nc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r%   r&   rd   r'   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r-   rd   ir/   s      r0   r&   zInternVLVisionEncoder.__init__x  sO    ]]vOgOgIh#iA$7$?#ij
&+# $js   A#r2   c                 L    | j                   D ]
  } ||      } t        |      S )N)last_hidden_state)r   r   )r-   r2   layer_modules      r0   r>   zInternVLVisionEncoder.forward~  s3     !JJ 	8L(7M	8 +
 	
r1   )rD   rE   rF   r   r&   r)   rH   r@   r   r>   rI   rJ   s   @r0   r   r   w  s7    ,3 , ,	
||	
 
	 	
r1   r   c                        e Zd ZU eed<   dZdZdZdZdgZ	dZ
dZdZdZeedZ ej$                          fd       Z xZS )	InternVLVisionPreTrainedModelrd   internvl_visionr   )imagevideoTr   )r2   
attentionsc                 $   t         |   |       t        |t              rwt	        j
                  |j                         |j                  t	        j
                  |j                         |j                   t	        j
                  |j                         yyt        |t              rit	        j                  |j                  | j                  j                         t	        j                  |j                  | j                  j                         yy)zInitialize the weightsN)r%   _init_weightsr   r   initzeros_r   r   r   r   	constant_r   rd   r   r   )r-   rK   r/   s     r0   r  z+InternVLVisionPreTrainedModel._init_weights  s     	f%f67KK(()  ,F--.))5F667 6 34NN6??DKK,N,NONN6??DKK,N,NO 5r1   )rD   rE   rF   r   __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   rc   _can_record_outputsr)   no_gradr  rI   rJ   s   @r0   r   r     sn      )$O)&*#./N"& --
 U]]_P Pr1   r   c                        e Zd Zdeddf fdZd Ze ed      e	 dde	j                  d	e	j                  dz  deez  fd
                     Z xZS )InternVLVisionModelrd   r#   Nc                 2   t         |   |       || _        t        |      | _        t        |      | _        |j                  rt        j                         n*t        j                  |j                  |j                        | _        | j                          y )Nr   )r%   r&   rd   r   r   r   encoderuse_mean_poolingr'   ry   	LayerNormr.   r   	layernorm	post_initr   s     r0   r&   zInternVLVisionModel.__init__  so     26:,V4 $44BKKM",,vGYGY_e_t_t:u 	
 	r1   c                 .    | j                   j                  S r   )r   r   rB   s    r0   get_input_embeddingsz(InternVLVisionModel.get_input_embeddings  s    ///r1   F)tie_last_hidden_statesr   r   c                     | j                  ||      }| j                  |      }|d   }| j                  |      }t        ||j                  |j
                        S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        )r   r   )r   r2   r   )r   r  r  r   r2   r   )r-   r   r   r\   embedding_outputencoder_outputssequence_outputs          r0   r>   zInternVLVisionModel.forward  s`      ??<?Y,,'78)!,..93-)77&11
 	
r1   r   )rD   rE   rF   r   r&   r  r   r   r   r)   rH   r   r@   r   r>   rI   rJ   s   @r0   r  r    su    3  0  E2UY
!LL
;@;K;Kd;R
	5	5
  3  
r1   r  c                   <    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZdZy)InternVLPreTrainedModelrd   model)r   textr   Tpast_key_valuesN)rD   rE   rF   r   r  r  r	  r
  _skip_keys_device_placementr  r  _can_compile_fullgraphr  r  r   r1   r0   r"  r"    s=    1&*#"3N!"&r1   r"  c                   *     e Zd Zdef fdZd Z xZS )InternVLMultiModalProjectorrd   c                 *   t         |           t        j                  |j                  j
                  t        d|j                  z        dz  z        | _        t        j                  |j                  j
                  t        d|j                  z        dz  z  |j                  j
                        | _        t        |j                     | _        t        j                  |j                  j
                  |j                  j
                        | _        y )Nr   r   )r%   r&   r'   r  vision_configr.   r   downsample_ratior   rr   text_configlinear_1r   projector_hidden_actactlinear_2r   s     r0   r&   z$InternVLMultiModalProjector.__init__  s    ,,v';';'G'G#aRXRiRiNiJjnoJo'op		  ,,s1v7N7N3N/OST/TTV\VhVhVtVt
 &556		&"4"4"@"@&BTBTB`B`ar1   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r   r.  r0  r1  )r-   image_featuresr2   s      r0   r>   z#InternVLMultiModalProjector.forward  s@    7m4/m4r1   )rD   rE   rF   r   r&   r>   rI   rJ   s   @r0   r)  r)    s    b~ br1   r)  zM
    Base class for InternVL outputs, with hidden states and attentions.
    c                   :    e Zd ZU dZdZej                  dz  ed<   y)InternVLModelOutputWithPasta  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)rD   rE   rF   r   r6  r)   FloatTensorr  r   r1   r0   r5  r5    s    	 59**T18r1   r5  zx
    The InternVL model which consists of a vision backbone and a language model, without a language modeling head.
    c                   V    e Zd ZddiZdef fdZd Zd Zee	 e
d      	 	 dd
ej                  deee   z  d	z  ded	z  dee   deez  f
d                     Zdej*                  dej                  dej                  fdZee
	 	 	 	 	 	 	 	 	 ddej*                  d	z  d
ej                  d	z  dej.                  d	z  dej*                  d	z  ded	z  dej                  d	z  deee   z  d	z  ded	z  dej*                  d	z  dee   deez  fd              Zddej.                  defdZ xZS )InternVLModel^language_model.modellanguage_modelrd   c                     t         |   |       t        j                  |j                        | _        t        |      | _        t        j                  |j                        | _	        | j                          y r   )r%   r&   r   from_configr+  vision_towerr)  multi_modal_projectorr-  r;  r  r   s     r0   r&   zInternVLModel.__init__  sY     %11&2F2FG%@%H"'33F4F4FGr1   c                 6    | j                   j                         S r   )r;  r  rB   s    r0   r  z"InternVLModel.get_input_embeddings  s    ""7799r1   c                 :    | j                   j                  |       y r   )r;  set_input_embeddingsr-   rN   s     r0   rB  z"InternVLModel.set_input_embeddings!  s    007r1   zWObtains image last hidden states from the vision tower and apply multimodal projection.r   Nr   vision_feature_layervision_feature_select_strategyr\   r#   c                 &   |j                  | j                        }| j                  j                  }|dk7  rd|d<    | j                  d|dd|}|dk(  r|j
                  }n|j                  |   }|dk(  r|ddddddf   }|j                  d   }t        |d	z        }	|j                  d
   }
|j                  |
|	|	d      }| j                  ||      }|j                  |
d|j                  d         }| j                  |      }||_        |S )a!  
        pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
            The tensors corresponding to the input images.
        vision_feature_layer (`int` or `list[int]`):
            Layer index or list of layer indices to extract features from.
        )r6   r4   Toutput_hidden_states)r   return_dictdefaultNr   r   r   )scale_factorr   )r7   r6   rd   r,  r>  r   r2   rA   r   r   pixel_shuffler?  pooler_output)r-   r   rD  rE  r\   r,  vision_outputsvision_featureschannelsfeature_sizer   s              r0   get_image_featuresz InternVLModel.get_image_features$  s:   $ $TZZ8;;772%-1F)****aRVaZ`a2%,>>O,::;OPO)Y6-aQh7O #((+8S=)$**1-
 *11*lLZ\] ,,_K[,\ *11*b/BWBWXZB[\ 44_E'6$r1   	input_idsinputs_embedsr3  c                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )r6   devicer4   r   r   z6Image features and image tokens do not match, tokens: z, features: )r  r)   tensorrd   image_token_idlongrU  allsumrA   r   	expand_asr7   r   numel)r-   rR  rS  r3  special_image_maskn_image_tokensn_image_featuress          r0   get_placeholder_maskz"InternVLModel.get_placeholder_maskW  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r1   rO   position_idsr%  cache_positionc
           	         |d u |d uz  rt        d      | | j                         |      }|k| j                  |||d      j                  }|j	                  |j
                  |j                        }| j                  |||      }|j                  ||      } | j                  d|||||	d|
}t        |j                  |j                  |j                  |j                  |      S d       S )Nz:You must specify exactly one of input_ids or inputs_embedsT)r   rD  rE  rH  )rS  r3  )rO   ra  r%  rS  rb  )r   r%  r2   r   r6  r   )rl   r  rQ  rL  r7   rU  r6   r`  masked_scatterr;  r5  r   r%  r2   r   )r-   rR  r   rO   ra  r%  rS  rD  rE  rb  r\   r3  r]  outputss                 r0   r>   zInternVLModel.forwardo  s<    -t";<YZZ 7D557	BM#!44)%9/M 	 5 
 m  ,..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M%$%% 
)%+')
 
 +%77#33!//))2>2J
 	

 QU
 	
r1   rN  rJ  c           
         |j                         \  }}}}||z  dk7  s||z  dk7  rt        d      |j                  ||t        ||z        t        ||z              }|j	                  dddd      j                         }|j                  |t        ||z        t        ||z        t        ||dz  z              }|j	                  dddd      j                         }|S )a&  Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r   )r   rl   r   r   r   r[   )r-   rN  rJ  r   r   r   rO  s          r0   rK  zInternVLModel.pixel_shuffle  s     />.B.B.D+
E68L A%)=)Bjkk *..s6L#893x,?V;W
 *11!Q1=HHJ *..F\12C8L4MsS[_kmn_nSoOp

 *11!Q1=HHJr1   NN)	NNNNNNNNN)r   )rD   rE   rF   _checkpoint_conversion_mappingr   r&   r  rB  r   r   r   r)   r7  r   liststrr   r   r@   r   rQ  
LongTensorr`  rH   r   r5  r>   rG   rK  rI   rJ   s   @r0   r9  r9    s    	!"2&"~ :8 n 8<59	,'', "DIo4, ),d
	,
 +,, 
+	+,   
,\"))":?:K:K"]b]n]n"0  .215.204(,267;5926/
##d*/
 ''$./
 t+	/

 &&-/
 /
 ((4//
 "DIo4/
 ),d
/
 ((4//
 +,/
 
,	,/
  /
b!U\\ ! !r1   r9  zT
    Base class for InternVL causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	InternVLCausalLMOutputWithPasta4  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitsr%  r2   r   r6  )rD   rE   rF   r   rn  r)   r7  r  ro  r%  r   r2   r@   r   r6  r   r1   r0   rm  rm    s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r1   rm  zV
    The INTERNVL model which consists of a vision backbone and a language model.
    c                    f    e Zd ZdddddZddiZdef fd	Zd
 Zd Zde	j                  fdZe	 	 ddej                  deee   z  dz  dedz  dee   deez  f
d       Zee	 	 	 	 	 	 	 	 	 	 	 	 d dej0                  dz  dej                  dz  dej2                  dz  dej0                  dz  dedz  dej                  dz  deee   z  dz  dedz  dej0                  dz  dej0                  dz  deej2                  z  dej2                  dz  dee   deez  fd              Z	 	 	 	 	 	 	 d! fd	Z xZS )" InternVLForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)r:  z^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightz(model.language_model.embed_tokens.weightrd   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NFrf   )r%   r&   r9  r#  r'   rr   r-  r.   
vocab_sizerr  r  r   s     r0   r&   z)InternVLForConditionalGeneration.__init__  sS     "6*
yy!3!3!?!?ASASA^A^ejkr1   c                 6    | j                   j                         S r   )r#  r  rB   s    r0   r  z5InternVLForConditionalGeneration.get_input_embeddings  s    zz..00r1   c                 :    | j                   j                  |       y r   )r#  rB  rC  s     r0   rB  z5InternVLForConditionalGeneration.set_input_embeddings  s    

''.r1   r#   c                     | j                   S r   )rr  rB   s    r0   get_output_embeddingsz6InternVLForConditionalGeneration.get_output_embeddings  s    ||r1   Nr   rD  rE  r\   c                 B     | j                   j                  d|||d|S )N)r   rD  rE  r   )r#  rQ  )r-   r   rD  rE  r\   s        r0   rQ  z3InternVLForConditionalGeneration.get_image_features  s5     -tzz,, 
%!5+I
 	
 	
r1   rR  rO   ra  r%  rS  labelsrb  logits_to_keepimage_sizesc                     | j                   d|||||||||
|d
|}|d   }t        |t              rt        | d      n|}| j	                  |dd|ddf         }d}|	4 | j
                  d||	| j                  j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )ac  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```)
rR  r   rO   ra  r%  rS  rD  rE  rb  r|  r   N)ro  rz  rt  )rn  ro  r%  r2   r   r6  r   )r#  r   r   slicerr  loss_functionrd   r-  rt  rm  r%  r2   r   r6  )r-   rR  r   rO   ra  r%  rS  rD  rE  rz  rb  r{  r|  r\   re  r2   slice_indicesro  rn  s                      r0   r>   z(InternVLForConditionalGeneration.forward  s    j $** 
%)%+'!5+I)#
 
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD .#33!//)) ' ; ;
 	
r1   c	           
      h    t        |   |f||||||d|	}
|s|	j                  dd      s||
d<   |
S )N)r%  rS  rO   rb  r{  is_first_iteration	use_cacheTr   )r%   prepare_inputs_for_generationget)r-   rR  r%  rS  r   rO   rb  r{  r  r\   model_inputsr/   s              r0   r  z>InternVLForConditionalGeneration.prepare_inputs_for_generationg  s\     w<	
+')))1	
 	
 VZZT%B
 ,8L(r1   rg  )NNNNNNNNNNr   N)NNNNNNF)rD   rE   rF   rh  _tied_weights_keysr   r&   r  rB  r'   Modulerx  r   r)   r7  r   ri  rj  r   r   r@   r   rQ  r   rk  rH   r   rm  r>   r  rI   rJ   s   @r0   rq  rq    s8    #9.#@$-	&" +,VW~ 1/ryy   8<59	
''
 "DIo4
 ),d
	

 +,
 
+	+
 
  .215.204(,267;59*.26-.+/S
##d*S
 ''$.S
 t+	S

 &&-S
 S
 ((4/S
 "DIo4S
 ),d
S
   4'S
 ((4/S
 ell*S
 \\D(S
 +,S
 
/	/S
  S
p     r1   rq  )r   r  r"  r9  rq  )r   )Ecollections.abcr   r   dataclassesr   r)   torch.nnr'    r   r  activationsr   cache_utilsr   
generationr	   integrationsr
   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr   autor   configuration_internvlr   r   r  r!   rH   rG   ra   rc   r   r   r   r   r  r   r   r   r   r  r"  r)  r5  r9  rm  rq  __all__r   r1   r0   <module>r     s  ,  $ !   & !   ) 7 9 d d F & g g I 5  H Y'JBII J (J6 %II%<<% 
% <<	%
 LL4'% % %4F$bii F$R 
+E   BII  J[ryy [|		  3H
I+4 +\
BII 
& PO P P@ &
7 &
 &
R 'o ' '")) $ 
9"9 9 9 
r+ r
rj 
9[ 9 90 
^'> ^
^Br1   