
    qib                        d dl Zd dl mZ d dlmZ d dlZd dlmZ ddlm	Z
 ddlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z- ddl.m/Z/m0Z0 	 dBdejb                  dejd                  dejd                  dejd                  dejd                  dz  de3de3fdZ4 G d de'      Z5 G d  d!e%      Z6e ed"#       G d$ d%e                    Z7 G d& d'ejb                        Z8 G d( d)ejb                        Z9 G d* d+e#      Z:ejv                  e5d,Z< G d- d.e      Z= G d/ d0ejb                        Z>e G d1 d2e             Z?e G d3 d4e?             Z@ G d5 d6e-      ZAdZB G d7 d8ejb                        ZC G d9 d:e,      ZD G d; d<e+      ZE G d= d>e)      ZF G d? d@e*      ZGg dAZHy)C    N)Callable)	dataclass   )initialization)ACT2FN)Cache)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstring	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )CLIPMLP)JanusVisionAttention)LlamaRMSNorm)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )InternVLConfigInternVLVisionConfigmodulequerykeyvalueattention_maskscalingdropoutc                 x   |}|}	t        j                  ||j                  dd            |z  }
||
|z   }
t        j                  j                  |
d      }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j                  dd      j                         }||
fS )Nr   r   dim)ptrainingr   )	torchmatmul	transposenn
functionalsoftmaxr'   r-   
contiguous)r!   r"   r#   r$   r%   r&   r'   kwargs
key_statesvalue_statesattn_weightsattn_outputs               _/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/internvl/modular_internvl.pyeager_attention_forwardr;   .   s     JL<<z';';Aq'ABWLL!#n4 ==((2(>L==((6??([L,,|\:K''1-88:K$$    c                       e Zd Zy)InternVLVisionRMSNormN__name__
__module____qualname__ r<   r:   r>   r>   H       r<   r>   c                   p     e Zd Zdef fdZ	 ddej                  dej                  dz  dee   fdZ	 xZ
S )	InternVLVisionAttentionconfigc                    t         |   |       | `d| _        |j                  }|rt        | j                        nt        j                         | _	        |rt        | j                        | _
        y t        j                         | _
        y NF)super__init__num_key_value_groups	is_causaluse_qk_normr>   	embed_dimr1   Identityq_normk_norm)selfrG   qk_norm	__class__s      r:   rK   z InternVLVisionAttention.__init__M   sd     % $$?F+DNN;BKKM?F+DNN;BKKMr<   Nhidden_statesr%   r5   c                 r   |j                         \  }}}| j                  |      }| j                  |      }| j                  |      }	| j	                  |      }| j                  |      }|j                  ||| j                  | j                        j                  dd      }|j                  ||| j                  | j                        j                  dd      }|	j                  ||| j                  | j                        j                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|f| j                   sdn| j"                  | j$                  dd|\  }}|j                  ||| j&                        }| j)                  |      }| j+                  |      }||fS )Nr   r           F)r'   r&   rM   )sizeq_projk_projv_projrQ   rR   reshape	num_headshead_dimr0   viewr   get_interfacerG   _attn_implementationr;   r-   attention_dropoutscalerO   projection_layerprojection_dropout)rS   rV   r%   r5   
batch_sizeseq_len_query_statesr6   r7   attention_interfacer9   r8   outputs                 r:   forwardzInternVLVisionAttention.forwardX   s    "/!3!3!5
GQ{{=1[[/
{{=1{{<0[[,
#++JQUQ^Q^_iijkmno''
GT^^T]][eefgijk
#((Wdnndmm\ffghjkl(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HJJ
%
 
%
!\ "))*gt~~N&&{3((0|##r<   N)r@   rA   rB   r    rK   r.   Tensorr   r   rm   __classcell__rU   s   @r:   rF   rF   L   sK    	Z3 	Z /3'$||'$ t+'$ +,	'$r<   rF   z7
    Class for outputs of [`InternVLVisionModel`].
    custom_introc                       e Zd ZdZy)$InternVLVisionModelOutputWithPoolingaF  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    N)r@   rA   rB   __doc__rC   r<   r:   ru   ru      s    r<   ru   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )InternVLVisionPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                 ^   t         |           |j                  |j                  }}|j                  |j
                  }}|d   |d   z  |d   |d   z  z  }|d   |d   z  |d   |d   z  f}|| _        || _        || _        || _        || _        t        j                  ||||      | _
        y )Nr   r   )kernel_sizestride)rJ   rK   
image_size
patch_sizenum_channelshidden_sizenum_patchespatch_shaper1   Conv2d
projection)	rS   rG   r|   r}   r~   r   r   r   rU   s	           r:   rK   z&InternVLVisionPatchEmbeddings.__init__   s    !'!2!2F4E4EJ
$*$7$79K9Kk!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L+:^hir<   pixel_valuesreturnc                    |j                   \  }}}}|| j                  k7  rt        d      | j                  |j	                  | j                  j
                  j                              }|j                  d      j                  dd      }|S )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   )	shaper~   
ValueErrorr   toweightdtypeflattenr0   )rS   r   rg   r~   heightwidth
embeddingss          r:   rm   z%InternVLVisionPatchEmbeddings.forward   s    2>2D2D/
L&%4,,,w  __\__T__5K5K5Q5Q%RS
''*44Q:
r<   )	r@   rA   rB   rv   rK   r.   ro   rm   rp   rq   s   @r:   rx   rx      s)    j
ELL 
U\\ 
r<   rx   c                        e Zd ZdZdeddf fdZdej                  dededej                  fd	Z		 dd
ej                  dej                  dz  dej                  fdZ xZS )InternVLVisionEmbeddingszc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    rG   r   Nc                 2   t         |           t        j                  t	        j
                  dd|j                              | _        |j                  r:t        j                  t	        j
                  dd|j                              | _	        nd | _	        t        |      | _        |j                  | _        t        |j                  t        j                   j"                        r|j                  n|j                  |j                  f| _        | j                  j$                  }|j&                  r=t        j                  t	        j
                  d|dz   |j                              | _        nd | _        t        j*                  |j,                        | _        y )Nr   )rJ   rK   r1   	Parameterr.   zerosr   	cls_tokenuse_mask_token
mask_tokenrx   patch_embeddingsr}   
isinstancer|   collectionsabcIterabler    use_absolute_position_embeddingsposition_embeddingsDropouthidden_dropout_probr'   )rS   rG   r   rU   s      r:   rK   z!InternVLVisionEmbeddings.__init__   s$   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO =f E ++ &++[__-E-EF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r<   r   r   r   c                    |j                   d   dz
  }| j                  j                   d   dz
  }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  ddddf   }| j                  ddddf   }|j                   d   }|| j
                  d   z  }	|| j
                  d   z  }
t        |dz        }|j                  d|||      }|j                  dddd      }t        j                  j                  ||	|
fdd	
      }|j                  dddd      j                  dd|      }t        j                  ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr)   r         ?r   r   bicubicF)rY   modealign_cornersr*   )r   r   r.   jit
is_tracingr}   r   r]   permuter1   r2   interpolater`   cat)rS   r   r   r   r   num_positionsclass_pos_embedpatch_pos_embedr+   
new_height	new_widthsqrt_num_positionss               r:   interpolate_pos_encodingz1InternVLVisionEmbeddings.interpolate_pos_encoding   sj    !&&q)A-0066q9A= yy##%+*F6UZ?+++221bqb59221ab59r"tq11
T__Q//	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr<   r   bool_masked_posc                    |j                   \  }}}}| j                  |      }|j                         \  }}}|K| j                  j	                  ||d      }	|j                  d      j                  |	      }
|d|
z
  z  |	|
z  z   }| j                  j	                  |dd      }t        j                  ||fd      }| j                  || j                  |||      z   }| j                  |      }|S )Nr)   r   r*   )r   r   rY   r   expand	unsqueezetype_asr   r.   r   r   r   r'   )rS   r   r   ri   r   r   r   rg   rh   mask_tokensw
cls_tokenss               r:   rm   z InternVLVisionEmbeddings.forward   s    
 +001fe**<8
!+!2
GQ&//00WbIK))"-55kBA#q1u-a?J^^**:r2>
YY
J7Q?
##/#d&C&CJPVX]&^^J\\*-
r<   rn   )r@   rA   rB   rv   r    rK   r.   ro   intr   
BoolTensorrm   rp   rq   s   @r:   r   r      s    
>3 > >,&D5<< &D &DUX &D]b]i]i &DV 48ll ))D0 
	r<   r   c                       e Zd Zy)InternVLVisionMLPNr?   rC   r<   r:   r   r     rD   r<   r   )
layer_normrms_normc                        e Zd ZdZdeddf fdZdej                  deej                     eej                  ej                  f   z  fdZ	 xZ
S )InternVLVisionLayerz?This corresponds to the Block class in the timm implementation.rG   r   Nc                    t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |j                     |j                  |j                        | _        t        |j                     |j                  |j                        | _        |j                  }t        j                   |t#        j$                  |j                        z  d      | _        t        j                   |t#        j$                  |j                        z  d      | _        t        j*                  |j,                        | _        y )Nr   epsT)requires_grad)rJ   rK   chunk_size_feed_forwardseq_len_dimrF   	attentionr   mlpNORM2FN	norm_typer   layer_norm_epslayernorm_beforelayernorm_afterlayer_scale_init_valuer1   r   r.   oneslambda_1lambda_2r   r   r'   )rS   rG   init_valuesrU   s      r:   rK   zInternVLVisionLayer.__init__  s    '-'E'E$08$V, '(8(8 9&:L:LRXRgRg h&v'7'789K9KQWQfQfg33[5::f>P>P3Q%Qaef[5::f>P>P3Q%Qaefzz&"<"<=r<   rV   c                    | j                  | j                  |            \  }}| j                  |z  }||z   }| j                  |      }| j	                  |      }| j                  |      }| j                  | j                  |z  }||z   }|S rn   )r   r   r   r   r   r'   r   )rS   rV   attention_outputri   layer_outputs        r:   rm   zInternVLVisionLayer.forward-  s     #nn!!-0
!  ==+;; )=8 ++M:xx-||L1==$==<7L $m3r<   )r@   rA   rB   rv   r    rK   r.   ro   tuplerm   rp   rq   s   @r:   r   r     sU    I>3 > >|| 
u||	uU\\5<<%?@	@r<   r   c                   R     e Zd Zdeddf fdZdej                  deez  fdZ	 xZ
S )InternVLVisionEncoderrG   r   Nc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w rI   )
rJ   rK   rG   r1   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)rS   rG   irU   s      r:   rK   zInternVLVisionEncoder.__init__J  sO    ]]vOgOgIh#iA$7$?#ij
&+# $js   A#rV   c                 L    | j                   D ]
  } ||      } t        |      S )N)last_hidden_state)r   r
   )rS   rV   layer_modules      r:   rm   zInternVLVisionEncoder.forwardP  s3     !JJ 	8L(7M	8 +
 	
r<   )r@   rA   rB   r    rK   r.   ro   r   r
   rm   rp   rq   s   @r:   r   r   I  s7    ,3 , ,	
||	
 
	 	
r<   r   c                        e Zd ZU eed<   dZdZdZdZdgZ	dZ
dZdZdZeedZ ej$                          fd       Z xZS )	InternVLVisionPreTrainedModelrG   internvl_visionr   )imagevideoTr   )rV   
attentionsc                 $   t         |   |       t        |t              rwt	        j
                  |j                         |j                  t	        j
                  |j                         |j                   t	        j
                  |j                         yyt        |t              rit	        j                  |j                  | j                  j                         t	        j                  |j                  | j                  j                         yy)zInitialize the weightsN)rJ   _init_weightsr   r   initzeros_r   r   r   r   	constant_r   rG   r   r   )rS   r!   rU   s     r:   r   z+InternVLVisionPreTrainedModel._init_weightsn  s     	f%f67KK(()  ,F--.))5F667 6 34NN6??DKK,N,NONN6??DKK,N,NO 5r<   )r@   rA   rB   r    __annotations__base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   rF   _can_record_outputsr.   no_gradr   rp   rq   s   @r:   r   r   \  sn      )$O)&*#./N"& --
 U]]_P Pr<   r   c                        e Zd Zdeddf fdZd Ze ed      e	 dde	j                  d	e	j                  dz  deez  fd
                     Z xZS )InternVLVisionModelrG   r   Nc                 2   t         |   |       || _        t        |      | _        t        |      | _        |j                  rt        j                         n*t        j                  |j                  |j                        | _        | j                          y )Nr   )rJ   rK   rG   r   r   r   encoderuse_mean_poolingr1   rP   	LayerNormr   r   	layernorm	post_initrS   rG   rU   s     r:   rK   zInternVLVisionModel.__init__  so     26:,V4 $44BKKM",,vGYGY_e_t_t:u 	
 	r<   c                 .    | j                   j                  S rn   )r   r   )rS   s    r:   get_input_embeddingsz(InternVLVisionModel.get_input_embeddings  s    ///r<   F)tie_last_hidden_statesr   r   c                     | j                  ||      }| j                  |      }|d   }| j                  |      }t        ||j                  |j
                        S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        )r   r   )r   rV   r   )r   r   r  ru   rV   r   )rS   r   r   r5   embedding_outputencoder_outputssequence_outputs          r:   rm   zInternVLVisionModel.forward  s`      ??<?Y,,'78)!,..93-)77&11
 	
r<   rn   )r@   rA   rB   r    rK   r  r   r   r   r.   ro   r   r   ru   rm   rp   rq   s   @r:   r   r   }  su    3  0  E2UY
!LL
;@;K;Kd;R
	5	5
  3  
r<   r   c                       e Zd ZdZy)InternVLPreTrainedModel)r   textr   N)r@   rA   rB   r   rC   r<   r:   r  r    s    1r<   r  c                   *     e Zd Zdef fdZd Z xZS )InternVLMultiModalProjectorrG   c                 *   t         |           t        j                  |j                  j
                  t        d|j                  z        dz  z        | _        t        j                  |j                  j
                  t        d|j                  z        dz  z  |j                  j
                        | _        t        |j                     | _        t        j                  |j                  j
                  |j                  j
                        | _        y )Nr   r   )rJ   rK   r1   r   vision_configr   r   downsample_ratior   Lineartext_configlinear_1r   projector_hidden_actactlinear_2r  s     r:   rK   z$InternVLMultiModalProjector.__init__  s    ,,v';';'G'G#aRXRiRiNiJjnoJo'op		  ,,s1v7N7N3N/OST/TTV\VhVhVtVt
 &556		&"4"4"@"@&BTBTB`B`ar<   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S rn   )r   r  r  r  )rS   image_featuresrV   s      r:   rm   z#InternVLMultiModalProjector.forward  s@    7m4/m4r<   )r@   rA   rB   r   rK   rm   rp   rq   s   @r:   r  r    s    b~ br<   r  c                       e Zd Zy)InternVLModelOutputWithPastNr?   rC   r<   r:   r  r    rD   r<   r  c                      e Zd Zddej                  defdZee e	d      	 	 ddej                  deee   z  dz  d	edz  d
ee   deez  f
d                     Zee		 	 	 	 	 	 	 	 	 ddej&                  dz  dej                  dz  dej                  dz  dej&                  dz  dedz  dej                  dz  deee   z  dz  d	edz  dej&                  dz  d
ee   deez  fd              Zy)InternVLModelvision_featuresscale_factorc           
         |j                         \  }}}}||z  dk7  s||z  dk7  rt        d      |j                  ||t        ||z        t        ||z              }|j	                  dddd      j                         }|j                  |t        ||z        t        ||z        t        ||dz  z              }|j	                  dddd      j                         }|S )a&  Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r   )rY   r   r`   r   r   r4   )rS   r  r   rg   r   r   channelss          r:   pixel_shufflezInternVLModel.pixel_shuffle  s     />.B.B.D+
E68L A%)=)Bjkk *..s6L#893x,?V;W
 *11!Q1=HHJ *..F\12C8L4MsS[_kmn_nSoOp

 *11!Q1=HHJr<   zWObtains image last hidden states from the vision tower and apply multimodal projection.rr   Nr   vision_feature_layervision_feature_select_strategyr5   r   c                 &   |j                  | j                        }| j                  j                  }|dk7  rd|d<    | j                  d|dd|}|dk(  r|j
                  }n|j                  |   }|dk(  r|ddddddf   }|j                  d   }t        |d	z        }	|j                  d
   }
|j                  |
|	|	d      }| j                  ||      }|j                  |
d|j                  d         }| j                  |      }||_        |S )a!  
        pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
            The tensors corresponding to the input images.
        vision_feature_layer (`int` or `list[int]`):
            Layer index or list of layer indices to extract features from.
        )r   r)   Toutput_hidden_states)r   return_dictdefaultNr   r   r   )r   rC   )r   r   rG   r  vision_towerr   rV   r   r   r]   r#  multi_modal_projectorpooler_output)rS   r   r$  r%  r5   r  vision_outputsr  r"  feature_sizerg   s              r:   get_image_featuresz InternVLModel.get_image_features  s:   $ $TZZ8;;772%-1F)****aRVaZ`a2%,>>O,::;OPO)Y6-aQh7O #((+8S=)$**1-
 *11*lLZ\] ,,_K[,\ *11*b/BWBWXZB[\ 44_E'6$r<   	input_idsr%   position_idspast_key_valuesinputs_embedscache_positionc
           	         |d u |d uz  rt        d      | | j                         |      }|k| j                  |||d      j                  }|j	                  |j
                  |j                        }| j                  |||      }|j                  ||      } | j                  d|||||	d|
}t        |j                  |j                  |j                  |j                  |      S d       S )Nz:You must specify exactly one of input_ids or inputs_embedsT)r   r$  r%  r(  )r3  r  )r%   r1  r2  r3  r4  )r   r2  rV   r   image_hidden_statesrC   )r   r  r/  r,  r   devicer   get_placeholder_maskmasked_scatterlanguage_modelr  r   r2  rV   r   )rS   r0  r   r%   r1  r2  r3  r$  r%  r4  r5   r  special_image_maskoutputss                 r:   rm   zInternVLModel.forward  s<    -t";<YZZ 7D557	BM#!44)%9/M 	 5 
 m  ,..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M%$%% 
)%+')
 
 +%77#33!//))2>2J
 	

 QU
 	
r<   )r   )NN)	NNNNNNNNN)r@   rA   rB   r.   ro   floatr#  r   r   r   FloatTensorr   liststrr   r   r   r   r/  
LongTensorr   r  rm   rC   r<   r:   r  r    s   !U\\ ! !F n 8<59	,'', "DIo4, ),d
	,
 +,, 
+	+,   
,\  .215.204(,267;5926/
##d*/
 ''$./
 t+	/

 &&-/
 /
 ((4//
 "DIo4/
 ),d
/
 ((4//
 +,/
 
,	,/
  /
r<   r  c                       e Zd Zy)InternVLCausalLMOutputWithPastNr?   rC   r<   r:   rC  rC  O  rD   r<   rC  c                        e Zd Z fdZ xZS ) InternVLForConditionalGenerationc                  :     t               j                  di |  y)ac  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```NrC   )rJ   rm   )super_kwargsrU   s    r:   rm   z(InternVLForConditionalGeneration.forwardT  s    H 	','r<   )r@   rA   rB   rm   rp   rq   s   @r:   rE  rE  S  s    $( $(r<   rE  )r   r   r  r  rE  )rX   )Icollections.abcr   r   dataclassesr   r.   torch.nnr1    r   r   activationsr   cache_utilsr   modeling_layersr	   modeling_outputsr
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   utils.output_capturingr   clip.modeling_clipr   janus.modeling_janusr   llama.modeling_llamar   llava.modeling_llavar   r   r   r   r   configuration_internvlr   r    Modulero   r=  r;   r>   rF   ru   rx   r   r   r   r   r   r   r   r   r  INTERNVL_INPUTS_DOCSTRINGr  r  r  rC  rE  __all__rC   r<   r:   <module>r]     s     $ !   & !   9 K F & B B I 5 ( 7 /  I %II%<<% 
% <<	%
 LL4'% % %4	L 	3$2 3$l 
+E   BII  J[ryy [|	 	 3H
I+4 +\
BII 
& PO P P@ &
7 &
 &
R22 2 ! ")) $	": 	H
J H
V	%@ 	%('D %(Pr<   