
    qiv                     t   d dl Z d dlmZ d dlmZmZ d dlZd dlmZ d dlm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddl m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z'm(Z(  ejR                  e*      Z+ G d de      Z,e G d de             Z-e G d de             Z. G d dej^                        Z0e G d de(             Z1 ed       G d de'             Z2 G d d e%      Z3e G d! d"e1             Z4 ed#       G d$ d%e1             Z5 ed&       G d' d(e1             Z6g d)Z7y)*    N)	dataclass)AnyLiteral)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)PretrainedConfig)BaseModelOutputMaskedLMOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuple   )CONFIG_MAPPING
AutoConfig	AutoModel)ModernBertPredictionHead)SmolVLMModelSmolVLMPreTrainedModelc                        e Zd ZU dZdZeedZeee	f   e
d<   	 	 	 	 	 	 	 	 	 ddedz  dedz  dedz  d	edz  d
ed   dedz  dedz  f fdZ xZS )ModernVBertConfiga  
    This is the configuration class to store the configuration of a [`ModernVBert`] model. It is used to
    instantiate a ModernVBert model according to the specified arguments and defines the model architecture.
    e.g. [ModernVBERT/modernvbert](https://huggingface.co/ModernVBERT/modernvbert).

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
    See the documentation for [`PretrainedConfig`] for more details.

    Args:
        text_config (`AutoConfig`, *optional*): Configuration for the text encoder.
        vision_config (`ModernVBertVisionConfig`, *optional*): Configuration for the vision encoder.
        image_token_id (`int | None`, *optional*, defaults to 50407): The token id reserved for image tokens inserted into the text stream.
        pixel_shuffle_factor (`int | None`, *optional*, defaults to 4): Scale factor used by any pixel-shuffle / upsampling operations in the vision head.
        initializer_range (`float | None`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_cutoff_factor (`float | None`, *optional*, defaults to 2.0): The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
        classifier_pooling (`Literal["cls", "mean"]`, *optional*, defaults to `"cls"`): The pooling strategy to use for classification tasks.
        classifier_dropout (`float | None`, *optional*, defaults to 0.0): The dropout probability for the classification head.
        classifier_bias (`bool | None`, *optional*, defaults to `False`): Whether to add a bias term to the classification head.

    Example:
    ```python
    >>> from transformers import ModernVBertConfig

    >>> # Initializing configuration
    >>> configuration = ModernVBertConfig()

    >>> # Initializing a model from the configuration (model class is implemented in
    >>> # `modernvbert.modeling_modernvbert`)

    >>> from transformers import ModernVBertModel
    >>> model = ModernVBertModel(configuration)

    >>> # Accessing the model configuration
    >>> cfg = model.config
    ```modernvbert)text_configvision_configsub_configsNimage_token_idpixel_shuffle_factorinitializer_rangeinitializer_cutoff_factorclassifier_poolingclsmeanclassifier_dropoutclassifier_biasc
                 v   |dvrt        d| d      |t        d          }nt        |t              rt        d   di |}|| _        |t        d          }nt        |t              rt        d   di |}|| _        || _        || _        || _        || _	        || _
        |	| _        t        | 4  dd|i|
 y )Nr(   zQInvalid value for `classifier_pooling`, should be either "cls" or "mean", but is .
modernbertsiglip_vision_modelr#    )
ValueErrorr   
isinstancedictr    r!   r$   r%   r&   r'   r+   r,   super__init__)selfr    r!   r#   r$   r%   r&   r'   r+   r,   kwargs	__class__s              e/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/modernvbert/modular_modernvbert.pyr6   zModernVBertConfig.__init__S   s     _4cdvcwwxy  (68KT*(6EEK& *+@ACMt,*+@ARMRM*$8!!2)B&"4"4.AA&A    )	NNi     g{Gz?       @r)           F)__name__
__module____qualname____doc__
model_typer   r"   r4   strr   __annotations__intfloatr   boolr6   __classcell__r9   s   @r:   r   r   +   s    "H J2<z"ZKc3hZ %*+,*.255:+.',%B d
	%B
 "Dj%B !4<%B $)4<%B $M2%B "DL%B %B %Br;   r   c                       e Zd ZU dZdZej                  ed<   dZe	ej                     dz  ed<   dZ
e	ej                     dz  ed<   dZe	ej                     dz  ed<   y)ModernVBertBaseModelOutputaY  
    Base class for ModernVBERT model's outputs.
    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder
    Nlast_hidden_statehidden_states
attentionsimage_hidden_states)r?   r@   rA   rB   rM   torchFloatTensorrE   rN   tuplerO   rP   r1   r;   r:   rL   rL   {   si    , ,0u((/59M5**+d2926Je''(4/6;?u001D8?r;   rL   c                       e Zd ZU dZdZej                  dz  ed<   dZej                  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   dZej                  dz  ed<   y)	ModernVBertMaskedLMOutputaG  
    Base class for ModernVBERT model's outputs with masked language modeling loss.
    Args:
        loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        logits (`torch.FloatTensor`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder
    Nlosslogits.rN   rO   rP   )r?   r@   rA   rB   rV   rQ   rR   rE   rW   rN   rS   rO   rP   r1   r;   r:   rU   rU      s    , &*D%

d
") $FE$:>M5**C/047>7;Je'',-4;48**T18r;   rU   c                   .     e Zd ZdZ fdZd Zd Z xZS )ModernVBertConnectorz
    Connector module for ModernVBERT. It performs a pixel shuffle operation followed by a linear projection to match the text model's hidden size.
    Based on https://pytorch.org/docs/stable/generated/torch.nn.PixelShuffle.html
    c                     t         |           |j                  | _        t        j                  |j
                  j                  |j                  dz  z  |j                  j                  d      | _        y )Nr   Fbias)	r5   r6   r$   nnLinearr!   hidden_sizer    modality_projectionr7   configr9   s     r:   r6   zModernVBertConnector.__init__   s^    $*$?$?!#%99  ,,0K0KQ0NO**$
 r;   c                    |j                         \  }}}t        |dz        x}}|j                  ||||      }|j                  ||t        ||z        ||z        }|j                  dddd      }|j	                  |t        ||z        t        ||z        ||dz  z        }|j                  dddd      }|j	                  |t        ||dz  z        ||dz  z        S )Ng      ?r   r      r	   )sizerF   viewpermutereshape)r7   rP   r$   
batch_size
seq_length	embed_dimheightwidths           r:   pixel_shufflez"ModernVBertConnector.pixel_shuffle   s   ,?,D,D,F)
J	Z_--166z65R[\166E,@$@ A9OcCc
 299!Q1E199,,---.-q01	
 299!Q1E"**J*>*ABCYRfhiRiEj
 	
r;   c                 \    | j                  || j                        }| j                  |      S N)rn   r$   r`   )r7   rP   s     r:   forwardzModernVBertConnector.forward   s.    "001DdF_F_`''(;<<r;   )r?   r@   rA   rB   r6   rn   rq   rI   rJ   s   @r:   rY   rY      s    


&=r;   rY   c                   B    e Zd ZeZg Z ej                         d        Zy)ModernVBertPreTrainedModelc                     t        j                   |       dt        j                  dt        f fd}t        |t              ra j                  j                  t        j                  d j                  j                  j                  z        z  } ||j                  |       y t        |t              ra j                  j                  t        j                  d j                  j                  j                  z        z  } ||j                  |       y t        |t         t"        f      r^ j                  j                  t        j                   j                  j                  j$                        z  } ||j&                  |       y y )Nmodulestdc                 8   t        j                  dd      }t        j                  | j                  d|| |z  ||z         t        | t        j                  t        j                  f      r-| j                   t        j                  | j                         y y y )Nr&   r=   r>   )r*   rv   ab)getattrrb   inittrunc_normal_weightr3   r]   r^   Conv2dr\   zeros_)ru   rv   cutoff_factorr7   s      r:   init_weightz=ModernVBertPreTrainedModel._init_weights.<locals>.init_weight   s    #DKK1LcRM .3&#% &299bii"89;;*KK, + :r;   r=   )r   _init_weightsr]   ModulerG   r3   rY   rb   r%   mathsqrtr    num_hidden_layersr`   ModernVBertForMaskedLMlm_head$ModernVBertForSequenceClassification!ModernVBertForTokenClassificationr_   
classifier)r7   ru   r   out_stdfinal_out_stds   `    r:   r   z(ModernVBertPreTrainedModel._init_weights   s   %%dF3	-		 	- 	- f23kk33diidkkF]F]FoFo@o6ppG22G< 67kk33diidkkF]F]FoFo@o6ppG041
 !KK99DIIdkkF]F]FiFi<jjM))=9
r;   N)	r?   r@   rA   r   config_class_no_split_modulesrQ   no_gradr   r1   r;   r:   rs   rs      s'    $LU]]_: :r;   rs   aF  
    ModernVBertModel is a model that combines a vision encoder (SigLIP) and a text encoder (ModernBert).

    ModernVBert is the base model of the visual retriver ColModernVBert, and was introduced in the following paper:
    [*ModernVBERT: Towards Smaller Visual Document Retrievers*](https://arxiv.org/abs/2510.01149).
    )custom_introc                   @    e Zd Zdef fdZe edd      	 	 	 	 	 	 	 ddej                  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dee   deez  fd              Z xZS )ModernVBertModelrb   c                    t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j
                  |j                        | _	        t        |j                  j                  |j                  j                  z  dz  |j                  dz  z        | _        | j                          y )Nr   )r5   r6   rY   	connectorr   from_configr    
text_modelr!   vision_modelrF   
image_size
patch_sizer$   image_seq_len	post_initra   s     r:   r6   zModernVBertModel.__init__  s      .f5#//0B0BC%11&2F2FG ""--1E1E1P1PPUVV**A-/
 	r;     
        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
        max_num_images is the maximum number of images among the batch_size samples in the batch.
        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
        For efficiency, we only pass through the vision_model's forward the real images by
        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
        ModernVBERT/modernvbertr   
checkpointN	input_idsattention_maskposition_idsinputs_embedspixel_valuespixel_attention_maskrP   r8   returnc                    |9 | j                   j                         |      j                  |j                        }|| j	                  ||      j
                  }|;|j                  |j                  |j                        }| j                  |||      } | j                   d|||d|}	t        |	j                  |	j                  |	j                  |      S )a|  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The hidden states of the image encoder after modality projection.
        )r   r   )dtypedevice)r   r   rP   )r   r   r   )rM   rN   rO   rP   r1   )r   get_input_embeddingstor   get_image_featurespooler_outputr   inputs_mergerrL   rM   rN   rO   )
r7   r   r   r   r   r   r   rP   r8   outputss
             r:   rq   zModernVBertModel.forward   s    >  BDOO@@B9MPPQZQaQabM #"&"9"9)@T #: #m  
 *"5"8"8}?R?R[h[o[o"8"p ..#=Vi / M
 "$// 
')%
 	
 *%77!//)) 3	
 	
r;   )NNNNNNN)r?   r@   rA   r   r6   r   r   rQ   
LongTensorTensorrR   
BoolTensorr   r   rS   rL   rq   rI   rJ   s   @r:   r   r     s    0    - '+.20426158<8</
##/
 t+/
 &&-	/

 ((4//
 ''$./
 $..5/
 #..5/
 +,/
 
+	+/
 /
r;   r   c                       e Zd Zy)ModernVBertPredictionHeadN)r?   r@   rA   r1   r;   r:   r   r   _  s    r;   r   c                   n    e Zd ZddiZ fdZd Zd Ze edd      	 	 	 	 	 	 	 	 dd
e	j                  de	j                  d	z  de	j                  d	z  de	j                  d	z  de	j                  d	z  de	j                  d	z  de	j                  d	z  de	j                  d	z  dee   deez  fd              Z xZS )r   zlm_head.weightz1model.text_model.embeddings.tok_embeddings.weightc                 l   t         |   |       |j                  j                  | _        t	        |      | _        t        |j                        | _        t        j                  |j                  j                  | j                  |j                  j                        | _        | j                          y )Nr[   )r5   r6   r    
vocab_sizer   modelr   projection_headr]   r^   r_   decoder_biasr   r   ra   s     r:   r6   zModernVBertForMaskedLM.__init__g  s      ,,77%f-
89K9KLyy!3!3!?!?W]WiWiWvWvw 	r;   c                     | j                   S rp   r   )r7   s    r:   get_output_embeddingsz,ModernVBertForMaskedLM.get_output_embeddingss  s    ||r;   c                     || _         y rp   r   )r7   new_embeddingss     r:   set_output_embeddingsz,ModernVBertForMaskedLM.set_output_embeddingsv  s	    %r;   r   r   r   Nr   r   r   r   r   r   rP   labelsr8   r   c	                 \    | j                   d|||||||d|	}
|
d   }| j                  | j                  |            }d}|<t               } ||j	                  d| j
                        |j	                  d            }t        |||
j                  |
j                  |
j                        S )  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The hidden states of the image encoder after modality projection.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            text_config.]` or `model.image_token_id`. Tokens with indices set to `model.image_token_id` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., text_config.]`.
        r   r   r   r   r   r   rP   r   N)rV   rW   rN   rO   rP   r1   )
r   r   r   r   rf   r   rU   rN   rO   rP   )r7   r   r   r   r   r   r   rP   r   r8   r   rN   rW   rV   	criterions                  r:   rq   zModernVBertForMaskedLM.forwardy  s    H $** 	
)%'%!5 3	
 	
  
d22=AB(*IV[[T__=v{{2OD(!//)) ' ; ;
 	
r;   NNNNNNNN)r?   r@   rA   _tied_weights_keysr6   r   r   r   r   rQ   r   r   rR   r   r   r   rS   rU   rq   rI   rJ   s   @r:   r   r   c  s"   *,_`
&  - '+.20426158<8<*.0
##0
 t+0
 &&-	0

 ((4/0
 ''$.0
 $..50
 #..50
   4'0
 +,0
 
*	*0
 0
r;   r   za
    The ModernVBert Model with a sequence classification head on top that performs pooling.
    c                   `    e Zd Zdef fdZe edd      	 	 	 	 	 	 	 	 ddej                  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dee   deez  fd              Z xZS )r   rb   c                    t         |   |       |j                  | _        || _        t	        |      | _        t        |j                        | _        t        j                  |j                        | _        t        j                  |j                  j                  |j                        | _        | j!                          y rp   )r5   r6   
num_labelsrb   r   r   r   r    headr]   Dropoutr+   dropr^   r_   r   r   ra   s     r:   r6   z-ModernVBertForSequenceClassification.__init__  s      ++%f-
-f.@.@A	JJv889	))F$6$6$B$BFDUDUV 	r;   r   r   r   Nr   r   r   r   r   r   rP   r   r8   r   c	                     | j                   d|||||||d|	}
|
d   }| j                  j                  dk(  r
|dddf   }n| j                  j                  dk(  r||j                  dd \  }}n|j                  dd \  }}||j                  n|j                  }|(t        j                  ||f|t
        j                        }||j                  d      z  j                  d	
      |j                  d	d      z  }| j                  |      }| j                  |      }| j                  |      }d}|| j                  j                  | j                  d	k(  rd| j                  _        nl| j                  d	kD  rL|j                  t
        j                   k(  s|j                  t
        j"                  k(  rd| j                  _        nd| j                  _        | j                  j                  dk(  rIt%               }| j                  d	k(  r& ||j'                         |j'                               }n |||      }n| j                  j                  dk(  r=t)               } ||j+                  d| j                        |j+                  d            }n,| j                  j                  dk(  rt-               } |||      }t/        |||
j0                  |
j2                        S )r   r   r   r)   Nr*   r   )r   r   r   rd   )dimT)r   keepdim
regressionsingle_label_classificationmulti_label_classificationrV   rW   rN   rO   r1   )r   rb   r'   shaper   rQ   onesrH   	unsqueezesumr   r   r   problem_typer   r   longrF   r   squeezer   rf   r   r   rN   rO   )r7   r   r   r   r   r   r   rP   r   r8   r   rM   ri   seq_lenr   pooled_outputrW   rV   loss_fcts                      r:   rq   z,ModernVBertForSequenceClassification.forward  s   F $** 	
)%'%!5 3	
 	
 $AJ;;))U2 1!Q$ 7[[++v5(&3&9&9"1&=#
G&/oobq&9#
G)2)>Y%%MDXDXF%!&Z,A&X]XbXb!c!2^5M5Mb5Q!Q V V[\ V ]`n`r`rt as a ! 		"34		-0/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r;   r   )r?   r@   rA   r   r6   r   r   rQ   r   r   rR   r   r   r   rS   r   rq   rI   rJ   s   @r:   r   r     s    0   - '+.20426158<8<*.Q
##Q
 t+Q
 &&-	Q

 ((4/Q
 ''$.Q
 $..5Q
 #..5Q
   4'Q
 +,Q
 
)	)Q
 Q
r;   r   zw
    The ModernVBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c                   `    e Zd Zdef fdZe edd      	 	 	 	 	 	 	 	 ddej                  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dee   deez  fd              Z xZS )r   rb   c                 t   t         |   |       |j                  | _        t        |      | _        t        |j                        | _        t        j                  |j                        | _        t        j                  |j                  j                  |j                        | _        | j                          y rp   )r5   r6   r   r   r   r   r    r   r]   r   r+   r   r^   r_   r   r   ra   s     r:   r6   z*ModernVBertForTokenClassification.__init__3  s      ++%f-
-f.@.@A	JJv889	))F$6$6$B$BFDUDUV 	r;   r   r   r   Nr   r   r   r   r   r   rP   r   r8   r   c	                 l    | j                   d|||||||d|	}
|
d   }| j                  |      }| j                  |      }| j                  |      }d}|<t	               } ||j                  d| j                        |j                  d            }t        |||
j                  |
j                        S )r   r   r   Nr   r   r1   )
r   r   r   r   r   rf   r   r   rN   rO   )r7   r   r   r   r   r   r   rP   r   r8   r   rM   rW   rV   r   s                  r:   rq   z)ModernVBertForTokenClassification.forward?  s    H $** 	
)%'%!5 3	
 	
 $AJ II&78 II&78!23')HFKKDOO<fkk"oND$!//))	
 	
r;   r   )r?   r@   rA   r   r6   r   r   rQ   r   r   rR   r   r   r   rS   r   rq   rI   rJ   s   @r:   r   r   -  s   
0 
  - '+.20426158<8<*.1
##1
 t+1
 &&-	1

 ((4/1
 ''$.1
 $..51
 #..51
   4'1
 +,1
 
&	&1
 1
r;   r   )r   rs   r   r   r   r   )8r   dataclassesr   typingr   r   rQ   torch.nnr]   r   r   r    r
   r{   configuration_utilsr   modeling_outputsr   r   r   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   autor   r   r   modernbert.modeling_modernbertr   smolvlm.modeling_smolvlmr   r   
get_loggerr?   loggerr   rL   rU   r   rY   rs   r   r   r   r   r   __all__r1   r;   r:   <module>r      s    !    A A & 3  . & @ @ - 8 8 E K 
		H	%MB( MB` @ @ @: 9 9 9<$=299 $=N $:!7 $: $:N M
| M
M
`	 8 	 R
7 R
 R
j 
l
+E l

l
^ 
K
(B K

K
\r;   