
    qi                     *   d dl Z d dlmZ d dlZd dlmZ d dlmZmZmZ ddl	m
Z ddlmZ ddlmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddl m!Z! e G d de             Z"e G d de             Z# G d dejH                        Z%e G d de             Z& ed       G d de&             Z' G d dejH                        Z(e G d de&             Z) ed        G d! d"e&             Z* ed#       G d$ d%e&             Z+g d&Z,y)'    N)	dataclass)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)BaseModelOutputBaseModelOutputWithPoolingMaskedLMOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringtorch_compilable_check)can_return_tuple   )	AutoModel   )ModernVBertConfigc                       e Zd ZU dZdZej                  ed<   dZe	ej                     dz  ed<   dZ
e	ej                     dz  ed<   dZe	ej                     dz  ed<   y)ModernVBertBaseModelOutputaY  
    Base class for ModernVBERT model's outputs.
    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder
    Nlast_hidden_statehidden_states
attentionsimage_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   tupler   r        f/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/modernvbert/modeling_modernvbert.pyr   r   -   si    , ,0u((/59M5**+d2926Je''(4/6;?u001D8?r(   r   c                       e Zd ZU dZdZej                  dz  ed<   dZej                  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   dZej                  dz  ed<   y)	ModernVBertMaskedLMOutputaG  
    Base class for ModernVBERT model's outputs with masked language modeling loss.
    Args:
        loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        logits (`torch.FloatTensor`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder
    Nlosslogits.r   r   r   )r   r    r!   r"   r,   r#   r$   r%   r-   r   r&   r   r   r'   r(   r)   r+   r+   K   s    , &*D%

d
") $FE$:>M5**C/047>7;Je'',-4;48**T18r(   r+   c                   .     e Zd ZdZ fdZd Zd Z xZS )ModernVBertConnectorz
    Connector module for ModernVBERT. It performs a pixel shuffle operation followed by a linear projection to match the text model's hidden size.
    Based on https://pytorch.org/docs/stable/generated/torch.nn.PixelShuffle.html
    c                     t         |           |j                  | _        t        j                  |j
                  j                  |j                  dz  z  |j                  j                  d      | _        y )Nr   Fbias)	super__init__pixel_shuffle_factornnLinearvision_confighidden_sizetext_configmodality_projectionselfconfig	__class__s     r)   r4   zModernVBertConnector.__init__p   s^    $*$?$?!#%99  ,,0K0KQ0NO**$
 r(   c                    |j                         \  }}}t        |dz        x}}|j                  ||||      }|j                  ||t        ||z        ||z        }|j                  dddd      }|j	                  |t        ||z        t        ||z        ||dz  z        }|j                  dddd      }|j	                  |t        ||dz  z        ||dz  z        S )Ng      ?r   r   r   r   )sizeintviewpermutereshape)r=   r   r5   
batch_size
seq_length	embed_dimheightwidths           r)   pixel_shufflez"ModernVBertConnector.pixel_shuffley   s   ,?,D,D,F)
J	Z_--166z65R[\166E,@$@ A9OcCc
 299!Q1E199,,---.-q01	
 299!Q1E"**J*>*ABCYRfhiRiEj
 	
r(   c                 \    | j                  || j                        }| j                  |      S N)rK   r5   r;   )r=   r   s     r)   forwardzModernVBertConnector.forward   s.    "001DdF_F_`''(;<<r(   )r   r    r!   r"   r4   rK   rN   __classcell__r?   s   @r)   r/   r/   j   s    


&=r(   r/   c                   z     e Zd ZU eed<   dZdZdZg ZdZ	dZ
dZdZdZeZ ej                           fd       Z xZS )ModernVBertPreTrainedModelr>   model)imagetextTpast_key_valuesc                 
    t            |       dt        j                  dt        f fd}t        |t              ra j                  j                  t        j                  d j                  j                  j                  z        z  } ||j                  |       y t        |t              ra j                  j                  t        j                  d j                  j                  j                  z        z  } ||j                  |       y t        |t         t"        f      r^ j                  j                  t        j                   j                  j                  j$                        z  } ||j&                  |       y y )Nmodulestdc                 8   t        j                  dd      }t        j                  | j                  d|| |z  ||z         t        | t        j                  t        j                  f      r-| j                   t        j                  | j                         y y y )Ninitializer_cutoff_factor       @        )meanrY   ab)getattrr>   inittrunc_normal_weight
isinstancer6   r7   Conv2dr2   zeros_)rX   rY   cutoff_factorr=   s      r)   init_weightz=ModernVBertPreTrainedModel._init_weights.<locals>.init_weight   s    #DKK1LcRM .3&#% &299bii"89;;*KK, + :r(   r\   )r3   _init_weightsr6   Modulefloatre   r/   r>   initializer_rangemathsqrtr:   num_hidden_layersr;   ModernVBertForMaskedLMlm_head$ModernVBertForSequenceClassification!ModernVBertForTokenClassificationr9   
classifier)r=   rX   ri   out_stdfinal_out_stdr?   s   `    r)   rj   z(ModernVBertPreTrainedModel._init_weights   s   f%	-		 	- 	- f23kk33diidkkF]F]FoFo@o6ppG22G< 67kk33diidkkF]F]FoFo@o6ppG041
 !KK99DIIdkkF]F]FiFi<jjM))=9
r(   )r   r    r!   r   r%   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendconfig_classr#   no_gradrj   rO   rP   s   @r)   rR   rR      s]    (&*#"3N"&$LU]]_: :r(   rR   aF  
    ModernVBertModel is a model that combines a vision encoder (SigLIP) and a text encoder (ModernBert).

    ModernVBert is the base model of the visual retriver ColModernVBert, and was introduced in the following paper:
    [*ModernVBERT: Towards Smaller Visual Document Retrievers*](https://arxiv.org/abs/2510.01149).
    custom_introc                       e Zd ZdZdef fdZd Zd Zdej                  dej                  dej                  fd	Ze ed
      	 ddej                  dej                  dz  dee   deez  fd              Ze edd      	 	 	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej(                  dz  dej                  dz  dee   deez  fd              Z xZS )ModernVBertModelz
    A subclass of Idefics3Model. We do *not* remove or block the call to inputs_merger
    in forward. Instead, we override inputs_merger here with custom logic.
    r>   c                 Z   t         |   |       | j                  j                  j                  | _        | j                  j                  j                  | _        t        j                  |j                        | _
        t        |      | _        t        j                  |j                        | _        t        |j                  j                  |j                  j                   z  dz  |j"                  dz  z        | _        | j                  j&                  | _        | j)                          y )Nr   )r3   r4   r>   r:   pad_token_idpadding_idx
vocab_sizer   from_configr8   vision_modelr/   	connector
text_modelrB   
image_size
patch_sizer5   image_seq_lenimage_token_id	post_initr<   s     r)   r4   zModernVBertModel.__init__   s     ;;22??++11<<%11&2F2FG .f5#//0B0BC ""--1E1E1P1PPUVV**A-/
 #kk88r(   c                 6    | j                   j                         S rM   )r   get_input_embeddingsr=   s    r)   r   z%ModernVBertModel.get_input_embeddings   s    3355r(   c                 :    | j                   j                  |       y rM   )r   set_input_embeddings)r=   values     r)   r   z%ModernVBertModel.set_input_embeddings   s    ,,U3r(   	input_idsinputs_embedsr   c                     |j                   \  }}}|a| | j                         t        j                  | j                  j
                  t        j                  |j                              k(  }|d   }n|| j                  j
                  k(  }|j                  d      }t        t        j                  ||z  dk(        d       ||z  }t        j                  j                  j                  |j                  d      dd	      }	|	dd
 }
|j                  d
      }|dz
  |z  }|dz
  |z  }|
j                  d      |z   }t        j                   |      }|||   ||   ddf   ||<   t        j"                  |j                  d
      ||      }|S )as  
        This method aims at merging the token embeddings with the image hidden states into one single sequence of vectors that are fed to the transformer LM.
        The merging happens as follows:
        - The text token sequence is: `tok_1 tok_2 tok_3 <fake_token_around_image> <image> <image> ... <image> <fake_token_around_image> tok_4`.
        - We get the image hidden states for the image through the vision encoder and that hidden state, after a pixel shuffle operation, is then projected into the text embedding space.
        We thus have a sequence of image hidden states of size (1, image_seq_len, hidden_dim), where 1 is for batch_size of 1 image and hidden_dim is the hidden_dim of the LM transformer.
        - The merging happens so that we obtain the following sequence: `vector_tok_1 vector_tok_2 vector_tok_3 vector_fake_tok_around_image {sequence of image_seq_len image hidden states} vector_fake_toke_around_image vector_tok_4`. That sequence is fed to the LM.
        - To fit the format of that sequence, `input_ids`, `inputs_embeds`, `attention_mask` are all 3 adapted to insert the image hidden states.
        Ndtypedevice).r   r   dimr   zCAt least one sample has <image> tokens not divisible by patch_size.)r   r   )r   )shaper   r#   tensorr>   r   longr   sumr   allr6   
functionalpadcumsum	unsqueeze
zeros_likewhere)r=   r   r   r   _r   
image_masknum_image_tokensblocks_per_sampleoffsetsblock_offsetrow_cum	chunk_idx	local_idx	block_idximage_embedsmerged_embedss                    r)   inputs_mergerzModernVBertModel.inputs_merger   s    /44:q&*E$*C*C*ET[[77uzzR_RfRfg+ J $F+J"dkk&@&@@J%>>a>0II&3q89Q	
 -
:((%%))*;*B*Bq*B*I6YZ)[s|###+q[Z/	q[J.	 **1-	9	''6#6y7LiXbNcef7f#gZ J$8$8$<lMZr(   zVEncodes images into continuous embeddings that can be forwarded to the language model.r   Npixel_valuespixel_attention_maskkwargsreturnc                    |j                   \  }}}}}|j                  | j                        } |j                  ||z  g|j                   dd  }|j                   dd j	                         }	|dk(  j                  d      |	k7  }
|
dxx   t        j                  |
       z  cc<   ||
   j                         }|Lt        j                  d	D cg c]  }|j                   |    c}t        j                  |j                  
      }n6 |j                  ||z  g|j                   dd  }||
   j                         }| j                  j                  j                  }|j                  d||      }|j                  d||      }|j                  d      dkD  j                         } | j                   d||dd|}|j"                  }| j%                  |      }||_        |S c c}w )a4  
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The tensors corresponding to the input images.
        pixel_attention_mask (`torch.LongTensor`, *optional*):
            The attention mask indicating padded regions in the image.
        )r   r   Nr   r]   )r   r   r   )r   r   r   )rA   r   r   )	dimensionrA   step)r   r   T)r   patch_attention_maskreturn_dictr'   )r   tor   rC   numelr   r#   any
contiguousonesboolr   r>   r8   r   unfoldr   r   r   pooler_output)r=   r   r   r   rF   
num_imagesnum_channelsrI   rJ   nb_values_per_imagereal_images_indsir   patches_subgridr   image_outputsr   image_featuress                     r)   get_image_featuresz#ModernVBertModel.get_image_features  s     ?K>P>P;
Jfe#TZZ8(|((j)@Z<CUCUVWVXCYZ +004::<(C/444FJ]] 			*: ;;;#$45@@B'#(::5>?l((+?jj#**$  $=#7#<#<Z*=T#vWkWqWqrsrtWu#v #78H#I#T#T#V [[..99
.55
Yc5d)001:T^0_ / 3 3 3 AA EKKM *)) 
%<P^b
fl
 ,== (;<&4#/ @s   G&  
        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
        max_num_images is the maximum number of images among the batch_size samples in the batch.
        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
        For efficiency, we only pass through the vision_model's forward the real images by
        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
        ModernVBERT/modernvbertr   
checkpointattention_maskposition_idsc                    |9 | j                   j                         |      j                  |j                        }|| j	                  ||      j
                  }|;|j                  |j                  |j                        }| j                  |||      } | j                   d|||d|}	t        |	j                  |	j                  |	j                  |      S )a|  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The hidden states of the image encoder after modality projection.
        )r   r   r   )r   r   r   )r   r   r   )r   r   r   r   r'   )r   r   r   r   r   r   r   r   r   r   r   r   )
r=   r   r   r   r   r   r   r   r   outputss
             r)   rN   zModernVBertModel.forwardJ  s    >  BDOO@@B9MPPQZQaQabM #"&"9"9)@T #: #m  
 *"5"8"8}?R?R[h[o[o"8"p ..#=Vi / M
 "$// 
')%
 	
 *%77!//)) 3	
 	
r(   rM   )NNNNNNN)r   r    r!   r"   r   r4   r   r   r#   
LongTensorTensorr   r   r   r$   r   r   r&   r   r   
BoolTensorr   rN   rO   rP   s   @r)   r   r      s   
0 $64())(:?,,(]b]i]i(T m 9=2''2 $..52 +,	2
 
+	+2 2h  - '+.20426158<8</
##/
 t+/
 &&-	/

 ((4//
 ''$./
 $..5/
 #..5/
 +,/
 
+	+/
 /
r(   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )ModernVBertPredictionHeadr>   c                 J   t         |           || _        t        j                  |j
                  |j
                  |j                        | _        t        |j                     | _
        t        j                  |j
                  |j                  |j                        | _        y )N)epsr2   )r3   r4   r>   r6   r7   r9   classifier_biasdenser	   classifier_activationact	LayerNormnorm_eps	norm_biasnormr<   s     r)   r4   z"ModernVBertPredictionHead.__init__  sq    YYv1163E3EvG]G]^
&667LL!3!3vO_O_`	r(   r   r   c                 `    | j                  | j                  | j                  |                  S rM   )r   r   r   )r=   r   s     r)   rN   z!ModernVBertPredictionHead.forward  s#    yy$**]";<==r(   )	r   r    r!   r   r4   r#   r   rN   rO   rP   s   @r)   r   r     s-    a0 a>U\\ >ell >r(   r   c                   n    e Zd ZddiZ fdZd Zd Ze edd      	 	 	 	 	 	 	 	 dd
e	j                  de	j                  d	z  de	j                  d	z  de	j                  d	z  de	j                  d	z  de	j                  d	z  de	j                  d	z  de	j                  d	z  dee   deez  fd              Z xZS )rq   zlm_head.weightz1model.text_model.embeddings.tok_embeddings.weightc                 l   t         |   |       |j                  j                  | _        t	        |      | _        t        |j                        | _        t        j                  |j                  j                  | j                  |j                  j                        | _        | j                          y )Nr1   )r3   r4   r:   r   r   rS   r   projection_headr6   r7   r9   decoder_biasrr   r   r<   s     r)   r4   zModernVBertForMaskedLM.__init__  s      ,,77%f-
89K9KLyy!3!3!?!?W]WiWiWvWvw 	r(   c                     | j                   S rM   rr   r   s    r)   get_output_embeddingsz,ModernVBertForMaskedLM.get_output_embeddings  s    ||r(   c                     || _         y rM   r   )r=   new_embeddingss     r)   set_output_embeddingsz,ModernVBertForMaskedLM.set_output_embeddings  s	    %r(   r   r   r   Nr   r   r   r   r   r   r   labelsr   r   c	                 \    | j                   d|||||||d|	}
|
d   }| j                  | j                  |            }d}|<t               } ||j	                  d| j
                        |j	                  d            }t        |||
j                  |
j                  |
j                        S )  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The hidden states of the image encoder after modality projection.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            text_config.]` or `model.image_token_id`. Tokens with indices set to `model.image_token_id` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., text_config.]`.
        r   r   r   r   r   r   r   r   Nr   )r,   r-   r   r   r   r'   )
rS   rr   r   r   rC   r   r+   r   r   r   )r=   r   r   r   r   r   r   r   r   r   r   r   r-   r,   	criterions                  r)   rN   zModernVBertForMaskedLM.forward  s    H $** 	
)%'%!5 3	
 	
  
d22=AB(*IV[[T__=v{{2OD(!//)) ' ; ;
 	
r(   NNNNNNNN)r   r    r!   _tied_weights_keysr4   r   r   r   r   r#   r   r   r$   r   r   r   r&   r+   rN   rO   rP   s   @r)   rq   rq     s"   *,_`
&  - '+.20426158<8<*.0
##0
 t+0
 &&-	0

 ((4/0
 ''$.0
 $..50
 #..50
   4'0
 +,0
 
*	*0
 0
r(   rq   za
    The ModernVBert Model with a sequence classification head on top that performs pooling.
    c                   `    e Zd Zdef fdZe edd      	 	 	 	 	 	 	 	 ddej                  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dee   deez  fd              Z xZS )rs   r>   c                    t         |   |       |j                  | _        || _        t	        |      | _        t        |j                        | _        t        j                  |j                        | _        t        j                  |j                  j                  |j                        | _        | j!                          y rM   )r3   r4   
num_labelsr>   r   rS   r   r:   headr6   Dropoutclassifier_dropoutdropr7   r9   ru   r   r<   s     r)   r4   z-ModernVBertForSequenceClassification.__init__  s      ++%f-
-f.@.@A	JJv889	))F$6$6$B$BFDUDUV 	r(   r   r   r   Nr   r   r   r   r   r   r   r   r   r   c	                     | j                   d|||||||d|	}
|
d   }| j                  j                  dk(  r
|dddf   }n| j                  j                  dk(  r||j                  dd \  }}n|j                  dd \  }}||j                  n|j                  }|(t        j                  ||f|t
        j                        }||j                  d      z  j                  d	
      |j                  d	d      z  }| j                  |      }| j                  |      }| j                  |      }d}|| j                  j                  | j                  d	k(  rd| j                  _        nl| j                  d	kD  rL|j                  t
        j                   k(  s|j                  t
        j"                  k(  rd| j                  _        nd| j                  _        | j                  j                  dk(  rIt%               }| j                  d	k(  r& ||j'                         |j'                               }n |||      }n| j                  j                  dk(  r=t)               } ||j+                  d| j                        |j+                  d            }n,| j                  j                  dk(  rt-               } |||      }t/        |||
j0                  |
j2                        S )r   r   r   clsNr^   r   )r   r   r   r   r   T)r   keepdim
regressionsingle_label_classificationmulti_label_classificationr,   r-   r   r   r'   )rS   r>   classifier_poolingr   r   r#   r   r   r   r   r  r  ru   problem_typer  r   r   rB   r   squeezer   rC   r   r   r   r   )r=   r   r   r   r   r   r   r   r   r   r   r   rF   seq_lenr   pooled_outputr-   r,   loss_fcts                      r)   rN   z,ModernVBertForSequenceClassification.forward  s   F $** 	
)%'%!5 3	
 	
 $AJ;;))U2 1!Q$ 7[[++v5(&3&9&9"1&=#
G&/oobq&9#
G)2)>Y%%MDXDXF%!&Z,A&X]XbXb!c!2^5M5Mb5Q!Q V V[\ V ]`n`r`rt as a ! 		"34		-0/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r(   r   )r   r    r!   r   r4   r   r   r#   r   r   r$   r   r   r   r&   r   rN   rO   rP   s   @r)   rs   rs     s    0   - '+.20426158<8<*.Q
##Q
 t+Q
 &&-	Q

 ((4/Q
 ''$.Q
 $..5Q
 #..5Q
   4'Q
 +,Q
 
)	)Q
 Q
r(   rs   zw
    The ModernVBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c                   `    e Zd Zdef fdZe edd      	 	 	 	 	 	 	 	 ddej                  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dee   deez  fd              Z xZS )rt   r>   c                 t   t         |   |       |j                  | _        t        |      | _        t        |j                        | _        t        j                  |j                        | _        t        j                  |j                  j                  |j                        | _        | j                          y rM   )r3   r4   r  r   rS   r   r:   r  r6   r  r  r  r7   r9   ru   r   r<   s     r)   r4   z*ModernVBertForTokenClassification.__init__e  s      ++%f-
-f.@.@A	JJv889	))F$6$6$B$BFDUDUV 	r(   r   r   r   Nr   r   r   r   r   r   r   r   r   r   c	                 l    | j                   d|||||||d|	}
|
d   }| j                  |      }| j                  |      }| j                  |      }d}|<t	               } ||j                  d| j                        |j                  d            }t        |||
j                  |
j                        S )r   r   r   Nr   r  r'   )
rS   r  r  ru   r   rC   r  r   r   r   )r=   r   r   r   r   r   r   r   r   r   r   r   r-   r,   r  s                  r)   rN   z)ModernVBertForTokenClassification.forwardq  s    H $** 	
)%'%!5 3	
 	
 $AJ II&78 II&78!23')HFKKDOO<fkk"oND$!//))	
 	
r(   r   )r   r    r!   r   r4   r   r   r#   r   r   r$   r   r   r   r&   r   rN   rO   rP   s   @r)   rt   rt   _  s   
0 
  - '+.20426158<8<*.1
##1
 t+1
 &&-	1

 ((4/1
 ''$.1
 $..51
 #..51
   4'1
 +,1
 
&	&1
 1
r(   rt   )rR   r   rq   rs   rt   )-rn   dataclassesr   r#   torch.nnr6   r   r   r    r   rb   activationsr	   modeling_outputsr
   r   r   r   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   autor   configuration_modernvbertr   r   r+   rk   r/   rR   r   r   rq   rs   rt   __all__r'   r(   r)   <module>r#     s[  *  !   A A & !  . & O O -  8 @ @ @: 9 9 9<$=299 $=N -: -: -:` |
1 |
|
~	>		 	> R
7 R
 R
j 
l
+E l

l
^ 
K
(B K

K
\r(   