
    qi                        d Z ddlZddlZddlmZ ddlmZmZmZ ddlm	Z
 ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZmZmZmZmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z&  e$jN                  e(      Z) G d dejT                        Z+ G d dejT                        Z, G d dejT                        Z- G d dejT                        Z. G d dejT                        Z/ G d dejT                        Z0 G d dejT                        Z1 G d d e      Z2 G d! d"ejT                        Z3 G d# d$ejT                        Z4 G d% d&ejT                        Z5 G d' d(ejT                        Z6e# G d) d*e             Z7 e#d+,       G d- d.e7             Z8e# G d/ d0e7             Z9 e#d1,       G d2 d3e7e             Z: e#d4,       G d5 d6e7             Z;e# G d7 d8e7             Z<e# G d9 d:e7             Z=e# G d; d<e7             Z>g d=Z?y)>zPyTorch RemBERT model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions!CausalLMOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)auto_docstringlogging   )RemBertConfigc                        e Zd ZdZ fdZ	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  ded	ej                  fd
Z
 xZS )RemBertEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 |   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      d       y )N)padding_idxepsposition_idsr   F)
persistent)super__init__r   	Embedding
vocab_sizeinput_embedding_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_buffertorcharangeexpandselfconfig	__class__s     ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/rembert/modeling_rembert.pyr(   zRemBertEmbeddings.__init__1   s    !||v::H[H[ 
 $&<<0N0NPVPkPk#l %'\\&2H2H&JeJe%f"f&A&AvG\G\]zz&"<"<= 	ELL)G)GHOOPWXej 	 	
    N	input_idstoken_type_idsr#   inputs_embedspast_key_values_lengthreturnc                    ||j                         }n|j                         d d }|d   }|| j                  d d |||z   f   }|:t        j                  |t        j                  | j                  j
                        }|| j                  |      }| j                  |      }||z   }	| j                  |      }
|	|
z  }	| j                  |	      }	| j                  |	      }	|	S )Nr%   r   dtypedevice)sizer#   r8   zeroslongrI   r-   r1   r/   r2   r6   )r<   rA   rB   r#   rC   rD   input_shape
seq_lengthr1   
embeddingsr/   s              r?   forwardzRemBertEmbeddings.forwardA   s      #..*K',,.s3K ^
,,Q0FVlIl0l-lmL!"[[EJJtO`O`OgOghN  00;M $ : :> J"%::
"66|D))
^^J/
\\*-
r@   )NNNNr   )__name__
__module____qualname____doc__r(   r8   
LongTensorFloatTensorintTensorrP   __classcell__r>   s   @r?   r   r   .   s    Q
$ .2260426&'##d* ((4/ &&-	
 ((4/ !$ 
r@   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )RemBertPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y N)r'   r(   r   Linearhidden_sizedenseTanh
activationr;   s     r?   r(   zRemBertPooler.__init__d   s9    YYv1163E3EF
'')r@   hidden_statesrE   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )ra   rc   )r<   rd   first_token_tensorpooled_outputs       r?   rP   zRemBertPooler.forwardi   s6     +1a40

#566r@   rQ   rR   rS   r(   r8   rX   rP   rY   rZ   s   @r?   r\   r\   c   s#    $
U\\ ell r@   r\   c                        e Zd Zd fd	Z	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dedz  dedej
                  dz  d	e	fd
Z
 xZS )RemBertSelfAttentionNc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |j"                  | _        || _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ())r'   r(   r`   num_attention_headshasattr
ValueErrorrW   attention_head_sizeall_head_sizer   r_   querykeyvaluer4   attention_probs_dropout_probr6   
is_decoder	layer_idxr<   r=   rx   r>   s      r?   r(   zRemBertSelfAttention.__init__s   s0    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF ++"r@   rd   attention_maskencoder_hidden_statespast_key_valuesoutput_attentionscache_positionrE   c                    |j                   \  }}}	| j                  |      j                  |d| j                  | j                        j                  dd      }
d}|d u}|St        |t              rA|j                  j                  | j                        }|r|j                  }n|j                  }n|}|r|n|}|rK|I|rGj                  | j                     j                  }|j                  | j                     j                  }n| j!                  |      j                  |d| j                  | j                        j                  dd      }| j#                  |      j                  |d| j                  | j                        j                  dd      }|T|s|nd }j%                  ||| j                  d|i      \  }}|r)t        |t              rd|j                  | j                  <   t'        j(                  |
|j                  dd            }|t+        j,                  | j                        z  }|||z   }t.        j0                  j3                  |d      }| j5                  |      }t'        j(                  ||      }|j7                  d	ddd
      j9                         }|j;                         d d | j<                  fz   } |j                  | }||fS )Nr%   r      Fr~   Tdimr   r   )shapers   viewrn   rq   	transpose
isinstancer   
is_updatedgetrx   cross_attention_cacheself_attention_cachelayerskeysvaluesrt   ru   updater8   matmulmathsqrtr   
functionalsoftmaxr6   permute
contiguousrJ   rr   )r<   rd   rz   r{   r|   r}   r~   
batch_sizerN   _query_layerr   is_cross_attentioncurr_past_key_valuescurrent_states	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapes                        r?   rP   zRemBertSelfAttention.forward   s    %2$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 
2$>&/+>?,77;;DNNK
%+:+P+P(+:+O+O('6$2D.-/"=*,33DNNCHHI.55dnnELLK (j"d&>&>@X@XY1a  

>*j"d&>&>@X@XY1a  *7It)=)D)D{DNN=M~<^*&	; &*_FY*ZAEO..t~~> !<<Y5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CDo--r@   r^   NNNFNrQ   rR   rS   r(   r8   rX   rV   r
   booltuplerP   rY   rZ   s   @r?   rj   rj   r   s    #0 48:>(,"'.2M.||M. ))D0M.  %0047	M.
 M.  M. t+M. 
M.r@   rj   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )RemBertSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr!   )r'   r(   r   r_   r`   ra   r2   r3   r4   r5   r6   r;   s     r?   r(   zRemBertSelfOutput.__init__   s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r@   rd   input_tensorrE   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r^   ra   r6   r2   r<   rd   r   s      r?   rP   zRemBertSelfOutput.forward   7    

=1]3}|'CDr@   rh   rZ   s   @r?   r   r      1    >U\\  RWR^R^ r@   r   c                        e Zd Zd fd	Z	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dedz  dedz  dej
                  dz  d	e	ej
                     fd
Z
 xZS )RemBertAttentionNc                 f    t         |           t        ||      | _        t	        |      | _        y )Nrx   )r'   r(   rj   r<   r   outputry   s      r?   r(   zRemBertAttention.__init__   s(    (9E	'/r@   rd   rz   r{   r|   r}   r~   rE   c                 p    | j                  ||||||      }| j                  |d   |      }|f|dd  z   }	|	S )Nrz   r{   r|   r}   r~   r   r   )r<   r   )
r<   rd   rz   r{   r|   r}   r~   self_outputsattention_outputoutputss
             r?   rP   zRemBertAttention.forward   sY     yy)"7+/) ! 
  ;;|AF#%QR(88r@   r^   r   r   rZ   s   @r?   r   r      s    0 48:>(,)..2|| ))D0  %0047	
   $; t+ 
u||	r@   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )RemBertIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r^   )r'   r(   r   r_   r`   intermediate_sizera   r   
hidden_actstrr	   intermediate_act_fnr;   s     r?   r(   zRemBertIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r@   rd   rE   c                 J    | j                  |      }| j                  |      }|S r^   )ra   r   r<   rd   s     r?   rP   zRemBertIntermediate.forward  s&    

=100?r@   rh   rZ   s   @r?   r   r     s#    9U\\ ell r@   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )RemBertOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r'   r(   r   r_   r   r`   ra   r2   r3   r4   r5   r6   r;   s     r?   r(   zRemBertOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r@   rd   r   rE   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r^   r   r   s      r?   rP   zRemBertOutput.forward  r   r@   rh   rZ   s   @r?   r   r     r   r@   r   c                        e Zd Zd fd	Z	 	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  d	ej
                  dz  d
e	ej
                     fdZ
d Z xZS )RemBertLayerNc                 h   t         |           |j                  | _        d| _        t	        ||      | _        |j                  | _        |j                  | _        | j                  r,| j                  st        |  d      t	        ||      | _	        t        |      | _        t        |      | _        y )Nr   z> should be used as a decoder model if cross attention is addedr   )r'   r(   chunk_size_feed_forwardseq_len_dimr   	attentionrw   add_cross_attentionrp   crossattentionr   intermediater   r   ry   s      r?   r(   zRemBertLayer.__init__$  s    '-'E'E$)&)< ++#)#=#= ##?? D6)g!hii"26Y"OD/7#F+r@   rd   rz   r{   encoder_attention_maskr|   r}   r~   rE   c                 D   | j                  |||||      }|d   }	|dd  }
| j                  rA|?t        | d      st        d|  d      | j	                  |	|||||      }|d   }	|
|dd  z   }
t        | j                  | j                  | j                  |	      }|f|
z   }
|
S )N)rz   r}   r|   r~   r   r   r   z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r   rw   ro   rp   r   r   feed_forward_chunkr   r   )r<   rd   rz   r{   r   r|   r}   r~   self_attention_outputsr   r   cross_attention_outputslayer_outputs                r?   rP   zRemBertLayer.forward3  s     "&)/+) "0 "
 2!4(,??4@4!12 =dV DD D 
 '+&9&9 5&; /"3- ': '#  7q9 7 ;;G0##T%A%A4CSCSUe
  /G+r@   c                 L    | j                  |      }| j                  ||      }|S r^   )r   r   )r<   r   intermediate_outputr   s       r?   r   zRemBertLayer.feed_forward_chunka  s,    "//0@A{{#68HIr@   r^   )NNNNFN)rQ   rR   rS   r(   r8   rX   rV   r
   r   r   rP   r   rY   rZ   s   @r?   r   r   #  s    ,$ 48:>;?(,)..2+||+ ))D0+  %0047	+
 !& 1 1D 8+ +  $;+ t++ 
u||	+\r@   r   c                        e Zd Z fdZ	 	 	 	 	 	 	 	 	 ddej
                  dej                  dz  dej                  dz  dej                  dz  dedz  dedz  d	ed
ededej
                  dz  de	e
z  fdZ xZS )RemBertEncoderc           	      2   t         |           || _        t        j                  |j
                  |j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _        y c c}w )Nr   F)r'   r(   r=   r   r_   r+   r`   embedding_hidden_mapping_in
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r<   r=   ir>   s      r?   r(   zRemBertEncoder.__init__h  sq    +-99V5P5PRXRdRd+e(]]uU[UmUmOn#o!L1$E#op
&+# $ps   ,BNrd   rz   r{   r   r|   	use_cacher}   output_hidden_statesreturn_dictr~   rE   c           	      n   | j                   r%| j                  r|rt        j                  d       d}|r6|4t	        t        | j                        t        | j                              }| j                  |      }|rdnd }|rdnd }|r| j                  j                  rdnd }t        | j                        D ]K  \  }}|r||fz   } |||||||      }|d   }|s#||d   fz   }| j                  j                  sC||d   fz   }M |r||fz   }|	st        d |||||fD              S t        |||||	      S )
NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r=    r   r   r   c              3   $   K   | ]  }|| 
 y wr^   r   ).0vs     r?   	<genexpr>z)RemBertEncoder.forward.<locals>.<genexpr>  s      
 = 
s   )last_hidden_stater|   rd   
attentionscross_attentions)r   trainingloggerwarning_oncer   r   r=   r   r   	enumerater   r   r   )r<   rd   rz   r{   r   r|   r   r}   r   r   r~   all_hidden_statesall_self_attentionsall_cross_attentionsr   layer_modulelayer_outputss                    r?   rP   zRemBertEncoder.forwardp  s    &&4==##p "	01,dkk2RT`hlhshsTtuO88G"6BD$5b4%64;;;Z;Zr`d(4 	VOA|#$58H$H!(%&!M *!,M &9]1=M<O&O#;;22+?=QRCSBU+U(#	V&   1]4D D 
 "#%'(
 
 
 9+++*1
 	
r@   )	NNNNNFFTN)rQ   rR   rS   r(   r8   rX   rV   r
   r   r   r   rP   rY   rZ   s   @r?   r   r   g  s    , 48:>;?(,!%"'%* .2D
||D
 ))D0D
  %0047	D

 !& 1 1D 8D
 D
 $;D
  D
 #D
 D
 t+D
 
:	:D
r@   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )RemBertPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r   )r'   r(   r   r_   r`   ra   r   r   r   r	   transform_act_fnr2   r3   r;   s     r?   r(   z'RemBertPredictionHeadTransform.__init__  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr@   rd   rE   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r^   )ra   r   r2   r   s     r?   rP   z&RemBertPredictionHeadTransform.forward  s4    

=1--m<}5r@   rh   rZ   s   @r?   r   r     s$    UU\\ ell r@   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )RemBertLMPredictionHeadc                 n   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        |j                     | _        t        j                  |j
                  |j                        | _        y r   )r'   r(   r   r_   r`   output_embedding_sizera   r*   decoderr	   r   rc   r2   r3   r;   s     r?   r(   z RemBertLMPredictionHead.__init__  sz    YYv1163O3OP
yy!=!=v?P?PQ !2!23f&B&BH]H]^r@   rd   rE   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r^   )ra   rc   r2   r  r   s     r?   rP   zRemBertLMPredictionHead.forward  s@    

=16}5]3r@   rh   rZ   s   @r?   r   r     s$    _U\\ ell r@   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )RemBertOnlyMLMHeadc                 B    t         |           t        |      | _        y r^   )r'   r(   r   predictionsr;   s     r?   r(   zRemBertOnlyMLMHead.__init__  s    26:r@   sequence_outputrE   c                 (    | j                  |      }|S r^   )r  )r<   r  prediction_scoress      r?   rP   zRemBertOnlyMLMHead.forward  s     ,,_=  r@   rh   rZ   s   @r?   r  r    s#    ;!u|| ! !r@   r  c                   2     e Zd ZU eed<   dZdZ fdZ xZS )RemBertPreTrainedModelr=   rembertTc                     t         |   |       t        |t              rZt	        j
                  |j                  t        j                  |j                  j                  d         j                  d             y y )Nr%   r$   )r'   _init_weightsr   r   initcopy_r#   r8   r9   r   r:   )r<   moduler>   s     r?   r  z$RemBertPreTrainedModel._init_weights  s[    f%f/0JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 1r@   )	rQ   rR   rS   r   __annotations__base_model_prefixsupports_gradient_checkpointingr  rY   rZ   s   @r?   r  r    s!    !&*#i ir@   r  a
  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    )custom_introc                       e Zd Zd fd	Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  de
dz  dedz  dedz  dedz  dedz  dej                  dz  deez  fd       Z xZS )RemBertModelc                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
r'   r(   r=   r   rO   r   encoderr\   pooler	post_init)r<   r=   add_pooling_layerr>   s      r?   r(   zRemBertModel.__init__  sM    
 	 +F3%f-/@mF+d 	r@   c                 .    | j                   j                  S r^   rO   r-   r<   s    r?   get_input_embeddingsz!RemBertModel.get_input_embeddings  s    ...r@   c                 &    || j                   _        y r^   r  )r<   ru   s     r?   set_input_embeddingsz!RemBertModel.set_input_embeddings  s    */'r@   NrA   rz   rB   r#   rC   r{   r   r|   r   r}   r   r   r~   rE   c                    |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }| j                   j                  r|	|	n| j                   j
                  }	nd}	||t        d      |#| j                  ||       |j                         }n!||j                         d d }nt        d      |\  }}||j                  n|j                  }|dn|j                         }|t        j                  |||z   f|      }|&t        j                  |t        j                  |      }| j                  ||      }| j                   j                  rE|C|j                         \  }}}||f}|t        j                  ||      }| j!                  |      }nd }| j#                  |||||      }| j%                  ||||||	|
|||	
      }|d   }| j&                  | j'                  |      nd }|s
||f|d
d  z   S t)        |||j*                  |j,                  |j.                  |j0                        S )NFzDYou cannot specify both input_ids and inputs_embeds at the same timer%   z5You have to specify either input_ids or inputs_embedsr   )rI   rG   )rA   r#   rB   rC   rD   )	rz   r{   r   r|   r   r}   r   r   r~   r   )r   pooler_outputr|   rd   r   r   )r=   r}   r   use_return_dictrw   r   rp   %warn_if_padding_and_no_attention_maskrJ   rI   get_seq_lengthr8   onesrK   rL   get_extended_attention_maskinvert_attention_maskrO   r  r  r   r|   rd   r   r   )r<   rA   rz   rB   r#   rC   r{   r   r|   r   r}   r   r   r~   kwargsrM   r   rN   rI   rD   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputsr  rg   s                                 r?   rP   zRemBertModel.forward  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T&5&=?CaCaCc!"ZZ*jCY6Y)ZdjkN!"[[EJJvVN 150P0PQ_al0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+??%)'#9 + 
 ,,2"7#B+/!5#) ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
r@   )T)NNNNNNNNNNNNN)rQ   rR   rS   r(   r   r"  r   r8   rU   rV   r
   r   rX   r   r   rP   rY   rZ   s   @r?   r  r    sY    /0  .226260426:>;?(,!%)-,0#'.2_
##d*_
 ((4/_
 ((4/	_

 &&-_
 ((4/_
  %0047_
 !& 1 1D 8_
 _
 $;_
  $;_
 #Tk_
 D[_
 t+_
  
=	=!_
 _
r@   r  c                   l    e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  de
dz  de
dz  de
dz  deez  fd       Z xZS )RemBertForMaskedLMc                     t         |   |       |j                  rt        j	                  d       t        |d      | _        t        |      | _        | j                          y )NznIf you want to use `RemBertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention.Fr  
r'   r(   rw   r   warningr  r  r  clsr  r;   s     r?   r(   zRemBertForMaskedLM.__init__x  sR     NN1
 $FeD%f- 	r@   c                 B    | j                   j                  j                  S r^   r9  r  r  r  s    r?   get_output_embeddingsz(RemBertForMaskedLM.get_output_embeddings      xx##+++r@   c                 :    || j                   j                  _        y r^   r;  r<   new_embeddingss     r?   set_output_embeddingsz(RemBertForMaskedLM.set_output_embeddings      '5$r@   NrA   rz   rB   r#   rC   r{   r   labelsr}   r   r   rE   c                    ||n| j                   j                  }| j                  ||||||||	|
|
      }|d   }| j                  |      }d}|Ft	               } ||j                  d| j                   j                        |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)	rz   rB   r#   rC   r{   r   r}   r   r   r   r%   r   losslogitsrd   r   )
r=   r%  r  r9  r   r   r*   r   rd   r   )r<   rA   rz   rB   r#   rC   r{   r   rC  r}   r   r   r+  r   r  r	  masked_lm_lossloss_fctr   s                      r?   rP   zRemBertForMaskedLM.forward  s    , &1%<k$++B]B],,))%'"7#9/!5#  
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r@   )NNNNNNNNNNN)rQ   rR   rS   r(   r<  rA  r   r8   rU   rV   r   r   r   rP   rY   rZ   s   @r?   r4  r4  v  s(   ,6  .226260426:>;?*.)-,0#'5
##d*5
 ((4/5
 ((4/	5

 &&-5
 ((4/5
  %00475
 !& 1 1D 85
   4'5
  $;5
 #Tk5
 D[5
 
	5
 5
r@   r4  zS
    RemBERT Model with a `language modeling` head on top for CLM fine-tuning.
    c            !           e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  de
dz  dej                  dz  dedz  dedz  dedz  dedz  deej                  z  deez  fd       Z xZS )RemBertForCausalLMc                     t         |   |       |j                  st        j	                  d       t        |d      | _        t        |      | _        | j                          y )NzOIf you want to use `RemBertForCausalLM` as a standalone, add `is_decoder=True.`Fr6  r7  r;   s     r?   r(   zRemBertForCausalLM.__init__  sL       NNlm#FeD%f- 	r@   c                 B    | j                   j                  j                  S r^   r;  r  s    r?   r<  z(RemBertForCausalLM.get_output_embeddings  r=  r@   c                 :    || j                   j                  _        y r^   r;  r?  s     r?   rA  z(RemBertForCausalLM.set_output_embeddings  rB  r@   NrA   rz   rB   r#   rC   r{   r   r|   rC  r   r}   r   r   logits_to_keeprE   c                    ||n| j                   j                  }| j                  |||||||||
|||      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|	* | j                  d||	| j                   j                  d|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, RemBertForCausalLM, RemBertConfig
        >>> import torch

        >>> tokenizer = AutoTokenizer.from_pretrained("google/rembert")
        >>> config = RemBertConfig.from_pretrained("google/rembert")
        >>> config.is_decoder = True
        >>> model = RemBertForCausalLM.from_pretrained("google/rembert", config=config)

        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> prediction_logits = outputs.logits
        ```N)rz   rB   r#   rC   r{   r   r|   r   r}   r   r   r   )rG  rC  r*   r   )rF  rG  r|   rd   r   r   r   )r=   r%  r  r   rW   slicer9  loss_functionr*   r   r|   rd   r   r   )r<   rA   rz   rB   r#   rC   r{   r   r|   rC  r   r}   r   r   rO  r+  r   rd   slice_indicesrG  rF  r   s                         r?   rP   zRemBertForCausalLM.forward  s)   R &1%<k$++B]B],,))%'"7#9+/!5#  
  
8B>SV8W~ot4]k-=!(;<=%4%%pVFt{{OeOepiopDY,F)-)9TGf$EvE0#33!//))$55
 	
r@   )NNNNNNNNNNNNNr   )rQ   rR   rS   r(   r<  rA  r   r8   rU   rV   r
   r   rW   rX   r   r   rP   rY   rZ   s   @r?   rK  rK    sr   
,6  .226260426:>;?(,*.!%)-,0#'-.M
##d*M
 ((4/M
 ((4/	M

 &&-M
 ((4/M
  %0047M
 !& 1 1D 8M
 M
   4'M
 $;M
  $;M
 #TkM
 D[M
 ell*M
" 
2	2#M
 M
r@   rK  z
    RemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  de	e
z  fd       Z xZS ) RemBertForSequenceClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r^   r'   r(   
num_labelsr  r  r   r4   classifier_dropout_probr6   r_   r`   
classifierr  r;   s     r?   r(   z)RemBertForSequenceClassification.__init__6  si      ++#F+zz&"@"@A))F$6$68I8IJ 	r@   NrA   rz   rB   r#   rC   rC  r}   r   r   rE   c
           
      >   |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }| j	                  |      }d}|| j                   j
                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j
                  dk(  rIt               }| j                  dk(  r& ||j                         |j                               }n |||      }n| j                   j
                  dk(  r=t               } ||j                  d| j                        |j                  d            }n,| j                   j
                  dk(  rt               } |||      }|	s|f|dd z   }||f|z   S |S t!        |||j"                  |j$                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nrz   rB   r#   rC   r}   r   r   r   
regressionsingle_label_classificationmulti_label_classificationr%   r   rE  )r=   r%  r  r6   rZ  problem_typerX  rH   r8   rL   rW   r   squeezer   r   r   r   rd   r   )r<   rA   rz   rB   r#   rC   rC  r}   r   r   r+  r   rg   rG  rF  rI  r   s                    r?   rP   z(RemBertForSequenceClassification.forward@  s   ( &1%<k$++B]B],,))%'/!5#  	
  
]3/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r@   	NNNNNNNNN)rQ   rR   rS   r(   r   r8   rV   rU   r   r   r   rP   rY   rZ   s   @r?   rU  rU  /  s      /337261526*.)-,0#'D
$$t+D
 ))D0D
 ((4/	D

 ''$.D
 ((4/D
   4'D
  $;D
 #TkD
 D[D
 
)	)D
 D
r@   rU  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  de	e
z  fd       Z xZS )RemBertForMultipleChoicec                     t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  d      | _
        | j                          y )Nr   )r'   r(   r  r  r   r4   rY  r6   r_   r`   rZ  r  r;   s     r?   r(   z!RemBertForMultipleChoice.__init__  sV     #F+zz&"@"@A))F$6$6: 	r@   NrA   rz   rB   r#   rC   rC  r}   r   r   rE   c
           
      J   |	|	n| j                   j                  }	||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	      }|d   }| j                  |      }| j                  |      }|j                  d|      }d}|t               } |||      }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r%   r   r\  r   rE  )r=   r%  r   r   rJ   r  r6   rZ  r   r   rd   r   )r<   rA   rz   rB   r#   rC   rC  r}   r   r   r+  num_choicesr   rg   rG  reshaped_logitsrF  rI  r   s                      r?   rP   z RemBertForMultipleChoice.forward  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ,,))%'/!5#  	
  
]3/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r@   rb  )rQ   rR   rS   r(   r   r8   rV   rU   r   r   r   rP   rY   rZ   s   @r?   rd  rd    s      /337261526*.)-,0#'W
$$t+W
 ))D0W
 ((4/	W

 ''$.W
 ((4/W
   4'W
  $;W
 #TkW
 D[W
 
*	*W
 W
r@   rd  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  de	e
z  fd       Z xZS )RemBertForTokenClassificationc                 0   t         |   |       |j                  | _        t        |d      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y NFr6  rW  r;   s     r?   r(   z&RemBertForTokenClassification.__init__  sk      ++#FeDzz&"@"@A))F$6$68I8IJ 	r@   NrA   rz   rB   r#   rC   rC  r}   r   r   rE   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr\  r   r%   r   rE  )r=   r%  r  r6   rZ  r   r   rX  r   rd   r   )r<   rA   rz   rB   r#   rC   rC  r}   r   r   r+  r   r  rG  rF  rI  r   s                    r?   rP   z%RemBertForTokenClassification.forward  s    $ &1%<k$++B]B],,))%'/!5#  	
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$!//))	
 	
r@   rb  )rQ   rR   rS   r(   r   r8   rV   rU   r   r   r   rP   rY   rZ   s   @r?   rj  rj    s    	  /337261526*.)-,0#'1
$$t+1
 ))D01
 ((4/	1

 ''$.1
 ((4/1
   4'1
  $;1
 #Tk1
 D[1
 
&	&1
 1
r@   rj  c                   @    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  de	e
z  fd       Z xZS )RemBertForQuestionAnsweringc                     t         |   |       |j                  | _        t        |d      | _        t        j                  |j                  |j                        | _        | j                          y rl  )
r'   r(   rX  r  r  r   r_   r`   
qa_outputsr  r;   s     r?   r(   z$RemBertForQuestionAnswering.__init__3  sU      ++#FeD))F$6$68I8IJ 	r@   NrA   rz   rB   r#   rC   start_positionsend_positionsr}   r   r   rE   c           
         |
|
n| j                   j                  }
| j                  |||||||	|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      }|j                  d      }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|       |j                  d|       t        |      } |||      } |||      }||z   dz  }|
s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr\  r   r   r%   r   )ignore_indexr   )rF  start_logits
end_logitsrd   r   )r=   r%  r  rq  splitra  lenrJ   clamp_r   r   rd   r   )r<   rA   rz   rB   r#   rC   rr  rs  r}   r   r   r+  r   r  rG  rv  rw  
total_lossignored_indexrI  
start_lossend_lossr   s                          r?   rP   z#RemBertForQuestionAnswering.forward>  s    &1%<k$++B]B],,))%'/!5#  	
 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M""1m4  M2']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r@   )
NNNNNNNNNN)rQ   rR   rS   r(   r   r8   rV   rU   r   r   r   rP   rY   rZ   s   @r?   ro  ro  1  s   	  /3372615263715)-,0#'=
$$t+=
 ))D0=
 ((4/	=

 ''$.=
 ((4/=
 ))D0=
 ''$.=
  $;=
 #Tk=
 D[=
 
-	-=
 =
r@   ro  )	rK  r4  rd  ro  rU  rj  r   r  r  )@rT   r   r8   r   torch.nnr   r   r    r   r  activationsr	   cache_utilsr
   r   r   
generationr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   configuration_rembertr   
get_loggerrQ   r   Moduler   r\   rj   r   r   r   r   r   r   r   r   r  r  r  r4  rK  rU  rd  rj  ro  __all__r   r@   r?   <module>r     sE       A A & ! C C ) 9	 	 	 . 6 , 0 
		H	%1		 1jBII c.299 c.N		 ryy <"))  BII A- AHM
RYY M
bRYY "bii "! ! i_ i i 	w
) w
w
t L
/ L
 L
^ 
a
/ a

a
H P
'= P
P
f c
5 c
 c
L >
$: >
 >
B J
"8 J
 J
Z
r@   