
    qi\                     l   d Z ddlZddlmZ ddlmZmZmZ ddlmZ	 ddl
mZ ddlmZ dd	lmZmZmZmZmZ dd
lmZ ddlmZmZ ddlmZ  ej6                  e      Z G d dej<                        Z G d dej<                        Z ejB                  jD                  d        Z#ejB                  jD                  d        Z$ejB                  jD                  d        Z%ejB                  jD                  d        Z&ejB                  jD                  dejN                  de(fd       Z)ejB                  jD                  dejN                  dejN                  fd       Z*ejB                  jD                  dejN                  dejN                  de(fd       Z+ejB                  jD                  dejN                  dejN                  fd       Z, G d dej<                        Z- G d  d!ej<                        Z. G d" d#ej<                        Z/ G d$ d%ej<                        Z0 G d& d'ej<                        Z1 G d( d)e      Z2 G d* d+ej<                        Z3e G d, d-e             Z4e G d. d/e4             Z5 G d0 d1ej<                        Z6 G d2 d3ej<                        Z7 G d4 d5ej<                        Z8 G d6 d7ej<                        Z9 G d8 d9ej<                        Z:e G d: d;e4             Z; G d< d=ej<                        Z< ed>?       G d@ dAe4             Z=e G dB dCe4             Z>e G dD dEe4             Z?g dFZ@y)GzPyTorch DeBERTa model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)GradientCheckpointingLayer)BaseModelOutputMaskedLMOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)auto_docstringlogging   )DebertaConfigc                   *     e Zd ZdZd fd	Zd Z xZS )DebertaLayerNormz2LayerNorm module (epsilon inside the square root).c                     t         |           t        j                  t	        j
                  |            | _        t        j                  t	        j                  |            | _        || _	        y N)
super__init__r   	Parametertorchonesweightzerosbiasvariance_epsilon)selfsizeeps	__class__s      ^/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/deberta/modeling_deberta.pyr   zDebertaLayerNorm.__init__)   sH    ll5::d#34LLT!23	 #    c                 X   |j                   }|j                         }|j                  dd      }||z
  j                  d      j                  dd      }||z
  t	        j
                  || j                  z         z  }|j                  |      }| j                  |z  | j                  z   }|S )NT)keepdim   )
dtypefloatmeanpowr   sqrtr!   tor   r    )r"   hidden_states
input_typer.   varianceys         r&   forwardzDebertaLayerNorm.forward/   s    "((
%++-!!"d!3!D(--a055b$5G&-HtG\G\<\1]]%((4KK-'$))3r'   )g-q=__name__
__module____qualname____doc__r   r6   __classcell__r%   s   @r&   r   r   &   s    <$r'   r   c                   $     e Zd Z fdZd Z xZS )DebertaSelfOutputc                    t         |           t        j                  |j                  |j                        | _        t        |j                  |j                        | _        t        j                  |j                        | _        y r   )r   r   r   Linearhidden_sizedenser   layer_norm_eps	LayerNormDropouthidden_dropout_probdropoutr"   configr%   s     r&   r   zDebertaSelfOutput.__init__;   s\    YYv1163E3EF
)&*<*<f>S>STzz&"<"<=r'   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rC   rH   rE   r"   r2   input_tensors      r&   r6   zDebertaSelfOutput.forwardA   7    

=1]3}|'CDr'   r8   r9   r:   r   r6   r<   r=   s   @r&   r?   r?   :   s    >r'   r?   c                    | j                  d      }|j                  d      }t        j                  |t        j                  | j                        }t        j                  |t        j                  |j                        }|dddf   |j                  dd      j                  |d      z
  }|d|ddf   }|j                  d      }|S )a  
    Build relative position according to the query and key

    We assume the absolute position of query \(P_q\) is range from (0, query_size) and the absolute position of key
    \(P_k\) is range from (0, key_size), The relative positions from query to key is \(R_{q \rightarrow k} = P_q -
    P_k\)

    Args:
        query_size (int): the length of query
        key_size (int): the length of key

    Return:
        `torch.LongTensor`: A tensor with shape [1, query_size, key_size]

    r,   deviceNr   r)   r   )r#   r   arangelongrT   viewrepeat	unsqueeze)query_layer	key_layer
query_sizekey_sizeq_idsk_idsrel_pos_idss          r&   build_relative_positionra   H   s    $ !!"%J~~b!HLL5::k>P>PQELLI<L<LME4.5::a#4#;#;J#JJKkzk1n-K''*Kr'   c                     | j                  |j                  d      |j                  d      |j                  d      |j                  d      g      S )Nr   r   r+   r)   expandr#   )c2p_posrZ   relative_poss      r&   c2p_dynamic_expandrg   e   sI    >>;++A.0@0@0C[EUEUVWEXZfZkZklnZopqqr'   c                     | j                  |j                  d      |j                  d      |j                  d      |j                  d      g      S )Nr   r   rR   rc   )re   rZ   r[   s      r&   p2c_dynamic_expandri   j   sG    >>;++A.0@0@0CY^^TVEWYbYgYghjYklmmr'   c                     | j                  |j                         d d | j                  d      |j                  d      fz         S )Nr+   rR   rc   )	pos_indexp2c_attr[   s      r&   pos_dynamic_expandrm   o   s=    GLLN2A.)..2DinnUWFX1YYZZr'   rZ   scale_factorc                     t        j                  t        j                  | j                  d      t         j                        |z        S )Nr)   r,   )r   r0   tensorr#   r-   )rZ   rn   s     r&   scaled_size_sqrtrr   w   s0    ::ell;#3#3B#7u{{KlZ[[r'   r[   c                 d    | j                  d      |j                  d      k7  rt        | |      S |S NrR   )r#   ra   )rZ   r[   rf   s      r&   
build_rposru   |   s1    y~~b11&{I>>r'   max_relative_positionsc           
          t        j                  t        t        | j	                  d      |j	                  d            |            S rt   )r   rq   minmaxr#   )rZ   r[   rv   s      r&   compute_attention_spanrz      s4    <<C 0 0 4innR6HIKabccr'   c           	          |j                  d      |j                  d      k7  rA|d d d d d d df   j                  d      }t        j                  | dt	        || |            S | S )NrR   r   r)   r+   dimindex)r#   rY   r   gatherrm   )rl   rZ   r[   rf   rk   s        r&   uneven_size_correctedr      s_    y~~b11 Aq!,66r:	||G2DYPWYb2cddr'   c                   p    e Zd ZdZ fdZd Z	 	 	 	 ddej                  dej                  dedej                  dz  d	ej                  dz  d
ej                  dz  de	ej                  ej                  dz  f   fdZ
dej                  dej                  d	ej                  d
ej                  def
dZ xZS )DisentangledSelfAttentiona  
    Disentangled self-attention module

    Parameters:
        config (`str`):
            A model config class instance with the configuration to build a new model. The schema is similar to
            *BertConfig*, for more details, please refer [`DebertaConfig`]

    c                 Z   t         |           |j                  |j                  z  dk7  r&t	        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                  dz  d      | _
        t        j                  t        j                  | j                  t        j                              | _        t        j                  t        j                  | j                  t        j                              | _        |j"                  |j"                  ng | _        t%        |d	d      | _        t%        |d
d      | _        | j(                  rct        j                  |j                  |j                  d      | _        t        j                  |j                  |j                  d      | _        nd | _        d | _        | j&                  rt%        |dd      | _        | j.                  dk  r|j0                  | _        t        j2                  |j4                        | _        d| j"                  v r1t        j                  |j                  | j                  d      | _        d| j"                  v r/t        j                  |j                  | j                        | _        t        j2                  |j<                        | _        y )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads ()r   Fr    rp   relative_attentiontalking_headrv   r)   r   c2pp2c) r   r   rB   num_attention_heads
ValueErrorintattention_head_sizeall_head_sizer   rA   in_projr   r   r   r-   q_biasv_biaspos_att_typegetattrr   r   head_logits_projhead_weights_projrv   max_position_embeddingsrF   rG   pos_dropoutpos_proj
pos_q_projattention_probs_dropout_probrH   rI   s     r&   r   z"DisentangledSelfAttention.__init__   sm    : ::a?#F$6$6#7 8 445Q8  $*#=#= #&v'9'9F<V<V'V#W !558P8PPyy!3!3T5G5G!5KRWXll5;;0B0B5;;#WXll5;;0B0B5;;#WX393F3F3RF//XZ")&2F"N#FNEB$&IIf.H.H&JdJdkp$qD!%'YYv/I/I6KeKelq%rD"$(D!%)D"""*1&:RTV*WD'**Q..4.L.L+!zz&*D*DED))) "		&*<*<d>P>PW\ ])))"$))F,>,>@R@R"Szz&"E"EFr'   c                     |j                         d d | j                  dfz   }|j                  |      }|j                  dddd      S )Nr)   r   r+   r   r   )r#   r   rW   permute)r"   xnew_x_shapes      r&   transpose_for_scoresz.DisentangledSelfAttention.transpose_for_scores   sF    ffhsmt'?'?&DDFF;yyAq!$$r'   Nr2   attention_maskoutput_attentionsquery_statesrf   rel_embeddingsreturnc                 L   |9| j                  |      }| j                  |      j                  dd      \  }}	}
n| j                   j                  j                  | j                  dz  d      }t        d      D cg c]C  }t        j                  t        | j                        D cg c]  }||dz  |z       c}d      E }}}t        j                  |d   |j                         j                  |d   j                              }t        j                  |d   |j                         j                  |d   j                              }t        j                  |d   |j                         j                  |d   j                              }|||fD cg c]  }| j                  |       c}\  }}	}
|| j                  | j                  ddddf         z   }|
| j                  | j                  ddddf         z   }
d}dt        | j                        z   }t!        ||      }||j                  |j                        z  }t        j                  ||	j#                  dd	            }| j$                  r*|(|&| j'                  |      }| j)                  ||	|||      }|||z   }| j*                  5| j+                  |j-                  dddd            j-                  dddd      }|j/                         }|j1                  | t        j2                  |j                        j4                        }t6        j8                  j;                  |d      }| j=                  |      }| j>                  5| j?                  |j-                  dddd            j-                  dddd      }t        j                  ||
      }|j-                  dddd      jA                         }|jC                         dd	 d
z   }|jE                  |      }|s|dfS ||fS c c}w c c}}w c c}w )a  
        Call the module

        Args:
            hidden_states (`torch.FloatTensor`):
                Input states to the module usually the output from previous layer, it will be the Q,K and V in
                *Attention(Q,K,V)*

            attention_mask (`torch.BoolTensor`):
                An attention mask matrix of shape [*B*, *N*, *N*] where *B* is the batch size, *N* is the maximum
                sequence length in which element [i,j] = *1* means the *i* th token in the input can attend to the *j*
                th token.

            output_attentions (`bool`, *optional*):
                Whether return the attention matrix.

            query_states (`torch.FloatTensor`, *optional*):
                The *Q* state in *Attention(Q,K,V)*.

            relative_pos (`torch.LongTensor`):
                The relative position encoding between the tokens in the sequence. It's of shape [*B*, *N*, *N*] with
                values ranging in [*-max_relative_positions*, *max_relative_positions*].

            rel_embeddings (`torch.FloatTensor`):
                The embedding of relative distances. It's a tensor of shape [\(2 \times
                \text{max_relative_positions}\), *hidden_size*].


        Nr   r)   r}   r   rp   r   r+   rR   )r)   )#r   r   chunkr   r   ranger   catmatmultr1   r,   r   r   lenr   rr   	transposer   r   disentangled_att_biasr   r   boolmasked_fillfinforx   r   
functionalsoftmaxrH   r   
contiguousr#   rW   )r"   r2   r   r   r   rf   r   qprZ   r[   value_layerwskiqkvwqvr   rel_attrn   scaleattention_scoresattention_probscontext_layernew_context_layer_shapes                            r&   r6   z!DisentangledSelfAttention.forward   s   L m,B262K2KB2O2U2UVW]_2U2`/KK$$**4+C+Ca+GQ*OBhmnohpqcdEIIeD<T<T6UVr!a%!)}V\]^qDqT!Wlnn&6&9&9Q&9&NOAT!Wmoo&7&:&:a&:&OPAT!Wmoo&7&:&:a&:&OPAZ[]^`aYb2cTU43L3LQ3O2c/KK!D$=$=dkk$PTVW->X$YY!D$=$=dkk$PTVW->X$YY3t0011 l;!EHH;3D3DH$EE <<Y5H5HR5PQ""~'AlF^!--n=N00iWegstG/'9   ,#445E5M5MaQRTUWX5YZbbcdfgijlmn',,.+77.8I5;;WbWhWhKiKmKmn--//0@b/I,,7!!-"44_5L5LQPQSTVW5XYaabcefhiklmO_kB%--aAq9DDF"/"4"4"6s";e"C%**+BC !4((//U Wq 3ds   >+P)P;PP!PrZ   r[   rn   c           	      $   |t        |||j                        }|j                         dk(  r!|j                  d      j                  d      }nT|j                         dk(  r|j                  d      }n/|j                         dk7  rt	        d|j                                t        ||| j                        }|j                         }|| j                  |z
  | j                  |z   d d f   j                  d      }d}d| j                  v r| j                  |      }| j                  |      }t        j                  ||j                  dd	            }	t        j                  ||z   d|dz  dz
        }
t        j                  |	dt!        |
||      
      }	||	z  }d| j                  v r| j#                  |      }| j                  |      }|t%        ||      z  }t'        |||      }t        j                  | |z   d|dz  dz
        }t        j                  ||j                  dd	      j)                  |j*                              }t        j                  |dt-        |||      
      j                  dd	      }t/        ||||      }||z  }|S )Nr+   r   r   r      z2Relative position ids must be of dim 2 or 3 or 4. r   r)   rR   r|   r   rp   )ra   rT   r}   rY   r   rz   rv   rV   r   r   r   r   r   r   clampr   rg   r   rr   ru   r1   r,   ri   r   )r"   rZ   r[   rf   r   rn   att_spanscorepos_key_layerc2p_attre   pos_query_layerr_posp2c_posrl   s                  r&   r   z/DisentangledSelfAttention.disentangled_att_bias"  s    2;	;K]K]^L"'11!4>>qAL1$'11!4L1$QR^RbRbRdQefgg)+y$B]B]^#((*'''(2T5P5PS[5[[]^^

)A, 	  D%%% MM.9M 55mDMll;0G0GB0OPGkk,"91hlQ>NOGll7:LWVaco:pqGWE D%%%"oon=O"77HO/NNOE
 kk5&8"3Q1q8HIGll9o.G.GB.O.R.RYbYhYh.R.ijGllR'9';PY'ZiB  ,G[)\ZGWEr'   FNNN)r8   r9   r:   r;   r   r   r   Tensorr   tupler6   r   r   r<   r=   s   @r&   r   r      s    $GL% #(,0,0.2U0||U0 U0  	U0
 llT)U0 llT)U0 t+U0 
u||U\\D00	1U0n6\\6 <<6 ll	6
 6 6r'   r   c                   *     e Zd ZdZ fdZddZ xZS )DebertaEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        |dd      }t        |d|j                        | _        t        j                  |j                  | j                  |      | _        t        |dd      | _	        | j                  sd | _
        n/t        j                  |j                  | j                        | _
        |j                  dkD  r0t        j                  |j                  | j                        | _        nd | _        | j                  |j                  k7  r2t        j                  | j                  |j                  d      | _        nd | _        t!        |j                  |j"                        | _        t        j&                  |j(                        | _        || _        | j/                  d	t1        j2                  |j                        j5                  d
      d       y )Npad_token_idr   embedding_size)padding_idxposition_biased_inputTFr   position_idsr   r)   )
persistent)r   r   r   rB   r   r   	Embedding
vocab_sizeword_embeddingsr   position_embeddingsr   type_vocab_sizetoken_type_embeddingsrA   
embed_projr   rD   rE   rF   rG   rH   rJ   register_bufferr   rU   rd   )r"   rJ   r   r%   s      r&   r   zDebertaEmbeddings.__init__^  sy   v~q9%f.>@R@RS!||F,=,=t?R?R`lm%,V5Ld%S"))'+D$')||F4R4RTXTgTg'hD$!!A%)+f6L6LdNaNa)bD&)-D&&"4"44 ii(;(;V=O=OV[\DO"DO)&*<*<f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
r'   c                    ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|:t        j                  |t        j                  | j                  j
                        }|| j                  |      }| j                   | j                  |j	                               }nt        j                  |      }|}	| j                  r|	|z   }	| j                  | j                  |      }
|	|
z   }	| j                  | j                  |	      }	| j                  |	      }	||j                         |	j                         k7  rD|j                         dk(  r |j                  d      j                  d      }|j                  d      }|j!                  |	j"                        }|	|z  }	| j%                  |	      }	|	S )Nr)   r   rS   r   r+   )r#   r   r   r   rV   rT   r   r   
zeros_liker   r   r   rE   r}   squeezerY   r1   r,   rH   )r"   	input_idstoken_type_idsr   maskinputs_embedsinput_shape
seq_lengthr   
embeddingsr   s              r&   r6   zDebertaEmbeddings.forward}  s    #..*K',,.s3K ^
,,Q^<L!"[[EJJtO`O`OgOghN  00;M##/"&":":<;L;L;N"O"'"2"2="A"
%%#&99J%%1$($>$>~$N!#&;;J??&4J^^J/
xxzZ^^--88:?<<?2215D~~a(77:++,D#d*J\\*-
r'   )NNNNNr7   r=   s   @r&   r   r   [  s    Q
>,r'   r   c                   p     e Zd Z fdZ	 	 	 	 ddedeej                  ej                  dz  f   fdZ xZ	S )DebertaAttentionc                 p    t         |           t        |      | _        t	        |      | _        || _        y r   )r   r   r   r"   r?   outputrJ   rI   s     r&   r   zDebertaAttention.__init__  s-    -f5	'/r'   Nr   r   c                 v    | j                  ||||||      \  }}||}| j                  ||      }	|r|	|fS |	d fS )N)r   rf   r   )r"   r   )
r"   r2   r   r   r   rf   r   self_output
att_matrixattention_outputs
             r&   r6   zDebertaAttention.forward  se     #'))%%) #, #
Z (L;;{LA$j11$d++r'   r   
r8   r9   r:   r   r   r   r   r   r6   r<   r=   s   @r&   r   r     sF     #(,  	, 
u||U\\D00	1,r'   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )DebertaIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r   r   r   rA   rB   intermediate_sizerC   
isinstance
hidden_actstrr	   intermediate_act_fnrI   s     r&   r   zDebertaIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r'   r2   r   c                 J    | j                  |      }| j                  |      }|S r   )rC   r   r"   r2   s     r&   r6   zDebertaIntermediate.forward  s&    

=100?r'   r8   r9   r:   r   r   r   r6   r<   r=   s   @r&   r   r     s#    9U\\ ell r'   r   c                   $     e Zd Z fdZd Z xZS )DebertaOutputc                     t         |           t        j                  |j                  |j
                        | _        t        |j
                  |j                        | _	        t        j                  |j                        | _        || _        y r   )r   r   r   rA   r   rB   rC   r   rD   rE   rF   rG   rH   rJ   rI   s     r&   r   zDebertaOutput.__init__  sc    YYv779K9KL
)&*<*<f>S>STzz&"<"<=r'   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rL   rM   s      r&   r6   zDebertaOutput.forward  rO   r'   rP   r=   s   @r&   r  r    s    r'   r  c                   p     e Zd Z fdZ	 	 	 	 ddedeej                  ej                  dz  f   fdZ xZ	S )DebertaLayerc                     t         |           t        |      | _        t	        |      | _        t        |      | _        y r   )r   r   r   	attentionr   intermediater  r   rI   s     r&   r   zDebertaLayer.__init__  s3    )&1/7#F+r'   Nr   r   c                     | j                  ||||||      \  }}| j                  |      }	| j                  |	|      }
|r|
|fS |
d fS )Nr   r   rf   r   )r  r  r   )r"   r2   r   r   rf   r   r   r   r   intermediate_outputlayer_outputs              r&   r6   zDebertaLayer.forward  sn     (,~~/%%) (6 (
$* #//0@A{{#68HI *-- $''r'   )NNNFr   r=   s   @r&   r  r    sF    , "'(  ( 
u||U\\D00	1(r'   r  c                        e Zd ZdZ fdZd Zd ZddZ	 	 	 	 	 ddej                  dej                  de
d	e
d
e
f
dZ xZS )DebertaEncoderz8Modified BertEncoder with relative position bias supportc                    t         |           t        j                  t	        |j
                        D cg c]  }t        |       c}      | _        t        |dd      | _	        | j                  rdt        |dd      | _
        | j                  dk  r|j                  | _
        t        j                  | j                  dz  |j                        | _        d| _        y c c}w )Nr   Frv   r)   r   r+   )r   r   r   
ModuleListr   num_hidden_layersr  layerr   r   rv   r   r   rB   r   gradient_checkpointing)r"   rJ   _r%   s      r&   r   zDebertaEncoder.__init__  s    ]]%H`H`Ba#bQL$8#bc
")&2F"N""*1&:RTV*WD'**Q..4.L.L+"$,,t/J/JQ/NPVPbPb"cD&+# $cs   Cc                 R    | j                   r| j                  j                  }|S d }|S r   )r   r   r   )r"   r   s     r&   get_rel_embeddingz DebertaEncoder.get_rel_embedding  s0    7;7N7N,,33 UYr'   c                     |j                         dk  rE|j                  d      j                  d      }||j                  d      j                  d      z  }|S |j                         dk(  r|j                  d      }|S )Nr+   r   rR   r)   r   )r}   rY   r   )r"   r   extended_attention_masks      r&   get_attention_maskz!DebertaEncoder.get_attention_mask   s    1$&4&>&>q&A&K&KA&N#47N7V7VWY7Z7d7deg7hhN  !Q&+55a8Nr'   c                 Z    | j                   r||t        ||      }|S t        ||      }|S r   )r   ra   )r"   r2   r   rf   s       r&   get_rel_poszDebertaEncoder.get_rel_pos)  s>    ""|';'6|]S   7}mTr'   r2   r   output_hidden_statesr   return_dictc           	      ^   | j                  |      }| j                  |||      }|r|fnd }|rdnd }	|}
| j                         }t        | j                        D ].  \  }} ||
|||||      \  }}|r||fz   }||}n|}
|s)|	|fz   }	0 |st        d |||	fD              S t        |||	      S )N )r   rf   r   r   c              3   &   K   | ]	  }||  y wr   r  ).0r   s     r&   	<genexpr>z)DebertaEncoder.forward.<locals>.<genexpr>Z  s     hqZ[Zghs   last_hidden_stater2   
attentions)r  r  r  	enumerater  r   r   )r"   r2   r   r  r   r   rf   r  all_hidden_statesall_attentionsnext_kvr   r   layer_moduleatt_ms                  r&   r6   zDebertaEncoder.forward1  s     00@''|\RL`8Hfj0d//1(4 	;OA|#/))-"3$ M5 $$58H$H!',' !/5(!:'	;* h]4E~$Vhhh+;LYg
 	
r'   )NN)TFNNT)r8   r9   r:   r;   r   r  r  r  r   r   r   r6   r<   r=   s   @r&   r  r    sh    B	, &*"' ,
||,
 ,
 #	,

  ,
 ,
r'   r  c                   `     e Zd ZU eed<   dZdgZdZ ej                          fd       Z
 xZS )DebertaPreTrainedModelrJ   debertar   Tc                    t         |   |       t        |t              r?t	        j
                  |j                         t	        j
                  |j                         yt        |t        t        f      r t	        j
                  |j                         yt        |t              rZt	        j                  |j                  t        j                  |j                  j                   d         j#                  d             yy)zInitialize the weights.r)   r   N)r   _init_weightsr   r   initzeros_r   r   LegacyDebertaLMPredictionHeadDebertaLMPredictionHeadr    r   copy_r   r   rU   shaperd   )r"   moduler%   s     r&   r0  z$DebertaPreTrainedModel._init_weightsg  s     	f%f78KK&KK&!>@W XYKK$ 12JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 3r'   )r8   r9   r:   r   __annotations__base_model_prefix"_keys_to_ignore_on_load_unexpectedsupports_gradient_checkpointingr   no_gradr0  r<   r=   s   @r&   r-  r-  `  s:    !*?)@&&*#U]]_	i 	ir'   r-  c                       e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	dz  de	dz  de	dz  de
ez  fd       Z xZS )DebertaModelc                     t         |   |       t        |      | _        t	        |      | _        d| _        || _        | j                          y Nr   )	r   r   r   r   r  encoderz_stepsrJ   	post_initrI   s     r&   r   zDebertaModel.__init__v  s@     +F3%f-r'   c                 .    | j                   j                  S r   r   r   r"   s    r&   get_input_embeddingsz!DebertaModel.get_input_embeddings  s    ...r'   c                 &    || j                   _        y r   rE  r"   new_embeddingss     r&   set_input_embeddingsz!DebertaModel.set_input_embeddings  s    *8'r'   Nr   r   r   r   r   r   r  r  r   c	           	      |   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }
n!||j                         d d }
nt	        d      ||j                  n|j                  }|t        j                  |
|      }|&t        j                  |
t        j                  |      }| j                  |||||      }| j                  ||d||      }|d	   }| j                  d	kD  r|d
   }t        | j                        D cg c]  }| j                  j                   d    }}|d   }| j                  j#                         }| j                  j%                  |      }| j                  j'                  |      }|d	d  D ]!  } |||d|||      }|j)                  |       # |d   }|s|f||rd	d  z   S dd  z   S t+        ||r|j,                  nd |j.                        S c c}w )NzDYou cannot specify both input_ids and inputs_embeds at the same timer)   z5You have to specify either input_ids or inputs_embeds)rT   rS   )r   r   r   r   r   T)r  r   r  r   rR   Fr
  r+   r#  )rJ   r   r  use_return_dictr   %warn_if_padding_and_no_attention_maskr#   rT   r   r   r   rV   r   rA  rB  r   r  r  r  r  appendr   r2   r%  )r"   r   r   r   r   r   r   r  r  kwargsr   rT   embedding_outputencoder_outputsencoded_layersr2   r  layersr   r   rel_posr  sequence_outputs                          r&   r6   zDebertaModel.forward  sz    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU%.%:!!@T@T!"ZZFCN!"[[EJJvVN??)%' + 
 ,,!%/# ' 
 )+<<!*2.M6;DLL6IJdll((,JFJ)"-L!\\;;=N!\\<<^LNll../?@G 	4$!"&+!-!(#1  %%l3	4 ),#%>R8\(]]]XY8\(]]]-;O/77UY&11
 	
+ Ks    H9)NNNNNNNN)r8   r9   r:   r   rG  rK  r   r   r   r   r   r   r6   r<   r=   s   @r&   r>  r>  t  s    /9  *..2.2,0-1)-,0#'O
<<$&O
 t+O
 t+	O

 llT)O
 ||d*O
  $;O
 #TkO
 D[O
 
	 O
 O
r'   r>  c                   $     e Zd Z fdZd Z xZS )$LegacyDebertaPredictionHeadTransformc                    t         |           t        |d|j                        | _        t        j                  |j                  | j                        | _        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  | j                  |j                        | _        y )Nr   )r$   )r   r   r   rB   r   r   rA   rC   r   r   r   r	   transform_act_fnrE   rD   rI   s     r&   r   z-LegacyDebertaPredictionHeadTransform.__init__  s    %f.>@R@RSYYv1143F3FG
f''-$*6+<+<$=D!$*$5$5D!d&9&9v?T?TUr'   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )rC   rZ  rE   r   s     r&   r6   z,LegacyDebertaPredictionHeadTransform.forward  s4    

=1--m<}5r'   rP   r=   s   @r&   rX  rX    s    	Vr'   rX  c                   $     e Zd Z fdZd Z xZS )r3  c                 J   t         |           t        |      | _        t	        |d|j
                        | _        t        j                  | j                  |j                  d      | _
        t        j                  t        j                  |j                              | _        y )Nr   Tr   )r   r   rX  	transformr   rB   r   r   rA   r   decoderr   r   r   r    rI   s     r&   r   z&LegacyDebertaLMPredictionHead.__init__  ss    =fE%f.>@R@RS yy!4!4f6G6GdSLLV->->!?@	r'   c                 J    | j                  |      }| j                  |      }|S r   )r^  r_  r   s     r&   r6   z%LegacyDebertaLMPredictionHead.forward  s$    }5]3r'   rP   r=   s   @r&   r3  r3    s    	Ar'   r3  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )LegacyDebertaOnlyMLMHeadc                 B    t         |           t        |      | _        y r   )r   r   r3  predictionsrI   s     r&   r   z!LegacyDebertaOnlyMLMHead.__init__   s    8@r'   rV  r   c                 (    | j                  |      }|S r   )rd  )r"   rV  prediction_scoress      r&   r6   z LegacyDebertaOnlyMLMHead.forward  s     ,,_=  r'   r   r=   s   @r&   rb  rb    s$    A!u|| ! !r'   rb  c                   (     e Zd ZdZ fdZd Z xZS )r4  zMhttps://github.com/microsoft/DeBERTa/blob/master/DeBERTa/deberta/bert.py#L270c                    t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                  d      | _        t        j                  t        j                  |j                               | _        y )NT)r$   elementwise_affine)r   r   r   rA   rB   rC   r   r   r   r	   rZ  rE   rD   r   r   r   r   r    rI   s     r&   r   z DebertaLMPredictionHead.__init__  s    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>ShlmLLV->->!?@	r'   c                     | j                  |      }| j                  |      }| j                  |      }t        j                  ||j
                  j                               | j                  z   }|S r   )rC   rZ  rE   r   r   r   r   r    )r"   r2   r   s      r&   r6   zDebertaLMPredictionHead.forward  sd    

=1--m<
 ]O4J4J4L4L4NORVR[R[[r'   r7   r=   s   @r&   r4  r4  	  s    WAr'   r4  c                   $     e Zd Z fdZd Z xZS )DebertaOnlyMLMHeadc                 B    t         |           t        |      | _        y r   )r   r   r4  lm_headrI   s     r&   r   zDebertaOnlyMLMHead.__init__%  s    .v6r'   c                 *    | j                  ||      }|S r   )rn  )r"   rV  r   rf  s       r&   r6   zDebertaOnlyMLMHead.forward*  s     LL/J  r'   rP   r=   s   @r&   rl  rl  $  s    7
!r'   rl  c                   6    e Zd ZdddZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 ddej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  de
dz  de
dz  de
dz  deez  fd       Z xZS )DebertaForMaskedLMzcls.predictions.bias)deberta.embeddings.word_embeddings.weight)zcls.predictions.decoder.biaszcls.predictions.decoder.weightc                     t         |   |       |j                  | _        t        |      | _        | j                  rt        |      | _        nddi| _        t        |      | _	        | j                          y )Nzlm_predictions.lm_head.weightrr  )r   r   legacyr>  r.  rb  cls_tied_weights_keysrl  lm_predictionsrC  rI   s     r&   r   zDebertaForMaskedLM.__init__6  sg     mm#F+;;/7DH 01\'D# #5V"<D 	r'   c                     | j                   r | j                  j                  j                  S | j                  j
                  j                  S r   )rt  ru  rd  r_  rw  rn  rC   rF  s    r&   get_output_embeddingsz(DebertaForMaskedLM.get_output_embeddingsE  s7    ;;88''///&&..444r'   c                    | j                   rA|| j                  j                  _        |j                  | j                  j                  _        y || j
                  j                  _        |j                  | j
                  j                  _        y r   )rt  ru  rd  r_  r    rw  rn  rC   rI  s     r&   set_output_embeddingsz(DebertaForMaskedLM.set_output_embeddingsK  sa    ;;+9DHH  ((6(;(;DHH  %0>D''-/=/B/BD'',r'   Nr   r   r   r   r   labelsr   r  r  r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  r| j	                  |      }n0| j                  || j                  j                  j                        }d}|Ft               } ||j                  d| j                   j                        |j                  d            }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        Nr   r   r   r   r   r  r  r   r)   r   losslogitsr2   r%  )rJ   rM  r.  rt  ru  rw  r   r   r   rW   r   r   r2   r%  )r"   r   r   r   r   r   r|  r   r  r  rP  outputsrV  rf  masked_lm_lossloss_fctr   s                    r&   r6   zDebertaForMaskedLM.forwardS  s!   * &1%<k$++B]B],,))%'/!5#  	
 "!*;; $ 9 $ 3 3OT\\E\E\ElEl m')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r'   	NNNNNNNNN)r8   r9   r:   rv  r   ry  r{  r   r   r   r   r   r   r6   r<   r=   s   @r&   rq  rq  /  s     )?*U
5C  *..2.2,0-1&*)-,0#'5
<<$&5
 t+5
 t+	5

 llT)5
 ||d*5
 t#5
  $;5
 #Tk5
 D[5
 
	5
 5
r'   rq  c                   4     e Zd Z fdZd Zed        Z xZS )ContextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        || _	        y r   )
r   r   r   rA   pooler_hidden_sizerC   rF   pooler_dropoutrH   rJ   rI   s     r&   r   zContextPooler.__init__  sI    YYv88&:S:ST
zz&"7"78r'   c                     |d d df   }| j                  |      }| j                  |      }t        | j                  j                     |      }|S r@  )rH   rC   r	   rJ   pooler_hidden_act)r"   r2   context_tokenpooled_outputs       r&   r6   zContextPooler.forward  sM     &ad+]3

=1t{{<<=mLr'   c                 .    | j                   j                  S r   )rJ   rB   rF  s    r&   
output_dimzContextPooler.output_dim  s    {{&&&r'   )r8   r9   r:   r   r6   propertyr  r<   r=   s   @r&   r  r    s!     ' 'r'   r  z
    DeBERTa Model transformer with a sequence classification/regression head on top (a linear layer on top of the
    pooled output) e.g. for GLUE tasks.
    )custom_introc                   ,    e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  de	dz  de	dz  de	dz  de
ez  fd       Z xZS ) DebertaForSequenceClassificationc                    t         |   |       t        |dd      }|| _        t	        |      | _        t        |      | _        | j                  j                  }t        j                  ||      | _        t        |dd       }|| j                  j                  n|}t        j                  |      | _        | j!                          y )N
num_labelsr+   cls_dropout)r   r   r   r  r>  r.  r  poolerr  r   rA   
classifierrJ   rG   rF   rH   rC  )r"   rJ   r  r  drop_outr%   s        r&   r   z)DebertaForSequenceClassification.__init__  s     V\15
$#F+#F+[[++
))J
;6=$76>6F4;;22Hzz(+ 	r'   c                 6    | j                   j                         S r   )r.  rG  rF  s    r&   rG  z5DebertaForSequenceClassification.get_input_embeddings  s    ||0022r'   c                 :    | j                   j                  |       y r   )r.  rK  rI  s     r&   rK  z5DebertaForSequenceClassification.set_input_embeddings  s    )).9r'   Nr   r   r   r   r   r|  r   r  r  r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }| j	                  |      }| j                  |      }d}|| j                   j                  | j                  dk(  rXt        j                         }|j                  d      j                  |j                        } |||j                  d            }n_|j                         dk(  s|j                  d      dk(  r|dk\  j                         }|j!                         }|j                  d      dkD  rt#        j$                  |d|j'                  |j                  d      |j                  d                  }t#        j$                  |d|j                  d            }t)               } ||j                  d| j                        j+                         |j                  d            }nIt#        j,                  d      j                  |      }n#t        j.                  d      } ||      |z  j1                  d      j3                          }n| j                   j                  dk(  rIt               }| j                  dk(  r& ||j5                         |j5                               }n |||      }n| j                   j                  dk(  r=t)               } ||j                  d| j                        |j                  d            }n,| j                   j                  dk(  rt7               } |||      }|	s|f|dd z   }||f|z   S |S t9        |||j:                  |j<                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        N)r   r   r   r   r   r  r  r   r   r)   
regressionsingle_label_classificationmulti_label_classificationr  )rJ   rM  r.  r  rH   r  problem_typer  r   r   rW   r1   r,   r}   r#   nonzerorV   r   r   rd   r   r-   rq   
LogSoftmaxsumr.   r   r   r   r2   r%  )r"   r   r   r   r   r   r|  r   r  r  rP  r  encoder_layerr  r  r  loss_fnlabel_indexlabeled_logitsr  log_softmaxr   s                         r&   r6   z(DebertaForSequenceClassification.forward  s   ( &1%<k$++B]B],,))%'/!5#  	
  
M2]3/{{''/??a' jjlG#[[_//=F"66;;r?;DZZ\Q&&++b/Q*>#)Q;"7"7"9K#[[]F"''*Q.)."A{'9'9+:J:J1:Mv{{[\~'^* "'fa9I9I"9M!N#3#5'(;(;B(P(V(V(XZ`ZeZefhZij$||A11&9"$--"3K)&1F:??CIIKKD))\9"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'fG4I4IV]VhVh
 	
r'   r  )r8   r9   r:   r   rG  rK  r   r   r   r   r   r   r6   r<   r=   s   @r&   r  r    s    $3:  *..2.2,0-1&*)-,0#'N
<<$&N
 t+N
 t+	N

 llT)N
 ||d*N
 t#N
  $;N
 #TkN
 D[N
 
)	)N
 N
r'   r  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  dee	z  fd       Z
 xZS )DebertaForTokenClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r   )r   r   r  r>  r.  r   rF   rG   rH   rA   rB   r  rC  rI   s     r&   r   z&DebertaForTokenClassification.__init__  si      ++#F+zz&"<"<=))F$6$68I8IJ 	r'   Nr   r   r   r   r   r|  r   r  r  r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }| j	                  |      }d}|<t               } ||j                  d| j                        |j                  d            }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr~  r   r)   r   r  )rJ   rM  r.  rH   r  r   rW   r  r   r2   r%  )r"   r   r   r   r   r   r|  r   r  r  rP  r  rV  r  r  r  r   s                    r&   r6   z%DebertaForTokenClassification.forward   s    $ &1%<k$++B]B],,))%'/!5#  	
 "!*,,71')HFKKDOO<fkk"oNDY,F)-)9TGf$EvE$fG4I4IV]VhVh
 	
r'   r  )r8   r9   r:   r   r   r   r   r   r   r   r6   r<   r=   s   @r&   r  r    s    	  *..2.2,0-1&*)-,0#'.
<<$&.
 t+.
 t+	.

 llT).
 ||d*.
 t#.
  $;.
 #Tk.
 D[.
 
&	&.
 .
r'   r  c                   @    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  dee	z  fd       Z
 xZS )DebertaForQuestionAnsweringc                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y r   )
r   r   r  r>  r.  r   rA   rB   
qa_outputsrC  rI   s     r&   r   z$DebertaForQuestionAnswering.__init__T  sS      ++#F+))F$6$68I8IJ 	r'   Nr   r   r   r   r   start_positionsend_positionsr   r  r  r   c           
      &   |
|
n| j                   j                  }
| j                  |||||||	|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|
s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr~  r   r   r)   r   )ignore_indexr+   )r  start_logits
end_logitsr2   r%  )rJ   rM  r.  r  splitr   r   r   r#   r   r   r   r2   r%  )r"   r   r   r   r   r   r  r  r   r  r  rP  r  rV  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                          r&   r6   z#DebertaForQuestionAnswering.forward^  s    &1%<k$++B]B],,))%'/!5#  	
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r'   )
NNNNNNNNNN)r8   r9   r:   r   r   r   r   r   r   r   r6   r<   r=   s   @r&   r  r  R  s      *..2.2,0-1/3-1)-,0#'=
<<$&=
 t+=
 t+	=

 llT)=
 ||d*=
 ,=
 ||d*=
  $;=
 #Tk=
 D[=
 
-	-=
 =
r'   r  )rq  r  r  r  r>  r-  )Ar;   r   r   torch.nnr   r   r    r   r1  activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   r   modeling_utilsr   utilsr   r   configuration_debertar   
get_loggerr8   loggerModuler   r?   jitscriptra   rg   ri   rm   r   r   rr   ru   rz   r   r   r   r   r   r  r  r  r-  r>  rX  r3  rb  r4  rl  rq  r  r  r  r  __all__r  r'   r&   <module>r     sg      A A & ! 9  . , 0 
		H	%ryy (		   8 r r n n [ [ \%,, \c \ \ ELL U\\   d d dgj d d    C		 CLN		 Nb,ryy ,F")) BII (- (BO
RYY O
d i_ i i& a
) a
 a
H299 &BII &!ryy !bii 6! ! Y
/ Y
 Y
x'BII ', h
'= h
h
V ;
$: ;
 ;
| I
"8 I
 I
Xr'   