
    qi                        d Z ddlZddlZddlmZ ddlmZmZmZ ddlm	Z
 ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZ ddlmZmZmZmZmZm Z  ddl!m"Z"  e jF                  e$      Z%da&d Z'd Z(dGdZ)dGdZ*dGdZ+d Z, G d dejZ                  j\                        Z/ G d dejZ                  j\                        Z0 G d d      Z1dHdZ2d Z3	 	 	 dIdZ4 G d dejj                        Z6 G d  d!ejj                        Z7 G d" d#ejj                        Z8 G d$ d%ejj                        Z9 G d& d'ejj                        Z: G d( d)ejj                        Z; G d* d+e      Z< G d, d-ejj                        Z= G d. d/ejj                        Z> G d0 d1ejj                        Z? G d2 d3ejj                        Z@e G d4 d5e             ZAe G d6 d7eA             ZBe G d8 d9eA             ZC G d: d;ejj                        ZD ed<=       G d> d?eA             ZEe G d@ dAeA             ZFe G dB dCeA             ZGe G dD dEeA             ZHg dFZIy)JzPyTorch MRA model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)auto_docstringis_cuda_platformis_kernels_availableis_ninja_availableis_torch_cuda_availablelogging   )	MraConfigc                  J    t               st        d      ddlm}   | d      ay )NzFkernels is not installed, please install it with `pip install kernels`r   
get_kernelzkernels-community/mra)r   ImportErrorintegrations.hub_kernelsr   mra_cuda_kernelr   s    V/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/mra/modeling_mra.pyload_cuda_kernelsr"   3   s"    !bcc6 !89O    c                 N   t        | j                               dk7  rt        d      t        |j                               dk7  rt        d      | j                  d      dk7  rt        d      | j                  d      dk7  rt        d      | j                  d	
      j                  j                  dd	      }|j                         }|j                         }|j                         }t        j                  ||||      \  }}|j                  dd	      dddddddf   }||fS )z8
    Computes maximum values for softmax stability.
       z.sparse_qk_prod must be a 4-dimensional tensor.   'indices must be a 2-dimensional tensor.    z>The size of the second dimension of sparse_qk_prod must be 32.r   z=The size of the third dimension of sparse_qk_prod must be 32.dimN)
lensize
ValueErrormaxvalues	transpose
contiguousintr    	index_max)sparse_qk_prodindicesquery_num_blockkey_num_block
index_valsmax_valsmax_vals_scatters          r!   
sparse_maxr=   <   s    > !Q&IJJ
7<<>aBCC1#YZZ1#XYY###+22<<RDJ&&(JkkmG  "G!0!:!::wP_an!oH'11"b9!Qa-H%%%r#   c                    t        | j                               dk7  rt        d      t        |j                               dk7  rt        d      | j                  d   |j                  d   k7  rt        d      | j                  \  }}||z  }t	        j
                  |j                  d      t        j                  |j                        }| j                  |||      } | |dddf   ||z  j                         ddf   } | S )zN
    Converts attention mask to a sparse mask for high resolution logits.
    r&   z$mask must be a 2-dimensional tensor.r'   r   zBmask and indices must have the same size in the zero-th dimension.dtypedeviceN)	r-   r.   r/   shapetorcharangelongrA   reshape)maskr7   
block_size
batch_sizeseq_len	num_block	batch_idxs          r!   sparse_maskrM   X   s     499;1?@@
7<<>aBCCzz!}a((]^^**J:%IW\\!_EJJw~~VI<<
Iz:D	!T'"Wy%8$>$>$@!CDDKr#   c                 j   | j                         \  }}}|j                         \  }}}||z  dk7  rt        d      ||z  dk7  rt        d      | j                  |||z  ||      j                  dd      } |j                  |||z  ||      j                  dd      }t	        | j                               dk7  rt        d      t	        |j                               dk7  rt        d      t	        |j                               d	k7  rt        d
      | j                  d      dk7  rt        d      |j                  d      dk7  rt        d      | j                         } |j                         }|j                         }|j                         }t        j                  | ||j                               S )z7
    Performs Sampled Dense Matrix Multiplication.
    r   zTquery_size (size of first dimension of dense_query) must be divisible by block_size.Pkey_size (size of first dimension of dense_key) must be divisible by block_size.r,   r)   r%   z+dense_query must be a 4-dimensional tensor.)dense_key must be a 4-dimensional tensor.r&   r'   r   r(   z.The third dimension of dense_query must be 32.z,The third dimension of dense_key must be 32.)	r.   r/   rF   r2   r-   r3   r4   r    mm_to_sparse)	dense_query	dense_keyr7   rH   rI   
query_sizer+   _key_sizes	            r!   rQ   rQ   o   s    #."2"2"4J
C ~~'AxJ!#opp*!kll%%j*
2JJX[\ffgikmnK!!*h*.DjRUV``aceghI
;!#FGG
9>>!DEE
7<<>aBCCb IJJ~~aBGHH((*K$$&IkkmG  "G''YNNr#   c                 B   |j                         \  }}}||z  dk7  rt        d      | j                  d      |k7  rt        d      | j                  d      |k7  rt        d      |j                  |||z  ||      j                  dd      }t	        | j                               d	k7  rt        d
      t	        |j                               d	k7  rt        d      t	        |j                               dk7  rt        d      |j                  d      dk7  rt        d      | j                         } |j                         }|j                         }|j                         }t        j                  | |||      }|j                  dd      j                  |||z  |      }|S )zP
    Performs matrix multiplication of a sparse matrix with a dense matrix.
    r   rO   r&   zQThe size of the second dimension of sparse_query must be equal to the block_size.r   zPThe size of the third dimension of sparse_query must be equal to the block_size.r,   r)   r%   ,sparse_query must be a 4-dimensional tensor.rP   r'   r(   z8The size of the third dimension of dense_key must be 32.)	r.   r/   rF   r2   r-   r3   r4   r    sparse_dense_mm)	sparse_queryr7   rS   r8   rH   rI   rV   r+   dense_qk_prods	            r!   rY   rY      s    !* 0J#*!kllz)lmmz)kll!!*h*.DjRUV``aceghI
<1$GHH
9>>!DEE
7<<>aBCC~~aBSTT**,LkkmG  "G$$&I#33L'9VefM!++B3;;JZdHdfijMr#   c                 `    | |z  |z  t        j                  | |d      z   j                         S )Nfloorrounding_mode)rC   divrE   )r7   dim_1_blockdim_2_blocks      r!   transpose_indicesrc      s.    {"k1EIIg{bi4jjpprrr#   c                   >    e Zd Zed        Zed        Zedd       Zy)MraSampledDenseMatMulc                 V    t        ||||      }| j                  |||       || _        |S N)rQ   save_for_backwardrH   )ctxrR   rS   r7   rH   r6   s         r!   forwardzMraSampledDenseMatMul.forward   s1    %k9gzRk9g>#r#   c                    | j                   \  }}}| j                  }|j                  d      |z  }|j                  d      |z  }t        |||      }t	        |j                  dd      |||      }	t	        ||||      }
|
|	d d fS Nr   r,   r)   )saved_tensorsrH   r.   rc   rY   r2   )ri   gradrR   rS   r7   rH   r8   r9   	indices_Tgrad_key
grad_querys              r!   backwardzMraSampledDenseMatMul.backward   s    *-*;*;'Y^^
%**1-;!q)Z7%gN	"4>>"b#99kS`a$T7IO
8T4//r#   c                 2    t         j                  | |||      S rg   )re   apply)rR   rS   r7   rH   s       r!   operator_callz#MraSampledDenseMatMul.operator_call   s    $**;	7JWWr#   Nr(   __name__
__module____qualname__staticmethodrj   rr   ru    r#   r!   re   re      s>      0 0 X Xr#   re   c                   <    e Zd Zed        Zed        Zed        Zy)MraSparseDenseMatMulc                 V    t        ||||      }| j                  |||       || _        |S rg   )rY   rh   r8   )ri   rZ   r7   rS   r8   r6   s         r!   rj   zMraSparseDenseMatMul.forward   s2    (w	?[lGY?-r#   c                     | j                   \  }}}| j                  }|j                  d      |j                  d      z  }t        |||      }t	        |j                  dd      |||      }t        |||      }	|	d |d fS rl   )rm   r8   r.   rc   rY   r2   rQ   )
ri   rn   rZ   r7   rS   r8   r9   ro   rp   rq   s
             r!   rr   zMraSparseDenseMatMul.backward   s    +.+<+<(gy--!q)\->->r-BB%gN	"<#9#9"b#A9dTab!$	7;
44//r#   c                 2    t         j                  | |||      S rg   )r~   rt   )rZ   r7   rS   r8   s       r!   ru   z"MraSparseDenseMatMul.operator_call   s    #)),O\\r#   Nrw   r|   r#   r!   r~   r~      s>      0 0 ] ]r#   r~   c                       e Zd Zed        Zy)MraReduceSumc                 B   | j                         \  }}}}t        | j                               dk7  rt        d      t        |j                               dk7  rt        d      | j                         \  }}}}|j                         \  }}| j                  d      j	                  ||z  |      } t        j                  |j                  d      t
        j                  |j                        }t        j                  ||d	      j                         |d d d f   |z  z   j	                  ||z        }	t        j                  ||z  |f| j                  | j                        }
|
j                  d|	|       j	                  |||      }|j	                  |||z        }|S )
Nr%   rX   r&   r'   r*   r   r?   r]   r^   )r.   r-   r/   sumrF   rC   rD   rE   rA   r`   zerosr@   	index_add)rZ   r7   r8   r9   rI   rK   rH   rU   rL   global_idxestempoutputs               r!   ru   zMraReduceSum.operator_call   sy   /;/@/@/B,
Iz1|  "#q(KLLw||~!#FGG*//11j! '
I#''A'.66zI7MzZLLa

7>>Z	IIg}GDIIKiXY[_X_N`crNrr
'*y(
) 	 {{/):6l>P>PYeYlYl
 <>FFzSbdno
Oj,HIr#   N)rx   ry   rz   r{   ru   r|   r#   r!   r   r      s     r#   r   c                 &   | j                         \  }}}||z  }d}	||j                  |||      j                  d      }
| j                  ||||      j                  d      |
dddddf   dz   z  }|j                  ||||      j                  d      |
dddddf   dz   z  }||j                  ||||      j                  d      |
dddddf   dz   z  }	n|t        j                  ||t        j
                  | j                        z  }
| j                  ||||      j                  d      }|j                  ||||      j                  d      }|$|j                  ||||      j                  d      }	t        j                  ||j                  dd            t        j                  |      z  }|j                  dd      j                  }|0|d	|
dddddf   |
dddddf   z  d
k  j                         z  z
  }||
||	fS )z/
    Compute low resolution approximation.
    Nr,   r*   r)   ư>r?   T)r+   keepdims     @g      ?)r.   rF   r   rC   onesfloatrA   meanmatmulr2   mathsqrtr0   r1   )querykeyrH   rG   valuerI   rJ   head_dimnum_block_per_row	value_hattoken_count	query_hatkey_hatlow_resolution_logitlow_resolution_logit_row_maxs                  r!   get_low_resolution_logitr     sN    %*JJL!J:-Ill:/@*MQQVXQYMM*.?XVZZ_aZb1d
#d*
	 ++j*;ZRVV[]V^1d
#d*
 j2CZQYZ^^ce^fAq$J'$.I !5::j:KSXS^S^glgsgs#ttMM*.?XV[[`b[c	++j*;ZRWW\^W_j2CZQYZ__df_gI <<	73D3DR3LMPTPYPYZbPcc#7#;#;T#;#R#Y#Y  3;q$z+B[QRTUW[Q[E\+\`c*c)j)j)l#ll 	  .JIUUr#   c                    | j                   \  }}}|dkD  rf|dz  }t        j                  ||| j                        }	t        j                  t        j
                  |	|       |      }
| |
dddddf   dz  z   } |dkD  r:| ddd|ddf   dz   | ddd|ddf<   | ddddd|f   dz   | ddddd|f<   t        j                  | j                  |d      |ddd	
      }|j                  }|dk(  rE|j                  j                  d      j                  }| |ddddf   k\  j                         }||fS |dk(  rd}||fS t        | d      )zZ
    Compute the indices of the subset of components to be used in the approximation.
    r   r&   rA   )diagonalNg     @r,   TF)r+   largestsortedfullr*   sparsez# is not a valid approx_model value.)rB   rC   r   rA   triltriutopkrF   r7   r1   minr   r/   )r   
num_blocksapprox_modeinitial_prior_first_n_blocksinitial_prior_diagonal_n_blocksrI   total_blocks_per_rowrU   offset	temp_maskdiagonal_mask
top_k_valsr7   	thresholdhigh_resolution_masks                  r!   get_block_idxesr   7  s    +?*D*D'J$a&*0A5JJ35IRfRmRmn	

5::i6'#JU[\3mD!QJ6ORU6UU#a' $A%A$A1!DEK 	Q =!= =q@A !A'D(D'D!DEK 	Q#@$@#@@A $$Z4jbRV_dJ   Gf%%))b)188	 4	!T4-8P PWWY ((( 
	 # ((( K=(KLMMr#   c	                    t         #t        j                  |       j                         S | j	                         \  }	}
}}|	|
z  }||z  dk7  rt        d      ||z  }| j                  |||      } |j                  |||      }|j                  |||      }|-| |dddddf   z  } ||dddddf   z  }||dddddf   z  }|dk(  rt        | ||||      \  }}}}nA|dk(  r1t        j                         5  t        | |||      \  }}}}ddd       nt        d      t        j                         5  z
  }t        |||||      \  }}ddd       t        j                  | ||      t        j                  |      z  }t        ||||      \  }}||z
  }|"|dd	t!        ||      dddddddf   z
  z  z
  }t        j"                  |      }t$        j                  ||||      }t&        j                  ||||      }|dk(  ryt        j"                  z
  dz  z
        dddddf   z  }t        j(                  |      dddddddf   j+                  d	d	|d	      j                  |||      }|j-                  d
      dddddf   j+                  d	d	|      j                  ||      }|j+                  d	d	|      j                  ||      |z
  } || |z  } t        j"                  | | dk  j/                         z        }!||!dddddf   z  }||!z  }t        j"                  |  | dkD  j/                         z        }"||"dddddf   z  }||"z  }||z   |dddddf   |dddddf   z   dz   z  }#n#|dk(  r||dddddf   dz   z  }#nt        d      ||#|dddddf   z  }#|#j                  |	|
||      }#|#S # 1 sw Y   xY w# 1 sw Y   xY w)z0
    Use Mra to approximate self-attention.
    Nr   z4sequence length must be divisible by the block_size.r   r   z&approx_mode must be "full" or "sparse")rH   r   r   r,   r*   r   z-config.approx_mode must be "full" or "sparse")r    rC   
zeros_likerequires_grad_r.   r/   rF   r   no_grad	Exceptionr   re   ru   r   r   r=   rM   expr~   r   r   repeatr   r   )$r   r   r   rG   r   r   rH   r   r   rI   num_headrJ   r   
meta_batchr   r   r   r   r   rU   low_resolution_logit_normalizedr7   r   high_resolution_logitr;   r<   high_resolution_attnhigh_resolution_attn_outhigh_resolution_normalizerlow_resolution_attnlow_resolution_attn_outlow_resolution_normalizerlog_correctionlow_resolution_corrhigh_resolution_corrcontext_layers$                                       r!   mra2_attentionr   ]  s    &5577.3jjl+J'8h&Jq OPP:-MM*gx8E
++j'8
4CMM*gx8EQ4Z((DAt$$Q4Z((fUm3
D%V
Rk+G 
	 ]]_ 	QisJRN +/KQ	 	
 @AA	 
*>A]*]'(7+(+)
%%
 2??sG
 @ 		( ",,A7L]_p!qH14DD 5q;tU\C]^_abdegk^kCl?l8m m 99%:;3AAgu.?  ".!;!;g'8:K" fII*-IICRfLffg!T1*%& 	 LL,i8AtQGVAq*a(WZ(3 	   ###+Aq$J7>>q!ZPXXYcelm 	" 6<<Q:NVVWacjknvv+d2N#ii.A:M9T9T9V(VW"9<OPQSTVZPZ<["[$=@S$S!$yy.NQ<N;U;U;W)WX#;>RSTVWY]S]>^#^ %?BV%V"14KK&q!Tz25NqRSUYz5ZZ]aa
 
	 04NqRSUYz4Z]a4abGHH%Q4Z(88!))*hRMS	 	
 
s   7O
3O
OO!c                   *     e Zd ZdZ fdZddZ xZS )MraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 L   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  dz   |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      dz          | j#                  dt%        j*                  | j,                  j/                         t$        j0                  | j,                  j2                        d	       y )
N)padding_idxr&   epsposition_idsr   r,   token_type_idsr?   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrC   rD   expandr   r   r.   rE   rA   selfconfig	__class__s     r!   r   zMraEmbeddings.__init__  s*   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NQR0RTZTfTf#g %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<= 	^U\\&:X:X-Y-`-`ah-ilm-mnKK))..0

4K\K\KcKcd 	 	
r#   c                 6   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	||	z   }
| j                  |      }|
|z  }
| j                  |
      }
| j                  |
      }
|
S )Nr,   r   r   r   r?   )r.   r   hasattrr   r   rC   r   rE   rA   r   r   r   r   r   )r   	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s               r!   rj   zMraEmbeddings.forward  s/    #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D))
^^J/
\\*-
r#   )NNNNrx   ry   rz   __doc__r   rj   __classcell__r   s   @r!   r   r     s    Q
" r#   r   c                   &     e Zd Z fdZddZ xZS )MraSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      t        d u}t               r!t               rt               r|s	 t                |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t#        j$                  |j                  | j                         | _        t#        j$                  |j                  | j                         | _        t#        j$                  |j                  | j                         | _        t#        j,                  |j.                        | _        |j2                  dz  |j4                  z  | _        t9        | j6                  t        |j2                  dz  dz              | _        |j:                  | _        |j<                  | _        |j>                  | _        y # t        $ r#}t        j                  d|        Y d }~d }~ww xY w)	Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zGCould not load the custom kernel for multi-scale deformable attention: r(   r&   ) r   r   r   num_attention_headsr   r/   r    r   r   r   r"   r   loggerwarningr4   attention_head_sizeall_head_sizer   Linearr   r   r   r   attention_probs_dropout_probr   r   block_per_rowrK   r   r   r   r   )r   r   kernel_loadeder   s       r!   r   zMraSelfAttention.__init__  s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 (t3"$)9);@R@T]jn!# $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF 88B>&BVBVVT^^S&2P2PTV2V[\1\-]^!--,2,O,O)/5/U/U,%  n!hijhklmmns   
H! !	I*IIc           
      l   |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }d|dz  z   }|j                         j                  d| j                  d      j                  || j                  z  |      j                         }d}	| j                  |	k  r|| j                  ||	| j                  z
  f}
t        j                  |t        j                  |
|j                        gd      }t        j                  |t        j                  |
|j                        gd      }t        j                  |t        j                  |
|j                        gd      }t!        |j#                         |j#                         |j#                         |j#                         | j$                  | j&                  | j(                  | j*                  	      }| j                  |	k  r|d d d d d d d | j                  f   }|j                  || j                  || j                        }|j-                  d
ddd      j/                         }|j1                         d d | j2                  fz   } |j                  | }|f}|S )Nr,   r   r&   g      ?r   r(   r   r*   )r   r   r   r   r   r)   )rB   r   viewr   r  r2   r   r   squeezer   rF   r4   rC   catr   rA   r   r   rK   r   r   r   permuter3   r.   r  )r   hidden_statesattention_maskrI   rJ   rU   query_layer	key_layervalue_layergpu_warp_sizepad_sizer   new_context_layer_shapeoutputss                 r!   rj   zMraSelfAttention.forward(  s   !.!4!4
GQJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 ~77""$VAt//3WZ$":"::GDSU	 	 ##m3!4#;#;WmVZVnVnFnnH))[%++h{OaOa2b$ciklK		9ekk(9K[K[.\"]cefI))[%++h{OaOa2b$ciklK&OO  "NN(()-)J)J,0,P,P	
 ##m3)!Q3MT5M5M3M*MNM%--j$:R:RT[]a]u]uv%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD "r#   rg   rx   ry   rz   r   rj   r   r   s   @r!   r   r     s    V@<r#   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )MraSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )r   r   r   r  r   denser   r   r   r   r   r   s     r!   r   zMraSelfOutput.__init__i  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r#   r  input_tensorreturnc                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rg   r  r   r   r   r  r  s      r!   rj   zMraSelfOutput.forwardo  7    

=1]3}|'CDr#   rx   ry   rz   r   rC   Tensorrj   r   r   s   @r!   r  r  h  1    >U\\  RWR^R^ r#   r  c                   &     e Zd Z fdZddZ xZS )MraAttentionc                 b    t         |           t        |      | _        t	        |      | _        y rg   )r   r   r   r   r  r   r   s     r!   r   zMraAttention.__init__w  s&    $V,	#F+r#   c                 f    | j                  ||      }| j                  |d   |      }|f|dd  z   }|S Nr   r   )r   r   )r   r  r  self_outputsattention_outputr  s         r!   rj   zMraAttention.forward|  s@    yy?;;|AF#%QR(88r#   rg   r  r   s   @r!   r(  r(  v  s    ,
r#   r(  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MraIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y rg   )r   r   r   r  r   intermediate_sizer  
isinstance
hidden_actstrr	   intermediate_act_fnr   s     r!   r   zMraIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r#   r  r  c                 J    | j                  |      }| j                  |      }|S rg   )r  r5  r   r  s     r!   rj   zMraIntermediate.forward  s&    

=100?r#   r$  r   s   @r!   r/  r/    s#    9U\\ ell r#   r/  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )	MraOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r  )r   r   r   r  r1  r   r  r   r   r   r   r   r   s     r!   r   zMraOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r#   r  r  r  c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rg   r!  r"  s      r!   rj   zMraOutput.forward  r#  r#   r$  r   s   @r!   r9  r9    r&  r#   r9  c                   ,     e Zd Z fdZddZd Z xZS )MraLayerc                     t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        t        |      | _        t        |      | _
        y Nr   )r   r   chunk_size_feed_forwardseq_len_dimr(  	attentionadd_cross_attentionr/  intermediater9  r   r   s     r!   r   zMraLayer.__init__  sW    '-'E'E$%f-#)#=#= +F3'r#   c                     | j                  ||      }|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S r+  )rB  r   feed_forward_chunkr@  rA  )r   r  r  self_attention_outputsr-  r  layer_outputs          r!   rj   zMraLayer.forward  sc    !%~!N1!4(,0##T%A%A4CSCSUe
  /G+r#   c                 L    | j                  |      }| j                  ||      }|S rg   )rD  r   )r   r-  intermediate_outputrH  s       r!   rF  zMraLayer.feed_forward_chunk  s,    "//0@A{{#68HIr#   rg   )rx   ry   rz   r   rj   rF  r   r   s   @r!   r=  r=    s    (r#   r=  c                   ,     e Zd Z fdZ	 	 	 ddZ xZS )
MraEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r   r   r   r   
ModuleListrangenum_hidden_layersr=  layergradient_checkpointing)r   r   rU   r   s      r!   r   zMraEncoder.__init__  sN    ]]eFD\D\>]#^HV$4#^_
&+# $_s   A#c                     |rdnd }t        | j                        D ]  \  }}|r||fz   } |||      }|d   } |r||fz   }|st        d ||fD              S t        ||      S )Nr|   r   c              3   &   K   | ]	  }||  y wrg   r|   ).0vs     r!   	<genexpr>z%MraEncoder.forward.<locals>.<genexpr>  s     Xq!-Xs   )last_hidden_stater  )	enumeraterQ  tupler   )	r   r  r  output_hidden_statesreturn_dictall_hidden_statesilayer_modulelayer_outputss	            r!   rj   zMraEncoder.forward  s     #7BD(4 	-OA|#$58H$H!(GM)!,M	-   1]4D DX]4E$FXXX1++
 	
r#   )NFTr  r   s   @r!   rL  rL    s    , "
r#   rL  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MraPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r  )r   r   r   r  r   r  r2  r3  r4  r	   transform_act_fnr   r   r   s     r!   r   z#MraPredictionHeadTransform.__init__  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr#   r  r  c                 l    | j                  |      }| j                  |      }| j                  |      }|S rg   )r  rd  r   r7  s     r!   rj   z"MraPredictionHeadTransform.forward  s4    

=1--m<}5r#   r$  r   s   @r!   rb  rb    s$    UU\\ ell r#   rb  c                   $     e Zd Z fdZd Z xZS )MraLMPredictionHeadc                    t         |           t        |      | _        t	        j
                  |j                  |j                  d      | _        t	        j                  t        j                  |j                              | _        y )NT)bias)r   r   rb  	transformr   r  r   r   decoder	ParameterrC   r   ri  r   s     r!   r   zMraLMPredictionHead.__init__  s[    3F; yy!3!3V5F5FTRLLV->->!?@	r#   c                 J    | j                  |      }| j                  |      }|S rg   )rj  rk  r7  s     r!   rj   zMraLMPredictionHead.forward  s$    }5]3r#   r  r   s   @r!   rg  rg    s    Ar#   rg  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MraOnlyMLMHeadc                 B    t         |           t        |      | _        y rg   )r   r   rg  predictionsr   s     r!   r   zMraOnlyMLMHead.__init__  s    .v6r#   sequence_outputr  c                 (    | j                  |      }|S rg   )rq  )r   rr  prediction_scoress      r!   rj   zMraOnlyMLMHead.forward  s     ,,_=  r#   r$  r   s   @r!   ro  ro    s#    7!u|| ! !r#   ro  c                   t     e Zd ZU eed<   dZdZ ej                         de	j                  f fd       Z xZS )MraPreTrainedModelr   mraTmodulec                    t         |   |       t        |t              r t	        j
                  |j                         yt        |t              r|t	        j                  |j                  t        j                  |j                  j                  d         j                  d      dz          t	        j
                  |j                         yy)zInitialize the weightsr,   r   r&   N)r   _init_weightsr2  rg  initzeros_ri  r   copy_r   rC   rD   rB   r   r   )r   rx  r   s     r!   rz  z MraPreTrainedModel._init_weights  s     	f%f12KK$.JJv**ELL9L9L9R9RSU9V,W,^,^_f,gjk,klKK--. /r#   )rx   ry   rz   r   __annotations__base_model_prefixsupports_gradient_checkpointingrC   r   r   Modulerz  r   r   s   @r!   rv  rv    s<     &*#U]]_/BII / /r#   rv  c                        e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	dz  de	dz  de
ez  fd       Z xZS )MraModelc                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y rg   )r   r   r   r   r   rL  encoder	post_initr   s     r!   r   zMraModel.__init__$  s;     '/!&) 	r#   c                 .    | j                   j                  S rg   r   r   r   s    r!   get_input_embeddingszMraModel.get_input_embeddings.  s    ...r#   c                 &    || j                   _        y rg   r  )r   r   s     r!   set_input_embeddingszMraModel.set_input_embeddings1  s    */'r#   Nr   r  r   r   r   r[  r\  r  c                 ^   ||n| j                   j                  }||n| j                   j                  }||t        d      |#| j	                  ||       |j                         }	n!||j                         d d }	nt        d      |	\  }
}||j                  n|j                  }|t        j                  |
|f|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  |
|      }|}n&t        j                  |	t        j                  |      }| j                  ||	      }| j                  ||||      }| j!                  ||||      }|d	   }|s	|f|d
d  z   S t#        ||j$                  |j&                  |j(                        S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer,   z5You have to specify either input_ids or inputs_embedsr   r   r?   )r   r   r   r   )r  r[  r\  r   r   )rX  r  
attentionscross_attentions)r   r[  use_return_dictr/   %warn_if_padding_and_no_attention_maskr.   rA   rC   r   r   r   r   r   r   rE   get_extended_attention_maskr  r   r  r  r  )r   r   r  r   r   r   r[  r\  kwargsr   rI   r   rA   r   r   extended_attention_maskembedding_outputencoder_outputsrr  s                      r!   rj   zMraModel.forward4  s    %9$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m??%)'	 + 
 ,,2!5#	 ' 
 *!,#%(;;;1-)77&11,==	
 	
r#   )NNNNNNN)rx   ry   rz   r   r  r  r   rC   r%  boolrZ  r   rj   r   r   s   @r!   r  r  "  s    /0  *..2.2,0-1,0#'B
<<$&B
 t+B
 t+	B

 llT)B
 ||d*B
 #TkB
 D[B
 
3	3B
 B
r#   r  c                   *    e Zd ZdddZ fdZd Zd Ze	 	 	 	 	 	 	 	 ddej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  de
dz  de
dz  deez  fd       Z xZS )MraForMaskedLMzcls.predictions.biasz%mra.embeddings.word_embeddings.weight)zcls.predictions.decoder.biaszcls.predictions.decoder.weightc                     t         |   |       t        |      | _        t	        |      | _        | j                          y rg   )r   r   r  rw  ro  clsr  r   s     r!   r   zMraForMaskedLM.__init__  s4     F#!&) 	r#   c                 B    | j                   j                  j                  S rg   )r  rq  rk  r  s    r!   get_output_embeddingsz$MraForMaskedLM.get_output_embeddings  s    xx##+++r#   c                     || j                   j                  _        |j                  | j                   j                  _        y rg   )r  rq  rk  ri  )r   new_embeddingss     r!   set_output_embeddingsz$MraForMaskedLM.set_output_embeddings  s,    '5$$2$7$7!r#   Nr   r  r   r   r   labelsr[  r\  r  c	           	         ||n| j                   j                  }| j                  |||||||      }
|
d   }| j                  |      }d}|Ft	               } ||j                  d| j                   j                        |j                  d            }|s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        Nr  r   r   r   r[  r\  r   r,   r   losslogitsr  r  )
r   r  rw  r  r   r  r   r   r  r  )r   r   r  r   r   r   r  r[  r\  r  r  rr  rt  masked_lm_lossloss_fctr   s                   r!   rj   zMraForMaskedLM.forward  s    & &1%<k$++B]B](())%'!5#  
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r#   NNNNNNNN)rx   ry   rz   _tied_weights_keysr   r  r  r   rC   r%  r  rZ  r   rj   r   r   s   @r!   r  r  z  s     )?*Q
,8  *..2.2,0-1&*,0#'/
<<$&/
 t+/
 t+	/

 llT)/
 ||d*/
 t#/
 #Tk/
 D[/
 
	/
 /
r#   r  c                   (     e Zd ZdZ fdZd Z xZS )MraClassificationHeadz-Head for sentence-level classification tasks.c                 4   t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _
        || _        y rg   )r   r   r   r  r   r  r   r   r   
num_labelsout_projr   r   s     r!   r   zMraClassificationHead.__init__  sg    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr#   c                     |d d dd d f   }| j                  |      }| j                  |      }t        | j                  j                     |      }| j                  |      }| j                  |      }|S )Nr   )r   r  r	   r   r3  r  )r   featuresr  xs       r!   rj   zMraClassificationHead.forward  se    Q1WLLOJJqM4;;))*1-LLOMM!r#   r   r   s   @r!   r  r    s    7r#   r  z
    MRA Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.
    )custom_introc                       e Zd Z fdZe	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dee	z  fd       Z
 xZS )MraForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        |      | _        | j                          y rg   )r   r   r  r  rw  r  
classifierr  r   s     r!   r   z%MraForSequenceClassification.__init__  sA      ++F#/7 	r#   Nr   r  r   r   r   r  r[  r\  r  c	           	         ||n| j                   j                  }| j                  |||||||      }
|
d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|s|f|
dd z   }||f|z   S |S t        |||
j                   |
j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr,   r  )r   r  rw  r  problem_typer  r@   rC   rE   r4   r   r  r   r  r   r   r  r  )r   r   r  r   r   r   r  r[  r\  r  r  rr  r  r  r  r   s                   r!   rj   z$MraForSequenceClassification.forward  s   & &1%<k$++B]B](())%'!5#  
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r#   r  )rx   ry   rz   r   r   rC   r%  r  rZ  r   rj   r   r   s   @r!   r  r    s      *..2.2,0-1&*,0#'@
<<$&@
 t+@
 t+	@

 llT)@
 ||d*@
 t#@
 #Tk@
 D[@
 
)	)@
 @
r#   r  c                       e Zd Z fdZe	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dee	z  fd       Z
 xZS )MraForMultipleChoicec                    t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        t	        j
                  |j                  d      | _        | j                          y r?  )
r   r   r  rw  r   r  r   pre_classifierr  r  r   s     r!   r   zMraForMultipleChoice.__init__0  s_     F# ii(:(:F<N<NO))F$6$6: 	r#   Nr   r  r   r   r   r  r[  r\  r  c	           	         ||n| j                   j                  }||j                  d   n|j                  d   }
|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  |||||||      }|d   }|dddf   }| j                  |      } t        j                         |      }| j                  |      }|j                  d|
      }d}|t               } |||      }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r,   r)   r  r   r  )r   r  rB   r  r.   rw  r  r   ReLUr  r   r   r  r  )r   r   r  r   r   r   r  r[  r\  r  num_choicesr  hidden_statepooled_outputr  reshaped_logitsr  r  r   s                      r!   rj   zMraForMultipleChoice.forward:  s   V &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 (())%'!5#  
 qz$QT*++M:!	-0/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r#   r  )rx   ry   rz   r   r   rC   r%  r  rZ  r   rj   r   r   s   @r!   r  r  .  s      *..2.2,0-1&*,0#'W
<<$&W
 t+W
 t+	W

 llT)W
 ||d*W
 t#W
 #TkW
 D[W
 
*	*W
 W
r#   r  c                       e Zd Z fdZe	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dee	z  fd       Z
 xZS )MraForTokenClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y rg   )r   r   r  r  rw  r   r   r   r   r  r   r  r  r   s     r!   r   z"MraForTokenClassification.__init__  si      ++F#zz&"<"<=))F$6$68I8IJ 	r#   Nr   r  r   r   r   r  r[  r\  r  c	           	         ||n| j                   j                  }| j                  |||||||      }
|
d   }| j                  |      }| j	                  |      }d}|t               }||j                  d      dk(  }|j                  d| j                        }t        j                  ||j                  d      t        j                  |j                        j                  |            } |||      }n2 ||j                  d| j                        |j                  d            }|s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r,   r   r  )r   r  rw  r   r  r   r  r  rC   wheretensorignore_indextype_asr   r  r  )r   r   r  r   r   r   r  r[  r\  r  r  rr  r  r  r  active_lossactive_logitsactive_labelsr   s                      r!   rj   z!MraForTokenClassification.forward  se   " &1%<k$++B]B](())%'!5#  
 "!*,,71')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/RY,F)-)9TGf$EvE$!//))	
 	
r#   r  )rx   ry   rz   r   r   rC   r%  r  rZ  r   rj   r   r   s   @r!   r  r    s    	  *..2.2,0-1&*,0#'8
<<$&8
 t+8
 t+	8

 llT)8
 ||d*8
 t#8
 #Tk8
 D[8
 
&	&8
 8
r#   r  c                   4    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dee	z  fd       Z
 xZS )MraForQuestionAnsweringc                     t         |   |       d|_        |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y )Nr&   )
r   r   r  r  rw  r   r  r   
qa_outputsr  r   s     r!   r   z MraForQuestionAnswering.__init__  s[      ++F#))F$6$68I8IJ 	r#   Nr   r  r   r   r   start_positionsend_positionsr[  r\  r  c
           	         |	|	n| j                   j                  }	| j                  |||||||	      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      }|j                  d      }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|	s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr  r   r   r,   r*   )r  r&   )r  start_logits
end_logitsr  r  )r   r  rw  r  splitr  r-   r.   clampr   r   r  r  )r   r   r  r   r   r   r  r  r[  r\  r  r  rr  r  r  r  
total_lossignored_indexr  
start_lossend_lossr   s                         r!   rj   zMraForQuestionAnswering.forward  s    &1%<k$++B]B](())%'!5#  
 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r#   )	NNNNNNNNN)rx   ry   rz   r   r   rC   r%  r  rZ  r   rj   r   r   s   @r!   r  r    s    
  *..2.2,0-1/3-1,0#';
<<$&;
 t+;
 t+	;

 llT);
 ||d*;
 ,;
 ||d*;
 #Tk;
 D[;
 
-	-;
 ;
r#   r  )r  r  r  r  r  r=  r  rv  rv   )NN)r(   r   r   )Jr   r   rC   r   torch.nnr   r   r    r   r{  activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   configuration_mrar   
get_loggerrx   r  r    r"   r=   rM   rQ   rY   rc   autogradFunctionre   r~   r   r   r   r   r  r   r   r  r(  r/  r9  r=  rL  rb  rg  ro  rv  r  r  r  r  r  r  r  __all__r|   r#   r!   <module>r     s       A A & ! 9  . 6  ) 
		H	%:&8.%OP%PsXENN33 X0]5>>22 ]. :%VP#)Z !"$%pf4BII 4n]ryy ]BBII 
299 
bii  		 ) : 
  
H $")) "!RYY ! / / /  T
! T
 T
n F
' F
 F
TBII * K
#5 K
K
\ c
- c
 c
L E
 2 E
 E
P I
0 I
 I
X	r#   