
    qi                     R   d Z ddlZddlZddlmZ ddlmZmZmZ ddlm	Z
 ddlmZ ddlmZ dd	lmZmZmZmZmZmZ dd
lmZ ddlmZ ddlmZmZmZmZmZ ddl m!Z!  ejD                  e#      Z$da%d Z&d Z'd Z(d Z) G d dejT                  jV                        Z, G d dejT                  jV                        Z- G d dej\                        Z/ G d dej\                        Z0 G d dej\                        Z1 G d dej\                        Z2 G d d ej\                        Z3 G d! d"ej\                        Z4 G d# d$e      Z5 G d% d&ej\                        Z6 G d' d(ej\                        Z7 G d) d*ej\                        Z8 G d+ d,ej\                        Z9e G d- d.e             Z:e G d/ d0e:             Z;e G d1 d2e:             Z< G d3 d4ej\                        Z= ed56       G d7 d8e:             Z>e G d9 d:e:             Z?e G d; d<e:             Z@e G d= d>e:             ZAg d?ZBy)@zPyTorch YOSO model.    N)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)ACT2FN)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward)auto_docstringis_kernels_availableis_ninja_availableis_torch_cuda_availablelogging   )
YosoConfigc                  b    t               st        d      ddlm}   | d      }|j                  ay )NzFkernels is not installed, please install it with `pip install kernels`r   )
get_kernelzkernels-community/yoso)r   ImportErrorintegrations.hub_kernelsr   lsh_cumulation)r   yosos     X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/yoso/modeling_yoso.pyload_cuda_kernelsr!   3   s-    !bcc6./D((N    c                     t        | t              r<g }| D ]3  }|j                         s|j                         }|j	                  |       5 |S | j                         s| j                         } | S N)
isinstancelistis_contiguous
contiguousappendinput_tensorsouttensors      r    to_contiguousr.   =   sm    -&# 	F'')**,JJv	 
**,)446Mr"   c                     t        | t              r<g }| D ]3  }|j                  t        j                  j                  |dd             5 |S t        j                  j                  | dd      S )N   )pdim)r%   r&   r)   r   
functional	normalizer*   s      r    r5   r5   K   se    -&# 	EFJJr}}..v.CD	E
}}&&}r&BBr"   c                 z   t        | j                               dk7  rt        d      t        |j                               dk7  rt        d      t        j                  | j                  d      | j                  d      ||z  | j
                        }dt        j                  || j
                        z  }t        j                  | |      j                  | j                  d      | j                  d      ||      }t        j                  ||      j                  |j                  d      |j                  d      ||      }|dkD  j                         }|dkD  j                         }	t        j                  ||z  d	      }
t        j                  |	|z  d	      }
|
j                         |
j                         fS )
Nr   zQuery has incorrect size.zKey has incorrect size.r   r0   devicer   r1   r3   )lensize
ValueErrortorchrandnr8   arangematmulreshapeintsum)querykeynum_hashhash_lenrmat	raise_powquery_projectionkey_projectionquery_binary
key_binary
query_hashs              r    hashingrO   U   sX   
5::<A455
388:!233;;uzz!}ejjmX5HQVQ]Q]^DU\\(5<<@@I||E4088A

STW_aij\\#t,44SXXa[#((1+xYabN$q(--/L 1$))+J<)3<J:	1r:J>>Z^^---r"   c                   ,    e Zd Zed        Zed        Zy)YosoCumulationc           
      N   |d   }dt        j                  t        j                  ||j                  dd                  t        j
                  z  z
  |z  }||d d d d d f   z  |d d d d d f   z  }t        j                  ||      }	| j                  ||||||       || _        |	S )Nhash_code_lenr   r1   )r=   acosr@   	transposemathpisave_for_backwardconfig)
ctx
query_maskkey_maskrD   rE   valuerZ   rS   expectationcumulation_values
             r    forwardzYosoCumulation.forwardi   s    /5::ell5#--B:O&PQTXT[T[[[`mm!Jq!Tz$::Xaqj=QQ <<U;j(KUS
r"   c                    t        |      }| j                  \  }}}}}}| j                  }|d   }	t        j                  ||j                  dd            |z  }
t        j                  |
|	dz  |z        }t        j                  |
j                  dd      |	dz  |z        }t        j                  |j                  dd      |      }d d |||d fS )NrS   r1   rT   r0   )r.   saved_tensorsrZ   r=   r@   rV   )r[   gradr\   r]   r_   rD   rE   r^   rZ   rS   weighted_exp
grad_querygrad_key
grad_values                 r    backwardzYosoCumulation.backwardv   s    T"?B?P?P<
Hk5#u/||D%//"b*AB[P\\,1Bc0IJ
<< 6 6r2 >QRARV[@[\\\+"7"7B"?F
T:xTAAr"   N__name__
__module____qualname__staticmethodra   ri    r"   r    rQ   rQ   h   s*    
  
  B Br"   rQ   c                   ,    e Zd Zed        Zed        Zy)YosoLSHCumulationc           
         |j                  d      |j                  d      k7  rt        d      |j                  d      |j                  d      k7  rt        d      |j                  d      |j                  d      k7  rt        d      |j                  d      |j                  d      k7  rt        d      |j                  d      |j                  d      k7  rt        d      |j                  d      |j                  d      k7  rt        d	      t        |||||g      \  }}}}}|j                  }|d
   }|d   }	t	        d|	z        }
|d   r t
        j                  ||||||	|d      \  }}nt        ||||	      \  }}t
        j                  ||||||
|d      }| j                  |||||||       || _	        |S )Nr   z6Query mask and Key mask differ in sizes in dimension 0z3Query mask and Query differ in sizes in dimension 0z1Query mask and Key differ in sizes in dimension 0z8Query mask and Value mask differ in sizes in dimension 0r   z,Key and Value differ in sizes in dimension 1r0   z,Query and Key differ in sizes in dimension 2rF   rS   use_fast_hash)
r;   r<   r.   is_cudarB   r   	fast_hashrO   rY   rZ   )r[   r\   r]   rD   rE   r^   rZ   use_cudarF   rS   hashtable_capacityquery_hash_codekey_hash_coder`   s                 r    ra   zYosoLSHCumulation.forward   s   ??1q!11UVV??1A.RSS??1!,PQQ??1A.WXX88A;%**Q-'KLL::a=CHHQK'KLL2?XW\^ach@i2j/
HeS%%%*%/ M!12/"-;-E-EE8S(M8UV.*O] .5UC=-Y*O])88=%I[]egh
 	j(O]TY[^`ef
r"   c                    t        |      }| j                  \  }}}}}}}| j                  }	|j                  }
|	d   }t	        d|z        }|	d   rft
        j                  |||||||
d      }t
        j                  |||||||dz  |z  ||
d
      }t
        j                  |||||||dz  |z  ||
d
      }ndt        j                  t        j                  ||j                  dd                  t        j                  z  z
  |z  }||d d d d d f   z  |d d d d d f   z  }t        j                  ||j                  dd            |z  }t        j                  ||dz  |z        }t        j                  |j                  dd      |dz  |z        }t        j                  |j                  dd      |      }d d |||d fS )NrS   r0   lsh_backwardr      r1   rT   )r.   rc   rZ   rt   rB   r   lsh_weighted_cumulationr=   rU   r@   rV   rW   rX   )r[   rd   r\   r]   rx   ry   rD   rE   r^   rZ   rv   rS   rw   rh   rf   rg   r_   re   s                     r    ri   zYosoLSHCumulation.backward   s   T"RURcRcO
Ho}eS%<</ M!12.!'66-_dL^`hjkJ (??"c)"J &=="e+"H uzz%,,ucmmBPR>S*TUX\X_X___dqqK%
1a:(>>!TST*AUUK <<eoob".EFTLl]Q5F#4MNJ||L$:$:2r$B]UVEVZ_D_`Hk&;&;B&CTJJT:xTAAr"   Nrj   ro   r"   r    rq   rq      s+    #  # J .B .Br"   rq   c                   *     e Zd ZdZ fdZddZ xZS )YosoEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 P   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  dz   |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      dz   d       | j#                  dt%        j*                  | j,                  j/                         t$        j0                  | j,                  j2                  	      d       y )
N)padding_idxr0   epsposition_idsr   r1   F)
persistenttoken_type_idsdtyper8   )super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr=   r?   expandzerosr   r;   longr8   selfrZ   	__class__s     r    r   zYosoEmbeddings.__init__   s7   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NQR0RTZTfTf#g %'\\&2H2H&J\J\%]"f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWX[\\in 	 	
 	KK))..0

4K\K\KcKcd 	 	
r"   c                 6   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	||	z   }
| j                  |      }|
|z  }
| j                  |
      }
| j                  |
      }
|
S )Nr1   r   r   r   r   )r;   r   hasattrr   r   r=   r   r   r8   r   r   r   r   r   )r   	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s               r    ra   zYosoEmbeddings.forward   s/    #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
"66|D))
^^J/
\\*-
r"   )NNNNrk   rl   rm   __doc__r   ra   __classcell__r   s   @r    r   r      s    Q
& r"   r   c                   &     e Zd Z fdZddZ xZS )YosoSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      t        d u}t               rt               r|s	 t                |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t!        j"                  |j                  | j                        | _        t!        j"                  |j                  | j                        | _        t!        j"                  |j                  | j                        | _        t!        j*                  |j,                        | _        |j0                  | _        |j2                  | _        |j4                  d u| _        |j8                  | _        |j:                  | _        |j<                  | _        | j2                  | j8                  | j:                  | j<                  d| _        |j4                  Zt!        j@                  |j                  |j                  |j4                  df|j4                  d	z  dfd
|j                        | _!        y y # t        $ r#}t        j                  d|        Y d }~1d }~ww xY w)Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zGCould not load the custom kernel for multi-scale deformable attention: )rS   rs   rF   r{   r   r0   F)in_channelsout_channelskernel_sizepaddingbiasgroups)"r   r   r   num_attention_headsr   r<   r   r   r   r!   	ExceptionloggerwarningrB   attention_head_sizeall_head_sizer   LinearrD   rE   r^   r   attention_probs_dropout_probr   use_expectationrS   conv_windowuse_convrs   rF   r{   
lsh_configConv2dconv)r   rZ   kernel_loadeder   s       r    r   zYosoSelfAttention.__init__  sS    : ::a?PVXhHi#F$6$6#7 8 445Q8  'd2"$);)=mn!# $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF%55#11**$6#11"// "//!// --	
 )		"66#77#//3++q0!411DI *7  n!hijhklmmns   =
J 	J=J88J=c                 H   |j                   \  }}}| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }	| j                  r| j                  |	|d d d d d d f   z        }
|j                         \  }}}}|j                  ||z  ||      }|j                  ||z  ||      }|	j                  ||z  ||      }	d|dz  z   }|j                  d      j                  |d      j                  ||z  |      j                         }d}| j                  s||k  r||z  |||z
  f}t!        j"                  |t!        j$                  ||j&                        gd      }t!        j"                  |t!        j$                  ||j&                        gd      }t!        j"                  |	t!        j$                  ||	j&                        gd      }	| j                  s| j(                  rt+        ||g      \  }}| j                  r%t,        j/                  |||||	| j0                        }n$t2        j/                  |||||	| j0                        }| j                  s||k  r|d d d d d |f   }t+        |      }|j                  ||||      }| j                  r|
z  }|j5                  d	ddd
      j7                         }|j                         d d | j8                  fz   } |j                  | }|r||f}|S |f}|S )Nr1   r   r0   g      ?g     @r9       r7   r   r   rT   )shaperD   viewr   r   rV   rE   r^   r   r   r;   rA   	unsqueezerepeat_interleaverB   r   r=   catr   r8   trainingr5   rQ   applyr   rq   permuter(   r   )r   hidden_statesattention_maskoutput_attentions
batch_sizer   _query_layer	key_layervalue_layerconv_value_layer	num_headsseq_lenhead_dimgpu_warp_sizepad_sizecontext_layernew_context_layer_shapeoutputss                      r    ra   zYosoSelfAttention.forwardK  s   $1$7$7!
JJJ}%T*b$":":D<T<TUYq!_ 	 HH]#T*b$":":D<T<TUYq!_ 	 JJ}%T*b$":":D<T<TUYq!_ 	 ==#yy~aqRVFV7W)WX3>3C3C3E0
Iw!))*y*@'8T%%j9&<gxP	!))*y*@'8T~77$$Q'ya0WZ)+W5SU	 	 $$(]*B!I-w8PPH))KK1C1CD K 		KK1A1AB I  ))KK1C1CD K 4==%.Y/G%H"K*00YUYUdUdM .33YUYUdUdM $$(]*B)!Q		/:M!-0%--j)WhW==--M%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD4E=-0 MZK[r"   NFrk   rl   rm   r   ra   r   r   s   @r    r   r     s    .`\r"   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )YosoSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )r   r   r   r   r   denser   r   r   r   r   r   s     r    r   zYosoSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r"   r   input_tensorreturnc                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r$   r   r   r   r   r   r   s      r    ra   zYosoSelfOutput.forward  7    

=1]3}|'CDr"   rk   rl   rm   r   r=   Tensorra   r   r   s   @r    r   r     1    >U\\  RWR^R^ r"   r   c                   &     e Zd Z fdZddZ xZS )YosoAttentionc                 b    t         |           t        |      | _        t	        |      | _        y r$   )r   r   r   r   r   outputr   s     r    r   zYosoAttention.__init__  s&    %f-	$V,r"   c                 h    | j                  |||      }| j                  |d   |      }|f|dd  z   }|S )Nr   r   )r   r   )r   r   r   r   self_outputsattention_outputr   s          r    ra   zYosoAttention.forward  sC    yy@QR;;|AF#%QR(88r"   r   r   r   s   @r    r   r     s    -
r"   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )YosoIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r$   )r   r   r   r   r   intermediate_sizer   r%   
hidden_actstrr	   intermediate_act_fnr   s     r    r   zYosoIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r"   r   r   c                 J    | j                  |      }| j                  |      }|S r$   )r   r  r   r   s     r    ra   zYosoIntermediate.forward  s&    

=100?r"   r   r   s   @r    r   r     s#    9U\\ ell r"   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )
YosoOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r   r   r   r   r   r   r   r   r   r   r   r   r   s     r    r   zYosoOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r"   r   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r$   r   r   s      r    ra   zYosoOutput.forward  r   r"   r   r   s   @r    r  r    r   r"   r  c                   ,     e Zd Z fdZddZd Z xZS )	YosoLayerc                     t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        t        |      | _        t        |      | _
        y Nr   )r   r   chunk_size_feed_forwardseq_len_dimr   	attentionadd_cross_attentionr   intermediater  r   r   s     r    r   zYosoLayer.__init__  sW    '-'E'E$&v.#)#=#= ,V4 (r"   c                     | j                  |||      }|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S )N)r   r   r   )r  r   feed_forward_chunkr  r  )r   r   r   r   self_attention_outputsr   r   layer_outputs           r    ra   zYosoLayer.forward  sh    !%~ar!s1!4(,0##T%A%A4CSCSUe
  /G+r"   c                 L    | j                  |      }| j                  ||      }|S r$   )r  r   )r   r   intermediate_outputr  s       r    r  zYosoLayer.feed_forward_chunk  s,    "//0@A{{#68HIr"   r   )rk   rl   rm   r   ra   r  r   r   s   @r    r	  r	    s    )r"   r	  c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )YosoEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
r   r   rZ   r   
ModuleListrangenum_hidden_layersr	  layergradient_checkpointing)r   rZ   r   r   s      r    r   zYosoEncoder.__init__  sN    ]]uVE]E]?^#_!If$5#_`
&+# $`s   A#c                     |rdnd }|rdnd }t        | j                        D ](  \  }}	|r||fz   } |	|||      }
|
d   }|s ||
d   fz   }* |r||fz   }|st        d |||fD              S t        |||      S )Nro   r   r   c              3   &   K   | ]	  }||  y wr$   ro   ).0vs     r    	<genexpr>z&YosoEncoder.forward.<locals>.<genexpr>"  s     mq_`_lms   )last_hidden_stater   
attentions)	enumerater  tupler   )r   r   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsilayer_modulelayer_outputss              r    ra   zYosoEncoder.forward	  s     #7BD$5b4(4 	POA|#$58H$H!(HYZM)!,M &9]1=M<O&O#	P   1]4D Dm]4EGZ$[mmm1++*
 	
r"   )NFFTr   r   s   @r    r  r    s    , "
r"   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )YosoPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r   )r   r   r   r   r   r   r%   r   r   r	   transform_act_fnr   r   r   s     r    r   z$YosoPredictionHeadTransform.__init__,  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr"   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r$   )r   r2  r   r  s     r    ra   z#YosoPredictionHeadTransform.forward5  s4    

=1--m<}5r"   r   r   s   @r    r0  r0  +  s$    UU\\ ell r"   r0  c                   $     e Zd Z fdZd Z xZS )YosoLMPredictionHeadc                    t         |           t        |      | _        t	        j
                  |j                  |j                  d      | _        t	        j                  t        j                  |j                              | _        y )NT)r   )r   r   r0  	transformr   r   r   r   decoder	Parameterr=   r   r   r   s     r    r   zYosoLMPredictionHead.__init__>  s[    4V< yy!3!3V5F5FTRLLV->->!?@	r"   c                 J    | j                  |      }| j                  |      }|S r$   )r7  r8  r  s     r    ra   zYosoLMPredictionHead.forwardG  s$    }5]3r"   r   r   s   @r    r5  r5  =  s    Ar"   r5  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )YosoOnlyMLMHeadc                 B    t         |           t        |      | _        y r$   )r   r   r5  predictionsr   s     r    r   zYosoOnlyMLMHead.__init__O  s    /7r"   sequence_outputr   c                 (    | j                  |      }|S r$   )r>  )r   r?  prediction_scoress      r    ra   zYosoOnlyMLMHead.forwardS  s     ,,_=  r"   r   r   s   @r    r<  r<  N  s#    8!u|| ! !r"   r<  c                   t     e Zd ZU eed<   dZdZ ej                         de	j                  f fd       Z xZS )YosoPreTrainedModelrZ   r   Tmodulec                    t         |   |       t        |t              r t	        j
                  |j                         yt        |t              r|t	        j                  |j                  t        j                  |j                  j                  d         j                  d      dz          t	        j
                  |j                         yy)zInitialize the weightsr1   r   r0   N)r   _init_weightsr%   r5  initzeros_r   r   copy_r   r=   r?   r   r   r   )r   rD  r   s     r    rF  z!YosoPreTrainedModel._init_weights^  s     	f%f23KK$/JJv**ELL9L9L9R9RSU9V,W,^,^_f,gjk,klKK--. 0r"   )rk   rl   rm   r   __annotations__base_model_prefixsupports_gradient_checkpointingr=   no_gradr   ModulerF  r   r   s   @r    rC  rC  X  s:    &*#U]]_/BII / /r"   rC  c                       e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
e	dz  de	dz  de	dz  de
ez  fd       Z xZS )	YosoModelc                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r$   )r   r   rZ   r   r   r  encoder	post_initr   s     r    r   zYosoModel.__init__k  s;     (0"6* 	r"   c                 .    | j                   j                  S r$   r   r   r   s    r    get_input_embeddingszYosoModel.get_input_embeddingsu  s    ...r"   c                 &    || j                   _        y r$   rU  )r   r^   s     r    set_input_embeddingszYosoModel.set_input_embeddingsx  s    */'r"   Nr   r   r   r   r   r   r(  r)  r   c	                 p   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||t	        d      |#| j                  ||       |j                         }
n!||j                         d d }
nt	        d      |
\  }}||j                  n|j                  }|t        j                  ||f|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  ||      }|}n&t        j                  |
t        j                  |      }| j                  ||||      }| j!                  |||||      }|d	   }|s	|f|d
d  z   S t#        ||j$                  |j&                  |j(                        S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer1   z5You have to specify either input_ids or inputs_embedsr7   r   r   )r   r   r   r   )r   r   r(  r)  r   r   )r$  r   r%  cross_attentions)rZ   r   r(  use_return_dictr<   %warn_if_padding_and_no_attention_maskr;   r8   r=   onesr   r   r   r   r   r   rR  r   r   r%  r[  )r   r   r   r   r   r   r   r(  r)  kwargsr   r   r   r8   r   r   embedding_outputencoder_outputsr?  s                      r    ra   zYosoModel.forward{  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z??%)'	 + 
 ,,)/!5# ' 
 *!,#%(;;;1-)77&11,==	
 	
r"   )NNNNNNNN)rk   rl   rm   r   rW  rY  r   r=   r   boolr'  r   ra   r   r   s   @r    rP  rP  i  s    /0  *..2.2,0-1)-,0#'A
<<$&A
 t+A
 t+	A

 llT)A
 ||d*A
  $;A
 #TkA
 D[A
 
3	3A
 A
r"   rP  c                   6    e Zd ZdddZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 ddej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  de
dz  de
dz  de
dz  deez  fd       Z xZS )YosoForMaskedLMzcls.predictions.biasz&yoso.embeddings.word_embeddings.weight)zcls.predictions.decoder.biaszcls.predictions.decoder.weightc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r$   )r   r   rP  r   r<  clsrS  r   s     r    r   zYosoForMaskedLM.__init__  s4     f%	"6* 	r"   c                 B    | j                   j                  j                  S r$   )rf  r>  r8  rV  s    r    get_output_embeddingsz%YosoForMaskedLM.get_output_embeddings  s    xx##+++r"   c                     || j                   j                  _        |j                  | j                   j                  _        y r$   )rf  r>  r8  r   )r   new_embeddingss     r    set_output_embeddingsz%YosoForMaskedLM.set_output_embeddings  s,    '5$$2$7$7!r"   Nr   r   r   r   r   labelsr   r(  r)  r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }d}|Ft	               } ||j                  d| j                   j                        |j                  d            }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        Nr   r   r   r   r   r(  r)  r   r1   r   losslogitsr   r%  )
rZ   r\  r   rf  r   r   r   r   r   r%  )r   r   r   r   r   r   rl  r   r(  r)  r_  r   r?  rA  masked_lm_lossloss_fctr   s                    r    ra   zYosoForMaskedLM.forward  s    ( &1%<k$++B]B]))))%'/!5#  	
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r"   	NNNNNNNNN)rk   rl   rm   _tied_weights_keysr   rh  rk  r   r=   r   rb  r'  r   ra   r   r   s   @r    rd  rd    s     )?*R
,8  *..2.2,0-1&*)-,0#'1
<<$&1
 t+1
 t+	1

 llT)1
 ||d*1
 t#1
  $;1
 #Tk1
 D[1
 
	1
 1
r"   rd  c                   (     e Zd ZdZ fdZd Z xZS )YosoClassificationHeadz-Head for sentence-level classification tasks.c                 4   t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _
        || _        y r$   )r   r   r   r   r   r   r   r   r   
num_labelsout_projrZ   r   s     r    r   zYosoClassificationHead.__init__  sg    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr"   c                     |d d dd d f   }| j                  |      }| j                  |      }t        | j                  j                     |      }| j                  |      }| j                  |      }|S )Nr   )r   r   r	   rZ   r   rz  )r   featuresr_  xs       r    ra   zYosoClassificationHead.forward  se    Q1WLLOJJqM4;;))*1-LLOMM!r"   r   r   s   @r    rw  rw    s    7r"   rw  z
    YOSO Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.
    )custom_introc                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  dee	z  fd       Z
 xZS )YosoForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        |      | _        | j                          y r$   )r   r   ry  rP  r   rw  
classifierrS  r   s     r    r   z&YosoForSequenceClassification.__init__(  sA      ++f%	08 	r"   Nr   r   r   r   r   rl  r   r(  r)  r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|	s|f|dd z   }||f|z   S |S t        |||j                   |j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nrn  r   r   
regressionsingle_label_classificationmulti_label_classificationr1   ro  )rZ   r\  r   r  problem_typery  r   r=   r   rB   r   squeezer   r   r   r   r   r%  )r   r   r   r   r   r   rl  r   r(  r)  r_  r   r?  rq  rp  rs  r   s                    r    ra   z%YosoForSequenceClassification.forward1  s   ( &1%<k$++B]B]))))%'/!5#  	
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r"   rt  )rk   rl   rm   r   r   r=   r   rb  r'  r   ra   r   r   s   @r    r  r  !  s      *..2.2,0-1&*)-,0#'B
<<$&B
 t+B
 t+	B

 llT)B
 ||d*B
 t#B
  $;B
 #TkB
 D[B
 
)	)B
 B
r"   r  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  dee	z  fd       Z
 xZS )YosoForMultipleChoicec                    t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        t	        j
                  |j                  d      | _        | j                          y r  )
r   r   rP  r   r   r   r   pre_classifierr  rS  r   s     r    r   zYosoForMultipleChoice.__init__y  s_     f%	 ii(:(:F<N<NO))F$6$6: 	r"   Nr   r   r   r   r   rl  r   r(  r)  r   c
           
         |	|	n| j                   j                  }	||j                  d   n|j                  d   }|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	      }|d   }|dddf   }| j                  |      } t        j                         |      }| j                  |      }|j                  d|      }d}|t               } |||      }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r1   rT   rn  r   ro  )rZ   r\  r   r   r;   r   r  r   ReLUr  r   r   r   r%  )r   r   r   r   r   r   rl  r   r(  r)  r_  num_choicesr   hidden_statepooled_outputrq  reshaped_logitsrp  rs  r   s                       r    ra   zYosoForMultipleChoice.forward  s   X &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 ))))%'/!5#  	
 qz$QT*++M:!	-0/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r"   rt  )rk   rl   rm   r   r   r=   r   rb  r'  r   ra   r   r   s   @r    r  r  w  s      *..2.2,0-1&*)-,0#'Y
<<$&Y
 t+Y
 t+	Y

 llT)Y
 ||d*Y
 t#Y
  $;Y
 #TkY
 D[Y
 
*	*Y
 Y
r"   r  c                        e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	edz  d
edz  dedz  dee	z  fd       Z
 xZS )YosoForTokenClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r$   )r   r   ry  rP  r   r   r   r   r   r   r   r  rS  r   s     r    r   z#YosoForTokenClassification.__init__  si      ++f%	zz&"<"<=))F$6$68I8IJ 	r"   Nr   r   r   r   r   rl  r   r(  r)  r   c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }|d   }| j                  |      }| j	                  |      }d}|t               }||j                  d      dk(  }|j                  d| j                        }t        j                  ||j                  d      t        j                  |j                        j                  |            } |||      }n2 ||j                  d| j                        |j                  d            }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nrn  r   r1   r   ro  )rZ   r\  r   r   r  r   r   ry  r=   wherer-   ignore_indextype_asr   r   r%  )r   r   r   r   r   r   rl  r   r(  r)  r_  r   r?  rq  rp  rs  active_lossactive_logitsactive_labelsr   s                       r    ra   z"YosoForTokenClassification.forward  sh   $ &1%<k$++B]B]))))%'/!5#  	
 "!*,,71')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/RY,F)-)9TGf$EvE$!//))	
 	
r"   rt  )rk   rl   rm   r   r   r=   r   rb  r'  r   ra   r   r   s   @r    r  r    s    	  *..2.2,0-1&*)-,0#':
<<$&:
 t+:
 t+	:

 llT):
 ||d*:
 t#:
  $;:
 #Tk:
 D[:
 
&	&:
 :
r"   r  c                   @    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
edz  dedz  dedz  dee	z  fd       Z
 xZS )YosoForQuestionAnsweringc                     t         |   |       d|_        |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y )Nr0   )
r   r   ry  rP  r   r   r   r   
qa_outputsrS  r   s     r    r   z!YosoForQuestionAnswering.__init__-  s[      ++f%	))F$6$68I8IJ 	r"   Nr   r   r   r   r   start_positionsend_positionsr   r(  r)  r   c           
         |
|
n| j                   j                  }
| j                  |||||||	|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      }|j                  d      }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|
s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nrn  r   r   r1   r9   )r  r0   )rp  start_logits
end_logitsr   r%  )rZ   r\  r   r  splitr  r:   r;   clampr   r   r   r%  )r   r   r   r   r   r   r  r  r   r(  r)  r_  r   r?  rq  r  r  
total_lossignored_indexrs  
start_lossend_lossr   s                          r    ra   z YosoForQuestionAnswering.forward9  s    &1%<k$++B]B]))))%'/!5#  	
 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r"   )
NNNNNNNNNN)rk   rl   rm   r   r   r=   r   rb  r'  r   ra   r   r   s   @r    r  r  +  s    
  *..2.2,0-1/3-1)-,0#'=
<<$&=
 t+=
 t+	=

 llT)=
 ||d*=
 ,=
 ||d*=
  $;=
 #Tk=
 D[=
 
-	-=
 =
r"   r  )rd  r  r  r  r  r	  rP  rC  )Cr   rW   r=   r   torch.nnr   r   r    r   rG  activationsr	   modeling_layersr
   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   r   r   configuration_yosor   
get_loggerrk   r   r   r!   r.   r5   rO   autogradFunctionrQ   rq   rN  r   r   r   r   r   r  r	  r  r0  r5  r<  rC  rP  rd  rw  r  r  r  r  __all__ro   r"   r    <module>r     sT       A A & ! 9  . 6  + 
		H	% )C.&BU^^,, B>VB// VBt6RYY 6rM		 MbRYY 
BII 
ryy   * :%
")) %
R")) $299 "!bii ! // / /  S
# S
 S
l H
) H
 H
VRYY * M
$7 M
M
` e
/ e
 e
P G
!4 G
 G
T K
2 K
 K
\	r"   