
    qi}                     ^   d Z ddlZddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZmZmZmZmZ ddlmZ  ej2                  e      Zdad Z G d dej<                  j>                        Z d'dZ!d'dZ" G d dejF                        Z$ G d dejF                        Z% G d de      Z&e G d de             Z'e ed       G d de                    Z(e ed       G d d e                    Z)e G d! d"e'             Z* ed#       G d$ d%e'e
             Z+g d&Z,y)(zPyTorch RWKV model.    N)	dataclass)nn   )initialization)GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringis_bitsandbytes_availableis_kernels_availableis_ninja_availableis_torch_cuda_availablelogging   )
RwkvConfigc                 `    t               st        d      ddlm}  |d      a| t        _        y )NzFkernels is not installed, please install it with `pip install kernels`r   )
get_kernelzkernels-community/rwkv)r   ImportErrorintegrations.hub_kernelsr   rwkv_cuda_kernelmax_seq_length)context_lengthr   s     X/opt/pipecat/venv/lib/python3.12/site-packages/transformers/models/rwkv/modeling_rwkv.pyload_wkv_cuda_kernelr   -   s,    !bcc6!":;&4#    c                   0    e Zd Zedd       Zedd       Zy)RwkvLinearAttentionNc                    |j                         \  }}}	|t        j                  kD  r t        d| dt        j                   d      ||	z  t	        |	d      z  dk7  rt        d| d|	 dt	        |	d       d	      |j
                  | _        |j                  j                  d
k7  sK|j                  j                  d
k7  s2|j                  j                  d
k7  s|j                  j                  d
k7  rt        d      t        j                  |j                         j                                }|j
                  t        j                  k(  r0|j                         }|j                         }|j                         }|j                         }|j                         }|j                         }t        j                  |t        j                        }
|s||Vt        j                   ||	dt        j"                  |j                  t        j                        }|d d d d dfxx   dz  cc<   nBt        j$                  |D cg c]  }|j'                  d       c}d      j                         }|j
                  t        j(                  k(  rt        j*                  }nt        j,                  } ||||||
|       nI|j
                  t        j(                  k(  rt        j.                  nt        j0                  } ||||||
       | j3                  |||||
       |4t        j4                  |dd      D cg c]  }|j7                  d       }}|
j9                  | j                        |fS c c}w c c}w )NzCannot process a batch with z+ tokens at the same time, use a maximum of z with this model.    r   zThe product of batch size (z) and hidden size (z") needs to be a round multiple of .cudazUCalling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.memory_formatr   )dtypedevicer$      籡*G)dim)sizer   r   
ValueErrorminr%   input_dtyper&   typetorchexpfloat
contiguousfloat16
empty_likecontiguous_formatzerosfloat32cat	unsqueezebfloat16forward_with_state_bf16forward_with_stateforward_bf16forwardsave_for_backwardchunksqueezeto)ctx
time_decay
time_firstkeyvaluestatereturn_state
batch_sizeseq_lenhidden_sizeoutputsforward_funcs                r   r>   zRwkvLinearAttention.forward9   s   +.88:(
G[%444.wi7b#2233DF  #c+r&::a?-j\9L[M Z";34A7 
 )) ""f,  %%/zz&(||  F*tuuii
 0 0 2 = = ?@@
99%#))+J))+CKKME**,
nn  "!!#U5L5LM5,}--::"'"9"9 aAg$&		5"Aa1;;q>"AqITTVyyENN*/GG/BBZeVUK<?II<W+88]m]u]uLZeVDj*c5&I+0;;uaQ+GHaQYYq\HEHyy)500 #B Is   ?M+4M0c                 <   | j                   }| j                  \  }}}}}t        j                  |t        j                  |t        j
                  k(  rt        j
                  nt        j                        }	t        j                  |t        j                        }
t        j                  |t        j                        }t        j                  |t        j                        }|t        j                  k(  r|j                         }|t        j
                  k(  rt        j                  nt        j                  } |||||||j                         |	|
||
       |	j                  |      |
j                  |      |j                  |      |j                  |      d d fS )N)r$   r%   r#   )r-   saved_tensorsr/   r4   r5   r:   r7   r3   r1   r   backward_bf16backwardr2   rB   )rC   g_outputg_stater-   rD   rE   rF   rG   rM   g_time_decayg_time_firstg_keyg_valuebackward_funcs                 r   rS   zRwkvLinearAttention.backwardx   sJ    oo585F5F2
JUF''11$/5>>$A%..u}}

 ''
%BYBYZ  E4K4KL""58O8OP%--'~~'H:E:W(66]m]v]v!	
 OOK(OOK(HH[!JJ{#
 	
r   NFN)__name__
__module____qualname__staticmethodr>   rS    r   r   r   r   8   s)    <1 <1| %
 %
r   r   c                    |j                         \  }}}t        j                  |      }|t        j                  |d d df   t        j                        }	t        j                  |d d df   t        j                        }
t        j                  |d d df   t        j                        dz
  }n|\  }	}
}t        j                  |        } t        |      D ]  }|d d |f   j                         }|d d |f   }t        j                  |||z         }t        j                  ||z
        }t        j                  ||z   |z
        }||	z  ||z  z   }||
z  |z   }||z  j                  |j                        |d d |f<   t        j                  || z   |      }t        j                  || z   |z
        }t        j                  ||z
        }||	z  ||z  z   }	||
z  |z   }
|} |s||	|
|g}||fS )Nr   )r%   r(   )
r*   r/   
zeros_liker7   r0   ranger1   maximumrB   r%   )rD   rE   rF   rG   rH   rI   _
seq_lengthrM   	num_state	den_state	max_statecurrent_indexcurrent_keycurrent_valuemax_for_outpute1e2	numeratordenominatormax_for_states                        r   rwkv_linear_attention_cpurt      s    xxzAz1c"F}$$SAYemmD	$$SAYemmD	$$SAYemmDtK	*/'	9i
 ))J''Jz* "!]*+113a./ y+
2JKYYy>12YY{Z/.@ANR-%77	9nr)$-$;#?#?#Mq-  i*&<kJYYy:-=>YY{]23NR-%77	NR'	!	%"( u(Iy15=r   c                     t        d | |||fD              }|j                  d      dk(  }t        |s|rt        | |||||      S t        j                  | |||||      S )Nc              3   N   K   | ]  }|j                   j                  d k7    yw)r"   N)r&   r.   ).0ts     r   	<genexpr>z(rwkv_linear_attention.<locals>.<genexpr>   s     Xa!((--6)Xs   #%r   rH   rI   )anyr*   r   rt   r   apply)rD   rE   rF   rG   rH   rI   no_cuda	one_tokens           r   rwkv_linear_attentionr      sm    XJ
CQV3WXXG q I7i(ZeSXgstt"((ZeUT`aar   c                   0     e Zd Zd fd	ZddZddZ xZS )RwkvSelfAttentionc                 r   t         |           || _        t        d uxr t        j                  |j
                  k(  }t               r"t               r|s	 t        |j
                         || _        |j                  }|j                  |j                  n|}|| _        t        j                   t#        j$                  |            | _        t        j                   t#        j$                  |            | _        t        j                   t#        j$                  dd|            | _        t        j                   t#        j$                  dd|            | _        t        j                   t#        j$                  dd|            | _        t        j0                  d      | _        t        j4                  ||d      | _        t        j4                  ||d      | _        t        j4                  ||d      | _        t        j4                  ||d      | _        y # t        $ r t        j                  d       Y w xY w)Nz9Could not load the custom CUDA kernel for RWKV attention.r   r   r   r   Fbias)super__init__configr   r   r   r   r   r   	Exceptionloggerinfolayer_idrL   attention_hidden_sizer   	Parameterr/   emptyrD   rE   time_mix_keytime_mix_valuetime_mix_receptance	ZeroPad2d
time_shiftLinearrF   rG   
receptancerM   )selfr   r   kernel_loadedrL   r   	__class__s         r   r   zRwkvSelfAttention.__init__   s   (4q9I9X9X\b\q\q9q$;$=mY$V%:%:; !((,2,H,H,TF((Ze 	 &;",,u{{3H'IJ,,u{{3H'IJLLQ;)GH ll5;;q![+IJ#%<<Aq+0N#O ,,}599[*?eLYY{,AN
))K1FUSii 5{O)  YWXYs   H H65H6c                 p   |j                  d      dk(  r||d   d d d d | j                  f   }n3| j                  |      }| |d   d d d d | j                  f   |d d df<   || j                  z  |d| j                  z
  z  z   }|| j                  z  |d| j                  z
  z  z   }|| j
                  z  |d| j
                  z
  z  z   }| j                  |      }| j                  |      }t        j                  | j                  |            }| |d d df   |d   d d d d | j                  f<   ||||fS Nr   r   r   )r*   r   r   r   r   r   rF   rG   r/   sigmoidr   )r   hiddenrH   shiftedrF   rG   r   s          r   extract_key_valuez#RwkvSelfAttention.extract_key_value   s<   ;;q>Q5#4Ahq!T]]23Goof-G  %aAt}})< =1t(((7a$:K:K6K+LL,,,w!d>Q>Q:Q/RRd666AH`H`D`9aa
hhsm

5!]]4??:#>?
,21b5ME!HQ4==()3u,,r   c                      j                  ||      \  }}}}|t         fd|dd  D              nd }t         j                   j                  ||||      \  }}|T|d   |d   d d d d  j
                  f<   |d   |d   d d d d  j
                  f<   |d   |d   d d d d  j
                  f<    j                  ||z        |fS )	NrH   c              3   J   K   | ]  }|d d d d j                   f     y wr\   r   )rw   rN   r   s     r   ry   z,RwkvSelfAttention.forward.<locals>.<genexpr>  s!     FqAaDMM12Fs    #r'   rz   r   r   r      )r   tupler   rD   rE   r   rM   )	r   r   rH   	use_cacher   rF   rG   layer_staterwkvs	   `        r   r>   zRwkvSelfAttention.forward  s    (,(>(>vU(>(S%
CJOJ[eFE!"IFFae1OOOO"
k ",7NE!HQ4==(),7NE!HQ4==(),7NE!HQ4==(){{:,-u44r   r   r\   r[   )r]   r^   r_   r   r   r>   __classcell__r   s   @r   r   r      s    P<-&5r   r   c                   (     e Zd Zd fd	ZddZ xZS )RwkvFeedForwardc                 B   t         |           || _        || _        |j                  }|j
                  |j
                  nd|j                  z  }t        j                  d      | _        t        j                  t        j                  dd|            | _        t        j                  t        j                  dd|            | _        t        j                  ||d      | _        t        j                  ||d      | _        t        j                  ||d      | _        y )Nr   r   r   Fr   )r   r   r   r   rL   intermediate_sizer   r   r   r   r/   r   r   r   r   rF   r   rG   )r   r   r   rL   r   r   s        r   r   zRwkvFeedForward.__init__   s     (((.(@(@(LF$$RSV\VhVhRh 	 ,,}5LLQ;)GH#%<<Aq+0N#O 99[*;%H))K5IYY0+EJ
r   c                 z   |j                  d      dk(  r||d   d d d d | j                  f   }n3| j                  |      }| |d   d d d d | j                  f   |d d df<   || j                  z  |d| j                  z
  z  z   }|| j                  z  |d| j                  z
  z  z   }t        j                  t        j                  | j                  |                  }| j                  |      }t        j                  | j                  |            }| |d d df   |d   d d d d | j                  f<   ||z  |fS r   )r*   r   r   r   r   r/   squarerelurF   rG   r   r   )r   r   rH   r   rF   r   rG   s          r   r>   zRwkvFeedForward.forward1  s)   ;;q>Q5#4Ahq!T]]23Goof-G  %aAt}})< =1t(((7a$:K:K6K+LLd666AH`H`D`9aa
ll5::dhhsm45

3]]4??:#>?
,21b5ME!HQ4==()E!5((r   r   r\   r]   r^   r_   r   r>   r   r   s   @r   r   r     s    K")r   r   c                   &     e Zd Z fdZddZ xZS )	RwkvBlockc                    t         |           || _        || _        |dk(  r0t	        j
                  |j                  |j                        | _        t	        j
                  |j                  |j                        | _	        t	        j
                  |j                  |j                        | _
        t        ||      | _        t        ||      | _        y )Nr   )eps)r   r   r   r   r   	LayerNormrL   layer_norm_epsilonpre_lnln1ln2r   	attentionr   feed_forward)r   r   r   r   s      r   r   zRwkvBlock.__init__F  s     q=,,v'9'9v?X?XYDK<< 2 28Q8QR<< 2 28Q8QR*68<+FH=r   c                    | j                   dk(  r| j                  |      }| j                  | j                  |      ||      \  }}||z   }| j	                  | j                  |      |      \  }}||z   }||f}|r||fz  }|S |dz  }|S )Nr   )rH   r   r   r\   )r   r   r   r   r   r   )r   r   rH   r   output_attentionsr   r   outputss           r   r>   zRwkvBlock.forwardT  s    ==A[[(F>>$((6*:%S\>]	5)#"//0@/Ne,&5/	|#G  wGr   )NFFr   r   s   @r   r   r   E  s    >r   r   c                   z    e Zd ZU eed<   dZdgZddgZdZdZ	 e
j                         dej                  fd       Zy	)
RwkvPreTrainedModelr   r   r   rD   rE   Tmodulec           	      J   t        |t              r|j                  }|j                  j                  }|j                  j
                  }|j                  }||dz
  z  }d||z  z
  }t        j                  t        |      D cg c]  }||z  	 c}|j                  j                  |j                  j                        }	|	ddddf   }	t        |      D 
cg c]  }
dd|
|dz
  z  dd|z  z   z  z  z    }}
t        j                  ||j                  j                  |j                  j                        }t        j                  t        |      D cg c]  }|dz   d	z  dz
   c}|j                  j                  |j                  j                        d
z  }t        j                   |j                  |       t        j                   |j                  t        j"                  |j                  t%        j&                  d      z  |z                t        j                   |j                  t        j(                  |	|             t        j                   |j*                  t        j(                  |	|      d|z  z          t        j                   |j,                  t        j(                  |	d
|z               yt        |t.              r|j                  }|j                  j                  }|j                  j
                  }d||z  z
  }t        j                  t        |      D cg c]  }||z  	 c}|j                  j                  |j                  j                        }	|	ddddf   }	t        j                   |j                  t        j(                  |	|             t        j                   |j,                  t        j(                  |	|             yt        |t0        j2                        r|j4                  j6                  }d}d}|j8                  t        j:                  |j8                         |d   |d   kD  rt%        j<                  |d   |d   z        }|d   | j                  j>                  k(  r|d   | j                  j
                  k(  rd
}||z  }t        j@                  |j4                  |       yt        |t0        jB                        r`|j4                  j6                  }dt%        j<                  tE        |d   |d               z  }t        j@                  |j4                  |       yt        |t0        jF                        r?t        jH                  |j4                         t        j:                  |j8                         yyc c}w c c}
w c c}w c c}w )zInitialize the weights.r   g      ?r%   r&   N   gffffff?g?r   g      ?g333333?r   )gaing-C6?)%
isinstancer   r   r   num_hidden_layersrL   r   r/   tensorrd   r   r%   r&   rD   rE   initcopy_	ones_likemathlogpowr   r   r   r   r   weightshaper   zeros_sqrt
vocab_sizeorthogonal_	Embeddingmaxr   ones_)r   r   r   r   rL   r   ratio_0_to_1ratio_1_to_almost0itime_weighthdecay_speedzigzagr   r   scales                   r   _init_weightsz!RwkvPreTrainedModel._init_weightsp  ss    f/0H & ? ? --33K$*$@$@!#'81'<=L!$3D(D!E,,*/*<=Q[=))//**11K
 &dD!m4K 45 Q!4q89sS<EW?WXXXK   ,,{&:K:K:Q:QZ`ZkZkZrZrsK.34I.JKa!eq[1_K ++11!,,33
   JJv((+6JJv((%//&:K:KdhhWZm:[^d:d*efJJv**EIIkCU,VWJJv,,eiiEW.X[^am[m.mnJJv11599[#PbJb3cd0H & ? ? --33K!$3D(D!E,,*/*<=Q[=))//**11K
 &dD!m4KJJv**EIIkCU,VWJJv11599[J\3]^		*MM''EDE{{&FKK(Qx%("yyqE!H!45Qx4;;111eAh$++BYBY6YEMDV]]6-MM''E$))Ca%($;<<DV]]6-JJv}}%KK$ .w > L* >s   VVV#V N)r]   r^   r_   r   __annotations__base_model_prefix_no_split_modules_keep_in_fp32_modulessupports_gradient_checkpointing_is_statefulr/   no_gradr   Moduler   ra   r   r   r   r   g  sT    $)<8&*#LU]]_I%BII I% I%r   r   z+
    Class for the RWKV model outputs.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZe	ej                     dz  ed<   dZ
eej                  df   dz  ed<   dZeej                  df   dz  ed<   y)
RwkvOutputa  
    state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.
    Nlast_hidden_staterH   .hidden_states
attentions)r]   r^   r_   __doc__r   r/   FloatTensorr   rH   listr   r   r   ra   r   r   r   r     sw     37u((4/6,0E4!!"T)0:>M5**C/047>7;Je'',-4;r   r   zK
    Base class for causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                     dz  ed<   dZeej                  df   dz  ed<   dZeej                  df   dz  ed<   y)	RwkvCausalLMOutputap  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.
    NlosslogitsrH   .r   r   )r]   r^   r_   r   r   r/   r   r   r   rH   r   r   r   r   ra   r   r   r   r     s     &*D%

d
")'+FE$+,0E4!!"T)0:>M5**C/047>7;Je'',-4;r   r   c                   
    e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  de
ej                     dz  d	edz  d
edz  dedz  dedz  deez  fd       Zd Zd Z xZS )	RwkvModelc           	         t         |   |       t        j                  |j                  |j
                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j
                        | _        d| _        d| _        | j!                          y c c}w )Nr   F)r   r   r   r   r   rL   
embeddings
ModuleListrd   r   r   blocksr   ln_outlayers_are_rescaledgradient_checkpointing	post_init)r   r   idxr   s      r   r   zRwkvModel.__init__  s     ,,v'8'8&:L:LMmmPUV\VnVnPo$pYv%D$pqll6#5#56#( &+# 	 %qs   &C	c                     | j                   S r\   r   r   s    r   get_input_embeddingszRwkvModel.get_input_embeddings  s    r   c                     || _         y r\   r  r   new_embeddingss     r   set_input_embeddingszRwkvModel.set_input_embeddings  s	    (r   N	input_idsattention_maskinputs_embedsrH   r   r   output_hidden_statesreturn_dictreturnc	           	      *   ||n| j                   j                  }||n| j                   j                  }||n#| j                  s| j                   j                  nd}||n| j                   j
                  }|t        j                  d       | j                  | j                  k(  r| j                          ||t        d      ||t        d      || j                  |      }|r||j                  d      | j                   j                  | j                   j                  f}
t        d      D cg c]A  }t!        j"                  |
|dk  r|j$                  nt         j&                  |j(                  d	C }}|d
xx   dz  cc<   | j*                  r%| j                  r|rt        j                  d       d}|}|rdnd}|rdnd}t-        | j.                        D ]o  \  }} |||||      \  }}}| j                  r=| j                   j0                  dkD  r$|dz   | j                   j0                  z  dk(  r|dz  }|r||fz   }|sj||fz   }q | j3                  |      }|r||fz   }|st5        d ||||fD              S t7        ||||      S c c}w )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        NFz<`attention_mask` was passed, but it is unused in this model.zDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedsr      r   r   r   gꌠ9Y>)FzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...ra   )rH   r   r   r'   c              3   &   K   | ]	  }||  y wr\   ra   )rw   xs     r   ry   z$RwkvModel.forward.<locals>.<genexpr>`  s     tqfgfsts   )r   rH   r   r   )r   r   r  trainingr   use_return_dictr   warning_oncer  _rescale_layersr+   r   r*   rL   r   rd   r/   r6   r%   r7   r&   r  	enumerater   rescale_everyr  r   r   )r   r  r  r  rH   r   r   r  r  kwargsr   r   r   all_self_attentionsall_hidden_statesr  blockr   s                     r   r>   zRwkvModel.forward  s   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]% ^_==D444  " ]%>cdd=#8TUU  OOI6M"''*DKK,C,CT[[EbEbcE
 q	  a-"5"5U]][h[o[oE  !HH&&4==##p "	%$5b4"6BD#DKK0 	JJC/4UiSd0,M5*
 ((KK--11W 9 99Q> - 1#$58H$H! &9ZM&I#!	J$ M2 1]4D Dt]E;LNa$bttt++*	
 	
[s   5AJc           	         | j                   | j                   k(  ry | j                  j                  dkD  rt	        j
                         5  t        | j                        D ]  \  }}| j                  r|j                  j                  j                  j                  dt        || j                  j                  z        z         |j                  j                  j                  j                  dt        || j                  j                  z        z         t        |j                  j                  j                  d      r|j                  j                  j                  j                   j#                  dt        || j                  j                  z        z         |j                  j                  j                  j                   j#                  dt        || j                  j                  z        z         t        |j                  j                  j                  d      rN| j%                  |j                  j                  |       | j%                  |j                  j                  |       |j                  j                  j                  j#                  dt        || j                  j                  z        z         |j                  j                  j                  j#                  dt        || j                  j                  z        z          	 d d d        | j                   | _         y # 1 sw Y   xY w)Nr   r'   SCBquant_state)r  r  r   r  r/   r   r  r   r   rM   r   mul_intr   rG   hasattrr#  div_ _bnb_4bit_dequantize_and_rescale)r   block_idr!  s      r   r  zRwkvModel._rescale_layersi  s`   ##DMM(9:;;$$q( r'0'= rOHe}}..55::1HPTP[P[PiPiDi@j;jk**0077<<Q#hRVR]R]RkRkFkBl=lm #5??#9#9#@#@%H!OO2299==BB1HX\XcXcXqXqLqHrCrs!..44;;??DDQ#hZ^ZeZeZsZsNsJtEtu$U__%;%;%B%BMR AA%//BXBXZbc AA%BTBTBZBZ\de!OO2299>>qCTXT_T_TmTmHmDn?no!..44;;@@c(VZVaVaVoVoJoFpApqrr" (,}}#4 #r rs   
KL77M c                    t               st        d      ddl}|j                  j	                  |j
                  j                  |j
                  j                        }|j                  dt        || j                  j                  z        z         |j                  j                  |j                  d      d      j                  |j                        }t!        |d|       y)	z
        Perform the dequantization and rescaling of the weights of a given layer. After that operation the layer will
        be quantized again.
        z/Please install bitsandbytes to use this method.r   Nr'   cpuF)requires_gradr   )r   r   bitsandbytes
functionaldequantize_4bitr   datar$  r(  r&  r   r  r   
Params4bitrB   r&   setattr)r   target_layerr*  bnbdequant_weightsquant_weights         r   r)  z*RwkvModel._bnb_4bit_dequantize_and_rescale  s    
 )*OPP"..889L9L9Q9QS_SfSfSrSrsQ#h$++2K2K&K"LLM vv((););E)BRW(X[[\k\r\rsh5r   )NNNNNNNN)r]   r^   r_   r   r	  r  r   r/   
LongTensorr   r   boolr   r   r>   r  r)  r   r   s   @r   r   r     s    )  .2262604!%)-,0#'h
##d*h
 ((4/h
 ((4/	h

 E%%&-h
 $;h
  $;h
 #Tkh
 D[h
 
	h
 h
T506r   r   z
    The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                   F    e Zd ZddiZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                  dz  d
eej                     dz  dej                  dz  dedz  dedz  dedz  dedz  deej                  z  deez  fd       Z xZS )RwkvForCausalLMzhead.weightzrwkv.embeddings.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFr   )
r   r   r   r   r   r   rL   r   headr  )r   r   r   s     r   r   zRwkvForCausalLM.__init__  sH     f%	IIf00&2C2C%P	 	r   c                     | j                   S r\   r=  r  s    r   get_output_embeddingsz%RwkvForCausalLM.get_output_embeddings  s    yyr   c                     || _         y r\   r?  r  s     r   set_output_embeddingsz%RwkvForCausalLM.set_output_embeddings  s	    "	r   Nr  r  r  rH   labelsr   r   r  r  logits_to_keepr  c           	         |	|	n| j                   j                  }	| j                  |||||||	      }|d   }t        |
t              rt        |
 d      n|
}| j                  |dd|ddf         }d}|* | j                  d||| j                   j                  d|}|	s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )aJ  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the last state is returned and can be used to quickly generate the next logits.
        N)r  rH   r   r   r  r  r   )r   rC  r   r   )r   r   rH   r   r   ra   )r   r  r   r   r&  slicer=  loss_functionr   r   rH   r   r   )r   r  r  r  rH   rC  r   r   r  r  rD  r  rwkv_outputsr   slice_indicesr   r   rM   s                     r   r>   zRwkvForCausalLM.forward  s   L &1%<k$++B]B]yy'/!5# ! 
 %Q8B>SV8W~ot4]k=M1)<=>%4%%pVFt{{OeOepiopDYab!11F)-)9TGf$EvE!$$&44#..
 	
r   )
NNNNNNNNNr   )r]   r^   r_   _tied_weights_keysr   r@  rB  r   r/   r8  r   r   r9  r&  Tensorr   r   r>   r   r   s   @r   r;  r;    s(    ()AB#  .2262604*.!%)-,0#'-.D
##d*D
 ((4/D
 ((4/	D

 E%%&-D
   4'D
 $;D
  $;D
 #TkD
 D[D
 ell*D
 
#	#D
 D
r   r;  )r;  r   r   r[   )-r   r   dataclassesr   r/   r    r   r   
generationr   modeling_layersr   modeling_utilsr	   utilsr
   r   r   r   r   r   r   configuration_rwkvr   
get_loggerr]   r   r   r   autogradFunctionr   rt   r   r   r   r   r   r   r   r   r   r;  __all__ra   r   r   <module>rW     s{     !   & ) 9 -   + 
		H	%  5g
%..11 g
T)XbC5		 C5L#)bii #)L* D R%/ R% R%j 

< 
< 
< 
< < <$ k6# k6 k6\ V
)? V
V
r Br   