
    qi                         d dl Z ddlmZ ddlmZ 	 	 	 	 	 	 dde j
                  j                  de j                  de j                  de j                  d	e j                  dz  d
ede j                  fdZy)    N   )PagedAttentionCache)!lazy_import_paged_flash_attentionmoduleqkvattention_maskcachereturnc
           	         t        | j                  j                        }t        | dd      sdn| j                  dz
  df}|dk(  rdnd}|)|j                  ||| j                  |
d	   |
d
         \  }}t        |t              r
||   }|	|   }	d|
v rd|
j                  d      ini } ||j                  dd      j                  d      j                         |j                         |j                         |j                  t        j                        |j                  t        j                        j!                         ||	f| j"                  d|d|}t        |t$              r|d   }|dfS )a  Perform the forward pass of attention with paged key-value cache.

    This function handles the cache updates and performs the attention computation
    using the flash_attn_varlen_func for efficient processing.

    Args:
        q: (total_q, nheads, headdim), where total_q = total number of query tokens in the batch.
        k: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full k
        v: (total_k, nheads_k, headdim), where total_k = total number of key tokens in the batch.  but if there is a block table it can be the full v
        cu_seq_lens_q: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into q.
        cu_seq_lens_k: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
           of the sequences in the batch, used to index into kv.
        max_seqlen_q: int. Maximum query sequence length in the batch.
        max_seqlen_k: int. Maximum key sequence length in the batch.
        dropout_p: float. Dropout probability.
        softmax_scale: float. The scaling of QK^T before applying softmax.
            Default to 1 / sqrt(headdim).
        causal: bool. Whether to apply causal attention mask (e.g., for auto-regressive modeling).
        window_size: (left, right). If not (-1, -1), implements sliding window local attention.
        softcap: float. Anything > 0 activates softcapping attention.
    sliding_windowF)r      r   full_attentionsliding_attentionN
read_indexwrite_index)
key_statesvalue_states	layer_idxr   r   s_auxr   T)softmax_scalecausalwindow_size)r   config_attn_implementationgetattrr   updater   
isinstancedictget	transposesqueeze
contiguoustotorchint32clonescalingtuple)r   r   r   r	   r
   r   cu_seq_lens_qcu_seq_lens_kmax_seqlen_qmax_seqlen_kkwargsflash_attn_varlen_funcr   
layer_typecustom_kwargsattn_outputs                   W/opt/pipecat/venv/lib/python3.12/site-packages/transformers/integrations/flash_paged.pypaged_attention_forwardr6      sy   F ?v}}?a?ab%,V5Eu%MXTZTiTilmTmopSqN%3x%?!EXJ ||&&l+}-  
1 -&%j1#J/6=6GWfjj12RM(	Aq!!!$//1		%%++- nn" K +u%!!n    )NNNNNN)	r'   generation.continuous_batchingr   modeling_flash_attention_utilsr   nnModuleTensorr6    r7   r5   <module>r>      s     @ N +/!%HHHOOH||H ||H ||	H
 LL4'H H \\Hr7   