
    qi                         d dl mZ d dlmZ d dlZd dlmZ ddlmZ  G d d      Z	d	ed
e
fdZdededed
efdZdededed
efdZ	 ddej                  dee   dee   ded
df
dZy)    )OrderedDict)ceilN)PretrainedConfig   )loggerc                       e Zd ZdZdeddfdZdededej                  j                  dz  fdZ	deded	ej                  j                  ddfd
Z
y)CudaGraphBufferz>A fixed-size dict for CUDA graphs with LRU eviction when full.max_sizereturnNc                 V    |dk  rt        d|       || _        t               | _        y )Nr   z#max_size must be positive, but got )
ValueErrorr
   r   _storage)selfr
   s     c/opt/pipecat/venv/lib/python3.12/site-packages/transformers/generation/continuous_batching/utils.py__init__zCudaGraphBuffer.__init__   s,    q=B8*MNN LWM    q_lenkv_lenc                 ~    | j                   j                  ||f      }|| j                   j                  ||f       |S N)r   getmove_to_end)r   r   r   graphs       r   	get_graphzCudaGraphBuffer.get_graph!   s;    !!5&/2MM%%ufo6r   r   c                     t        | j                        | j                  k\  rG| j                  j                  d      \  }}t	        j
                  d|       |j                          || j                  ||f<   y )NF)lastz!Evicting graph for evicted_key = )lenr   r
   popitemr   inforeset)r   r   r   r   evicted_keyevicted_graphs         r   	set_graphzCudaGraphBuffer.set_graph'   sf    t}}.)-)>)>E)>)J&KKK<k-=>?!).ufo&r   )__name__
__module____qualname____doc__intr   torchcuda	CUDAGraphr   r#    r   r   r	   r	      sp    HZ Z Zs C EJJ4H4H44O /s /C /

8L8L /QU /r   r	   configr   c                     | j                   dv S )z:Checks if attention mask is needed for the given (config).)zpaged|eagerz
paged|sdpa)_attn_implementation)r-   s    r   attn_mask_is_neededr0   /   s    &&*GGGr   sizeinterval_size	max_valuec                 X    |dk  r|S | dkD  rt        | |z        |z  n|}t        ||      S )zQReturn the smallest multiple of (interval_size) >= (size), capped at (max_value).r   )r   min)r1   r2   r3   paddeds       r   pad_to_intervalr7   4   s9    ;?!8T$&'-7Fvy!!r   x	divide_byalign_toc                 T    t        t        | |z              } | |z  r| || |z  z
  z  } | S r   )r(   r   )r8   r9   r:   s      r   aligned_divider<   <   s4    DY A8|	XX&&Hr   attention_maskcumulative_seqlens_qcumulative_seqlens_ksliding_windowc                 J   t        j                  | j                        j                  }t	        t        |      dz
        D ]  }||dz      ||   z
  }||dz      ||   z
  }||k  r|dk\  r	||z
  dz   }nd}t        ||   ||dz            }	t        ||   ||dz            }
t        j                  | d|	|
f   j                  || j                  | j                        }t        j                  ||      }|dkD  r"||z
  |z
  }|t        j                  ||      z  }|| d|	|
f<    y)u  Builds an attention mask inplace using the cumulative seqlens of the query and key. If given a sliding window, it
    will also apply a sliding window mask on top. The attention mask is not boolean, it uses zeroes and -inf (or its
    equivalent) so it's more of an attention score bias tensor.
    The attention mask is a block-diagonal matrix, with each block an attention mask for a single query-key pair.
    Each of those block is built from a causal mask and, if there is a sliding window, a sliding window mask.

    An example is represented below, with seqlen_k = 8, seqlen_q = 4 and sliding_window = 6:

    CAUSAL MASK:

           █ █ █ █ █ ░ ░ ░
           █ █ █ █ █ █ ░ ░
           █ █ █ █ █ █ █ ░
           █ █ █ █ █ █ █ █

    SLIDING WINDOW MASK:
         ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 8 - 4 - 6 = -2 offset to the left
       <─┴─>
     ░ █ | █ █ █ █ █ █ █ █
     ░ ░ | █ █ █ █ █ █ █ █
     ░ ░ | ░ █ █ █ █ █ █ █
     ░ ░ | ░ ░ █ █ █ █ █ █

    ATTENTION MASK (sum of causal and sliding window masks):

           █ █ █ █ █ ░ ░ ░
           █ █ █ █ █ █ ░ ░
           ░ █ █ █ █ █ █ ░
           ░ ░ █ █ █ █ █ █

    Another example with seqlen_k = 5, seqlen_q = 3 and sliding_window = 2:

    CAUSAL MASK:

           █ █ █ ░ ░
           █ █ █ █ ░
           █ █ █ █ █

    SLIDING WINDOW MASK:
         ┌──────────────────────── seqlen_k - seqlen_q - sliding_window = 5 - 3 - 2 = 0 offset to the left
        <┴>
         | ░ █ █ █ █
         | ░ ░ █ █ █
         | ░ ░ ░ █ █

    ATTENTION MASK (sum of causal and sliding window masks):

           ░ █ █ ░ ░
           ░ ░ █ █ ░
           ░ ░ ░ █ █

    r   .)dtypedevice)diagonalN)r)   finforB   r5   ranger   slicefullshaperC   triutril)r=   r>   r?   r@   	min_valueiseqlen_qseqlen_kcausal_diagonalquery_range	key_range	minus_infmaskedsliding_diagonals                 r   build_attention_maskrV   C   sS   t N00155I3+,q01 ='A.1Ea1HH'A.1Ea1HHh8q=&1A5OO035I!a%5PQ.q13GA3NO	JJ3Y67== &&!((	
	 I@A'(2^Cejj5EFFF6<sK23-=r   )r   )collectionsr   mathr   r)    transformers.configuration_utilsr   requestsr   r	   boolr0   r(   r7   r<   TensorlistrV   r,   r   r   <module>r^      s    $   = / /.H 0 HT H
"# "c "c "c "c c S S  	Q=LLQ=s)Q= s)Q= 	Q=
 
Q=r   