
    qi[i                         d dl mZmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 ddlmZ  e
d      Zd	ee   d
eeeef      fdZ G d d      Z G d d      Z G d de      Z G d de      Z G d de      Zy)    )ABCabstractmethod)deque)Iterator)ceil)TypeVar   )loggerTxsreturnc              #   X   K   t        |       dz
  }| d d d   D ]  }||f |dz  } y w)Nr	   )len)r   indexxs      k/opt/pipecat/venv/lib/python3.12/site-packages/transformers/generation/continuous_batching/cache_manager.pyreverse_enumerater      s<     GaKE"X Qh
s   (*c                   P    e Zd ZdZdededz  deddfdZdefdZede	fd	       Z
y)
Blocka  A class to represent a block managed by the block manager. We say that a block is complete when the physical KV
    cache it points to is fully computed. A block can have a parent, which is the block that came before in the
    sequence. Once a block is complete, it is given a hash, which takes into account the tokens ids of the block, the
    layer (group_id) it belong to and its parent's hash (if there is a parent).id_	parent_idNgroup_idr   c                 J    || _         || _        || _        d | _        d| _        y )Nr	   idr   r   hash	ref_count)selfr   r   r   s       r   __init__zBlock.__init__'   s%    %.% $	    c                     d| j                    d| j                   d| j                   d| j                   d| j                   dS )Nz	Block(id=z, parent_id=z, group_id=z, hash=z, ref_count=)r   r   s    r   __repr__zBlock.__repr__.   s_    477)</?{4==/Y`aeajaj`kkwx|  yG  yG  xH  HI  J  	Jr!   c                     | j                   d uS )N)r   r$   s    r   is_completezBlock.is_complete1   s    yy$$r!   )__name__
__module____qualname____doc__intr    strr%   propertyboolr'    r!   r   r   r   !   sV    S
 C  C$J  #  $  J# J %T % %r!   r   c                   V   e Zd ZdZdededdfdZedefd       Zdedefd	Z	ded
edz  dedede
e   dz  f
dZde
e   dedededee
e
e      dz  e
e   e
e   f   f
dZdeddfdZdeddfdZde
e   deddfdZdeddfdZdede
e   de
e   ddfdZdedz  de
e   dedefdZy)BlockManagera1  A class to manage the number of free blocks and block re-use. When a block becomes in use, a flag is passed to
    determine if the block is shareable or not. If it is, then a Block object is created and kept track of internally.
    It can have the following states:
      - in use: one or more requests references this block, thus it cannot be written over. The number of requests
        referencing this block is stored as ref_count in the Block object.
      - un-initialized: the block points to a space in the KV cache tensor that contains no data yet. Those blocks can
        be given as free blocks to new requests without any overhead.
      - initialized: the block is complete and was used by one or more request that are finished. It contains KV cache
        data and its hash is stored in the hash table. If a new request needs a block with the same hash, we increase
        the ref_count of the block and remove it from the list of initialized blocks, because it is now in use.
        Still, the block can be freed if no un-initialized blocks are left. In that case, we remove its hash from the
        hash table.
    If the block is not shareable, we just use the block manager as a FIFO structure where blocks are either free or in
    use. Sharability is determined by the type of cache allocator: blocks created for full attention layers are
    shareable, while blocks created for sliding window attention layers are not.
    There is no structure to keep track of the blocks in use: if a block is neither un-initialized nor initialized,
    it is in use.
    
num_blocks
block_sizer   Nc                 |    || _         || _        t        t        |            | _        i | _        i | _        i | _        y)z^Initializes the block manager with a given number of blocks (num_blocks) of size (block_size).N)r3   r4   r   range_uninit_block_ids_init_block_ids_hash_to_id_id_to_block)r   r3   r4   s      r   r    zBlockManager.__init__J   s:    $$!&uZ'8!902+-.0r!   c                 X    t        | j                        t        | j                        z   S )zfReturns the number of free blocks left. Both initialized and uninitialized blocks are considered free.)r   r7   r8   r$   s    r   num_free_blockszBlockManager.num_free_blocksS   s%     4))*S1E1E-FFFr!   n_blocksc                    t        | j                        |k\  ry|t        | j                        z
  }t        | j                        |k  ryt        |      D ]n  }| j                  j	                         d   }| j
                  |   }| j                  j                  |j                         | j                  j                  |       p y)zChecks if there are enough free blocks to allocate the requested number of blocks (n_blocks). If there are
        not enough uninitialized blocks, we uninitialize the required number of initialized blocks.TFr   )
r   r7   r8   r6   popitemr:   r9   popr   append)r   r=   block_to_uninitialize_id_to_uninitializeblocks         r   has_enough_free_blocksz#BlockManager.has_enough_free_blocksX   s     t%%&(2 (3t/E/E+F Ft##$'<<,- 	>A!%!5!5!=!=!?!B%%&89E  ,""))*<=	> r!   last_block_id	shareabler   c                     | j                  |      syt        |      D cg c]  }| j                  j                          }}|r%|D ]   }t	        |||      }|| j
                  |<   |}" |S c c}w )a  Returns a list of (n_blocks) free block and mark them as no longuer free in the internal data structures.
        If the (shareable) flag is set to True, a Block object is created to keep track of the block, with the
        (last_block_id) to indicate the last block id in the sequence, also named the parent block. If the manager
        cannot find enough free blocks, it returns None.N)rF   r6   r7   popleftr   r:   )	r   r=   rG   rH   r   rC   allocated_block_idsblock_idrE   s	            r   get_free_blockszBlockManager.get_free_blocksk   s     **84INxYAt55==?YY/ )hx@.3!!(+ ()
 #" Zs   !A,parent_blocks	num_forksc                 (   g }|rT|D ]O  }| j                   |   }|j                  r1|j                  |j                         |xj                  |z  c_        O n t        |      t        |      z
  }|dk(  rt        |      D 	cg c]  }	|dd 	 c}	g g fS g }
g }g }|r|d   nd}t        |      D ]Y  }	| j                  ||||      }|dg g fc S |
j                  ||z          |j                  || d        |j                  |       [ |
||fS c c}	w )u#  Fork a given list of (parent_blocks) as many times as (num_forks). If the blocks are (shareable), we use
        reference on the blocks that are complete. Otherwise, we allocate new blocks and keep track of their indices to
        later copy the physical cache. For instance, when forking 4 blocks for 2 children:

        Parent blocks: [0, 1, 2, 3], with all blocks being complete except the last one (block 3).

        ----------------------------------------- IF BLOCKS ARE NOT SHAREABLE -----------------------------------------

        Forked blocks lists: [[5, 6, 7, 8], [9, 10, 11, 12]]
        Copy source:          [0, 1, 2, 3,   0,  1,  2,  3]
                               ↓  ↓  ↓  ↓    ↓   ↓   ↓   ↓
        Copy destination:     [5, 6, 7, 8,   9, 10, 11, 12]  → 8 blocks are newly allocated and copied

        ----------------------------------------- IF BLOCKS ARE SHAREABLE ---------------------------------------------

        Forked blocks lists: [[0, 1, 2, 5], [0, 1, 2, 6]]
        Copy source:          [         3,            3]     (block 3 is not complete so it's copied, not referenced)
                                        ↓             ↓
        Copy destination:     [         5,            6]     → only 2 blocks are newly allocated and copied
        r   Nr   )	r:   r'   rA   r   r   r   r6   rM   extend)r   rN   rO   rH   r   forked_by_referencerL   rE   blocks_to_copyrC   forked_blocks_listscopy_srccopy_dstr   rK   s                  r   fork_blockszBlockManager.fork_blocks~   sJ   0 !) ))(3$$'..uxx8OOy0O ]+c2E.FFQ49)4DEq'*Er2MM ! 0C'+	y! 	1A"&"6"6~yR[]e"f"*R|#&&':=P'PQOOM>/*:;<OO/0	1 #Hh66! Fs   DrL   c                     | j                   |   }|xj                  dz  c_        |j                  dk(  r| j                  j                  |       yy)z4Increases the reference count of a given (block_id).r	   N)r:   r   r8   r@   r   rL   rE   s      r   increase_ref_countzBlockManager.increase_ref_count   sE    !!(+1??a  $$X.  r!   c                    | j                   |   }|xj                  dz  c_        |j                  dk(  rS|j                  rd| j                  |<   y| j                   j	                  |       | j
                  j                  |       yy)zDecreases the reference count of a given (block_id). If the reference count reaches 0, the block is no longer
        in use, and becomes initialized (if it was complete) or uninitialized (if it was incomplete).r	   r   N)r:   r   r'   r8   r@   r7   rA   rY   s      r   decrease_ref_countzBlockManager.decrease_ref_count   st     !!(+1??a  15$$X.!!%%h/&&--h7  r!   blocksc                 p    |r|D ]  }| j                  |        y| j                  j                  |       y)zMarks a list of (blocks) as free. If the blocks were not (shareable), we simply add them to the uninitialized
        blocks queue. Otherwise, their new state depends on whether they are complete.N)r\   r7   rQ   )r   r]   rH   rL   s       r   free_blockszBlockManager.free_blocks   s:     " 2''12 ""))&1r!   c                     | j                   j                  |      }|j                  dkD  rt        d| d|j                        | j                  j                  |       y)zYMarks a block as uninitialized. Raises an error if the block has more than one reference.r	   Block z0 has more than one reference: block.ref_count = N)r:   r@   r   RuntimeErrorr7   rA   rY   s      r   uninitialize_unshared_blockz(BlockManager.uninitialize_unshared_block   s[     !!%%h/??Qz1bPUP_P_Ocdee%%h/r!   num_complete_blocksallocated_blocks
prompt_idsc                    d}g }t        |      D ]A  \  }}| j                  |   }|j                  r|j                  } n|j	                  ||f       C d}	|r|j                         \  }}|		|	|_        d}	|dk(  ry|dz  }||| j                  z  |dz   | j                  z   }
| j                  ||
|j                        |_        | j                  j                  |j                        }|||j                  k(  r$t        j                  d|j                   d       nt        j                  d| d|j                          |||<   |}	| j!                  |       | j#                  |j                         n_t        j                  d|j                   d	|j                   d
|j                          |j                  | j                  |j                  <   |j                  }|ryy)zAmong the list of (allocated_blocks), mark (num_complete_blocks) incomplete blocks as now complete. The list
        of (prompt_ids) is used to compute the hash of the new block.Nr   r	   ra   z& was marked as complete more than oncezFound existing block z for block zAdding new block z (group z) with hash )r   r:   r'   r   rA   r@   r   r4   compute_hashr   r9   getr   r
   warningdebugrZ   rc   )r   rd   re   rf   parent_hashincomplete_blocksirL   rE   new_parent_idtokensexisting_block_ids               r   !mark_shareable_blocks_as_completez.BlockManager.mark_shareable_blocks_as_complete   s    57,-=> 	1KAx%%h/E  #jj$$aZ0	1 (,,.HAu ("/ $ #a'  1$DOO 3q1u6OPF**;OEJ $ 0 0 4 4UZZ @ ,$0NNVEHH:5[#\]LL#89J8K;W\W_W_V`!ab*;$Q'$5M++,=>44UXX> 0
(5>>BRR^_d_i_i^jkl/4xx  ,  **KI  r!   rl   rp   c                 0    t        |t        |      |f      S )zComputes the hash of a block identified by the (tokens) it contains, its (parent_hash) and the layer
        (group_id) it belong to. If the block has no parent, the parent hash is None.)r   tuple)r   rl   rp   r   s       r   rh   zBlockManager.compute_hash  s     [%-:;;r!   )r(   r)   r*   r+   r,   r    r.   r<   r/   rF   listrM   rt   rW   rZ   r\   r_   rc   rr   rh   r0   r!   r   r2   r2   6   s   &13 1C 1D 1 G G Gs t &##,/$J#CG#SV#	cT	#&67!#Y673667CG67SV67	tDI%tCy$s);	<67p/3 /4 /
83 
84 
82$s) 2 2 20C 0D 05%#&5%:>s)5%QUVYQZ5%	5%n<d
 <DI <QT <Y\ <r!   r2   c                       e Zd ZU dZeed<   eeee   f   ed<   e	ed<   e
dedededed	z  fd
       Zdededd	fdZe
dedededee   fd       Ze
dedededee   fd       Zdedee   dedeee   ee   f   fdZy	)CacheAllocatorzAbstract base class for cache managers. Cache managers keep track of per-request cache allocations, determine
    when a new physical block needs to be allocated and compute physical indices for reading or writing to the cache._indexblock_tableuses_block_sharingr=   
request_idblock_managerr   Nc                      y)zAllocates (n_blocks) for a given (request_id) using the (block_manager). Returns the num of blocks allocated
        if successful and None otherwise.Nr0   )r   r=   r{   r|   s       r   allocate_blockszCacheAllocator.allocate_blocks       r!   c                     || j                   v r9| j                   j                  |      }|j                  || j                         yt	        j
                  d| j                   d|        y)zJFrees all blocks associated with a (request_id) using the (block_manager).)rH   zCacheAllocator z7 attempted to free blocks for non-existent request_id: N)ry   r@   r_   rz   r
   rj   rx   )r   r{   r|   blocks_to_frees       r   r_   zCacheAllocator.free_blocks%  sb    )))!--11*=N%%n@W@W%XNN!$++.efpeqrr!   past_lengthquery_lengthc                      y)zUReturns the physical indices of where to read request_id's cache in the cache tensor.Nr0   r   r{   r   r   s       r   get_read_indiceszCacheAllocator.get_read_indices/  r   r!   c                      y)zVReturns the physical indices of where to write request_id's cache in the cache tensor.Nr0   r   s       r   get_write_indicesz CacheAllocator.get_write_indices3  r   r!   parent_request_idchildren_request_idsc                 j   || j                   vrt        d|       | j                   |   }|j                  |t        |      | j                  | j
                        \  }}}|t        d|       t        ||      D ]0  \  }}	|| j                   v rt        d|       |	| j                   |<   2 ||fS )ay  Forks the cache blocks of a (parent_request_id) to a list of (children_request_ids). To manage the blocks,
        the (block_manager) is used. When forking, the child's block are either shared with the parent, or they need to
        be copied from the parent. Hence we return two lists of blocks that need to be copied: one for the source and
        one for the destination.!No block table found for request )rN   rO   rH   r   z"Failed to fork blocks for request z'Block table already exists for request )ry   
ValueErrorrW   r   rz   rx   zip)
r   r   r   r|   rN   list_forked_blocksrU   rV   children_request_idforked_blockss
             r   rW   zCacheAllocator.fork_blocks7  s     D$4$44@AR@STUU (():;1>1J1J'./--[[	 2K 2
.Hh %ABSATUVV 366JL^2_ 	B."d&6&66 #JK^J_!`aa4AD01	B !!r!   )r(   r)   r*   r+   r,   __annotations__dictr-   ru   r/   r   r2   r~   r_   r   r   rt   rW   r0   r!   r   rw   rw     s.   y Kc49n%%- - -\ -^adh^h - -c , 4  d3 dS dPS dX\]`Xa d d eC ec eQT eY]^aYb e e"!$"<@I"Vb"	tCy$s)#	$"r!   rw   c            	           e Zd ZdZdedededdfdZded	ed
ededz  fdZ	d	ededede
e   fdZd	ededede
e   fdZy)FullAttentionCacheAllocatorz3Cache manager for a group of full attention layers.r   r4   allow_block_sharingr   Nc                 <    || _         || _        || _        i | _        y)zInitializes the cache manager for a group of full attention layers.
        Args:
            - index: the index of the associated layer group
            - block_size: the size of the blocks in the cache
        N)rx   rz   r4   ry   )r   r   r4   r   s       r   r    z$FullAttentionCacheAllocator.__init__Y  s"     "5$r!   r=   r{   r|   c                     | j                   j                  |g       }|r|d   }n|| j                   |<   d}|j                  ||| j                  | j                        }|y|j                  |       |S )zAllocate (n_blocks) for a given (request_id) using the (block_manager). Returns the number of blocks
        allocated if successful and None otherwise. For group of full attention layers, we always allocate the number of
        requested blocks.r   N)ry   ri   rM   rz   rx   rQ   )r   r=   r{   r|   ry   rG   re   s          r   r~   z+FullAttentionCacheAllocator.allocate_blocksd  s    
 &&**:r:'OM+6DZ( M(88=RVRiRikokvkvw#+,r!   r   r   c                    | j                   j                  |      }|t        d|       ||z   }|| j                  z  }|| j                  z  }g }t	        |      D ]<  }	||	   | j                  z  }
|j                  t	        |
|
| j                  z                > |r0||   | j                  z  }
|j                  t	        |
|
|z                |S )zReturns the physical indices of where to read request_id's cache. For a group of full attention layers, we
        first write the new cache to the cache tensor and then read the entire cache from the beginning to the end.r   ry   ri   r   r4   r6   rQ   )r   r{   r   r   ry   total_lengthnum_full_blocks	remainderphysical_indicesbstarts              r   r   z,FullAttentionCacheAllocator.get_read_indicesv  s     &&**:6@MNN"\1&$//9 4??2	' 	KANT__4E##E%1H$IJ	K 04??BE##E%1B$CDr!   c                    | j                   j                  |      }|t        d|       || j                  z  }|| j                  z  }||z   }|dz
  | j                  z  }g }	t	        ||dz         D ]d  }
||
   | j                  z  }|
|k(  r|nd}|
|k(  r|dz
  | j                  z  dz   n| j                  }|	j                  t	        ||z   ||z                f |	S )zReturns the physical indices for writing to the cache. For a group of full attention layers, we write the new
        cache as a continuation of the existing cache for the same request.r   r	   r   r   )r   r{   r   r   ry   start_blockstart_offsetend_pos	end_blockr   r   block_startlocal_start	local_ends                 r   r   z-FullAttentionCacheAllocator.get_write_indices  s     &&**:6@MNN!T__4"T__4,q[T__4	{IM2 	_A%a.4??:K*+{*:,K?@I~17!;SWSbSbI##E+*C[S\E\$]^	_  r!   )r(   r)   r*   r+   r,   r/   r    r-   r2   r~   ru   r   r   r0   r!   r   r   r   V  s    =	c 	s 	 	RV 	  \ ^adh^h $ 3  S  PS  X\]`Xa  * C  c  QT  Y]^aYb  r!   r   c            	           e Zd ZdZdedededdfdZded	ed
ededz  fdZd	ededede	e   fdZ
d	ededede	e   fdZy)SlidingAttentionCacheAllocatorz2Cache manager for sliding window attention layers.r   r4   sliding_windowr   Nc                     || _         d| _        || _        || _        t	        | j                  | j                  z        | _        i | _        y)a  Initializes the cache manager for a group of sliding window attention layers.
        Args:
            - index: the index of the associated layer group
            - block_size: the size of the blocks in the cache
            - sliding_window: the size of the sliding window
        FN)rx   rz   r4   r   r   _max_blocks_per_requestry   )r   r   r4   r   s       r   r    z'SlidingAttentionCacheAllocator.__init__  sF     "'$,'+D,?,?$//,Q'R$r!   r=   r{   r|   c                 ^   || j                   vrg | j                   |<   t        | j                   |         }|| j                  k(  ryt        ||z   | j                        }||z
  }|j	                  |d| j
                  | j                        }|y| j                   |   j                  |       |S )a  Allocate (n_blocks) for a given (request_id) using the (block_manager). Returns the number of blocks
        allocated otherwise. For group of sliding window attention layers, we only allocate up to the point where we can
        fit an entire sliding window in the cache tensor.r   N)ry   r   r   minrM   rz   rx   rQ   )r   r=   r{   r|   already_allocatedafter_allocationactual_n_blocksre   s           r   r~   z.SlidingAttentionCacheAllocator.allocate_blocks  s     T---+-DZ( 0 0 <= < <<08;T=Y=YZ*->>(88T4#:#:DKK
 #$++,<=r!   r   r   c                    | j                   j                  |      }|t        d|       || j                  k  rdn|| j                  z  }t	        || j                  dz
        }g }t        |||z         D ]U  }|| j                  z  }|| j                  z  }	|| j                  z  }
||	   | j                  z  |
z   }|j                  |       W |dg|z  z   S )a  Returns the physical indices of where to read request_id's cache in the cache tensor.
        For a group of sliding window attention layers, we read from the cache tensor before writing on it, because the
        new cache can overwrite the old one. To form the cache + new key / values states, we read the at most
        sliding_window - 1 cache page and then manually add the new key / values states after. Hence the -1 indices
        which indicate where to store the new key or values indices.r   r   r	   r   ry   ri   r   r   r   r6   r4   rA   )r   r{   r   r   ry   start_indexcache_lengthr   rn   	block_idxblock_offsetphysical_indexs               r   r   z/SlidingAttentionCacheAllocator.get_read_indices  s     &&**:6@MNN&)<)<<a+PTPcPcBc;(;(;a(?@{K,$>? 	4A$$$AT__,It.L(3dooETN##N3	4  2$"555r!   c                    | j                   j                  |      }|t        d|       || j                  z  }t	        || j                        }||z
  }g }t        |||z         D ]U  }	|	| j                  z  }	|	| j                  z  }
|	| j                  z  }||
   | j                  z  |z   }|j                  |       W |dkD  r	dg|z  |z   }|S )aB  Returns the physical indices of where to write request_id's cache in the cache tensor. For a group of
        sliding window attention layers, we write the new cache in rolling-buffer kind of way: if we reach the end of
        the allocated physical cache, we start writing from the beginning of the physical cache again.r   r   r   r   )r   r{   r   r   ry   r   r   padding_lengthr   rn   r   r   r   s                r   r   z0SlidingAttentionCacheAllocator.get_write_indices  s    
 &&**:6@MNN!D$7$77<)<)<=%4{K,$>? 	4A$$$AT__,It.L(3dooETN##N3	4 A "tn47GGr!   )r(   r)   r*   r+   r,   r    r-   r2   r~   ru   r   r   r0   r!   r   r   r     s    <c s C D   \ ^adh^h ,63 6S 6PS 6X\]`Xa 6. C  c  QT  Y]^aYb  r!   r   N)abcr   r   collectionsr   collections.abcr   mathr   typingr   requestsr
   r   ru   rt   r,   r   r   r2   rw   r   r   r0   r!   r   <module>r      s    $  $    CL$q' huS!V}&= % %*_< _<D;"S ;"|H . H VT ^ T r!   