
    qi                         d dl mZ d dlmZ d dlmZ d dlmZ d dlZd dl	m
Z
 ddlmZ d	d
lmZ d	dlmZmZ d	dlmZmZmZmZ e G d d             Z G d d      Z G d d      Z G d d      Zy)    )	dataclass)partial)count)AnyN)PretrainedConfig   )traced   )PagedAttentionCache)TMP_TOKEN_IDFutureRequestState)CudaGraphBufferaligned_divideattn_mask_is_neededbuild_attention_maskc                      e Zd ZU dZej
                  ed<   ej
                  eeej
                  f   z  dz  ed<   ej
                  ed<   ej
                  ed<   ej
                  eeej
                  f   z  ed<   e	ed<   e	eee	f   z  ed	<   e
ej
                     ed
<   e
ej
                     ed<   ej
                  ed<   eed<   dZeed<   deeef   fdZy)PagedAttentionArgsa  Dataclass containing the keyword arguments for a forward pass using paged attention.

    Attributes:
        input_ids: Input token IDs tensor of shape `(1, total_query_tokens)`.
        attention_mask: Attention mask tensor or dictionary mapping layer types to masks. Can be `None` if the
            attention implementation doesn't require explicit masks.
        position_ids: Position IDs tensor of shape `(1, total_query_tokens)`.
        cu_seq_lens_q: Cumulative sequence lengths for queries, used for variable-length batching.
        cu_seq_lens_k: Cumulative sequence lengths for keys/values. Can be a tensor or dictionary mapping layer
            types (e.g., "full_attention", "sliding_attention") to tensors for hybrid models.
        max_seqlen_q: Maximum query sequence length in the batch.
        max_seqlen_k: Maximum key/value sequence length. Can be an int or dictionary for hybrid models.
        write_index: List of tensors indicating where to write new KV states in the cache, one per attention group.
        read_index: List of tensors indicating which cache positions to read from, one per attention group.
        logits_indices: Tensor indicating which positions in the output should be used for next-token prediction.
        cache: The [`PagedAttentionCache`] instance managing the KV cache.
        use_cache: Whether to use caching (always `False` in continuous batching as the cache is managed externally).
    	input_idsNattention_maskposition_idscu_seq_lens_qcu_seq_lens_kmax_seqlen_qmax_seqlen_kwrite_index
read_indexlogits_indicescacheF	use_cachereturnc                    | j                   | j                  | j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  | j                  dS )Nr   r   r   r   r   r   r   r   r   r   r   r   r"   selfs    k/opt/pipecat/venv/lib/python3.12/site-packages/transformers/generation/continuous_batching/input_outputs.pyasdictzPagedAttentionArgs.asdict?   sp    "11 --!//!// -- --++//"11ZZ
 	
    )__name__
__module____qualname____doc__torchTensor__annotations__dictstrintlistr   r   boolr   r&    r'   r%   r   r      s    & ||LL4U\\(9#::TAA,,<<<<$sELL'8"999S#X&&ell##U\\""LL It
S#X 
r'   r   c                      e Zd ZdZ	 ddededej                  dej                  de	ddfd	Z
 ed
      dd       Z	 d dd dej                  j                  deddfdZe ej"                         d deddfd              Zdeej(                  eeej(                  f   f   fdZdee	e	e	ee	   ee	   f   fdZdej(                  ddfdZddZdeee   ee	   f   fdZedee   ddfd       Zd!de	de	deeef   fdZ y)"ContinuousBatchingIOszA class to hold inputs and outputs for a continuous batching forward pass, using static tensors as storage. The
    class is meant to be self-contained, so once a set of inputs have been created, the class can be used to update the
    batch alone.
    r   configdevicemodel_dtype
max_graphsr    Nc                 f   || _         || _        || _        || _        t	        |dd      dn|j
                  | _        d| _        d| _        d| _        t        |j                        D cg c]  }d c}| _        t        |j                        D cg c]  }d c}| _        g | _        i | _        t        |      | _        | j#                          | j%                  d       |j&                  dk(  r0t(        j*                  j-                  | j                        | _        yd| _        yc c}w c c}w )	a  Initialize the continuous batching I/O manager. Args:
        - cache: The [`PagedAttentionCache`] instance managing the KV cache. Meant to be unique.
        - config: The model's pretrained configuration.
        - device: The device to allocate tensors on. If the device is CPU, then the memory is pinned.
        - model_dtype: The data type for model computations.
        - max_graphs: Maximum number of CUDA graphs to cache. Uses LRU eviction when full.
        sliding_windowNr
   r   T)
full_resetcudar8   )r   r8   r7   r9   getattrr<   actual_query_lengthactual_key_lengthactual_batch_sizerange
num_groupsactual_read_sizesactual_write_sizesrequests_in_batchreq_id_to_new_token_positionr   graphs_setup_static_tensors_reset_static_tensorstyper,   r>   Streamcompute_streamr$   r   r7   r8   r9   r:   _s          r%   __init__zContinuousBatchingIOs.__init__V   s	     
&#*63CT#J#RaX^XmXm#$ !"!"-253C3C-D!E!!E.3E4D4D.E"F1"F;=<>)'6z'B""$""d"3GM{{V\G\ejj//t{{/Cbf "F"Fs   )	D)	D.T)
standalonec                    | j                   j                  }| j                   j                  }| j                   j                  | j                   j                  z  }| j
                  j                  dk(  }t        |dz   dd      }t        j                  d|ft        j                  | j
                  |      | _        | j                  dd|f   | _        | j                  dd|f   | _        | j                  dd|dz   f   | _        | j                  d	d|f   | _        | j                  d
d|dz   f   }| j                  dd|dz   f   }| j                  dd|f   | _        i | _        | j                   j$                  r|| j"                  d<   | j                   j&                  r|| j"                  d<   t        j                  |dz   ft        j                  | j
                  |      | _        | j(                  j+                          d| _        d| _        t0        j3                  | j"                  j5                         d      | _        t9        | j:                        rhi | _        | j"                  j5                         D ]C  }t        j                  dd|||z   f| j>                  | j
                  |      | j<                  |<   E nd| _        t        j                  ||ft        j                  | j
                  |      | _         t        j                  |||z   ft        j                  | j
                  |      | _!        y)a  Allocates static tensors for generation inputs and outputs. This is called only once at init time, to avoid
        repeated allocations and enable CUDA graphs. All tensors are allocated with maximum possible sizes.
        The allocated tensors are:

        - `_bulk_input_tensor`: Storage for all the small inputs: `input_ids`, `position_ids`, `cumulative_seqlens_q`,
          `logits_indices`, `cumulative_seqlens_k`, `carry_over_ids`.
        - `attention_mask`: Optional attention masks (only for eager/SDPA implementations)
        - `write_index` and `read_index` storage: Cache indexing tensors for each attention group
        - `output_ids`: Storage for generated token IDs
        cpur
          )dtyper8   
pin_memoryr   N   r            full_attentionsliding_attention)sizerX   r8   rY   )"r   rE   max_batch_tokens
num_blocks
block_sizer8   rM   r   r,   emptyint32_bulk_input_tensorr   r   cumulative_seqlens_qr   carry_over_idscumulative_seqlens_knum_full_attention_groupsnum_sliding_attention_groups
output_idszero_total_seqlen_qr   r/   fromkeyskeysr   r   r7   r   r9   write_index_storageread_index_storage)	r$   rE   ra   	num_pagesrY   	bulk_size#full_attention_cumulative_seqlens_k&sliding_attention_cumulative_seqlens_k
layer_types	            r%   rK   z+ContinuousBatchingIOs._setup_static_tensorsz   s    ZZ**
::66JJ))DJJ,A,AA	[[%%.
 ##3a#7B?	"'++	N%++dkkj#
 004E5E4E1EF 33A7H8H7H4HI$($;$;A?UAQTUAU?U<U$V!"55a9J:J9J6JK.2.E.EaI_K[^_K_I_F_.`+151H1HLbN^abNbLbIb1c."55a9J:J9J6JK >@!:://:]D%%&67::22=cD%%&9:  ++!#5;;t{{Wa
 	 MM$*C*C*H*H*JAN t{{+"$D"77<<> 
27++Q 0)>N2NO**;;)	3##J/ #'D $);;)*%++dkk^h$
  #(++%556ekkRVR]R]jt#
r'   otherstreamnon_blockingc                 h   | j                   |_         | j                  |_        | j                  |_        | j                  d d  |_        | j                  d d  |_        | j
                  |_        | j                  |_        t        | j                  j                               |_        t        j                  j                  |      5  |j                  j                  | j                  |       |j                  j                  | j                  |       |j                   j                  | j                   |       | j"                  X|j"                  L| j"                  j%                         D ]/  }|j"                  |   j                  | j"                  |   |       1 d d d        y # 1 sw Y   y xY w)Nrz   )rA   rB   rC   rF   rG   rn   r   r/   r   itemsr,   r>   ry   rf   copy_rq   rr   r   rp   )r$   rx   ry   rz   rw   s        r%   _transfer_inputsz&ContinuousBatchingIOs._transfer_inputs   sx    %)$<$<!"&"8"8"&"8"8"&"8"8";#'#:#:1#= #22!..!$"3"3"9"9";<ZZv& 	w$$**4+B+BQ]*^%%++D,D,DS_+`$$**4+B+BQ]*^"".53G3G3S"&"5"5":":"< wJ((4::4;N;Nz;Ziu:vw	w 	w 	ws   CF((F1r=   c                    |r| j                   j                  d      n| j                  }|r| j                  j                  d      n| j                  }| j
                  ddd|dz   f   j                          d| _        | j                  d| j                          | j                  d| j                          | j                  D ]r  }d| j                  |<   | j                  | j                  |   ddddd|d|f   j                  t        j                  | j                         j"                         t | j                   ddd|f   j                  d       | j                  ddd||z   f   j                  d       y)aA  Reset static tensors for the next batch. For efficiency, this only resets the portions of tensors that were
        actually used in the previous batch, using the attributes actual_query_length, actual_key_length, and
        actual_batch_size. If a (full_reset) is requested, the entire tensor storage is reset.
        Nr
   r   )rq   r`   rA   rr   rB   rf   rm   r   r   rl   ri   r   r   fill_r,   finfor9   min)r$   r=   q_lenk_lenrw   s        r%   rL   z+ContinuousBatchingIOs._reset_static_tensors   sg    6@((--b1TE]E]4>'',,R0DDZDZ 	;UQY;/557 	FU#))+%%' 33 	oJ,-Dj)"".##J/1fuffuf0DEKKEKKX\XhXhLiLmLmn	o 	  FUF+11"5?UU]? 2399"=r'   c                 2    | j                   | j                  fS )z:Get the cumulative sequence lengths for the current batch.)rg   ri   r#   s    r%   get_cumulative_seqlensz,ContinuousBatchingIOs.get_cumulative_seqlens   s    (($*C*CCCr'   c                 t    | j                   | j                  | j                  | j                  | j                  fS N)rA   rB   rC   rF   rG   r#   s    r%   get_actual_lengthsz(ContinuousBatchingIOs.get_actual_lengths   s9    $$""""""##
 	
r'   r   c                      y r   r4   )r$   r   s     r%   carry_over_tokensz'ContinuousBatchingIOs.carry_over_tokens  s    r'   c                 R    | j                   | j                   j                          y y r   )rO   synchronizer#   s    r%   retrieve_device_outputsz-ContinuousBatchingIOs.retrieve_device_outputs  s%    *++- +r'   c                     | j                   }| j                  d t        | j                          j                         }||fS r   )rH   rl   lentolist)r$   rH   
new_tokenss      r%   prepare_batch_updatez*ContinuousBatchingIOs.prepare_batch_update	  s>     22__%Bs4+A+A'BCJJL
 *,,r'   rH   c                 H
   |st        d      | j                          d| _        d| _        d| _        t        | j                  j                        D cg c]  }d c}| _        t        | j                  j                        D cg c]  }d c}| _	        g | _
        i | _        g }g }dg}g }| j                  j                         D ci c]  }|dg }}t        | j                  j                        D cg c]  }g  }	}t        | j                  j                        D cg c]  }g  }
}|D ]  }|j                  }|j                  }t!        |j"                        }| j                  j%                  ||      }| xj                  |z  c_        | xj                  t'        |j)                               z  c_        | xj                  dz  c_        |xj                  |z  c_        |j+                  |j"                         |j+                  t        |||z                |j-                  |d   |z          t'        | j.                  |      | _        |j1                         D ]H  \  }}||   j-                  ||   d   |z          t'        | j2                  |   |      | j2                  |<   J | j                  j5                  |j6                  |||	|
       |j8                  r?|j-                  |d   dz
         t:        g|_        |d   | j                  |j6                  <   | j                  j-                  |        t=        t>        j@                  t>        jB                  | jD                        } ||      | jF                  dt!        |        ||      | jH                  dt!        |        ||      | jJ                  dt!        |        ||      | jL                  dt!        |       |d   | _'        |j1                         D ]`  \  }} ||      | j                  |   dt!        |       | jP                  4tS        | jP                  |   |||dk(  r| jT                  nd       b g | _+        g | _,        t[        t]               |	|
      D ]z  \  }}} ||      | j^                  |dt!        |      f<    ||      | j`                  |dt!        |      f<   t!        |      | j                  |<   t!        |      | j                  |<   | yc c}w c c}w c c}w c c}w c c}w )	a  Prepare tensors and metadata for the next model forward pass, using the given requests as data. This method:

        1. Resets the static tensors from the previous batch
        2. Iterates through requests to accumulate input_ids, position_ids, and sequence lengths
        3. Extends read/write indices for cache management
        4. Builds attention masks if needed (for eager/SDPA implementations)
        5. Converts accumulated lists to tensors and copies them to static storage

        This method also modifies the `position_offset` attribute of each request to track progress and adds a
        temporary token at the end of the requests for which there will a new token.
        zNo requests in batchr   r
   r   )rX   r8   Nr_   )r   rg   ri   r<   )1
ValueErrorrL   rA   rB   rC   rD   r   rE   rF   rG   rH   rI   ri   rp   stateposition_offsetr   tokens_to_processget_seqlens_kmaxvaluesextendappendr   r}   r   extend_read_and_write_indices
request_idhas_new_tokenr   r   r,   tensorre   r8   r   r   rg   r   rn   r   r   r<   r   r   zipr   rr   rq   )r$   rH   rQ   r   r   rg   r   rw   ri   r   r   future_stater   past_lengthquery_length	seqlens_klayer_type_seqlen_k	to_tensorlayer_type_seqlens_kigroup_read_indicesgroup_write_indicess                         r%   prepare_batch_tensorsz+ContinuousBatchingIOs.prepare_batch_tensors  s    !344 	""$#$ !"!"-24::3H3H-I!J!!J.3DJJ4I4I.J"K1"K!#,.) 	 !sBFB[B[B`B`BbcJ
QCcc"'

(=(=">?Qb?
?#()>)>#?@ar@@ . %	8L &&E//Ku667L

00lKI $$4$""c)*:*:*<&=="""a'"!!\1! U445k;3M NO ''(<R(@<(OP #D$5$5| DD 4=??3D h/
/$Z0778LZ8XY[8\_r8rs03D4E4Ej4QSf0g!!*-h
 JJ44  +|Z
 ))%%&:2&>&BC+7.'FTUWFX11%2B2BC"")),7K%	8R ELLDKKP	 ,5Y+?'Y(1:<1H-C-.AJK_A`!!"=C(<$=>5>~5N1c.12226 1E0J0J0L 	,J,QZ[oQpD%%j12MC8L4MN"".$#'#6#6z#B)=)=:DH[:[4#6#6ab		 :=egzS^:_ 	B6A!#6DMN`DaD##A'@-?)@'@$@AFOPcFdD$$Q(B#.A*B(B%BC(+,>(?D""1%),-@)AD##A&		B_ "K"K  d?@s   	T	T
T8	T$	Tpadded_q_sizepadded_kv_cache_sizec                 .   |dkD  xr |dkD  }|r|n| j                   }|r|n| j                  }||z   }t        | j                  d| j	                  d      | j
                  d| j	                  d      | j                  d|dz    | j                  | j                  d| i i i g g | j                  d      }|rUt        | j                  || j                  z
        | _        | j                  |_        || j                  | j                  dz   d t        | j                  j                        D ]  }|r|n| j                  |   }	|r|n| j                  |   }
|j                   j#                  | j$                  |d|	f          |j&                  j#                  | j(                  |d|
f           t+        | j,                  j/                               }t1        |      dkD  ri |_        i |_        i |_        | j,                  j9                         D ]r  \  }}|d|dz    |j4                  |<   | j2                  |   |j2                  |<   | j6                  D|r|n||   }| j6                  |   dd|d|f   |j6                  |<   t ns|d   }| j,                  |   d|dz    |_        | j2                  |   |_        | j6                  4|r|n| j,                  |   |   }| j6                  |   dd|d|f   |_        | j6                  d|_        |j;                         S )a6  Get model keyword arguments for the current batch, eventually padding the query dimension to (padded_q_size)
        and the keys/values dimension to (padded_kv_cache_size). The padding is only useful if we want static shapes,
        like when using cuda graphs AND only activated if both Q and KV are padded.r   Nr
   F)r   r   r   r   r   r   r   r   r   r   r   r   .)rA   rC   r   r   	unsqueezer   rg   r   r   r   r   rn   rD   rE   rF   rG   r   r   rr   r   rq   r2   ri   rp   r   r   r   r   r}   r&   )r$   r   r   use_paddingr   b_sizepadded_kv_sizekwargsr   read_index_sizewrite_index_sizelayer_typesrw   r   r   s                  r%   get_model_kwargsz&ContinuousBatchingIOs.get_model_kwargsz  sT   
 $a'D,@1,D!,$2J2J"-43I3I&)== $nnVe,66q9**6E2<<Q?33LfqjA**..v6**
$  #D$5$5ut?R?R7R SD"&"3"3FFKD%%d&<&<q&@&BC tzz,,- 	VA0;nAWAWXYAZO0;}AXAXYZA[$$T%<%<Q@P@P=P%QR%%d&>&>qBSCSBS?S&TU		V 44499;<{a24F<>F =?F!)-)B)B)H)H)J m%
I3<\vz3J$$Z0262C2CJ2O##J/&&2.9Ny?PE8<8K8KJ8WX[]c^c]cekfkekXk8lF))*5m %QJ#'#<#<Z#H6TU:#VF "&"3"3J"?F"".*54;T;TU_;`ag;h(,(;(;J(GVeVU[V[U[H[(\%&$(F!}}r'   rV   r    N)Fr   r   )!r(   r)   r*   r+   r   r   r,   r8   rX   r1   rR   r	   rK   r>   rN   r3   r   no_gradrL   tupler-   r/   r0   r   r2   r   r   r   r   r   r   r   r   r4   r'   r%   r6   r6   P   s    "g""g !"g 	"g
 [["g "g 
"gH tC
 C
N _dw,w6;jj6G6GwW[w	w, U]]_> > >  >8DellDellAR<S.S(T D
E#sCcDI*M$N 
5<< D .-eD1C,Dd3i,O&P -
 iBt<N7O iBTX iB iBVAc AS AY]^acf^fYg Ar'   r6   c                       e Zd Z	 ddededej                  dej                  deddfdZ	d	ej                  j                  ddfd
Zd	ej                  j                  ddfdZy)HostDeviceIOPairr   r7   r8   r9   r:   r    Nc                 L   t        ||t        j                  d      ||      | _        t        |||||      | _        t        j
                  j                         | _        t        j
                  j                         | _        t        j
                  j                         | _	        y )NrU   )
r6   r,   r8   host_io	device_ior>   Eventh2d_overcompute_overd2h_over)r$   r   r7   r8   r9   r:   s         r%   rR   zHostDeviceIOPair.__init__  su     -UFELL<OQ\^hi.uffkS]^

((*!JJ,,.

((*r'   ry   c                 T    | j                   j                  | j                  |d       y )NT)ry   rz   )r   r   r   r$   ry   s     r%   transfer_inputs_h2dz$HostDeviceIOPair.transfer_inputs_h2d  s     %%dnnVRV%Wr'   c                     t         j                  j                  |      5  | j                  j                  j                  | j                  j                  d       d d d        y # 1 sw Y   y xY w)NTr|   )r,   r>   ry   r   rl   r~   r   r   s     r%   transfer_outputs_d2hz%HostDeviceIOPair.transfer_outputs_d2h  sU    ZZv& 	XLL##))$..*C*CRV)W	X 	X 	Xs   <A%%A.r   )r(   r)   r*   r   r   r,   r8   rX   r1   rR   r>   rN   r   r   r4   r'   r%   r   r     s     +"+ !+ 	+
 [[+ + 
+X%***;*; X XX5::+<+< X Xr'   r   c                      e Zd ZdZ	 ddededej                  dej                  de	ddfd	Z
deej                  eeej                  f   f   fd
Zdee	e	e	ee	   ee	   f   fdZdee   ddfdZdej                  fdZdde	de	deeef   fdZdej                  ddfdZedej                  fd       Zedefd       ZddZdeee   ee	   f   fdZy)ContinuousBatchingAsyncIOsu  A class to handle the inputs and outputs for the asynchronous API. It uses two IO pairs to avoid race conditions
    between the two batches, which means twice as more VRAM is used for static input tensors and CUDA graph. If your GPU
    is large enough or you want to generate long sequences, this is a good trade-off to make.

    Asynchronous batching works by creating two pairs of host - device inputs and ouputs:

                                    inputs
                      ┌──────────┐ ────────► ┌────────────┐
    IO pair object:   │ Host IOs │           │ Device IOs │       (for a CUDA sytem, Host = CPU and Device = GPU)
                      └──────────┘ ◄──────── └────────────┘
                                    outputs

    Each pair is separate from the other. This means that each pairs has its own CUDA graphs set, because CUDA graphs
    need to have static adresses for input tensors. To have a unique set of CUDA graph, we would need to copy the input
    tensors to a third device-side buffer. This could limit the memory cost of CUDA graphs but would slow down the
    forward pass.
    But the CUDA streams orchestrating the transfer from host to device (H2D) and device to host (D2H) are the same for
    both pairs. Same for the compute stream.
    The order of steps in async batching looks like this (for 3 batches of compute):

         │ ┌────┬────┐                  ┌────┬────┐     ┌────┬────┐       ┌────┐          ┌────┐
    CPU  │ │PR 0│PR 1│                  │UP 0│PR 2│     │UP 1│PR 3│       │UP 2│          │UP 3│
         │ └────┼───┬┴──┐               └────┴────┼───┐ └────┴────┼───┐   └────┘          └────┘
    H2D  │      │0->│1->│               ¦         │2->│ ¦         │3->│   ¦               ¦
         │      └───┼───┴───────────┬─────────────┴─┬─┼───────────┴───┼───────────────┐   ¦
    GPU  │          │   COMPUTE 0   │   COMPUTE 1   │█│   COMPUTE 2   │   COMPUTE 3   │   ¦
         │          └───────────────┼───┬───────────┼─┴─┬─────────────┼───┬───────────┼───┤
    D2H  │                          │0<-│           │1<-│             │2<-│           │3<-│
         │                          └───┘           └───┘             └───┘           └───┘

    with: - CPU: actions happening on the CPU (host-side)
          - GPU: actions happening on the GPU (device-side)
          - H2D: host to device transfer
          - D2H: device to host transfer
    and:
          - PR N: preparation of batch N
          - ->N: host to device transfer of batch N
          - COMPUTE N: compute step for batch N
          - <-N: device to host transfer of batch N
          - UP N: update of batch N

    You can see that the GPU is almost always busy, execpt where the █ is.
    Proper ordering of steps is ensured through the use of CUDA events and streams.
    r   r7   r8   r9   r:   r    Nc           
      b   d| _         t        d      D cg c]  }t        |||||       c}| _        t        j
                  j                  |      | _        t        j
                  j                  |      | _        t        j
                  j                  |      | _	        d | j                  d   j                  _	        d | j                  d   j                  _	        d | j                  d   j                  _	        d | j                  d   j                  _	        |j                  | _        y c c}w )Nr   rZ   r?   r
   )current_pairrD   r   io_pairsr,   r>   rN   
h2d_stream
d2h_streamrO   r   r   ra   rP   s          r%   rR   z#ContinuousBatchingAsyncIOs.__init__  s     chijckl^_)%jYl**++6+:**++6+:#jj//v/>26a  /48a""126a  /48a""1 % 6 6 ms   D,c                 d    | j                   | j                     j                  j                         S r   )r   r   r   r   r#   s    r%   r   z1ContinuousBatchingAsyncIOs.get_cumulative_seqlens  s&    }}T../77NNPPr'   c                 d    | j                   | j                     j                  j                         S r   )r   r   r   r   r#   s    r%   r   z-ContinuousBatchingAsyncIOs.get_actual_lengths  s&    }}T../77JJLLr'   rH   c                     | j                   | j                     }|j                  j                  |       |j                  j                  j                  | j                                y r   )r   r   r   r   rh   r~   infer_carry_over_ids)r$   rH   io_pairs      r%   r   z0ContinuousBatchingAsyncIOs.prepare_batch_tensors#  sJ    -- 1 12--.?@&&,,T-F-F-HIr'   c                    | j                   | j                     j                  j                  }| j                   d| j                  z
     j                  j                  }t	        | j
                        D cg c]  }d }}t        |j                               D ]  \  }}|j                  |      }||||<     t        j                  |t        j                        S c c}w )a  Infers the ids of the tokens to carry over from batch N to batch N+1. In asynchronous batching mode, we can
        schedule a request for batch N+1 without knowing the token predicted for that request in batch N. For that
        reason, we might need to carry over tokens just predicted in batch N before launching the forwar pass of batch
        N+1. This method computes the ids of the tokens to carry over.r
   r   )rX   )r   r   r   rI   rD   ra   	enumeraterp   getr,   r   re   )r$   !next_req_id_to_new_token_position!prev_req_id_to_new_token_positionrQ   rh   r   req_idnew_token_positions           r%   r   z/ContinuousBatchingAsyncIOs.infer_carry_over_ids(  s    
 -1MM$:K:K,L,T,T,q,q),0MM!d>O>O:O,P,X,X,u,u)&+D,A,A&BC"CC ##D#I#I#KL 	7IAv!B!F!Fv!N!-5612	7 ||N%++>> Ds   5	Cr   r   c                 6   | j                   | j                     }|j                  | j                         | j                  j	                  |j
                         | j                  j                  |j
                         |j                  j                  ||      S r   )
r   r   r   r   record_eventr   rO   
wait_eventr   r   )r$   r   r   r   s       r%   r   z+ContinuousBatchingAsyncIOs.get_model_kwargs<  st    -- 1 12##DOO4$$W%5%56&&w'7'78  11-AUVVr'   r   c                 f   | j                   d| j                  z
     j                  j                  }| j                   | j                     j                  j                  }||   }|dk7  j                         }|d|j                  d       }|d|j                  d       }||z  |d   d|z
  z  z   |d<   y)a  As explained in the infer_carry_over_ids method, we might need to carry over tokens just predicted in batch N
        before launching the forwar pass of batch N+1. This method performs the carry over, and is recorded in CUDA
        graphs if they are enabled.r
   r   Nr   )r   r   r   rl   rh   r1   r`   )r$   r   prev_output_idsrh   carried_over_idscarried_over_masks         r%   r   z,ContinuousBatchingAsyncIOs.carry_over_tokensC  s    
 --D,=,=(=>HHSSt'8'89CCRR*>:+r1668+,?innQ.?@-.A	q0AB'*;;ilaRcNc>dd	!r'   c                 \    | j                   | j                     j                  j                  S r   )r   r   r   rl   r#   s    r%   rl   z%ContinuousBatchingAsyncIOs.output_idsU  s%     }}T../99DDDr'   c                 \    | j                   | j                     j                  j                  S r   )r   r   r   rJ   r#   s    r%   rJ   z!ContinuousBatchingAsyncIOs.graphsZ  s#    }}T../99@@@r'   c                 r   | j                   | j                     }| j                  j                  |j                         | j
                  j                  |j                         |j                  | j
                         | j
                  j                  |j                         d| j                  z
  | _        y )Nr
   )	r   r   rO   r   r   r   r   r   r   r$   r   s     r%   r   z2ContinuousBatchingAsyncIOs.retrieve_device_outputs_  s    -- 1 12(()=)=>""7#7#78$$T__5$$W%5%56 1 11r'   c                     | j                   | j                     }|j                  j                          |j                  j                         S r   )r   r   r   r   r   r   r   s     r%   r   z/ContinuousBatchingAsyncIOs.prepare_batch_updatek  s;    -- 1 12$$&3355r'   r   r   r   )r(   r)   r*   r+   r   r   r,   r8   rX   r1   rR   r   r-   r/   r0   r   r2   r   r   r   r   r   r   r   propertyrl   r   rJ   r   r   r4   r'   r%   r   r     s{   +f 7"7 !7 	7
 [[7 7 
70QellDellAR<S.S(T QME#sCcDI*M$N MJt<N7O JTX J
?ell ?(Wc WS WY]^acf^fYg We5<< eD e$ EELL E E A A A	26eD1C,Dd3i,O&P 6r'   r   )dataclassesr   	functoolsr   	itertoolsr   typingr   r,    transformers.configuration_utilsr   utils.metricsr	   r   r   requestsr   r   utilsr   r   r   r   r   r6   r   r   r4   r'   r%   <module>r      sg    "     = # & 6 ] ] /
 /
 /
dk k\X X0X6 X6r'   