@@ -190,7 +190,6 @@ def allocate_slots(
         num_new_tokens: int,
         num_new_computed_tokens: int = 0,
         new_computed_blocks: Optional[KVCacheBlocks] = None,
-        num_draft_tokens: int = 0,
         num_lookahead_tokens: int = 0,
         delay_cache_blocks: bool = False,
     ) -> Optional[KVCacheBlocks]:
@@ -286,12 +285,17 @@ def allocate_slots(
         if not self.enable_caching or delay_cache_blocks:
             return KVCacheBlocks(new_blocks)
 
-        # Speculated tokens might be rejected in the future, so we does
-        # not cache any speculated tokens. We only cache blocks with
-        # generated (accepted) tokens.
+        # NOTE(woosuk): We want to commit (cache) up to num_computed_tokens +
+        # num_new_tokens, but must exclude "non-committable" tokens (e.g.,
+        # draft tokens that could be rejected). Therefore, we cap the number
+        # at `request.num_tokens`, ensuring only "finalized" tokens are cached.
+        num_tokens_to_cache = min(num_computed_tokens + num_new_tokens,
+                                  request.num_tokens)
         self.coordinator.cache_blocks(
-            request, self.req_to_block_hashes[request.request_id],
-            num_computed_tokens + num_new_tokens - num_draft_tokens)
+            request,
+            self.req_to_block_hashes[request.request_id],
+            num_tokens_to_cache,
+        )
 
         return KVCacheBlocks(new_blocks)
 
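For context, a minimal standalone sketch of the capping rule introduced above. This is not vLLM code; `num_tokens_to_cache` is written here as a free function, and `num_finalized_tokens` is a hypothetical stand-in for `request.num_tokens` (prompt plus accepted output tokens, excluding in-flight draft tokens).

# Minimal sketch (illustrative only, not vLLM code) of the new caching cap.
def num_tokens_to_cache(num_computed_tokens: int, num_new_tokens: int,
                        num_finalized_tokens: int) -> int:
    # Cache up to the end of the newly scheduled range, but never past the
    # last finalized token, so speculative draft tokens are never committed.
    return min(num_computed_tokens + num_new_tokens, num_finalized_tokens)


if __name__ == "__main__":
    # 8 tokens already computed, 3 scheduled this step, but only 10 are
    # finalized: the 11th (a draft that may be rejected) is not cached.
    assert num_tokens_to_cache(8, 3, 10) == 10
    # No draft tokens in flight: the whole scheduled range is cached.
    assert num_tokens_to_cache(8, 2, 10) == 10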