@@ -345,32 +345,38 @@ def schedule(self) -> SchedulerOutput:
345
345
skipped_waiting_requests .appendleft (request )
346
346
continue
347
347
348
+ num_external_computed_tokens = 0
349
+ load_kv_async = False
350
+
348
351
# Get already-cached tokens.
349
352
if num_prealloc_computed_tokens == 0 :
350
353
new_computed_blocks , num_native_computed_tokens = \
351
354
self .kv_cache_manager .get_computed_blocks (
352
355
request )
356
+
357
+ # Get externally-cached tokens if using a KVConnector.
358
+ if self .connector is not None :
359
+ num_external_computed_tokens , load_kv_async = (
360
+ self .connector .get_num_new_matched_tokens (
361
+ request , num_native_computed_tokens ))
362
+
363
+ # Total computed tokens (local + external).
364
+ num_computed_tokens = (num_native_computed_tokens +
365
+ num_external_computed_tokens )
353
366
else :
354
367
# P/D: skip checking prefix cache if loaded from remote kvs.
355
368
new_computed_blocks = KVCacheBlocks .create_empty ()
356
369
num_native_computed_tokens = 0
357
370
358
- # Get externally-cached tokens if using a KVConnector.
359
- num_external_computed_tokens , load_kv_async = (
360
- (0 , False ) if self .connector is None else
361
- self .connector .get_num_new_matched_tokens (
362
- request , num_native_computed_tokens ))
363
-
364
- # Total computed tokens (local + external).
365
- num_computed_tokens = (num_native_computed_tokens +
366
- num_external_computed_tokens +
367
- num_prealloc_computed_tokens )
371
+ # Total computed tokens (allocated in prior step).
372
+ num_computed_tokens = num_prealloc_computed_tokens
368
373
369
374
encoder_inputs_to_schedule = None
370
375
new_encoder_budget = encoder_budget
371
376
372
377
# P/D: loading remote KV, do not allocate for new work.
373
378
if load_kv_async :
379
+ assert num_external_computed_tokens > 0
374
380
num_new_tokens = 0
375
381
# Number of tokens to be scheduled.
376
382
else :
@@ -411,7 +417,8 @@ def schedule(self) -> SchedulerOutput:
411
417
# KVConnector: update internal state after allocation.
412
418
# This information is used to determine if a load is
413
419
# needed for this request.
414
- if self .connector is not None :
420
+ if num_external_computed_tokens :
421
+ assert self .connector is not None
415
422
self .connector .update_state_after_alloc (
416
423
request ,
417
424
new_computed_blocks + new_blocks ,
0 commit comments