Skip to content

Commit 65c6944

Browse files
authored
[Docs] Improve V1 KVConnector interface documentation (#19172)
Signed-off-by: Nick Hill <nhill@redhat.com>
1 parent 9487035 commit 65c6944

File tree

2 files changed

+32
-9
lines changed

2 files changed

+32
-9
lines changed

vllm/distributed/kv_transfer/kv_connector/v1/base.py

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,15 @@
88
Scheduler-side: runs in the scheduler, binds metadata, which
99
is used by the worker-side to load/save KV cache.
1010
get_num_new_matched_tokens() - get number of new tokens
11-
that exist in the remote KV cache
11+
that exist in the remote KV cache. Might be called multiple
12+
times for a given request and should be side-effect free.
1213
update_state_after_alloc() - update KVConnector state after
1314
temporary buffer alloc by the CacheManager.
15+
request_finished() - called when a request is finished, with
16+
the computed kv cache blocks for the request.
17+
Returns whether KV cache should be freed now or will be
18+
freed asynchronously and optionally returns KV transfer
19+
params.
1420
1521
Worker-side: runs in each worker, loads/saves KV cache to/from
1622
the Connector based on the metadata.
@@ -19,6 +25,9 @@
1925
2026
save_kv_layer() - starts saving KV for layer i (maybe async)
2127
wait_for_save() - blocks until all saves are done
28+
29+
get_finished() - called with ids of finished requests, returns
30+
ids of requests that have completed async sending/recving.
2231
"""
2332

2433
import enum
@@ -184,7 +193,8 @@ def get_finished(
184193
finished generating tokens.
185194
186195
Returns:
187-
ids of requests that have finished asynchronous transfer,
196+
ids of requests that have finished asynchronous transfer
197+
(requests that previously returned True from request_finished()),
188198
tuple of (sending/saving ids, recving/loading ids).
189199
The finished saves/sends req ids must belong to a set provided in a
190200
call to this method (this call or a prior one).
@@ -215,7 +225,8 @@ def get_num_new_matched_tokens(
215225
- The number of tokens that can be loaded from the
216226
external KV cache beyond what is already computed.
217227
- `True` if external KV cache tokens will be loaded
218-
asynchronously (between scheduler steps).
228+
asynchronously (between scheduler steps). Must be
229+
`False` if the first element is 0.
219230
"""
220231
pass
221232

@@ -225,6 +236,18 @@ def update_state_after_alloc(self, request: "Request",
225236
num_external_tokens: int):
226237
"""
227238
Update KVConnector state after block allocation.
239+
240+
If get_num_new_matched_tokens previously returned True for a
241+
request, this function may be called twice for that same request -
242+
first when blocks are allocated for the connector tokens to be
243+
asynchronously loaded into, and second when any additional blocks
244+
are allocated, after the load/transfer is complete.
245+
246+
Args:
247+
request (Request): the request object.
248+
blocks (KVCacheBlocks): the blocks allocated for the request.
249+
num_external_tokens (int): the number of tokens that will be
250+
loaded from the external KV cache.
228251
"""
229252
pass
230253

vllm/v1/core/sched/scheduler.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ def __init__(
101101
# This is flushed at the end of each scheduling step.
102102
self.finished_req_ids: set[str] = set()
103103

104-
# P/D: requests in process of recving KV transfers
104+
# KV Connector: requests in process of async KV loading or recving
105105
self.finished_recving_kv_req_ids: set[str] = set()
106106

107107
# OPTIMIZATION: Cache the CachedRequestData objects to avoid creating
@@ -822,7 +822,7 @@ def update_from_output(
822822
if not stopped:
823823
new_running.append(request)
824824

825-
# P/D: update state for finished KV Transfers.
825+
# KV Connector: update state for finished KV Transfers.
826826
self._update_from_kv_xfer_finished(model_runner_output)
827827

828828
# Return the cached request data to the queue so they can be reused.
@@ -969,7 +969,7 @@ def shutdown(self) -> None:
969969
self.kv_event_publisher.shutdown()
970970

971971
########################################################################
972-
# P/D Related Methods
972+
# KV Connector Related Methods
973973
########################################################################
974974

975975
def get_kv_connector(self) -> Optional[KVConnectorBase_V1]:
@@ -992,7 +992,7 @@ def _connector_finished(
992992

993993
def _update_waiting_for_remote_kv(self, request: Request) -> bool:
994994
"""
995-
P/D: check if the request_id is finished_recving.
995+
KV Connector: check if the request_id is finished_recving.
996996
997997
The finished_recving_kv_req_ids set is populated
998998
on the previous step()'s update_from_output based
@@ -1029,15 +1029,15 @@ def _update_waiting_for_remote_kv(self, request: Request) -> bool:
10291029
def _update_from_kv_xfer_finished(self,
10301030
model_runner_output: ModelRunnerOutput):
10311031
"""
1032-
P/D: update the scheduler state based on the output.
1032+
KV Connector: update the scheduler state based on the output.
10331033
10341034
The Worker side connectors add finished_recving and
10351035
finished_sending reqs to the output.
10361036
* if finished_sending: free the blocks
10371037
* if finished_recving: add to state so we can
10381038
schedule the request during the next step.
10391039
"""
1040-
# P/D: update recv and send status from last step.
1040+
# KV Connector: update recv and send status from last step.
10411041
for req_id in (model_runner_output.finished_recving or ()):
10421042
logger.debug("Finished recving KV transfer for request %s", req_id)
10431043
self.finished_recving_kv_req_ids.add(req_id)

0 commit comments

Comments
 (0)