
Commit 5f52a84

[V1] Add API docs for EncoderCacheManager (#19294)
Signed-off-by: Russell Bryant <rbryant@redhat.com>
1 parent d4629dc commit 5f52a84


vllm/v1/core/encoder_cache_manager.py

Lines changed: 106 additions & 2 deletions
@@ -14,6 +14,39 @@
 
 
 class EncoderCacheManager:
+    """Manages caching of encoder outputs for multimodal models in vLLM V1.
+
+    The EncoderCacheManager handles the lifecycle of multimodal encoder outputs
+    (such as vision embeddings from images) during request processing. It
+    provides memory-aware caching to avoid recomputing encoder outputs when the
+    same multimodal inputs appear in different stages of request processing.
+
+    This manager is particularly important for:
+    - Vision-language models (e.g., LLaVA) where image encoder outputs are
+      cached
+    - Any multimodal model where encoder computation is expensive and
+      cacheable
+
+    The cache operates at the granularity of individual multimodal input items
+    within requests, allowing for fine-grained memory management and enabling
+    chunked processing of multimodal inputs.
+
+    Note that no caching is shared between requests at this time. If the same
+    input is used across multiple requests, it will be reprocessed for each
+    request.
+
+    Args:
+        cache_size: Limit the size of the cache, measured by the number of
+            tokens from the input sequence.
+
+    Attributes:
+        cache_size: Total cache capacity in encoder tokens
+        num_free_slots: Current available cache capacity in encoder tokens
+        cached: Mapping from request_id to set of cached input_ids for that
+            request
+        freed: List of (request_id, input_id) pairs that were recently freed.
+            This is cleared after every call to get_freed_ids().
+    """
 
     def __init__(self, cache_size: int):
         self.cache_size = cache_size
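
The docstring above describes a token-budget cache: cache_size and num_free_slots are measured in encoder tokens, not in cache entries. The manager only reads request_id and get_num_encoder_tokens() from a request, so a tiny stand-in object is enough to illustrate the accounting. A minimal sketch follows (not part of this commit; StubRequest and the 2048/576 token counts are made up for illustration):

    # StubRequest is a hypothetical stand-in for the real Request class;
    # the manager only uses .request_id and .get_num_encoder_tokens().
    from vllm.v1.core.encoder_cache_manager import EncoderCacheManager

    class StubRequest:
        def __init__(self, request_id: str, encoder_tokens: list[int]):
            self.request_id = request_id
            # Illustrative fixture: encoder-token count per multimodal input.
            self._encoder_tokens = encoder_tokens

        def get_num_encoder_tokens(self, input_id: int) -> int:
            return self._encoder_tokens[input_id]

    manager = EncoderCacheManager(cache_size=2048)    # budget in encoder tokens
    req = StubRequest("req-0", encoder_tokens=[576])  # one image input
    assert manager.num_free_slots == 2048             # nothing allocated yet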
@@ -24,25 +57,79 @@ def __init__(self, cache_size: int):
         self.freed: list[tuple[str, int]] = []
 
     def has_cache(self, request: Request, input_id: int) -> bool:
+        """Check if encoder output for a specific multimodal input is cached.
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input within the request
+
+        Returns:
+            True if the encoder output for this input is already cached
+        """
         req_id = request.request_id
         return req_id in self.cached and input_id in self.cached[req_id]
 
     def can_allocate(self, request: Request, input_id: int) -> bool:
+        """Check if there's sufficient cache space for a multimodal input.
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input within the request
+
+        Returns:
+            True if there's enough free cache space to store the encoder output
+            for this multimodal input
+        """
         num_tokens = request.get_num_encoder_tokens(input_id)
         return num_tokens <= self.num_free_slots
 
     def allocate(self, request: Request, input_id: int) -> None:
+        """Allocate cache space for a multimodal input's encoder output.
+
+        This method reserves cache space for storing the encoder output of
+        the specified multimodal input. The actual encoder output storage
+        happens in the model runner, but this method ensures the cache
+        manager tracks the allocation.
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input within the request
+
+        Note:
+            This method assumes can_allocate() returned True for the same
+            request and input_id. It will reduce available cache space.
+        """
         req_id = request.request_id
         if req_id not in self.cached:
             self.cached[req_id] = set()
         self.cached[req_id].add(input_id)
         self.num_free_slots -= request.get_num_encoder_tokens(input_id)
 
     def get_cached_input_ids(self, request: Request) -> set[int]:
+        """Get all cached multimodal input IDs for a request.
+
+        Args:
+            request: The request to query
+
+        Returns:
+            Set of input_ids that have cached encoder outputs for this request.
+            Returns empty set if no inputs are cached for this request.
+        """
         return self.cached.get(request.request_id, set())
 
     def free_encoder_input(self, request: Request, input_id: int) -> None:
-        """Free a single encoder input id for the request."""
+        """Free cache space for a single multimodal input's encoder output.
+
+        This method is called when:
+        - The encoder output has been fully consumed by the decoder and is
+          no longer needed (e.g., in vision-language models after image
+          tokens are processed)
+        - A request is being cancelled or aborted
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input to free from cache
+        """
         req_id = request.request_id
         if req_id not in self.cached:
             return
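
has_cache(), can_allocate(), and allocate() together form the check-then-reserve step these docstrings describe: the scheduler reserves encoder-token slots, and the model runner later stores the actual output. Continuing the sketch from above:

    # Continuing the sketch: reserve slots before running the encoder.
    if not manager.has_cache(req, input_id=0):
        if manager.can_allocate(req, input_id=0):
            manager.allocate(req, input_id=0)
            # ... model runner computes and stores the encoder output ...
        # else: defer this input until enough slots are freed

    assert manager.get_cached_input_ids(req) == {0}
    assert manager.num_free_slots == 2048 - 576  # allocate() charged 576 tokens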
@@ -54,12 +141,29 @@ def free_encoder_input(self, request: Request, input_id: int) -> None:
         self.freed.append((req_id, input_id))
 
     def free(self, request: Request) -> None:
-        """Free all cached input ids for the request."""
+        """Free all cached encoder outputs for a request.
+
+        This method is typically called when a request is finished, cancelled,
+        or aborted, and all its encoder outputs should be freed from cache.
+
+        Args:
+            request: The request whose encoder outputs should be freed
+        """
         input_ids = self.get_cached_input_ids(request).copy()
         for input_id in input_ids:
             self.free_encoder_input(request, input_id)
 
     def get_freed_ids(self) -> list[tuple[str, int]]:
+        """Get and clear the list of recently freed encoder cache entries.
+
+        This method returns all encoder cache entries that were freed since
+        the last call to this method. It's used by the scheduler to notify
+        workers about which encoder outputs can be removed from their caches.
+
+        Returns:
+            List of (request_id, input_id) tuples that were freed since the
+            last call. The internal freed list is cleared after this call.
+        """
         freed = self.freed
         self.freed = []
         return freed
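
The free path mirrors allocation: free() releases every cached input of a finished or aborted request, and get_freed_ids() drains the freed list exactly once so the scheduler can tell workers which entries to drop. Finishing the sketch (this assumes freeing returns an input's tokens to num_free_slots, as the free_encoder_input() docstring implies):

    # Finishing the sketch: release the request's entries, then drain the
    # freed list once, as a scheduler would when notifying workers.
    manager.free(req)                             # frees input 0 of "req-0"
    assert manager.num_free_slots == 2048         # capacity back in the pool
    assert manager.get_freed_ids() == [("req-0", 0)]
    assert manager.get_freed_ids() == []          # list is cleared each call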
