14
14
15
15
16
16
class EncoderCacheManager :
17
+ """Manages caching of encoder outputs for multimodal models in vLLM V1.
18
+
19
+ The EncoderCacheManager handles the lifecycle of multimodal encoder outputs
20
+ (such as vision embeddings from images) during request processing. It
21
+ provides memory-aware caching to avoid recomputing encoder outputs when the
22
+ same multimodal inputs appear in different stages of request processing.
23
+
24
+ This manager is particularly important for:
25
+ - Vision-language models (e.g., LLaVA) where image encoder outputs are
26
+ cached
27
+ - Any multimodal model where encoder computation is expensive and
28
+ cacheable
29
+
30
+ The cache operates at the granularity of individual multimodal input items
31
+ within requests, allowing for fine-grained memory management and enabling
32
+ chunked processing of multimodal inputs.
33
+
34
+ Note that no caching is shared between requests at this time. If the same
35
+ input is used across multiple requests, it will be reprocessed for each
36
+ request.
37
+
38
+ Args:
39
+ cache_size: Limit the size of the cache, measured by the number of
40
+ tokens from the input sequence.
41
+
42
+ Attributes:
43
+ cache_size: Total cache capacity in encoder tokens
44
+ num_free_slots: Current available cache capacity in encoder tokens
45
+ cached: Mapping from request_id to set of cached input_ids for that
46
+ request
47
+ freed: List of (request_id, input_id) pairs that were recently freed.
48
+ This is cleared after every call to get_freed_ids().
49
+ """
17
50
18
51
def __init__ (self , cache_size : int ):
19
52
self .cache_size = cache_size
@@ -24,25 +57,79 @@ def __init__(self, cache_size: int):
24
57
self .freed : list [tuple [str , int ]] = []
25
58
26
59
def has_cache(self, request: Request, input_id: int) -> bool:
    """Report whether ``input_id`` is recorded as cached for ``request``.

    Args:
        request: Request whose ``request_id`` keys the lookup in
            ``self.cached``.
        input_id: Identifier of the input to look for under that request.

    Returns:
        True when ``self.cached`` has an entry for the request's id and
        that entry contains ``input_id``; False otherwise.
    """
    # An unknown request id falls back to an empty tuple, so the
    # membership test is simply False without touching ``self.cached``.
    return input_id in self.cached.get(request.request_id, ())
29
71
30
72
def can_allocate(self, request: Request, input_id: int) -> bool:
    """Tell whether the free capacity can hold one input of ``request``.

    Args:
        request: Request that reports the input's size through
            ``get_num_encoder_tokens``.
        input_id: Identifier of the input whose size is queried.

    Returns:
        True when ``self.num_free_slots`` is at least the number of
        encoder tokens reported for ``input_id``; False otherwise.
    """
    required = request.get_num_encoder_tokens(input_id)
    available = self.num_free_slots
    return available >= required
33
85
34
86
def allocate(self, request: Request, input_id: int) -> None:
    """Record ``input_id`` as cached for ``request`` and charge its size.

    Only bookkeeping happens here: the id is added to the request's set
    in ``self.cached`` and ``self.num_free_slots`` is lowered by the
    number of encoder tokens the request reports for that input.

    Args:
        request: Request owning the input; supplies ``request_id`` and
            ``get_num_encoder_tokens``.
        input_id: Identifier of the input being allocated.

    Note:
        No capacity check is made here. Callers are expected to have
        confirmed the allocation fits (e.g. via ``can_allocate``).
    """
    # Create the per-request set on first use, then register the input.
    self.cached.setdefault(request.request_id, set()).add(input_id)
    self.num_free_slots -= request.get_num_encoder_tokens(input_id)
40
107
41
108
def get_cached_input_ids(self, request: Request) -> set[int]:
    """Look up the input ids currently recorded for ``request``.

    Args:
        request: Request whose ``request_id`` keys the lookup.

    Returns:
        The set stored in ``self.cached`` for this request. This is the
        live object, not a copy, so callers that mutate the cache while
        iterating should snapshot it first. A new empty set is returned
        when the request has no entry; nothing is inserted into
        ``self.cached`` in that case.
    """
    stored_ids = self.cached.get(request.request_id)
    if stored_ids is None:
        return set()
    return stored_ids
43
119
44
120
def free_encoder_input (self , request : Request , input_id : int ) -> None :
45
- """Free a single encoder input id for the request."""
121
+ """Free cache space for a single multimodal input's encoder output.
122
+
123
+ This method is called when:
124
+ - The encoder output has been fully consumed by the decoder and is
125
+ no longer needed (e.g., in vision-language models after image
126
+ tokens are processed)
127
+ - A request is being cancelled or aborted
128
+
129
+ Args:
130
+ request: The request containing the multimodal input
131
+ input_id: Index of the multimodal input to free from cache
132
+ """
46
133
req_id = request .request_id
47
134
if req_id not in self .cached :
48
135
return
@@ -54,12 +141,29 @@ def free_encoder_input(self, request: Request, input_id: int) -> None:
54
141
self .freed .append ((req_id , input_id ))
55
142
56
143
def free(self, request: Request) -> None:
    """Release every input id currently recorded for ``request``.

    Calls ``free_encoder_input`` once for each id returned by
    ``get_cached_input_ids``.

    Args:
        request: Request whose recorded inputs should all be released.
    """
    # ``get_cached_input_ids`` can hand back the live set; iterate over a
    # snapshot because free_encoder_input presumably mutates that set
    # (its body is not visible here -- confirm).
    snapshot = set(self.get_cached_input_ids(request))
    for cached_id in snapshot:
        self.free_encoder_input(request, cached_id)
61
155
62
156
def get_freed_ids(self) -> list[tuple[str, int]]:
    """Hand over the accumulated freed entries and start a fresh list.

    Returns:
        The list object previously held in ``self.freed``, containing
        the ``(request_id, input_id)`` pairs appended since the last
        call. ``self.freed`` is rebound to a new empty list, so the
        returned list is not modified by later frees.
    """
    drained, self.freed = self.freed, []
    return drained
0 commit comments