[V1] Add API docs for EncoderCacheManager (#19294)

russellb · web-flow · commit 5f52a846850a · 2025-06-18T13:37:01.000+08:00
Signed-off-by: Russell Bryant <rbryant@redhat.com>
diff --git a/vllm/v1/core/encoder_cache_manager.py b/vllm/v1/core/encoder_cache_manager.py
@@ -14,6 +14,39 @@
 
 
 class EncoderCacheManager:
+    """Manages caching of encoder outputs for multimodal models in vLLM V1.
+
+    The EncoderCacheManager handles the lifecycle of multimodal encoder outputs
+    (such as vision embeddings from images) during request processing. It
+    provides memory-aware caching to avoid recomputing encoder outputs when the
+    same multimodal inputs appear in different stages of request processing.
+
+    This manager is particularly important for:
+    - Vision-language models (e.g., LLaVA) where image encoder outputs are
+      cached
+    - Any multimodal model where encoder computation is expensive and
+      cacheable
+
+    The cache operates at the granularity of individual multimodal input items
+    within requests, allowing for fine-grained memory management and enabling
+    chunked processing of multimodal inputs.
+
+    Note that no caching is shared between requests at this time. If the same
+    input is used across multiple requests, it will be reprocessed for each
+    request.
+
+    Args:
+        cache_size: Limit the size of the cache, measured by the number of
+                    encoder tokens (see Request.get_num_encoder_tokens()).
+
+    Attributes:
+        cache_size: Total cache capacity in encoder tokens
+        num_free_slots: Current available cache capacity in encoder tokens
+        cached: Mapping from request_id to set of cached input_ids for that
+                request
+        freed: List of (request_id, input_id) pairs that were recently freed.
+               This is cleared after every call to get_freed_ids().
+    """
 
     def __init__(self, cache_size: int):
         self.cache_size = cache_size
@@ -24,25 +57,79 @@ def __init__(self, cache_size: int):
         self.freed: list[tuple[str, int]] = []
 
     def has_cache(self, request: Request, input_id: int) -> bool:
+        """Check if encoder output for a specific multimodal input is cached.
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input within the request
+
+        Returns:
+            True if the encoder output for this input is already cached
+        """
         req_id = request.request_id
         return req_id in self.cached and input_id in self.cached[req_id]
 
     def can_allocate(self, request: Request, input_id: int) -> bool:
+        """Check if there's sufficient cache space for a multimodal input.
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input within the request
+
+        Returns:
+            True if there's enough free cache space to store the encoder output
+            for this multimodal input
+        """
         num_tokens = request.get_num_encoder_tokens(input_id)
         return num_tokens <= self.num_free_slots
 
     def allocate(self, request: Request, input_id: int) -> None:
+        """Allocate cache space for a multimodal input's encoder output.
+
+        This method reserves cache space for storing the encoder output of
+        the specified multimodal input. The actual encoder output storage
+        happens in the model runner, but this method ensures the cache
+        manager tracks the allocation.
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input within the request
+
+        Note:
+            This method assumes can_allocate() returned True for the same
+            request and input_id. It will reduce available cache space.
+        """
         req_id = request.request_id
         if req_id not in self.cached:
             self.cached[req_id] = set()
         self.cached[req_id].add(input_id)
         self.num_free_slots -= request.get_num_encoder_tokens(input_id)
 
     def get_cached_input_ids(self, request: Request) -> set[int]:
+        """Get all cached multimodal input IDs for a request.
+
+        Args:
+            request: The request to query
+
+        Returns:
+            The tracked set of cached input_ids for this request (the live
+            set, not a copy), or a new empty set if the request has no entry.
+        """
         return self.cached.get(request.request_id, set())
 
     def free_encoder_input(self, request: Request, input_id: int) -> None:
-        """Free a single encoder input id for the request."""
+        """Free cache space for a single multimodal input's encoder output.
+
+        This method is called when:
+        - The encoder output has been fully consumed by the decoder and is
+          no longer needed (e.g., in vision-language models after image
+          tokens are processed)
+        - A request is being cancelled or aborted
+
+        Args:
+            request: The request containing the multimodal input
+            input_id: Index of the multimodal input to free from cache
+        """
         req_id = request.request_id
         if req_id not in self.cached:
             return
@@ -54,12 +141,29 @@ def free_encoder_input(self, request: Request, input_id: int) -> None:
         self.freed.append((req_id, input_id))
 
     def free(self, request: Request) -> None:
-        """Free all cached input ids for the request."""
+        """Free all cached encoder outputs for a request.
+
+        This method is typically called when a request is finished, cancelled,
+        or aborted, and all its encoder outputs should be freed from cache.
+
+        Args:
+            request: The request whose encoder outputs should be freed
+        """
         input_ids = self.get_cached_input_ids(request).copy()
         for input_id in input_ids:
             self.free_encoder_input(request, input_id)
 
     def get_freed_ids(self) -> list[tuple[str, int]]:
+        """Get and clear the list of recently freed encoder cache entries.
+
+        This method returns all encoder cache entries that were freed since
+        the last call to this method. It's used by the scheduler to notify
+        workers about which encoder outputs can be removed from their caches.
+
+        Returns:
+            List of (request_id, input_id) tuples that were freed since the
+            last call. The internal freed list is cleared after this call.
+        """
         freed = self.freed
         self.freed = []
         return freed