@@ -454,6 +454,22 @@ def __init__(self, vllm_config: "VllmConfig",
         else:
             logger.warning(f"Still {len(clusters)} clusters to link")
 
+        # LLMDataDist will deallocate the cache buffer either when the cache
+        # buffer's Python object goes out of scope or when deallocate_cache() is
+        # explicitly called. This can lead to accuracy issues if the cache
+        # buffer is deallocated while still being used in the NPU stream. To
+        # prevent this, we maintain a reference to the cache buffer until the
+        # next round, ensuring it is not prematurely deallocated.
+        self.kv_buffers: List = []
+
+    def _detach_kv_buffers(self):
+        for kv_buffer in self.kv_buffers:
+            self.llm_datadist_engine.kv_transfer.deallocate_cache(kv_buffer)
+        self.kv_buffers.clear()
+
+    def _attach_kv_buffer(self, kv_buffer: torch.Tensor):
+        self.kv_buffers.append(kv_buffer)
+
     def start_load_kv(self, forward_context: "ForwardContext",
                       **kwargs) -> None:
         """
@@ -477,6 +493,9 @@ def start_load_kv(self, forward_context: "ForwardContext",
             # In the prefilling node, do not need to load KV cache.
             return
 
+        # Release the KV cache buffer from the previous round
+        self._detach_kv_buffers()
+
         # Get the metadata
         metadata = self._get_connector_metadata()
         assert isinstance(metadata, LLMDataDistConnectorV1Metadata)
@@ -558,6 +577,7 @@ def start_load_kv(self, forward_context: "ForwardContext",
         kv_hidden_dtype = kv_cache_layers[0].dtype
         kv_buffer, pulled_kv_caches = self._create_cache_tensors(
             self.num_layers, kv_cache_shape, kv_hidden_dtype)
+        self._attach_kv_buffer(kv_buffer)
 
         target_tp_rank = self.tp_rank % min(
             self.cluster_info.prefill_tp_size,
@@ -590,9 +610,6 @@ def start_load_kv(self, forward_context: "ForwardContext",
             self._inject_kv_into_layer(kv_cache_layer, pulled_kv_cache,
                                        req_slot_mapping, is_mla)
 
-        # Release the reference count
-        self.llm_datadist_engine.kv_transfer.deallocate_cache(kv_buffer)
-
     def wait_for_layer_load(self, layer_name: str) -> None:
         """
         Block until the KV for a specific layer is loaded into vLLM's
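For context, here is a minimal, self-contained sketch of the buffer-lifetime pattern this diff introduces. The Engine and Buffer classes are hypothetical stand-ins for the llm_datadist kv_transfer API (allocate_cache below is an assumed name; only deallocate_cache appears in the diff). What the sketch mirrors is the lifecycle: buffers allocated in round N are retained until the start of round N+1, and only then passed to deallocate_cache(), so host-side deallocation cannot race with in-flight NPU-stream reads.

    # Sketch of the deferred-deallocation pattern, under the assumptions above.
    from typing import List


    class Buffer:
        """Hypothetical stand-in for an LLMDataDist cache buffer."""

        def __init__(self, name: str):
            self.name = name


    class Engine:
        """Hypothetical stand-in for llm_datadist_engine.kv_transfer."""

        def allocate_cache(self, name: str) -> Buffer:
            return Buffer(name)

        def deallocate_cache(self, buffer: Buffer) -> None:
            print(f"freed {buffer.name}")


    class Connector:
        def __init__(self, engine: Engine):
            self.engine = engine
            # Keep buffers alive until the next round so the device stream
            # can finish reading them before they are freed.
            self.kv_buffers: List[Buffer] = []

        def _detach_kv_buffers(self) -> None:
            for kv_buffer in self.kv_buffers:
                self.engine.deallocate_cache(kv_buffer)
            self.kv_buffers.clear()

        def _attach_kv_buffer(self, kv_buffer: Buffer) -> None:
            self.kv_buffers.append(kv_buffer)

        def start_load_kv(self, round_id: int) -> None:
            # Free the buffers from the previous round first ...
            self._detach_kv_buffers()
            # ... then allocate and retain the buffer for this round.
            buf = self.engine.allocate_cache(f"kv-round-{round_id}")
            self._attach_kv_buffer(buf)


    connector = Connector(Engine())
    connector.start_load_kv(0)  # allocates kv-round-0, frees nothing
    connector.start_load_kv(1)  # frees kv-round-0, allocates kv-round-1

The trade-off is deliberate: holding each buffer for one extra round slightly raises peak buffer memory, but it removes the accuracy hazard described in the diff's comment, where deallocating a buffer that the NPU stream is still consuming could corrupt the pulled KV cache.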