@@ -11,7 +11,6 @@
 
 import vllm.envs as envs
 from vllm.config import VllmConfig
-from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment,
                               set_custom_all_reduce)
@@ -79,6 +78,8 @@ def __init__(
         self.profiler = None
 
     def sleep(self, level: int = 1) -> None:
+        from vllm.device_allocator.cumem import CuMemAllocator
+
         free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
 
         # Save the buffers before level 2 sleep
@@ -101,6 +102,8 @@ def sleep(self, level: int = 1) -> None:
                     used_bytes / GiB_bytes)
 
     def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        from vllm.device_allocator.cumem import CuMemAllocator
+
         allocator = CuMemAllocator.get_instance()
         allocator.wake_up(tags)
 
@@ -174,6 +177,8 @@ def init_device(self):
     # to hijack tensor allocation.
     def load_model(self) -> None:
         if self.vllm_config.model_config.enable_sleep_mode:
+            from vllm.device_allocator.cumem import CuMemAllocator
+
             allocator = CuMemAllocator.get_instance()
             assert allocator.get_current_usage() == 0, (
                 "Sleep mode can only be "
@@ -241,7 +246,10 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
 
     def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
         """Allocate GPU KV cache with the specified kv_cache_config."""
+
         if self.vllm_config.model_config.enable_sleep_mode:
+            from vllm.device_allocator.cumem import CuMemAllocator
+
             allocator = CuMemAllocator.get_instance()
             context = allocator.use_memory_pool(tag="kv_cache")
         else:
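
For readers skimming the diff: the commit removes the module-level CuMemAllocator import and re-imports it inside sleep, wake_up, load_model, and initialize_from_config, so vllm.device_allocator.cumem is presumably only loaded when sleep mode is actually exercised. A minimal, self-contained sketch of this deferred-import pattern, illustrative only; the stdlib decimal module stands in for the heavy dependency:

import sys


def sleep() -> None:
    # Deferred import: resolved on the first call, not when this module is
    # imported. Later calls are nearly free because the import statement
    # consults the sys.modules cache before loading anything.
    from decimal import Decimal  # stand-in for CuMemAllocator
    _ = Decimal(0)


print("decimal" in sys.modules)  # typically False: nothing loaded yet
sleep()
print("decimal" in sys.modules)  # True: loaded lazily on first use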