@@ -11,7 +11,6 @@
 
 import vllm.envs as envs
 from vllm.config import VllmConfig
-from vllm.device_allocator.cumem import CuMemAllocator
 from vllm.distributed import (ensure_model_parallel_initialized,
                               init_distributed_environment,
                               set_custom_all_reduce)
@@ -79,6 +78,8 @@ def __init__(
         self.profiler = None
 
     def sleep(self, level: int = 1) -> None:
+        from vllm.device_allocator.cumem import CuMemAllocator
+
         free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
 
         # Save the buffers before level 2 sleep
@@ -101,6 +102,8 @@ def sleep(self, level: int = 1) -> None:
                     used_bytes / GiB_bytes)
 
     def wake_up(self, tags: Optional[list[str]] = None) -> None:
+        from vllm.device_allocator.cumem import CuMemAllocator
+
         allocator = CuMemAllocator.get_instance()
         allocator.wake_up(tags)
 
@@ -174,6 +177,8 @@ def init_device(self):
     # to hijack tensor allocation.
     def load_model(self) -> None:
         if self.vllm_config.model_config.enable_sleep_mode:
+            from vllm.device_allocator.cumem import CuMemAllocator
+
             allocator = CuMemAllocator.get_instance()
             assert allocator.get_current_usage() == 0, (
                 "Sleep mode can only be "
@@ -241,7 +246,10 @@ def get_kv_cache_spec(self) -> dict[str, KVCacheSpec]:
 
     def initialize_from_config(self, kv_cache_config: KVCacheConfig) -> None:
         """Allocate GPU KV cache with the specified kv_cache_config."""
+
         if self.vllm_config.model_config.enable_sleep_mode:
+            from vllm.device_allocator.cumem import CuMemAllocator
+
             allocator = CuMemAllocator.get_instance()
             context = allocator.use_memory_pool(tag="kv_cache")
         else:
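
For readers skimming the diff: the commit removes the module-level CuMemAllocator import and re-imports it inside sleep, wake_up, load_model, and initialize_from_config, so vllm.device_allocator.cumem is presumably only loaded when sleep mode is actually exercised. A minimal, self-contained sketch of this deferred-import pattern, illustrative only; the stdlib decimal module stands in for the heavy dependency:

import sys


def sleep() -> None:
    # Deferred import: resolved on the first call, not when this module is
    # imported. Later calls are nearly free because the import statement
    # consults the sys.modules cache before loading anything.
    from decimal import Decimal  # stand-in for CuMemAllocator
    _ = Decimal(0)


print("decimal" in sys.modules)  # typically False: nothing loaded yet
sleep()
print("decimal" in sys.modules)  # True: loaded lazily on first use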