
Commit baa4f9e

replace torch.cuda.empty_cache with current_platform.empty_cache
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
Parent: f6e858d

5 files changed, 5 insertions(+), 5 deletions(-)
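The commit swaps the CUDA-specific torch.cuda.empty_cache() call sites for vLLM's platform abstraction, so non-NVIDIA backends (for example XPU or ROCm) can supply their own cache-clearing routine. A minimal sketch of the dispatch idea; the Platform, CudaPlatform, XpuPlatform, and _detect_platform names below are illustrative stand-ins, not vLLM's actual class hierarchy:

import torch

class Platform:
    """Illustrative base: backends without a caching allocator do nothing."""
    def empty_cache(self) -> None:
        pass

class CudaPlatform(Platform):
    def empty_cache(self) -> None:
        torch.cuda.empty_cache()

class XpuPlatform(Platform):
    def empty_cache(self) -> None:
        # Requires a PyTorch build with XPU (Intel GPU) support.
        torch.xpu.empty_cache()

def _detect_platform() -> Platform:
    if torch.cuda.is_available():
        return CudaPlatform()
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return XpuPlatform()
    return Platform()

# Illustrative only; vLLM's real object is `from vllm.platforms import current_platform`.
current_platform = _detect_platform()
current_platform.empty_cache()  # call sites stay device-agnostic

In the diffs below, each torch.cuda.empty_cache() call becomes current_platform.empty_cache(), which on NVIDIA GPUs still resolves to the same PyTorch call.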


vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 1 addition & 1 deletion
@@ -283,7 +283,7 @@ def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
                 and (weight.stride(-2) * weight.element_size()) % 512 == 0):
             num_pad = 256 // weight.element_size()
             weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
-            torch.cuda.empty_cache()
+            current_platform.empty_cache()
         return weight
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
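For context on the surrounding code: _maybe_pad_weight pads the last dimension and immediately slices the padding back off, leaving a view whose rows sit 256 bytes further apart; the original, unpadded storage is then garbage, and clearing the allocator cache releases it back to the device. A rough illustration of the stride effect (shape and dtype chosen arbitrarily; runs on CPU):

import torch
import torch.nn.functional as F

w = torch.randn(4, 512, dtype=torch.float16)        # element_size() == 2 bytes
num_pad = 256 // w.element_size()                    # 128 extra columns = 256 bytes/row

padded = F.pad(w, (0, num_pad), "constant", 0)[..., :-num_pad]
print(w.stride(), padded.stride())                   # (512, 1) vs (640, 1)
assert torch.equal(w, padded)                        # same values, different layout

# On a GPU the old `w` storage would now be dead weight in the caching
# allocator, which is why the code above follows the pad with
# current_platform.empty_cache() (previously torch.cuda.empty_cache()).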

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 1 addition & 1 deletion
@@ -313,7 +313,7 @@ def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
                 and (weight.stride(-2) * weight.element_size()) % 512 == 0):
             num_pad = 256 // weight.element_size()
             weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
-            torch.cuda.empty_cache()
+            current_platform.empty_cache()
         return weight
 
     def process_weights_after_loading(self, layer: Module) -> None:
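One detail specific to this FP8 path: with one-byte float8 weights the pad works out to 256 elements per row, rather than the 128 an FP16 tensor would get. A quick check (requires a PyTorch version that ships the float8_e4m3fn dtype):

import torch

w_fp8 = torch.zeros(2, 512, dtype=torch.float8_e4m3fn)
print(w_fp8.element_size())          # 1 byte per element
print(256 // w_fp8.element_size())   # num_pad == 256 elements (256 bytes per row)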

vllm/model_executor/model_loader/bitsandbytes_loader.py

Lines changed: 1 addition & 1 deletion
@@ -585,7 +585,7 @@ def load_weights(self, model: nn.Module,
             if self.load_8bit:
                 set_weight_attrs(
                     param, {"matmul_state": [None] * len(quant_states)})
-        torch.cuda.empty_cache()
+        current_platform.empty_cache()
 
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model, model_config.revision)
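Here set_weight_attrs attaches a matmul_state list (one slot per quant state) to the parameter for bitsandbytes' 8-bit matmul to fill in lazily, and the cache flush afterwards returns allocator blocks left over from quantized-weight loading to the device. A simplified stand-in for what that helper does (vLLM's real version lives in vllm.model_executor.utils; this sketch only mirrors the attribute-setting part):

import torch

def set_weight_attrs(weight: torch.Tensor, attrs: dict) -> None:
    # Simplified sketch: copy each attribute onto the tensor, refusing to
    # silently overwrite anything that is already set.
    for key, value in attrs.items():
        assert not hasattr(weight, key), f"attribute {key} already set"
        setattr(weight, key, value)

param = torch.nn.Parameter(torch.empty(0), requires_grad=False)
set_weight_attrs(param, {"matmul_state": [None] * 4})
print(param.matmul_state)   # [None, None, None, None]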

vllm/utils/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -2556,7 +2556,7 @@ def measure(self):
         # rather than `torch.cuda.memory_reserved()` .
         # After `torch.cuda.reset_peak_memory_stats()`,
         # `torch.cuda.memory_reserved()` will keep growing, and only shrink
-        # when we call `torch.cuda.empty_cache()` or OOM happens.
+        # when we call `current_platform.empty_cache()` or OOM happens.
         from vllm.platforms import current_platform
 
         self.torch_peak = current_platform.memory_stats().get(
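The updated comment keeps its original point: the snapshot reads the peak allocated-bytes statistic because memory_reserved() only shrinks after an explicit cache flush or an OOM, so it overstates steady-state usage. A rough CUDA-only sketch of measuring a peak this way (illustrative; not vLLM's measure() itself):

import torch

def measure_peak_allocated_bytes(fn) -> int:
    """Run fn() and report the allocator's high-water mark during the call."""
    torch.cuda.reset_peak_memory_stats()
    fn()
    # The peak statistic captures transient spikes; memory_reserved() would
    # only drop after empty_cache() or an OOM, as the comment above notes.
    return torch.cuda.memory_stats().get("allocated_bytes.all.peak", 0)

if torch.cuda.is_available():
    peak = measure_peak_allocated_bytes(
        lambda: torch.randn(1024, 1024, device="cuda") @ torch.randn(1024, 1024, device="cuda"))
    print(f"peak allocated: {peak / 1024**2:.1f} MiB")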

vllm/v1/worker/gpu_worker.py

Lines changed: 1 addition & 1 deletion
@@ -283,7 +283,7 @@ def compile_or_warm_up_model(self) -> None:
         # sampling related tensors of max possible shape to avoid memory
         # fragmentation issue.
         # NOTE: This is called after `capture_model` on purpose to prevent
-        # memory buffers from being cleared by `torch.cuda.empty_cache`.
+        # memory buffers from being cleared by `current_platform.empty_cache`.
         if get_pp_group().is_last_rank:
             max_num_reqs = min(self.scheduler_config.max_num_seqs,
                                self.scheduler_config.max_num_batched_tokens)
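The renamed comment still describes an ordering constraint: the sampler warm-up that allocates max-shape sampling tensors runs after capture_model on purpose, so the buffers it leaves behind are not cleared by the cache flush performed during capture and later allocations fragment less. A loose sketch of that pattern (the warm_up_sampler helper and the shapes are hypothetical, not the worker's actual code):

import torch

def warm_up_sampler(max_num_reqs: int, vocab_size: int) -> None:
    # Allocate sampling-related tensors at their maximum shape once, so the
    # caching allocator keeps suitably sized blocks around for real requests.
    logits = torch.zeros(max_num_reqs, vocab_size, device="cuda")
    probs = torch.softmax(logits, dim=-1)
    torch.multinomial(probs, num_samples=1)

if torch.cuda.is_available():
    # capture_model() runs first and may flush the cache via
    # current_platform.empty_cache(); warming up afterwards means the blocks
    # allocated here stay available for the sampler.
    warm_up_sampler(max_num_reqs=256, vocab_size=32000)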
