Commit 5a3b6fa

replace torch.cuda.empty_cache with current_platform.empty_cache
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
1 parent a5ec772 commit 5a3b6fa
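
This change routes allocator-cache flushes through vLLM's platform abstraction so the same call sites work on non-CUDA backends. A minimal sketch of the kind of dispatch this relies on is shown below; the `Platform` classes and the availability check are illustrative assumptions, not vLLM's actual implementation in vllm/platforms (which is where the real `current_platform` object comes from).

# Illustrative sketch only: the classes below are hypothetical stand-ins for
# vLLM's real platform layer, which exposes a module-level `current_platform`.
import torch


class Platform:
    """Backend-agnostic hooks; each device backend overrides what it needs."""

    def empty_cache(self) -> None:
        raise NotImplementedError


class CudaPlatform(Platform):
    def empty_cache(self) -> None:
        # On CUDA this is the familiar caching-allocator flush.
        torch.cuda.empty_cache()


class XpuPlatform(Platform):
    def empty_cache(self) -> None:
        # On Intel GPUs (PyTorch builds with XPU support) the equivalent
        # call lives under torch.xpu.
        torch.xpu.empty_cache()


# Call sites stay identical regardless of backend:
current_platform = CudaPlatform() if torch.cuda.is_available() else XpuPlatform()
current_platform.empty_cache()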

5 files changed: 5 additions, 5 deletions

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 1 addition & 1 deletion

@@ -297,7 +297,7 @@ def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
                 and (weight.stride(-2) * weight.element_size()) % 512 == 0):
             num_pad = 256 // weight.element_size()
             weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
-            torch.cuda.empty_cache()
+            current_platform.empty_cache()
         return weight
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
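
For context, the touched line sits at the end of a padding trick: `F.pad` materialises a new, slightly larger tensor and the trailing slice keeps a view of it, so the previous allocation only goes back to the device once the allocator cache is flushed. A standalone illustration of that pad-then-flush pattern, assuming a CUDA device and an arbitrary fp16 weight shape, follows.

# Standalone illustration of the pad-then-flush pattern in the hunk above;
# shape and dtype are arbitrary, only the pad arithmetic mirrors the diff.
import torch
import torch.nn.functional as F

weight = torch.empty(4096, 4096, dtype=torch.float16, device="cuda")
num_pad = 256 // weight.element_size()  # 128 elements for fp16
# Pad the last dim, then slice the padding back off: the data now lives in a
# fresh allocation with a wider stride, and the old blocks sit in the cache.
weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
# Release the cached blocks; the commit routes this call through
# current_platform.empty_cache() instead of hard-coding torch.cuda.
torch.cuda.empty_cache()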

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 1 addition & 1 deletion

@@ -324,7 +324,7 @@ def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
                 and (weight.stride(-2) * weight.element_size()) % 512 == 0):
             num_pad = 256 // weight.element_size()
             weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
-            torch.cuda.empty_cache()
+            current_platform.empty_cache()
         return weight
 
     def process_weights_after_loading(self, layer: Module) -> None:

vllm/model_executor/model_loader/bitsandbytes_loader.py

Lines changed: 1 addition & 1 deletion

@@ -755,7 +755,7 @@ def load_weights(self, model: nn.Module,
             **stacked_quant_state_dict
         }
         self._bind_quant_states_to_params(model, stacked_quant_state_dict)
-        torch.cuda.empty_cache()
+        current_platform.empty_cache()
 
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model, model_config.revision)

vllm/utils/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -2556,7 +2556,7 @@ def measure(self):
         # rather than `torch.cuda.memory_reserved()` .
         # After `torch.cuda.reset_peak_memory_stats()`,
         # `torch.cuda.memory_reserved()` will keep growing, and only shrink
-        # when we call `torch.cuda.empty_cache()` or OOM happens.
+        # when we call `current_platform.empty_cache()` or OOM happens.
         from vllm.platforms import current_platform
 
         self.torch_peak = current_platform.memory_stats().get(
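
The comment being edited here distinguishes two ways of reading GPU memory: peak allocated bytes track live tensors, while reserved bytes only drop after an explicit cache flush or an OOM. A small sketch of that difference, written directly against torch.cuda for clarity (vLLM goes through `current_platform.memory_stats()` instead), is below; reading the standard `allocated_bytes.all.peak` stats key is an assumption about what the snapshot uses.

# Sketch of the measurement the comment describes, using torch.cuda directly.
import torch

torch.cuda.reset_peak_memory_stats()
x = torch.empty(1024, 1024, device="cuda")
del x
# Peak allocated bytes record the high-water mark of live tensors...
peak_allocated = torch.cuda.memory_stats().get("allocated_bytes.all.peak", 0)
# ...while reserved bytes stay claimed by the caching allocator until
# empty_cache() (or an OOM) hands them back, so they overstate real usage.
still_reserved = torch.cuda.memory_reserved()
print(peak_allocated, still_reserved)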

vllm/v1/worker/gpu_worker.py

Lines changed: 1 addition & 1 deletion

@@ -286,7 +286,7 @@ def compile_or_warm_up_model(self) -> None:
         # sampling related tensors of max possible shape to avoid memory
         # fragmentation issue.
         # NOTE: This is called after `capture_model` on purpose to prevent
-        # memory buffers from being cleared by `torch.cuda.empty_cache`.
+        # memory buffers from being cleared by `current_platform.empty_cache`.
         if get_pp_group().is_last_rank:
             max_num_reqs = min(self.scheduler_config.max_num_seqs,
                                self.scheduler_config.max_num_batched_tokens)
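
The NOTE in this hunk is about ordering: graph capture ends with an allocator-cache flush, so the warm-up that pre-allocates max-shape sampling tensors has to run after it or its buffers would be released. A toy sketch of that ordering follows; `capture_model` and `warm_up_sampler` here are simplified hypothetical stand-ins for the worker's real methods.

# Toy ordering sketch; both functions are hypothetical simplifications.
import torch

def capture_model() -> None:
    # Graph capture ends by flushing the allocator cache
    # (now via current_platform.empty_cache() in vLLM).
    torch.cuda.empty_cache()

def warm_up_sampler() -> None:
    # Allocate max-shape sampling tensors once so their blocks stay cached
    # and later requests reuse them instead of fragmenting memory.
    _ = torch.empty(64, 32_000, device="cuda")

capture_model()    # flush first...
warm_up_sampler()  # ...then allocate, so the cached buffers survive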
