Commit 5a3b6fa

replace torch.cuda.empty_cache with current_platform.empty_cache
Signed-off-by: Kunshang Ji <kunshang.ji@intel.com>
1 parent a5ec772 commit 5a3b6fa
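
This change routes allocator-cache flushes through vLLM's platform abstraction so the same call sites work on non-CUDA backends. A minimal sketch of the kind of dispatch this relies on is shown below; the `Platform` classes and the availability check are illustrative assumptions, not vLLM's actual implementation in vllm/platforms (which is where the real `current_platform` object comes from).

# Illustrative sketch only: the classes below are hypothetical stand-ins for
# vLLM's real platform layer, which exposes a module-level `current_platform`.
import torch


class Platform:
    """Backend-agnostic hooks; each device backend overrides what it needs."""

    def empty_cache(self) -> None:
        raise NotImplementedError


class CudaPlatform(Platform):
    def empty_cache(self) -> None:
        # On CUDA this is the familiar caching-allocator flush.
        torch.cuda.empty_cache()


class XpuPlatform(Platform):
    def empty_cache(self) -> None:
        # On Intel GPUs (PyTorch builds with XPU support) the equivalent
        # call lives under torch.xpu.
        torch.xpu.empty_cache()


# Call sites stay identical regardless of backend:
current_platform = CudaPlatform() if torch.cuda.is_available() else XpuPlatform()
current_platform.empty_cache()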

5 files changed: 5 additions, 5 deletions

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 1 addition & 1 deletion

@@ -297,7 +297,7 @@ def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
                 and (weight.stride(-2) * weight.element_size()) % 512 == 0):
             num_pad = 256 // weight.element_size()
             weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
-            torch.cuda.empty_cache()
+            current_platform.empty_cache()
         return weight
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
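
For context, the touched line sits at the end of a padding trick: `F.pad` materialises a new, slightly larger tensor and the trailing slice keeps a view of it, so the previous allocation only goes back to the device once the allocator cache is flushed. A standalone illustration of that pad-then-flush pattern, assuming a CUDA device and an arbitrary fp16 weight shape, follows.

# Standalone illustration of the pad-then-flush pattern in the hunk above;
# shape and dtype are arbitrary, only the pad arithmetic mirrors the diff.
import torch
import torch.nn.functional as F

weight = torch.empty(4096, 4096, dtype=torch.float16, device="cuda")
num_pad = 256 // weight.element_size()  # 128 elements for fp16
# Pad the last dim, then slice the padding back off: the data now lives in a
# fresh allocation with a wider stride, and the old blocks sit in the cache.
weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
# Release the cached blocks; the commit routes this call through
# current_platform.empty_cache() instead of hard-coding torch.cuda.
torch.cuda.empty_cache()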

vllm/model_executor/layers/quantization/fp8.py

Lines changed: 1 addition & 1 deletion

@@ -324,7 +324,7 @@ def _maybe_pad_weight(self, weight: torch.Tensor) -> torch.Tensor:
                 and (weight.stride(-2) * weight.element_size()) % 512 == 0):
             num_pad = 256 // weight.element_size()
             weight = F.pad(weight, (0, num_pad), "constant", 0)[..., :-num_pad]
-            torch.cuda.empty_cache()
+            current_platform.empty_cache()
         return weight
 
     def process_weights_after_loading(self, layer: Module) -> None:

vllm/model_executor/model_loader/bitsandbytes_loader.py

Lines changed: 1 addition & 1 deletion

@@ -755,7 +755,7 @@ def load_weights(self, model: nn.Module,
             **stacked_quant_state_dict
         }
         self._bind_quant_states_to_params(model, stacked_quant_state_dict)
-        torch.cuda.empty_cache()
+        current_platform.empty_cache()
 
     def download_model(self, model_config: ModelConfig) -> None:
         self._prepare_weights(model_config.model, model_config.revision)

vllm/utils/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -2556,7 +2556,7 @@ def measure(self):
         # rather than `torch.cuda.memory_reserved()` .
         # After `torch.cuda.reset_peak_memory_stats()`,
         # `torch.cuda.memory_reserved()` will keep growing, and only shrink
-        # when we call `torch.cuda.empty_cache()` or OOM happens.
+        # when we call `current_platform.empty_cache()` or OOM happens.
         from vllm.platforms import current_platform
 
         self.torch_peak = current_platform.memory_stats().get(
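
The comment being edited here distinguishes two ways of reading GPU memory: peak allocated bytes track live tensors, while reserved bytes only drop after an explicit cache flush or an OOM. A small sketch of that difference, written directly against torch.cuda for clarity (vLLM goes through `current_platform.memory_stats()` instead), is below; reading the standard `allocated_bytes.all.peak` stats key is an assumption about what the snapshot uses.

# Sketch of the measurement the comment describes, using torch.cuda directly.
import torch

torch.cuda.reset_peak_memory_stats()
x = torch.empty(1024, 1024, device="cuda")
del x
# Peak allocated bytes record the high-water mark of live tensors...
peak_allocated = torch.cuda.memory_stats().get("allocated_bytes.all.peak", 0)
# ...while reserved bytes stay claimed by the caching allocator until
# empty_cache() (or an OOM) hands them back, so they overstate real usage.
still_reserved = torch.cuda.memory_reserved()
print(peak_allocated, still_reserved)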

vllm/v1/worker/gpu_worker.py

Lines changed: 1 addition & 1 deletion

@@ -286,7 +286,7 @@ def compile_or_warm_up_model(self) -> None:
         # sampling related tensors of max possible shape to avoid memory
         # fragmentation issue.
         # NOTE: This is called after `capture_model` on purpose to prevent
-        # memory buffers from being cleared by `torch.cuda.empty_cache`.
+        # memory buffers from being cleared by `current_platform.empty_cache`.
         if get_pp_group().is_last_rank:
             max_num_reqs = min(self.scheduler_config.max_num_seqs,
                                self.scheduler_config.max_num_batched_tokens)
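
The NOTE in this hunk is about ordering: graph capture ends with an allocator-cache flush, so the warm-up that pre-allocates max-shape sampling tensors has to run after it or its buffers would be released. A toy sketch of that ordering follows; `capture_model` and `warm_up_sampler` here are simplified hypothetical stand-ins for the worker's real methods.

# Toy ordering sketch; both functions are hypothetical simplifications.
import torch

def capture_model() -> None:
    # Graph capture ends by flushing the allocator cache
    # (now via current_platform.empty_cache() in vLLM).
    torch.cuda.empty_cache()

def warm_up_sampler() -> None:
    # Allocate max-shape sampling tensors once so their blocks stay cached
    # and later requests reuse them instead of fragmenting memory.
    _ = torch.empty(64, 32_000, device="cuda")

capture_model()    # flush first...
warm_up_sampler()  # ...then allocate, so the cached buffers survive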
