Skip to content

Fix the device error when using ray as vllm-ascend backend #884

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion tests/ops/test_rotary_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,12 @@
import torch
import torch.nn as nn

import vllm_ascend.platform # noqa: F401
from vllm_ascend.utils import try_register_lib

try_register_lib(
"vllm_ascend.vllm_ascend_C",
exc_info=
"Warning: Failed to register custom ops, all custom ops will be disabled.")

# Only Neox style true scenario is supported for now
IS_NEOX_STYLE = [True]
Expand Down
9 changes: 7 additions & 2 deletions vllm_ascend/ops/rotary_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,16 @@
from vllm.model_executor.layers.rotary_embedding import (
DeepseekScalingRotaryEmbedding, RotaryEmbedding)

from vllm_ascend.platform import CUSTOM_OP_ENABLED
from vllm_ascend.utils import try_register_lib


def custom_rotary_embedding_enabled(query, neox_style, head_size):
return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and CUSTOM_OP_ENABLED
try_register_lib(
"vllm_ascend.vllm_ascend_C",
exc_info=
"Warning: Failed to register custom ops, all custom ops will be disabled."
)
return query.dtype == torch.float16 and neox_style and head_size % 32 == 0


def rope_forward_oot(
Expand Down
16 changes: 0 additions & 16 deletions vllm_ascend/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
# This file is a part of the vllm-ascend project.
#

import logging
import os
from typing import TYPE_CHECKING, Optional, Tuple

import torch
Expand All @@ -27,18 +25,6 @@

from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes

CUSTOM_OP_ENABLED = False
try:
# register custom ops into torch_library here
import vllm_ascend.vllm_ascend_C # type: ignore # noqa: F401

except ImportError:
logging.warning(
"Warning: Failed to register custom ops, all custom ops will be disabled"
)
else:
CUSTOM_OP_ENABLED = True

if TYPE_CHECKING:
from vllm.config import ModelConfig, VllmConfig
from vllm.utils import FlexibleArgumentParser
Expand All @@ -47,8 +33,6 @@
VllmConfig = None
FlexibleArgumentParser = None

os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1"


class NPUPlatform(Platform):

Expand Down
4 changes: 3 additions & 1 deletion vllm_ascend/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
ASCEND_QUATIZATION_METHOD = "ascend"


def try_register_lib(lib_name: str, lib_info: str = ""):
def try_register_lib(lib_name: str, lib_info: str = "", exc_info: str = ""):
import importlib
import importlib.util
try:
Expand All @@ -51,6 +51,8 @@ def try_register_lib(lib_name: str, lib_info: str = ""):
if lib_info:
logger.info(lib_info)
except Exception:
if exc_info:
logger.info(exc_info)
pass


Expand Down
Loading