diff --git a/examples/offline_multi_step_custom_ops.py b/examples/offline_multi_step_custom_ops.py
index 82a1bf575..59c7fafcc 100644
--- a/examples/offline_multi_step_custom_ops.py
+++ b/examples/offline_multi_step_custom_ops.py
@@ -19,9 +19,6 @@
 from vllm import LLM, SamplingParams
 
-import vllm_ascend.platform as pf
-
-pf.CUSTOM_OP_ENABLED = True  # set True for custom Ops of Multi-Step.
 
 prompts = [
     "Hello, my name is",
     "The president of the United States is",
diff --git a/tests/e2e/singlecard/ops/test_rotary_embedding.py b/tests/e2e/singlecard/ops/test_rotary_embedding.py
index 2d5ec18da..a3504a88b 100644
--- a/tests/e2e/singlecard/ops/test_rotary_embedding.py
+++ b/tests/e2e/singlecard/ops/test_rotary_embedding.py
@@ -10,7 +10,9 @@
 import torch
 import torch.nn as nn
 
-import vllm_ascend.platform  # noqa: F401
+from vllm_ascend.utils import enable_custom_op
+
+enable_custom_op()
 
 # Only Neox style true scenario is supported for now
 IS_NEOX_STYLE = [True]
diff --git a/vllm_ascend/attention/attention.py b/vllm_ascend/attention/attention.py
index a567cc530..9c112ed62 100644
--- a/vllm_ascend/attention/attention.py
+++ b/vllm_ascend/attention/attention.py
@@ -36,7 +36,7 @@
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ops.cache import concat_and_cache_mla
-from vllm_ascend.platform import CUSTOM_OP_ENABLED
+from vllm_ascend.utils import enable_custom_op
 from vllm_ascend.worker.model_runner import (
     ModelInputForNPUBuilder, ModelInputForNPUWithSamplingMetadata)
@@ -462,7 +462,7 @@ def advance_step(self,
         for i in range(num_queries):
             self.seq_lens[i] += 1
         self.max_decode_seq_len = max(self.seq_lens)
-        if CUSTOM_OP_ENABLED:
+        if enable_custom_op():
             #advance a step on NPU for existing inputs for a multi-step runner if custom ops is enabled
             torch.ops._C.advance_step_flashattn_ascendc(
                 num_seqs=num_seqs,
diff --git a/vllm_ascend/ops/rotary_embedding.py b/vllm_ascend/ops/rotary_embedding.py
index 0c2a00afb..9f8ae784c 100644
--- a/vllm_ascend/ops/rotary_embedding.py
+++ b/vllm_ascend/ops/rotary_embedding.py
@@ -22,11 +22,12 @@
 from vllm.model_executor.layers.rotary_embedding import (
     DeepseekScalingRotaryEmbedding, RotaryEmbedding)
 
-from vllm_ascend.platform import CUSTOM_OP_ENABLED
+from vllm_ascend.utils import enable_custom_op
 
 
 def custom_rotary_embedding_enabled(query, neox_style, head_size):
-    return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and CUSTOM_OP_ENABLED
+    return query.dtype == torch.float16 and neox_style and head_size % 32 == 0 and enable_custom_op(
+    )
 
 
 def rope_forward_oot(
diff --git a/vllm_ascend/platform.py b/vllm_ascend/platform.py
index 9194ae9a1..5a45e9eed 100644
--- a/vllm_ascend/platform.py
+++ b/vllm_ascend/platform.py
@@ -16,7 +16,6 @@
 #
 
 import gc
-import logging
 import os
 from datetime import timedelta
 from typing import TYPE_CHECKING, Optional, Tuple
@@ -32,16 +31,6 @@
 from vllm_ascend.ascend_config import check_ascend_config, init_ascend_config
 from vllm_ascend.utils import ASCEND_QUATIZATION_METHOD, update_aclgraph_sizes
 
-CUSTOM_OP_ENABLED = False
-try:
-    # register custom ops into torch_library here
-    import vllm_ascend.vllm_ascend_C  # type: ignore  # noqa: F401
-    CUSTOM_OP_ENABLED = True
-except ImportError as e:
-    logging.warning(
-        "Failed to import 'vllm_ascend.vllm_ascend_C': %s. All custom ops will be disabled. ",
-        e)
-
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VllmConfig
     from vllm.utils import FlexibleArgumentParser
@@ -50,7 +39,6 @@
     VllmConfig = None
     FlexibleArgumentParser = None
 
-os.environ["RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"] = "1"
 os.environ["ACL_OP_INIT_MODE"] = ascend_envs.VLLM_ASCEND_ACL_OP_INIT_MODE
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index e29290e73..d93205352 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -54,6 +54,8 @@
 
 ASCEND_QUATIZATION_METHOD = "ascend"
 
+CUSTOM_OP_ENABLED = None
+
 
 def try_register_lib(lib_name: str, lib_info: str = ""):
     import importlib
@@ -68,6 +70,31 @@ def try_register_lib(lib_name: str, lib_info: str = ""):
         pass
 
 
+def enable_custom_op():
+    """
+    Enable lazy init for vllm_ascend_C to avoid early initialization of CANN's RTS component.
+    Ensure that ASCEND_RT_VISIBLE_DEVICES can be dynamically modified before torch.npu.set_device().
+    """
+    global CUSTOM_OP_ENABLED
+
+    if CUSTOM_OP_ENABLED is not None:
+        return CUSTOM_OP_ENABLED
+
+    else:
+        try:
+            # register custom ops into torch_library here
+            import vllm_ascend.vllm_ascend_C  # type: ignore  # noqa: F401
+            CUSTOM_OP_ENABLED = True
+
+        except ImportError:
+            CUSTOM_OP_ENABLED = False
+            logger.warning(
+                "Warning: Failed to register custom ops, all custom ops will be disabled"
+            )
+
+    return CUSTOM_OP_ENABLED
+
+
 def find_hccl_library() -> str:
     """
     We either use the library file specified by the `HCCL_SO_PATH`
diff --git a/vllm_ascend/worker/worker_v1.py b/vllm_ascend/worker/worker_v1.py
index ba29dcfc5..ebdf01e15 100644
--- a/vllm_ascend/worker/worker_v1.py
+++ b/vllm_ascend/worker/worker_v1.py
@@ -117,6 +117,11 @@ def wake_up(self, tags: Optional[list[str]] = None) -> None:
         allocator = CaMemAllocator.get_instance()
         allocator.wake_up(tags=tags)
 
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        self.cache_config.num_gpu_blocks = num_gpu_blocks
+        self.cache_config.num_cpu_blocks = num_cpu_blocks
+
     def init_device(self):
         if self.device_config.device.type == "npu":
             self.device = torch.device(f"npu:{self.local_rank_across_dp}")