Commit d785e78

Authored by Yizhou Liu

[V1] Make V1 engine backward compatible (#637)
### What this PR does / why we need it?

Enforce eager mode in the V1 engine ahead of the upcoming CANN and torch_npu releases.

### Does this PR introduce _any_ user-facing change?

After this change, users no longer need to manually set enforce_eager=True.

### How was this patch tested?

Tested with regular offline inference examples.

Signed-off-by: Yizhou Liu <liu_yizhou@outlook.com>
1 parent bd70ce8 commit d785e78
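
As a quick illustration of the user-facing change described above, the snippet below is a minimal offline-inference sketch, not part of this commit: the model name and the VLLM_USE_V1 toggle are illustrative assumptions, and the point is simply that enforce_eager no longer has to be passed when running the V1 engine on Ascend NPU.

```python
# Minimal sketch, assuming vllm-ascend is installed on an NPU host and that
# VLLM_USE_V1=1 selects the V1 engine; the model name is an arbitrary example.
import os

os.environ["VLLM_USE_V1"] = "1"

from vllm import LLM, SamplingParams

prompts = ["Hello, my name is"]
sampling_params = SamplingParams(temperature=0.0, max_tokens=32)

# Note: no enforce_eager=True here -- the Ascend platform plugin now forces
# eager mode itself until CANN/torch_npu add NPU graph compilation support.
llm = LLM(model="Qwen/Qwen2.5-0.5B-Instruct")

for output in llm.generate(prompts, sampling_params):
    print(output.outputs[0].text)
```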

4 files changed: +44 −47 lines changed

tests/multicard/test_offline_inference_distributed.py

Lines changed: 0 additions & 1 deletion
@@ -47,7 +47,6 @@ def test_models_distributed(model: str,
             dtype=dtype,
             tensor_parallel_size=4,
             distributed_executor_backend=distributed_executor_backend,
-            enforce_eager=True,
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
 

tests/ops/test_fused_moe.py

Lines changed: 23 additions & 29 deletions
@@ -22,7 +22,6 @@
 
 import pytest
 import torch
-from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.activation import SiluAndMul
 
 from vllm_ascend.ops.fused_moe import fused_experts
@@ -68,36 +67,31 @@ def test_fused_experts(
     dtype: torch.dtype,
     device: str,
 ):
-    vllm_config = VllmConfig()
-    with set_current_vllm_config(vllm_config):
-        a = torch.randn((m, k), device=device, dtype=dtype) / 10
-        w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 10
-        w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 10
+    a = torch.randn((m, k), device=device, dtype=dtype) / 10
+    w1 = torch.randn((e, 2 * n, k), device=device, dtype=dtype) / 10
+    w2 = torch.randn((e, k, n), device=device, dtype=dtype) / 10
 
-        score = torch.randn((m, e), device=device, dtype=dtype)
+    score = torch.randn((m, e), device=device, dtype=dtype)
 
-        if ep_size > 1:
-            local_e = e // ep_size
-            e_ids = torch.randint(0,
-                                  e, (local_e, ),
-                                  device=device,
-                                  dtype=torch.int32)
-            e_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
-            e_map[e_ids] = torch.arange(local_e,
-                                        device=device,
-                                        dtype=torch.int32)
-            w1 = w1[e_ids]
-            w2 = w2[e_ids]
-        else:
-            e_map = None
+    if ep_size > 1:
+        local_e = e // ep_size
+        e_ids = torch.randint(0,
+                              e, (local_e, ),
+                              device=device,
+                              dtype=torch.int32)
+        e_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
+        e_map[e_ids] = torch.arange(local_e, device=device, dtype=torch.int32)
+        w1 = w1[e_ids]
+        w2 = w2[e_ids]
+    else:
+        e_map = None
 
-        score = torch.softmax(score, dim=-1, dtype=dtype)
-        topk_weights, topk_ids = torch.topk(score, topk)
-        topk_ids = topk_ids.to(torch.int32)
+    score = torch.softmax(score, dim=-1, dtype=dtype)
+    topk_weights, topk_ids = torch.topk(score, topk)
+    topk_ids = topk_ids.to(torch.int32)
 
-        output = fused_experts(a, w1, w2, topk_weights, topk_ids, topk, e_map)
-        torch_output = torch_moe(a, w1, w2, topk_weights, topk_ids, topk,
-                                 e_map)
-        # TODO: The native params are: atol=2e-2, rtol=0, maybe related to the nan problem
-        torch.testing.assert_close(output, torch_output, atol=4e-2, rtol=1)
+    output = fused_experts(a, w1, w2, topk_weights, topk_ids, topk, e_map)
+    torch_output = torch_moe(a, w1, w2, topk_weights, topk_ids, topk, e_map)
+    # TODO: The native params are: atol=2e-2, rtol=0, maybe related to the nan problem
+    torch.testing.assert_close(output, torch_output, atol=4e-2, rtol=1)
 
     torch.npu.empty_cache()

tests/singlecard/test_offline_inference.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
     with VllmRunner(model,
                     max_model_len=8192,
                     dtype=dtype,
-                    enforce_eager=True,
+                    enforce_eager=False,
                     gpu_memory_utilization=0.7) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
 

vllm_ascend/platform.py

Lines changed: 20 additions & 16 deletions
@@ -115,29 +115,33 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         from vllm.config import CompilationLevel  # noqa: E402
         compilation_config = vllm_config.compilation_config
 
-        enforce_eager_flag = False
-        # Check whether the eager mode is configured
-        try:
-            enforce_eager_flag = vllm_config.model_config.enforce_eager
-        except Exception:
-            logger.warning(
-                "There is currently no enforce_eager mode configured, the default value of enforce_eager=False is used"
-            )
-
-        if enforce_eager_flag or compilation_config.level == CompilationLevel.NO_COMPILATION:
-            logger.warning(
-                "Compilation level PIECEWISE is not enable on NPU now, current compilation level to NO_COMPILATION"
-            )
+        if vllm_config.model_config is None:
+            logger.warning("Model config is missing. This may indicate "
+                           "that we are running a test case")
+            enforce_eager = False
+        else:
+            enforce_eager = getattr(vllm_config.model_config, "enforce_eager",
+                                    False)
+
+        # TODO(Yizhou): Override the value of enforce_eager to True before
+        # the CANN and torch_npu support NPU compilation.
+        enforce_eager = True
+        logger.warning(
+            "NPU compilation support pending. Will be available in future CANN and "
+            "torch_npu releases. Using default: enforce_eager=True")
+
+        if enforce_eager or compilation_config.level == CompilationLevel.NO_COMPILATION:
+            logger.info("Compilation disabled, using eager mode by default")
             compilation_config.level = CompilationLevel.NO_COMPILATION
         elif compilation_config.level != CompilationLevel.PIECEWISE:
             logger.warning(
-                "Compilation level %s is not enable on NPU now, forcing compilation level to NO_COMPILATION",
+                "NPU does not support %s compilation level. Setting level to NO_COMPILATION",
                 compilation_config.level)
             compilation_config.level = CompilationLevel.NO_COMPILATION
         else:
             logger.info(
-                "Compilation level PIECEWISE is enable on NPU now, But use_inductor is no support, only use npu_graph now"
-            )
+                "PIECEWISE compilation enabled on NPU. use_inductor not supported - "
+                "using only ACL Graph mode")
             compilation_config.use_inductor = False
             compilation_config.splitting_ops.extend(
                 ["vllm.unified_ascend_attention_with_output"])
