
Commit 7bdada5

[Misc] Remove VLLM_USE_V1 usage in code (#1764)

We plan to remove the V0 code path starting from this version; the first step is to delete all VLLM_USE_V1 usage from the code.

Related: #1620
- vLLM version: v0.9.2
- vLLM main: vllm-project/vllm@61e2082

Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
1 parent 494b0f4 commit 7bdada5
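For context, every change below removes the same runtime gate. A minimal before/after sketch of that pattern (illustrative only; `run_v1_path` is a placeholder, not a real vllm-ascend function):

import vllm.envs as envs


def run_v1_path(cfg):
    ...  # stand-in for whatever the V1-only code path of a given module does


# Before this commit: behaviour was branched on VLLM_USE_V1 at runtime.
def do_work_before(cfg):
    if not envs.VLLM_USE_V1:  # V0 engine path
        raise NotImplementedError("Only supported for V1 Engine.")
    run_v1_path(cfg)


# After this commit: V1 is the only engine, so the guard and the
# `import vllm.envs as envs` it required are deleted.
def do_work_after(cfg):
    run_v1_path(cfg)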

6 files changed, +101 -218 lines changed

tests/ut/test_ascend_config.py

Lines changed: 42 additions & 65 deletions
@@ -193,71 +193,48 @@ def test_check_ascend_config_pass(self):
     @_clean_up_ascend_config
     def test_check_ascend_config_wrong_case(self):
         test_vllm_config = VllmConfig()
-        # For V0 engine
-        with mock.patch.dict(os.environ, {"VLLM_USE_V1": "0"}):
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, False)
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "ascend_scheduler_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, True)
-        # For V1 engine
-        with mock.patch.dict(os.environ, {"VLLM_USE_V1": "1"}):
-            # torchair + eager mode
-            with self.assertRaises(RuntimeError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                init_ascend_config(test_vllm_config)
-                enforce_eager = True
-                check_ascend_config(test_vllm_config, enforce_eager)
-            # torchair + non deepseek model
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": True,
-                    },
-                    "refresh": True
-                }
-                model_path = os.path.join(os.path.dirname(__file__),
-                                          "fake_weight")
-                fake_model_config = ModelConfig(model=model_path)
-                fake_model_config.hf_config = PretrainedConfig()
-                fake_model_config.hf_config.model_type = "llama"
-                test_vllm_config.model_config = fake_model_config
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, False)
-            # aclgraph + deepseek model
-            with self.assertRaises(NotImplementedError):
-                test_vllm_config.additional_config = {
-                    "torchair_graph_config": {
-                        "enabled": False,
-                    },
-                    "refresh": True
-                }
-                model_path = os.path.join(os.path.dirname(__file__),
-                                          "fake_weight")
-                fake_model_config = ModelConfig(model=model_path)
-                fake_model_config.hf_config = PretrainedConfig()
-                fake_model_config.hf_config.model_type = "deepseek"
-                test_vllm_config.model_config = fake_model_config
-                init_ascend_config(test_vllm_config)
-                check_ascend_config(test_vllm_config, False)
+
+        # torchair + eager mode
+        with self.assertRaises(RuntimeError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": True,
+                },
+                "refresh": True
+            }
+            init_ascend_config(test_vllm_config)
+            enforce_eager = True
+            check_ascend_config(test_vllm_config, enforce_eager)
+        # torchair + non deepseek model
+        with self.assertRaises(NotImplementedError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": True,
+                },
+                "refresh": True
+            }
+            model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
+            fake_model_config = ModelConfig(model=model_path)
+            fake_model_config.hf_config = PretrainedConfig()
+            fake_model_config.hf_config.model_type = "llama"
+            test_vllm_config.model_config = fake_model_config
+            init_ascend_config(test_vllm_config)
+            check_ascend_config(test_vllm_config, False)
+        # aclgraph + deepseek model
+        with self.assertRaises(NotImplementedError):
+            test_vllm_config.additional_config = {
+                "torchair_graph_config": {
+                    "enabled": False,
+                },
+                "refresh": True
+            }
+            model_path = os.path.join(os.path.dirname(__file__), "fake_weight")
+            fake_model_config = ModelConfig(model=model_path)
+            fake_model_config.hf_config = PretrainedConfig()
+            fake_model_config.hf_config.model_type = "deepseek"
+            test_vllm_config.model_config = fake_model_config
+            init_ascend_config(test_vllm_config)
+            check_ascend_config(test_vllm_config, False)
 
     def test_check_torchair_supported(self):
         test_cases = [('deepseek_v3', True), ('PanguProMoE', True),
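The surviving scenarios mirror how users pass these options in practice. A minimal sketch, assuming the `additional_config` plumbing documented for vllm-ascend and an illustrative model name:

from vllm import LLM

# Torchair graph mode for a DeepSeek-family model. enforce_eager must stay
# False here, otherwise check_ascend_config raises the RuntimeError asserted
# in the "torchair + eager mode" case above.
llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",
    enforce_eager=False,
    additional_config={
        "torchair_graph_config": {
            "enabled": True,
        },
    },
)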

tests/ut/test_platform.py

Lines changed: 0 additions & 63 deletions
@@ -389,69 +389,6 @@ def test_check_and_update_config_v1_worker_class_selection(
             "vllm_ascend.worker.worker_v1.NPUWorker",
         )
 
-    @patch("vllm_ascend.ascend_config.check_ascend_config")
-    @patch("vllm_ascend.ascend_config.init_ascend_config")
-    @patch("vllm.envs.VLLM_USE_V1", False)
-    def test_check_and_update_config_speculative_worker_config(
-            self, mock_init_ascend, mock_check_ascend):
-        mock_init_ascend.return_value = self.mock_ascend_config
-        self.mock_vllm_config.speculative_config = MagicMock()
-        self.mock_vllm_config.speculative_config.disable_logprobs = True
-        self.mock_vllm_config.parallel_config.worker_cls = "auto"
-
-        with patch.dict("os.environ", {}):
-            from vllm_ascend import platform
-
-            importlib.reload(platform)
-            self.platform.check_and_update_config(self.mock_vllm_config)
-            import os
-
-            self.assertEqual(os.environ.get("ACL_OP_INIT_MODE"), "1")
-        self.assertEqual(
-            self.mock_vllm_config.parallel_config.worker_cls,
-            "vllm.spec_decode.spec_decode_worker.create_spec_worker",
-        )
-        self.assertEqual(
-            self.mock_vllm_config.parallel_config.sd_worker_cls,
-            "vllm_ascend.worker.worker.NPUWorker",
-        )
-
-    @patch("vllm_ascend.ascend_config.check_ascend_config")
-    @patch("vllm_ascend.ascend_config.init_ascend_config")
-    @patch("vllm.envs.VLLM_USE_V1", False)
-    def test_check_and_update_config_multi_step_worker_config(
-            self, mock_init_ascend, mock_check_ascend):
-        mock_init_ascend.return_value = self.mock_ascend_config
-        self.mock_vllm_config.scheduler_config.is_multi_step = True
-        self.mock_vllm_config.parallel_config.worker_cls = "auto"
-
-        from vllm_ascend import platform
-
-        importlib.reload(platform)
-        self.platform.check_and_update_config(self.mock_vllm_config)
-        self.assertEqual(
-            self.mock_vllm_config.parallel_config.worker_cls,
-            "vllm_ascend.worker.multi_step_worker.MultiStepWorker",
-        )
-
-    @patch("vllm_ascend.ascend_config.check_ascend_config")
-    @patch("vllm_ascend.ascend_config.init_ascend_config")
-    @patch("vllm.envs.VLLM_USE_V1", False)
-    def test_check_and_update_config_default_worker_config(
-            self, mock_init_ascend, mock_check_ascend):
-        mock_init_ascend.return_value = self.mock_ascend_config
-        self.mock_vllm_config.parallel_config.worker_cls = "auto"
-        self.mock_vllm_config.scheduler_config.is_multi_step = False
-
-        from vllm_ascend import platform
-
-        importlib.reload(platform)
-        self.platform.check_and_update_config(self.mock_vllm_config)
-        self.assertEqual(
-            self.mock_vllm_config.parallel_config.worker_cls,
-            "vllm_ascend.worker.worker.NPUWorker",
-        )
-
     @patch("vllm_ascend.ascend_config.check_ascend_config")
     @patch("vllm_ascend.ascend_config.init_ascend_config")
     @patch("vllm_ascend.utils.is_310p", return_value=True)

vllm_ascend/ascend_config.py

Lines changed: 30 additions & 41 deletions
@@ -15,7 +15,6 @@
 # limitations under the License.
 from typing import Optional
 
-import vllm.envs as envs
 from vllm.logger import logger
 
 TORCHAIR_MODEL_LIST = ["deepseek", "pangu"]
@@ -126,46 +125,36 @@ def get_ascend_config():
 def check_ascend_config(vllm_config, enforce_eager):
     ascend_config = get_ascend_config()
 
-    # for v0 engine
-    if not envs.VLLM_USE_V1:
+    # for eager mode
+    if enforce_eager:
+        # torchair_graph cannot be enabled with eager mode.
         if ascend_config.torchair_graph_config.enabled:
-            raise NotImplementedError(
-                "Torchair graph mode is only supported for V1 Engine.")
-        if ascend_config.ascend_scheduler_config.enabled:
-            raise NotImplementedError(
-                "Ascend scheduler is only supported for V1 Engine.")
-    # for v1 engine
+            raise RuntimeError(
+                "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
+            )
+    # for graph mode
     else:
-        # for eager mode
-        if enforce_eager:
-            # torchair_graph cannot be enabled with eager mode.
-            if ascend_config.torchair_graph_config.enabled:
-                raise RuntimeError(
-                    "Can't enable graph mode and eager mode at the same time. Please set `enforce_eager=False` if you attempt to enable NPU graph mode."
-                )
-        # for graph mode
+        # torchair_graph case
+        if ascend_config.torchair_graph_config.enabled:
+            # torchair_graph is supported for deepseek/pangu model only.
+            if vllm_config.model_config:
+                model_type = vllm_config.model_config.hf_config.model_type
+                if not _check_torchair_supported(model_type):
+                    raise NotImplementedError(
+                        "Torchair graph mode only works with following model types:"
+                        f"{TORCHAIR_MODEL_LIST}.")
+        # aclgraph case
         else:
-            # torchair_graph case
-            if ascend_config.torchair_graph_config.enabled:
-                # torchair_graph is supported for deepseek/pangu model only.
-                if vllm_config.model_config:
-                    model_type = vllm_config.model_config.hf_config.model_type
-                    if not _check_torchair_supported(model_type):
-                        raise NotImplementedError(
-                            "Torchair graph mode only works with following model types:"
-                            f"{TORCHAIR_MODEL_LIST}.")
-            # aclgraph case
-            else:
-                # aclgraph doesn't work with deepseek model and only qwen model is well tested.
-                if vllm_config.model_config:
-                    model_type = vllm_config.model_config.hf_config.model_type
-                    if "deepseek" in model_type:
-                        raise NotImplementedError(
-                            "ACL Graph does not support deepseek. Please "
-                            "try torchair graph mode to serve deepseek models on vllm-ascend."
-                            " Or set `enforce_eager=True` to use eager mode.")
-                    if "qwen" not in model_type:
-                        logger.warning(
-                            "ACL Graph is currently experimental. Please "
-                            "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
-                            " if you encourage any Error")
+            # aclgraph doesn't work with deepseek model and only qwen model is well tested.
+            if vllm_config.model_config:
+                model_type = vllm_config.model_config.hf_config.model_type
+                if "deepseek" in model_type:
+                    raise NotImplementedError(
+                        "ACL Graph does not support deepseek. Please "
+                        "try torchair graph mode to serve deepseek models on vllm-ascend."
+                        " Or set `enforce_eager=True` to use eager mode.")
+                if "qwen" not in model_type:
+                    logger.warning(
+                        "ACL Graph is currently experimental. Please "
+                        "raise an issue on https://github.com/vllm-project/vllm-ascend/issues"
+                        " if you encourage any Error")

vllm_ascend/models/deepseek_dbo.py

Lines changed: 7 additions & 11 deletions
@@ -30,7 +30,6 @@
 import torch
 import torch.distributed as dist
 import torch_npu  # noqa: F401
-import vllm.envs as envs
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata
@@ -397,20 +396,17 @@ def forward(
             hidden_states_or_q_c = hidden_states
         if self.torchair_graph_enabled:
             forward_kwargs = {}
-            if envs.VLLM_USE_V1:
-                output_shape = hidden_states.shape
-                output = torch.empty(output_shape,
-                                     dtype=hidden_states_or_q_c.dtype,
-                                     device=hidden_states_or_q_c.device)
-                forward_kwargs['output'] = output
-
+            output_shape = hidden_states.shape
+            output = torch.empty(output_shape,
+                                 dtype=hidden_states_or_q_c.dtype,
+                                 device=hidden_states_or_q_c.device)
+            forward_kwargs['output'] = output
             output = self.mla_attn.impl.forward(self.mla_attn,
                                                 hidden_states_or_q_c,
                                                 hidden_states, None, kv_cache,
                                                 attn_metadata,
                                                 **forward_kwargs)
-            if envs.VLLM_USE_V1:
-                output = output.view(-1, output_shape[-1])
+            output = output.view(-1, output_shape[-1])
             return output
         else:
             kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
@@ -885,7 +881,7 @@ def forward(
     def can_run_ms(self):
         attn_metadata = get_forward_context().attn_metadata
         # support mla attention and V1 engine at present
-        if not self.use_mla or not envs.VLLM_USE_V1:
+        if not self.use_mla:
             return False
         # enable prefill overlap
         if attn_metadata is None or attn_metadata.num_prefills == 0:
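The forward change above (mirrored in deepseek_v2.py below) makes the V1 convention unconditional: the caller pre-allocates the attention output buffer, hands it to the MLA implementation through `forward_kwargs`, and flattens it to `(num_tokens, hidden_size)` afterwards. A standalone sketch of just that call pattern; `mla_attn`, `kv_cache`, and `attn_metadata` stand in for the objects from the surrounding module:

import torch


def run_mla_v1(mla_attn, hidden_states, hidden_states_or_q_c, kv_cache, attn_metadata):
    # Pre-allocate the buffer the V1 MLA impl writes into.
    output_shape = hidden_states.shape
    output = torch.empty(output_shape,
                         dtype=hidden_states_or_q_c.dtype,
                         device=hidden_states_or_q_c.device)
    output = mla_attn.impl.forward(mla_attn, hidden_states_or_q_c,
                                   hidden_states, None, kv_cache,
                                   attn_metadata, output=output)
    # Flatten to (num_tokens, hidden_size) before handing back to the model.
    return output.view(-1, output_shape[-1])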

vllm_ascend/models/deepseek_v2.py

Lines changed: 7 additions & 11 deletions
@@ -29,7 +29,6 @@
 
 import torch
 import torch_npu
-import vllm.envs as envs
 from torch import nn
 from transformers import PretrainedConfig
 from vllm.attention import Attention, AttentionMetadata
@@ -579,20 +578,17 @@ def forward(
         else:
             hidden_states_or_q_c = hidden_states
         if self.torchair_graph_enabled:
-            if envs.VLLM_USE_V1:
-                output_shape = hidden_states.shape
-                output = torch.empty(output_shape,
-                                     dtype=hidden_states_or_q_c.dtype,
-                                     device=hidden_states_or_q_c.device)
-                forward_kwargs['output'] = output
-
+            output_shape = hidden_states.shape
+            output = torch.empty(output_shape,
+                                 dtype=hidden_states_or_q_c.dtype,
+                                 device=hidden_states_or_q_c.device)
+            forward_kwargs['output'] = output
             output = self.mla_attn.impl.forward(self.mla_attn,
                                                 hidden_states_or_q_c,
                                                 hidden_states, None, kv_cache,
                                                 attn_metadata,
                                                 **forward_kwargs)
-            if envs.VLLM_USE_V1:
-                output = output.view(-1, output_shape[-1])
+            output = output.view(-1, output_shape[-1])
             return output
         else:
             kv_c, k_pe = self.kv_a_proj_with_mqa(hidden_states)[0].split(
@@ -660,7 +656,7 @@ def __init__(
                 prefix=f"{prefix}.mlp",
             )
             self.mla_moe_communication = ascend_config.torchair_graph_config.enable_multistream_moe \
-                and model_config.use_mla and envs.VLLM_USE_V1 and self.tp_size > 1
+                and model_config.use_mla and self.tp_size > 1
         else:
             self.mlp = CustomDeepseekV2MLP(
                 hidden_size=config.hidden_size,
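With the envs check dropped, multistream MoE for MLA now only requires torchair graph mode plus tensor parallelism. A hedged configuration sketch; the model name is illustrative and the nesting of `enable_multistream_moe` under `torchair_graph_config` in `additional_config` is an assumption based on the ascend_config fields referenced above:

from vllm import LLM

llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",
    tensor_parallel_size=2,  # tp_size > 1 is required for mla_moe_communication
    additional_config={
        "torchair_graph_config": {
            "enabled": True,
            "enable_multistream_moe": True,
        },
    },
)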
