
Commit fd7b81e

Merge branch 'main' of https://github.com/raindaywhu/vllm-ascend into main

* 'main' of https://github.com/raindaywhu/vllm-ascend:
  [aclgraph] implement NPUPiecewiseBackend to enable aclgraph (vllm-project#836)
  [Bugfix][V1] Fix deepseek with v1 (vllm-project#958)
  [Perf] Refactor tensor disposal logic to reduce memory usage (vllm-project#966)
2 parents b8b6175 + 55c8bb5 commit fd7b81e


14 files changed: +413 -64 lines


tests/compile/test_aclgraph.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Compare the outputs of vLLM with and without aclgraph.

Run `pytest tests/compile/test_aclgraph.py`.
"""

import os

import pytest
import torch
from vllm import LLM, SamplingParams

from tests.conftest import VllmRunner
from tests.model_utils import check_outputs_equal
from vllm_ascend.utils import vllm_version_is

MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]


@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
                    reason="aclgraph only support on v1")
@pytest.mark.skipif(
    (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
    reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_models(
    model: str,
    max_tokens: int,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    with monkeypatch.context() as m:
        prompts = [
            "Hello, my name is", "The president of the United States is",
            "The capital of France is", "The future of AI is"
        ]

        # aclgraph only support on v1
        m.setenv("VLLM_USE_V1", "1")

        sampling_params = SamplingParams(max_tokens=max_tokens,
                                         temperature=0.0)
        # TODO: change to use vllmrunner when the registry of custom op is solved
        # while running pytest
        vllm_model = LLM(model)
        vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params)
        del vllm_model
        torch.npu.empty_cache()

        vllm_model = LLM(model, enforce_eager=True)
        vllm_eager_outputs = vllm_model.generate(prompts, sampling_params)
        del vllm_model
        torch.npu.empty_cache()

        vllm_aclgraph_outputs_list = []
        for output in vllm_aclgraph_outputs:
            vllm_aclgraph_outputs_list.append(
                (output.outputs[0].index, output.outputs[0].text))

        vllm_eager_outputs_list = []
        for output in vllm_eager_outputs:
            vllm_eager_outputs_list.append(
                (output.outputs[0].index, output.outputs[0].text))

        check_outputs_equal(
            outputs_0_lst=vllm_eager_outputs_list,
            outputs_1_lst=vllm_aclgraph_outputs_list,
            name_0="vllm_eager_outputs",
            name_1="vllm_aclgraph_outputs",
        )


@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
                    reason="aclgraph only support on v1")
@pytest.mark.skipif(
    (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
    reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
def test_deepseek_raises_error(monkeypatch: pytest.MonkeyPatch) -> None:
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_MODELSCOPE", "True")
        m.setenv("VLLM_USE_V1", "1")
        with pytest.raises(NotImplementedError) as excinfo:
            VllmRunner("deepseek-ai/DeepSeek-V2-Lite-Chat",
                       max_model_len=1024,
                       enforce_eager=False)
        assert "ACL Graph does not support deepseek" in str(excinfo.value)

tests/conftest.py

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ def __init__(
         block_size: int = 16,
         enable_chunked_prefill: bool = False,
         swap_space: int = 4,
-        enforce_eager: Optional[bool] = False,
+        enforce_eager: Optional[bool] = True,
         **kwargs,
     ) -> None:
         self.model = LLM(
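
This default flip makes graph capture opt-in for VllmRunner-based tests, which is why several test files below add enforce_eager=True explicitly. A minimal sketch of the opt-in, reusing only names that appear in these diffs; not part of this commit:

from tests.conftest import VllmRunner

# Explicitly request graph capture now that eager execution is the default.
with VllmRunner("Qwen/Qwen2.5-0.5B-Instruct",
                max_model_len=8192,
                enforce_eager=False,
                gpu_memory_utilization=0.7) as vllm_model:
    vllm_model.generate_greedy(["Hello, my name is"], 32)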

tests/long_term/spec_decode/e2e/test_v1_spec_decode.py

Lines changed: 3 additions & 1 deletion
@@ -72,7 +72,7 @@ def test_ngram_correctness(
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")

-        ref_llm = LLM(model=model_name, max_model_len=1024)
+        ref_llm = LLM(model=model_name, max_model_len=1024, enforce_eager=True)
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         del ref_llm

@@ -85,6 +85,7 @@ def test_ngram_correctness(
                 "num_speculative_tokens": 3,
             },
             max_model_len=1024,
+            enforce_eager=True,
         )
         spec_outputs = spec_llm.chat(test_prompts, sampling_config)
         matches = 0
@@ -135,6 +136,7 @@ def test_eagle_correctness(
                 "max_model_len": 2048,
             },
             max_model_len=2048,
+            enforce_eager=True,
         )
         spec_outputs = spec_llm.chat(test_prompts, sampling_config)
         matches = 0

tests/multicard/test_dynamic_npugraph_batchsize.py

Lines changed: 4 additions & 2 deletions
@@ -18,8 +18,7 @@
 import torch
 from vllm import LLM, SamplingParams

-# TODO: revert me when cuda hard code is fixed in 'VllmBackend'
-torch.cuda.CUDAGraph = torch.npu.NPUGraph
+from vllm_ascend.utils import vllm_version_is

 MODELS = [
     "Qwen/Qwen2.5-0.5B-Instruct",
@@ -33,6 +32,9 @@
 ]


+@pytest.mark.skipif(
+    (vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1")),
+    reason="aclgraph not supported in v0.8.5 and v0.8.5.post1")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("tp_size", TENSOR_PARALLELS)
 @pytest.mark.parametrize("max_tokens", [64])

tests/multicard/test_offline_inference_distributed.py

Lines changed: 0 additions & 3 deletions
@@ -22,7 +22,6 @@
 """
 import os

-import pytest
 import vllm  # noqa: F401

 from tests.conftest import VllmRunner
@@ -47,8 +46,6 @@ def test_models_distributed_QwQ():
     vllm_model.generate_greedy(example_prompts, max_tokens)


-@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "1",
-                    reason="deepseek v2 lite is not supported on v1")
 def test_models_distributed_DeepSeek():
     example_prompts = [
         "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",

tests/singlecard/test_offline_inference.py

Lines changed: 1 addition & 1 deletion
@@ -52,7 +52,7 @@ def test_models(model: str, dtype: str, max_tokens: int) -> None:
     with VllmRunner(model,
                     max_model_len=8192,
                     dtype=dtype,
-                    enforce_eager=False,
+                    enforce_eager=True,
                     gpu_memory_utilization=0.7) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)


vllm_ascend/attention/mla_v1.py

Lines changed: 2 additions & 4 deletions
@@ -239,10 +239,8 @@ def build(self,
         # it blocks on all previous kernels.
         device = self.runner.device

-        block_table = self.runner.input_batch.block_table[0].get_device_tensor(
-        )
-        block_table[:num_reqs, :self.runner.max_num_blocks_per_req] = (
-            block_table[:num_reqs])
+        block_table = (self.runner.input_batch.block_table[0].
+                       get_device_tensor()[:num_reqs])
         slot_mapping = self.runner.slot_mapping_cpu[:num_actual_tokens].to(
             device, non_blocking=True)
         input_positions = self.runner.positions_cpu[:num_actual_tokens].to(
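
The rewrite drops the in-place copy into the block table and keeps only a slice of the first num_reqs rows. In PyTorch, basic slicing returns a view, so no new device memory is allocated and no extra write is issued. A small illustration of that property with a stand-in tensor; this is not code from this commit:

import torch

full = torch.zeros(1024, 64)   # stand-in for the full block table
rows = full[:8]                # basic slicing returns a view, not a copy
assert rows.data_ptr() == full.data_ptr()  # same underlying storage, nothing was written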
