
Commit 305a0eb

Author: weijinqian_v1 <weijinqian@huawei.com>
Merge:  6f6efc1 5559443

    handle conflict

    Signed-off-by: weijinqian_v1 <weijinqian@huawei.com>

29 files changed, 1278 insertions(+), 550 deletions(-)

.github/workflows/vllm_ascend_test.yaml

Lines changed: 1 addition & 0 deletions
@@ -203,6 +203,7 @@ jobs:
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_topk
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_W8A8
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_dbo
+          VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeek_w8a8_ep_dbo
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/test_offline_inference_distributed.py::test_models_distributed_DeepSeekV3_dbo
           VLLM_USE_MODELSCOPE=True pytest -sv tests/multicard/ --ignore=tests/multicard/test_ilama_lora_tp2.py --ignore=tests/multicard/test_offline_inference_distributed.py --ignore=tests/multicard/test_w4a8_deepseek.py
         fi
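
To reproduce this new CI step locally, a minimal Python sketch (the test selector and the ModelScope toggle mirror the workflow line above; a multi-card Ascend NPU environment and the vllm-ascend test tree are assumed):

    import os

    import pytest

    # Mirror the CI invocation: fetch weights via ModelScope and run only the
    # new W8A8 expert-parallel DBO test added in this commit.
    os.environ["VLLM_USE_MODELSCOPE"] = "True"
    raise SystemExit(pytest.main([
        "-sv",
        "tests/multicard/test_offline_inference_distributed.py"
        "::test_models_distributed_DeepSeek_w8a8_ep_dbo",
    ]))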

.github/workflows/vllm_ascend_test_long_term.yaml

Lines changed: 3 additions & 3 deletions
@@ -95,11 +95,11 @@ jobs:
       run: |
         if [[ "${{ matrix.os }}" == "linux-arm64-npu-1" ]]; then
           # v0 spec decode test
-          VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py # it needs a clean process
-          pytest -sv tests/long_term/spec_decode_v0 --ignore=tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py
+          # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py # it needs a clean process
+          # pytest -sv tests/long_term/spec_decode_v0 --ignore=tests/long_term/spec_decode_v0/e2e/test_mtp_correctness.py
           # v1 spec decode test
           # TODO: revert me when test_v1_mtp_correctness.py is fixed
-          # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py
+          VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py
           # TODO: revert me when test_v1_spec_decode.py::test_ngram_correctness is fixed
           # VLLM_USE_MODELSCOPE=True pytest -sv tests/long_term/spec_decode_v1/test_v1_spec_decode.py
           # accuracy test single card

tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py

Lines changed: 64 additions & 1 deletion
@@ -63,7 +63,10 @@ def test_mtp_correctness(
     with monkeypatch.context() as m:
         m.setenv("VLLM_USE_V1", "1")

-        ref_llm = LLM(model=model_name, max_model_len=256, enforce_eager=True)
+        ref_llm = LLM(model=model_name,
+                      max_model_len=256,
+                      gpu_memory_utilization=0.8,
+                      enforce_eager=True)
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         del ref_llm

@@ -74,6 +77,7 @@ def test_mtp_correctness(
                 "num_speculative_tokens": 1,
             },
             max_model_len=256,
+            gpu_memory_utilization=0.8,
             enforce_eager=True)
         spec_outputs = spec_llm.chat(test_prompts, sampling_config)
         matches = 0
@@ -90,3 +94,62 @@ def test_mtp_correctness(
         # Upon failure, inspect the outputs to check for inaccuracy.
         assert matches > int(0.66 * len(ref_outputs))
         del spec_llm
+
+
+def test_mtp_torchair_correctness(
+        monkeypatch: pytest.MonkeyPatch,
+        test_prompts: list[list[dict[str, Any]]],
+        sampling_config: SamplingParams,
+        model_name: str,
+):
+    '''
+    Compare the outputs of an original LLM and a speculative LLM;
+    they should be the same when using MTP speculative decoding.
+    '''
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_USE_V1", "1")
+
+        ref_llm = LLM(model=model_name,
+                      max_model_len=256,
+                      enforce_eager=False,
+                      additional_config={
+                          "torchair_graph_config": {
+                              "enabled": True
+                          },
+                          "ascend_scheduler_config": {
+                              "enabled": True
+                          },
+                      })
+        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
+        del ref_llm
+
+        spec_llm = LLM(model=model_name,
+                       trust_remote_code=True,
+                       enforce_eager=False,
+                       speculative_config={
+                           "method": "deepseek_mtp",
+                           "num_speculative_tokens": 1,
+                       },
+                       additional_config={
+                           "torchair_graph_config": {
+                               "enabled": True
+                           },
+                           "ascend_scheduler_config": {
+                               "enabled": True
+                           },
+                       })
+        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
+        matches = 0
+        misses = 0
+        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
+            if ref_output.outputs[0].text == spec_output.outputs[0].text:
+                matches += 1
+            else:
+                misses += 1
+                print(f"ref_output: {ref_output.outputs[0].text}")
+                print(f"spec_output: {spec_output.outputs[0].text}")
+
+        # Heuristic: expect at least 66% of the prompts to match exactly
+        # Upon failure, inspect the outputs to check for inaccuracy.
+        assert matches > int(0.66 * len(ref_outputs))
+        del spec_llm
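
Both tests in this file share the same pass criterion: greedy outputs from the reference engine and the MTP speculative engine are compared for exact string equality, and more than 66% of prompts must match. A minimal sketch of that heuristic factored out as a helper (hypothetical, not part of the diff):

    from vllm import RequestOutput


    def exact_match_rate(ref_outputs: list[RequestOutput],
                         spec_outputs: list[RequestOutput]) -> float:
        # Count prompts whose first completion is byte-for-byte identical
        # between the reference run and the speculative (MTP) run.
        matches = sum(
            1 for ref, spec in zip(ref_outputs, spec_outputs)
            if ref.outputs[0].text == spec.outputs[0].text)
        return matches / max(len(ref_outputs), 1)


    # Equivalent to the in-test assertion:
    # assert exact_match_rate(ref_outputs, spec_outputs) > 0.66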

tests/multicard/test_offline_inference_distributed.py

Lines changed: 24 additions & 0 deletions
@@ -109,6 +109,30 @@ def test_models_distributed_DeepSeek_dbo():
         vllm_model.generate(example_prompts, sampling_params)


+@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
+def test_models_distributed_DeepSeek_w8a8_ep_dbo():
+    example_prompts = ["The president of the United States is"] * 100
+    sampling_params = SamplingParams(max_tokens=100, temperature=0.0)
+    with VllmRunner(
+            snapshot_download("vllm-ascend/DeepSeek-V2-Lite-W8A8"),
+            dtype="auto",
+            quantization="ascend",
+            tensor_parallel_size=4,
+            enforce_eager=True,
+            enable_expert_parallel=True,
+            distributed_executor_backend="mp",
+            additional_config={"ascend_scheduler_config": {
+                "enabled": True,
+            }}) as vllm_model:
+        model_arch = 'DeepseekV2ForCausalLM'
+        registed_models = ModelRegistry.models
+        assert registed_models[
+            model_arch].module_name == "vllm_ascend.models.deepseek_dbo"
+        assert registed_models[
+            model_arch].class_name == "CustomDeepseekDBOForCausalLM"
+        vllm_model.generate(example_prompts, sampling_params)
+
+
 @pytest.mark.skip(reason="Due to OOM,waiting for 1311pr to merge in")
 @patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DBO": "1"})
 def test_models_distributed_DeepSeekV3_dbo():
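
The new test asserts that, with VLLM_ASCEND_ENABLE_DBO=1, the DeepseekV2ForCausalLM architecture resolves to the DBO implementation supplied by vllm-ascend. A rough standalone sketch of that registry lookup follows; note the test performs this check inside a live VllmRunner, i.e. after the vllm-ascend plugin has registered its models, so outside an engine the override may not be visible yet. The expected module and class strings are taken from the test above.

    import os

    # Must be set before vllm / vllm_ascend load so the plugin registers the
    # DBO variant of the DeepSeek model (timing depends on plugin loading).
    os.environ["VLLM_ASCEND_ENABLE_DBO"] = "1"

    from vllm import ModelRegistry  # noqa: E402

    entry = ModelRegistry.models["DeepseekV2ForCausalLM"]
    print(entry.module_name)  # expected: vllm_ascend.models.deepseek_dbo
    print(entry.class_name)   # expected: CustomDeepseekDBOForCausalLM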

tests/multicard/test_torchair_graph_mode.py

Lines changed: 4 additions & 4 deletions
@@ -71,10 +71,10 @@ def test_e2e_deepseekv3_with_torchair(monkeypatch: pytest.MonkeyPatch,
     # inaccurate. This will only change if accuracy improves with the
     # official weights of DeepSeek-V3.
     golden_results = [
-        'Hello, my name is feasibility伸 spazio debtor添',
-        'The president of the United States is begg"""\n杭州风和 bestimm',
-        'The capital of France is frequentlyশามalinkAllowed',
-        'The future of AI is deleting俯احت怎么样了حراف',
+        'Hello, my name is下载早点向前很有่อง',
+        'The president of the United States isSender)## physiological Albany',
+        'The capital of France is Rocky转角 hospitalizedinterval sparked',
+        'The future of AI is её asegο BIOS一扫',
     ]

     assert len(golden_results) == len(vllm_output)

tests/singlecard/test_aclgraph.py

Lines changed: 11 additions & 1 deletion
@@ -36,9 +36,11 @@
                     reason="aclgraph only support on v1")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("max_tokens", [32])
+@pytest.mark.parametrize("full_graph", [False])
 def test_models(
     model: str,
     max_tokens: int,
+    full_graph: bool,
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
     with monkeypatch.context() as m:
@@ -54,7 +56,15 @@ def test_models(
                                          temperature=0.0)
         # TODO: change to use vllmrunner when the registry of custom op is solved
         # while running pytest
-        vllm_model = LLM(model)
+        if full_graph:
+            vllm_model = LLM(model,
+                             compilation_config={
+                                 "full_cuda_graph": True,
+                                 "cudagraph_capture_sizes":
+                                 [1, 4, 16, 64, 256]
+                             })
+        else:
+            vllm_model = LLM(model)
         vllm_aclgraph_outputs = vllm_model.generate(prompts, sampling_params)
         del vllm_model
         torch.npu.empty_cache()
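
The new full_graph branch exercises vLLM's compilation_config path: full_cuda_graph enables full-graph capture and cudagraph_capture_sizes pins the batch sizes to capture. Note the parameter is currently parametrized with only [False], so this branch is not exercised by default. A minimal standalone sketch of that configuration (the model name is a placeholder, not taken from the diff; the sizes mirror the test):

    from vllm import LLM, SamplingParams

    # Placeholder model; the test iterates over its own MODELS list.
    llm = LLM("Qwen/Qwen2.5-0.5B-Instruct",
              compilation_config={
                  "full_cuda_graph": True,
                  "cudagraph_capture_sizes": [1, 4, 16, 64, 256],
              })

    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(temperature=0.0, max_tokens=32))
    print(outputs[0].outputs[0].text)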

vllm_ascend/ascend_forward_context.py

Lines changed: 10 additions & 1 deletion
@@ -60,12 +60,21 @@ def set_ascend_forward_context(

     forward_context.in_profile_run = in_profile_run

+    # NOTE: This cannot be set using set_forward_context
+    # due to multiple warmups before actual capturing
+    forward_context.capturing = False
+
     dp_world_size = get_dp_group().world_size
     if dp_world_size > 1 and forward_context.dp_metadata is not None:
         forward_context.max_tokens_across_dp = forward_context.dp_metadata.max_tokens_across_dp_cpu.item(
         )
+    elif num_tokens is not None:
+        forward_context.max_tokens_across_dp = num_tokens
     elif attn_metadata is not None:
-        forward_context.max_tokens_across_dp = num_tokens or attn_metadata.num_actual_tokens
+        if hasattr(attn_metadata, 'num_actual_tokens'):
+            forward_context.max_tokens_across_dp = attn_metadata.num_actual_tokens
+        else:
+            forward_context.max_tokens_across_dp = attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
     else:
         forward_context.max_tokens_across_dp = None

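This hunk changes how max_tokens_across_dp is derived: an explicitly passed num_tokens now takes priority over attention metadata, and metadata that lacks num_actual_tokens falls back to the sum of prefill and decode tokens. A condensed sketch of the resulting resolution order (a hypothetical helper, not the actual function in vllm_ascend):

    from typing import Any, Optional


    def resolve_max_tokens_across_dp(dp_world_size: int,
                                     dp_metadata: Optional[Any],
                                     num_tokens: Optional[int],
                                     attn_metadata: Optional[Any]) -> Optional[int]:
        # 1. Data parallel with metadata: take the max across DP ranks.
        if dp_world_size > 1 and dp_metadata is not None:
            return dp_metadata.max_tokens_across_dp_cpu.item()
        # 2. An explicitly provided token count wins next.
        if num_tokens is not None:
            return num_tokens
        # 3. Fall back to attention metadata, which may or may not expose
        #    num_actual_tokens depending on the backend.
        if attn_metadata is not None:
            if hasattr(attn_metadata, "num_actual_tokens"):
                return attn_metadata.num_actual_tokens
            return attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
        # 4. Nothing to derive from.
        return None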
