# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

+import os
import random
from typing import Any

import pytest
+
from vllm import LLM, SamplingParams

+os.environ["VLLM_USE_MODELSCOPE"] = "True"
+

@pytest.fixture
def test_prompts():
@@ -43,18 +47,20 @@ def test_prompts():

@pytest.fixture
def sampling_config():
-    # Only support greedy for now
    return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False)


@pytest.fixture
def model_name():
-    return "meta-llama/Meta-Llama-3-8B-Instruct"
+    return "LLM-Research/Llama-3.1-8B-Instruct"


-@pytest.fixture
def eagle_model_name():
-    return "yuhuili/EAGLE-LLaMA3-Instruct-8B"
+    return "vllm-ascend/EAGLE-LLaMA3.1-Instruct-8B"
+
+
+def eagle3_model_name():
+    return "vllm-ascend/EAGLE3-LLaMA3.1-Instruct-8B"


def test_ngram_correctness(
@@ -97,37 +103,42 @@ def test_ngram_correctness(

        # Heuristic: expect at least 70% of the prompts to match exactly
        # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.6 * len(ref_outputs))
+        assert matches > int(0.7 * len(ref_outputs))
        del spec_llm


+@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
def test_eagle_correctness(
    monkeypatch: pytest.MonkeyPatch,
    test_prompts: list[list[dict[str, Any]]],
    sampling_config: SamplingParams,
    model_name: str,
-    eagle_model_name: str,
+    use_eagle3: bool,
):
-    pytest.skip("Not currently supported.")
    '''
    Compare the outputs of an original LLM and a speculative LLM;
    they should be the same when using eagle speculative decoding.
    '''
+    pytest.skip("Not currently supported.")
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

-        ref_llm = LLM(model=model_name, max_model_len=1024)
+        ref_llm = LLM(model=model_name, max_model_len=2048)
        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
        del ref_llm

+        spec_model_name = eagle3_model_name(
+        ) if use_eagle3 else eagle_model_name()
        spec_llm = LLM(
            model=model_name,
+            trust_remote_code=True,
            speculative_config={
-                "method": "eagle",
-                "model": eagle_model_name,
+                "method": "eagle3" if use_eagle3 else "eagle",
+                "model": spec_model_name,
                "num_speculative_tokens": 3,
+                "max_model_len": 2048,
            },
-            max_model_len=1024,
+            max_model_len=2048,
        )
        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
        matches = 0
@@ -140,7 +151,7 @@ def test_eagle_correctness(
                print(f"ref_output: {ref_output.outputs[0].text}")
                print(f"spec_output: {spec_output.outputs[0].text}")

-        # Heuristic: expect at least 70% of the prompts to match exactly
+        # Heuristic: expect at least 66% of the prompts to match exactly
        # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.7 * len(ref_outputs))
+        assert matches > int(0.66 * len(ref_outputs))
        del spec_llm