Commit 2f8fe49

Add llama4 eagle to e2e test for spec decode

1 parent 4178543

File tree

2 files changed: +32 -21 lines

  tests/v1/e2e/test_spec_decode.py
  vllm/model_executor/models/llama4_eagle.py
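
For reference, the new Llama 4 case is selected by its parametrize id, llama4_eagle (see the test diff below). Assuming a checkout with this commit, access to the listed checkpoints (note that the Llama 4 Scout base model is referenced by a local path in the diff), and enough GPUs for tensor_parallel_size=4, it can be run with something like:

    pytest tests/v1/e2e/test_spec_decode.py::test_eagle_correctness -k llama4_eagle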

tests/v1/e2e/test_spec_decode.py

Lines changed: 29 additions & 17 deletions
@@ -6,8 +6,10 @@
 from typing import Any
 
 import pytest
+import torch
 
 from vllm import LLM, SamplingParams
+from vllm.distributed import cleanup_dist_env_and_memory
 
 
 @pytest.fixture
@@ -53,14 +55,6 @@ def model_name():
     return "meta-llama/Llama-3.1-8B-Instruct"
 
 
-def eagle_model_name():
-    return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"
-
-
-def eagle3_model_name():
-    return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"
-
-
 def test_ngram_correctness(
     monkeypatch: pytest.MonkeyPatch,
     test_prompts: list[list[dict[str, Any]]],
@@ -77,6 +71,8 @@ def test_ngram_correctness(
         ref_llm = LLM(model=model_name, max_model_len=1024)
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         del ref_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
 
         spec_llm = LLM(
             model=model_name,
@@ -103,34 +99,48 @@ def test_ngram_correctness(
         # Upon failure, inspect the outputs to check for inaccuracy.
         assert matches > int(0.7 * len(ref_outputs))
         del spec_llm
-
-
-@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
+
+
+@pytest.mark.parametrize("model_setup", [
+    ("eagle", "meta-llama/Llama-3.1-8B-Instruct",
+     "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1),
+    ("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
+     "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1),
+    ("eagle", "/home/zhiweiz/local/models/scout_base_HF_20250605_201140",
+     "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
+],
+                         ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle"])
 def test_eagle_correctness(
     monkeypatch: pytest.MonkeyPatch,
     test_prompts: list[list[dict[str, Any]]],
     sampling_config: SamplingParams,
-    model_name: str,
-    use_eagle3: bool,
+    model_setup: tuple[str, str, str, int],
 ):
     '''
     Compare the outputs of a original LLM and a speculative LLM
     should be the same when using eagle speculative decoding.
+    model_setup: (method, model_name, eagle_model_name, tp_size)
     '''
     with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")
+        method, model_name, spec_model_name, tp_size = model_setup
 
-        ref_llm = LLM(model=model_name, max_model_len=2048)
+        ref_llm = LLM(model=model_name,
+                      max_model_len=2048,
+                      tensor_parallel_size=tp_size)
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         del ref_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
 
-        spec_model_name = eagle3_model_name(
-        ) if use_eagle3 else eagle_model_name()
         spec_llm = LLM(
             model=model_name,
             trust_remote_code=True,
+            tensor_parallel_size=tp_size,
             speculative_config={
-                "method": "eagle3" if use_eagle3 else "eagle",
+                "method": method,
                 "model": spec_model_name,
                 "num_speculative_tokens": 3,
                 "max_model_len": 2048,
@@ -152,3 +162,5 @@ def test_eagle_correctness(
         # Upon failure, inspect the outputs to check for inaccuracy.
         assert matches > int(0.66 * len(ref_outputs))
         del spec_llm
+        torch.cuda.empty_cache()
+        cleanup_dist_env_and_memory()
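
Two things to note in this diff: each test now frees accelerator state between the reference and speculative runs (del llm, then torch.cuda.empty_cache() and cleanup_dist_env_and_memory()), and test_eagle_correctness is driven by a (method, model_name, eagle_model_name, tp_size) tuple so the Llama 4 Scout case can request tensor_parallel_size=4. Below is a minimal standalone sketch of the same speculative setup, mirroring the llama3_eagle case; the prompt and generation settings are illustrative, not taken from the test:

    # Minimal sketch (assumes the two Llama 3.1 checkpoints are available
    # and one GPU is free); mirrors the llama3_eagle case above.
    from vllm import LLM, SamplingParams

    method, model_name, spec_model_name, tp_size = (
        "eagle",
        "meta-llama/Llama-3.1-8B-Instruct",
        "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
        1,
    )

    llm = LLM(
        model=model_name,
        max_model_len=2048,
        tensor_parallel_size=tp_size,
        speculative_config={
            "method": method,             # "eagle" or "eagle3"
            "model": spec_model_name,     # EAGLE draft-head weights
            "num_speculative_tokens": 3,  # draft tokens proposed per step
            "max_model_len": 2048,
        },
    )
    # Illustrative prompt; the real test instead compares chat outputs
    # against a non-speculative reference and requires >66% of them to match.
    outputs = llm.generate(["The capital of France is"],
                           SamplingParams(temperature=0, max_tokens=16))
    print(outputs[0].outputs[0].text)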

vllm/model_executor/models/llama4_eagle.py

Lines changed: 3 additions & 4 deletions
@@ -1,4 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 from collections.abc import Iterable
 from typing import Optional
@@ -177,10 +178,8 @@ def forward(
     ) -> tuple[torch.Tensor, torch.Tensor]:
         return self.model(input_ids, positions, hidden_states)
 
-    def load_weights(
-        self,
-        weights: Iterable[tuple[str, torch.Tensor]]
-    ) -> None:
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> None:
         loader = AutoWeightsLoader(
             self,
             # lm_head is tied with target model (Llama4ForCausalLM)

0 commit comments