
Commit e19a8ee

Add examples and algorithm for non-shifting, fixes some minor issues
Signed-off-by: morgendave <morgendave@gmail.com>
1 parent 2e9541f commit e19a8ee

File tree (4 files changed: +79 -38 lines)

tests/v1/e2e/test_spec_decode.py
vllm/config.py
vllm/v1/spec_decode/eagle.py
vllm/v1/worker/gpu_model_runner.py


tests/v1/e2e/test_spec_decode.py

Lines changed: 24 additions & 8 deletions
@@ -126,24 +126,38 @@ def test_ngram_correctness(
 @pytest.mark.parametrize(
     "model_setup,mm_enabled", [
         (("eagle", "meta-llama/Llama-3.1-8B-Instruct",
-          "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False),
+          "yuhuili/EAGLE-LLaMA3.1-Instruct-8B", 1), False, True),
         (("eagle3", "meta-llama/Llama-3.1-8B-Instruct",
-          "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False),
+          "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", 1), False, True),
         pytest.param(
             (("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-              "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), False),
+              "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
+             False, True),
             marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
         pytest.param(
             (("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
-              "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4), True),
+              "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
+             True, True),
+            marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
+        pytest.param(
+            (("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+              "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
+             False, False),
+            marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
+        pytest.param(
+            (("eagle", "meta-llama/Llama-4-Scout-17B-16E-Instruct",
+              "morgendave/EAGLE-Llama-4-Scout-17B-16E-Instruct", 4),
+             True, False),
             marks=pytest.mark.skip(reason="Skipping due to CI OOM issues")),
     ],
-    ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle", "llama4_eagle_mm"])
+    ids=["llama3_eagle", "llama3_eagle3", "llama4_eagle", "llama4_eagle_mm",
+         "llama4_eagle_no_shift", "llama4_eagle_mm_no_shift"])
 def test_eagle_correctness(
     monkeypatch: pytest.MonkeyPatch,
     sampling_config: SamplingParams,
     model_setup: tuple[str, str, str, int],
     mm_enabled: bool,
+    prefill_shift: bool,
 ):
     # Generate test prompts inside the function instead of using fixture
     test_prompts = get_test_prompts(mm_enabled)
@@ -156,8 +170,9 @@ def test_eagle_correctness(
         m.setenv("VLLM_USE_V1", "1")
         method, model_name, spec_model_name, tp_size = model_setup

+        max_model_len = 2048 if not mm_enabled else 4096
         ref_llm = LLM(model=model_name,
-                      max_model_len=2048,
+                      max_model_len=max_model_len,
                       tensor_parallel_size=tp_size)
         ref_outputs = ref_llm.chat(test_prompts, sampling_config)
         del ref_llm
@@ -172,9 +187,10 @@ def test_eagle_correctness(
                 "method": method,
                 "model": spec_model_name,
                 "num_speculative_tokens": 3,
-                "max_model_len": 2048,
+                "max_model_len": max_model_len,
+                "prefill_token_shift": prefill_shift,
             },
-            max_model_len=2048,
+            max_model_len=max_model_len,
         )
         spec_outputs = spec_llm.chat(test_prompts, sampling_config)
         matches = 0

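For reference, the new test parametrization drives the non-shifting mode through the `prefill_token_shift` key of `speculative_config`. A minimal usage sketch under that assumption (model and draft checkpoint names are taken from the test above; this is not an officially documented example):

from vllm import LLM, SamplingParams

# Enable EAGLE speculative decoding with non-shifted prefill tokens.
# Setting "prefill_token_shift": True restores the original shift-by-one behavior.
llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",
    max_model_len=2048,
    speculative_config={
        "method": "eagle",
        "model": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
        "num_speculative_tokens": 3,
        "max_model_len": 2048,
        "prefill_token_shift": False,
    },
)
outputs = llm.chat(
    [{"role": "user", "content": "Explain speculative decoding in one sentence."}],
    SamplingParams(temperature=0, max_tokens=64),
)
print(outputs[0].outputs[0].text)
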
vllm/config.py

Lines changed: 1 addition & 1 deletion
@@ -2557,7 +2557,7 @@ class SpeculativeConfig:

     # Config for kv sharing, map from base model layer to draft layer
     # Key is draft layer, value is base layer
-    kv_sharing_mapping: SkipValidation[dict[str, str]] = None
+    kv_sharing_mapping: SkipValidation[dict[str, str]] = None  # type: ignore
     """KV copy mapping for prefill stage from base to draft"""

     def compute_hash(self) -> str:

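The `# type: ignore` silences the type checker's complaint about a `None` default on a `dict[str, str]` annotation. For context on the field itself: `kv_sharing_mapping` maps each draft-model layer to the base-model layer whose prefill KV it copies (key = draft layer, value = base layer). A hedged illustration, assuming the field is passed through `speculative_config` like other SpeculativeConfig fields; the layer names are hypothetical and depend on how the two models name their decoder layers:

# Hypothetical mapping: the draft's only decoder layer reuses prefill KV
# from the base model's last decoder layer (key = draft layer, value = base layer).
speculative_config = {
    "method": "eagle",
    "model": "yuhuili/EAGLE-LLaMA3.1-Instruct-8B",
    "num_speculative_tokens": 3,
    "prefill_token_shift": False,  # kv sharing requires non-shifting, per the docstring below
    "kv_sharing_mapping": {
        "model.layers.0": "model.layers.31",
    },
}
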
vllm/v1/spec_decode/eagle.py

Lines changed: 45 additions & 26 deletions
@@ -104,6 +104,7 @@ def _prepare_adjusted_tensors(
         cu_num_tokens: torch.Tensor,
         decode_mask: torch.Tensor,
         full_prefill_mask: torch.Tensor,
+        partial_prefill_mask: torch.Tensor,
         prefill_first_hiddens: torch.Tensor,
         block_table: torch.Tensor,
         batch_size: int,
@@ -131,6 +132,34 @@ def _prepare_adjusted_tensors(
             tuple: (target_positions, target_hidden_states, target_slot_mapping,
                 cu_num_tokens, current_pos, partial_prefill_mask)

+        Algorithm design:
+        - Suppose the target tokens are [1, 2, 3, ... N] and the next token is N+1
+        - Positions are [0, 1, 2, ... N-1]
+        - Hidden states are [h1, h2, h3, ... hN]
+        - Suppose a partial prefill covers tokens [Nm, Nm+1, ... Nm+M-1]
+        - For normal shifting:
+          - draft prefill is [2, 3, ... N+1], positions are the same as the target
+          - stacked hidden states are [h1, h2, h3, ... hN]
+          - decode tokens are [N+2, N+3, ...], hidden states are [hN+1, hN+2, ...]
+          - decode positions are [N, N+1, ...]
+          - draft partial prefill is [Nm+1, Nm+2, ... Nm+M]
+        - For non-shifting:
+          - draft full prefill is [1, 2, 3, ... N+1], positions are [0, 1, 2, ... N]
+          - stacked hidden states are [hN, h1, h2, h3, ... hN]
+          - decode tokens are [N+2, N+3, ...], hidden states are [hN+1, hN+2, ...]
+          - decode positions are [N+1, N+2, ...]
+          - draft partial prefill is [Nm, Nm+1, ... Nm+M-1]
+          - draft hidden states are [hNm-1, hNm, ... hNm+M]
+            (hNm-1 is the hidden state from the previous round)
+        - For kv sharing (non-shifting required):
+          None of the target prefill tokens need to be processed in the draft
+          prefill step, since the draft's KV for them is not needed.
+          - draft full prefill is [N+1], position is [N]
+          - stacked hidden states are [hN]
+          - decode is the same as the non-shifting decode
+          - draft partial prefill is skipped entirely
+        All other metadata (slot mapping, etc.) must be regenerated from the
+        adjusted positions and tokens.
         """
         # Count total number of full prefill requests to determine the
         # size needed for adjusted tensors
@@ -184,21 +213,6 @@ def _prepare_adjusted_tensors(
         # Create updated cumulative token counts
         updated_cu_num_tokens = torch.zeros_like(cu_num_tokens)

-        # Track which requests are partial prefill (no decode tokens)
-        partial_prefill_mask = torch.zeros_like(full_prefill_mask)
-
-        # Create masks for each category
-        has_decode_mask = torch.zeros(batch_size,
-                                      dtype=torch.bool,
-                                      device=decode_mask.device)
-        for i in range(batch_size):
-            start_idx = cu_num_tokens[i].item()
-            end_idx = cu_num_tokens[i + 1].item()
-            has_decode_mask[i] = decode_mask[start_idx:end_idx].any().item()
-
-        # Category 1: Partial prefill (no decode tokens)
-        partial_prefill_mask = ~has_decode_mask
-
         # Process batched operations using masks
         current_pos = 0
         cu_num_tokens_index = 0
@@ -368,6 +382,7 @@ def propose(
         mm_embeds: Optional[list[torch.Tensor]] = None,
         decode_mask: torch.Tensor = None,
         full_prefill_mask: torch.Tensor = None,
+        partial_prefill_mask: torch.Tensor = None,
     ) -> torch.Tensor:
         num_tokens = target_token_ids.shape[0]
         batch_size = next_token_ids.shape[0]
@@ -388,6 +403,17 @@ def propose(
             prefill_shift_tokens = False

         if not prefill_shift_tokens and has_prefill:
+            if (partial_prefill_mask.all()
+                    and self.draft_prefill_kv_sharing_from_base):
+                # All requests are partial prefill and
+                # KV cache sharing is enabled
+                # Skip the rest of the function
+                # and return dummy draft tokens
+                return torch.zeros(
+                    (batch_size, self.num_speculative_tokens),
+                    dtype=target_token_ids.dtype,
+                    device=target_token_ids.device,
+                )
             # Adjust the tensors for full prefill requests
             (
                 target_positions,
@@ -404,22 +430,12 @@ def propose(
                 cu_num_tokens,
                 decode_mask,
                 full_prefill_mask,
+                partial_prefill_mask,
                 prefill_first_hiddens,
                 block_table,
                 batch_size,
                 num_tokens,
             )
-            if (partial_prefill_mask.all()
-                    and self.draft_prefill_kv_sharing_from_base):
-                # All requests are partial prefill and
-                # KV cache sharing is enabled
-                # Skip the rest of the function
-                # and return dummy draft tokens
-                return torch.zeros(
-                    (batch_size, self.num_speculative_tokens),
-                    dtype=target_token_ids.dtype,
-                    device=target_token_ids.device,
-                )
             batch_size = cu_num_tokens.shape[0] - 1
         else:
             # Original behavior: shift all tokens by one
@@ -451,6 +467,9 @@ def propose(
         if not prefill_shift_tokens and has_prefill:
             # Replace the last token with the next token under non-shifting,
             # but only for non-partial prefill requests
+            # For partial prefill in non-shifting, the draft input just matches
+            # the target prefill tokens (positions and hidden states already
+            # line up), so there is no need to append the next round's token
             mask = ~partial_prefill_mask
             # if we enable copy kv then all of the partial prefills
             # are completely skipped so they won't be in last_token_indices

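To make the docstring's shifting vs. non-shifting layout concrete, here is a standalone sketch in plain Python (labels only, not vLLM code); `prev_hidden` stands in for the hidden state that the non-shifted stack is front-padded with (hN in the docstring):

def draft_prefill_layout(tokens, positions, hiddens, next_token, prev_hidden, shift):
    if shift:
        # Normal shifting: drop the first token, append the next token;
        # positions and hidden states stay aligned with the target.
        return tokens[1:] + [next_token], positions[:], hiddens[:]
    # Non-shifting: keep every target token plus the next token, extend the
    # positions by one, and stack the previous hidden state in front.
    return (tokens + [next_token],
            positions + [positions[-1] + 1],
            [prev_hidden] + hiddens)

tokens = [1, 2, 3, 4]              # target tokens 1..N with N = 4
positions = [0, 1, 2, 3]           # 0..N-1
hiddens = ["h1", "h2", "h3", "h4"]

print(draft_prefill_layout(tokens, positions, hiddens, 5, "h4", shift=True))
# ([2, 3, 4, 5], [0, 1, 2, 3], ['h1', 'h2', 'h3', 'h4'])
print(draft_prefill_layout(tokens, positions, hiddens, 5, "h4", shift=False))
# ([1, 2, 3, 4, 5], [0, 1, 2, 3, 4], ['h4', 'h1', 'h2', 'h3', 'h4'])

Under kv sharing, the docstring notes that the full-prefill input collapses further to just [N+1] with hidden [hN], and partial prefills are skipped entirely.
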
vllm/v1/worker/gpu_model_runner.py

Lines changed: 9 additions & 3 deletions
@@ -1303,8 +1303,7 @@ def execute_model(

         # Prepare the decoder inputs.
         (attn_metadata, attention_cuda_graphs, logits_indices,
-         spec_decode_metadata,
-         num_scheduled_tokens_np,
+         spec_decode_metadata, num_scheduled_tokens_np,
          decode_mask) = (self._prepare_inputs(scheduler_output))
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if (self.use_cuda_graph
@@ -1687,6 +1686,7 @@ def propose_draft_token_ids(
         # This is used in non-shifted prefill for eagle draft.
         prefill_first_hiddens = []
         full_prefill_mask = []
+        partial_prefill_mask = []
         for i, token_ids in enumerate(sampled_token_ids):
             req_id = self.input_batch.req_ids[i]
             req_state = self.requests[req_id]
@@ -1695,7 +1695,7 @@ def propose_draft_token_ids(
             # works very well for init the first prefill hidden state.
             if req_state.prefill_hidden_states is None:
                 req_state.prefill_hidden_states = target_hidden_states[
-                    cu_num_tokens[i]]
+                    cu_num_tokens[i + 1] - 1]
             prefill_first_hiddens.append(req_state.prefill_hidden_states)
             num_prompt_tokens = req_state.num_prompt_tokens
             num_scheduled_tokens = scheduler_output.num_scheduled_tokens[
@@ -1707,6 +1707,7 @@ def propose_draft_token_ids(
             if token_ids:
                 # Common case.
                 next_token_id = token_ids[-1]
+                partial_prefill_mask.append(False)
             else:
                 # Partial prefill (rare case).
                 # Get the next token id from the request state.
@@ -1719,6 +1720,7 @@ def propose_draft_token_ids(
                 # as the first prefill hidden for the next round
                 req_state.prefill_hidden_states = target_hidden_states[
                     last_hidden_index]
+                partial_prefill_mask.append(True)
             next_token_ids.append(next_token_id)
         next_token_ids = torch.tensor(next_token_ids,
                                       dtype=torch.int32,
@@ -1727,6 +1729,9 @@ def propose_draft_token_ids(
         full_prefill_mask = torch.tensor(full_prefill_mask,
                                          dtype=torch.bool,
                                          device=self.device)
+        partial_prefill_mask = torch.tensor(partial_prefill_mask,
+                                            dtype=torch.bool,
+                                            device=self.device)
         draft_token_ids = self.drafter.propose(
             target_token_ids=target_token_ids,
             target_positions=target_positions,
@@ -1740,6 +1745,7 @@ def propose_draft_token_ids(
             prefill_first_hiddens=prefill_first_hiddens,
             decode_mask=decode_mask,
             full_prefill_mask=full_prefill_mask,
+            partial_prefill_mask=partial_prefill_mask,
         )
         spec_token_ids = draft_token_ids.tolist()
         return spec_token_ids

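The runner-side change boils down to: a request whose sampled_token_ids entry is empty this step is still mid-prefill, so it is flagged in partial_prefill_mask (and its next token and first-prefill hidden state come from the request state instead). A minimal sketch of just the mask-building logic, using illustrative names rather than the runner's actual API:

import torch

def build_partial_prefill_mask(sampled_token_ids: list[list[int]],
                               device: torch.device) -> torch.Tensor:
    # A request that produced no sampled token this step is a partial prefill.
    mask = [len(token_ids) == 0 for token_ids in sampled_token_ids]
    return torch.tensor(mask, dtype=torch.bool, device=device)

# Request 0 finished its prefill and sampled token 42; request 1 is mid-prefill.
print(build_partial_prefill_mask([[42], []], torch.device("cpu")))
# tensor([False,  True])
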