
Commit d8848a2

[V1] Support ngram spec decode
Signed-off-by: ponix-j <657511300@qq.com>
1 parent 00e0243 commit d8848a2

File tree

11 files changed: +1707, -10 lines changed


tests/sample/__init__.py

Whitespace-only changes.

tests/sample/test_rejection_sampler.py

Lines changed: 611 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 153 additions & 0 deletions
@@ -0,0 +1,153 @@
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

import random
from typing import Any

import pytest

from vllm import LLM, SamplingParams


@pytest.fixture
def test_prompts():
    prompt_types = ["repeat", "sentence"]
    num_prompts = 100
    prompts = []

    random.seed(0)
    random_prompt_type_choices = random.choices(prompt_types, k=num_prompts)

    # Generate a mixed batch of prompts, some of which can be easily
    # predicted by n-gram matching and some which likely cannot.
    for kind in random_prompt_type_choices:
        word_choices = ["test", "temp", "hello", "where"]
        word = random.choice(word_choices)
        if kind == "repeat":
            prompt = f"""
            please repeat the word '{word}' 10 times.
            give no other output than the word at least ten times in a row,
            in lowercase with spaces between each word and without quotes.
            """
        elif kind == "sentence":
            prompt = f"""
            please give a ten-word sentence that
            uses the word {word} at least once.
            give no other output than that simple sentence without quotes.
            """
        else:
            raise ValueError(f"Unknown prompt type: {kind}")
        prompts.append([{"role": "user", "content": prompt}])

    return prompts


@pytest.fixture
def sampling_config():
    return SamplingParams(temperature=0, max_tokens=10, ignore_eos=False)


@pytest.fixture
def model_name():
    return "meta-llama/Llama-3.1-8B-Instruct"


def eagle_model_name():
    return "yuhuili/EAGLE-LLaMA3.1-Instruct-8B"


def eagle3_model_name():
    return "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B"


def test_ngram_correctness(
    monkeypatch: pytest.MonkeyPatch,
    test_prompts: list[list[dict[str, Any]]],
    sampling_config: SamplingParams,
    model_name: str,
):
    '''
    Compare the outputs of an original LLM and a speculative LLM.
    They should be the same when using ngram speculative decoding.
    '''
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        ref_llm = LLM(model=model_name, max_model_len=1024)
        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
        del ref_llm

        spec_llm = LLM(
            model=model_name,
            speculative_config={
                "method": "ngram",
                "prompt_lookup_max": 5,
                "prompt_lookup_min": 3,
                "num_speculative_tokens": 3,
            },
            max_model_len=1024,
        )
        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
        matches = 0
        misses = 0
        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
            if ref_output.outputs[0].text == spec_output.outputs[0].text:
                matches += 1
            else:
                misses += 1
                print(f"ref_output: {ref_output.outputs[0].text}")
                print(f"spec_output: {spec_output.outputs[0].text}")

        # Heuristic: expect at least 70% of the prompts to match exactly.
        # Upon failure, inspect the outputs to check for inaccuracy.
        assert matches > int(0.7 * len(ref_outputs))
        del spec_llm


@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
def test_eagle_correctness(
    monkeypatch: pytest.MonkeyPatch,
    test_prompts: list[list[dict[str, Any]]],
    sampling_config: SamplingParams,
    model_name: str,
    use_eagle3: bool,
):
    '''
    Compare the outputs of an original LLM and a speculative LLM.
    They should be the same when using EAGLE speculative decoding.
    '''
    with monkeypatch.context() as m:
        m.setenv("VLLM_USE_V1", "1")

        ref_llm = LLM(model=model_name, max_model_len=2048)
        ref_outputs = ref_llm.chat(test_prompts, sampling_config)
        del ref_llm

        spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
        spec_llm = LLM(
            model=model_name,
            trust_remote_code=True,
            speculative_config={
                "method": "eagle3" if use_eagle3 else "eagle",
                "model": spec_model_name,
                "num_speculative_tokens": 3,
                "max_model_len": 2048,
            },
            max_model_len=2048,
        )
        spec_outputs = spec_llm.chat(test_prompts, sampling_config)
        matches = 0
        misses = 0
        for ref_output, spec_output in zip(ref_outputs, spec_outputs):
            if ref_output.outputs[0].text == spec_output.outputs[0].text:
                matches += 1
            else:
                misses += 1
                print(f"ref_output: {ref_output.outputs[0].text}")
                print(f"spec_output: {spec_output.outputs[0].text}")

        # Heuristic: expect at least 66% of the prompts to match exactly.
        # Upon failure, inspect the outputs to check for inaccuracy.
        assert matches > int(0.66 * len(ref_outputs))
        del spec_llm
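
For context on the speculative_config used in test_ngram_correctness above: the "ngram" method drafts tokens by prompt lookup, matching the trailing n-gram of the context (between prompt_lookup_min and prompt_lookup_max tokens long) against an earlier occurrence and proposing the tokens that followed it, up to num_speculative_tokens. Below is a minimal, self-contained sketch of that idea; it is not vLLM's implementation, and the token IDs are made up.

def propose_ngram_draft(token_ids: list[int], min_n: int, max_n: int,
                        num_speculative_tokens: int) -> list[int]:
    """Toy prompt-lookup proposer: find an earlier occurrence of the
    trailing n-gram and return the tokens that followed it."""
    for n in range(max_n, min_n - 1, -1):  # prefer longer n-grams
        if len(token_ids) <= n:
            continue
        tail = token_ids[-n:]
        # Scan earlier occurrences, most recent first.
        for start in range(len(token_ids) - n - 1, -1, -1):
            if token_ids[start:start + n] == tail:
                follow = token_ids[start + n:start + n + num_speculative_tokens]
                if follow:
                    return follow
    return []


# A repetitive context: the trailing n-gram repeats earlier, so the tokens
# that followed it become the draft.
print(propose_ngram_draft([7, 8, 9, 7, 8, 9, 7, 8, 9], 3, 5, 3))  # [7, 8, 9]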

vllm_ascend/attention/attention_v1.py

Lines changed: 5 additions & 0 deletions
@@ -111,6 +111,7 @@ class AscendMetadata:
     block_tables: torch.Tensor
     # (batch_size,). The sequence length per sequence. Sequence length means
     # the computed tokens + new tokens None if it is a decoding.
+    query_start_loc: torch.Tensor
     query_lens: torch.Tensor
     seq_lens: torch.Tensor
     # Maximum query length in the batch. None for decoding.
@@ -141,6 +142,9 @@ def reorder_batch(self, input_batch: "InputBatch",

     def build(self, num_reqs, num_actual_tokens, max_query_len,
               common_prefix_len):
+        query_start_loc_cpu = self.runner.query_start_loc_cpu[:num_reqs + 1]
+        query_start_loc = query_start_loc_cpu.to(self.runner.device,
+                                                 non_blocking=True)
         if vllm_version_is("0.8.5") or vllm_version_is("0.8.5.post1"):
             block_table = (self.runner.input_batch.block_table.
                            get_device_tensor()[:num_reqs])
@@ -159,6 +163,7 @@ def build(self, num_reqs, num_actual_tokens, max_query_len,

         attn_metadata = AscendMetadata(num_actual_tokens=num_actual_tokens,
                                        block_tables=block_table,
+                                       query_start_loc=query_start_loc,
                                        query_lens=query_lens,
                                        seq_lens=seq_lens,
                                        max_query_len=max_query_len,
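
The new query_start_loc metadata field holds the cumulative start offset of each request's query tokens in the flattened batch; this is the same [batch_size + 1] layout that the spec-decode patch below consumes as cu_target_query_lens. A minimal illustration with made-up lengths (plain PyTorch, not vllm-ascend code):

import torch

# Hypothetical per-request query lengths for a batch of three requests.
query_lens = torch.tensor([3, 4, 5])
query_start_loc = torch.zeros(query_lens.numel() + 1, dtype=torch.long)
query_start_loc[1:] = torch.cumsum(query_lens, dim=0)
print(query_start_loc)  # tensor([ 0,  3,  7, 12])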

vllm_ascend/patch/spec_decode/__init__.py

Whitespace-only changes.

vllm_ascend/patch/spec_decode/patch_common/__init__.py

Whitespace-only changes.
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
# SPDX-License-Identifier: Apache-2.0
import torch
import torch.nn as nn

from vllm.v1.spec_decode.eagle import EagleProposer


@staticmethod
def prepare_inputs(
    # [batch_size + 1]
    cu_target_query_lens: torch.Tensor,
    # [batch_size]
    num_rejected_tokens: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    # cu_target_query_lens: [0, a, a + b, a + b + c]
    # num_rejected_tokens: [n1, n2, n3]
    # num_tokens_per_req: [a - n1, b - n2, c - n3]
    # cu_num_tokens: [0, a - n1, a + b - n1 - n2, a + b + c - n1 - n2 - n3]
    # token_indices: [0, 1, ..., a - n1 - 1,
    #                 a, a + 1, ..., a + b - n2 - 1,
    #                 a + b, a + b + 1, ..., a + b + c - n3 - 1]

    # [0, a, a + b, a + b + c] -> [a, b, c]
    query_len_per_req = (cu_target_query_lens[1:] -
                         cu_target_query_lens[:-1])
    # [a, b, c] -> [a - n1, b - n2, c - n3]
    num_tokens_per_req = query_len_per_req - num_rejected_tokens

    cu_num_tokens = torch.empty_like(cu_target_query_lens)
    torch.cumsum(num_tokens_per_req, dim=0, out=cu_num_tokens[1:])
    cu_num_tokens[0] = 0

    # FIXME(woosuk): Avoid synchronization.
    num_tokens = cu_num_tokens[-1].item()
    token_indices = torch.empty(
        num_tokens,
        dtype=torch.int32,
        device=cu_num_tokens.device,
    )

    BLOCK_SIZE = 1024
    prepare_input_pytorch(
        token_indices,
        cu_target_query_lens,
        cu_num_tokens,
        block_size=BLOCK_SIZE,
    )
    return cu_num_tokens, token_indices


def prepare_input_pytorch(
    out_ptr: torch.Tensor,
    cu_query_lens: torch.Tensor,
    cu_num_tokens: torch.Tensor,
    block_size: int,
):
    num_pids = cu_num_tokens.shape[0] - 1

    for pid in range(num_pids):
        start_pos = cu_num_tokens[pid].item()
        end_pos = cu_num_tokens[pid + 1].item()
        num_tokens = end_pos - start_pos

        index_start = cu_query_lens[pid].item()
        # Number of block_size-sized chunks needed to cover num_tokens.
        num_blocks = (num_tokens + block_size - 1) // block_size

        for i in range(num_blocks):
            # Offsets handled by this chunk.
            offset = i * block_size + torch.arange(
                0, block_size, dtype=out_ptr.dtype,
                device=cu_query_lens.device)
            global_indices = start_pos + offset
            values = index_start + offset
            mask = offset < num_tokens
            out_ptr[global_indices[mask]] = values[mask]


EagleProposer.prepare_inputs = prepare_inputs
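
A small worked example of the transformation the patched prepare_inputs performs, using made-up numbers on CPU; it assumes the patch module above has been imported so the assignment to EagleProposer.prepare_inputs has taken effect:

import torch

from vllm.v1.spec_decode.eagle import EagleProposer

# Three requests with query lengths a=3, b=4, c=5 and rejected draft
# counts [n1, n2, n3] = [1, 0, 2].
cu_target_query_lens = torch.tensor([0, 3, 7, 12], dtype=torch.int32)
num_rejected_tokens = torch.tensor([1, 0, 2], dtype=torch.int32)

cu_num_tokens, token_indices = EagleProposer.prepare_inputs(
    cu_target_query_lens, num_rejected_tokens)
print(cu_num_tokens)   # tensor([0, 2, 6, 9], dtype=torch.int32)
print(token_indices)   # tensor([0, 1, 3, 4, 5, 6, 7, 8, 9], dtype=torch.int32)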

vllm_ascend/sample/__init__.py

Whitespace-only changes.
