Commit 59237ea
[CI/UT] Add test for chunk prefill and prefix cache on v1/AscendScheduler (#1505)
### What this PR does / why we need it?

Add tests for chunked prefill and prefix cache on the V1 scheduler and AscendScheduler.

Covered scenarios:
- `Qwen/Qwen3-0.6B-Base` and `deepseek-ai/DeepSeek-V2-Lite-Chat` --- multicard CI time increased by 19 min
  - `V1 + default scheduler` vs `V1 + default scheduler + enable prefix cache`
  - `V1 + Ascend scheduler` vs `V1 + Ascend scheduler + enable prefix cache` vs `V1 + Ascend scheduler + enable prefix cache + enable chunked prefill`
- `Qwen/Qwen3-0.6B-Base` --- singlecard CI time increased by 8 min
  - `V1 + Ascend scheduler` vs `V1 + Ascend scheduler + enable chunked prefill`

Should be rebased after #1498 and #1446.

### Does this PR introduce _any_ user-facing change?

N/A

### How was this patch tested?

CI passed with the newly added tests.

Signed-off-by: MengqingCao <cmq0113@163.com>
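For orientation, here is a minimal sketch of how the compared scheduler configurations are expressed in the new tests. It mirrors the `VllmRunner` calls in the diffs below; the helper name `run_one`, the omission of `tensor_parallel_size`, and the fixed `max_tokens=50` are illustrative choices, not part of this commit.

```python
# Minimal sketch, assuming the VllmRunner test helper from tests/conftest.py.
# It shows how the tests toggle the AscendScheduler, prefix caching, and
# chunked prefill via additional_config (see the diffs below).
from tests.conftest import VllmRunner

ASCEND_SCHEDULER_CONFIGS = [
    # V1 + Ascend scheduler
    {'ascend_scheduler_config': {'enabled': True}},
    # V1 + Ascend scheduler + prefix cache
    {'ascend_scheduler_config': {'enabled': True, 'enable_prefix_caching': True}},
    # V1 + Ascend scheduler + prefix cache + chunked prefill
    {'ascend_scheduler_config': {
        'enabled': True,
        'enable_prefix_caching': True,
        'enable_chunked_prefill': True,
    }},
]


def run_one(model: str, additional_config: dict, prompts: list[str]) -> list:
    # Greedy decoding keeps each run deterministic so outputs can be compared
    # across configurations with check_outputs_equal.
    with VllmRunner(model,
                    additional_config=additional_config,
                    enforce_eager=True,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        return vllm_model.generate_greedy(prompts, 50)
```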
1 parent 6b80c5a commit 59237ea

File tree: 6 files changed, +241 -0 lines changed

tests/conftest.py

Lines changed: 18 additions & 0 deletions
@@ -19,6 +19,7 @@

 import contextlib
 import gc
+import os
 from typing import Any, List, Optional, Tuple, TypeVar, Union

 import numpy as np
@@ -59,6 +60,9 @@
 PromptAudioInput = _PromptMultiModalInput[Tuple[np.ndarray, int]]
 PromptVideoInput = _PromptMultiModalInput[np.ndarray]

+_TEST_DIR = os.path.dirname(__file__)
+_TEST_PROMPTS = [os.path.join(_TEST_DIR, "e2e", "prompts", "example.txt")]
+

 def cleanup_dist_env_and_memory(shutdown_ray: bool = False):
     destroy_model_parallel()
@@ -367,6 +371,20 @@ def prompt_template(request):
     return PROMPT_TEMPLATES[request.param]


+def _read_prompts(filename: str) -> list[str]:
+    with open(filename) as f:
+        prompts = f.readlines()
+    return prompts
+
+
+@pytest.fixture
+def example_prompts() -> list[str]:
+    prompts = []
+    for filename in _TEST_PROMPTS:
+        prompts += _read_prompts(filename)
+    return prompts
+
+
 @pytest.fixture(scope="session")
 def ilama_lora_files():
     return snapshot_download(repo_id="jeeejeee/ilama-text2sql-spider")
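For context, a minimal sketch of how a test consumes the new `example_prompts` fixture. The test name and assertions here are illustrative only; the real consumer in this commit is `test_chunked_prefill_with_ascend_scheduler` further down.

```python
# Illustrative only: pytest injects the fixture by argument name, and the
# prompt count matches the 8 lines shipped in tests/e2e/prompts/example.txt.
def test_example_prompts_fixture(example_prompts) -> None:
    assert len(example_prompts) == 8
    assert all(isinstance(p, str) for p in example_prompts)
```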
Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Compare outputs with and without prefix caching on the V1 scheduler or AscendScheduler."""

import os

import pytest

from tests.conftest import VllmRunner
from tests.model_utils import check_outputs_equal

MODELS = [
    # for MHA
    "Qwen/Qwen3-8B-Base",
    # for MLA
    "deepseek-ai/DeepSeek-V2-Lite-Chat"
]

# A prompt containing a large markdown table. The table is randomly generated by GPT-4.
LONG_PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as follows.\n# Table\n" + """
| ID | Name | Age | Occupation | Country | Email | Phone Number | Address |
|-----|---------------|-----|---------------|---------------|------------------------|----------------|------------------------------|
| 1 | John Doe | 29 | Engineer | USA | john.doe@example.com | 555-1234 | 123 Elm St, Springfield, IL |
| 2 | Jane Smith | 34 | Doctor | Canada | jane.smith@example.com | 555-5678 | 456 Oak St, Toronto, ON |
| 3 | Alice Johnson | 27 | Teacher | UK | alice.j@example.com | 555-8765 | 789 Pine St, London, UK |
| 4 | Bob Brown | 45 | Artist | Australia | bob.b@example.com | 555-4321 | 321 Maple St, Sydney, NSW |
| 5 | Carol White | 31 | Scientist | New Zealand | carol.w@example.com | 555-6789 | 654 Birch St, Wellington, NZ |
| 6 | Dave Green | 28 | Lawyer | Ireland | dave.g@example.com | 555-3456 | 987 Cedar St, Dublin, IE |
| 7 | Emma Black | 40 | Musician | USA | emma.b@example.com | 555-1111 | 246 Ash St, New York, NY |
| 8 | Frank Blue | 37 | Chef | Canada | frank.b@example.com | 555-2222 | 135 Spruce St, Vancouver, BC |
| 9 | Grace Yellow | 50 | Engineer | UK | grace.y@example.com | 555-3333 | 864 Fir St, Manchester, UK |
| 10 | Henry Violet | 32 | Artist | Australia | henry.v@example.com | 555-4444 | 753 Willow St, Melbourne, VIC|
| 11 | Irene Orange | 26 | Scientist | New Zealand | irene.o@example.com | 555-5555 | 912 Poplar St, Auckland, NZ |
| 12 | Jack Indigo | 38 | Teacher | Ireland | jack.i@example.com | 555-6666 | 159 Elm St, Cork, IE |
| 13 | Karen Red | 41 | Lawyer | USA | karen.r@example.com | 555-7777 | 357 Cedar St, Boston, MA |
| 14 | Leo Brown | 30 | Chef | Canada | leo.b@example.com | 555-8888 | 246 Oak St, Calgary, AB |
| 15 | Mia Green | 33 | Musician | UK | mia.g@example.com | 555-9999 | 975 Pine St, Edinburgh, UK |
| 16 | Noah Yellow | 29 | Doctor | Australia | noah.y@example.com | 555-0000 | 864 Birch St, Brisbane, QLD |
| 17 | Olivia Blue | 35 | Engineer | New Zealand | olivia.b@example.com | 555-1212 | 753 Maple St, Hamilton, NZ |
| 18 | Peter Black | 42 | Artist | Ireland | peter.b@example.com | 555-3434 | 912 Fir St, Limerick, IE |
| 19 | Quinn White | 28 | Scientist | USA | quinn.w@example.com | 555-5656 | 159 Willow St, Seattle, WA |
| 20 | Rachel Red | 31 | Teacher | Canada | rachel.r@example.com | 555-7878 | 357 Poplar St, Ottawa, ON |
| 21 | Steve Green | 44 | Lawyer | UK | steve.g@example.com | 555-9090 | 753 Elm St, Birmingham, UK |
| 22 | Tina Blue | 36 | Musician | Australia | tina.b@example.com | 555-1213 | 864 Cedar St, Perth, WA |
| 23 | Umar Black | 39 | Chef | New Zealand | umar.b@example.com | 555-3435 | 975 Spruce St, Christchurch, NZ|
| 24 | Victor Yellow | 43 | Engineer | Ireland | victor.y@example.com | 555-5657 | 246 Willow St, Galway, IE |
| 25 | Wendy Orange | 27 | Artist | USA | wendy.o@example.com | 555-7879 | 135 Elm St, Denver, CO |
| 26 | Xavier Green | 34 | Scientist | Canada | xavier.g@example.com | 555-9091 | 357 Oak St, Montreal, QC |
| 27 | Yara Red | 41 | Teacher | UK | yara.r@example.com | 555-1214 | 975 Pine St, Leeds, UK |
| 28 | Zack Blue | 30 | Lawyer | Australia | zack.b@example.com | 555-3436 | 135 Birch St, Adelaide, SA |
| 29 | Amy White | 33 | Musician | New Zealand | amy.w@example.com | 555-5658 | 159 Maple St, Wellington, NZ |
| 30 | Ben Black | 38 | Chef | Ireland | ben.b@example.com | 555-7870 | 246 Fir St, Waterford, IE |
"""

INPUT_PROMPTS = [
    LONG_PROMPT +
    "Question: what is the age of John Doe? Your answer: The age of John Doe is ",
    LONG_PROMPT +
    "Question: what is the age of Zack Blue? Your answer: The age of Zack Blue is "
]


@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
                    reason="only test on v1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_prefix_cache_with_v1_scheduler(model: str, max_tokens: int) -> None:
    with VllmRunner(model,
                    enforce_eager=True,
                    max_model_len=2048,
                    tensor_parallel_size=2,
                    gpu_memory_utilization=0.7) as vllm_model:
        prefix_cache_output = vllm_model.generate_greedy(
            INPUT_PROMPTS, max_tokens)

    with VllmRunner(model,
                    enable_prefix_caching=False,
                    enforce_eager=True,
                    max_model_len=2048,
                    tensor_parallel_size=2,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=prefix_cache_output,
        name_0="vllm_output",
        name_1="prefix_cache_output",
    )


@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
                    reason="only test on v1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [50])
def test_prefix_cache_with_ascend_scheduler(model: str,
                                            max_tokens: int) -> None:

    with VllmRunner(model,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    enforce_eager=True,
                    max_model_len=2048,
                    tensor_parallel_size=2,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_output = vllm_model.generate_greedy(INPUT_PROMPTS, max_tokens)

    with VllmRunner(model,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                            'enable_prefix_caching': True,
                        },
                    },
                    enforce_eager=True,
                    max_model_len=2048,
                    tensor_parallel_size=2,
                    gpu_memory_utilization=0.7) as vllm_model:
        prefix_cache_output = vllm_model.generate_greedy(
            INPUT_PROMPTS, max_tokens)

    with VllmRunner(model,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                            'enable_prefix_caching': True,
                            'enable_chunked_prefill': True,
                        },
                    },
                    enforce_eager=True,
                    max_model_len=2048,
                    tensor_parallel_size=2,
                    gpu_memory_utilization=0.7) as vllm_model:
        chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
            INPUT_PROMPTS, max_tokens)

    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=prefix_cache_output,
        name_0="vllm_output",
        name_1="prefix_cache_output",
    )

    check_outputs_equal(
        outputs_0_lst=chunk_prefill_prefix_cache_output,
        outputs_1_lst=prefix_cache_output,
        name_0="chunk_prefill_prefix_cache_output",
        name_1="prefix_cache_output",
    )
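As a reading aid, here is a hedged sketch of the equality check these tests rely on. `check_outputs_equal` lives in `tests/model_utils` and its real implementation is not part of this diff; the body below is illustrative only, assuming `generate_greedy` returns `(token_ids, text)` pairs per prompt.

```python
from typing import Sequence, Tuple


def check_outputs_equal_sketch(*, outputs_0_lst: Sequence[Tuple[list, str]],
                               outputs_1_lst: Sequence[Tuple[list, str]],
                               name_0: str, name_1: str) -> None:
    # With greedy decoding both runs should be deterministic, so token ids
    # (and hence the decoded text) are expected to match exactly.
    assert len(outputs_0_lst) == len(outputs_1_lst)
    for i, ((ids_0, text_0), (ids_1, text_1)) in enumerate(
            zip(outputs_0_lst, outputs_1_lst)):
        assert ids_0 == ids_1, (
            f"prompt {i}: {name_0} != {name_1}: {text_0!r} vs {text_1!r}")
```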

tests/e2e/prompts/example.txt

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.
Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.
Compare and contrast artificial intelligence with human intelligence in terms of processing information.
Describe the basic components of a neural network and how it can be trained.
Write a short story about a robot that dreams for the first time.
Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.
Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.
Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'
tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# SPDX-License-Identifier: Apache-2.0
"""Compare outputs with and without chunked prefill on AscendScheduler.

Chunked prefill can be enabled by
`additional_config={'ascend_scheduler_config': {'enabled': True, 'enable_chunked_prefill': True,},}`.
If a prompt's prefill size exceeds max_num_batched_tokens, the prefill request is chunked.

Run `pytest tests/e2e/singlecard/core/ascend_scheduler/test_chunk_prefill.py`.
"""
import os

import pytest

from tests.conftest import VllmRunner
from tests.model_utils import check_outputs_equal

MODELS = [
    "Qwen/Qwen3-0.6B-Base",
]


@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0", reason="only test on v1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens",
                         [4])  # cannot align results when max_tokens > 4
@pytest.mark.parametrize("chunked_prefill_token_size", [16])
def test_chunked_prefill_with_ascend_scheduler(
        example_prompts, model: str, max_tokens: int,
        chunked_prefill_token_size: int) -> None:
    max_num_seqs = chunked_prefill_token_size
    max_num_batched_tokens = chunked_prefill_token_size
    with VllmRunner(model,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                            'enable_chunked_prefill': True,
                        },
                    },
                    max_num_seqs=max_num_seqs,
                    max_num_batched_tokens=max_num_batched_tokens,
                    enforce_eager=True,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        chunked_prefill_output = vllm_model.generate_greedy(
            example_prompts, max_tokens)

    with VllmRunner(model,
                    additional_config={
                        'ascend_scheduler_config': {
                            'enabled': True,
                        },
                    },
                    enforce_eager=True,
                    max_model_len=2048,
                    gpu_memory_utilization=0.7) as vllm_model:
        vllm_output = vllm_model.generate_greedy(example_prompts, max_tokens)

    check_outputs_equal(
        outputs_0_lst=vllm_output,
        outputs_1_lst=chunked_prefill_output,
        name_0="vllm_output",
        name_1="chunked_prefill_output",
    )
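Beyond the test harness, the same knobs can be passed straight to vLLM. A hedged sketch follows, assuming vLLM's `additional_config` engine argument forwards this dict to vllm-ascend the same way `VllmRunner` does in the tests above; the model name and sampling values are illustrative.

```python
from vllm import LLM, SamplingParams

# Sketch: enable the AscendScheduler with prefix caching and chunked prefill.
llm = LLM(
    model="Qwen/Qwen3-0.6B-Base",
    enforce_eager=True,
    max_model_len=2048,
    additional_config={
        "ascend_scheduler_config": {
            "enabled": True,
            "enable_prefix_caching": True,
            "enable_chunked_prefill": True,
        },
    },
)
# Greedy sampling mirrors generate_greedy in the tests.
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)
```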
