
Commit ab5d110

vllm-ascend support chunked prefill (#1172)
### What this PR does / why we need it?

Add chunked prefill support for MLA in vllm-ascend.

Signed-off-by: fems14 <1804143737@qq.com>
1 parent a3b5af8 commit ab5d110

File tree

5 files changed: +303 additions, -20 deletions


docs/source/user_guide/additional_config.md

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ The following table lists the additional configuration options available in vLLM
 | `expert_tensor_parallel_size` | str | `0` | Expert tensor parallel size the model to use. |
 | `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf case. |
 | `expert_map_path` | str | None | When using expert load balancing for the MOE model, an expert map path needs to be passed in. |
+| `chunked_prefill_for_mla` | bool | `False` | Whether to enable the fused operator-like chunked_prefill. |

 The details of each config option are as follows:
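As a usage sketch (not part of this diff): the new option is passed through `additional_config`, the same mechanism as the other rows in this table. The model name below simply mirrors the one used by the new test; any MLA-based model served by vllm-ascend is the intended target.

```python
# Sketch: enable the fused chunked prefill for MLA via additional_config.
from vllm import LLM, SamplingParams

llm = LLM(
    "deepseek-ai/DeepSeek-V2-Lite",
    enforce_eager=True,
    additional_config={"chunked_prefill_for_mla": True},
)
outputs = llm.generate(
    "The president of the United States is",
    SamplingParams(max_tokens=32, temperature=0.0),
)
print(outputs[0].outputs[0].text)
```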

tests/singlecard/test_chunked.py

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Compare the outputs of vLLM with chunked prefill enabled against the
ascend scheduler (non-chunked) path.

Run `pytest tests/singlecard/test_chunked.py`.
"""

import os

import pytest
import torch
from vllm import LLM, SamplingParams

MODELS = ["deepseek-ai/DeepSeek-V2-Lite"]


@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
                    reason="new chunked only support on v1")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [1])
def test_models(
        model: str,
        max_tokens: int,
        monkeypatch: pytest.MonkeyPatch,
) -> None:
    return  # NOTE: the comparison below is currently short-circuited
    with monkeypatch.context() as m:
        prompts = "The president of the United States is"

        m.setenv("VLLM_USE_V1", "1")

        sampling_params = SamplingParams(
            max_tokens=max_tokens,
            temperature=0.0,
        )

        # Run with chunked prefill (long prompts are split into chunks).
        vllm_model = LLM(model,
                         long_prefill_token_threshold=4,
                         enforce_eager=True)
        output_chunked = vllm_model.generate(prompts, sampling_params)
        logprobs_chunked = output_chunked.outputs[0].logprobs
        del vllm_model
        torch.npu.empty_cache()

        # Run with the ascend scheduler as the non-chunked reference.
        vllm_model = LLM(model,
                         enforce_eager=True,
                         additional_config={
                             'ascend_scheduler_config': {
                                 'enabled': True
                             },
                         })
        output = vllm_model.generate(prompts, sampling_params)
        logprobs = output.outputs[0].logprobs
        del vllm_model
        torch.npu.empty_cache()

        # The two runs should produce near-identical logprobs.
        logprobs_similarity = torch.cosine_similarity(
            logprobs_chunked.flatten(), logprobs.flatten(), dim=0)
        assert logprobs_similarity > 0.95
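To run only this test on an Ascend node, one option is to drive pytest programmatically; the sketch below is equivalent to `pytest -s tests/singlecard/test_chunked.py` from the repository root and assumes that layout.

```python
# Run the new chunked-prefill test file and propagate pytest's exit code.
import sys

import pytest

sys.exit(pytest.main(["-s", "tests/singlecard/test_chunked.py"]))
```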

vllm_ascend/ascend_config.py

Lines changed: 2 additions & 0 deletions
@@ -39,6 +39,8 @@ def __init__(self, vllm_config):
         self.expert_tensor_parallel_size = int(
             additional_config.get("expert_tensor_parallel_size", 0))
         self.expert_map_path = additional_config.get("expert_map_path", None)
+        self.chunked_prefill_for_mla = additional_config.get(
+            "chunked_prefill_for_mla", False)


 class TorchairGraphConfig:
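AscendConfig only parses the flag here; the MLA attention code elsewhere in this PR decides which prefill path to run. The sketch below shows that kind of dispatch in isolation; the two prefill helpers are hypothetical placeholders, not vllm-ascend functions.

```python
# Illustrative dispatch on chunked_prefill_for_mla; the .get(...) default
# mirrors AscendConfig above, so configs that never set the key keep the
# original (non-chunked) MLA prefill path.
def _chunked_mla_prefill(prompt_tokens: list) -> str:
    # placeholder for the fused, chunk-by-chunk prefill added in this PR
    return f"chunked prefill over {len(prompt_tokens)} tokens"


def _full_mla_prefill(prompt_tokens: list) -> str:
    # placeholder for the pre-existing single-shot prefill
    return f"single-shot prefill over {len(prompt_tokens)} tokens"


def run_mla_prefill(prompt_tokens: list, additional_config: dict) -> str:
    chunked = additional_config.get("chunked_prefill_for_mla", False)
    return (_chunked_mla_prefill(prompt_tokens)
            if chunked else _full_mla_prefill(prompt_tokens))


print(run_mla_prefill(list(range(16)), {"chunked_prefill_for_mla": True}))
print(run_mla_prefill(list(range(16)), {}))
```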
