Skip to content

Commit 4976b48

Browse files
authored
[Build] Move numba/quart to requirements and update DS baseline and sync graph typo fix (#1121)
### What this PR does / why we need it? 1. The dependency was introduced by #874 - Move numba/quart from requirements-dev to requirements - Align pyproject.toml with requirements 2. This patch also fixes the DeepSeek accuracy baseline, which was not addressed by #1118. According to https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite the gsm8k score is about `41.1` 3. This also syncs the vLLM upstream changes: vllm-project/vllm@eaa2e51 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? CI passed vllm ascend test (basic workflow) vllm longterm test (spec decode) Closes: #1120 --------- Signed-off-by: Yikun Jiang <yikunkero@gmail.com>
1 parent f1543d5 commit 4976b48

File tree

6 files changed

+37
-13
lines changed

6 files changed

+37
-13
lines changed

pyproject.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,8 @@ requires = [
1616
"torch>=2.5.1",
1717
"torchvision<0.21.0",
1818
"wheel",
19+
"msgpack",
20+
"quart",
21+
"numba",
1922
]
2023
build-backend = "setuptools.build_meta"

requirements-dev.txt

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,4 @@ ray
99
types-jsonschema
1010
xgrammar
1111
zmq
12-
numba
13-
quart
1412
types-psutil

requirements.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,6 @@ wheel
1818
# requirements for disaggregated prefill
1919
msgpack
2020
quart
21+
22+
# Required for N-gram speculative decoding
23+
numba

tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,7 @@
3434
# 3% relative tolerance for numerical accuracy.
3535
RTOL = 0.03
3636
# Baseline accuracy after VLLM optimization.
37-
# FIXME: fix the accuracy issue
38-
EXPECTED_VALUE = 0.000758150113722517
37+
EXPECTED_VALUE = 0.3843821076573162
3938

4039

4140
def run_test(model_name, queue, more_args=None):

tests/singlecard/compile/test_simple.py

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
set_current_vllm_config)
1515
from vllm.utils import direct_register_custom_op
1616

17+
from vllm_ascend.utils import vllm_version_is
18+
1719
global_counter = 0
1820

1921
# create a library to hold the custom op
@@ -92,14 +94,28 @@ def test_simple_piecewise_compile():
9294

9395
inputs = torch.randn(100).npu()
9496

95-
with compilation_counter.expect(
96-
num_graphs_seen=1, # one graph for the model
97-
num_piecewise_graphs_seen=5, # 2 * num_layers + 1
98-
num_piecewise_capturable_graphs_seen=3, # 1 + num_layers
99-
num_backend_compilations=3, # num_piecewise_capturable_graphs_seen
100-
num_cudagraph_caputured=
101-
6, # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
102-
):
97+
if vllm_version_is("0.9.0"):
98+
kwargs = {
99+
"num_graphs_seen": 1, # one graph for the model
100+
"num_piecewise_graphs_seen": 5, # 2 * num_layers + 1
101+
"num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers
102+
"num_backend_compilations":
103+
3, # num_piecewise_capturable_graphs_seen
104+
"num_cudagraph_caputured":
105+
6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
106+
}
107+
else:
108+
kwargs = {
109+
"num_graphs_seen": 1, # one graph for the model
110+
"num_piecewise_graphs_seen": 5, # 2 * num_layers + 1
111+
"num_piecewise_capturable_graphs_seen": 3, # 1 + num_layers
112+
"num_backend_compilations":
113+
3, # num_piecewise_capturable_graphs_seen
114+
"num_cudagraph_captured":
115+
6 # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
116+
}
117+
118+
with compilation_counter.expect(kwargs):
103119

104120
model(inputs)
105121

vllm_ascend/compilation/piecewise_backend.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
from vllm.logger import logger
3232
from vllm.utils import weak_ref_tensors
3333

34+
from vllm_ascend.utils import vllm_version_is
35+
3436

3537
@dataclasses.dataclass
3638
class ConcreteSizeEntry:
@@ -205,7 +207,10 @@ def __call__(self, *args) -> Any:
205207
entry.output = weak_ref_tensors(output)
206208
entry.aclgraph = aclgraph
207209

208-
compilation_counter.num_cudagraph_caputured += 1
210+
if vllm_version_is("0.9.0"):
211+
compilation_counter.num_cudagraph_caputured += 1
212+
else:
213+
compilation_counter.num_cudagraph_captured += 1
209214

210215
# important: we need to return the output, rather than
211216
# the weak ref of the output, so that pytorch can correctly

0 commit comments

Comments (0)