
[Build] Move numba/quart to requirements, update DS baseline, and sync graph typo fix #1121

Merged 3 commits on Jun 8, 2025
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -16,5 +16,8 @@ requires = [
"torch>=2.5.1",
"torchvision<0.21.0",
"wheel",
+"msgpack",
+"quart",
+"numba",
]
build-backend = "setuptools.build_meta"
2 changes: 0 additions & 2 deletions requirements-dev.txt
@@ -9,6 +9,4 @@ ray
types-jsonschema
xgrammar
zmq
-numba
-quart
types-psutil
3 changes: 3 additions & 0 deletions requirements.txt
@@ -18,3 +18,6 @@ wheel
# requirements for disaggregated prefill
msgpack
quart

+# Required for N-gram speculative decoding
+numba
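
As a quick sanity check for this dependency move, the snippet below (hypothetical, not part of the PR) verifies that the packages now pulled in via requirements.txt are importable after installation.

```python
# Hypothetical post-install check, not part of this PR: confirm that the
# runtime dependencies listed in requirements.txt can be imported.
import importlib

for module in ("msgpack", "quart", "numba"):
    importlib.import_module(module)
    print(f"{module}: importable")
```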
3 changes: 1 addition & 2 deletions tests/long_term/test_deepseek_v2_lite_tp2_accuracy.py
@@ -34,8 +34,7 @@
# 3% relative tolerance for numerical accuracy.
RTOL = 0.03
# Baseline accuracy after VLLM optimization.
-# FIXME: fix the accuracy issue
-EXPECTED_VALUE = 0.000758150113722517
+EXPECTED_VALUE = 0.3843821076573162


def run_test(model_name, queue, more_args=None):
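Given the RTOL and EXPECTED_VALUE above, the baseline comparison amounts to a 3% relative-tolerance check. The sketch below is an assumed illustration of that check, not the test's actual code; `measured` stands in for the accuracy produced by the evaluation run in run_test().

```python
# Assumed illustration of a 3% relative-tolerance baseline check; `measured`
# is a placeholder for the accuracy reported by the evaluation run.
RTOL = 0.03
EXPECTED_VALUE = 0.3843821076573162

measured = 0.39  # placeholder value
assert abs(measured - EXPECTED_VALUE) <= RTOL * EXPECTED_VALUE, (
    f"accuracy {measured} deviates more than {RTOL:.0%} from the baseline")
```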
32 changes: 24 additions & 8 deletions tests/singlecard/compile/test_simple.py
@@ -14,6 +14,8 @@
set_current_vllm_config)
from vllm.utils import direct_register_custom_op

+from vllm_ascend.utils import vllm_version_is

global_counter = 0

# create a library to hold the custom op
@@ -92,14 +94,28 @@ def test_simple_piecewise_compile():

inputs = torch.randn(100).npu()

-with compilation_counter.expect(
-        num_graphs_seen=1,  # one graph for the model
-        num_piecewise_graphs_seen=5,  # 2 * num_layers + 1
-        num_piecewise_capturable_graphs_seen=3,  # 1 + num_layers
-        num_backend_compilations=3,  # num_piecewise_capturable_graphs_seen
-        num_cudagraph_caputured=
-        6,  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
-):
+if vllm_version_is("0.9.0"):
+    kwargs = {
+        "num_graphs_seen": 1,  # one graph for the model
+        "num_piecewise_graphs_seen": 5,  # 2 * num_layers + 1
+        "num_piecewise_capturable_graphs_seen": 3,  # 1 + num_layers
+        "num_backend_compilations":
+        3,  # num_piecewise_capturable_graphs_seen
+        "num_cudagraph_caputured":
+        6  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+    }
+else:
+    kwargs = {
+        "num_graphs_seen": 1,  # one graph for the model
+        "num_piecewise_graphs_seen": 5,  # 2 * num_layers + 1
+        "num_piecewise_capturable_graphs_seen": 3,  # 1 + num_layers
+        "num_backend_compilations":
+        3,  # num_piecewise_capturable_graphs_seen
+        "num_cudagraph_captured":
+        6  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+    }
+
+with compilation_counter.expect(kwargs):

model(inputs)

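The `vllm_version_is` gate used above comes from vllm_ascend.utils; its implementation is not shown in this diff. A minimal sketch of what such a helper could look like (an assumption, not the actual code):

```python
# Hypothetical sketch of a version-gate helper in the spirit of
# vllm_ascend.utils.vllm_version_is; the real implementation may differ.
import vllm


def vllm_version_is(target: str) -> bool:
    """Return True when the installed vllm release matches `target` exactly."""
    return vllm.__version__ == target
```

With such a gate, the test keeps the misspelled `num_cudagraph_caputured` counter key for vllm 0.9.0 and switches to the corrected `num_cudagraph_captured` for later releases.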
7 changes: 6 additions & 1 deletion vllm_ascend/compilation/piecewise_backend.py
@@ -31,6 +31,8 @@
from vllm.logger import logger
from vllm.utils import weak_ref_tensors

+from vllm_ascend.utils import vllm_version_is


@dataclasses.dataclass
class ConcreteSizeEntry:
@@ -205,7 +207,10 @@ def __call__(self, *args) -> Any:
entry.output = weak_ref_tensors(output)
entry.aclgraph = aclgraph

-compilation_counter.num_cudagraph_caputured += 1
+if vllm_version_is("0.9.0"):
+    compilation_counter.num_cudagraph_caputured += 1
+else:
+    compilation_counter.num_cudagraph_captured += 1

# important: we need to return the output, rather than
# the weak ref of the output, so that pytorch can correctly
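An alternative to branching on the version string would be to branch on which counter attribute exists at runtime; the sketch below is an assumption about a different way to express the same compatibility shim, not what this PR does.

```python
# Hypothetical attribute-based fallback (not the PR's approach): increment
# whichever spelling of the counter the installed vllm exposes.
from vllm.compilation.counter import compilation_counter

name = ("num_cudagraph_caputured"
        if hasattr(compilation_counter, "num_cudagraph_caputured")
        else "num_cudagraph_captured")
setattr(compilation_counter, name, getattr(compilation_counter, name) + 1)
```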