
Commit d798125

Authored by MengqingCao and wxsIcey
[v0.9.1][DP][V1] Fix rank set in DP scenario & Bump torch-npu version to 2.5.1.post1.dev20250528 (#1247)
### What this PR does / why we need it?

Cherry-picked from #1235.

1. Fix the rank set in the DP scenario. The new POC version of torch-npu supports setting `ASCEND_RT_VISIBLE_DEVICES` dynamically, so we can use the rank set in `DPEngineCoreProc` directly instead of computing the local rank across DP by hand in the patched `_init_data_parallel`. Closes: #1170
2. Bump the torch-npu version to 2.5.1.post1.dev20250528. Closes: #1242, #1232

### How was this patch tested?

CI passed with the newly added test.

---------

Signed-off-by: Icey <1790571317@qq.com>
Signed-off-by: MengqingCao <cmq0113@163.com>
Co-authored-by: Icey <1790571317@qq.com>
1 parent 030fe89 · commit d798125

17 files changed: +108 −35 lines
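Before the per-file diffs, a plain-Python sketch of the behavioral change may help. The removed code in `vllm_ascend/worker/worker_v1.py` computed a device index across all DP replicas by hand; per the commit message, the newer torch-npu honors `ASCEND_RT_VISIBLE_DEVICES` set dynamically, so each engine process sees only its own devices and the plain `local_rank` suffices. The function names below are illustrative, not part of the codebase; the arithmetic matches the removed lines.

# Illustrative sketch (not vllm-ascend API): how the NPU device index was
# chosen before and after this commit, per the worker_v1.py diff below.

def device_index_old(local_dp_rank: int, world_size: int, local_rank: int) -> int:
    # Removed behavior: fold the DP replica offset into the index by hand,
    # because every process could see every NPU on the host.
    return local_dp_rank * world_size + local_rank

def device_index_new(local_rank: int) -> int:
    # New behavior: ASCEND_RT_VISIBLE_DEVICES is set per DP engine, so the
    # visible devices are already the replica's own slice.
    return local_rank

# Example with dp_size=2 and a per-replica world size of 2: DP rank 1,
# local worker 0 previously indexed NPU 2 on the host, now simply NPU 0
# within its restricted visible set.
assert device_index_old(1, 2, 0) == 2
assert device_index_new(0) == 0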

.github/workflows/accuracy_test.yaml

Lines changed: 2 additions & 0 deletions

@@ -173,6 +173,8 @@ jobs:
 
       - name: Install vllm-project/vllm-ascend
         working-directory: ./vllm-ascend
+        env:
+          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
         run: |
          pip install -r requirements-dev.txt
          pip install -e .

.github/workflows/image_openeuler.yml

Lines changed: 6 additions & 0 deletions

@@ -19,6 +19,12 @@ on:
       - '.github/workflows/image_openeuler.yml'
       - 'Dockerfile.openEuler'
       - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
   push:
     # Publish image when tagging, the Dockerfile in tag will be build as tag image
     branches:

.github/workflows/image_ubuntu.yml

Lines changed: 6 additions & 0 deletions

@@ -19,6 +19,12 @@ on:
       - '.github/workflows/image_ubuntu.yml'
      - 'Dockerfile'
      - 'vllm_ascend/**'
+      - 'setup.py'
+      - 'pyproject.toml'
+      - 'requirements.txt'
+      - 'cmake/**'
+      - 'CMakeLists.txt'
+      - 'csrc/**'
   push:
     # Publish image when tagging, the Dockerfile in tag will be build as tag image
     branches:

.github/workflows/nightly_benchmarks.yaml

Lines changed: 2 additions & 0 deletions

@@ -111,6 +111,8 @@ jobs:
           VLLM_TARGET_DEVICE=empty pip install -e .
 
       - name: Install vllm-project/vllm-ascend
+        env:
+          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
         run: |
          pip install -e .
          pip install -r benchmarks/requirements-bench.txt

.github/workflows/vllm_ascend_test.yaml

Lines changed: 2 additions & 0 deletions

@@ -167,6 +167,8 @@ jobs:
           VLLM_TARGET_DEVICE=empty pip install -e .
 
       - name: Install vllm-project/vllm-ascend
+        env:
+          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
         run: |
          pip install -r requirements-dev.txt
          pip install -v -e .

.github/workflows/vllm_ascend_test_long_term.yaml

Lines changed: 2 additions & 0 deletions

@@ -85,6 +85,8 @@ jobs:
           VLLM_TARGET_DEVICE=empty pip install -e .
 
       - name: Install vllm-project/vllm-ascend
+        env:
+          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
         run: |
          pip install -r requirements-dev.txt
          pip install -v -e .

.github/workflows/vllm_ascend_test_pd.yaml

Lines changed: 2 additions & 0 deletions

@@ -94,6 +94,8 @@ jobs:
           VLLM_TARGET_DEVICE=empty pip install -e .
 
       - name: Install vllm-project/vllm-ascend
+        env:
+          PIP_EXTRA_INDEX_URL: https://mirrors.huaweicloud.com/ascend/repos/pypi
         run: |
          pip install -r requirements-dev.txt
          pip install -v -e .

Dockerfile

Lines changed: 2 additions & 1 deletion

@@ -46,7 +46,8 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm
 
 # Install vllm-ascend
 # Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
-RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
     source /usr/local/Ascend/nnal/atb/set_env.sh && \
     export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
     python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \

Dockerfile.openEuler

Lines changed: 2 additions & 1 deletion

@@ -43,7 +43,8 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ -
     python3 -m pip cache purge
 
 # Install vllm-ascend
-RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
+RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
+    source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
     source /usr/local/Ascend/nnal/atb/set_env.sh && \
     export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
     python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \

README.md

Lines changed: 1 addition & 1 deletion

@@ -38,7 +38,7 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
 - Software:
   * Python >= 3.9, < 3.12
   * CANN >= 8.1.RC1
-  * PyTorch >= 2.5.1, torch-npu >= 2.5.1
+  * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1.dev20250528
   * vLLM (the same version as vllm-ascend)
 
 ## Getting Started

README.zh.md

Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP
 - 软件:
   * Python >= 3.9, < 3.12
   * CANN >= 8.1.RC1
-  * PyTorch >= 2.5.1, torch-npu >= 2.5.1
+  * PyTorch >= 2.5.1, torch-npu >= 2.5.1.post1.dev20250528
   * vLLM (与vllm-ascend版本一致)
 
 ## 开始使用

docs/source/installation.md

Lines changed: 6 additions & 5 deletions

@@ -9,11 +9,11 @@ This document describes how to install vllm-ascend manually.
 - A hardware with Ascend NPU. It's usually the Atlas 800 A2 series.
 - Software:
 
-| Software  | Supported version | Note                                    |
-|-----------|-------------------|-----------------------------------------|
-| CANN      | >= 8.1.RC1        | Required for vllm-ascend and torch-npu  |
-| torch-npu | >= 2.5.1          | Required for vllm-ascend                |
-| torch     | >= 2.5.1          | Required for torch-npu and vllm         |
+| Software  | Supported version          | Note                                    |
+|-----------|----------------------------|-----------------------------------------|
+| CANN      | >= 8.1.RC1                 | Required for vllm-ascend and torch-npu  |
+| torch-npu | >= 2.5.1.post1.dev20250528 | Required for vllm-ascend                |
+| torch     | >= 2.5.1                   | Required for torch-npu and vllm         |
 
 You have 2 way to install:
 - **Using pip**: first prepare env manually or via CANN image, then install `vllm-ascend` using pip.

@@ -156,6 +156,7 @@ cd ..
 # Install vLLM Ascend
 git clone --depth 1 --branch |vllm_ascend_version| https://github.com/vllm-project/vllm-ascend.git
 cd vllm-ascend
+export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi
 pip install -v -e .
 cd ..
 ```

requirements.txt

Lines changed: 5 additions & 1 deletion

@@ -10,7 +10,6 @@ pyyaml
 scipy
 setuptools>=64
 setuptools-scm>=8
-torch-npu==2.5.1
 torch>=2.5.1
 torchvision<0.21.0
 wheel

@@ -21,3 +20,8 @@ quart
 
 # Required for N-gram speculative decoding
 numba
+
+# Install torch_npu
+--pre
+--extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi
+torch-npu==2.5.1.post1.dev20250528
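A note on the `--pre` flag added above: the pinned torch-npu build carries a `.dev` segment, which PEP 440 treats as a pre-release, and pip skips pre-releases unless `--pre` is given or the specifier itself names a pre-release version. A quick check with the `packaging` library (pip's own version parser; assumed to be installed here):

# Why `--pre` appears in requirements.txt: the pinned torch-npu build is a
# developmental pre-release, which pip ignores by default.
from packaging.version import Version

v = Version("2.5.1.post1.dev20250528")
print(v.is_prerelease)   # True -> pip needs --pre (or a pre-release specifier)
print(v.is_postrelease)  # True -> a .post1 build of the 2.5.1 release line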

setup.py

Lines changed: 1 addition & 1 deletion

@@ -152,7 +152,7 @@ def configure(self, ext: CMakeExtension) -> None:
             # if pybind11 is installed via pip
             pybind11_cmake_path = (subprocess.check_output(
                 [python_executable, "-m", "pybind11",
-                 "--cmake"]).decode().strip())
+                 "--cmakedir"]).decode().strip())
         except subprocess.CalledProcessError as e:
             # else specify pybind11 path installed from source code on CI container
             raise RuntimeError(f"CMake configuration failed: {e}")
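This small fix matters: `python -m pybind11` provides a `--cmakedir` option that prints the directory holding pybind11's CMake package config, while `--cmake` is not a recognized flag, so the old call would fail and fall into the `CalledProcessError` branch. A minimal standalone check of the corrected call (assumes pybind11 is installed via pip):

# Verify what the fixed subprocess call returns.
import subprocess
import sys

cmake_dir = subprocess.check_output(
    [sys.executable, "-m", "pybind11", "--cmakedir"]).decode().strip()
# Prints the directory containing pybind11Config.cmake, which CMake can
# consume, e.g. <site-packages>/pybind11/share/cmake/pybind11
print(cmake_dir)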

tests/multicard/test_data_parallel.py

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Compare the outputs of vLLM with and without data parallelism.
+Run `pytest tests/multicard/test_data_parallel.py`.
+"""
+
+import os
+
+import pytest
+
+from tests.conftest import VllmRunner
+from tests.model_utils import check_outputs_equal
+
+MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
+
+
+@pytest.mark.skipif(True, reason="OPEN ME when dp is supported on A2")
+@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
+                    reason="Data parallel only support on v1")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("max_tokens", [32])
+def test_data_parallel_correctness(
+    model: str,
+    max_tokens: int,
+) -> None:
+    example_prompts = [
+        "Hello, my name is", "The president of the United States is",
+        "The capital of France is", "The future of AI is"
+    ]
+
+    with VllmRunner(model_name=model,
+                    max_model_len=1024,
+                    max_num_seqs=16,
+                    data_parallel_size=2,
+                    distributed_executor_backend="mp") as vllm_model:
+        vllm_dp_outputs = vllm_model.generate_greedy(example_prompts,
+                                                     max_tokens)
+
+    with VllmRunner(
+            model_name=model,
+            max_model_len=1024,
+            max_num_seqs=16,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=vllm_outputs,
+        outputs_1_lst=vllm_dp_outputs,
+        name_0="vllm_outputs",
+        name_1="vllm_dp_outputs",
+    )

vllm_ascend/patch/platform/patch_common/patch_distributed.py

Lines changed: 1 addition & 17 deletions

@@ -21,10 +21,9 @@
 import vllm.distributed
 import vllm.envs as envs
 from torch.distributed import ProcessGroup
-from vllm.config import ParallelConfig, VllmConfig
+from vllm.config import ParallelConfig
 from vllm.distributed.utils import \
     stateless_init_torch_distributed_process_group
-from vllm.v1.engine.core import DPEngineCoreProc
 
 
 def ascend_destroy_model_parallel():

@@ -79,21 +78,6 @@ def stateless_init_dp_group(self) -> "ProcessGroup":
     return dp_group
 
 
-def _init_data_parallel(self, vllm_config: VllmConfig):
-    # Configure NPUs and stateless process group for data parallel.
-    dp_rank = vllm_config.parallel_config.data_parallel_rank
-    dp_size = vllm_config.parallel_config.data_parallel_size
-    local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local
-
-    assert dp_size > 1
-    assert 0 <= local_dp_rank <= dp_rank < dp_size
-
-    self.local_dp_rank = local_dp_rank
-    self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
-    self.current_wave = 0
-
-
 vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel
-DPEngineCoreProc._init_data_parallel = _init_data_parallel
 ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port
 ParallelConfig.stateless_init_dp_group = stateless_init_dp_group

vllm_ascend/worker/worker_v1.py

Lines changed: 1 addition & 7 deletions

@@ -75,12 +75,6 @@ def __init__(
             distributed_init_method=distributed_init_method,
             is_driver_worker=is_driver_worker)
 
-        # NOTE(Yizhou): Since we do not set ASCEND_RT_VISIBLE_DEVICES in
-        # vllm_ascend, we need to set the device id manually.
-        local_dp_rank = self.vllm_config.parallel_config.data_parallel_rank_local
-        world_size = self.vllm_config.parallel_config.world_size
-        self.local_rank_across_dp = local_dp_rank * world_size + self.local_rank
-
         # Try to import mindie_turbo to accelerate vLLM inference.
         try_register_lib(
             "mindie_turbo",

@@ -124,7 +118,7 @@ def initialize_cache(self, num_gpu_blocks: int,
 
     def init_device(self):
         if self.device_config.device.type == "npu":
-            self.device = torch.device(f"npu:{self.local_rank_across_dp}")
+            self.device = torch.device(f"npu:{self.local_rank}")
             NPUPlatform.set_device(self.device)
             NPUPlatform.empty_cache()
             self.init_npu_memory = NPUPlatform.mem_get_info()[0]
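For readers wondering why plain `local_rank` is now safe here: the commit message states that the new torch-npu honors `ASCEND_RT_VISIBLE_DEVICES` set dynamically, so the devices visible to each DP engine process are already restricted to that replica's slice before device init. A hedged sketch of that idea follows, with hypothetical names; this is not vllm or torch-npu API, only an illustration of the visibility arithmetic.

# Hypothetical illustration: how a DP launcher could restrict each engine
# process to its own NPU slice, making `npu:{local_rank}` resolve correctly.
import os

def restrict_visible_npus(local_dp_rank: int, devices_per_replica: int) -> None:
    first = local_dp_rank * devices_per_replica
    visible = ",".join(
        str(d) for d in range(first, first + devices_per_replica))
    # Per the commit message, newer torch-npu honors this variable even when
    # it is set at runtime, before the device is initialized.
    os.environ["ASCEND_RT_VISIBLE_DEVICES"] = visible

restrict_visible_npus(local_dp_rank=1, devices_per_replica=2)
print(os.environ["ASCEND_RT_VISIBLE_DEVICES"])  # -> "2,3"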
