Commit d6718ce

[DP][V1] Fix rank set in DP scenario
Signed-off-by: MengqingCao <cmq0113@163.com>
1 parent f5404dc
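In short: the NPU worker now derives its device id from local_rank directly, and the vllm-ascend override of DPEngineCoreProc._init_data_parallel is removed so that upstream vLLM's data-parallel initialization runs unmodified. To support this, torch-npu is bumped to the 2.5.1.post1.dev20250528 wheel (resolved from the Huawei Cloud Ascend mirror), every build step gains pip's --no-build-isolation flag, and a multicard data-parallel correctness test is added.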

13 files changed: +81 -45 lines changed

.github/workflows/accuracy_test.yaml

Lines changed: 1 addition & 1 deletion

@@ -175,7 +175,7 @@ jobs:
         working-directory: ./vllm-ascend
         run: |
           pip install -r requirements-dev.txt
-          pip install -e .
+          pip install -e . --no-build-isolation

       - name: Install lm-eval, ray, and datasets
         run: |
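A note on the build flag (the motivation is not spelled out in the commit message, so this is inferred): with default build isolation, pip resolves pyproject.toml's torch-npu build requirement in a throwaway environment, where the dev-tagged wheel is only available from the extra index added to requirements.txt. Passing --no-build-isolation makes the editable build compile against the torch-npu already installed in the current environment instead. The same flag is added at every install site in the workflows and Dockerfiles below.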

.github/workflows/nightly_benchmarks.yaml

Lines changed: 1 addition & 1 deletion

@@ -116,7 +116,7 @@ jobs:

       - name: Install vllm-project/vllm-ascend
         run: |
-          pip install -e .
+          pip install -e . --no-build-isolation
           pip install -r benchmarks/requirements-bench.txt

       - name: Run current commit benchmarks

.github/workflows/vllm_ascend_test.yaml

Lines changed: 2 additions & 2 deletions

@@ -216,7 +216,7 @@ jobs:
       - name: Install vllm-project/vllm-ascend
         run: |
           pip install -r requirements-dev.txt
-          pip install -v -e .
+          pip install -v -e . --no-build-isolation

       - name: Run e2e test for V1 Engine
         env:
@@ -313,7 +313,7 @@ jobs:
       - name: Install vllm-project/vllm-ascend
         run: |
           pip install -r requirements-dev.txt
-          pip install -v -e .
+          pip install -v -e . --no-build-isolation

       - name: Run vllm-project/vllm-ascend test for V1 Engine
         env:

.github/workflows/vllm_ascend_test_long_term.yaml

Lines changed: 1 addition & 1 deletion

@@ -90,7 +90,7 @@ jobs:
       - name: Install vllm-project/vllm-ascend
         run: |
           pip install -r requirements-dev.txt
-          pip install -v -e .
+          pip install -v -e . --no-build-isolation

       - name: Run vllm-project/vllm-ascend long term test
         run: |

.github/workflows/vllm_ascend_test_pd.yaml

Lines changed: 1 addition & 1 deletion

@@ -99,7 +99,7 @@ jobs:
       - name: Install vllm-project/vllm-ascend
         run: |
           pip install -r requirements-dev.txt
-          pip install -v -e .
+          pip install -v -e . --no-build-isolation

       - name: Run vllm-project/vllm-ascend PD Disaggregation test
         run: |

Dockerfile

Lines changed: 1 addition & 1 deletion

@@ -49,7 +49,7 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm
 RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
     source /usr/local/Ascend/nnal/atb/set_env.sh && \
     export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
-    python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+    python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --no-build-isolation --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip cache purge

 # Install modelscope (for fast download) and ray (for multinode)

Dockerfile.openEuler

Lines changed: 1 addition & 1 deletion

@@ -46,7 +46,7 @@ RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ -
 RUN source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
     source /usr/local/Ascend/nnal/atb/set_env.sh && \
     export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
-    python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
+    python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --no-build-isolation --extra-index https://download.pytorch.org/whl/cpu/ && \
     python3 -m pip cache purge

 # Install modelscope (for fast download) and ray (for multinode)

pyproject.toml

Lines changed: 1 addition & 1 deletion

@@ -12,7 +12,7 @@ requires = [
     "scipy",
     "setuptools>=64",
     "setuptools-scm>=8",
-    "torch-npu==2.5.1",
+    "torch-npu==2.5.1.post1.dev20250528",
     "torch>=2.5.1",
     "torchvision<0.21.0",
     "wheel",

requirements.txt

Lines changed: 2 additions & 1 deletion

@@ -10,7 +10,8 @@ pyyaml
 scipy
 setuptools>=64
 setuptools-scm>=8
-torch-npu==2.5.1
+--extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi
+torch-npu==2.5.1.post1.dev20250528
 torch>=2.5.1
 torchvision<0.21.0
 wheel
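pip honors an --extra-index-url line placed inside a requirements file, so torch-npu==2.5.1.post1.dev20250528 resolves from the Ascend mirror at install time. The same version is pinned in pyproject.toml's build requirements above, keeping isolated and non-isolated builds on an identical torch-npu.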

tests/multicard/test_data_parallel.py

Lines changed: 66 additions & 0 deletions

@@ -0,0 +1,66 @@
+#
+# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
+# Copyright 2023 The vLLM team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+"""
+Compare the outputs of vLLM with and without data parallelism.
+
+Run `pytest tests/multicard/test_data_parallel.py`.
+"""
+
+import os
+
+import pytest
+
+from tests.conftest import VllmRunner
+from tests.model_utils import check_outputs_equal
+
+MODELS = ["Qwen/Qwen2.5-0.5B-Instruct"]
+
+
+@pytest.mark.skipif(os.getenv("VLLM_USE_V1") == "0",
+                    reason="Data parallel is only supported on v1")
+@pytest.mark.parametrize("model", MODELS)
+@pytest.mark.parametrize("max_tokens", [32])
+def test_data_parallel_correctness(
+    model: str,
+    max_tokens: int,
+) -> None:
+    example_prompts = [
+        "Hello, my name is", "The president of the United States is",
+        "The capital of France is", "The future of AI is"
+    ]
+
+    with VllmRunner(model_name=model,
+                    max_model_len=1024,
+                    max_num_seqs=16,
+                    data_parallel_size=2,
+                    distributed_executor_backend="mp") as vllm_model:
+        vllm_dp_outputs = vllm_model.generate_greedy(example_prompts,
+                                                     max_tokens)
+
+    with VllmRunner(
+            model_name=model,
+            max_model_len=1024,
+            max_num_seqs=16,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+
+    check_outputs_equal(
+        outputs_0_lst=vllm_outputs,
+        outputs_1_lst=vllm_dp_outputs,
+        name_0="vllm_outputs",
+        name_1="vllm_dp_outputs",
+    )

vllm_ascend/patch/__init__.py

Lines changed: 2 additions & 11 deletions

@@ -47,16 +47,7 @@
 #    Related PR (if no, explain why):
 #    Future Plan:
 #       Remove those patch when vllm merged them
-# 2. `vllm.v1.engine.core.DPEngineCoreProc._init_data_parallel`
-#    Why:
-#       There is some bug for ASCEND_RT_VISIBLE_DEVICES usage.
-#    How:
-#       The ASCEND_RT_VISIBLE_DEVICES related code is dropped.
-#    Related PR (if no, explain why):
-#       No, this is a bug for vllm ascend
-#    Future Plan:
-#       Remove this patch once ASCEND_RT_VISIBLE_DEVICES bug is fixed.
-# 3. `vllm.config.ParallelConfig.get_next_dp_init_port`
+# 2. `vllm.config.ParallelConfig.get_next_dp_init_port`
 #    Why:
 #       vllm doesn't support get port from environment.
 #    How:
@@ -65,7 +56,7 @@
 #       Need a PR to vllm to support get port from environment.
 #    Future Plan:
 #       Remove those patch when vllm merged them
-# 4. `vllm.config.ParallelConfig.ParallelConfig.stateless_init_dp_group`
+# 3. `vllm.config.ParallelConfig.ParallelConfig.stateless_init_dp_group`
 #    Why:
 #       vLLM use gloo backend by default to initialize stateless dp process gourp, but we want to use hccl here to
 #       get better performance

vllm_ascend/patch/platform/patch_common/patch_distributed.py

Lines changed: 1 addition & 17 deletions

@@ -21,10 +21,9 @@
 import vllm.distributed
 import vllm.envs as envs
 from torch.distributed import ProcessGroup
-from vllm.config import ParallelConfig, VllmConfig
+from vllm.config import ParallelConfig
 from vllm.distributed.utils import \
     stateless_init_torch_distributed_process_group
-from vllm.v1.engine.core import DPEngineCoreProc


 def ascend_destroy_model_parallel():
@@ -79,21 +78,6 @@ def stateless_init_dp_group(self) -> "ProcessGroup":
     return dp_group


-def _init_data_parallel(self, vllm_config: VllmConfig):
-    # Configure NPUs and stateless process group for data parallel.
-    dp_rank = vllm_config.parallel_config.data_parallel_rank
-    dp_size = vllm_config.parallel_config.data_parallel_size
-    local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local
-
-    assert dp_size > 1
-    assert 0 <= local_dp_rank <= dp_rank < dp_size
-
-    self.local_dp_rank = local_dp_rank
-    self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
-    self.current_wave = 0
-
-
 vllm.distributed.parallel_state.destroy_model_parallel = ascend_destroy_model_parallel
-DPEngineCoreProc._init_data_parallel = _init_data_parallel
 ParallelConfig.get_next_dp_init_port = parallel_config_get_dp_port
 ParallelConfig.stateless_init_dp_group = stateless_init_dp_group
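With the override deleted, the stock DPEngineCoreProc._init_data_parallel from upstream vLLM now runs on Ascend as well; only get_next_dp_init_port (port from environment) and stateless_init_dp_group (hccl instead of gloo) remain patched, matching the renumbered list in vllm_ascend/patch/__init__.py above.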

vllm_ascend/worker/worker_v1.py

Lines changed: 1 addition & 7 deletions

@@ -75,12 +75,6 @@ def __init__(
             distributed_init_method=distributed_init_method,
             is_driver_worker=is_driver_worker)

-        # NOTE(Yizhou): Since we do not set ASCEND_RT_VISIBLE_DEVICES in
-        # vllm_ascend, we need to set the device id manually.
-        local_dp_rank = self.vllm_config.parallel_config.data_parallel_rank_local
-        world_size = self.vllm_config.parallel_config.world_size
-        self.local_rank_across_dp = local_dp_rank * world_size + self.local_rank
-
         # Try to import mindie_turbo to accelerate vLLM inference.
         try_register_lib(
             "mindie_turbo",
@@ -124,7 +118,7 @@ def initialize_cache(self, num_gpu_blocks: int,

     def init_device(self):
         if self.device_config.device.type == "npu":
-            self.device = torch.device(f"npu:{self.local_rank_across_dp}")
+            self.device = torch.device(f"npu:{self.local_rank}")
             NPUPlatform.set_device(self.device)
             NPUPlatform.empty_cache()
             self.init_npu_memory = NPUPlatform.mem_get_info()[0]
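To see what the removed arithmetic did, here is a minimal standalone sketch (illustrative only; the helper names are hypothetical and the 2x2 topology is just an example): the old code spread workers across a global device index computed from the local DP rank, while the new code uses the worker's own local_rank, with per-engine device visibility expected to come from the upstream vLLM DP initialization that this commit stops patching out.

```python
# Illustrative sketch, not repo code: contrast the removed device mapping
# with the one this commit adopts. Topology: 2 DP engines per node
# (local_dp_rank in {0, 1}), each driving world_size=2 workers.

def old_device_index(local_dp_rank: int, world_size: int, local_rank: int) -> int:
    # Removed scheme (worker_v1.py before this commit): offset each DP
    # engine's workers past the previous engine's devices.
    return local_dp_rank * world_size + local_rank

def new_device_index(local_rank: int) -> int:
    # New scheme: the worker binds to npu:<local_rank> directly.
    return local_rank

for local_dp_rank in range(2):
    for local_rank in range(2):
        old = old_device_index(local_dp_rank, world_size=2, local_rank=local_rank)
        new = new_device_index(local_rank)
        print(f"dp_rank={local_dp_rank} local_rank={local_rank}: "
              f"old -> npu:{old}, new -> npu:{new}")
```

Once each engine's visible devices are narrowed per DP rank (e.g. via ASCEND_RT_VISIBLE_DEVICES), every engine sees its own devices starting at npu:0, which is presumably why the cross-DP offset was dropped in favor of plain local_rank.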
