
Commit 83ee4c1

Author: yangcheng (AJ)

    add st

    Signed-off-by: yangcheng (AJ) <y00806874@china.huawei.com>

Parent: 8b296a8

File tree

2 files changed: 85 additions, 13 deletions


tests/e2e/multicard/test_qwen3_moe.py

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
"""
End-to-end test for Qwen3 MoE: run the offline data-parallel example
script as a subprocess and verify its output.

Run `pytest tests/e2e/multicard/test_qwen3_moe.py`.
"""

import os
import subprocess
import sys
from unittest.mock import patch

import pytest

MODELS = ["vllm-ascend/Qwen3-30B-A3B-Puring"]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3"})
def test_qwen3_moe_inference(model, max_tokens):
    script = "examples/offline_data_parallel.py"

    # The patched ASCEND_RT_VISIBLE_DEVICES is inherited by the subprocess.
    env = os.environ.copy()

    # DP=2 x TP=2 on a single node consumes the four visible NPUs.
    cmd = [
        sys.executable,
        script,
        "--model",
        model,
        "--dp-size",
        "2",
        "--tp-size",
        "2",
        "--node-size",
        "1",
        "--node-rank",
        "0",
        "--trust-remote-code",
        "--enforce-eager",
    ]

    print(f"Running subprocess: {' '.join(cmd)}")
    proc = subprocess.run(cmd,
                          env=env,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          timeout=600)
    output = proc.stdout.decode()

    print(output)

    # Both DP ranks must report work, and generation must have produced text.
    assert "DP rank 0 needs to process" in output
    assert "DP rank 1 needs to process" in output
    assert "Generated text:" in output
    assert proc.returncode == 0
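One detail worth calling out: `@patch.dict` scopes the `ASCEND_RT_VISIBLE_DEVICES` override to the test and restores the environment on exit, and because the test copies `os.environ` before spawning the child process, the override is inherited by the example script. A minimal, self-contained sketch of that pattern (plain Python, no Ascend hardware assumed; the variable name is reused from the test):

import os
from unittest.mock import patch

@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3"})
def visible_devices() -> str:
    # While the decorated function runs, the variable is set in this process
    # and is inherited by any child started with env=os.environ.copy().
    return os.environ["ASCEND_RT_VISIBLE_DEVICES"]

print(visible_devices())  # 0,1,2,3
# After the call, patch.dict has restored os.environ; the key is gone again
# (assuming it was not set in the surrounding shell).
print("ASCEND_RT_VISIBLE_DEVICES" in os.environ)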

vllm_ascend/models/qwen3_moe.py

Lines changed: 13 additions & 13 deletions
@@ -19,13 +19,9 @@
 from typing import Optional
 
 import torch
+import vllm
 from torch import nn
 from transformers import PretrainedConfig
-from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.distributed.parallel_state import get_ep_group
-from vllm_ascend.ops.fused_moe import AscendFusedMoE
-
-import vllm
 from vllm.attention import AttentionMetadata
 from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group
 from vllm.distributed.parallel_state import get_dp_group
@@ -34,6 +30,10 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM
 
+from vllm_ascend.ascend_config import get_ascend_config
+from vllm_ascend.distributed.parallel_state import get_ep_group
+from vllm_ascend.ops.fused_moe import AscendFusedMoE
+
 
 class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
     packed_modules_mapping = {
@@ -46,19 +46,20 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
             "gate_proj",
             "up_proj",
         ],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+        "experts": [
+            "experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"
+        ],
     }
 
 
 class AscendQwen3MoeSparseMoeBlock(nn.Module):
     top_k: int
 
     def __init__(
-            self,
-            config: PretrainedConfig,
-            quant_config: Optional[QuantizationConfig] = None,
-            prefix: str = "",
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.tp_size = get_tensor_model_parallel_world_size()
@@ -126,8 +127,7 @@ def forward(
             is_prefill=is_prefill,
             top_k=self.top_k,
             enable_force_load_balance=enable_force_load_balance,
-            shared_experts=None,
-        )
+            shared_experts=None)
 
         return hidden_states
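For context, a hedged usage sketch (not part of the commit): once vllm-ascend registers `CustomQwen3MoeForCausalLM` for the Qwen3 MoE architecture, the model can be driven through vLLM's standard offline API. The model name and parallel settings mirror the e2e test above; the prompt and sampling values are illustrative assumptions.

from vllm import LLM, SamplingParams

# Tensor parallelism across two devices; eager mode mirrors the test's
# --enforce-eager flag.
llm = LLM(model="vllm-ascend/Qwen3-30B-A3B-Puring",
          tensor_parallel_size=2,
          trust_remote_code=True,
          enforce_eager=True)

outputs = llm.generate(["Briefly explain expert routing in MoE models."],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)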
