
Commit 83ee4c1

Author: yangcheng (AJ)

    add st

    Signed-off-by: yangcheng (AJ) <y00806874@china.huawei.com>

Parent: 8b296a8

File tree

2 files changed: 85 additions, 13 deletions


tests/e2e/multicard/test_qwen3_moe.py

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
"""
End-to-end test for Qwen3 MoE: run the offline data-parallel example
script as a subprocess and verify its output.

Run `pytest tests/e2e/multicard/test_qwen3_moe.py`.
"""

import os
import subprocess
import sys
from unittest.mock import patch

import pytest

MODELS = ["vllm-ascend/Qwen3-30B-A3B-Puring"]


@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3"})
def test_qwen3_moe_inference(model, max_tokens):
    script = "examples/offline_data_parallel.py"

    # The patched ASCEND_RT_VISIBLE_DEVICES is inherited by the subprocess.
    env = os.environ.copy()

    # DP=2 x TP=2 on a single node consumes the four visible NPUs.
    cmd = [
        sys.executable,
        script,
        "--model",
        model,
        "--dp-size",
        "2",
        "--tp-size",
        "2",
        "--node-size",
        "1",
        "--node-rank",
        "0",
        "--trust-remote-code",
        "--enforce-eager",
    ]

    print(f"Running subprocess: {' '.join(cmd)}")
    proc = subprocess.run(cmd,
                          env=env,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          timeout=600)
    output = proc.stdout.decode()

    print(output)

    # Both DP ranks must report work, and generation must have produced text.
    assert "DP rank 0 needs to process" in output
    assert "DP rank 1 needs to process" in output
    assert "Generated text:" in output
    assert proc.returncode == 0
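One detail worth calling out: `@patch.dict` scopes the `ASCEND_RT_VISIBLE_DEVICES` override to the test and restores the environment on exit, and because the test copies `os.environ` before spawning the child process, the override is inherited by the example script. A minimal, self-contained sketch of that pattern (plain Python, no Ascend hardware assumed; the variable name is reused from the test):

import os
from unittest.mock import patch

@patch.dict(os.environ, {"ASCEND_RT_VISIBLE_DEVICES": "0,1,2,3"})
def visible_devices() -> str:
    # While the decorated function runs, the variable is set in this process
    # and is inherited by any child started with env=os.environ.copy().
    return os.environ["ASCEND_RT_VISIBLE_DEVICES"]

print(visible_devices())  # 0,1,2,3
# After the call, patch.dict has restored os.environ; the key is gone again
# (assuming it was not set in the surrounding shell).
print("ASCEND_RT_VISIBLE_DEVICES" in os.environ)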

vllm_ascend/models/qwen3_moe.py

Lines changed: 13 additions & 13 deletions
@@ -19,13 +19,9 @@
 from typing import Optional
 
 import torch
+import vllm
 from torch import nn
 from transformers import PretrainedConfig
-from vllm_ascend.ascend_config import get_ascend_config
-from vllm_ascend.distributed.parallel_state import get_ep_group
-from vllm_ascend.ops.fused_moe import AscendFusedMoE
-
-import vllm
 from vllm.attention import AttentionMetadata
 from vllm.distributed import get_tensor_model_parallel_world_size, get_tp_group
 from vllm.distributed.parallel_state import get_dp_group
@@ -34,6 +30,10 @@
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM
 
+from vllm_ascend.ascend_config import get_ascend_config
+from vllm_ascend.distributed.parallel_state import get_ep_group
+from vllm_ascend.ops.fused_moe import AscendFusedMoE
+
 
 class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
     packed_modules_mapping = {
@@ -46,19 +46,20 @@ class CustomQwen3MoeForCausalLM(Qwen3MoeForCausalLM):
             "gate_proj",
             "up_proj",
         ],
-        "experts":
-        ["experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"],
+        "experts": [
+            "experts.0.gate_proj", "experts.0.up_proj", "experts.0.down_proj"
+        ],
     }
 
 
 class AscendQwen3MoeSparseMoeBlock(nn.Module):
     top_k: int
 
     def __init__(
-            self,
-            config: PretrainedConfig,
-            quant_config: Optional[QuantizationConfig] = None,
-            prefix: str = "",
+        self,
+        config: PretrainedConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+        prefix: str = "",
     ):
         super().__init__()
         self.tp_size = get_tensor_model_parallel_world_size()
@@ -126,8 +127,7 @@ def forward(
             is_prefill=is_prefill,
             top_k=self.top_k,
             enable_force_load_balance=enable_force_load_balance,
-            shared_experts=None,
-        )
+            shared_experts=None)
 
         return hidden_states
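For context, a hedged usage sketch (not part of the commit): once vllm-ascend registers `CustomQwen3MoeForCausalLM` for the Qwen3 MoE architecture, the model can be driven through vLLM's standard offline API. The model name and parallel settings mirror the e2e test above; the prompt and sampling values are illustrative assumptions.

from vllm import LLM, SamplingParams

# Tensor parallelism across two devices; eager mode mirrors the test's
# --enforce-eager flag.
llm = LLM(model="vllm-ascend/Qwen3-30B-A3B-Puring",
          tensor_parallel_size=2,
          trust_remote_code=True,
          enforce_eager=True)

outputs = llm.generate(["Briefly explain expert routing in MoE models."],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)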
