
Commit 46fb690

[CustomOP][Refactor] Register CustomOP instead of overwrite forward_oot
* fix rope ut
* add layernorm ut
* add ut for common fused moe
* update test

Signed-off-by: MengqingCao <cmq0113@163.com>
1 parent 494b0f4 commit 46fb690
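For context on the pattern named in the commit title, here is a minimal sketch, not taken from this commit: the plugin defines an Ascend-specific subclass that supplies forward_oot and registers that subclass with vLLM as a CustomOp, rather than overwriting forward_oot on the upstream class. The class name AscendQuickGELU and the registration comment are assumptions about the surrounding vllm_ascend code, not part of this diff.

# Hedged illustration only, not code from commit 46fb690.
import torch
from vllm.model_executor.layers.activation import QuickGELU


class AscendQuickGELU(QuickGELU):
    """Ascend override registered as a CustomOp instead of patching QuickGELU."""

    def forward_oot(self, x: torch.Tensor) -> torch.Tensor:
        import torch_npu  # requires an Ascend / torch_npu environment
        return torch_npu.npu_fast_gelu(x)


# The subclass is then registered with vLLM's CustomOp machinery when the plugin
# loads (see the register_ops entry point added to setup.py below), so vLLM
# dispatches to the Ascend implementation without the old forward_oot monkey-patch.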


14 files changed: +685, -434 lines


setup.py

Lines changed: 4 additions & 2 deletions
@@ -391,7 +391,9 @@ def _read_requirements(filename: str) -> List[str]:
     extras_require={},
     entry_points={
         "vllm.platform_plugins": ["ascend = vllm_ascend:register"],
-        "vllm.general_plugins":
-        ["ascend_enhanced_model = vllm_ascend:register_model"],
+        "vllm.general_plugins": [
+            "ascend_enhanced_model = vllm_ascend:register_model",
+            "dummy_custom_ops = vllm_ascend:register_ops"
+        ],
     },
 )
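The new dummy_custom_ops entry point points vLLM's general-plugin loader at vllm_ascend:register_ops. That function's body is not part of this diff; a plausible minimal form, an assumption rather than the actual implementation, would simply import the ops package so its CustomOp subclasses register themselves at plugin-load time.

# Hypothetical sketch of vllm_ascend/__init__.py:register_ops (not shown in this diff).
def register_ops():
    # Importing vllm_ascend.ops is assumed to trigger CustomOp registration as a
    # side effect; the real implementation may differ.
    import vllm_ascend.ops  # noqa: F401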

tests/e2e/singlecard/ops/test_fused_moe.py

Lines changed: 0 additions & 3 deletions
@@ -19,9 +19,6 @@
 
 Run `pytest tests/ops/test_fused_moe.py`.
 """
-# fused moe ops test will hit the infer_schema error, we need add the patch
-# here to make the test pass.
-import vllm_ascend.patch.worker.patch_common.patch_utils  # type: ignore[import]  # isort: skip  # noqa
 
 import pytest
 import torch

tests/ut/ops/test_activation.py

Lines changed: 46 additions & 0 deletions
New file:

from unittest.mock import patch

import pytest
import torch
from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul


@pytest.fixture
def dummy_tensor():
    return torch.randn(4, 8, dtype=torch.float16)


@patch("torch_npu.npu_fast_gelu", side_effect=lambda x: x + 1)
def test_QuickGELU_forward(mock_gelu, dummy_tensor):
    layer = QuickGELU()
    out = layer.forward(dummy_tensor)

    expected_out = dummy_tensor + 1
    assert torch.allclose(out, expected_out)

    mock_gelu.assert_called_once()


@pytest.mark.parametrize("is_310p_return", [True, False])
@patch("torch_npu.npu_swiglu", side_effect=lambda x: x + 1)
def test_SiluAndMul_forward(mock_swiglu, is_310p_return, dummy_tensor):

    with patch("vllm_ascend.utils.is_310p", return_value=is_310p_return):
        layer = SiluAndMul()
        out = layer.forward(dummy_tensor)

    if is_310p_return:
        expected_arg = dummy_tensor.to(torch.float32)
    else:
        expected_arg = dummy_tensor

    # assert mock_swiglu.call_count == 1
    mock_swiglu.assert_called_once()

    actual_arg = mock_swiglu.call_args[0][0]
    assert torch.allclose(
        actual_arg,
        expected_arg), "npu_swiglu called with unexpected input"

    expected_out = dummy_tensor + 1
    assert torch.allclose(out, expected_out)

tests/ut/ops/test_common_fused_moe.py

Lines changed: 146 additions & 0 deletions
New file:

import os
from typing import Optional
from unittest.mock import patch

import pytest
import torch
from vllm.config import CompilationLevel, ModelConfig, get_current_vllm_config

from vllm.model_executor.layers.fused_moe.config import (  # isort: skip
    FusedMoEConfig, FusedMoEParallelConfig)
from vllm.model_executor.layers.fused_moe.layer import (  # isort: skip
    FusedMoE, UnquantizedFusedMoEMethod)

NUM_EXPERTS = 256
TOPK = 8
TP_SIZE = 1
DP_SIZE = 1


def mock_fused_experts(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    top_k: int,
    expert_map: torch.Tensor = None,
    apply_router_weight_on_input: bool = False,
    max_num_tokens: Optional[int] = None,
) -> torch.Tensor:
    return hidden_states + 1


def mock_fused_experts_moge(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    top_k: int,
    global_num_experts: int,
    expert_map: torch.Tensor = None,
    apply_router_weight_on_input: bool = False,
) -> torch.Tensor:
    return 2 * hidden_states


def mock_npu_moe_gating_top_k_softmax(x: torch.Tensor,
                                      finished: torch.Tensor = None,
                                      k: int = 0):
    topk_weights = x[:, :k]
    topk_ids = torch.range(0, k - 1).unsqueeze(0)
    row_idx = torch.range(0, k - 1).unsqueeze(0)
    return topk_weights, topk_ids, row_idx


def create_fused_moe_method(vllm_config):
    moe_parallel_config = FusedMoEParallelConfig.make(
        tp_size_=TP_SIZE,
        dp_size_=DP_SIZE,
        vllm_parallel_config=vllm_config.parallel_config)
    moe_config = FusedMoEConfig.make(
        num_experts=NUM_EXPERTS,
        experts_per_token=TOPK,
        hidden_dim=32,
        num_local_experts=NUM_EXPERTS,
        moe_parallel_config=moe_parallel_config,
        in_dtype=torch.float16,
        max_num_tokens=NUM_EXPERTS,
        quant_config=None,
    )
    layer = UnquantizedFusedMoEMethod(moe=moe_config)
    return layer


@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("compilation_level", [0, 1, 2, 3])
def test_AscendUnquantizedFusedMoEMethod_init(enforce_eager,
                                              compilation_level):
    vllm_config = get_current_vllm_config()
    vllm_config.model_config = ModelConfig()
    vllm_config.model_config.enforce_eager = enforce_eager
    vllm_config.compilation_config.level = compilation_level
    with patch("vllm.config._current_vllm_config", vllm_config):
        layer = create_fused_moe_method(vllm_config)

    # check initialization
    assert hasattr(layer, "use_aclgraph")
    assert hasattr(layer, "max_num_batched_tokens")
    assert layer.max_num_batched_tokens == vllm_config.scheduler_config.max_num_batched_tokens
    expected_use_aclgraph = vllm_config.compilation_config.level == CompilationLevel.PIECEWISE and not vllm_config.model_config.enforce_eager
    assert layer.use_aclgraph == expected_use_aclgraph


@pytest.mark.parametrize("select_gating_topk_softmax_experts", ["0", "1"])
@pytest.mark.parametrize("is_310p_return", [True, False])
@patch("vllm_ascend.ops.common_fused_moe.fused_experts_moge",
       side_effect=mock_fused_experts_moge)
@patch("vllm_ascend.ops.common_fused_moe.fused_experts",
       side_effect=mock_fused_experts)
@patch("torch_npu.npu_moe_gating_top_k_softmax",
       side_effect=mock_npu_moe_gating_top_k_softmax)
def test_AscendUnquantizedFusedMoEMethod_forward(
        mock_npu_moe_gating_top_k_softmax, mock_fused_experts,
        mock_fused_experts_moge, select_gating_topk_softmax_experts,
        is_310p_return):
    vllm_config = get_current_vllm_config()
    vllm_config.model_config = ModelConfig()
    vllm_config.model_config.enforce_eager = False
    vllm_config.compilation_config.level = 3
    with patch("vllm.config._current_vllm_config", vllm_config), patch(
            "vllm_ascend.utils.is_310p",
            return_value=is_310p_return), patch.dict(os.environ, {
                'SELECT_GATING_TOPK_SOTFMAX_EXPERTS':
                select_gating_topk_softmax_experts
            }):
        # prepare input and create layer
        layer = create_fused_moe_method(vllm_config)
        fused_moe = FusedMoE(num_experts=NUM_EXPERTS,
                             top_k=TOPK,
                             hidden_size=32,
                             intermediate_size=32,
                             dp_size=DP_SIZE,
                             tp_size=TP_SIZE)
        x = torch.randn(32, NUM_EXPERTS)
        router_logits = torch.randn(32, 128)
        # invoke forward
        layer.forward(
            fused_moe,
            x,
            use_grouped_topk=False,
            top_k=TOPK,
            router_logits=router_logits,
            renormalize=True,
            global_num_experts=NUM_EXPERTS,
        )
        # check 310p
        if is_310p_return:
            mock_fused_experts_moge.assert_called_once()
        else:
            mock_fused_experts.assert_called_once()
        # check SELECT_GATING_TOPK_SOTFMAX_EXPERTS
        if os.environ["SELECT_GATING_TOPK_SOTFMAX_EXPERTS"] == "1":
            mock_npu_moe_gating_top_k_softmax.assert_called_once()
        else:
            mock_npu_moe_gating_top_k_softmax.assert_not_called()

tests/ut/ops/test_layernorm.py

Lines changed: 53 additions & 0 deletions
New file:

from unittest.mock import patch

import pytest
import torch
from vllm.model_executor.layers.layernorm import RMSNorm


@pytest.fixture
def dummy_tensor():
    return torch.randn(4, 8, dtype=torch.float16)


def mock_rms_norm(x, weight, eps):
    return x + 1, None


def mock_add_rms_norm(x, residual, weight, eps):
    return 2 * x, None, 2 * residual


@pytest.mark.parametrize("is_310p_return", [True, False])
@pytest.mark.parametrize("residual",
                         [None, torch.randn(4, 8, dtype=torch.float32)])
@patch("torch_npu.npu_rms_norm", side_effect=mock_rms_norm)
@patch("torch_npu.npu_add_rms_norm", side_effect=mock_add_rms_norm)
def test_SiluAndMul_forward(mock_add_rmsnorm, mock_rmsnorm, is_310p_return,
                            residual, dummy_tensor):

    with patch("vllm_ascend.utils.is_310p", return_value=is_310p_return):
        layer = RMSNorm(hidden_size=32, eps=1e-05)
        if residual is not None:
            out_x, out_residual = layer.forward(dummy_tensor, residual)

            if is_310p_return:
                expected_arg_x = dummy_tensor + residual.to(dummy_tensor.dtype)
                expected_out_x = expected_arg_x + 1
                expected_out_residual = expected_arg_x.to(residual.dtype)

                mock_rmsnorm.assert_called_once()
                assert torch.allclose(out_x, expected_out_x)
                assert torch.allclose(out_residual, expected_out_residual)
            else:
                expected_out_x = 2 * dummy_tensor
                expected_out_residual = 2 * residual
                mock_add_rmsnorm.assert_called_once()
                assert torch.allclose(out_x, expected_out_x)
                assert torch.allclose(out_residual, expected_out_residual)
        else:
            out_x = layer.forward(dummy_tensor, residual)
            expected_out_x = dummy_tensor + 1

            mock_rmsnorm.assert_called_once()
            assert torch.allclose(out_x, expected_out_x)
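As a usage note, the new unit tests can be run on their own; the sketch below assumes the repository root as the working directory and an environment where vLLM and torch_npu are importable (the NPU kernels themselves are mocked by the tests).

# Equivalent to invoking pytest directly on the test files added in this commit.
import pytest

pytest.main([
    "tests/ut/ops/test_activation.py",
    "tests/ut/ops/test_layernorm.py",
    "tests/ut/ops/test_common_fused_moe.py",
])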
