import os
from typing import Optional
from unittest.mock import patch

import pytest
import torch
from vllm.config import CompilationLevel, ModelConfig, get_current_vllm_config

from vllm.model_executor.layers.fused_moe.config import (  # isort: skip
    FusedMoEConfig, FusedMoEParallelConfig)
from vllm.model_executor.layers.fused_moe.layer import (  # isort: skip
    FusedMoE, UnquantizedFusedMoEMethod)

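# Shared test dimensions: a 256-expert layer with top-8 routing, running on a
# single TP/DP rank.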
NUM_EXPERTS = 256
TOPK = 8
TP_SIZE = 1
DP_SIZE = 1


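# Lightweight stand-ins for the fused MoE kernels so the tests can run without
# real NPU kernels. Each mock returns a simple transformation of its input;
# the tests below only assert which mock actually gets called.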
def mock_fused_experts(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    top_k: int,
    expert_map: Optional[torch.Tensor] = None,
    apply_router_weight_on_input: bool = False,
    max_num_tokens: Optional[int] = None,
) -> torch.Tensor:
    return hidden_states + 1


def mock_fused_experts_moge(
    hidden_states: torch.Tensor,
    w1: torch.Tensor,
    w2: torch.Tensor,
    topk_weights: torch.Tensor,
    topk_ids: torch.Tensor,
    top_k: int,
    global_num_experts: int,
    expert_map: Optional[torch.Tensor] = None,
    apply_router_weight_on_input: bool = False,
) -> torch.Tensor:
    return 2 * hidden_states


def mock_npu_moe_gating_top_k_softmax(x: torch.Tensor,
                                      finished: Optional[torch.Tensor] = None,
                                      k: int = 0):
    # torch.arange keeps the ids integral (torch.range is deprecated and
    # returns floats).
    topk_weights = x[:, :k]
    topk_ids = torch.arange(k).unsqueeze(0)
    row_idx = torch.arange(k).unsqueeze(0)
    return topk_weights, topk_ids, row_idx


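# Builds an UnquantizedFusedMoEMethod on top of a minimal single-rank
# FusedMoEParallelConfig / FusedMoEConfig so each test gets a fresh method
# instance under the patched vllm config.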
def create_fused_moe_method(vllm_config):
    moe_parallel_config = FusedMoEParallelConfig.make(
        tp_size_=TP_SIZE,
        dp_size_=DP_SIZE,
        vllm_parallel_config=vllm_config.parallel_config)
    moe_config = FusedMoEConfig.make(
        num_experts=NUM_EXPERTS,
        experts_per_token=TOPK,
        hidden_dim=32,
        num_local_experts=NUM_EXPERTS,
        moe_parallel_config=moe_parallel_config,
        in_dtype=torch.float16,
        max_num_tokens=NUM_EXPERTS,
        quant_config=None,
    )
    layer = UnquantizedFusedMoEMethod(moe=moe_config)
    return layer


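# Initialization should enable ACL graph capture only when the compilation
# level is PIECEWISE and eager mode is not forced, and should pick up
# max_num_batched_tokens from the scheduler config.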
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("compilation_level", [0, 1, 2, 3])
def test_AscendUnquantizedFusedMoEMethod_init(enforce_eager,
                                              compilation_level):
    vllm_config = get_current_vllm_config()
    vllm_config.model_config = ModelConfig()
    vllm_config.model_config.enforce_eager = enforce_eager
    vllm_config.compilation_config.level = compilation_level
    with patch("vllm.config._current_vllm_config", vllm_config):
        layer = create_fused_moe_method(vllm_config)

        # check initialization
        assert hasattr(layer, "use_aclgraph")
        assert hasattr(layer, "max_num_batched_tokens")
        assert (layer.max_num_batched_tokens ==
                vllm_config.scheduler_config.max_num_batched_tokens)
        expected_use_aclgraph = (
            vllm_config.compilation_config.level == CompilationLevel.PIECEWISE
            and not vllm_config.model_config.enforce_eager)
        assert layer.use_aclgraph == expected_use_aclgraph


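# Forward dispatch: on 310P hardware the MoGE kernel should be used, otherwise
# the generic fused_experts kernel; when the SELECT_GATING_TOPK_SOTFMAX_EXPERTS
# env switch is "1", gating should go through
# torch_npu.npu_moe_gating_top_k_softmax.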
@pytest.mark.parametrize("select_gating_topk_softmax_experts", ["0", "1"])
@pytest.mark.parametrize("is_310p_return", [True, False])
@patch("vllm_ascend.ops.common_fused_moe.fused_experts_moge",
       side_effect=mock_fused_experts_moge)
@patch("vllm_ascend.ops.common_fused_moe.fused_experts",
       side_effect=mock_fused_experts)
@patch("torch_npu.npu_moe_gating_top_k_softmax",
       side_effect=mock_npu_moe_gating_top_k_softmax)
def test_AscendUnquantizedFusedMoEMethod_forward(
        mock_npu_moe_gating_top_k_softmax, mock_fused_experts,
        mock_fused_experts_moge, select_gating_topk_softmax_experts,
        is_310p_return):
    vllm_config = get_current_vllm_config()
    vllm_config.model_config = ModelConfig()
    vllm_config.model_config.enforce_eager = False
    vllm_config.compilation_config.level = 3
    with patch("vllm.config._current_vllm_config", vllm_config), patch(
            "vllm_ascend.utils.is_310p",
            return_value=is_310p_return), patch.dict(
                os.environ, {
                    "SELECT_GATING_TOPK_SOTFMAX_EXPERTS":
                    select_gating_topk_softmax_experts
                }):
        # prepare inputs and create the layer under test
        layer = create_fused_moe_method(vllm_config)
        fused_moe = FusedMoE(num_experts=NUM_EXPERTS,
                             top_k=TOPK,
                             hidden_size=32,
                             intermediate_size=32,
                             dp_size=DP_SIZE,
                             tp_size=TP_SIZE)
        x = torch.randn(32, NUM_EXPERTS)
        router_logits = torch.randn(32, 128)
        # invoke forward
        layer.forward(
            fused_moe,
            x,
            use_grouped_topk=False,
            top_k=TOPK,
            router_logits=router_logits,
            renormalize=True,
            global_num_experts=NUM_EXPERTS,
        )
        # 310P should dispatch to the MoGE kernel, other SoCs to fused_experts
        if is_310p_return:
            mock_fused_experts_moge.assert_called_once()
        else:
            mock_fused_experts.assert_called_once()
        # SELECT_GATING_TOPK_SOTFMAX_EXPERTS routes gating through the
        # torch_npu softmax-top-k kernel
        if os.environ["SELECT_GATING_TOPK_SOTFMAX_EXPERTS"] == "1":
            mock_npu_moe_gating_top_k_softmax.assert_called_once()
        else:
            mock_npu_moe_gating_top_k_softmax.assert_not_called()