
Commit af4917a

Merge remote-tracking branch 'wjq/v0.9.1-dev' into wjq_091_dev
2 parents a6838f7 + d556d49

File tree

14 files changed: +490 -297 lines


tests/long_term/spec_decode_v1/test_v1_mtp_correctness.py

Lines changed: 4 additions & 2 deletions
@@ -114,7 +114,8 @@ def test_mtp_torchair_correctness(
         enforce_eager=False,
         additional_config={
             "torchair_graph_config": {
-                "enabled": True
+                "enabled": True,
+                "graph_batch_size": [256]
             },
             "ascend_scheduler_config": {
                 "enabled": True
@@ -132,7 +133,8 @@ def test_mtp_torchair_correctness(
         },
         additional_config={
             "torchair_graph_config": {
-                "enabled": True
+                "enabled": True,
+                "graph_batch_size": [256]
             },
             "ascend_scheduler_config": {
                 "enabled": True
Lines changed: 147 additions & 0 deletions
@@ -0,0 +1,147 @@
# fused moe ops test will hit the infer_schema error, we need add the patch
# here to make the test pass.
import vllm_ascend.patch.worker.patch_common.patch_utils  # type: ignore[import]  # isort: skip  # noqa

import json
import unittest
from typing import List, TypedDict
from unittest import mock

import torch

from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer


class Device(TypedDict):
    device_id: int
    device_expert: List[int]


class Layer(TypedDict):
    layer_id: int
    device_count: int
    device_list: List[Device]


class MockData(TypedDict):
    moe_layer_count: int
    layer_list: List[Layer]


MOCK_DATA: MockData = {
    "moe_layer_count": 1,
    "layer_list": [{
        "layer_id": 0,
        "device_count": 2,
        "device_list": [{
            "device_id": 0,
            "device_expert": [7, 2, 0, 3, 5]
        }, {
            "device_id": 1,
            "device_expert": [6, 1, 4, 7, 2]
        }]
    }]
}


class TestExpertLoadBalancer(unittest.TestCase):

    def setUp(self):
        json_file = "expert_map.json"
        with open(json_file, 'w') as f:
            json.dump(MOCK_DATA, f)

        self.expert_load_balancer = ExpertLoadBalancer(json_file,
                                                       global_expert_num=8)

    def test_init(self):
        self.assertIsInstance(self.expert_load_balancer.expert_map_tensor,
                              torch.Tensor)
        self.assertEqual(self.expert_load_balancer.layers_num,
                         MOCK_DATA["moe_layer_count"])
        self.assertEqual(self.expert_load_balancer.ranks_num,
                         MOCK_DATA["layer_list"][0]["device_count"])

    def test_generate_index_dicts(self):
        tensor_2d = torch.tensor([[7, 2, 0, 3, 5], [6, 1, 4, 7, 2]])
        result = self.expert_load_balancer.generate_index_dicts(tensor_2d)
        expected_result = [{
            7: 0,
            2: 1,
            0: 2,
            3: 3,
            5: 4
        }, {
            6: 5,
            1: 6,
            4: 7,
            7: 8,
            2: 9
        }]
        self.assertEqual(result, expected_result)

    def test_generate_expert_placement_map(self):
        expert_placement_map = self.expert_load_balancer.generate_expert_placement_map()
        self.assertEqual(expert_placement_map.shape,
                         (self.expert_load_balancer.layers_num,
                          self.expert_load_balancer.ranks_num, 8))
        self.assertTrue(torch.all(expert_placement_map >= -1))

    def test_generate_log2phy_expert_map(self):
        layer_id = 0
        log2phy_map = self.expert_load_balancer.generate_log2phy_expert_map(layer_id)
        self.assertEqual(log2phy_map.shape,
                         (self.expert_load_balancer.ranks_num, 8))
        self.assertTrue(torch.all(log2phy_map >= -1))

    @mock.patch("torch_npu.npu._lazy_init")
    @mock.patch("torch.npu.current_device", return_value="cpu")
    def test_get_rank_placement_map(self, mock_current_device, mock_lazy_init):
        layer_id = 0
        rank_id = 0
        rank_local_expert_num, rank_expert_map = self.expert_load_balancer.get_rank_placement_map(
            layer_id, rank_id)
        self.assertEqual(rank_local_expert_num, 5)
        expected_tensor = torch.tensor([2, -1, 1, 3, -1, 4, -1, 0],
                                       dtype=torch.int32).to(rank_expert_map.device)
        self.assertTrue(rank_expert_map.equal(expected_tensor))

        rank_id = 1
        rank_local_expert_num, rank_expert_map = self.expert_load_balancer.get_rank_placement_map(
            layer_id, rank_id)
        expected_tensor = torch.tensor([-1, 1, 4, -1, 2, -1, 0, 3],
                                       dtype=torch.int32).to(rank_expert_map.device)
        self.assertTrue(rank_expert_map.equal(expected_tensor))

    def test_get_rank_log2phy_map(self):
        layer_id = 0
        rank_id = 0
        log2phy_map = self.expert_load_balancer.get_rank_log2phy_map(
            layer_id, rank_id)
        expected_tensor = torch.tensor([2, 6, 1, 3, 7, 4, 5, 0],
                                       dtype=torch.int32).to(log2phy_map.device)
        self.assertTrue(log2phy_map.equal(expected_tensor))

        rank_id = 1
        log2phy_map = self.expert_load_balancer.get_rank_log2phy_map(
            layer_id, rank_id)
        expected_tensor = torch.tensor([2, 6, 9, 3, 7, 4, 5, 8],
                                       dtype=torch.int32).to(log2phy_map.device)
        self.assertTrue(log2phy_map.equal(expected_tensor))

    def test_get_global_redundant_expert_num(self):
        redundant_expert_num = self.expert_load_balancer.get_global_redundant_expert_num()
        expected_redundant_expert_num = len(MOCK_DATA["layer_list"][0]["device_list"][0]["device_expert"]) * \
            MOCK_DATA["layer_list"][0]["device_count"] - 8
        self.assertEqual(redundant_expert_num, expected_redundant_expert_num)
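
The expected tensors in test_get_rank_log2phy_map imply a simple logical-to-physical convention: physical expert slots are numbered row-major across ranks, and each rank prefers its own local copy of a replicated expert. The following is a standalone sketch derived only from the mock data above, not the library implementation:

# Standalone illustration of the logical -> physical expert mapping implied
# by the expected tensors in the test above (not ExpertLoadBalancer's code).
device_experts = [[7, 2, 0, 3, 5], [6, 1, 4, 7, 2]]  # per-rank experts from MOCK_DATA
global_expert_num = 8

def log2phy_for_rank(rank_id):
    per_rank = len(device_experts[0])
    mapping = [-1] * global_expert_num
    # First pass: take the first physical copy found, scanning ranks in order.
    for r, experts in enumerate(device_experts):
        for pos, expert in enumerate(experts):
            if mapping[expert] == -1:
                mapping[expert] = r * per_rank + pos
    # Second pass: prefer this rank's local copy when the expert is replicated.
    for pos, expert in enumerate(device_experts[rank_id]):
        mapping[expert] = rank_id * per_rank + pos
    return mapping

# Matches the expected tensors asserted in test_get_rank_log2phy_map.
assert log2phy_for_rank(0) == [2, 6, 1, 3, 7, 4, 5, 0]
assert log2phy_for_rank(1) == [2, 6, 9, 3, 7, 4, 5, 8]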

tests/ut/moe_util.py renamed to tests/ut/test_moe_util.py

Lines changed: 18 additions & 47 deletions
@@ -4,9 +4,9 @@
 import torch
 import pytest
 import math
+import vllm_ascend.patch.worker.patch_common.patch_utils
 
-from vllm_ascend.ops.moe_dispatcher.moe_utils import permute, get_capacity, topk_softmax_with_capacity, \
-    group_limited_topk, unpermute, sort_chunks_by_idxs
+from vllm_ascend.ops.moe_dispatcher.moe_utils import permute, get_capacity, topk_softmax_with_capacity, group_limited_topk, unpermute, sort_chunks_by_idxs
 
 
 class TestMoeUtils:
@@ -22,6 +22,7 @@ def setup(self):
         self.num_groups = 2
         self.scaling_factor = 1.0
 
+
     def test_group_limited_topk(self, setup):
         # Test group-limited topk routing
         scores = torch.randn(self.num_tokens, self.num_experts)
@@ -38,42 +39,33 @@ def test_group_limited_topk(self, setup):
         assert indices.shape == (self.num_tokens, self.topk)
         assert torch.all(indices < self.num_experts)
 
-    def test_topk_softmax_with_capacity(self, setup):
+
+    @pytest.mark.parametrize("score_function", ["softmax"])
+    def test_topk_softmax_with_capacity(self, setup, score_function):
         # Test topk softmax with capacity
         logits = torch.randn(self.num_tokens, self.num_experts)
 
         # Test without capacity
         probs, routing_map, tokens_per_expert, top_indices = topk_softmax_with_capacity(
             logits,
-            topk=self.topk
+            topk=self.topk,
+            score_function=score_function
         )
         assert probs.shape == (self.num_tokens, self.num_experts)
         assert routing_map.shape == (self.num_tokens, self.num_experts)
         assert tokens_per_expert.shape == (self.num_experts,)
 
-        # Test with capacity
-        probs, routing_map, tokens_per_expert, top_indices = topk_softmax_with_capacity(
-            logits,
-            topk=self.topk,
-            capacity_factor=self.capacity_factor,
-            pad_to_capacity=True
-        )
-        expert_capacity = get_capacity(
-            num_tokens=self.num_tokens * self.topk,
-            num_experts=self.num_experts,
-            capacity_factor=self.capacity_factor
-        )
-        assert tokens_per_expert.max() <= expert_capacity
-
         # Test with group routing
         probs, routing_map, tokens_per_expert, top_indices = topk_softmax_with_capacity(
             logits,
             topk=self.topk,
             num_groups=self.num_groups,
-            group_topk=self.group_topk
+            group_topk=self.group_topk,
+            score_function=score_function
         )
         assert probs.shape == (self.num_tokens, self.num_experts)
 
+
     def test_get_capacity(self, setup):
         # Test capacity calculation
         capacity = get_capacity(
@@ -94,6 +86,7 @@ def test_get_capacity(self, setup):
         )
         assert capacity == min_capacity
 
+
     def test_permute(self, setup):
         # Test token permutation
         tokens = torch.randn(self.num_tokens, self.hidden_size)
@@ -120,6 +113,7 @@ def test_permute(self, setup):
         assert permuted_tokens.shape[0] == num_out_tokens
         assert sorted_indices.shape[0] == num_out_tokens
 
+
     def test_unpermute(self, setup):
         # Test token unpermutation
         tokens = torch.randn(self.num_tokens, self.hidden_size)
@@ -162,6 +156,7 @@ def test_unpermute(self, setup):
         )
         assert restored_tokens.shape == tokens.shape
 
+
     def test_sort_chunks_by_idxs(self, setup):
         # Test chunk sorting
         input_tensor = torch.randn(10, self.hidden_size)
@@ -173,10 +168,10 @@ def test_sort_chunks_by_idxs(self, setup):
 
         # Verify the order is correct
         expected = torch.cat([input_tensor[5:], input_tensor[0: 3], input_tensor[3: 5]])
-        assert torch.allclose(output, expected) \
-            \
-            @ pytest.mark.parametrize("score_function", ["softmax", "sigmoid"])
+        assert torch.allclose(output, expected)
 
+
+    @pytest.mark.parametrize("score_function", ["softmax"])
     def test_score_functions(self, setup, score_function):
         # Test different score functions
         logits = torch.randn(self.num_tokens, self.num_experts)
@@ -190,28 +185,4 @@ def test_score_functions(self, setup, score_function):
         )
         assert probs.shape == (self.num_tokens, self.num_experts)
         assert routing_map.shape == (self.num_tokens, self.num_experts)
-        assert tokens_per_expert.shape == (self.num_experts,)
-
-    def test_edge_cases(self, setup):
-        # Test empty input
-        empty_logits = torch.randn(0, self.num_experts)
-        with pytest.raises(AssertionError):
-            topk_softmax_with_capacity(empty_logits, topk=self.topk)
-
-        # Test invalid score function
-        logits = torch.randn(self.num_tokens, self.num_experts)
-        with pytest.raises(ValueError):
-            topk_softmax_with_capacity(
-                logits,
-                topk=self.topk,
-                score_function="invalid"
-            )
-
-        # Test invalid drop policy
-        with pytest.raises(ValueError):
-            topk_softmax_with_capacity(
-                logits,
-                topk=self.topk,
-                capacity_factor=1.0,
-                drop_policy="invalid"
-            )
+        assert tokens_per_expert.shape == (self.num_experts,)
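
For readers unfamiliar with the routing helper exercised here, the following is a minimal sketch of a call mirroring the parameters used in this test. Argument names and return values are taken only from the calls shown in the diff; tensor sizes are illustrative, and the patch_utils import is assumed to be needed in this environment just as it is in the updated test file:

import vllm_ascend.patch.worker.patch_common.patch_utils  # noqa: F401  # same patch the diff adds
import torch

from vllm_ascend.ops.moe_dispatcher.moe_utils import topk_softmax_with_capacity

# 16 tokens routed over 8 experts, picking the top-2 experts per token (illustrative sizes).
logits = torch.randn(16, 8)
probs, routing_map, tokens_per_expert, top_indices = topk_softmax_with_capacity(
    logits,
    topk=2,
    score_function="softmax",
)
# Per the assertions above: probs (16, 8), routing_map (16, 8), tokens_per_expert (8,)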

tests/ut/test_token_dispatcher.py

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.

import torch
import pytest
from pytest_mock import MockerFixture
import vllm_ascend.patch.worker.patch_common.patch_utils
from vllm_ascend.utils import adapt_patch  # noqa E402

from vllm_ascend.ops.moe_dispatcher.token_dispatcher import MoeDispatcherConfig, MoEAlltoAllSeqOverLapDispatcher

adapt_patch(True)


class TestMoEAlltoAllSeqOverLapDispatcher:

    @pytest.fixture
    def config(self):
        config = MoeDispatcherConfig()
        config.set_num_local_experts(2)
        config.set_num_moe_experts(4)
        config.set_moe_pad_expert_input_to_capacity(False)
        config.set_moe_expert_capacity_factor(None)
        config.set_moe_router_topk(2)
        config.set_moe_grouped_gemm(False)
        config.set_group_topk(0)
        config.set_num_groups(1)
        config.set_is_fused(False)
        return config.build()

    def mock_ep_group(self, mocker):
        mock_group = mocker.MagicMock()
        mock_group.rank_in_group = 0
        mock_group.world_size = 2
        mock_group.device_group = "mock_group"
        return mock_group

    @pytest.fixture
    def dispatcher(self, config, mocker: MockerFixture):
        mocker.patch("vllm_ascend.ops.moe_dispatcher.token_dispatcher.get_ep_group",
                     return_value=self.mock_ep_group(mocker))
        return MoEAlltoAllSeqOverLapDispatcher(config)

    def test_initialization(self, dispatcher, config):
        assert dispatcher.num_local_experts == config.num_local_experts
        assert dispatcher.num_experts == config.num_moe_experts
        assert dispatcher.local_expert_indices == [0, 1]
        assert dispatcher.ep_rank == 0
        assert dispatcher.ep_size == 2
        assert dispatcher.overlap_stream is not None

    def test_routing(self, dispatcher):
        probs = torch.randn(4, 4)  # 4 tokens, 4 experts
        scores, routing_map = dispatcher.routing(probs)
        assert scores.shape == (4, 4)  # topk=2
        assert routing_map.shape == (4, 4)
