from vllm.config import get_current_vllm_config
from vllm.model_executor.layers.linear import (LinearBase,
                                               UnquantizedLinearMethod)
+from vllm.platforms import current_platform
from vllm.utils import cdiv, round_down

from vllm_ascend import envs
@@ -81,6 +82,8 @@ class ChunkedContextMetadata:
    max_query_len: int
    max_seq_lens: int
    chunked_context: Optional[ChunkedContextMetadata] = None
+    sin: torch.Tensor = None
+    cos: torch.Tensor = None


@dataclass
@@ -94,6 +97,9 @@ class AscendMLADecodeMetadata:
    seq_lens_list: list[int]
    actual_seq_q_lens: Optional[list[int]] = None
    attn_mask: Optional[torch.Tensor] = None
+    sin: torch.Tensor = None
+    cos: torch.Tensor = None
+    mc2_mask: Optional[torch.Tensor] = None


@dataclass
@@ -205,6 +211,16 @@ def __init__(self,
        )
        ascend_config = get_ascend_config()
        self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
+        self.rope_dim = self.runner.model_config.hf_text_config.qk_rope_head_dim
+        self.cos_cache = None
+        self.sin_cache = None
+
+    def generate_activate_mask(self, actual_seqs_num, batch_size):
+        mc2_mask = torch.zeros(batch_size,
+                               dtype=torch.bool,
+                               device=current_platform.device_type)
+        mc2_mask[:actual_seqs_num].fill_(True)
+        return mc2_mask

    def reorder_batch(self, input_batch: "InputBatch",
                      scheduler_output: "SchedulerOutput") -> bool:
@@ -317,7 +333,7 @@ def build_torchair_graph_dummy(
            num_reqs, block_table)
        num_tokens = num_reqs * self.runner.decode_token_per_req
        seq_lens = torch.zeros(num_reqs, dtype=torch.int32, device=device)
-        seq_lens_list = seq_lens.tolist()
+        seq_lens_list = [0] * num_reqs
        input_positions = torch.zeros(num_tokens,
                                      dtype=torch.int32,
                                      device=device).long()
@@ -336,6 +352,19 @@ def build_torchair_graph_dummy(
        else:
            attn_state = AscendAttentionState.DecodeOnly
            num_decode_tokens = 1
+        sin = torch.ones(num_reqs,
+                         1,
+                         1,
+                         self.rope_dim,
+                         dtype=self.runner.dtype,
+                         device=device)
+        cos = torch.ones(num_reqs,
+                         1,
+                         1,
+                         self.rope_dim,
+                         dtype=self.runner.dtype,
+                         device=device)
+        mc2_mask = self.generate_activate_mask(num_actual_tokens, num_reqs)
        decode_metadata = AscendMLADecodeMetadata(
            input_positions=input_positions,
            block_table=block_table,
@@ -344,7 +373,9 @@ def build_torchair_graph_dummy(
            max_seq_lens=1,
            attn_mask=self.runner.spec_attn_mask,
            actual_seq_q_lens=self.runner.actual_seq_q_lens[:num_reqs],
-        )
+            sin=sin,
+            cos=cos,
+            mc2_mask=mc2_mask)
        return self.metadata_cls(  # type: ignore
            num_input_tokens=num_actual_tokens,
            num_actual_tokens=num_actual_tokens,
@@ -396,6 +427,16 @@ def build(
        max_query_len = query_lens.max().item()
        max_seq_lens = seq_lens.max().item()
        query_start_loc = common_attn_metadata.query_start_loc
+        if self.cos_cache is None:
+            self.cos_cache = self.runner.get_model(
+            ).model.layers[0].self_attn.rotary_emb.cos_cached
+            self.sin_cache = self.runner.get_model(
+            ).model.layers[0].self_attn.rotary_emb.sin_cached
+            if self.cos_cache.dtype != self.runner.dtype:  # type: ignore
+                self.cos_cache = self.cos_cache.to(  # type: ignore
+                    self.runner.dtype)  # type: ignore
+                self.sin_cache = self.sin_cache.to(  # type: ignore
+                    self.runner.dtype)  # type: ignore

        prefill_metadata = None
        chunked_context_metadata = None
@@ -442,24 +483,32 @@ def build(
                chunk_seq_lens=chunk_seq_lens,
                workspace=self.chunked_prefill_workspace,
            )
-
+            prefill_input_positions = input_positions[tokens_start:]
+            cos = self.cos_cache[
+                prefill_input_positions].unsqueeze(  # type: ignore
+                    1).unsqueeze(2)
+            sin = self.sin_cache[
+                prefill_input_positions].unsqueeze(  # type: ignore
+                    1).unsqueeze(2)
            prefill_metadata = AscendMLAPrefillMetadata(
                attn_mask=self.runner.attn_mask,
                query_lens=query_lens[tokens_start:],
                seq_lens=seq_lens,
                context_lens=seq_lens[tokens_start:],
-                input_positions=input_positions[tokens_start:],
+                input_positions=prefill_input_positions,
                block_table=block_table[reqs_start:, ...],
                max_query_len=max_query_len,
                max_seq_lens=max_seq_lens,
                query_start_loc=prefill_query_start_loc,
                chunked_context=chunked_context_metadata,
+                sin=sin,
+                cos=cos,
            )

        decode_metadata = None
        use_torchair_graph = num_token_pad_size != -1
        if self._num_decodes > 0:
-            actual_seq_q_lens = None
+            actual_seq_q_lens = query_start_loc[1:].tolist()
            max_seq_lens = seq_lens[:self._num_decodes].max().item()
            seq_lens = seq_lens[:self._num_decode_tokens]
            input_positions = input_positions[:self._num_decode_tokens]
@@ -498,8 +547,17 @@ def build(
                    actual_seq_q_lens = query_start_loc[1:].tolist(
                    ) + self.runner.actual_seq_q_lens[num_reqs:num_reqs +
                                                      num_reqs_pad_size]
+                cos = self.cos_cache[
+                    input_positions].unsqueeze(  # type: ignore
+                        1).unsqueeze(2)
+                sin = self.sin_cache[
+                    input_positions].unsqueeze(  # type: ignore
+                        1).unsqueeze(2)
            else:
                seq_lens_list = seq_lens.tolist()
+                cos, sin = None, None
+            mc2_mask = self.generate_activate_mask(
+                num_actual_tokens, num_reqs + num_reqs_pad_size)

            decode_metadata = AscendMLADecodeMetadata(
                input_positions=input_positions,
@@ -509,7 +567,9 @@ def build(
                max_seq_lens=max_seq_lens,
                attn_mask=self.runner.spec_attn_mask,
                actual_seq_q_lens=actual_seq_q_lens,
-            )
+                sin=sin,
+                cos=cos,
+                mc2_mask=mc2_mask)

        return self.metadata_cls(  # type: ignore
            num_actual_tokens=num_actual_tokens,
@@ -968,11 +1028,13 @@ def _forward_decode(
                                 self.qk_rope_head_dim)
            input_layout = "BNSD"

-            # TorchAir's shape is [bs, num_heads_per_rank, q_seq_len, dim]
            if attn_metadata.attn_state == AscendAttentionState.SpecDecoding:
                assert num_tokens % self.spec_token_num == 0
+                if self.enable_kv_nz:
+                    input_layout = "TND_NTD"
+                else:
+                    input_layout = "TND"
                # [bs * q_seq_len, num_heads_per_rank, dim]
-                input_layout = "TND"
                q_nope = q_nope.view(num_tokens, self.num_heads, -1)
                q_pe = q_pe.view(num_tokens, self.num_heads, -1)
                sparse_mode = 3
@@ -1101,15 +1163,8 @@ def forward(
            decode_k_nope = None
            assert attn_metadata.decode is not None
            if self.running_in_graph:
-                seq_len = self.rotary_emb.max_position_embeddings * self.rotary_emb.scaling_factor
-                cos = self.rotary_emb.cos_cached[:seq_len].to(
-                    dtype=decode_hs_or_q_c.dtype)
-                sin = self.rotary_emb.sin_cached[:seq_len].to(
-                    dtype=decode_hs_or_q_c.dtype)
-                cos = cos[attn_metadata.decode.input_positions]
-                sin = sin[attn_metadata.decode.input_positions]
-                cos = cos[:, None, None, :]
-                sin = sin[:, None, None, :]
+                cos = attn_metadata.decode.cos
+                sin = attn_metadata.decode.sin
                # Without explicitly controlling the order, IndexByTensor operations
                # would be placed after `matmul W_KV_T` hindering the overlapping of
                # KvRmsNormRopeCache and SingleRope.
@@ -1144,15 +1199,8 @@ def forward(
            prefill_q_nope = prefill_q[..., :self.qk_nope_head_dim]
            if self.torchair_graph_enabled:
                num_tokens = prefill_hs_or_q_c.shape[0]
-                seq_len = self.rotary_emb.max_position_embeddings * self.rotary_emb.scaling_factor
-                cos = self.rotary_emb.cos_cached[:seq_len].to(
-                    dtype=prefill_q_pe.dtype)
-                sin = self.rotary_emb.sin_cached[:seq_len].to(
-                    dtype=prefill_q_pe.dtype)
-                cos = cos[attn_metadata.prefill.input_positions]
-                sin = sin[attn_metadata.prefill.input_positions]
-                cos = cos[:, None, None, :]
-                sin = sin[:, None, None, :]
+                cos = attn_metadata.prefill.cos
+                sin = attn_metadata.prefill.sin

            prefill_q_pe = self.rope_single(prefill_q_pe, cos, sin)
            prefill_k_pe, prefill_k_nope = self.exec_kv_prefill(
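
For reference, a minimal self-contained sketch of the two patterns this patch relies on: gathering per-token cos/sin from a precomputed rotary table and reshaping it to [num_tokens, 1, 1, rope_dim] (the layout carried in the new `sin`/`cos` metadata fields), and building a boolean activation mask over a padded batch in the spirit of `generate_activate_mask`. The table sizes, example positions, and the `make_mc2_mask` helper name are illustrative assumptions, not taken from vllm-ascend.

import torch

# Illustrative sizes only; real values come from the model config.
max_position, rope_dim = 4096, 64
cos_cached = torch.randn(max_position, rope_dim)
sin_cached = torch.randn(max_position, rope_dim)

# Gather per-token rotary values by position and expand to
# [num_tokens, 1, 1, rope_dim], as the metadata builder above does.
input_positions = torch.tensor([0, 1, 2, 5, 7])
cos = cos_cached[input_positions].unsqueeze(1).unsqueeze(2)
sin = sin_cached[input_positions].unsqueeze(1).unsqueeze(2)
assert cos.shape == (len(input_positions), 1, 1, rope_dim)

# Hypothetical helper mirroring generate_activate_mask(): mark the first
# `actual_seqs_num` slots of a padded batch as active, the rest as padding.
def make_mc2_mask(actual_seqs_num: int, batch_size: int) -> torch.Tensor:
    mask = torch.zeros(batch_size, dtype=torch.bool)
    mask[:actual_seqs_num] = True
    return mask

print(make_mc2_mask(3, 8))  # tensor([True, True, True, False, False, False, False, False])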