Commit 88c31da

committed: add mc2 mask
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
1 parent 5559443 commit 88c31da

File tree: 4 files changed (+47 −1 lines changed)

vllm_ascend/attention/mla_v1.py

Lines changed: 11 additions & 0 deletions
@@ -11,6 +11,7 @@
 from vllm.config import get_current_vllm_config
 from vllm.model_executor.layers.linear import (LinearBase,
                                                UnquantizedLinearMethod)
+from vllm.platforms import current_platform
 from vllm.utils import cdiv, round_down

 from vllm_ascend import envs
@@ -94,6 +95,7 @@ class AscendMLADecodeMetadata:
     seq_lens_list: list[int]
     actual_seq_q_lens: Optional[list[int]] = None
     attn_mask: Optional[torch.Tensor] = None
+    mc2_mask: Optional[torch.Tensor] = None


 @dataclass
@@ -206,6 +208,11 @@ def __init__(self,
         ascend_config = get_ascend_config()
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled

+    def generate_active_mask(self, actual_seqs_num, batch_size):
+        mc2_mask = torch.zeros(batch_size, dtype=torch.bool, device=current_platform.device_type)
+        mc2_mask[:actual_seqs_num].fill_(True)
+        return mc2_mask
+
     def reorder_batch(self, input_batch: "InputBatch",
                       scheduler_output: "SchedulerOutput") -> bool:
         # We now want to reorder the batch so that the "decode" requests are at
@@ -336,6 +343,7 @@ def build_torchair_graph_dummy(
         else:
             attn_state = AscendAttentionState.DecodeOnly
             num_decode_tokens = 1
+        mc2_mask = self.generate_active_mask(num_actual_tokens, num_reqs)
         decode_metadata = AscendMLADecodeMetadata(
             input_positions=input_positions,
             block_table=block_table,
@@ -344,6 +352,7 @@ def build_torchair_graph_dummy(
             max_seq_lens=1,
             attn_mask=self.runner.spec_attn_mask,
             actual_seq_q_lens=self.runner.actual_seq_q_lens[:num_reqs],
+            mc2_mask=mc2_mask,
         )
         return self.metadata_cls(  # type: ignore
             num_input_tokens=num_actual_tokens,
@@ -500,6 +509,7 @@ def build(
                                       num_reqs_pad_size]
         else:
             seq_lens_list = seq_lens.tolist()
+        mc2_mask = self.generate_active_mask(num_actual_tokens, num_reqs)

         decode_metadata = AscendMLADecodeMetadata(
             input_positions=input_positions,
@@ -509,6 +519,7 @@ def build(
             max_seq_lens=max_seq_lens,
             attn_mask=self.runner.spec_attn_mask,
             actual_seq_q_lens=actual_seq_q_lens,
+            mc2_mask=mc2_mask,
        )

         return self.metadata_cls(  # type: ignore
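
The helper below is a standalone sketch of the generate_active_mask logic added above. It uses a plain CPU tensor (the real code allocates on current_platform.device_type, i.e. the NPU), and the sample sizes are made up, so it can be run anywhere to see what the mask looks like for a padded decode batch.

    import torch

    def generate_active_mask(actual_seqs_num: int, batch_size: int,
                             device: str = "cpu") -> torch.Tensor:
        # The first `actual_seqs_num` slots are real decode requests; the
        # remaining slots of the (padded) batch stay inactive.
        mc2_mask = torch.zeros(batch_size, dtype=torch.bool, device=device)
        mc2_mask[:actual_seqs_num].fill_(True)
        return mc2_mask

    # 3 real requests in a batch padded to 8 slots.
    print(generate_active_mask(3, 8))
    # tensor([ True,  True,  True, False, False, False, False, False])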

vllm_ascend/ops/fused_moe.py

Lines changed: 22 additions & 1 deletion
@@ -122,6 +122,7 @@ def fused_experts_with_mc2(
     moe_all_to_all_group_name: Optional[str] = None,
     shared_experts: Optional[Any] = None,
     is_torchair: bool = False,
+    mc2_mask: Optional[torch.Tensor] = None,
 ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
     quant_mode = 0
     ep_group = get_ep_group()
@@ -137,6 +138,9 @@ def fused_experts_with_mc2(
     # NOTE: Currently, when in A3 or in torchair graph, we need to pass in some extra param into dispatch & combine
     need_extra_args = (get_ascend_soc_version() == AscendSocVersion.A3
                        or is_torchair)
+
+    # NOTE: Currently, when in A3, we need to pass in some extra param into dispatch & combine
+    a3_need_extra_args = get_ascend_soc_version() == AscendSocVersion.A3

     moe_expert_num = len(expert_map)
     kwargs_mc2 = {
@@ -161,6 +165,10 @@ def fused_experts_with_mc2(
             "tp_world_size": 1,
             "tp_rank_id": 0,
         })
+    if a3_need_extra_args:
+        stage1_kwargs.update({
+            "x_active_mask": mc2_mask,
+        })

     kwargs_mc2.update(stage1_kwargs)

@@ -230,6 +238,10 @@ def fused_experts_with_mc2(
             "tp_world_size": 1,
             "tp_rank_id": 0,
         })
+    if a3_need_extra_args:
+        stage3_kwargs.update({
+            "x_active_mask": mc2_mask,
+        })
     kwargs_mc2.update(stage3_kwargs)

     hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2)
@@ -944,6 +956,7 @@ def apply(

         fused_moe_state = get_forward_context().fused_moe_state
         if fused_moe_state == FusedMoEState.MC2:
+            mc2_mask = kwargs.get("mc2_mask", None)
             return fused_experts_with_mc2(
                 hidden_states=x,
                 w1=layer.w13_weight,
@@ -954,7 +967,8 @@ def apply(
                 expert_map=expert_map,
                 moe_all_to_all_group_name=self.moe_all_to_all_group_name,
                 shared_experts=shared_experts,
-                is_torchair=self.torchair_graph_enabled)
+                is_torchair=self.torchair_graph_enabled,
+                mc2_mask=mc2_mask)
         elif fused_moe_state == FusedMoEState.AllGather:
             return fused_experts(hidden_states=x,
                                  w1=layer.w13_weight,
@@ -1154,6 +1168,9 @@ def forward(self,
         if shared_experts:
             if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
                 shared_hidden_states = shared_experts(hidden_states)
+
+        attn_metadata = get_forward_context().attn_metadata
+        mc2_mask = attn_metadata.decode.mc2_mask if attn_metadata is not None and attn_metadata.decode is not None else None

         tp_size = get_tensor_model_parallel_world_size()
         if tp_size > 1 and fused_moe_state != FusedMoEState.AllGather:
@@ -1171,6 +1188,9 @@ def forward(self,
             tp_rank = get_tensor_model_parallel_rank()
             hidden_states = chunk_hidden_states[tp_rank]
             router_logits = chunk_router_logits[tp_rank]
+            if mc2_mask is not None:
+                chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)
+                mc2_mask = chunk_mc2_mask[tp_rank]
         if self.dp_size > 1 and fused_moe_state == FusedMoEState.AllGather:
             # NOTE: When in torchair graph, it has been padded in model_runner_v1
             if not self.torchair_graph_enabled or is_prefill:
@@ -1209,6 +1229,7 @@ def forward(self,
             and self.enable_multistream_moe and not is_prefill else None,
             quantized_x_for_share=quantized_x_for_share,
             dynamic_scale_for_share=dynamic_scale_for_share,
+            mc2_mask=mc2_mask,
         )

         if shared_experts:
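
As a rough sketch of the tensor-parallel handling added in forward(): when the hidden states are chunked across TP ranks, the mc2_mask is split the same way so each rank only flags the batch slots it actually processes. The sizes below are hypothetical, and the snippet uses plain torch on CPU rather than the real distributed setup.

    import torch

    tp_size = 4                       # hypothetical TP world size
    mc2_mask = torch.zeros(8, dtype=torch.bool)
    mc2_mask[:5] = True               # 5 active requests, 3 padding slots

    # Mirror of the change in forward(): split the mask the same way the
    # hidden states are chunked, then keep only this rank's slice.
    chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)
    for tp_rank in range(tp_size):
        print(tp_rank, chunk_mc2_mask[tp_rank])
    # 0 tensor([True, True]) ... 3 tensor([False, False])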

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 12 additions & 0 deletions
@@ -215,6 +215,7 @@ def fused_experts_with_mc2(
     w2_scale_bias: torch.Tensor = None,
     quantized_x_for_share: Optional[Any] = None,
     dynamic_scale_for_share: Optional[Any] = None,
+    mc2_mask: Optional[torch.Tensor] = None,
 ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
     if log2phy:
         topk_ids = log2phy[topk_ids]
@@ -232,6 +233,9 @@ def fused_experts_with_mc2(
     # NOTE: Currently, when in A3 or in torchair graph, we need to pass in some extra param into dispatch & combine
     need_extra_args = (get_ascend_soc_version() == AscendSocVersion.A3
                        or is_torchair)
+
+    # NOTE: Currently, when in A3, we need to pass in some extra param into dispatch & combine
+    a3_need_extra_args = get_ascend_soc_version() == AscendSocVersion.A3

     if (expert_map is not None):
         moe_expert_num = len(expert_map) + global_redundant_expert_num
@@ -260,6 +264,10 @@ def fused_experts_with_mc2(
             "tp_world_size": 1,
             "tp_rank_id": 0,
         })
+    if a3_need_extra_args:
+        stage1_kwargs.update({
+            "x_active_mask": mc2_mask,
+        })
     kwargs_mc2.update(stage1_kwargs)

     output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2)
@@ -310,6 +318,10 @@ def fused_experts_with_mc2(
             "tp_world_size": 1,
             "tp_rank_id": 0,
         })
+    if a3_need_extra_args:
+        stage3_kwargs.update({
+            "x_active_mask": mc2_mask,
+        })
     kwargs_mc2.update(stage3_kwargs)

     hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2)
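
A minimal sketch of how the extra dispatch/combine arguments are assembled, with the SoC check reduced to a boolean flag. In the patch, is_a3_soc corresponds to get_ascend_soc_version() == AscendSocVersion.A3, and the resulting kwargs are merged into the arguments of torch_npu.npu_moe_distribute_dispatch / npu_moe_distribute_combine, which are not called here; build_stage_kwargs is a hypothetical helper, not part of the patch.

    from typing import Optional

    import torch

    def build_stage_kwargs(mc2_mask: Optional[torch.Tensor],
                           is_a3_soc: bool, is_torchair: bool) -> dict:
        # The "tp_*" fields are needed on A3 or in torchair graph mode,
        # while "x_active_mask" is an A3-only extra carrying the mc2 mask.
        kwargs = {}
        if is_a3_soc or is_torchair:
            kwargs.update({"tp_world_size": 1, "tp_rank_id": 0})
        if is_a3_soc:
            kwargs.update({"x_active_mask": mc2_mask})
        return kwargs

    mask = torch.tensor([True, True, False, False])
    print(build_stage_kwargs(mask, is_a3_soc=True, is_torchair=False))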

vllm_ascend/worker/model_runner_v1.py

Lines changed: 2 additions & 0 deletions
@@ -1666,6 +1666,8 @@ def _dummy_run(
                     attn_metadata.decode.block_table)
                 torch._dynamo.mark_static(
                     attn_metadata.decode.input_positions)
+                torch._dynamo.mark_static(
+                    attn_metadata.decode.mc2_mask)
                 torch._dynamo.mark_static(attn_metadata.slot_mapping)
                 for kv in self.kv_caches:
                     assert isinstance(
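
A small illustrative sketch of the mark_static call, using a toy CPU tensor instead of the real AscendMLADecodeMetadata field; presumably the intent is that dynamo specializes on the mask rather than treating it as a dynamic input during the dummy-run graph capture.

    import torch

    # Toy stand-in for attn_metadata.decode.mc2_mask (an NPU tensor in the
    # real runner); the shape here is hypothetical.
    mc2_mask = torch.zeros(8, dtype=torch.bool)

    # Mark the tensor's dimensions as static so torch.compile/dynamo does
    # not treat them as dynamic shapes when the graph is captured.
    torch._dynamo.mark_static(mc2_mask)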
