Commit c9bbd0c

add mc2 mask
Signed-off-by: weiguihua2 <weiguihua2@huawei.com>
1 parent fd8b9b6 commit c9bbd0c

File tree

3 files changed: +10 -3 lines changed


vllm_ascend/attention/mla_v1.py

Lines changed: 5 additions & 2 deletions

@@ -221,7 +221,9 @@ def __init__(self,
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
 
     def generate_active_mask(self, actual_seqs_num, batch_size):
-        mc2_mask = torch.zeros(batch_size, dtype=torch.bool, device=current_platform.device_type)
+        mc2_mask = torch.zeros(batch_size,
+                               dtype=torch.bool,
+                               device=current_platform.device_type)
         mc2_mask[:actual_seqs_num].fill_(True)
         return mc2_mask
 
@@ -521,7 +523,8 @@ def build(
                                         num_reqs_pad_size]
         else:
             seq_lens_list = seq_lens.tolist()
-        mc2_mask = self.generate_active_mask(num_actual_tokens, num_reqs)
+        mc2_mask = self.generate_active_mask(
+            num_actual_tokens, num_reqs + num_reqs_pad_size)
 
         decode_metadata = AscendMLADecodeMetadata(
             input_positions=input_positions,
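
For context, generate_active_mask builds a boolean vector over the (possibly padded) decode batch: the first actual_seqs_num slots are True and the padded tail stays False, which is why the build() hunk now sizes the mask to num_reqs + num_reqs_pad_size instead of num_reqs. A minimal standalone sketch of the same idea (CPU tensors and a hypothetical helper name, not the module above):

import torch

def build_active_mask(actual_seqs_num: int, batch_size: int,
                      device: str = "cpu") -> torch.Tensor:
    # batch_size may include padding slots added for graph capture;
    # only the first actual_seqs_num requests are real.
    mask = torch.zeros(batch_size, dtype=torch.bool, device=device)
    mask[:actual_seqs_num].fill_(True)
    return mask

# 3 real requests padded up to a batch of 5
print(build_active_mask(3, 5))   # tensor([ True,  True,  True, False, False])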

vllm_ascend/ops/fused_moe.py

Lines changed: 2 additions & 0 deletions

@@ -1188,9 +1188,11 @@ def forward(self,
             tp_rank = get_tensor_model_parallel_rank()
             hidden_states = chunk_hidden_states[tp_rank]
             router_logits = chunk_router_logits[tp_rank]
+
             if mc2_mask is not None:
                 chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)
                 mc2_mask = chunk_mc2_mask[tp_rank]
+
         if self.dp_size > 1 and fused_moe_state == FusedMoEState.AllGather:
             # NOTE: When in torchair graph, it has been padded in model_runner_v1
             if not self.torchair_graph_enabled or is_prefill:
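
Beyond the two added blank lines, this existing block splits mc2_mask across tensor-parallel ranks the same way hidden_states and router_logits are already chunked, so each rank keeps only the slice of the mask that matches its share of the batch. A small sketch of that chunking, assuming a toy mask of 8 requests and tp_size = 4 (the variable names are illustrative, not the module's API):

import torch

tp_size = 4
mc2_mask = torch.tensor([True] * 5 + [False] * 3)   # 5 active, 3 padded requests

# torch.tensor_split mirrors how hidden_states / router_logits are chunked
chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)
for tp_rank, chunk in enumerate(chunk_mc2_mask):
    print(tp_rank, chunk.tolist())
# 0 [True, True]
# 1 [True, True]
# 2 [True, False]
# 3 [False, False]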

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 3 additions & 1 deletion

@@ -803,6 +803,7 @@ def apply(
         topk_weights = topk_weights.to(x.dtype)
 
         if fused_moe_state == FusedMoEState.MC2:
+            mc2_mask = kwargs.get("mc2_mask", None)
             return fused_experts_with_mc2(
                 hidden_states=x,
                 w1=layer.w13_weight,
@@ -819,7 +820,8 @@ def apply(
                 shared_experts=shared_experts,
                 is_torchair=self.torchair_graph_enabled,
                 quantized_x_for_share=shared_gate_up,
-                dynamic_scale_for_share=shared_dequant_scale)
+                dynamic_scale_for_share=shared_dequant_scale,
+                mc2_mask=mc2_mask)
         elif fused_moe_state == FusedMoEState.AllGather:
             return fused_experts(hidden_states=x,
                                  w1=layer.w13_weight,
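
On the quantized path the mask is simply threaded through: it is read from kwargs (defaulting to None when the caller does not pass one) and forwarded to fused_experts_with_mc2. A hedged sketch of that pass-through pattern, with placeholder stub functions standing in for the real kernels:

from typing import Optional
import torch

def fused_experts_with_mc2_stub(hidden_states: torch.Tensor,
                                mc2_mask: Optional[torch.Tensor] = None,
                                **_kwargs) -> torch.Tensor:
    # Placeholder for the real MC2 kernel: only shows that the mask
    # travels with the call and can gate which rows do real work.
    if mc2_mask is not None:
        return hidden_states * mc2_mask.unsqueeze(-1)
    return hidden_states

def apply_stub(x: torch.Tensor, **kwargs) -> torch.Tensor:
    # Mirrors the diff: pull the optional mask out of kwargs and forward it.
    mc2_mask = kwargs.get("mc2_mask", None)
    return fused_experts_with_mc2_stub(hidden_states=x, mc2_mask=mc2_mask)

x = torch.ones(4, 2)
mask = torch.tensor([True, True, False, False])
print(apply_stub(x, mc2_mask=mask))   # rows 2 and 3 are zeroed out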
