
Commit 3b99491

NNUCJ and zzzzwwjj authored
add chunk mc2 for prefill (#1703)
### What this PR does / why we need it?

### Does this PR introduce _any_ user-facing change?

### How was this patch tested?

```shell
export HCCL_IF_IP=xxxxxx
export GLOO_SOCKET_IFNAME=enp48s3u1u1
export TP_SOCKET_IFNAME=enp48s3u1u1
export HCCL_SOCKET_IFNAME=enp48s3u1u1
# export HCCL_BUFFSIZE=2048
export VLLM_USE_V1=1
export VLLM_ASCEND_ENABLE_CHUNK_MC2=1
export VLLM_ASCEND_FUSED_MOE_MC2_CHUNK_SIZE=256
export HCCL_BUFFSIZE=2048
# export HCCL_BUFFSIZE=1024
export ASCEND_LAUNCH_BLOCKING=0
export PYTORCH_NPU_ALLOC_CONF="expandable_segments:True"
rm -rf ./.torchair_cache/
rm -rf ./dynamo_*
model_path="model_path"
python data_parallel.py \
  --model=${model_path} \
  --dp-size=2 \
  --tp-size=8 \
  --enforce-eager \
  --trust-remote-code \
  --node-size=1 \
  --node-rank=0 \
```

---------

Signed-off-by: NNUCJ <616151263@qq.com>
Signed-off-by: zzzzwwjj <1183291235@qq.com>
Co-authored-by: zzzzwwjj <1183291235@qq.com>
Co-authored-by: zzzzwwjj <34335947+zzzzwwjj@users.noreply.github.com>
1 parent 2351977 commit 3b99491

File tree

6 files changed: +251 −88 lines changed


vllm_ascend/ascend_forward_context.py

Lines changed: 56 additions & 12 deletions

```diff
@@ -1,23 +1,31 @@
+import math
 from contextlib import contextmanager
 from enum import Enum
 from typing import Any, Optional
 
 import torch
 from vllm.config import VllmConfig
-from vllm.distributed import get_dp_group
+from vllm.distributed import get_dp_group, get_tp_group
 from vllm.forward_context import get_forward_context, set_forward_context
+from vllm.platforms import current_platform
+
+import vllm_ascend.envs as envs
 
 
 class FusedMoEState(Enum):
     AllGather = 0
     All2All = 1
     MC2 = 2
+    MC2_PREFILL = 3
 
 
 # TODO(zzzzwwjj): add soc_version to choose branch
 def get_fused_moe_state(ep_size: int, with_prefill: bool):
+    enable_chunk_mc2 = envs.VLLM_ASCEND_ENABLE_CHUNK_MC2
     if ep_size == 1:
         return FusedMoEState.AllGather
+    elif ep_size >= 16 and with_prefill and enable_chunk_mc2:
+        return FusedMoEState.MC2_PREFILL
     # NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph.
     elif ep_size < 16 or with_prefill:
         return FusedMoEState.All2All
@@ -33,7 +41,8 @@ def set_ascend_forward_context(
         num_tokens: Optional[int] = None,
         num_tokens_across_dp: Optional[torch.Tensor] = None,
         with_prefill: bool = True,
-        in_profile_run: bool = False):
+        in_profile_run: bool = False,
+        num_actual_tokens: Optional[int] = None):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
     We add some additional param into forward_context.
@@ -45,7 +54,6 @@ def set_ascend_forward_context(
             num_tokens_across_dp=num_tokens_across_dp):
         forward_context = get_forward_context()
         forward_context.with_prefill = with_prefill
-
         ep_size = torch.distributed.get_world_size(
         ) if vllm_config.parallel_config.enable_expert_parallel else 1
 
@@ -59,19 +67,55 @@ def set_ascend_forward_context(
         # due to multiple warmups before actual capturing
         forward_context.capturing = False
 
+        if num_tokens is None and attn_metadata is not None:
+            if hasattr(attn_metadata, 'num_actual_tokens'):
+                # for v1 engine
+                num_tokens = attn_metadata.num_actual_tokens
+            else:
+                # for v0 engine
+                num_tokens = attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
+
+        if num_actual_tokens is None:
+            num_actual_tokens = num_tokens
+
         dp_world_size = get_dp_group().world_size
         if dp_world_size > 1 and forward_context.dp_metadata is not None:
-            forward_context.max_tokens_across_dp = forward_context.dp_metadata.max_tokens_across_dp_cpu.item(
+            max_tokens_across_dp = forward_context.dp_metadata.max_tokens_across_dp_cpu.item(
             )
-        elif num_tokens is not None:
-            forward_context.max_tokens_across_dp = num_tokens
-        elif attn_metadata is not None:
-            if hasattr(attn_metadata, 'num_actual_tokens'):
-                forward_context.max_tokens_across_dp = attn_metadata.num_actual_tokens
-            else:
-                forward_context.max_tokens_across_dp = attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
         else:
-            forward_context.max_tokens_across_dp = None
+            max_tokens_across_dp = num_tokens
+
+        forward_context.max_tokens_across_dp = max_tokens_across_dp
+
+        if num_tokens is not None:
+            tp_world_size = get_tp_group().world_size
+            world_size = torch.distributed.get_world_size()
+            # NOTE: token num which need to pad to when mc2
+            forward_context.padded_num_tokens = math.ceil(
+                max_tokens_across_dp / tp_world_size) * tp_world_size
+            # NOTE: mc2 op's param `global_bs`, add `world_size` to make `global_bs` absolutely larger than actual global_bs.
+            forward_context.global_bs = math.ceil(
+                max_tokens_across_dp / tp_world_size) * world_size
+
+            if fused_moe_state == FusedMoEState.MC2_PREFILL:
+                chunk_size = envs.VLLM_ASCEND_FUSED_MOE_MC2_CHUNK_SIZE
+                forward_context.max_num_chunks = math.ceil(
+                    math.ceil(max_tokens_across_dp / tp_world_size) /
+                    chunk_size)
+
+                forward_context.global_bs = math.ceil(
+                    math.ceil(max_tokens_across_dp / tp_world_size) /
+                    forward_context.max_num_chunks) * world_size
+
+                min_num_tokens = forward_context.max_num_chunks * tp_world_size
+                forward_context.padded_num_tokens = math.ceil(
+                    max_tokens_across_dp / min_num_tokens) * min_num_tokens
+
+            mc2_mask = torch.zeros(forward_context.padded_num_tokens,
+                                   dtype=torch.bool,
+                                   device=current_platform.device_type)
+            mc2_mask[:num_actual_tokens] = True
+            forward_context.mc2_mask = mc2_mask
 
         try:
             yield
```
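
For reference, the chunk and padding arithmetic introduced above can be reproduced standalone. The sketch below is illustrative only: the token count, world sizes, and chunk size are made-up values, not taken from this PR.

```python
import math

# Illustrative values (not from the PR): max tokens over DP ranks, TP/world
# sizes, and the VLLM_ASCEND_FUSED_MOE_MC2_CHUNK_SIZE setting.
max_tokens_across_dp = 1000
tp_world_size = 8
world_size = 16          # e.g. dp-size=2 * tp-size=8
chunk_size = 256

per_rank_tokens = math.ceil(max_tokens_across_dp / tp_world_size)       # 125
# MC2_PREFILL path: number of chunks each rank processes.
max_num_chunks = math.ceil(per_rank_tokens / chunk_size)                # 1
# `global_bs` is kept strictly larger than the actual global batch size.
global_bs = math.ceil(per_rank_tokens / max_num_chunks) * world_size    # 2000
# Tokens are padded to a multiple of max_num_chunks * tp_world_size.
min_num_tokens = max_num_chunks * tp_world_size
padded_num_tokens = math.ceil(
    max_tokens_across_dp / min_num_tokens) * min_num_tokens             # 1000

print(max_num_chunks, global_bs, padded_num_tokens)
```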

vllm_ascend/attention/mla_v1.py

Lines changed: 7 additions & 17 deletions

```diff
@@ -11,7 +11,6 @@
 from vllm.config import get_current_vllm_config
 from vllm.model_executor.layers.linear import (LinearBase,
                                                UnquantizedLinearMethod)
-from vllm.platforms import current_platform
 from vllm.utils import cdiv, round_down
 
 from vllm_ascend import envs
@@ -71,6 +70,7 @@ class ChunkedContextMetadata:
         max_seq_lens: list[int]
         workspace: torch.Tensor
         chunk_seq_lens: torch.Tensor
+        chunk_seq_lens_npu: torch.Tensor
 
     attn_mask: torch.Tensor
     query_lens: list[int]
@@ -99,7 +99,6 @@ class AscendMLADecodeMetadata:
     attn_mask: Optional[torch.Tensor] = None
     sin: torch.Tensor = None
     cos: torch.Tensor = None
-    mc2_mask: Optional[torch.Tensor] = None
 
 
 @dataclass
@@ -215,13 +214,6 @@ def __init__(self,
         self.cos_cache = None
         self.sin_cache = None
 
-    def generate_activate_mask(self, actual_seqs_num, batch_size):
-        mc2_mask = torch.zeros(batch_size,
-                               dtype=torch.bool,
-                               device=current_platform.device_type)
-        mc2_mask[:actual_seqs_num].fill_(True)
-        return mc2_mask
-
     def reorder_batch(self, input_batch: "InputBatch",
                       scheduler_output: "SchedulerOutput") -> bool:
         # We now want to reorder the batch so that the "decode" requests are at
@@ -364,7 +356,6 @@ def build_torchair_graph_dummy(
                           self.rope_dim,
                           dtype=self.runner.dtype,
                           device=device)
-        mc2_mask = self.generate_activate_mask(num_actual_tokens, num_reqs)
         decode_metadata = AscendMLADecodeMetadata(
             input_positions=input_positions,
             block_table=block_table,
@@ -374,8 +365,7 @@ def build_torchair_graph_dummy(
             attn_mask=self.runner.spec_attn_mask,
             actual_seq_q_lens=self.runner.actual_seq_q_lens[:num_reqs],
             sin=sin,
-            cos=cos,
-            mc2_mask=mc2_mask)
+            cos=cos)
         return self.metadata_cls(  # type: ignore
             num_input_tokens=num_actual_tokens,
             num_actual_tokens=num_actual_tokens,
@@ -481,6 +471,7 @@ def build(
                 seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
                 max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
                 chunk_seq_lens=chunk_seq_lens,
+                chunk_seq_lens_npu=chunk_seq_lens.npu(),
                 workspace=self.chunked_prefill_workspace,
             )
         prefill_input_positions = input_positions[tokens_start:]
@@ -554,8 +545,6 @@ def build(
                 1).unsqueeze(2)
             sin = self.sin_cache[input_positions].unsqueeze(  # type: ignore
                 1).unsqueeze(2)
-            mc2_mask = self.generate_activate_mask(
-                num_actual_tokens, num_reqs + num_reqs_pad_size)
 
             decode_metadata = AscendMLADecodeMetadata(
                 input_positions=input_positions,
@@ -566,8 +555,7 @@ def build(
                 attn_mask=self.runner.spec_attn_mask,
                 actual_seq_q_lens=actual_seq_q_lens,
                 sin=sin,
-                cos=cos,
-                mc2_mask=mc2_mask)
+                cos=cos)
 
         return self.metadata_cls(  # type: ignore
             num_actual_tokens=num_actual_tokens,
@@ -749,6 +737,8 @@ def _compute_prefill_context(
             toks = prefill_metadata.chunked_context.seq_tot[i]
 
             seq_len2 = prefill_metadata.chunked_context.chunk_seq_lens[i]
+            seq_len2_npu = prefill_metadata.chunked_context.chunk_seq_lens_npu[
+                i]
             seq_len = torch.stack([seq_len1, seq_len2])
             kv_c_normed = torch.empty(toks,
                                       num_heads,
@@ -765,7 +755,7 @@ def _compute_prefill_context(
                 cache_kv_c,
                 cache_k_pe,
                 prefill_metadata.block_table,
-                seq_len2.to(query.device),
+                seq_len2_npu,
                 seq_starts=prefill_metadata.chunked_context.starts[i],
                 key=kv_c_normed,
                 value=k_pe,
```
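
The `chunk_seq_lens_npu` field added above lets the chunked-prefill loop index a tensor that already lives on the device, instead of calling `.to(query.device)` on every iteration. A minimal sketch of that pattern, with hypothetical tensors and a generic device fallback (not the PR's code):

```python
import torch

chunk_seq_lens = torch.tensor([[16, 16, 8], [32, 0, 0]])  # built on CPU

# One up-front copy at metadata-build time (`.npu()` in the PR; a generic
# `.to(device)` is used here so the sketch also runs without torch_npu).
device = "npu" if getattr(torch, "npu", None) and torch.npu.is_available() else "cpu"
chunk_seq_lens_dev = chunk_seq_lens.to(device)

for i in range(chunk_seq_lens_dev.shape[0]):
    seq_len2_dev = chunk_seq_lens_dev[i]  # no host-to-device copy in the hot loop
```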

vllm_ascend/envs.py

Lines changed: 7 additions & 1 deletion

```diff
@@ -136,7 +136,13 @@
     # Whether to enable mla_pa for deepseek mla decode, this flag will be removed after its available torch_npu is public accessible
     # and the mla_pa will be the default path of deepseek decode path.
     "VLLM_ASCEND_MLA_PA":
-    lambda: int(os.getenv("VLLM_ASCEND_MLA_PA", 0))
+    lambda: int(os.getenv("VLLM_ASCEND_MLA_PA", 0)),
+    # ENABLE chunk mc2
+    "VLLM_ASCEND_ENABLE_CHUNK_MC2":
+    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_CHUNK_MC2", "0"))),
+    # Batch MC2 in prefill: The number of tokens in each batch
+    "VLLM_ASCEND_FUSED_MOE_MC2_CHUNK_SIZE":
+    lambda: int(os.getenv("VLLM_ASCEND_FUSED_MOE_MC2_CHUNK_SIZE", "128")),
 }
 
 # end-env-vars-definition
```
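
The two new flags follow the existing `os.getenv` pattern above: chunked MC2 is off by default, and the per-chunk token count defaults to 128. A minimal standalone sketch of how they are read:

```python
import os

# Mirrors the lambdas added in envs.py (defaults: chunk MC2 off, chunk size 128).
enable_chunk_mc2 = bool(int(os.getenv("VLLM_ASCEND_ENABLE_CHUNK_MC2", "0")))
mc2_chunk_size = int(os.getenv("VLLM_ASCEND_FUSED_MOE_MC2_CHUNK_SIZE", "128"))

print(f"chunk mc2 enabled: {enable_chunk_mc2}, chunk size: {mc2_chunk_size}")
```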

vllm_ascend/ops/fused_moe.py

Lines changed: 22 additions & 23 deletions

```diff
@@ -1151,6 +1151,7 @@ def forward(self,
 
         num_tokens, hidden_size = hidden_states.shape
 
+        forward_context = get_forward_context()
         fused_moe_state = get_forward_context().fused_moe_state
         # For w8a8 dynamic we can do npu_dynamic_quant and gate in parallel.
         quantized_x_for_share, dynamic_scale_for_share = None, None
@@ -1170,31 +1171,29 @@ def forward(self,
         if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
             shared_hidden_states = shared_experts(hidden_states)
 
-        attn_metadata = get_forward_context().attn_metadata
-        mc2_mask = attn_metadata.decode.mc2_mask if attn_metadata is not None and attn_metadata.decode is not None else None
-
+        mc2_mask = forward_context.mc2_mask
         tp_size = get_tensor_model_parallel_world_size()
-        if tp_size > 1 and fused_moe_state != FusedMoEState.AllGather:
-            if num_tokens < tp_size:
+        if fused_moe_state != FusedMoEState.AllGather:
+            if num_tokens < forward_context.padded_num_tokens:
                 hidden_states = nn.functional.pad(
-                    hidden_states, (0, 0, 0, tp_size - num_tokens))
+                    hidden_states,
+                    (0, 0, 0, forward_context.padded_num_tokens - num_tokens))
                 router_logits = nn.functional.pad(
-                    router_logits, (0, 0, 0, tp_size - num_tokens))
-            if mc2_mask is not None:
-                mc2_mask = nn.functional.pad(mc2_mask,
-                                             (0, tp_size - num_tokens))
-            chunk_hidden_states = torch.tensor_split(hidden_states,
-                                                     tp_size,
-                                                     dim=0)
-            chunk_router_logits = torch.tensor_split(router_logits,
-                                                     tp_size,
-                                                     dim=0)
-            tp_rank = get_tensor_model_parallel_rank()
-            hidden_states = chunk_hidden_states[tp_rank]
-            router_logits = chunk_router_logits[tp_rank]
-
-            if mc2_mask is not None:
-                chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)
+                    router_logits,
+                    (0, 0, 0, forward_context.padded_num_tokens - num_tokens))
+            if tp_size > 1:
+                chunk_hidden_states = torch.tensor_split(hidden_states,
+                                                         tp_size,
+                                                         dim=0)
+                chunk_router_logits = torch.tensor_split(router_logits,
+                                                         tp_size,
+                                                         dim=0)
+                chunk_mc2_mask = torch.tensor_split(forward_context.mc2_mask,
+                                                    tp_size,
+                                                    dim=0)
+                tp_rank = get_tensor_model_parallel_rank()
+                hidden_states = chunk_hidden_states[tp_rank]
+                router_logits = chunk_router_logits[tp_rank]
                 mc2_mask = chunk_mc2_mask[tp_rank]
 
         if self.dp_size > 1 and fused_moe_state == FusedMoEState.AllGather:
@@ -1246,7 +1245,7 @@ def forward(self,
                 dist.all_gather(list(chunk_hidden_states), e_hidden_states,
                                 self.tp_group)
                 final_hidden_states = torch.cat(chunk_hidden_states, dim=0)
-            if num_tokens < tp_size:
+            if num_tokens < forward_context.padded_num_tokens:
                 final_hidden_states = final_hidden_states[:num_tokens]
             dispose_tensor(e_hidden_states)
         elif self.dp_size > 1 and fused_moe_state == FusedMoEState.AllGather:
```
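
The forward path above now pads the local batch to `forward_context.padded_num_tokens` (rather than to `tp_size`) before splitting `hidden_states`, `router_logits`, and the `mc2_mask` across TP ranks. A self-contained sketch of that pad-then-split step, with made-up shapes and a fixed rank (not the PR's code):

```python
import torch
import torch.nn.functional as F

# Illustrative values: 5 real tokens padded up to 8, split over 2 TP ranks.
num_tokens, hidden_size, num_experts = 5, 16, 4
tp_size, tp_rank = 2, 0
padded_num_tokens = 8                      # from forward_context in the PR

hidden_states = torch.randn(num_tokens, hidden_size)
router_logits = torch.randn(num_tokens, num_experts)
mc2_mask = torch.zeros(padded_num_tokens, dtype=torch.bool)
mc2_mask[:num_tokens] = True               # real tokens vs. padding

pad = padded_num_tokens - num_tokens
hidden_states = F.pad(hidden_states, (0, 0, 0, pad))   # pad token dimension
router_logits = F.pad(router_logits, (0, 0, 0, pad))

# Each TP rank keeps only its slice of the padded batch and of the mask.
hidden_states = torch.tensor_split(hidden_states, tp_size, dim=0)[tp_rank]
router_logits = torch.tensor_split(router_logits, tp_size, dim=0)[tp_rank]
mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)[tp_rank]

print(hidden_states.shape, mc2_mask.sum())  # torch.Size([4, 16]), 4 real tokens
```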
