
Commit 847d52d

Merge branch 'v0.9.1-dev' into v0.9.1-dev
2 parents d4ad734 + da2d5ac commit 847d52d

11 files changed: +348 / -99 lines

vllm_ascend/ascend_forward_context.py

Lines changed: 57 additions & 13 deletions
@@ -1,11 +1,15 @@
+import math
 from contextlib import contextmanager
 from enum import Enum
 from typing import Any, Optional
 
 import torch
 from vllm.config import VllmConfig
-from vllm.distributed import get_dp_group
+from vllm.distributed import get_dp_group, get_tp_group
 from vllm.forward_context import get_forward_context, set_forward_context
+from vllm.platforms import current_platform
+
+import vllm_ascend.envs as envs
 
 import vllm_ascend.envs as envs_ascend
 
@@ -14,17 +18,21 @@ class FusedMoEState(Enum):
     AllGather = 0
     All2All = 1
     MC2 = 2
-    All2AllSeq = 3
+    MC2_PREFILL = 3
+    All2AllSeq = 4
 
 
 # TODO(zzzzwwjj): add soc_version to choose branch
 def get_fused_moe_state(ep_size: int, with_prefill: bool):
+    enable_chunk_mc2 = envs.VLLM_ASCEND_ENABLE_CHUNK_MC2
     if ep_size == 1:
         return FusedMoEState.AllGather
     elif envs_ascend.VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ:
         # MC2 Dispatch/Combine performs better than alltoall_seq in decoding stage.
         return FusedMoEState.All2AllSeq if (
             ep_size < 16 or with_prefill) else FusedMoEState.MC2
+    elif ep_size >= 16 and with_prefill and enable_chunk_mc2:
+        return FusedMoEState.MC2_PREFILL
     # NOTE: mc2 need ep_size >= 16 & all2all can't use in torchair graph.
     elif ep_size < 16 or with_prefill:
         return FusedMoEState.All2All
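
As a quick reference outside the diff, the branch order above can be exercised standalone. This is only an illustrative sketch (the helper name pick_state is made up, and the All2AllSeq branch is omitted for brevity):

# Illustrative only: mirrors the branch order of get_fused_moe_state above.
from enum import Enum

class FusedMoEState(Enum):
    AllGather = 0
    All2All = 1
    MC2 = 2
    MC2_PREFILL = 3
    All2AllSeq = 4

def pick_state(ep_size: int, with_prefill: bool, enable_chunk_mc2: bool) -> FusedMoEState:
    if ep_size == 1:
        return FusedMoEState.AllGather
    if ep_size >= 16 and with_prefill and enable_chunk_mc2:
        return FusedMoEState.MC2_PREFILL
    if ep_size < 16 or with_prefill:
        return FusedMoEState.All2All
    return FusedMoEState.MC2

# Large-EP prefill with chunked MC2 enabled picks the new state; decode keeps MC2.
assert pick_state(16, True, True) is FusedMoEState.MC2_PREFILL
assert pick_state(16, False, False) is FusedMoEState.MC2
assert pick_state(8, False, False) is FusedMoEState.All2All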
@@ -40,7 +48,8 @@ def set_ascend_forward_context(
         num_tokens: Optional[int] = None,
         num_tokens_across_dp: Optional[torch.Tensor] = None,
         with_prefill: bool = True,
-        in_profile_run: bool = False):
+        in_profile_run: bool = False,
+        num_actual_tokens: Optional[int] = None):
     """A context manager that stores the current forward context,
     can be attention metadata, etc.
     We add some additional param into forward_context.
@@ -52,7 +61,6 @@ def set_ascend_forward_context(
                              num_tokens_across_dp=num_tokens_across_dp):
         forward_context = get_forward_context()
         forward_context.with_prefill = with_prefill
-
         ep_size = torch.distributed.get_world_size(
         ) if vllm_config.parallel_config.enable_expert_parallel else 1
 
@@ -66,19 +74,55 @@ def set_ascend_forward_context(
         # due to multiple warmups before actual capturing
         forward_context.capturing = False
 
+        if num_tokens is None and attn_metadata is not None:
+            if hasattr(attn_metadata, 'num_actual_tokens'):
+                # for v1 engine
+                num_tokens = attn_metadata.num_actual_tokens
+            else:
+                # for v0 engine
+                num_tokens = attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
+
+        if num_actual_tokens is None:
+            num_actual_tokens = num_tokens
+
         dp_world_size = get_dp_group().world_size
         if dp_world_size > 1 and forward_context.dp_metadata is not None:
-            forward_context.max_tokens_across_dp = forward_context.dp_metadata.max_tokens_across_dp_cpu.item(
+            max_tokens_across_dp = forward_context.dp_metadata.max_tokens_across_dp_cpu.item(
             )
-        elif num_tokens is not None:
-            forward_context.max_tokens_across_dp = num_tokens
-        elif attn_metadata is not None:
-            if hasattr(attn_metadata, 'num_actual_tokens'):
-                forward_context.max_tokens_across_dp = attn_metadata.num_actual_tokens
-            else:
-                forward_context.max_tokens_across_dp = attn_metadata.num_prefill_tokens + attn_metadata.num_decode_tokens
         else:
-            forward_context.max_tokens_across_dp = None
+            max_tokens_across_dp = num_tokens
+
+        forward_context.max_tokens_across_dp = max_tokens_across_dp
+
+        if num_tokens is not None:
+            tp_world_size = get_tp_group().world_size
+            world_size = torch.distributed.get_world_size()
+            # NOTE: token num which need to pad to when mc2
+            forward_context.padded_num_tokens = math.ceil(
+                max_tokens_across_dp / tp_world_size) * tp_world_size
+            # NOTE: mc2 op's param `global_bs`, add `world_size` to make `global_bs` absolutely larger than actual global_bs.
+            forward_context.global_bs = math.ceil(
+                max_tokens_across_dp / tp_world_size) * world_size
+
+            if fused_moe_state == FusedMoEState.MC2_PREFILL:
+                chunk_size = envs.VLLM_ASCEND_FUSED_MOE_MC2_CHUNK_SIZE
+                forward_context.max_num_chunks = math.ceil(
+                    math.ceil(max_tokens_across_dp / tp_world_size) /
                    chunk_size)
+
+                forward_context.global_bs = math.ceil(
+                    math.ceil(max_tokens_across_dp / tp_world_size) /
+                    forward_context.max_num_chunks) * world_size
+
+                min_num_tokens = forward_context.max_num_chunks * tp_world_size
+                forward_context.padded_num_tokens = math.ceil(
+                    max_tokens_across_dp / min_num_tokens) * min_num_tokens
+
+            mc2_mask = torch.zeros(forward_context.padded_num_tokens,
+                                   dtype=torch.bool,
+                                   device=current_platform.device_type)
+            mc2_mask[:num_actual_tokens] = True
+            forward_context.mc2_mask = mc2_mask
 
         try:
             yield
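
A worked example may help with the padding arithmetic above. The numbers below are hypothetical, and max_tokens_across_dp, tp_world_size, world_size, and chunk_size stand in for the values read inside set_ascend_forward_context:

import math

# Hypothetical values: TP=4, 8 ranks total, busiest DP rank sees 1001 tokens.
max_tokens_across_dp = 1001
tp_world_size = 4
world_size = 8

# Plain MC2: pad the per-rank token count up to a multiple of tp_world_size.
padded_num_tokens = math.ceil(max_tokens_across_dp / tp_world_size) * tp_world_size
assert padded_num_tokens == 1004

# global_bs is intentionally an over-estimate of the real global batch size.
global_bs = math.ceil(max_tokens_across_dp / tp_world_size) * world_size
assert global_bs == 2008

# MC2_PREFILL: additionally split the per-rank tokens into chunks of at most chunk_size.
chunk_size = 128  # default of VLLM_ASCEND_FUSED_MOE_MC2_CHUNK_SIZE
max_num_chunks = math.ceil(math.ceil(max_tokens_across_dp / tp_world_size) / chunk_size)
assert max_num_chunks == 2  # ceil(251 / 128)

global_bs = math.ceil(math.ceil(max_tokens_across_dp / tp_world_size) / max_num_chunks) * world_size
assert global_bs == 1008  # ceil(251 / 2) * 8

min_num_tokens = max_num_chunks * tp_world_size
padded_num_tokens = math.ceil(max_tokens_across_dp / min_num_tokens) * min_num_tokens
assert padded_num_tokens == 1008  # ceil(1001 / 8) * 8

The mc2_mask built from padded_num_tokens then marks the first num_actual_tokens entries as real tokens and the trailing padding as inactive.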

vllm_ascend/attention/mla_v1.py

Lines changed: 12 additions & 17 deletions
@@ -11,7 +11,6 @@
 from vllm.config import get_current_vllm_config
 from vllm.model_executor.layers.linear import (LinearBase,
                                                UnquantizedLinearMethod)
-from vllm.platforms import current_platform
 from vllm.utils import cdiv, round_down
 
 from vllm_ascend import envs
@@ -71,6 +70,7 @@ class ChunkedContextMetadata:
         max_seq_lens: list[int]
         workspace: torch.Tensor
         chunk_seq_lens: torch.Tensor
+        chunk_seq_lens_npu: torch.Tensor
 
     attn_mask: torch.Tensor
     query_lens: list[int]
@@ -99,7 +99,6 @@ class AscendMLADecodeMetadata:
     attn_mask: Optional[torch.Tensor] = None
     sin: torch.Tensor = None
     cos: torch.Tensor = None
-    mc2_mask: Optional[torch.Tensor] = None
 
 
 @dataclass
@@ -215,13 +214,6 @@ def __init__(self,
         self.cos_cache = None
         self.sin_cache = None
 
-    def generate_activate_mask(self, actual_seqs_num, batch_size):
-        mc2_mask = torch.zeros(batch_size,
-                               dtype=torch.bool,
-                               device=current_platform.device_type)
-        mc2_mask[:actual_seqs_num].fill_(True)
-        return mc2_mask
-
     def reorder_batch(self, input_batch: "InputBatch",
                       scheduler_output: "SchedulerOutput") -> bool:
         # We now want to reorder the batch so that the "decode" requests are at
@@ -364,7 +356,6 @@ def build_torchair_graph_dummy(
                           self.rope_dim,
                           dtype=self.runner.dtype,
                           device=device)
-        mc2_mask = self.generate_activate_mask(num_actual_tokens, num_reqs)
         decode_metadata = AscendMLADecodeMetadata(
             input_positions=input_positions,
             block_table=block_table,
@@ -374,8 +365,7 @@ def build_torchair_graph_dummy(
             attn_mask=self.runner.spec_attn_mask,
             actual_seq_q_lens=self.runner.actual_seq_q_lens[:num_reqs],
             sin=sin,
-            cos=cos,
-            mc2_mask=mc2_mask)
+            cos=cos)
         return self.metadata_cls(  # type: ignore
             num_input_tokens=num_actual_tokens,
             num_actual_tokens=num_actual_tokens,
@@ -481,6 +471,7 @@ def build(
                     seq_tot=chunk_seq_lens.sum(dim=1).tolist(),
                     max_seq_lens=chunk_seq_lens.max(dim=1).values.tolist(),
                     chunk_seq_lens=chunk_seq_lens,
+                    chunk_seq_lens_npu=chunk_seq_lens.npu(),
                     workspace=self.chunked_prefill_workspace,
                 )
             prefill_input_positions = input_positions[tokens_start:]
@@ -547,15 +538,18 @@ def build(
                 actual_seq_q_lens = query_start_loc[1:].tolist(
                 ) + self.runner.actual_seq_q_lens[num_reqs:num_reqs +
                                                   num_reqs_pad_size]
+                # mtp torchair + PD scenario, last element of actual_seq_q_lens must equal to num_reqs_pad_size
+                num_padded_token_size = slot_mapping.size(0)
+                if actual_seq_q_lens[-1] != num_padded_token_size:
+                    actual_seq_q_lens.append(num_padded_token_size)
+                    seq_lens_list.append(0)
             else:
                 seq_lens_list = seq_lens.tolist()
 
            cos = self.cos_cache[input_positions].unsqueeze(  # type: ignore
                1).unsqueeze(2)
            sin = self.sin_cache[input_positions].unsqueeze(  # type: ignore
                1).unsqueeze(2)
-            mc2_mask = self.generate_activate_mask(
-                num_actual_tokens, num_reqs + num_reqs_pad_size)
 
            decode_metadata = AscendMLADecodeMetadata(
                input_positions=input_positions,
@@ -566,8 +560,7 @@ def build(
                attn_mask=self.runner.spec_attn_mask,
                actual_seq_q_lens=actual_seq_q_lens,
                sin=sin,
-                cos=cos,
-                mc2_mask=mc2_mask)
+                cos=cos)
 
        return self.metadata_cls(  # type: ignore
            num_actual_tokens=num_actual_tokens,
@@ -749,6 +742,8 @@ def _compute_prefill_context(
             toks = prefill_metadata.chunked_context.seq_tot[i]
 
             seq_len2 = prefill_metadata.chunked_context.chunk_seq_lens[i]
+            seq_len2_npu = prefill_metadata.chunked_context.chunk_seq_lens_npu[
+                i]
             seq_len = torch.stack([seq_len1, seq_len2])
             kv_c_normed = torch.empty(toks,
                                       num_heads,
@@ -765,7 +760,7 @@ def _compute_prefill_context(
                 cache_kv_c,
                 cache_k_pe,
                 prefill_metadata.block_table,
-                seq_len2.to(query.device),
+                seq_len2_npu,
                 seq_starts=prefill_metadata.chunked_context.starts[i],
                 key=kv_c_normed,
                 value=k_pe,
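
The chunk_seq_lens_npu change above follows a common pattern: move the host tensor to the device once when the metadata is built, instead of issuing one host-to-device copy per chunk inside the loop. A minimal sketch of the idea, with made-up tensor contents and a "cpu" stand-in device (on Ascend this would be .npu() via torch_npu):

import torch

device = "cpu"  # stand-in; "npu" on Ascend, "cuda" elsewhere

chunk_seq_lens = torch.tensor([[4, 4, 2], [3, 3, 3]])  # built on the host

# Before: seq_len2.to(query.device) inside the per-chunk loop meant one
# transfer per iteration. After: copy the whole tensor across once...
chunk_seq_lens_dev = chunk_seq_lens.to(device)

# ...then only index the device copy inside the loop.
for i in range(chunk_seq_lens_dev.size(0)):
    seq_len2_dev = chunk_seq_lens_dev[i]  # no per-chunk host-to-device copy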
vllm_ascend/distributed/parallel_state.py (new file)

Lines changed: 49 additions & 0 deletions
@@ -0,0 +1,49 @@
+from typing import Optional
+
+import torch
+from vllm.distributed.parallel_state import (GroupCoordinator, get_world_group,
+                                             init_model_parallel_group)
+
+# Currently, mc2 op need their own group coordinator.
+_MC2: Optional[GroupCoordinator] = None
+
+
+def get_mc2_group() -> GroupCoordinator:
+    assert _MC2 is not None, ("mc2 group is not initialized")
+    return _MC2
+
+
+def model_parallel_initialized():
+    return (_MC2 is not None)
+
+
+def init_ascend_model_parallel(
+    expert_parallel_size: int = 1,
+    world_size: Optional[int] = None,
+    backend: Optional[str] = None,
+):
+    if model_parallel_initialized():
+        return
+    assert torch.distributed.is_initialized()
+    world_size = world_size or torch.distributed.get_world_size()
+    backend = backend or torch.distributed.get_backend(
+        get_world_group().device_group)
+    num_expert_parallel_groups = world_size // expert_parallel_size
+
+    global _MC2
+    group_ranks = []
+    for i in range(num_expert_parallel_groups):
+        ranks = list(range(i, world_size, num_expert_parallel_groups))
+        group_ranks.append(ranks)
+
+    _MC2 = init_model_parallel_group(group_ranks,
+                                     get_world_group().local_rank,
+                                     backend,
+                                     group_name="mc2")
+
+
+def destroy_ascend_model_parallel():
+    global _MC2
+    if _MC2:
+        _MC2.destroy()
+    _MC2 = None
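
To see how init_ascend_model_parallel partitions ranks into MC2 groups, the same rank-grouping loop can be run standalone for a hypothetical world_size of 8 and expert_parallel_size of 4 (no process group is actually created here):

world_size = 8
expert_parallel_size = 4
num_expert_parallel_groups = world_size // expert_parallel_size  # 2

group_ranks = []
for i in range(num_expert_parallel_groups):
    ranks = list(range(i, world_size, num_expert_parallel_groups))
    group_ranks.append(ranks)

# Two MC2 groups of 4 ranks each, strided across the world:
assert group_ranks == [[0, 2, 4, 6], [1, 3, 5, 7]]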

vllm_ascend/envs.py

Lines changed: 6 additions & 0 deletions
@@ -142,6 +142,12 @@
     # 1: enable moe all2all seq.
     "VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ":
     lambda: bool(int(os.getenv('VLLM_ASCEND_ENABLE_MOE_ALL2ALL_SEQ', '0'))),
+    # ENABLE chunk mc2
+    "VLLM_ASCEND_ENABLE_CHUNK_MC2":
+    lambda: bool(int(os.getenv("VLLM_ASCEND_ENABLE_CHUNK_MC2", "0"))),
+    # Batch MC2 in prefill: The number of tokens in each batch
+    "VLLM_ASCEND_FUSED_MOE_MC2_CHUNK_SIZE":
+    lambda: int(os.getenv("VLLM_ASCEND_FUSED_MOE_MC2_CHUNK_SIZE", "128")),
 }
 
 # end-env-vars-definition
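
A sketch of how these switches might be set before launching, shown here with Python for consistency; "128" is the documented default chunk size and the only non-default value is the enable flag:

import os

# Enable chunked MC2 for prefill and keep the default chunk size of 128 tokens.
os.environ["VLLM_ASCEND_ENABLE_CHUNK_MC2"] = "1"
os.environ["VLLM_ASCEND_FUSED_MOE_MC2_CHUNK_SIZE"] = "128"

# The lambdas above read the environment at access time, so later lookups of
# vllm_ascend.envs.VLLM_ASCEND_ENABLE_CHUNK_MC2 would evaluate to True.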

vllm_ascend/ops/fused_moe.py

Lines changed: 25 additions & 25 deletions
@@ -39,6 +39,7 @@
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.ascend_config import get_ascend_config
 from vllm_ascend.ascend_forward_context import FusedMoEState
+from vllm_ascend.distributed.parallel_state import get_mc2_group
 from vllm_ascend.ops.expert_load_balancer import ExpertLoadBalancer
 from vllm_ascend.ops.moe_dispatcher.token_dispatcher import (
     MoEAlltoAllSeqOverLapDispatcher, MoeDispatcherConfig)
@@ -127,7 +128,7 @@ def fused_experts_with_mc2(
     mc2_mask: Optional[torch.Tensor] = None,
 ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
     quant_mode = 0
-    ep_group = get_ep_group()
+    ep_group = get_mc2_group()
     ep_rank_id = ep_group.rank_in_group
     ep_world_size = ep_group.world_size
     tp_world_size = get_tp_group().world_size
@@ -889,7 +890,7 @@ def __init__(self, moe: MoEConfig = None):
         self.torchair_graph_enabled = ascend_config.torchair_graph_config.enabled
 
         try:
-            device_group = get_ep_group().device_group
+            device_group = get_mc2_group().device_group
             # TODO: Try local_rank = ep_group.rank_in_group
             local_rank = torch.distributed.get_rank(group=device_group)
             backend = device_group._get_backend(torch.device("npu"))
@@ -1191,6 +1192,7 @@ def forward(self,
 
         num_tokens, hidden_size = hidden_states.shape
 
+        forward_context = get_forward_context()
         fused_moe_state = get_forward_context().fused_moe_state
         # For w8a8 dynamic we can do npu_dynamic_quant and gate in parallel.
         quantized_x_for_share, dynamic_scale_for_share = None, None
@@ -1210,32 +1212,30 @@ def forward(self,
         if not self.enable_multistream_moe or fused_moe_state != FusedMoEState.MC2:
             shared_hidden_states = shared_experts(hidden_states)
 
-        attn_metadata = get_forward_context().attn_metadata
-        mc2_mask = attn_metadata.decode.mc2_mask if attn_metadata is not None and getattr(
-            attn_metadata, "decode", None) is not None else None
 
+        mc2_mask = forward_context.mc2_mask
         tp_size = get_tensor_model_parallel_world_size()
-        if tp_size > 1 and fused_moe_state != FusedMoEState.AllGather:
-            if num_tokens < tp_size:
+        if fused_moe_state != FusedMoEState.AllGather:
+            if num_tokens < forward_context.padded_num_tokens:
                 hidden_states = nn.functional.pad(
-                    hidden_states, (0, 0, 0, tp_size - num_tokens))
+                    hidden_states,
+                    (0, 0, 0, forward_context.padded_num_tokens - num_tokens))
                 router_logits = nn.functional.pad(
-                    router_logits, (0, 0, 0, tp_size - num_tokens))
-            if mc2_mask is not None:
-                mc2_mask = nn.functional.pad(mc2_mask,
-                                             (0, tp_size - num_tokens))
-            chunk_hidden_states = torch.tensor_split(hidden_states,
-                                                     tp_size,
-                                                     dim=0)
-            chunk_router_logits = torch.tensor_split(router_logits,
-                                                     tp_size,
-                                                     dim=0)
-            tp_rank = get_tensor_model_parallel_rank()
-            hidden_states = chunk_hidden_states[tp_rank]
-            router_logits = chunk_router_logits[tp_rank]
-
-            if mc2_mask is not None:
-                chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)
+                    router_logits,
+                    (0, 0, 0, forward_context.padded_num_tokens - num_tokens))
+            if tp_size > 1:
+                chunk_hidden_states = torch.tensor_split(hidden_states,
+                                                         tp_size,
+                                                         dim=0)
+                chunk_router_logits = torch.tensor_split(router_logits,
+                                                         tp_size,
+                                                         dim=0)
+                chunk_mc2_mask = torch.tensor_split(forward_context.mc2_mask,
+                                                    tp_size,
+                                                    dim=0)
+                tp_rank = get_tensor_model_parallel_rank()
+                hidden_states = chunk_hidden_states[tp_rank]
+                router_logits = chunk_router_logits[tp_rank]
                 mc2_mask = chunk_mc2_mask[tp_rank]
 
         if self.dp_size > 1 and fused_moe_state == FusedMoEState.AllGather:
@@ -1287,7 +1287,7 @@ def forward(self,
             dist.all_gather(list(chunk_hidden_states), e_hidden_states,
                             self.tp_group)
             final_hidden_states = torch.cat(chunk_hidden_states, dim=0)
-            if num_tokens < tp_size:
+            if num_tokens < forward_context.padded_num_tokens:
                 final_hidden_states = final_hidden_states[:num_tokens]
             dispose_tensor(e_hidden_states)
         elif self.dp_size > 1 and fused_moe_state == FusedMoEState.AllGather:
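
A small standalone sketch of the pad-and-split step in the forward path above, with made-up shapes (5 tokens, padded_num_tokens of 8, TP of 4); it only mimics how each TP rank ends up with an equal slice of the tokens plus the matching piece of mc2_mask:

import torch
import torch.nn.functional as F

num_tokens, hidden_size, tp_size = 5, 16, 4
padded_num_tokens = 8  # multiple of tp_size chosen by the forward context

hidden_states = torch.randn(num_tokens, hidden_size)
mc2_mask = torch.zeros(padded_num_tokens, dtype=torch.bool)
mc2_mask[:num_tokens] = True  # only the real tokens are active

# Pad the token dimension up to padded_num_tokens, then split evenly across TP ranks.
hidden_states = F.pad(hidden_states, (0, 0, 0, padded_num_tokens - num_tokens))
chunk_hidden_states = torch.tensor_split(hidden_states, tp_size, dim=0)
chunk_mc2_mask = torch.tensor_split(mc2_mask, tp_size, dim=0)

tp_rank = 2  # whichever rank this process happens to be
local_hidden = chunk_hidden_states[tp_rank]
local_mask = chunk_mc2_mask[tp_rank]
assert local_hidden.shape == (2, hidden_size) and local_mask.shape == (2,)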
