
Commit 1ef0f68

w00800020 authored and sdmyzlp committed
Offload calculation shared experts to another stream
With the expected overlapping being:

| shared gate_up | shared act                |         | shared down |
| dispatch       | routed gate_up, act, down | combine |

Shared experts will be replicated regardless of TP, to avoid AllReduce comm.

Controlled by option VLLM_ENABLE_MULTISTREAM_SHARED_EXPERT, defaulted to off.

Signed-off-by: w00800020 <weijinyi3@huawei.com>
1 parent 036a36e commit 1ef0f68
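
A minimal usage sketch (not part of this commit; it assumes the stock vllm.LLM entry point and mirrors the model and parallel settings of the test added below). The option is read from the environment via os.getenv in vllm_ascend/envs.py, so setting it before the engine is constructed is enough:

import os

# Hypothetical end-to-end usage; the option defaults to "0" (off) in envs.py.
os.environ["VLLM_ENABLE_MULTISTREAM_SHARED_EXPERT"] = "1"

from vllm import LLM, SamplingParams

# Mirrors the new test: DeepSeek-V2-Lite with TP=4. With the option enabled the
# shared experts are replicated, so their path needs no AllReduce.
llm = LLM(model="deepseek-ai/DeepSeek-V2-Lite",
          tensor_parallel_size=4,
          dtype="half")
out = llm.generate(["vLLM is a high-throughput inference engine."],
                   SamplingParams(max_tokens=5))
print(out[0].outputs[0].text)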

4 files changed: +120 −34 lines changed

tests/multicard/test_offline_inference_distributed.py

Lines changed: 18 additions & 0 deletions
@@ -21,6 +21,7 @@
 Run `pytest tests/test_offline_inference.py`.
 """
 import os
+from unittest.mock import patch
 
 import vllm  # noqa: F401
 
@@ -61,3 +62,20 @@ def test_models_distributed_DeepSeek():
             distributed_executor_backend="mp",
     ) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)
+
+@patch.dict(os.environ, {"VLLM_ENABLE_MULTISTREAM_SHARED_EXPERT": "1"})
+def test_models_distributed_multistream_shared_expert():
+    example_prompts = [
+        "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.",
+        "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.",
+        "Compare and contrast artificial intelligence with human intelligence in terms of processing information.",
+    ]
+    dtype = "half"
+    max_tokens = 5
+    with VllmRunner(
+            "deepseek-ai/DeepSeek-V2-Lite",
+            dtype=dtype,
+            tensor_parallel_size=4,
+            distributed_executor_backend="mp",
+    ) as vllm_model:
+        vllm_model.generate_greedy(example_prompts, max_tokens)

vllm_ascend/envs.py

Lines changed: 2 additions & 0 deletions
@@ -36,6 +36,8 @@
     lambda: bool(int(os.getenv("COMPILE_CUSTOM_KERNELS", "1"))),
     "VLLM_ENABLE_MC2":
     lambda: bool(int(os.getenv("VLLM_ENABLE_MC2", '0'))),
+    "VLLM_ENABLE_MULTISTREAM_SHARED_EXPERT":
+    lambda: bool(int(os.getenv("VLLM_ENABLE_MULTISTREAM_SHARED_EXPERT", '0'))),
     "USING_LCCL_COM":
     lambda: bool(int(os.getenv("USING_LCCL_COM", '0'))),
     "SOC_VERSION":

vllm_ascend/models/deepseek_v2.py

Lines changed: 46 additions & 15 deletions
@@ -82,20 +82,35 @@ def __init__(
         hidden_act: str,
         quant_config: Optional[QuantizationConfig] = None,
         reduce_results: bool = True,
+        force_replicate: bool = False,
         prefix: str = "",
     ) -> None:
         super().__init__()
-        self.gate_up_proj = MergedColumnParallelLinear(
-            hidden_size, [intermediate_size] * 2,
-            bias=False,
-            quant_config=quant_config,
-            prefix=f"{prefix}.gate_up_proj")
-        self.down_proj = RowParallelLinear(intermediate_size,
-                                           hidden_size,
-                                           bias=False,
-                                           quant_config=quant_config,
-                                           reduce_results=reduce_results,
-                                           prefix=f"{prefix}.down_proj")
+        if not force_replicate:
+            self.gate_up_proj = MergedColumnParallelLinear(
+                hidden_size, [intermediate_size] * 2,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.gate_up_proj")
+            self.down_proj = RowParallelLinear(intermediate_size,
+                                               hidden_size,
+                                               bias=False,
+                                               quant_config=quant_config,
+                                               reduce_results=reduce_results,
+                                               prefix=f"{prefix}.down_proj")
+        else:
+            self.gate_up_proj = ReplicatedLinear(
+                hidden_size,
+                intermediate_size * 2,
+                bias=False,
+                quant_config=quant_config,
+                prefix=f"{prefix}.gate_up_proj")
+            self.down_proj = ReplicatedLinear(intermediate_size,
+                                              hidden_size,
+                                              bias=False,
+                                              quant_config=quant_config,
+                                              prefix=f"{prefix}.down_proj")
+
         if hidden_act != "silu":
             raise ValueError(f"Unsupported activation: {hidden_act}. "
                              "Only silu is supported for now.")
@@ -202,8 +217,12 @@ def __init__(
                 hidden_act=config.hidden_act,
                 quant_config=quant_config,
                 reduce_results=True,
+                force_replicate=envs_ascend.
+                VLLM_ENABLE_MULTISTREAM_SHARED_EXPERT,
                 prefix=f"{prefix}.shared_experts",
             )
+        else:
+            self.shared_experts = None  # type: ignore
         CustomDeepseekV2MoE.top_k = config.num_experts_per_tok
 
         self.dp_size = get_dp_group().world_size
@@ -224,8 +243,11 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             is_prefill = attn_metadata.num_prefills > 0
             enable_force_load_balance = False
         num_tokens, hidden_dim = hidden_states.shape
+        use_separated_shared_expert = (
+            self.n_shared_experts is not None
+            and not envs_ascend.VLLM_ENABLE_MULTISTREAM_SHARED_EXPERT)
 
-        if self.n_shared_experts is not None:
+        if use_separated_shared_expert:
             shared_output = self.shared_experts(hidden_states)
 
         if self.tp_size > 1:
@@ -248,13 +270,22 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(local_hidden_states)
 
-        router_hidden_states = self.experts(
+        experts_hidden_states = self.experts(
            hidden_states=local_hidden_states,
            router_logits=router_logits,
            is_prefill=is_prefill,
            top_k=CustomDeepseekV2MoE.top_k,
            enable_force_load_balance=enable_force_load_balance,
-        ) * self.routed_scaling_factor
+           shared_experts=(self.shared_experts
+                           if not use_separated_shared_expert else None),
+        )
+
+        if not isinstance(experts_hidden_states, tuple):
+            router_hidden_states = experts_hidden_states * self.routed_scaling_factor
+        else:
+            router_hidden_states = (
+                experts_hidden_states[0] * self.routed_scaling_factor +
+                experts_hidden_states[1])
 
         if self.tp_size > 1:
             dist.all_gather(list(chunk_hidden_states), router_hidden_states,
@@ -265,7 +296,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         else:
             final_hidden_states = router_hidden_states
 
-        if shared_output is not None:
+        if use_separated_shared_expert:
             final_hidden_states = final_hidden_states + shared_output
 
         return final_hidden_states.view(num_tokens, hidden_dim)

vllm_ascend/ops/fused_moe.py

Lines changed: 54 additions & 19 deletions
@@ -33,6 +33,7 @@
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.distributed.parallel_state import get_ep_group, get_etp_group
+from vllm_ascend.utils import npu_stream_switch, npu_wait_tensor
 
 VLLM_ENABLE_MC2: bool = envs_ascend.VLLM_ENABLE_MC2
 USING_LCCL_COM: bool = envs_ascend.USING_LCCL_COM
@@ -47,6 +48,8 @@ def fused_experts_with_mc2(
     top_k: int,
     expert_map: torch.Tensor = None,
     moe_all_to_all_group_name: Optional[str] = None,
+    shared_experts: Optional[torch.nn.Module] = None,
+    graph_mode: bool = False,
 ) -> torch.Tensor:
     global_bs = 0
     moe_expert_num = len(expert_map)
@@ -88,6 +91,10 @@ def fused_experts_with_mc2(
     expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[
         0:5]
 
+    if shared_experts is not None:
+        with npu_stream_switch("expert_secondary", 0, enabled=graph_mode):
+            shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states)
+
     w1 = w1.transpose(1, 2)
     expert_token_nums = torch.cumsum(expert_token_nums,
                                      dim=0,
@@ -102,6 +109,11 @@ def fused_experts_with_mc2(
         group_list=group_list,
     )
 
+    if shared_experts is not None:
+        with npu_stream_switch("expert_secondary", 0, enabled=graph_mode):
+            npu_wait_tensor(shared_gate_up, expand_x, enabled=graph_mode)
+            shared_act = shared_experts.act_fn(shared_gate_up)
+
     # TODO: Remove this in the future.
     gate_up_out = torch.cat(gate_up_out_list, dim=0)
     gate_up_out = torch_npu.npu_swiglu(gate_up_out)
@@ -145,7 +157,15 @@ def fused_experts_with_mc2(
 
     hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs)
 
-    return hidden_states
+    if shared_experts is not None:
+        with npu_stream_switch("expert_secondary", 0, enabled=graph_mode):
+            npu_wait_tensor(shared_act, down_out_list, enabled=graph_mode)
+            shared_hidden_states, _ = shared_experts.down_proj(shared_act)
+
+    if shared_experts is None:
+        return hidden_states
+    else:
+        return hidden_states, shared_hidden_states
 
 
 # currently expert parallelism implemented with all2all
@@ -587,6 +607,8 @@ def __init__(self, moe: MoEConfig = None):
         self.ep_size = ep_group.world_size
         self.global_batch_size = vllm_config.scheduler_config.max_num_seqs
         self.local_batch_size = self.global_batch_size // self.ep_size
+        self.graph_mode = vllm_config.get("additional_config",
+                                          {}).get("enable_graph_mode", False)
 
         try:
             device_group = ep_group.device_group
@@ -624,6 +646,7 @@ def apply(
         scoring_func: str = "softmax",
         e_score_correction_bias: Optional[torch.Tensor] = None,
         is_prefill: bool = False,
+        shared_experts: Optional[torch.nn.Module] = None,
        **kwargs,
     ):
         # NOTE: now npu_moe_gating_top_k can only support `group_count=256` pattern
@@ -664,28 +687,37 @@ def apply(
                 topk_ids=topk_ids,
                 top_k=top_k,
                 expert_map=expert_map,
-                moe_all_to_all_group_name=self.moe_all_to_all_group_name)
+                moe_all_to_all_group_name=self.moe_all_to_all_group_name,
+                shared_experts=shared_experts,
+                graph_mode=self.graph_mode,
+            )
         elif get_ep_group().world_size == 1:
-            return fused_experts(hidden_states=x,
-                                 w1=layer.w13_weight,
-                                 w2=layer.w2_weight,
-                                 topk_weights=topk_weights,
-                                 topk_ids=topk_ids,
-                                 top_k=top_k,
-                                 expert_map=expert_map)
+            router_hidden_states = fused_experts(hidden_states=x,
+                                                 w1=layer.w13_weight,
+                                                 w2=layer.w2_weight,
+                                                 topk_weights=topk_weights,
+                                                 topk_ids=topk_ids,
+                                                 top_k=top_k,
+                                                 expert_map=expert_map)
         else:
             # The current implementation of deepseek moe splits hidden_states
             # according to tp_size before they are feed into fused_moe module.
             # Therefore, all2all is needed no matter how dp/tp is set so as to
             # dispatch/combine tokens.
-            return fused_experts_with_all2all(hidden_states=x,
-                                              w1=layer.w13_weight,
-                                              w2=layer.w2_weight,
-                                              topk_weights=topk_weights,
-                                              topk_ids=topk_ids,
-                                              top_k=top_k,
-                                              expert_map=expert_map,
-                                              ep_group=get_ep_group())
+            router_hidden_states = fused_experts_with_all2all(
+                hidden_states=x,
+                w1=layer.w13_weight,
+                w2=layer.w2_weight,
+                topk_weights=topk_weights,
+                topk_ids=topk_ids,
+                top_k=top_k,
+                expert_map=expert_map,
+                ep_group=get_ep_group())
+
+        if shared_experts is None:
+            return router_hidden_states
+        else:
+            return router_hidden_states, shared_experts(x)
 
 
 class AscendFusedMoE(FusedMoE):
@@ -815,7 +847,8 @@ def forward(self,
                 router_logits: torch.Tensor,
                 is_prefill: bool,
                 enable_force_load_balance: bool = False,
-                top_k=None):
+                top_k: Optional[int] = None,
+                shared_experts: Optional[torch.nn.Module] = None):
         assert self.quant_method is not None
 
         if top_k:
@@ -842,7 +875,9 @@ def forward(self,
             scoring_func=self.scoring_func,
             e_score_correction_bias=self.e_score_correction_bias,
             is_prefill=is_prefill,
-            enable_force_load_balance=enable_force_load_balance)
+            enable_force_load_balance=enable_force_load_balance,
+            shared_experts=shared_experts,
+        )
 
         if VLLM_ENABLE_MC2 and not is_prefill:
...
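
For context on the stream usage in fused_experts_with_mc2 above, the sketch below illustrates the same overlap idea with plain CUDA streams. This is an assumption-laden analogue, not the commit's code: the commit uses npu_stream_switch / npu_wait_tensor from vllm_ascend.utils on Ascend NPU and interleaves the waits with the dispatch and combine kernels rather than grouping all shared-expert ops together.

import torch

def overlapped_shared_experts(hidden_states, shared_experts, routed_moe):
    """Illustrative only: run the shared-expert MLP on a side stream while the
    routed-expert path runs on the current stream. `shared_experts` and
    `routed_moe` are placeholders, not symbols from this commit."""
    side = torch.cuda.Stream()

    # The side stream must see hidden_states (produced on the current stream)
    # before it starts the shared gate_up projection.
    side.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(side):
        shared_gate_up, _ = shared_experts.gate_up_proj(hidden_states)
        shared_act = shared_experts.act_fn(shared_gate_up)
        shared_out, _ = shared_experts.down_proj(shared_act)

    # Meanwhile: dispatch, routed gate_up/act/down, combine on the main stream.
    routed_out = routed_moe(hidden_states)

    # Join the streams before the caller mixes routed and shared outputs.
    torch.cuda.current_stream().wait_stream(side)
    return routed_out, shared_out

The essential point is that the secondary stream only needs ordering edges at the hand-off tensors, which in the actual diff are expand_x and down_out_list guarded by npu_wait_tensor.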
