Commit 2cfba2d

sdmyzlp authored and yangcheng (AJ) committed
Support multistream of shared experts in FusedMoE (vllm-project#997)
Contains vllm-project#1111 for completeness.

Implement multi-stream parallelism for MoE layers with shared experts, where the computation of the shared experts is overlapped with the expert token dispatch and combine. Additionally, when multi-stream is enabled, the weights of the shared experts are forced to be replicated across all cards, regardless of any tensor parallelism configuration, to avoid AllReduce operations. The expected overlapping is:

```
| shared gate_up | shared act |                            | shared down |
| dispatch                    | routed gate_up, act, down  | combine     |
```

User-facing change: no.

Testing: tested on a 1x16 910 node, with a tailored 2-layer DSKv2.

---------

Signed-off-by: sdmyzlp <lrwei2@petalmail.com>
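As a rough illustration of the intended overlap (not the code from this PR), the sketch below runs the shared-expert MLP on a secondary stream while the default stream performs dispatch, routed-expert compute, and combine. The helper callables (`shared_experts`, `routed_experts`, `dispatch`, `combine`) are hypothetical placeholders, and the CUDA stream API is used as a stand-in for the analogous Ascend/NPU stream API.

```python
import torch

# Hypothetical sketch of shared-expert / dispatch-combine overlap; the real
# implementation lives in vllm_ascend's FusedMoE layer and differs in detail.
shared_stream = torch.cuda.Stream()

def moe_forward(hidden_states, shared_experts, routed_experts, dispatch, combine):
    # Let the side stream observe all work queued so far on the default stream.
    shared_stream.wait_stream(torch.cuda.current_stream())

    with torch.cuda.stream(shared_stream):
        # shared gate_up -> act -> down, overlapped with the ops queued below
        shared_out = shared_experts(hidden_states)

    dispatched = dispatch(hidden_states)      # all-to-all token dispatch
    routed_out = routed_experts(dispatched)   # routed gate_up, act, down
    combined = combine(routed_out)            # all-to-all combine

    # Join the streams before mixing the two partial results.
    torch.cuda.current_stream().wait_stream(shared_stream)
    return combined + shared_out
```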
1 parent 1188d41 commit 2cfba2d

2 files changed: +5 -1 lines changed


vllm_ascend/ascend_config.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -37,6 +37,7 @@ def __init__(self, vllm_config):
                                 ascend_scheduler_config)

         self.expert_map_path = additional_config.get("expert_map_path", None)
+        self.dynamic_eplb = additional_config.get("dynamic_eplb", False)
         self.chunked_prefill_for_mla = additional_config.get(
             "chunked_prefill_for_mla", False)
         self.enable_weight_nz_layout = additional_config.get(
```
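For context, `additional_config` entries like the one added above are passed through from the vLLM entrypoint. A hypothetical usage sketch follows; the model name and values are placeholders, and it assumes the standard `additional_config` plumbing exposed by vllm-ascend.

```python
from vllm import LLM

# Hypothetical example of toggling the new option via additional_config;
# the model name and the effect of dynamic_eplb here are illustrative only.
llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",
    additional_config={
        "dynamic_eplb": True,        # new flag parsed by AscendConfig above
        "expert_map_path": None,     # existing neighbouring option
    },
)
```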

vllm_ascend/models/deepseek_v2.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -733,9 +733,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         quant_config = vllm_config.quant_config
         self.config = config
         self.quant_config = quant_config
+        self.num_dense_layers = self.config.first_k_dense_replace
+        self.num_moe_layers = self.config.num_hidden_layers - self.num_dense_layers
         self.model = CustomDeepseekV2Model(vllm_config=vllm_config,
-                                           prefix=maybe_prefix(
+                                           prefix=maybe_prefix(
                                                prefix, "model"))
+
         if get_pp_group().is_last_rank:
             self.lm_head = ParallelLMHead(config.vocab_size,
                                           config.hidden_size,
```
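The two new attributes are simple bookkeeping derived from the model config; for example (numbers made up, not taken from this PR or any particular checkpoint):

```python
# Illustrative arithmetic only: with first_k_dense_replace = 3 and
# num_hidden_layers = 27, the first 3 layers stay dense MLPs and the
# remaining 24 are MoE layers.
first_k_dense_replace = 3
num_hidden_layers = 27

num_dense_layers = first_k_dense_replace
num_moe_layers = num_hidden_layers - num_dense_layers
assert (num_dense_layers, num_moe_layers) == (3, 24)
```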
