
Commit 25e3d2c

support moe multistream in deepseek
1 parent 17f05b1 commit 25e3d2c

File tree

4 files changed: +103 -21 lines changed

vllm_ascend/models/deepseek_v2.py

Lines changed: 39 additions & 8 deletions

@@ -30,6 +30,7 @@
 import torch
 import torch.distributed as dist
 import torch_npu
+import torchair as tng
 import vllm.envs as envs
 from torch import nn
 from transformers import PretrainedConfig
@@ -210,6 +211,8 @@ def __init__(
         self.tp_group = get_tp_group().device_group
         self.tp_rank = get_tp_group().rank_in_group
 
+        self.enable_multi_stream = True
+
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         attn_metadata = get_forward_context().attn_metadata
         # when profile runs, force experts to load balanced tokens
@@ -224,8 +227,12 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             enable_force_load_balance = False
         num_tokens, hidden_dim = hidden_states.shape
 
-        if self.n_shared_experts is not None:
+        moe_multi_stream = self.enable_multi_stream and not is_prefill
+
+        if self.n_shared_experts is not None and not moe_multi_stream:
             shared_output = self.shared_experts(hidden_states)
+        else:
+            shared_hidden_states = hidden_states
 
         if self.tp_size > 1:
             # pass
@@ -244,16 +251,40 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         else:
             local_hidden_states = hidden_states
 
+        if self.n_shared_experts is not None and moe_multi_stream:
+            with tng.scope.npu_stream_switch('1'):
+                tng.scope.npu_wait_tensor(shared_hidden_states, shared_hidden_states)
+                x, dynamic_scale = torch_npu.npu_dynamic_quant(shared_hidden_states)
+                gate_up = torch_npu.npu_quant_matmul(
+                    x,
+                    self.shared_experts.gate_up_proj.weight,
+                    self.shared_experts.gate_up_proj.weight_scale,
+                    output_dtype=torch.int32,
+                )
+
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(local_hidden_states)
 
-        router_hidden_states = self.experts(
-            hidden_states=local_hidden_states,
-            router_logits=router_logits,
-            is_prefill=is_prefill,
-            top_k=CustomDeepseekV2MoE.top_k,
-            enable_force_load_balance=enable_force_load_balance,
-        ) * self.routed_scaling_factor
+        if moe_multi_stream:
+            router_hidden_states, shared_output = self.experts(
+                hidden_states=local_hidden_states,
+                router_logits=router_logits,
+                is_prefill=is_prefill,
+                top_k=CustomDeepseekV2MoE.top_k,
+                enable_force_load_balance=enable_force_load_balance,
+                shared_experts=self.shared_experts,
+                shared_gate_up=gate_up,
+                shared_dynamic_scale=dynamic_scale
+            )
+            router_hidden_states = router_hidden_states * self.routed_scaling_factor
+        else:
+            router_hidden_states = self.experts(
+                hidden_states=local_hidden_states,
+                router_logits=router_logits,
+                is_prefill=is_prefill,
+                top_k=CustomDeepseekV2MoE.top_k,
+                enable_force_load_balance=enable_force_load_balance,
+            ) * self.routed_scaling_factor
 
         if self.tp_size > 1:
             dist.all_gather(list(chunk_hidden_states), router_hidden_states,
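
The torchair scopes introduced above are the heart of the change: operations issued inside tng.scope.npu_stream_switch('1') are captured onto a secondary stream when the graph is compiled, and tng.scope.npu_wait_tensor(a, b) orders the next consumer of a after b has been produced. During decode this lets the shared-expert GEMMs run concurrently with the routed-expert dispatch and grouped matmuls instead of after them. A minimal sketch of the pattern, assuming an Ascend NPU with torch_npu and torchair available (the two nn.Linear layers are hypothetical stand-ins for the real expert kernels, not the committed code):

```python
import torch
from torch import nn
import torch_npu  # noqa: F401  (registers the NPU backend)
import torchair as tng


class OverlappedMoEBlock(nn.Module):
    """Toy block showing the two-stream shared/routed expert overlap."""

    def __init__(self, hidden: int) -> None:
        super().__init__()
        self.routed = nn.Linear(hidden, hidden)  # stand-in for routed experts
        self.shared = nn.Linear(hidden, hidden)  # stand-in for shared experts

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Secondary stream '1': start the shared expert as soon as x exists.
        with tng.scope.npu_stream_switch('1'):
            # Wait only on x itself, so nothing serializes against the
            # routed-expert kernels issued on the default stream below.
            tng.scope.npu_wait_tensor(x, x)
            shared_out = self.shared(x)
        # Default stream: routed experts run concurrently with the above.
        routed_out = self.routed(x)
        # torchair inserts the event sync when the two results are combined.
        return routed_out + shared_out
```

In the committed code the split point is finer grained: only the quantized gate_up matmul of the shared expert is issued here in deepseek_v2.py, and the rest of the shared FFN is interleaved inside apply_mlp (see w8a8_dynamic.py below), so each shared-expert stage waits only on the routed tensor it actually races with.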

vllm_ascend/ops/fused_moe.py

Lines changed: 10 additions & 2 deletions

@@ -694,7 +694,8 @@ def forward(self,
                 router_logits: torch.Tensor,
                 is_prefill: bool,
                 enable_force_load_balance: bool = False,
-                top_k=None):
+                top_k=None,
+                **kwargs):
         assert self.quant_method is not None
 
         if top_k:
@@ -722,7 +723,12 @@ def forward(self,
             e_score_correction_bias=self.e_score_correction_bias,
             is_prefill=is_prefill,
             enable_force_load_balance=enable_force_load_balance,
-            dp_size=self.dp_size)
+            dp_size=self.dp_size,
+            **kwargs)
+
+        multi_stream = isinstance(final_hidden_states, tuple)
+        if multi_stream:
+            final_hidden_states, shared_output = final_hidden_states
 
         if VLLM_ENABLE_MC2 and not is_prefill:
             ...
@@ -731,4 +737,6 @@ def forward(self,
             final_hidden_states = tensor_model_parallel_all_reduce(
                 final_hidden_states)
 
+        if multi_stream:
+            return final_hidden_states, shared_output
         return final_hidden_states
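
The isinstance(final_hidden_states, tuple) check above is the convention the commit uses to signal, without touching every intermediate signature, that the layers further down also produced the shared-expert output. The same kwargs-forwarding plus tuple-unpacking idea in isolation, as a runnable toy that needs no NPU (all names here are illustrative, not vLLM APIs):

```python
from typing import Any, Callable, Dict, Optional, Tuple, Union


def apply_mlp_toy(x: float, **kwargs: Any) -> Union[float, Tuple[float, float]]:
    # Innermost layer: optionally also computes a "shared expert" result
    # when the caller threaded the extra objects through **kwargs.
    shared: Optional[Callable[[float], float]] = kwargs.get("shared_experts")
    routed = x * 2.0
    if shared is not None:
        return routed, shared(x)  # tuple signals the multi-stream path
    return routed


def fused_experts_toy(x: float, **kwargs: Any):
    return apply_mlp_toy(x, **kwargs)  # kwargs forwarded unchanged


def moe_forward_toy(x: float, use_shared: bool) -> Tuple[float, Optional[float]]:
    kwargs: Dict[str, Any] = {}
    if use_shared:
        kwargs["shared_experts"] = lambda v: v + 1.0
    out = fused_experts_toy(x, **kwargs)
    if isinstance(out, tuple):  # same check the commit adds in forward()
        routed, shared_out = out
    else:
        routed, shared_out = out, None
    return routed, shared_out


print(moe_forward_toy(3.0, use_shared=True))   # (6.0, 4.0)
print(moe_forward_toy(3.0, use_shared=False))  # (6.0, None)
```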

vllm_ascend/quantization/quant_config.py

Lines changed: 1 addition & 1 deletion

@@ -330,7 +330,7 @@ def apply(
             layer, x, router_logits, top_k, renormalize, use_grouped_topk,
             global_num_experts, expert_map, topk_group, num_expert_group,
             custom_routing_function, scoring_func, e_score_correction_bias,
-            is_prefill, enable_force_load_balance, dp_size)
+            is_prefill, enable_force_load_balance, dp_size, **kwargs)
 
     def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         if hasattr(self.quant_method, "process_weights_after_loading"):

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 53 additions & 10 deletions

@@ -20,7 +20,8 @@
 import torch
 import torch.distributed as dist
 import torch_npu
-from vllm.distributed import GroupCoordinator
+import torchair as tng
+from vllm.distributed import GroupCoordinator, tensor_model_parallel_all_reduce
 
 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.distributed.parallel_state import get_ep_group
@@ -36,7 +37,8 @@ def apply_mlp(hidden_states_wrapper: List[torch.Tensor],
               w2_scale: torch.Tensor,
               group_list: torch.Tensor,
               dynamic_scale: torch.Tensor = None,
-              group_list_type: int = 1) -> torch.Tensor:
+              group_list_type: int = 1,
+              **kwargs) -> torch.Tensor:
     """
     apply MLP: gate_up_proj -> swiglu -> down_proj
 
@@ -68,6 +70,23 @@ def apply_mlp(hidden_states_wrapper: List[torch.Tensor],
     else:
         pertoken_scale = dynamic_scale
 
+    shared_experts = kwargs.get('shared_experts', None)
+    if shared_experts:
+        shared_gate_up = kwargs.get('shared_gate_up', None)
+        shared_dynamic_scale = kwargs.get('shared_dynamic_scale', None)
+        with tng.scope.npu_stream_switch('1'):
+            tng.scope.npu_wait_tensor(shared_gate_up, hidden_states)
+            shared_x, shared_dynamic_scale = torch_npu.npu_dequant_swiglu_quant(
+                x=shared_gate_up,
+                weight_scale=shared_experts.gate_up_proj.weight_scale_fp32,
+                activation_scale=shared_dynamic_scale,
+                bias=None,
+                quant_scale=None,
+                quant_offset=None,
+                group_index=None,
+                activate_left=True,
+                quant_mode=1)
+
     # gmm1: gate_up_proj
     hidden_states = torch_npu.npu_grouped_matmul(
         x=[hidden_states],
@@ -96,6 +115,21 @@ def apply_mlp(hidden_states_wrapper: List[torch.Tensor],
         group_type=0,
         group_list=group_list,
         output_dtype=w2_scale.dtype)[0]
+
+    if shared_experts:
+        with tng.scope.npu_stream_switch('1'):
+            tng.scope.npu_wait_tensor(shared_x, hidden_states)
+            shared_output = torch_npu.npu_quant_matmul(
+                shared_x,
+                shared_experts.down_proj.weight,
+                shared_experts.down_proj.weight_scale,
+                pertoken_scale=shared_dynamic_scale,
+                output_dtype=torch.bfloat16,
+            )
+        if shared_experts.down_proj.reduce_results and shared_experts.down_proj.tp_size > 1:
+            shared_output = tensor_model_parallel_all_reduce(shared_output)
+    if shared_experts:
+        return hidden_states, shared_output
     return hidden_states
 
 
@@ -110,11 +144,12 @@ def fused_experts_with_mc2(
     top_k: int,
     expert_map: torch.Tensor = None,
     moe_all_to_all_group_name: str = "",
+    **kwargs
 ) -> torch.Tensor:
     global_bs = 0
     moe_expert_num = len(expert_map)
     # hidden_states = hidden_states.bfloat16()
-    kwargs = {
+    kwargs1 = {
         "x": hidden_states,
         "expert_ids": topk_ids,
         "expert_shard_type": 0,
@@ -145,9 +180,9 @@ def fused_experts_with_mc2(
         "tp_world_size": tp_size,
         "tp_rank_id": tp_rank,
     }
-    kwargs.update(stage1_kwargs)
+    kwargs1.update(stage1_kwargs)
 
-    output = torch_npu.npu_moe_distribute_dispatch(**kwargs)
+    output = torch_npu.npu_moe_distribute_dispatch(**kwargs1)
     # comm_stream.wait_stream(torch.npu.current_stream())
     expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[
         0:5]
@@ -165,10 +200,15 @@ def fused_experts_with_mc2(
         w2,
         w2_scale,
         expert_token_nums,
-        dynamic_scale=dynamic_scale)
+        dynamic_scale=dynamic_scale,
+        **kwargs)
+
+    multi_stream = isinstance(down_out_list, tuple)
+    if multi_stream:
+        down_out_list, shared_output = down_out_list
 
     # moeCombine
-    kwargs = {
+    kwargs2 = {
         "expand_x": down_out_list,
         "expert_ids": topk_ids,
         "expand_idx": expand_idx,
@@ -192,10 +232,12 @@ def fused_experts_with_mc2(
         "tp_world_size": tp_size,
         "tp_rank_id": tp_rank,
     }
-    kwargs.update(stage3_kwargs)
+    kwargs2.update(stage3_kwargs)
 
-    hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs)
+    hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs2)
 
+    if multi_stream:
+        return hidden_states, shared_output
     return hidden_states
 
 
@@ -634,7 +676,8 @@ def apply(
                 topk_ids=topk_ids,
                 top_k=top_k,
                 expert_map=expert_map,
-                moe_all_to_all_group_name=self.moe_all_to_all_group_name)
+                moe_all_to_all_group_name=self.moe_all_to_all_group_name,
+                **kwargs)
         elif dp_size == 1:
             return fused_experts(hidden_states=x,
                                  w1=layer.w13_weight,
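
Putting the deepseek_v2.py and apply_mlp pieces together, the shared expert is executed on stream '1' as a fully quantized W8A8 pipeline (dynamic activation quant, int8 gate_up matmul, fused dequant+SwiGLU+requant, int8 down matmul), interleaved with the routed grouped matmuls on the default stream. A condensed sketch of that pipeline under the same assumptions the diff makes about the projection layers (int8 weight, weight_scale, and weight_scale_fp32 attributes); this is an illustration, not the committed function:

```python
import torch
import torch_npu
import torchair as tng


def shared_expert_w8a8(hidden_states: torch.Tensor,
                       gate_up_proj, down_proj,
                       wait_on: torch.Tensor) -> torch.Tensor:
    """Quantized shared-expert FFN issued on secondary stream '1'.

    `wait_on` is whichever routed-expert tensor the shared path must be
    ordered after; tng.scope.npu_wait_tensor adds that dependency.
    """
    with tng.scope.npu_stream_switch('1'):
        tng.scope.npu_wait_tensor(hidden_states, wait_on)
        # Activation quantization: bf16 -> int8 plus a per-token scale.
        x_int8, x_scale = torch_npu.npu_dynamic_quant(hidden_states)
        # gate_up projection in int8 with int32 accumulation.
        gate_up = torch_npu.npu_quant_matmul(
            x_int8,
            gate_up_proj.weight,
            gate_up_proj.weight_scale,
            output_dtype=torch.int32,
        )
        # Fused dequant + SwiGLU + requant back to int8.
        act_int8, act_scale = torch_npu.npu_dequant_swiglu_quant(
            x=gate_up,
            weight_scale=gate_up_proj.weight_scale_fp32,
            activation_scale=x_scale,
            bias=None,
            quant_scale=None,
            quant_offset=None,
            group_index=None,
            activate_left=True,
            quant_mode=1)
        # down projection, dequantized straight to bf16.
        return torch_npu.npu_quant_matmul(
            act_int8,
            down_proj.weight,
            down_proj.weight_scale,
            pertoken_scale=act_scale,
            output_dtype=torch.bfloat16,
        )
```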
