
Commit 1074413

support moe multistream in deepseek
Signed-off-by: David9857 <985700846@qq.com>

use additional_config to enable cv parallel

Signed-off-by: David9857 <985700846@qq.com>

rename kwargs1 in fused_experts_with_mc2

Signed-off-by: David9857 <985700846@qq.com>
1 parent 6eddbd2 commit 1074413

4 files changed: +122 −33 lines changed
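For context, the new flag is read from vLLM's `additional_config`, both in `CustomDeepseekV2MoE.__init__` (via `get_current_vllm_config()`) and in `AscendFusedMoE.__init__`, so a deployment opts in through that dict when creating the engine. A minimal sketch, assuming the flag is passed through the `LLM` entry point; the model name and parallel sizes below are placeholders, only the "enable_cv_parallel" key comes from this commit:

from vllm import LLM

# Hypothetical launch snippet: "enable_cv_parallel" is the key this commit
# reads from additional_config; every other argument is illustrative only.
llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",  # placeholder model
    tensor_parallel_size=2,                # placeholder parallelism
    additional_config={"enable_cv_parallel": True},
)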

vllm_ascend/models/deepseek_v2.py

Lines changed: 44 additions & 8 deletions
@@ -30,6 +30,7 @@
 import torch
 import torch.distributed as dist
 import torch_npu
+import torchair as tng  # type: ignore
 import vllm.envs as envs
 from torch import nn
 from transformers import PretrainedConfig
@@ -177,6 +178,12 @@ def __init__(
         else:
             self.gate.e_score_correction_bias = None

+        self.enable_cv_parallel = False
+        additional_config = get_current_vllm_config().additional_config
+        if additional_config:
+            self.enable_cv_parallel = additional_config.get(
+                "enable_cv_parallel", False)
+
         self.experts = AscendFusedMoE(
             num_experts=config.n_routed_experts,
             top_k=config.num_experts_per_tok,
@@ -224,8 +231,13 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
             enable_force_load_balance = False
         num_tokens, hidden_dim = hidden_states.shape

+        cv_parallel = self.enable_cv_parallel and not is_prefill
+
         if self.n_shared_experts is not None:
-            shared_output = self.shared_experts(hidden_states)
+            if not cv_parallel:
+                shared_output = self.shared_experts(hidden_states)
+            else:
+                shared_hidden_states = hidden_states

         if self.tp_size > 1:
             # pass
@@ -247,13 +259,37 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(local_hidden_states)

-        router_hidden_states = self.experts(
-            hidden_states=local_hidden_states,
-            router_logits=router_logits,
-            is_prefill=is_prefill,
-            top_k=CustomDeepseekV2MoE.top_k,
-            enable_force_load_balance=enable_force_load_balance,
-        ) * self.routed_scaling_factor
+        if self.n_shared_experts is not None and cv_parallel:
+            with tng.scope.npu_stream_switch('cv'):
+                tng.scope.npu_wait_tensor(shared_hidden_states, router_logits)
+                x, dynamic_scale = torch_npu.npu_dynamic_quant(
+                    shared_hidden_states)
+                gate_up = torch_npu.npu_quant_matmul(
+                    x,
+                    self.shared_experts.gate_up_proj.weight,
+                    self.shared_experts.gate_up_proj.weight_scale,
+                    output_dtype=torch.int32,
+                )
+
+        if cv_parallel:
+            router_hidden_states, shared_output = self.experts(
+                hidden_states=local_hidden_states,
+                router_logits=router_logits,
+                is_prefill=is_prefill,
+                top_k=CustomDeepseekV2MoE.top_k,
+                enable_force_load_balance=enable_force_load_balance,
+                shared_experts=self.shared_experts,
+                shared_gate_up=gate_up,
+                shared_dynamic_scale=dynamic_scale)
+            router_hidden_states = router_hidden_states * self.routed_scaling_factor
+        else:
+            router_hidden_states = self.experts(
+                hidden_states=local_hidden_states,
+                router_logits=router_logits,
+                is_prefill=is_prefill,
+                top_k=CustomDeepseekV2MoE.top_k,
+                enable_force_load_balance=enable_force_load_balance,
+            ) * self.routed_scaling_factor

         if self.tp_size > 1:
             dist.all_gather(list(chunk_hidden_states), router_hidden_states,
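Read as a whole, the decode-path change above is the torchair two-stream pattern: shared-expert work is queued under `tng.scope.npu_stream_switch('cv')` so it overlaps with routing and the routed-expert path, while `tng.scope.npu_wait_tensor` keeps it ordered behind the tensors it depends on. A stripped-down sketch of that pattern, with generic callables standing in for the real quantized projections (only the two torchair scope calls are taken from the diff; everything else is illustrative):

import torchair as tng  # torchair graph-mode scopes, as used in the diff


def moe_forward_sketch(hidden_states, gate, shared_expert, routed_experts):
    # Default stream: token routing.
    router_logits = gate(hidden_states)

    # Secondary stream 'cv': shared-expert compute is issued here so it can
    # overlap with the routed-expert dispatch/compute below.
    with tng.scope.npu_stream_switch('cv'):
        # Same ordering hint the diff uses: do not consume hidden_states on
        # the side stream before router_logits has been produced.
        tng.scope.npu_wait_tensor(hidden_states, router_logits)
        shared_output = shared_expert(hidden_states)

    routed_output = routed_experts(hidden_states, router_logits)
    return routed_output + shared_output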

vllm_ascend/ops/fused_moe.py

Lines changed: 14 additions & 2 deletions
@@ -810,12 +810,18 @@ def __init__(

         self.quant_method.create_weights(layer=self, **moe_quant_params)

+        self.enable_cv_parallel = False
+        if vllm_config.additional_config:
+            self.enable_cv_parallel = vllm_config.additional_config.get(
+                "enable_cv_parallel", False)
+
     def forward(self,
                 hidden_states: torch.Tensor,
                 router_logits: torch.Tensor,
                 is_prefill: bool,
                 enable_force_load_balance: bool = False,
-                top_k=None):
+                top_k=None,
+                **kwargs):
         assert self.quant_method is not None

         if top_k:
@@ -842,7 +848,11 @@ def forward(self,
            scoring_func=self.scoring_func,
            e_score_correction_bias=self.e_score_correction_bias,
            is_prefill=is_prefill,
-           enable_force_load_balance=enable_force_load_balance)
+           enable_force_load_balance=enable_force_load_balance,
+           **kwargs)
+
+        if self.enable_cv_parallel and not is_prefill:
+            final_hidden_states, shared_output = final_hidden_states

         if VLLM_ENABLE_MC2 and not is_prefill:
             ...
@@ -851,4 +861,6 @@
         final_hidden_states = tensor_model_parallel_all_reduce(
             final_hidden_states)

+        if self.enable_cv_parallel and not is_prefill:
+            return final_hidden_states, shared_output
         return final_hidden_states
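Note that the return type of `AscendFusedMoE.forward` now depends on the flag: a plain tensor normally, and a `(final_hidden_states, shared_output)` tuple when cv parallel is active and the request is not a prefill. The DeepSeek forward above already branches on `cv_parallel`; a caller that cannot see the flag could unpack defensively with a small helper like this (hypothetical, not part of the commit):

def unpack_moe_output(out):
    # forward() returns (final_hidden_states, shared_output) when cv parallel
    # multistream is active on decode, and a single tensor otherwise.
    if isinstance(out, tuple):
        return out
    return out, None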

vllm_ascend/quantization/quant_config.py

Lines changed: 1 addition & 1 deletion
@@ -329,7 +329,7 @@ def apply(
            layer, x, router_logits, top_k, renormalize, use_grouped_topk,
            global_num_experts, expert_map, topk_group, num_expert_group,
            custom_routing_function, scoring_func, e_score_correction_bias,
-           is_prefill, enable_force_load_balance)
+           is_prefill, enable_force_load_balance, **kwargs)

    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
        if hasattr(self.quant_method, "process_weights_after_loading"):

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 63 additions & 22 deletions
@@ -20,7 +20,8 @@
 import torch
 import torch.distributed as dist
 import torch_npu
-from vllm.distributed import GroupCoordinator
+import torchair as tng  # type: ignore
+from vllm.distributed import GroupCoordinator, tensor_model_parallel_all_reduce

 import vllm_ascend.envs as envs_ascend
 from vllm_ascend.distributed.parallel_state import get_ep_group
@@ -36,7 +37,8 @@ def apply_mlp(hidden_states_wrapper: List[torch.Tensor],
               w2_scale: torch.Tensor,
               group_list: torch.Tensor,
               dynamic_scale: torch.Tensor = None,
-              group_list_type: int = 1) -> torch.Tensor:
+              group_list_type: int = 1,
+              **kwargs) -> torch.Tensor:
     """
     apply MLP: gate_up_proj -> swiglu -> down_proj

@@ -68,6 +70,23 @@ def apply_mlp(hidden_states_wrapper: List[torch.Tensor],
     else:
         pertoken_scale = dynamic_scale

+    shared_experts = kwargs.get('shared_experts', None)
+    if shared_experts:
+        shared_gate_up = kwargs.get('shared_gate_up', None)
+        shared_dynamic_scale = kwargs.get('shared_dynamic_scale', None)
+        with tng.scope.npu_stream_switch('1'):
+            tng.scope.npu_wait_tensor(shared_gate_up, hidden_states)
+            shared_x, shared_dynamic_scale = torch_npu.npu_dequant_swiglu_quant(
+                x=shared_gate_up,
+                weight_scale=shared_experts.gate_up_proj.weight_scale_fp32,
+                activation_scale=shared_dynamic_scale,
+                bias=None,
+                quant_scale=None,
+                quant_offset=None,
+                group_index=None,
+                activate_left=True,
+                quant_mode=1)
+
     # gmm1: gate_up_proj
     hidden_states = torch_npu.npu_grouped_matmul(
         x=[hidden_states],
@@ -96,25 +115,39 @@ def apply_mlp(hidden_states_wrapper: List[torch.Tensor],
         group_type=0,
         group_list=group_list,
         output_dtype=w2_scale.dtype)[0]
+
+    if shared_experts:
+        with tng.scope.npu_stream_switch('1'):
+            tng.scope.npu_wait_tensor(shared_x, hidden_states)
+            shared_output = torch_npu.npu_quant_matmul(
+                shared_x,
+                shared_experts.down_proj.weight,
+                shared_experts.down_proj.weight_scale,
+                pertoken_scale=shared_dynamic_scale,
+                output_dtype=torch.bfloat16,
+            )
+        if shared_experts.down_proj.reduce_results and shared_experts.down_proj.tp_size > 1:
+            shared_output = tensor_model_parallel_all_reduce(shared_output)
+    if shared_experts:
+        return hidden_states, shared_output
     return hidden_states


-def fused_experts_with_mc2(
-        hidden_states: torch.Tensor,
-        w1: torch.Tensor,
-        w2: torch.Tensor,
-        w1_scale: torch.Tensor,
-        w2_scale: torch.Tensor,
-        topk_weights: torch.Tensor,
-        topk_ids: torch.Tensor,
-        top_k: int,
-        expert_map: torch.Tensor = None,
-        moe_all_to_all_group_name: str = "",
-) -> torch.Tensor:
+def fused_experts_with_mc2(hidden_states: torch.Tensor,
+                           w1: torch.Tensor,
+                           w2: torch.Tensor,
+                           w1_scale: torch.Tensor,
+                           w2_scale: torch.Tensor,
+                           topk_weights: torch.Tensor,
+                           topk_ids: torch.Tensor,
+                           top_k: int,
+                           expert_map: torch.Tensor = None,
+                           moe_all_to_all_group_name: str = "",
+                           **kwargs) -> torch.Tensor:
     global_bs = 0
     moe_expert_num = len(expert_map)
     # hidden_states = hidden_states.bfloat16()
-    kwargs = {
+    kwargs_mc2 = {
         "x": hidden_states,
         "expert_ids": topk_ids,
         "expert_shard_type": 0,
@@ -145,9 +178,9 @@ def fused_experts_with_mc2(
         "tp_world_size": tp_size,
         "tp_rank_id": tp_rank,
     }
-    kwargs.update(stage1_kwargs)
+    kwargs_mc2.update(stage1_kwargs)

-    output = torch_npu.npu_moe_distribute_dispatch(**kwargs)
+    output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2)
     # comm_stream.wait_stream(torch.npu.current_stream())
     expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[
         0:5]
@@ -165,10 +198,15 @@ def fused_experts_with_mc2(
         w2,
         w2_scale,
         expert_token_nums,
-        dynamic_scale=dynamic_scale)
+        dynamic_scale=dynamic_scale,
+        **kwargs)
+
+    multi_stream = isinstance(down_out_list, tuple)
+    if multi_stream:
+        down_out_list, shared_output = down_out_list

     # moeCombine
-    kwargs = {
+    kwargs_mc2 = {
         "expand_x": down_out_list,
         "expert_ids": topk_ids,
         "expand_idx": expand_idx,
@@ -192,10 +230,12 @@ def fused_experts_with_mc2(
         "tp_world_size": tp_size,
         "tp_rank_id": tp_rank,
     }
-    kwargs.update(stage3_kwargs)
+    kwargs_mc2.update(stage3_kwargs)

-    hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs)
+    hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2)

+    if multi_stream:
+        return hidden_states, shared_output
     return hidden_states


@@ -633,7 +673,8 @@ def apply(
                topk_ids=topk_ids,
                top_k=top_k,
                expert_map=expert_map,
-               moe_all_to_all_group_name=self.moe_all_to_all_group_name)
+               moe_all_to_all_group_name=self.moe_all_to_all_group_name,
+               **kwargs)
        elif self.ep_group.world_size == 1:
            return fused_experts(hidden_states=x,
                                 w1=layer.w13_weight,
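The `kwargs` to `kwargs_mc2` rename in `fused_experts_with_mc2` matters because the function now also accepts `**kwargs` carrying the shared-expert tensors; rebinding the name `kwargs` to the dispatch/combine argument dicts would have clobbered those extras before they were forwarded to `apply_mlp`. A toy illustration of that shadowing hazard (illustrative helpers, not the real ops):

def mc2_like(x, **kwargs):
    # BAD: writing `kwargs = {...}` here would rebind the name and silently
    # drop the caller-supplied extras (shared_experts, shared_gate_up, ...).
    # GOOD: keep a separate dict for the dispatch arguments, as the diff does
    # with kwargs_mc2, and pass **kwargs through untouched.
    kwargs_mc2 = {"x": x}
    return kwargs_mc2, kwargs


_, extras = mc2_like(0, shared_gate_up="gate_up")
assert "shared_gate_up" in extras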
