Commit 3511331

refactor in deepseek moe
Signed-off-by: David9857 <985700846@qq.com>
1 parent 4fa61d5 commit 3511331

File tree: 3 files changed, +47 -48 lines

  vllm_ascend/models/deepseek_v2.py
  vllm_ascend/ops/fused_moe.py
  vllm_ascend/quantization/w8a8_dynamic.py
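For context, a minimal sketch of how the renamed flag would be switched on from user code. This is not part of the commit; it assumes that additional_config passed to the LLM entry point ends up in the config returned by get_current_vllm_config(), and the model name is only a placeholder.

# Hypothetical enablement sketch (not part of this commit). Assumes the LLM
# entry point forwards additional_config into the vllm config object that
# get_current_vllm_config() returns, as vllm-ascend expects.
from vllm import LLM

llm = LLM(
    model="deepseek-ai/DeepSeek-V2-Lite",  # placeholder model name
    additional_config={
        "enable_graph_mode": True,
        # renamed in this commit from "enable_cv_parallel"
        "enable_multistream_shared_expert": True,
    },
)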

vllm_ascend/models/deepseek_v2.py

Lines changed: 24 additions & 43 deletions
@@ -180,12 +180,6 @@ def __init__(
         else:
             self.gate.e_score_correction_bias = None
 
-        self.enable_cv_parallel = False
-        additional_config = get_current_vllm_config().additional_config
-        if additional_config:
-            self.enable_cv_parallel = additional_config.get(
-                "enable_cv_parallel", False)
-
         self.experts = AscendFusedMoE(
             num_experts=config.n_routed_experts,
             top_k=config.num_experts_per_tok,
@@ -222,10 +216,13 @@ def __init__(
         self.params_dtype = torch.get_default_dtype()
 
         self.enable_graph_mode = False
+        self.enable_multistream_shared_expert = False
         additional_config = get_current_vllm_config().additional_config
         if additional_config:
             self.enable_graph_mode = additional_config.get(
                 "enable_graph_mode", False)
+            self.enable_multistream_shared_expert = additional_config.get(
+                "enable_multistream_shared_expert", False)
 
     def forward(
         self,
@@ -248,10 +245,10 @@ def forward(
 
         num_tokens, hidden_size = hidden_states.shape
 
-        cv_parallel = self.enable_cv_parallel and not is_prefill
+        multistream = self.enable_multistream_shared_expert and not is_prefill
 
         if self.n_shared_experts is not None:
-            if not cv_parallel:
+            if not multistream:
                 shared_output = self.shared_experts(hidden_states)
             else:
                 shared_hidden_states = hidden_states
@@ -275,41 +272,25 @@ def forward(
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
 
-        if self.n_shared_experts is not None and cv_parallel:
-            with tng.scope.npu_stream_switch('cv'):
-                tng.scope.npu_wait_tensor(shared_hidden_states, router_logits)
-                dynamic_scale = None
-                if self.shared_experts.is_dynamic_quant:
-                    x, dynamic_scale = torch_npu.npu_dynamic_quant(
-                        shared_hidden_states)
-                    gate_up = torch_npu.npu_quant_matmul(
-                        x,
-                        self.shared_experts.gate_up_proj.weight,
-                        self.shared_experts.gate_up_proj.weight_scale,
-                        output_dtype=torch.int32,
-                    )
-                else:
-                    gate_up, _ = self.gate_up_proj(shared_hidden_states)
-
-        if cv_parallel:
-            hidden_states, shared_output = self.experts(
-                hidden_states=hidden_states,
-                router_logits=router_logits,
-                is_prefill=is_prefill,
-                top_k=CustomDeepseekV2MoE.top_k,
-                enable_force_load_balance=enable_force_load_balance,
-                shared_experts=self.shared_experts,
-                shared_gate_up=gate_up,
-                shared_dynamic_scale=dynamic_scale)
-            hidden_states = hidden_states * self.routed_scaling_factor
-        else:
-            hidden_states = self.experts(
-                hidden_states=hidden_states,
-                router_logits=router_logits,
-                is_prefill=is_prefill,
-                top_k=CustomDeepseekV2MoE.top_k,
-                enable_force_load_balance=enable_force_load_balance,
-            ) * self.routed_scaling_factor
+        kwargs = {}
+        if multistream:
+            kwargs.update({
+                "shared_experts": self.shared_experts,
+                "shared_hidden_states": shared_hidden_states
+            })
+
+        hidden_states = self.experts(
+            hidden_states=hidden_states,
+            router_logits=router_logits,
+            is_prefill=is_prefill,
+            top_k=CustomDeepseekV2MoE.top_k,
+            enable_force_load_balance=enable_force_load_balance,
+            **kwargs)
+
+        if multistream:
+            hidden_states, shared_output = hidden_states
+
+        hidden_states = hidden_states * self.routed_scaling_factor
 
         if self.tp_size > 1:
             if self.enable_graph_mode:
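Taken together, the deepseek_v2.py change collapses the two call paths into one: optional shared-expert inputs now travel through **kwargs, and the experts layer hands back a (routed, shared) pair only in multistream decode. Below is a condensed, framework-free restatement of that control flow; experts and shared_experts here are stand-in callables, not the real AscendFusedMoE modules.

# Condensed sketch of the refactored call pattern; `experts` is a stand-in
# callable that mirrors AscendFusedMoE's new contract.
def run_moe(experts, shared_experts, hidden_states, router_logits,
            is_prefill, enable_multistream_shared_expert):
    multistream = enable_multistream_shared_expert and not is_prefill

    kwargs = {}
    if multistream:
        # defer the shared-expert work to the experts layer so it can be
        # overlapped with dispatch on a second stream
        kwargs.update({
            "shared_experts": shared_experts,
            "shared_hidden_states": hidden_states,
        })
    else:
        shared_output = shared_experts(hidden_states)

    out = experts(hidden_states=hidden_states,
                  router_logits=router_logits,
                  is_prefill=is_prefill,
                  **kwargs)

    if multistream:
        out, shared_output = out  # tuple return only in this mode
    return out, shared_output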

vllm_ascend/ops/fused_moe.py

Lines changed: 5 additions & 5 deletions
@@ -834,13 +834,13 @@ def __init__(
         self.quant_method.create_weights(layer=self, **moe_quant_params)
 
         self.enable_graph_mode = False
-        self.enable_cv_parallel = False
+        self.enable_multistream_shared_expert = False
         additional_config = get_current_vllm_config().additional_config
         if additional_config:
             self.enable_graph_mode = additional_config.get(
                 "enable_graph_mode", False)
-            self.enable_cv_parallel = additional_config.get(
-                "enable_cv_parallel", False)
+            self.enable_multistream_shared_expert = additional_config.get(
+                "enable_multistream_shared_expert", False)
 
     def forward(self,
                 hidden_states: torch.Tensor,
@@ -895,7 +895,7 @@ def forward(self,
             enable_force_load_balance=enable_force_load_balance,
             **kwargs)
 
-        if self.enable_cv_parallel and not is_prefill:
+        if self.enable_multistream_shared_expert and not is_prefill:
             hidden_states, shared_output = hidden_states
 
         if self.dp_size > 1:
@@ -920,6 +920,6 @@ def forward(self,
         if self.reduce_results and (self.tp_size > 1 or self.ep_size > 1):
             hidden_states = tensor_model_parallel_all_reduce(hidden_states)
 
-        if self.enable_cv_parallel and not is_prefill:
+        if self.enable_multistream_shared_expert and not is_prefill:
             return hidden_states, shared_output
         return hidden_states

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 18 additions & 0 deletions
@@ -184,6 +184,24 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor,
     }
     kwargs_mc2.update(stage1_kwargs)
 
+    shared_experts = kwargs.get('shared_experts', None)
+    if shared_experts:
+        shared_hidden_states = kwargs.get('shared_hidden_states', None)
+        with tng.scope.npu_stream_switch('cv'):
+            tng.scope.npu_wait_tensor(shared_hidden_states, hidden_states)
+            shared_x, shared_dynamic_scale = torch_npu.npu_dynamic_quant(
+                shared_hidden_states)
+            shared_gate_up = torch_npu.npu_quant_matmul(
+                shared_x,
+                shared_experts.gate_up_proj.weight,
+                shared_experts.gate_up_proj.weight_scale,
+                output_dtype=torch.int32,
+            )
+        kwargs.update({
+            "shared_gate_up": shared_gate_up,
+            "shared_dynamic_scale": shared_dynamic_scale,
+        })
+
     output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2)
     # comm_stream.wait_stream(torch.npu.current_stream())
     expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[
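The block added to fused_experts_with_mc2 runs the shared expert's quantized gate_up projection on the side stream 'cv' so it overlaps with the MC2 dispatch on the main stream. As a rough analogy only, the same overlap pattern written with plain CUDA streams; the commit itself relies on torchair's tng.scope.npu_stream_switch / npu_wait_tensor and torch_npu quant kernels on Ascend NPUs.

import torch

# CUDA-stream analogy of the 'cv' side-stream overlap; illustrative only,
# the actual commit targets Ascend NPUs via torchair, not CUDA.
side_stream = torch.cuda.Stream()

def overlapped_step(dispatch_fn, shared_fn, hidden_states, shared_hidden_states):
    with torch.cuda.stream(side_stream):
        # counterpart of npu_wait_tensor: do not read inputs before the
        # main stream has produced them
        side_stream.wait_stream(torch.cuda.current_stream())
        shared_out = shared_fn(shared_hidden_states)  # runs concurrently

    routed_out = dispatch_fn(hidden_states)  # main-stream dispatch/MoE work

    # join the streams before the main stream consumes shared_out
    torch.cuda.current_stream().wait_stream(side_stream)
    return routed_out, shared_out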
