 import torch
 import torch.distributed as dist
 import torch_npu
+import torchair as tng  # type: ignore
 from vllm.config import get_current_vllm_config
 from vllm.distributed import (GroupCoordinator,
                               get_tensor_model_parallel_world_size,
@@ -47,10 +48,11 @@ def fused_experts_with_mc2(
     top_k: int,
     expert_map: torch.Tensor = None,
     moe_all_to_all_group_name: Optional[str] = None,
+    **kwargs
 ) -> torch.Tensor:
     global_bs = 0
     moe_expert_num = len(expert_map)
-    kwargs = {
+    kwargs_mc2 = {
         "x": hidden_states,
         "expert_ids": topk_ids,
         "expert_shard_type": 0,
@@ -81,13 +83,20 @@ def fused_experts_with_mc2(
         "tp_world_size": tp_size,
         "tp_rank_id": tp_rank,
     }
-    kwargs.update(stage1_kwargs)
+    kwargs_mc2.update(stage1_kwargs)

-    output = torch_npu.npu_moe_distribute_dispatch(**kwargs)
+    output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2)
     # comm_stream.wait_stream(torch.npu.current_stream())
     expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[
         0:5]

+    shared_experts = kwargs.get('shared_experts', None)
+    if shared_experts:
+        shared_gate_up = kwargs.get('shared_gate_up', None)
+        with tng.scope.npu_stream_switch('cv'):
+            tng.scope.npu_wait_tensor(shared_gate_up, expand_x)
+            shared_x = shared_experts.act_fn(shared_gate_up)
+
     w1 = w1.transpose(1, 2)
     expert_token_nums = torch.cumsum(expert_token_nums,
                                      dim=0,
@@ -116,10 +125,15 @@ def fused_experts_with_mc2(
         group_list=group_list,
     )

+    if shared_experts:
+        with tng.scope.npu_stream_switch('cv'):
+            tng.scope.npu_wait_tensor(shared_x, down_out_list)
+            shared_output = shared_experts.down_proj(shared_x)
+
     down_out_list = torch.cat(down_out_list, dim=0)

     # moeCombine
-    kwargs = {
+    kwargs_mc2 = {
         "expand_x": down_out_list,
         "expert_ids": topk_ids,
         "expand_idx": expand_idx,
@@ -141,10 +155,12 @@ def fused_experts_with_mc2(
         "tp_world_size": tp_size,
         "tp_rank_id": tp_rank,
     }
-    kwargs.update(stage3_kwargs)
+    kwargs_mc2.update(stage3_kwargs)

-    hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs)
+    hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2)

+    if shared_experts:
+        return hidden_states, shared_output
     return hidden_states

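The two `npu_stream_switch('cv')` blocks added above overlap the shared-expert MLP with the routed-expert path: the shared activation and down projection run on a secondary stream while the MC2 dispatch/combine and the grouped matmul proceed on the default stream. A minimal sketch of that pattern, using only the torchair calls that appear in this diff; `shared_mlp`, `gate_up_out` and `dependency` are illustrative placeholders, not names from the repository:

```python
import torchair as tng  # type: ignore


def run_on_side_stream(shared_mlp, gate_up_out, dependency):
    # Route the enclosed ops onto the secondary stream tagged 'cv',
    # mirroring the usage inside fused_experts_with_mc2 above.
    with tng.scope.npu_stream_switch('cv'):
        # Hold the side stream until `dependency` (e.g. the dispatch output
        # produced on the default stream) is ready before consuming
        # `gate_up_out`, then run the shared-expert activation there.
        tng.scope.npu_wait_tensor(gate_up_out, dependency)
        return shared_mlp.act_fn(gate_up_out)
```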
@@ -664,7 +680,8 @@ def apply(
                topk_ids=topk_ids,
                top_k=top_k,
                expert_map=expert_map,
-                moe_all_to_all_group_name=self.moe_all_to_all_group_name)
+                moe_all_to_all_group_name=self.moe_all_to_all_group_name,
+                **kwargs)
        elif get_ep_group().world_size == 1:
            return fused_experts(hidden_states=x,
                                 w1=layer.w13_weight,
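With `**kwargs` now forwarded from `apply`, a caller can pass the optional `shared_experts` module and its precomputed `shared_gate_up` tensor down to `fused_experts_with_mc2`, and gets back a `(hidden_states, shared_output)` tuple when it does. A hedged call-site sketch; the wrapper name and `moe_args` are placeholders, and it assumes `fused_experts_with_mc2` is importable from the module this diff patches:

```python
def routed_plus_shared(shared_experts=None, shared_gate_up=None, **moe_args):
    """Illustrative wrapper: forwards the optional shared-expert inputs as
    keyword arguments and unpacks the tuple return introduced by this change.
    `moe_args` stands in for the existing fused_experts_with_mc2 arguments
    (hidden_states, weights, topk_ids, top_k, expert_map,
    moe_all_to_all_group_name, ...)."""
    extra = {}
    if shared_experts is not None:
        extra = {
            "shared_experts": shared_experts,  # read via kwargs.get('shared_experts')
            "shared_gate_up": shared_gate_up,  # read via kwargs.get('shared_gate_up')
        }
    out = fused_experts_with_mc2(**moe_args, **extra)
    if shared_experts is not None:
        routed_out, shared_out = out  # tuple when shared_experts is provided
        return routed_out, shared_out
    return out, None
```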