@@ -20,7 +20,6 @@
 import torch
 import torch.distributed as dist
 import torch_npu
-import torchair as tng  # type: ignore
 from vllm.config import get_current_vllm_config
 from vllm.distributed import (GroupCoordinator,
                               get_tensor_model_parallel_world_size,
@@ -87,13 +86,6 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor,
     expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[
         0:5]
 
-    shared_experts = kwargs.get('shared_experts', None)
-    if shared_experts:
-        shared_gate_up = kwargs.get('shared_gate_up', None)
-        with tng.scope.npu_stream_switch('cv'):
-            tng.scope.npu_wait_tensor(shared_gate_up, expand_x)
-            shared_x = shared_experts.act_fn(shared_gate_up)
-
     w1 = w1.transpose(1, 2)
     expert_token_nums = torch.cumsum(expert_token_nums,
                                      dim=0,
@@ -122,11 +114,6 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor,
         group_list=group_list,
     )
 
-    if shared_experts:
-        with tng.scope.npu_stream_switch('cv'):
-            tng.scope.npu_wait_tensor(shared_x, down_out_list)
-            shared_output = shared_experts.down_proj(shared_x)
-
     down_out_list = torch.cat(down_out_list, dim=0)
 
     # moeCombine
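The context lines in the two hunks above take a running sum of `expert_token_nums` and feed a `group_list` to the grouped matmul. Below is a small plain-PyTorch illustration of that relationship; the tie between the cumulative counts and `group_list` is inferred from the visible context lines, not shown explicitly in this diff.

```python
# Plain-PyTorch illustration (CPU, no NPU needed): per-expert token counts
# turned into cumulative offsets, the usual form of a grouped-matmul
# `group_list`. The connection to the code above is an inference from the
# visible context lines, not something this diff states directly.
import torch

expert_token_nums = torch.tensor([3, 0, 5, 2])       # tokens routed to experts 0..3
group_list = torch.cumsum(expert_token_nums, dim=0,
                          dtype=torch.int64)          # -> tensor([ 3,  3,  8, 10])
# Expert i owns rows group_list[i-1]:group_list[i] of the flattened token
# buffer (expert 0 owns rows 0:group_list[0]).
print(group_list)
```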
@@ -156,8 +143,6 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor,
 
     hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2)
 
-    if shared_experts:
-        return hidden_states, shared_output
     return hidden_states
 
 
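For context, the removed lines ran the shared-expert MLP under torchair's stream-switch scope so it could overlap with the routed-expert work on the default stream. The sketch below reconstructs that pattern from the removed lines only; the wrapper function and the `routed_expert_fn` placeholder are illustrative and not part of the codebase, and it assumes an Ascend NPU environment with torch_npu and torchair available.

```python
# Minimal sketch (assumptions noted above) of the multi-stream overlap the
# removed lines implemented. Only the torchair calls visible in the diff are
# used; `routed_expert_fn` stands in for the grouped matmuls that run on the
# default stream in the original code.
import torchair as tng  # type: ignore


def shared_expert_overlap(shared_experts, shared_gate_up, expand_x,
                          routed_expert_fn):
    # Gate/up activation on the side stream 'cv'; npu_wait_tensor makes
    # consumers of shared_gate_up wait until expand_x (the MC2 dispatch
    # output) has been produced on the main stream.
    with tng.scope.npu_stream_switch('cv'):
        tng.scope.npu_wait_tensor(shared_gate_up, expand_x)
        shared_x = shared_experts.act_fn(shared_gate_up)

    # Meanwhile the routed experts run on the default stream
    # (grouped matmuls over expand_x in the original code).
    down_out_list = routed_expert_fn(expand_x)

    # Shared-expert down projection, again on the side stream, sequenced
    # after the routed-expert output via another npu_wait_tensor.
    with tng.scope.npu_stream_switch('cv'):
        tng.scope.npu_wait_tensor(shared_x, down_out_list)
        shared_output = shared_experts.down_proj(shared_x)

    return shared_output, down_out_list
```

With the block removed, `fused_experts_with_mc2` no longer computes `shared_output` and always returns a single `hidden_states` tensor.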