@@ -20,7 +20,6 @@
 import torch
 import torch.distributed as dist
 import torch_npu
-import torchair as tng  # type: ignore
 from vllm.config import get_current_vllm_config
 from vllm.distributed import (GroupCoordinator,
                               get_tensor_model_parallel_world_size,
@@ -87,13 +86,6 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor,
     expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[
         0:5]
 
-    shared_experts = kwargs.get('shared_experts', None)
-    if shared_experts:
-        shared_gate_up = kwargs.get('shared_gate_up', None)
-        with tng.scope.npu_stream_switch('cv'):
-            tng.scope.npu_wait_tensor(shared_gate_up, expand_x)
-            shared_x = shared_experts.act_fn(shared_gate_up)
-
     w1 = w1.transpose(1, 2)
     expert_token_nums = torch.cumsum(expert_token_nums,
                                      dim=0,
@@ -122,11 +114,6 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor,
         group_list=group_list,
     )
 
-    if shared_experts:
-        with tng.scope.npu_stream_switch('cv'):
-            tng.scope.npu_wait_tensor(shared_x, down_out_list)
-            shared_output = shared_experts.down_proj(shared_x)
-
     down_out_list = torch.cat(down_out_list, dim=0)
 
     # moeCombine
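The context lines in the two hunks above take a running sum of `expert_token_nums` and feed a `group_list` to the grouped matmul. Below is a small plain-PyTorch illustration of that relationship; the tie between the cumulative counts and `group_list` is inferred from the visible context lines, not shown explicitly in this diff.

```python
# Plain-PyTorch illustration (CPU, no NPU needed): per-expert token counts
# turned into cumulative offsets, the usual form of a grouped-matmul
# `group_list`. The connection to the code above is an inference from the
# visible context lines, not something this diff states directly.
import torch

expert_token_nums = torch.tensor([3, 0, 5, 2])       # tokens routed to experts 0..3
group_list = torch.cumsum(expert_token_nums, dim=0,
                          dtype=torch.int64)          # -> tensor([ 3,  3,  8, 10])
# Expert i owns rows group_list[i-1]:group_list[i] of the flattened token
# buffer (expert 0 owns rows 0:group_list[0]).
print(group_list)
```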
@@ -156,8 +143,6 @@ def fused_experts_with_mc2(hidden_states: torch.Tensor,
 
     hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2)
 
-    if shared_experts:
-        return hidden_states, shared_output
     return hidden_states
 
 
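For context, the removed lines ran the shared-expert MLP under torchair's stream-switch scope so it could overlap with the routed-expert work on the default stream. The sketch below reconstructs that pattern from the removed lines only; the wrapper function and the `routed_expert_fn` placeholder are illustrative and not part of the codebase, and it assumes an Ascend NPU environment with torch_npu and torchair available.

```python
# Minimal sketch (assumptions noted above) of the multi-stream overlap the
# removed lines implemented. Only the torchair calls visible in the diff are
# used; `routed_expert_fn` stands in for the grouped matmuls that run on the
# default stream in the original code.
import torchair as tng  # type: ignore


def shared_expert_overlap(shared_experts, shared_gate_up, expand_x,
                          routed_expert_fn):
    # Gate/up activation on the side stream 'cv'; npu_wait_tensor makes
    # consumers of shared_gate_up wait until expand_x (the MC2 dispatch
    # output) has been produced on the main stream.
    with tng.scope.npu_stream_switch('cv'):
        tng.scope.npu_wait_tensor(shared_gate_up, expand_x)
        shared_x = shared_experts.act_fn(shared_gate_up)

    # Meanwhile the routed experts run on the default stream
    # (grouped matmuls over expand_x in the original code).
    down_out_list = routed_expert_fn(expand_x)

    # Shared-expert down projection, again on the side stream, sequenced
    # after the routed-expert output via another npu_wait_tensor.
    with tng.scope.npu_stream_switch('cv'):
        tng.scope.npu_wait_tensor(shared_x, down_out_list)
        shared_output = shared_experts.down_proj(shared_x)

    return shared_output, down_out_list
```

With the block removed, `fused_experts_with_mc2` no longer computes `shared_output` and always returns a single `hidden_states` tensor.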