
Commit 3630856

support cv parallel for float model
Signed-off-by: David9857 <985700846@qq.com>
Parent: 1074413

File tree

3 files changed, +46 -27 lines:
  vllm_ascend/models/deepseek_v2.py
  vllm_ascend/ops/fused_moe.py
  vllm_ascend/quantization/w8a8_dynamic.py


vllm_ascend/models/deepseek_v2.py

Lines changed: 12 additions & 8 deletions
@@ -262,14 +262,18 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         if self.n_shared_experts is not None and cv_parallel:
             with tng.scope.npu_stream_switch('cv'):
                 tng.scope.npu_wait_tensor(shared_hidden_states, router_logits)
-                x, dynamic_scale = torch_npu.npu_dynamic_quant(
-                    shared_hidden_states)
-                gate_up = torch_npu.npu_quant_matmul(
-                    x,
-                    self.shared_experts.gate_up_proj.weight,
-                    self.shared_experts.gate_up_proj.weight_scale,
-                    output_dtype=torch.int32,
-                )
+                dynamic_scale = None
+                if self.shared_experts.is_dynamic_quant:
+                    x, dynamic_scale = torch_npu.npu_dynamic_quant(
+                        shared_hidden_states)
+                    gate_up = torch_npu.npu_quant_matmul(
+                        x,
+                        self.shared_experts.gate_up_proj.weight,
+                        self.shared_experts.gate_up_proj.weight_scale,
+                        output_dtype=torch.int32,
+                    )
+                else:
+                    gate_up, _ = self.gate_up_proj(shared_hidden_states)
 
         if cv_parallel:
             router_hidden_states, shared_output = self.experts(
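
Note on this hunk: with cv parallel enabled, the shared-expert gate/up projection is still issued on the secondary 'cv' stream, but only a W8A8 checkpoint goes through npu_dynamic_quant / npu_quant_matmul; a float checkpoint now falls back to the ordinary linear projection. A standalone sketch of that branch, outside the diff context (the helper name _shared_gate_up and its mlp argument are illustrative, not repository code; running it requires torch_npu/torchair on an Ascend device):

import torch
import torch_npu  # Ascend kernel bindings
import torchair as tng  # type: ignore


def _shared_gate_up(mlp, shared_hidden_states: torch.Tensor,
                    wait_on: torch.Tensor):
    """Illustrative: shared-expert gate/up projection on the 'cv' stream."""
    dynamic_scale = None
    with tng.scope.npu_stream_switch('cv'):
        # Hold the 'cv' stream until the main stream has produced `wait_on`
        # (the router logits in the real forward pass).
        tng.scope.npu_wait_tensor(shared_hidden_states, wait_on)
        if getattr(mlp, 'is_dynamic_quant', False):
            # W8A8 path: per-token dynamic quantization, then int8 matmul.
            x, dynamic_scale = torch_npu.npu_dynamic_quant(shared_hidden_states)
            gate_up = torch_npu.npu_quant_matmul(
                x,
                mlp.gate_up_proj.weight,
                mlp.gate_up_proj.weight_scale,
                output_dtype=torch.int32,
            )
        else:
            # Float path added by this commit: plain bf16/fp16 projection.
            gate_up, _ = mlp.gate_up_proj(shared_hidden_states)
    return gate_up, dynamic_scale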

vllm_ascend/ops/fused_moe.py

Lines changed: 32 additions & 17 deletions
@@ -20,6 +20,7 @@
 import torch
 import torch.distributed as dist
 import torch_npu
+import torchair as tng  # type: ignore
 from vllm.config import get_current_vllm_config
 from vllm.distributed import (GroupCoordinator,
                               get_tensor_model_parallel_world_size,
@@ -38,19 +39,18 @@
 USING_LCCL_COM: bool = envs_ascend.USING_LCCL_COM
 
 
-def fused_experts_with_mc2(
-    hidden_states: torch.Tensor,
-    w1: torch.Tensor,
-    w2: torch.Tensor,
-    topk_weights: torch.Tensor,
-    topk_ids: torch.Tensor,
-    top_k: int,
-    expert_map: torch.Tensor = None,
-    moe_all_to_all_group_name: Optional[str] = None,
-) -> torch.Tensor:
+def fused_experts_with_mc2(hidden_states: torch.Tensor,
+                           w1: torch.Tensor,
+                           w2: torch.Tensor,
+                           topk_weights: torch.Tensor,
+                           topk_ids: torch.Tensor,
+                           top_k: int,
+                           expert_map: torch.Tensor = None,
+                           moe_all_to_all_group_name: Optional[str] = None,
+                           **kwargs) -> torch.Tensor:
     global_bs = 0
     moe_expert_num = len(expert_map)
-    kwargs = {
+    kwargs_mc2 = {
         "x": hidden_states,
         "expert_ids": topk_ids,
         "expert_shard_type": 0,
@@ -81,13 +81,20 @@ def fused_experts_with_mc2(
         "tp_world_size": tp_size,
         "tp_rank_id": tp_rank,
     }
-    kwargs.update(stage1_kwargs)
+    kwargs_mc2.update(stage1_kwargs)
 
-    output = torch_npu.npu_moe_distribute_dispatch(**kwargs)
+    output = torch_npu.npu_moe_distribute_dispatch(**kwargs_mc2)
     # comm_stream.wait_stream(torch.npu.current_stream())
     expand_x, dynamic_scale, expand_idx, expert_token_nums, ep_recv_counts = output[
         0:5]
 
+    shared_experts = kwargs.get('shared_experts', None)
+    if shared_experts:
+        shared_gate_up = kwargs.get('shared_gate_up', None)
+        with tng.scope.npu_stream_switch('cv'):
+            tng.scope.npu_wait_tensor(shared_gate_up, expand_x)
+            shared_x = shared_experts.act_fn(shared_gate_up)
+
     w1 = w1.transpose(1, 2)
     expert_token_nums = torch.cumsum(expert_token_nums,
                                      dim=0,
@@ -116,10 +123,15 @@ def fused_experts_with_mc2(
         group_list=group_list,
     )
 
+    if shared_experts:
+        with tng.scope.npu_stream_switch('cv'):
+            tng.scope.npu_wait_tensor(shared_x, down_out_list)
+            shared_output = shared_experts.down_proj(shared_x)
+
     down_out_list = torch.cat(down_out_list, dim=0)
 
     # moeCombine
-    kwargs = {
+    kwargs_mc2 = {
         "expand_x": down_out_list,
         "expert_ids": topk_ids,
         "expand_idx": expand_idx,
@@ -141,10 +153,12 @@ def fused_experts_with_mc2(
         "tp_world_size": tp_size,
         "tp_rank_id": tp_rank,
     }
-    kwargs.update(stage3_kwargs)
+    kwargs_mc2.update(stage3_kwargs)
 
-    hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs)
+    hidden_states = torch_npu.npu_moe_distribute_combine(**kwargs_mc2)
 
+    if shared_experts:
+        return hidden_states, shared_output
     return hidden_states
 
 
@@ -664,7 +678,8 @@ def apply(
                 topk_ids=topk_ids,
                 top_k=top_k,
                 expert_map=expert_map,
-                moe_all_to_all_group_name=self.moe_all_to_all_group_name)
+                moe_all_to_all_group_name=self.moe_all_to_all_group_name,
+                **kwargs)
         elif get_ep_group().world_size == 1:
             return fused_experts(hidden_states=x,
                                  w1=layer.w13_weight,
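
Note on this file: fused_experts_with_mc2 now takes the shared-expert module and its precomputed gate/up output through **kwargs, runs the shared activation and down projection on the 'cv' stream between the MC2 dispatch and combine stages, and returns a (routed, shared) pair when shared experts are present. A compressed sketch of that overlap pattern (routed_dispatch / routed_experts / routed_combine and shared_mlp are placeholders standing in for the MC2 stages, not functions in the repository):

import torch
import torchair as tng  # type: ignore


def overlap_shared_with_routed(hidden_states: torch.Tensor,
                               shared_gate_up: torch.Tensor,
                               shared_mlp,
                               routed_dispatch,
                               routed_experts,
                               routed_combine):
    """Sketch of the cv-parallel scheduling used by fused_experts_with_mc2."""
    # Main stream: MC2 dispatch (all-to-all of tokens to their experts).
    expand_x = routed_dispatch(hidden_states)

    # 'cv' stream: shared-expert activation, ordered after the dispatch output
    # so the two pipelines interleave instead of serializing.
    with tng.scope.npu_stream_switch('cv'):
        tng.scope.npu_wait_tensor(shared_gate_up, expand_x)
        shared_x = shared_mlp.act_fn(shared_gate_up)

    # Main stream: grouped expert matmuls.
    down_out = routed_experts(expand_x)

    # 'cv' stream: shared-expert down projection, ordered after the expert GEMMs.
    with tng.scope.npu_stream_switch('cv'):
        tng.scope.npu_wait_tensor(shared_x, down_out)
        shared_out = shared_mlp.down_proj(shared_x)

    # Main stream: MC2 combine of the routed outputs.
    routed_out = routed_combine(down_out)
    return routed_out, shared_out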

vllm_ascend/quantization/w8a8_dynamic.py

Lines changed: 2 additions & 2 deletions
@@ -74,7 +74,7 @@ def apply_mlp(hidden_states_wrapper: List[torch.Tensor],
     if shared_experts:
         shared_gate_up = kwargs.get('shared_gate_up', None)
         shared_dynamic_scale = kwargs.get('shared_dynamic_scale', None)
-        with tng.scope.npu_stream_switch('1'):
+        with tng.scope.npu_stream_switch('cv'):
             tng.scope.npu_wait_tensor(shared_gate_up, hidden_states)
             shared_x, shared_dynamic_scale = torch_npu.npu_dequant_swiglu_quant(
                 x=shared_gate_up,
@@ -117,7 +117,7 @@ def apply_mlp(hidden_states_wrapper: List[torch.Tensor],
         output_dtype=w2_scale.dtype)[0]
 
     if shared_experts:
-        with tng.scope.npu_stream_switch('1'):
+        with tng.scope.npu_stream_switch('cv'):
             tng.scope.npu_wait_tensor(shared_x, hidden_states)
             shared_output = torch_npu.npu_quant_matmul(
                 shared_x,
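
Note on this file: the only change is the stream tag. The quantized shared-expert path in apply_mlp previously used the ad-hoc tag '1' and now uses 'cv', matching deepseek_v2.py and fused_moe.py above, so the quantized and float shared-expert paths are issued under the same secondary stream. A minimal illustration of why the tag matters (assumed behaviour, inferred from how this commit uses the torchair scope API in graph mode: blocks opened with the same tag target the same secondary stream; requires torch_npu on an Ascend device):

import torch
import torch_npu  # noqa: F401  (registers the NPU device and .npu() methods)
import torchair as tng  # type: ignore

x = torch.randn(64, 64).npu()
y = torch.randn(64, 64).npu()

with tng.scope.npu_stream_switch('cv'):
    a = torch.matmul(x, x)             # issued on the 'cv' stream

with tng.scope.npu_stream_switch('cv'):
    tng.scope.npu_wait_tensor(y, a)    # order y's consumer after a
    b = torch.matmul(y, y)             # same 'cv' stream; a different tag
                                       # such as '1' would imply another stream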
