
Commit fee544e

fix ep prefill (#2762)
1 parent c4718fd commit fee544e

File tree

7 files changed: +66 −32 lines changed

custom_ops/gpu_ops/cpp_extensions.cc

Lines changed: 2 additions & 1 deletion
@@ -158,7 +158,8 @@ std::vector<paddle::Tensor> EPMoeExpertDispatchFP8(
     const paddle::Tensor &input, const paddle::Tensor &scale,
     const paddle::Tensor &topk_ids, const paddle::Tensor &topk_weights,
     const paddle::Tensor &token_nums_per_expert,
-    const paddle::Tensor &token_nums_per_expert_padded);
+    const paddle::Tensor &token_nums_per_expert_padded,
+    const bool use_in_ep, const int token_nums_this_rank_padded);
 
 std::vector<paddle::Tensor> PerTokenQuant(paddle::Tensor &input,
                                           const int block_size);

custom_ops/gpu_ops/moe/ep_moe_prefill_func.cu

Lines changed: 9 additions & 7 deletions
@@ -870,7 +870,9 @@ std::vector<paddle::Tensor> EPMoeExpertDispatchFP8(
     const paddle::Tensor& topk_ids,
     const paddle::Tensor& topk_weights,
     const paddle::Tensor& num_experts_per_rank_tensor,
-    const paddle::Tensor& num_experts_per_rank_padded_tensor) {
+    const paddle::Tensor& num_experts_per_rank_padded_tensor,
+    const bool use_in_ep,
+    const int token_nums_this_rank_padded) {
     const auto input_type = input.dtype();
     const int moe_topk = topk_ids.dims()[1];
     auto place = input.place();
@@ -886,22 +888,21 @@ std::vector<paddle::Tensor> EPMoeExpertDispatchFP8(
     const int hidden_size = input.dims()[input_dims.size() - 1];
     const int num_experts_per_rank = num_experts_per_rank_tensor.dims()[0];
 
-    int32_t token_nums_this_rank_padded = token_rows * moe_topk + num_experts_per_rank * (128-1);
-    // token_nums_this_rank_padded = token_nums_this_rank_padded_useless;
+    int32_t token_nums_feed_to_ffn = use_in_ep ? token_nums_this_rank_padded : token_rows * moe_topk + num_experts_per_rank * (128-1);
 
     auto permute_input = GetEmptyTensor(
-        {token_nums_this_rank_padded, hidden_size},
+        {token_nums_feed_to_ffn, hidden_size},
         input_type,
         place);
     auto permute_scale = GetEmptyTensor(
-        {token_nums_this_rank_padded, hidden_size / 128},
+        {token_nums_feed_to_ffn, hidden_size / 128},
         paddle::DataType::FLOAT32,
         place);
 
-    auto m_indices = paddle::full({token_nums_this_rank_padded}, -1, paddle::DataType::INT32, place);
+    auto m_indices = paddle::full({token_nums_feed_to_ffn}, -1, paddle::DataType::INT32, place);
     auto token_nums_per_expert_cumsum = GetEmptyTensor({num_experts_per_rank}, paddle::DataType::INT64, place);
     auto token_nums_per_expert_padded_cumsum = GetEmptyTensor({num_experts_per_rank}, paddle::DataType::INT64, place);
-    auto dst_weights = GetEmptyTensor({token_nums_this_rank_padded}, paddle::DataType::FLOAT32, place);
+    auto dst_weights = GetEmptyTensor({token_nums_feed_to_ffn}, paddle::DataType::FLOAT32, place);
     auto dst_indices = GetEmptyTensor({num_rows, num_experts_per_rank}, paddle::DataType::INT32, place);
     auto permute_indices_per_token = paddle::full({num_experts_per_rank, num_rows}, -1, paddle::DataType::INT32, place);
     auto cumsum_idx_gpu = paddle::full({num_experts_per_rank}, 0, paddle::DataType::INT32, place);
@@ -949,4 +950,5 @@ PD_BUILD_STATIC_OP(ep_moe_expert_dispatch_fp8)
              "dst_indices",
              "cumsum_idx_gpu",
              "m_indices"})
+    .Attrs({"use_in_ep:bool", "token_nums_this_rank_padded:int"})
     .SetKernelFn(PD_KERNEL(EPMoeExpertDispatchFP8));
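
The sizing change above is the core of the prefill fix: on the EP path the padded token count for this rank is already known to the caller, so the dispatch buffers no longer have to be allocated at the worst-case size. A minimal Python sketch of the selection logic, mirroring the token_nums_feed_to_ffn ternary in the kernel (the 128-row block comes from the num_experts_per_rank * (128-1) bound):

def feed_to_ffn_rows(token_rows, moe_topk, num_experts_per_rank,
                     use_in_ep, token_nums_this_rank_padded):
    # EP prefill: the caller passes the exact padded row count for this rank,
    # so permute_input / permute_scale / m_indices are sized to it.
    if use_in_ep:
        return token_nums_this_rank_padded
    # TP path (use_in_ep=False): keep the old worst-case bound, where each
    # local expert may add up to 127 padding rows on top of token_rows * moe_topk.
    return token_rows * moe_topk + num_experts_per_rank * (128 - 1)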

fastdeploy/model_executor/layers/attention/append_attn_backend.py

Lines changed: 3 additions & 8 deletions
@@ -33,6 +33,7 @@
 from fastdeploy.model_executor.layers.attention.attention import Attention
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend, AttentionMetadata)
+from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id
 from fastdeploy.worker.forward_meta import ForwardMeta
 
 
@@ -91,7 +92,6 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate: bool = self.speculative_method is not None
         self.speculate_max_draft_token_num: int = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.tensor_parallel_rank
 
         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
@@ -104,16 +104,11 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_pd_disaggregation: int = int(
             os.getenv("FLAGS_use_pd_disaggregation", 0))
         self.start_layer_index: int = fd_config.model_config.start_layer_index
-        self.device_id: int = os.getenv("CUDA_VISIBLE_DEVICES", None)
 
         if fd_config.parallel_config.expert_parallel_rank is None:
             fd_config.parallel_config.expert_parallel_rank = 0
-        device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \
-            fd_config.parallel_config.expert_parallel_rank
-        if self.device_id is None:
-            self.device_id = device_id
-        else:
-            self.device_id = self.device_id.split(",")[device_id]
+
+        self.rank, self.device_id = init_rank_and_device_id(fd_config)
 
     def init_attention_metadata(self, forward_meta: ForwardMeta):
         """Initialize attntion metadata hence all layers in the forward pass can reuse it."""

fastdeploy/model_executor/layers/attention/flash_attn_backend.py

Lines changed: 3 additions & 8 deletions
@@ -34,6 +34,7 @@
 from fastdeploy.model_executor.layers.attention.ops import (
     get_block_shape_and_split_kv_block, gqa_rope_write_cache,
     init_signal_layerwise, open_shm_and_get_meta_signal, pre_cache_len_concat)
+from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id
 from fastdeploy.worker.forward_meta import ForwardMeta
 
 
@@ -100,22 +101,16 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate = self.speculative_method is not None
         self.speculate_max_draft_token_num = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.tensor_parallel_rank
 
         # pd_disaggregation
         self.use_pd_disaggregation: int = int(
             os.getenv("FLAGS_use_pd_disaggregation", 0))
         self.start_layer_index: int = fd_config.model_config.start_layer_index
-        self.device_id: int = os.getenv("CUDA_VISIBLE_DEVICES", None)
 
         if fd_config.parallel_config.expert_parallel_rank is None:
             fd_config.parallel_config.expert_parallel_rank = 0
-        device_id = self.rank + fd_config.parallel_config.tensor_parallel_degree * \
-            fd_config.parallel_config.expert_parallel_rank
-        if self.device_id is None:
-            self.device_id = device_id
-        else:
-            self.device_id = self.device_id.split(",")[device_id]
+
+        self.rank, self.device_id = init_rank_and_device_id(fd_config)
 
     def get_attntion_meta(self):
         """get_attntion_meta"""

fastdeploy/model_executor/layers/attention/mla_attention_backend.py

Lines changed: 3 additions & 5 deletions
@@ -41,6 +41,7 @@
 from fastdeploy.model_executor.layers.attention.attention import Attention
 from fastdeploy.model_executor.layers.attention.base_attention_backend import (
     AttentionBackend, AttentionMetadata)
+from fastdeploy.model_executor.layers.attention.utils import init_rank_and_device_id
 from fastdeploy.worker.forward_meta import ForwardMeta
 
 
@@ -109,7 +110,6 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
         self.use_speculate: bool = self.speculative_method is not None
         self.speculate_max_draft_token_num: int = fd_config.speculative_config.num_speculative_tokens
         self.keep_pd_step_flag: bool = fd_config.speculative_config.model_type == "mtp"
-        self.rank: int = fd_config.parallel_config.tensor_parallel_rank
 
         self.kv_num_heads: int = kv_num_heads
         self.num_heads: int = num_heads
@@ -135,10 +135,8 @@ def __init__(self, fd_config: FDConfig, kv_num_heads: int, num_heads: int,
             os.getenv("FLAGS_use_pd_disaggregation", 0))
         self.start_layer_index: int = fd_config.model_config.start_layer_index
         self.device_id: int = os.getenv("CUDA_VISIBLE_DEVICES", None)
-        if self.device_id is None:
-            self.device_id = self.rank
-        else:
-            self.device_id = self.device_id.split(",")[self.rank]
+
+        self.rank, self.device_id = init_rank_and_device_id(fd_config)
 
     def init_attention_metadata(self, forward_meta: ForwardMeta):
         """Initialize attention metadata hence all layers in the forward pass can reuse it."""

fastdeploy/model_executor/layers/attention/utils.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
+"""
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+
+import os
+
+from fastdeploy.config import FDConfig
+
+
+def init_rank_and_device_id(fd_config: FDConfig):
+    """Derive this worker's global rank and CUDA device id from the parallel config."""
+    rank = (fd_config.parallel_config.expert_parallel_rank *
+            fd_config.parallel_config.tensor_parallel_degree + fd_config.parallel_config.tensor_parallel_rank)
+
+    cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", None)
+
+    if cuda_visible_devices is None:
+        device_id = rank
+    else:
+        cuda_visible_devices = cuda_visible_devices.split(",")
+        rank_index = rank % len(cuda_visible_devices)
+        device_id = cuda_visible_devices[rank_index]
+
+    return rank, device_id
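
This helper consolidates the slightly different rank/device-id snippets that the three attention backends carried, and indexes CUDA_VISIBLE_DEVICES with a rank derived from both the expert-parallel and tensor-parallel ranks. A small self-contained sketch that re-derives the same mapping (building a full FDConfig is skipped here, so the parallel settings are passed in directly):

import os

def expected_rank_and_device(ep_rank, tp_degree, tp_rank):
    # Same arithmetic as init_rank_and_device_id, with the config fields inlined.
    rank = ep_rank * tp_degree + tp_rank
    visible = os.getenv("CUDA_VISIBLE_DEVICES")
    if visible is None:
        return rank, rank
    devices = visible.split(",")
    return rank, devices[rank % len(devices)]

# Example: with CUDA_VISIBLE_DEVICES="2,3,6,7", tp_degree=2, ep_rank=1, tp_rank=0,
# the global rank is 2 and the worker binds to physical GPU "6".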

fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py

Lines changed: 10 additions & 3 deletions
@@ -144,7 +144,10 @@ def apply_ep_prefill(
         if token_all_num > 0:
             logger.info(f"token_all_num {token_all_num}")
             (recv_x, recv_x_scale) = recv_x
-            tmp = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)
+
+            token_nums_this_rank = count_tokens_per_expert_func(recv_topk_idx, layer.num_local_experts)
+            token_nums_this_rank_padded = sum(token_nums_this_rank[1].numpy().tolist())
+
             (
                 permute_input,
                 permute_scale,
@@ -160,8 +163,10 @@ def apply_ep_prefill(
                 recv_x_scale,
                 recv_topk_idx,
                 recv_topk_weights,
-                tmp[0],
-                tmp[1]
+                token_nums_this_rank[0],
+                token_nums_this_rank[1],
+                True, # use_in_ep
+                token_nums_this_rank_padded,
             )
 
             permute_scale = permute_scale.transpose([1, 0]).contiguous()
@@ -328,6 +333,8 @@ def apply_tp(
                 topk_weights,
                 tmp[0],
                 tmp[1],
+                False, # use_in_ep
+                -1,
             )
 
             permute_scale = permute_scale.transpose([1, 0]).contiguous()
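
On the EP prefill path the caller now precomputes the exact padded token count and passes it to the dispatch op with use_in_ep=True, while apply_tp keeps the old worst-case sizing by passing False and -1. A hedged sketch of that precompute, assuming (as the call above suggests) that count_tokens_per_expert_func returns a pair of per-expert token counts and per-expert counts padded to the kernel's 128-row blocks:

import paddle

def padded_token_count(token_nums_per_expert_padded):
    # Mirrors token_nums_this_rank_padded = sum(token_nums_this_rank[1].numpy().tolist())
    return int(sum(token_nums_per_expert_padded.numpy().tolist()))

# Made-up counts for three local experts, already padded to multiples of 128:
# 128 + 256 + 0 rows -> 384 rows fed to the grouped FFN on this rank.
print(padded_token_count(paddle.to_tensor([128, 256, 0], dtype="int64")))  # 384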
