@@ -161,7 +161,7 @@ __global__ void combine_prmt_back_kernel(
         expanded_permuted_rows + expanded_permuted_row * cols;  // value at the permuted position
     Load<T, VEC_SIZE>(expanded_permuted_rows_row_ptr + tid * VEC_SIZE, &load_vec);
     const int expert_idx = expert_for_source_row[k_offset];  // expert assigned to this position
-    const T* bias_ptr = bias ? bias + expert_idx * cols : nullptr;  // ffn2 bias for the current expert
+    const T* bias_ptr = bias ? bias + expert_idx * cols : nullptr;  // down_proj bias for the current expert
     if (bias_ptr) {
       Load<T, VEC_SIZE>(bias_ptr + tid * VEC_SIZE, &bias_vec);
 #pragma unroll
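
Note on the hunk above: combine_prmt_back_kernel walks each token's top-k expert outputs in the permuted buffer, optionally adds the selected expert's down_proj bias, and accumulates the weighted sum back into the token's output row. A minimal sketch of that reduction, with illustrative names (combine_sketch, permuted_out, row_map, expert_ids are not from this file):

// Hypothetical sketch of the combine step, not the kernel in this PR.
template <typename T>
__global__ void combine_sketch(const T* permuted_out,  // [num_rows * k, cols]
                               const T* bias,          // [num_experts, cols], may be nullptr
                               const float* weights,   // [num_rows * k] routing weights
                               const int* row_map,     // k-slot -> permuted row index
                               const int* expert_ids,  // k-slot -> expert index
                               T* out, int k, int cols) {
  const int token = blockIdx.x;  // one block per token
  for (int c = threadIdx.x; c < cols; c += blockDim.x) {
    float acc = 0.f;
    for (int j = 0; j < k; ++j) {
      const int slot = token * k + j;
      float v = static_cast<float>(permuted_out[row_map[slot] * cols + c]);
      if (bias) v += static_cast<float>(bias[expert_ids[slot] * cols + c]);
      acc += weights[slot] * v;  // weighted sum over the token's k experts
    }
    out[token * cols + c] = static_cast<T>(acc);
  }
}
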
@@ -188,7 +188,7 @@ void MoeCombineKernel(const paddle::Tensor& ffn_out,
                       const paddle::Tensor& expert_scales_float,
                       const paddle::Tensor& permute_indices_per_token,
                       const paddle::Tensor& top_k_indices,
-                      const paddle::optional<paddle::Tensor>& ffn2_bias,
+                      const paddle::optional<paddle::Tensor>& down_proj_bias,
                       const bool norm_topk_prob,
                       const float routed_scaling_factor,
                       const int num_rows,
@@ -206,7 +206,7 @@ void MoeCombineKernel(const paddle::Tensor& ffn_out,
   combine_prmt_back_kernel<<<gridx, threads, 0, stream>>>(
       ffn_out.data<data_t>(),
       output->data<data_t>(),
-      ffn2_bias ? ffn2_bias->data<data_t>() : nullptr,
+      down_proj_bias ? down_proj_bias->data<data_t>() : nullptr,
       expert_scales_float.data<float>(),
       permute_indices_per_token.data<int32_t>(),
       top_k_indices.data<int>(),
@@ -223,7 +223,7 @@ std::vector<paddle::Tensor> EPMoeExpertCombine(
     const paddle::Tensor& expert_scales_float,        // dst_weights
     const paddle::Tensor& permute_indices_per_token,  // permute_indices_per_token
     const paddle::Tensor& top_k_indices,              // dst_indices
-    const paddle::optional<paddle::Tensor>& ffn2_bias,
+    const paddle::optional<paddle::Tensor>& down_proj_bias,
     const bool norm_topk_prob,
     const float routed_scaling_factor) {
 
@@ -242,7 +242,7 @@ std::vector<paddle::Tensor> EPMoeExpertCombine(
         expert_scales_float,
         permute_indices_per_token,
         top_k_indices,
-        ffn2_bias,
+        down_proj_bias,
         norm_topk_prob,
         routed_scaling_factor,
         num_rows,
@@ -255,7 +255,7 @@ std::vector<paddle::Tensor> EPMoeExpertCombine(
         expert_scales_float,
         permute_indices_per_token,
         top_k_indices,
-        ffn2_bias,
+        down_proj_bias,
         norm_topk_prob,
         routed_scaling_factor,
         num_rows,
@@ -274,7 +274,7 @@ __global__ void permute_x_kernel(const T *src_x,
                                  const int64_t *topk_idx,
                                  const float *topk_weights,
                                  const int *token_nums_per_expert,
-                                 const float *ffn1_in_scale,
+                                 const float *up_gate_proj_in_scale,
                                  const int moe_topk,
                                  const int num_rows,
                                  const int token_nums_this_rank,
@@ -327,9 +327,9 @@ __global__ void permute_x_kernel(const T *src_x,
     // cp x
     for (int v_id = tid; v_id < hidden_size_int4; v_id += blockDim.x) {
       Load<T, vec_size>(&src_x[s_token_idx * hidden_size + v_id * vec_size], &src_vec);
-      if (ffn1_in_scale) {
+      if (up_gate_proj_in_scale) {
         for (int i = 0; i < vec_size; i++) {
-          float quant_value = max_bound * ffn1_in_scale[expert_now] * static_cast<float>(src_vec[i]);
+          float quant_value = max_bound * up_gate_proj_in_scale[expert_now] * static_cast<float>(src_vec[i]);
           if (RoundType == 0) {
             res_vec[i] = static_cast<OutT>(ClipFunc<float>(rint(quant_value), min_bound, max_bound));
           } else {
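
Note on the hunk above: when up_gate_proj_in_scale is present, the dispatch kernel quantizes each copied element with the destination expert's static input scale, rounding per RoundType (0 selects rint, i.e. round-to-nearest-even) and clipping to [min_bound, max_bound]. A standalone per-element sketch under the assumption of an int8 range (127 / -127) and round-half-away-from-zero for the non-zero RoundType; quantize_sketch and clip_f are illustrative names, not from this file:

#include <cstdint>

// Hypothetical per-element quantization mirroring the logic above.
__device__ __forceinline__ float clip_f(float v, float lo, float hi) {
  return fminf(hi, fmaxf(lo, v));
}

template <int RoundType>
__device__ int8_t quantize_sketch(float x, float expert_scale) {
  const float max_bound = 127.f, min_bound = -127.f;  // assumed int8 bounds
  const float q = max_bound * expert_scale * x;
  // RoundType == 0: round-to-nearest-even; otherwise round half away from zero.
  const float r = (RoundType == 0) ? rintf(q) : roundf(q);
  return static_cast<int8_t>(clip_f(r, min_bound, max_bound));
}
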
@@ -353,7 +353,7 @@ void EPMoeDispatchKernel(const paddle::Tensor& input,
                          const paddle::Tensor& topk_ids,
                          const paddle::Tensor& topk_weights,
                          const paddle::Tensor& token_nums_per_expert,
-                         const paddle::optional<paddle::Tensor>& ffn1_in_scale,
+                         const paddle::optional<paddle::Tensor>& up_gate_proj_in_scale,
                          const std::string& moe_quant_type,
                          const int moe_topk,
                          const int num_rows,
@@ -383,7 +383,7 @@ void EPMoeDispatchKernel(const paddle::Tensor& input,
         topk_ids.data<int64_t>(),
         topk_weights.data<float>(),
         token_nums_per_expert.data<int>(),
-        ffn1_in_scale ? ffn1_in_scale.get().data<float>() : nullptr,
+        up_gate_proj_in_scale ? up_gate_proj_in_scale.get().data<float>() : nullptr,
         moe_topk,
         num_rows,
         token_nums_this_rank,
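
This unwrap of the optional scale tensor recurs at each of the four launch sites in this function; a small helper expressing the pattern (hypothetical, not part of this PR), where a null pointer tells the kernel to skip quantization:

// Hypothetical helper; assumes "paddle/extension.h" is already included, as in
// this file, and relies only on paddle::optional's operator bool and get().
inline const float* scale_ptr_or_null(
    const paddle::optional<paddle::Tensor>& scale) {
  return scale ? scale.get().data<float>() : nullptr;
}
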
@@ -404,7 +404,7 @@ void EPMoeDispatchKernel(const paddle::Tensor& input,
         topk_ids.data<int64_t>(),
         topk_weights.data<float>(),
         token_nums_per_expert.data<int>(),
-        ffn1_in_scale ? ffn1_in_scale.get().data<float>() : nullptr,
+        up_gate_proj_in_scale ? up_gate_proj_in_scale.get().data<float>() : nullptr,
         moe_topk,
         num_rows,
         token_nums_this_rank,
@@ -427,7 +427,7 @@ void EPMoeDispatchKernel(const paddle::Tensor& input,
         topk_ids.data<int64_t>(),
         topk_weights.data<float>(),
         token_nums_per_expert.data<int>(),
-        ffn1_in_scale ? ffn1_in_scale.get().data<float>() : nullptr,
+        up_gate_proj_in_scale ? up_gate_proj_in_scale.get().data<float>() : nullptr,
         moe_topk,
         num_rows,
         token_nums_this_rank,
@@ -448,7 +448,7 @@ void EPMoeDispatchKernel(const paddle::Tensor& input,
         topk_ids.data<int64_t>(),
         topk_weights.data<float>(),
         token_nums_per_expert.data<int>(),
-        ffn1_in_scale ? ffn1_in_scale.get().data<float>() : nullptr,
+        up_gate_proj_in_scale ? up_gate_proj_in_scale.get().data<float>() : nullptr,
         moe_topk,
         num_rows,
         token_nums_this_rank,
@@ -472,7 +472,7 @@ std::vector<paddle::Tensor> EPMoeExpertDispatch(
     const paddle::Tensor& input,
     const paddle::Tensor& topk_ids,
     const paddle::Tensor& topk_weights,
-    const paddle::optional<paddle::Tensor>& ffn1_in_scale,
+    const paddle::optional<paddle::Tensor>& up_gate_proj_in_scale,
     const std::vector<int>& token_nums_per_expert,
     const int token_nums_this_rank,
     const std::string& moe_quant_type) {
@@ -516,7 +516,7 @@ std::vector<paddle::Tensor> EPMoeExpertDispatch(
         topk_ids,
         topk_weights,
         num_experts_per_rank_tensor,
-        ffn1_in_scale,
+        up_gate_proj_in_scale,
         moe_quant_type,
         moe_topk,
         num_rows,
@@ -536,7 +536,7 @@ std::vector<paddle::Tensor> EPMoeExpertDispatch(
         topk_ids,
         topk_weights,
         num_experts_per_rank_tensor,
-        ffn1_in_scale,
+        up_gate_proj_in_scale,
         moe_quant_type,
         moe_topk,
         num_rows,
@@ -568,7 +568,7 @@ std::vector<std::vector<int64_t>> EPMoeExpertDispatchInferShape(
     const std::vector<int64_t>& input_shape,
     const std::vector<int64_t>& topk_ids_shape,
     const std::vector<int64_t>& topk_weights_shape,
-    const paddle::optional<std::vector<int64_t>>& ffn1_in_scale_dtype,
+    const paddle::optional<std::vector<int64_t>>& up_gate_proj_in_scale_dtype,
     const std::vector<int>& token_nums_per_expert,
     const int token_nums_this_rank) {
   int token_rows = -1;
@@ -610,7 +610,7 @@ std::vector<paddle::DataType> EPMoeExpertDispatchInferDtype(
 
 PD_BUILD_STATIC_OP(ep_moe_expert_dispatch)
     .Inputs({"input", "topk_ids", "topk_weights",
-             paddle::Optional("ffn1_in_scale")})
+             paddle::Optional("up_gate_proj_in_scale")})
     .Outputs({"permute_input",
               "permute_indices_per_token",
               "token_nums_per_expert_cumsum",