 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
     FusedMoEModularKernel)
-from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-    per_token_group_quant_fp8)
 from vllm.platforms import current_platform
 from vllm.utils import has_deep_ep, has_deep_gemm

@@ -81,6 +79,7 @@ class TestConfig:
     k: int
     n: int
     num_experts: int
+    per_act_token_quant: bool
     block_size: list[int]
     # configs for testing low-latency kernels
     low_latency: bool
@@ -99,18 +98,15 @@ class TestTensors:
     def make(config: TestConfig, rank) -> "TestTensors":

         dtype = torch.bfloat16
-        topk, m, k, block_size = (config.topk, config.m, config.k,
-                                  config.block_size)
+        topk, m, k = (config.topk, config.m, config.k)

         fp8_info = torch.finfo(torch.float8_e4m3fn)
         fp8_max, fp8_min = fp8_info.max, fp8_info.min

         rank_tokens = torch.randn(
             (m, k), device=torch.cuda.current_device(), dtype=dtype) / 10.0
         rank_tokens = rank_tokens.clamp(min=fp8_min, max=fp8_max)
-
-        block_k = block_size[1]
-        _, rank_token_scales = per_token_group_quant_fp8(rank_tokens, block_k)
+        rank_token_scales = None

         topk_ids = torch.randint(
             low=0,
@@ -150,11 +146,12 @@ def make_ll_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo,
         q_dtype=q_dtype,
         block_shape=test_config.block_size)

-    fused_experts = BatchedDeepGemmExperts(max_num_tokens=max_tokens_per_rank,
-                                           world_size=pgi.world_size,
-                                           dp_size=dp_size,
-                                           block_shape=test_config.block_size,
-                                           per_act_token_quant=False)
+    fused_experts = BatchedDeepGemmExperts(
+        max_num_tokens=max_tokens_per_rank,
+        world_size=pgi.world_size,
+        dp_size=dp_size,
+        block_shape=test_config.block_size,
+        per_act_token_quant=test_config.per_act_token_quant)
     mk = FusedMoEModularKernel(prepare_finalize=a2a,
                                fused_experts=fused_experts)
     return mk
@@ -393,6 +390,7 @@ def test_ht_deepep_deepgemm_moe(mnk: tuple[int, int, int], num_experts: int,
                         k=k,
                         n=n,
                         num_experts=num_experts,
+                        per_act_token_quant=False,
                         block_size=block_size,
                         low_latency=False,
                         use_fp8_dispatch=None)
@@ -450,6 +448,7 @@ def test_ll_deepep_deepgemm_moe(
                         k=k,
                         n=n,
                         num_experts=num_experts,
+                        per_act_token_quant=False,
                         block_size=block_size,
                         low_latency=True,
                         use_fp8_dispatch=use_fp8_dispatch,
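
For reference, a minimal sketch of how a TestConfig could be built after this change, mirroring the high-throughput test above. The dataclass stub reproduces only the fields visible in this diff, and the concrete values (topk, m, k, n, num_experts, block_size) are illustrative assumptions, not values taken from the test suite.

from dataclasses import dataclass
from typing import Optional

@dataclass
class TestConfig:
    topk: int
    m: int
    k: int
    n: int
    num_experts: int
    per_act_token_quant: bool
    block_size: list[int]
    # configs for testing low-latency kernels
    low_latency: bool
    use_fp8_dispatch: Optional[bool]

# Block-quantized (per_act_token_quant=False) config for the high-throughput path.
config = TestConfig(topk=6,
                    m=128,
                    k=7168,
                    n=2048,
                    num_experts=32,
                    per_act_token_quant=False,
                    block_size=[128, 128],
                    low_latency=False,
                    use_fp8_dispatch=None)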