
Commit c88f65d (parent: 87c0689)

fix test_deepep_moe.py

Signed-off-by: Bill Nell <bnell@redhat.com>

File tree (2 files changed: +1 -5 lines)

  tests/kernels/moe/test_deepep_moe.py
  vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py

tests/kernels/moe/test_deepep_moe.py (0 additions, 4 deletions)

@@ -104,10 +104,6 @@ def make(config: TestConfig, low_latency_mode: bool) -> "TestTensors":
         rank_tokens = torch.randn(
             (config.m, config.k), device="cuda", dtype=token_dtype) / 10
         rank_token_scales = None
-        if config.dtype == torch.float8_e4m3fn:
-            # low_latency_mode kernels dont support per-token quant.
-            _, rank_token_scales = ops.scaled_fp8_quant(
-                rank_tokens, use_per_token_if_dynamic=not low_latency_mode)

         topk = torch.randint(low=0,
                              high=config.num_experts,
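
For context, the deleted test code pre-quantized the rank tokens with ops.scaled_fp8_quant so that rank_token_scales carried dynamic fp8 scales (per-token, or per-tensor when low_latency_mode was set) into the kernel; after this commit the scales stay None and quantization is left to the prepare step. Below is a minimal sketch in plain PyTorch of the two dynamic-scale flavors involved; the helper name quant_fp8 is illustrative and is not vLLM's ops.scaled_fp8_quant.

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn


def quant_fp8(x: torch.Tensor, per_token: bool):
    """Dynamically quantize x to fp8, returning (quantized, scale)."""
    if per_token:
        # One scale per row/token: scale has shape (m, 1).
        amax = x.abs().amax(dim=-1, keepdim=True)
    else:
        # A single scale for the whole tensor: scale has shape (1, 1).
        amax = x.abs().amax().reshape(1, 1)
    scale = (amax.float() / FP8_MAX).clamp(min=1e-12)
    x_q = (x.float() / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return x_q, scale


x = torch.randn(4, 8) / 10
_, s_tensor = quant_fp8(x, per_token=False)  # s_tensor.shape == (1, 1)
_, s_token = quant_fp8(x, per_token=True)    # s_token.shape == (4, 1)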

vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py (1 addition, 1 deletion)

@@ -157,7 +157,7 @@ def prepare(
             a1,
             a1_scale,
             quant_dtype=quant_config.quant_dtype,
-            per_act_token_quant=False,
+            per_act_token_quant=True,
             block_shape=quant_config.block_shape,
         )
         (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids,
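
The functional change: the quantization call in the high-throughput DeepEP prepare path now requests per-token activation scales (per_act_token_quant=True) instead of a single per-tensor scale, presumably because the test no longer pre-computes those scales itself. Note the deleted test comment only ruled out per-token quant for the low_latency_mode kernels, not this high-throughput path. Continuing the quant_fp8 sketch above, a hedged illustration of why per-token scales can matter when token magnitudes vary:

# With one outlier token, a per-tensor scale inflates the quantization
# step for every row, while per-token scales keep each row's error
# proportional to that row's own magnitude.
x = torch.randn(4, 8)
x[0] *= 100.0  # outlier token dominates the per-tensor amax

for per_token in (False, True):
    x_q, scale = quant_fp8(x, per_token)
    x_dq = x_q.float() * scale          # dequantize; scale broadcasts
    row_err = (x - x_dq).abs().amax(dim=-1)
    print(f"per_token={per_token}: per-row max error = {row_err.tolist()}")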
