Skip to content

Commit 946788f

Browse files
committed
fix test
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent e552cd0 commit 946788f

File tree

1 file changed: +1 addition, −17 deletions

vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py

Lines changed: 1 addition & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -136,23 +136,7 @@ def prepare(
                 "apply_router_weight_on_input is only implemented for topk=1")
             a1 = a1 * topk_weights.to(a1.dtype)

-        # Check if there is a block_shape / or if we can infer the quantization
-        # schemes from the scales.
-        per_token_quant = None
-        if all([
-                x is None
-                for x in [quant_config.block_shape, a1_scale, a2_scale]
-        ]) and quant_config.quant_dtype is not None:
-            # Quantization required despite none of the inputs suggesting
-            # quantization. Fallback to per_token_dynamic quant.
-            per_token_quant = True
-        else:
-            per_token_quant = ((quant_config.block_shape is not None) or
-                               (a1_scale is not None and a1_scale.numel() != 1)
-                               or (a2_scale is not None
-                                   and a2_scale.numel() != 1))
-
-        if per_token_quant:
+        if quant_config.per_act_token_quant:
             a1q, a1q_scale = moe_kernel_quantize_input(
                 a1,
                 a1_scale,

0 commit comments

Comments
 (0)