fix test

bnellnm · bnellnm · commit 946788f64b09 · 2025-06-27T02:18:17.000Z
Signed-off-by: Bill Nell &lt;bnell@redhat.com&gt;
diff --git a/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py
@@ -136,23 +136,7 @@ def prepare(
                 "apply_router_weight_on_input is only implemented for topk=1")
             a1 = a1 * topk_weights.to(a1.dtype)
 
-        # Check if there is a block_shape / or if we can infer the quantization
-        # schemes from the scales.
-        per_token_quant = None
-        if all([
-                x is None
-                for x in [quant_config.block_shape, a1_scale, a2_scale]
-        ]) and quant_config.quant_dtype is not None:
-            # Quantization required despite none of the inputs suggesting
-            # quantization. Fallback to per_token_dynamic quant.
-            per_token_quant = True
-        else:
-            per_token_quant = ((quant_config.block_shape is not None) or
-                               (a1_scale is not None and a1_scale.numel() != 1)
-                               or (a2_scale is not None
-                                   and a2_scale.numel() != 1))
-
-        if per_token_quant:
+        if quant_config.per_act_token_quant:
             a1q, a1q_scale = moe_kernel_quantize_input(
                 a1,
                 a1_scale,