
Commit bb5a4ed

fp8 baselines working
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent: 5c4bf25

File tree: 3 files changed (+19, -10 lines)

tests/kernels/moe/test_pplx_moe.py

Lines changed: 10 additions & 7 deletions
@@ -170,9 +170,9 @@ def test_fused_moe_batched_experts(
 
     with set_current_vllm_config(vllm_config):
         topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-        baseline_output = torch_experts(a, w1, w2, topk_weight, topk_ids)
+        baseline_output = torch_experts(a, w1, w2, topk_weight, topk_ids)  # only for baseline
         torch_output = torch_batched_moe(a, w1, w2, topk_weight, topk_ids)
-        batched_output = naive_batched_moe(a, w1, w2, topk_weight, topk_ids)
+        batched_output = naive_batched_moe(a, w1, w2, topk_weight, topk_ids)  # pick torch_experts or this
 
     torch.testing.assert_close(baseline_output,
                                torch_output,
@@ -666,11 +666,14 @@ def test_pplx_moe(
     a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
     score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
 
-    _, w1, w1_s, _, w2, w2_s = make_test_weights(e,
-                                                 n,
-                                                 k,
-                                                 quant_dtype=quant_dtype,
-                                                 block_shape=block_shape)
+    _, w1, w1_s, _, w2, w2_s = make_test_weights(
+        e,
+        n,
+        k,
+        quant_dtype=quant_dtype,
+        block_shape=block_shape,
+        per_act_token_quant=per_act_token_quant,
+    )
 
     parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk,
                     w1_s, w2_s, quant_dtype, per_act_token_quant, block_shape,
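
Note: the new per_act_token_quant argument selects per-token rather than per-tensor activation scaling for the fp8 path. A minimal sketch of the two scale layouts the flag distinguishes, in plain PyTorch (illustrative only; fp8_scales is a hypothetical helper, not part of make_test_weights):

import torch

def fp8_scales(a: torch.Tensor, per_act_token_quant: bool) -> torch.Tensor:
    # torch.float8_e4m3fn can represent magnitudes up to 448.
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    if per_act_token_quant:
        # one scale per token (row): shape (num_tokens, 1)
        amax = a.abs().amax(dim=-1, keepdim=True).float()
    else:
        # one scale for the whole tensor: shape (1, 1)
        amax = a.abs().amax().float().view(1, 1)
    return amax.clamp(min=1e-12) / fp8_max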

vllm/model_executor/layers/fused_moe/config.py

Lines changed: 2 additions & 0 deletions
@@ -14,6 +14,8 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.utils import cdiv
+
 
 logger = init_logger(__name__)
 
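For reference, cdiv is vllm's ceiling-division helper; it is presumably imported here so the MoE config code can turn tensor dimensions into scale-block counts when block_shape is set. A one-line sketch of the semantics (assumed definition, shown only to make the block-count arithmetic concrete):

def cdiv(a: int, b: int) -> int:
    # ceiling division: smallest integer >= a / b
    return -(a // -b)

# e.g. with block_shape == [128, 128], a hidden size of 7168 needs cdiv(7168, 128) == 56 scale blocks.
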
vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 7 additions & 3 deletions
@@ -13,6 +13,7 @@
     get_config_dtype_str, try_get_optimal_moe_config)
 from vllm.model_executor.layers.fused_moe.utils import (
     _resize_cache, moe_kernel_quantize_input)
+from vllm.model_executor.layers.quantization.utils.quant_utils import group_broadcast
 
 
 @triton.jit
@@ -555,13 +556,17 @@ def prepare(
                     rhs_a1_scale = a1_scale[:topks.numel()][topks]
                 else:
                     rhs_a1_scale = None
-                b_a1[idx, :rows, :], b_a1_scale[idx] = (moe_kernel_quantize_input(
+                b_a1[idx, :rows, :], b_s = (moe_kernel_quantize_input(
                     rhs,
                     rhs_a1_scale,
                     quant_config.quant_dtype,
                     quant_config.per_act_token_quant,
                     quant_config.block_shape,
                 ))
+                if quant_config.is_per_tensor:
+                    b_a1_scale[idx] = b_s
+                else:
+                    b_a1_scale[idx, :rows] = b_s[:rows]
             else:
                 b_a1[idx, :rows, :] = rhs
 
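The new branch above fixes how the scale returned by moe_kernel_quantize_input is written back: per-tensor quantization yields a single scale per expert, while per-token or block quantization yields one scale entry per row, so only the first rows entries of the batched scale buffer are valid. A small sketch of the shape difference (shapes are illustrative; the real buffers are allocated earlier in prepare):

import torch

num_experts, max_tokens = 4, 8
rows, idx = 5, 2  # tokens actually routed to expert idx

# per-tensor: one scale value per expert
per_tensor_scale = torch.empty((num_experts, 1, 1), dtype=torch.float32)
# per-token: one scale slot per (expert, token) pair
per_token_scale = torch.empty((num_experts, max_tokens, 1), dtype=torch.float32)

b_s = torch.rand((rows, 1))                 # what a per-token quantize might return
per_tensor_scale[idx] = torch.rand(())      # single value, broadcast over the expert
per_token_scale[idx, :rows] = b_s[:rows]    # only the occupied rows carry real scales
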
@@ -670,8 +675,7 @@ def dequant(self, t: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
         if self.quant_config.is_per_act_token or self.quant_config.is_per_tensor:
             return t.to(f32) * scale
         else:
-            t32 = t.to(f32).view(-1, self.quant_config.block_shape[1])
-            return (t32 * scale.view(-1, 1)).view(t.shape)
+            return t.to(f32) * group_broadcast(scale, t.shape)
 
     def apply(
         self,
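
group_broadcast expands a block-wise scale tensor to the shape of the quantized tensor, so the dequant multiply now handles 2-D block shapes instead of only blocking along the last dimension. A hedged sketch of equivalent behavior for a 2-D input (illustrative only; the real helper in quant_utils may cover more layouts):

import torch

def group_broadcast_2d(scale: torch.Tensor, shape: torch.Size) -> torch.Tensor:
    # Repeat each scale entry over its quantization block, then trim to `shape`.
    rows, cols = shape[-2], shape[-1]
    block_m = -(-rows // scale.shape[-2])   # rows covered by one scale entry
    block_n = -(-cols // scale.shape[-1])   # cols covered by one scale entry
    out = scale.repeat_interleave(block_m, dim=-2)
    out = out.repeat_interleave(block_n, dim=-1)
    return out[..., :rows, :cols]

t = torch.randn(10, 256)
scale = torch.rand(1, 2)                             # one scale per (128, 128) block of t
dequant = t * group_broadcast_2d(scale, t.shape)     # same shape as t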
