 from vllm.platforms import current_platform
 from vllm.utils import cdiv
+from vllm.model_executor.layers.quantization.utils.quant_utils import group_broadcast

 # Using the default value (240.0) from pytorch will cause accuracy
 # issue on dynamic quantization models. Here use 224.0 for rocm.
@@ -235,14 +236,23 @@ def per_block_cast_to_fp8(
     return x_scaled_sub, scales


+def _dequant(t: torch.Tensor, scale: torch.Tensor, block_shape, per_act_token_quant) -> torch.Tensor:
+    f32 = torch.float32
+    if per_act_token_quant or block_shape is None:
+        return t.to(f32) * scale
+    else:
+        return t.to(f32) * group_broadcast(scale, t.shape)
+
+
 def native_batched_masked_quant_matmul(
     A: torch.Tensor,
     B: torch.Tensor,
     C: torch.Tensor,
     num_expert_tokens: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
-    B_scale: Optional[torch.Tensor],
-    block_shape: Optional[list[int]],
+    A_scale: Optional[torch.Tensor] = None,
+    B_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[list[int]] = None,
+    per_act_token_quant: bool = False,
 ) -> torch.Tensor:
     num_expert_tokens_cpu = num_expert_tokens.clone()
     num_expert_tokens_cpu = num_expert_tokens_cpu.to(device="cpu")
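
For reviewers unfamiliar with `group_broadcast`: the sketch below shows the behavior `_dequant` relies on, under the assumption that the helper expands a block-wise scale tensor to the full tensor shape by repeating each scale across its quantization block. The name `group_broadcast_ref` and the `repeat_interleave` strategy are illustrative, not the actual implementation in `quant_utils`.

```python
# Illustrative sketch (not the actual vLLM implementation): expand a
# block-wise scale tensor so it matches the shape of the tensor being
# dequantized, repeating each per-block scale across its block.
import torch


def group_broadcast_ref(scale: torch.Tensor, shape: torch.Size) -> torch.Tensor:
    assert scale.ndim == len(shape)
    for dim, (have, want) in enumerate(zip(scale.shape, shape)):
        if have != want:
            assert want % have == 0, "tensor dims must be multiples of block dims"
            # Repeat each scale entry contiguously, e.g. [s0, s1] -> [s0, s0, s1, s1]
            scale = scale.repeat_interleave(want // have, dim=dim)
    return scale


# e.g. one scale per [1, 64] block of a [4, 128] tensor:
scales = torch.rand(4, 2)
expanded = group_broadcast_ref(scales, torch.Size([4, 128]))
assert expanded.shape == (4, 128)
```
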
@@ -259,9 +269,9 @@ def native_batched_masked_quant_matmul(
         C[e, :num_tokens, :] = tmp[:num_tokens, :]
     elif A.dtype.itemsize == 1 and block_shape is None:
         assert A_scale is not None and B_scale is not None
-        C[e, :num_tokens, :] = (
-            (A[e, :num_tokens, :].to(f32) * A_scale[e]).to(C.dtype)
-            @ (B[e].transpose(0, 1).to(f32) * B_scale[e]).to(C.dtype))
+        A_dq = _dequant(A[e], A_scale[e], block_shape, per_act_token_quant)
+        B_dq = _dequant(B[e], B_scale[e], block_shape, per_act_token_quant)
+        C[e, :num_tokens, :] = (A_dq[:num_tokens] @ B_dq.transpose(0, 1)).to(C.dtype)
     else:
         assert A_scale is None
         assert B_scale is None
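
With the scale and block-shape arguments now defaulted, call sites on the unquantized path no longer need placeholder `None` arguments. A minimal usage sketch, assuming `native_batched_masked_quant_matmul` is imported from the test utilities; the shapes are inferred from the diff (`B[e]` is transposed before the matmul, so `B` is laid out as `[E, N, K]`) and the values are illustrative:

```python
import torch

E, T, K, N = 4, 16, 128, 256
A = torch.randn(E, T, K, dtype=torch.bfloat16)   # [experts, max_tokens, K]
B = torch.randn(E, N, K, dtype=torch.bfloat16)   # transposed per expert inside
C = torch.empty(E, T, N, dtype=torch.bfloat16)   # written in place
num_expert_tokens = torch.randint(1, T + 1, (E,))

# Unquantized path: the scale arguments can simply be omitted now.
out = native_batched_masked_quant_matmul(A, B, C, num_expert_tokens)
```
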