from tests.kernels.moe.utils import (batched_moe,
                                     make_quantized_test_activations,
                                     make_test_weights, triton_moe)
-from tests.kernels.quant_utils import native_w8a8_block_matmul
+from tests.kernels.quant_utils import native_batched_masked_quant_matmul
from tests.kernels.utils import torch_experts
from vllm.config import VllmConfig, set_current_vllm_config
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
@@ -68,43 +68,6 @@ def make_tensors(config: BatchedMMConfig):
    return BatchedMMTensors(A, B, C, num_expert_tokens)


-def ref_impl(
-    A: torch.Tensor,
-    B: torch.Tensor,
-    C: torch.Tensor,
-    num_expert_tokens: torch.Tensor,
-    A_scale: Optional[torch.Tensor],
-    B_scale: Optional[torch.Tensor],
-    block_shape: Optional[list[int]],
-) -> torch.Tensor:
-    assert (A.dtype.itemsize > 1
-            or (A_scale is not None and B_scale is not None))
-
-    num_expert_tokens_cpu = num_expert_tokens.clone()
-    num_expert_tokens_cpu = num_expert_tokens_cpu.to(device="cpu")
-    num_experts = num_expert_tokens.size(0)
-
-    f32 = torch.float32
-    bf16 = torch.bfloat16
-
-    for e in range(num_experts):
-        num_tokens = num_expert_tokens_cpu[e]
-        if A.dtype.itemsize == 1 and block_shape is not None:
-            tmp = native_w8a8_block_matmul(A[e], B[e], A_scale[e], B_scale[e],
-                                           block_shape, C.dtype)
-            C[e, :num_tokens, :] = tmp[:num_tokens, :]
-        elif A.dtype.itemsize == 1 and block_shape is None:
-            C[e, :num_tokens, :] = (
-                (A[e, :num_tokens, :].to(f32) * A_scale[e]).to(bf16)
-                @ (B[e].transpose(0, 1).to(f32) * B_scale[e]).to(bf16))
-        else:
-            assert A_scale is None
-            assert B_scale is None
-            C[e, :num_tokens, :] = A[e, :num_tokens, :] @ B[e].transpose(0, 1)
-
-    return C
-
-
@pytest.mark.parametrize("num_experts", [8, 16, 32])
@pytest.mark.parametrize("max_tokens_per_expert",
                         [32, 64, 128, 192, 224, 256, 512])
@@ -193,7 +156,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
        block_shape=block_shape,
    )

-    ref_output = ref_impl(
+    ref_output = native_batched_masked_quant_matmul(
        A,
        B,
        ref_output,
@@ -203,8 +166,10 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
        None,
    )

-    q_ref_output = ref_impl(A_q, B_q, q_ref_output, num_expert_tokens, A_scale,
-                            B_scale, block_shape)
+    q_ref_output = native_batched_masked_quant_matmul(A_q, B_q, q_ref_output,
+                                                      num_expert_tokens,
+                                                      A_scale, B_scale,
+                                                      block_shape)

    rtol, atol = {
        torch.float16: (6e-2, 6e-2),
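
Note: this change drops the test-local ref_impl in favor of native_batched_masked_quant_matmul imported from tests.kernels.quant_utils, which presumably carries the same batched masked-matmul semantics (only the first num_expert_tokens[e] rows of each expert are computed; rows past that count are left untouched). A minimal usage sketch of the unquantized path, assuming the shared helper keeps ref_impl's positional signature (A, B, C, num_expert_tokens, A_scale, B_scale, block_shape) and that the vLLM test tree is importable; shapes are illustrative:

    import torch

    from tests.kernels.quant_utils import native_batched_masked_quant_matmul

    # Illustrative shapes: E experts, up to max_tokens rows each,
    # A is [E, max_tokens, K], B is [E, N, K], C is the [E, max_tokens, N]
    # output buffer.
    E, max_tokens, K, N = 4, 16, 32, 64
    A = torch.randn(E, max_tokens, K, dtype=torch.bfloat16)
    B = torch.randn(E, N, K, dtype=torch.bfloat16)
    C = torch.zeros(E, max_tokens, N, dtype=torch.bfloat16)
    # Per-expert count of valid rows; rows past each count stay masked out.
    num_expert_tokens = torch.randint(1, max_tokens + 1, (E,))

    # Unquantized path: scales and block_shape are None, matching the
    # ref_output call in the diff above.
    out = native_batched_masked_quant_matmul(A, B, C, num_expert_tokens,
                                             None, None, None)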