Commit 337320f

review comments, reduce test combinations, cleanup test code, etc.
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 1c89788 commit 337320f

File tree: 8 files changed (+134 additions, -41 deletions)

tests/kernels/moe/test_batched_moe.py

Lines changed: 24 additions & 3 deletions
@@ -19,6 +19,29 @@
 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
 from vllm.platforms import current_platform

+MNK_FACTORS = [
+    (1, 128, 128),
+    (1, 128, 2048),
+    (1, 512, 512),
+    (1, 1024, 128),
+    (1, 1024, 2048),
+    (32, 128, 128),
+    (32, 512, 512),
+    (32, 1024, 2048),
+    (45, 128, 128),
+    (45, 128, 2048),
+    (45, 512, 512),
+    (45, 1024, 128),
+    (45, 1024, 2048),
+    (64, 128, 128),
+    (64, 512, 512),
+    (64, 1024, 2048),
+    (222, 128, 128),
+    (222, 128, 2048),
+    (222, 512, 512),
+    (222, 1024, 128),
+    (222, 1024, 2048),
+]
 NUM_EXPERTS = [8, 64]
 TOP_KS = [1, 2, 6]

@@ -182,9 +205,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
     torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)


-@pytest.mark.parametrize("m", [1, 32, 45, 64, 222])
-@pytest.mark.parametrize("n", [128, 512, 1024, 2048])
-@pytest.mark.parametrize("k", [128, 512, 1024, 2048])
+@pytest.mark.parametrize(("m", "n", "k"), MNK_FACTORS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("dtype", [torch.bfloat16])

tests/kernels/moe/test_block_fp8.py

Lines changed: 66 additions & 11 deletions
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import itertools
-
 import pytest
 import torch

@@ -37,10 +35,62 @@
 DTYPES = [torch.bfloat16]  # [torch.half, torch.bfloat16, torch.float32]
 # Deepseek-V3's intermediate size 18432, so N is 18432*2/8=4608 at TP8
 # and its hidden size is 7168.
-M = [1, 83, 128, 2048, 8192]
-M_dg = [128, 192, 1335, 2048]
-N = [128, 256, 1024, 4608]
-K = [256, 512, 7168]
+MNK_FACTORS = [
+    (1, 128, 128),
+    (1, 512, 512),
+    (1, 128, 7168),
+    (1, 1024, 7168),
+    (1, 4608, 128),
+    (1, 4608, 512),
+    (1, 4608, 7168),
+    (83, 128, 128),
+    (83, 512, 512),
+    (83, 1024, 7168),
+    (83, 4608, 512),
+    (83, 4608, 7168),
+    (128, 128, 128),
+    (128, 512, 512),
+    (128, 1024, 7168),
+    (128, 4608, 512),
+    (128, 4608, 7168),
+    (2048, 128, 128),
+    (2048, 1024, 7168),
+    (2048, 4608, 512),
+    (2048, 4608, 7168),
+    (8192, 128, 128),
+    (8192, 512, 512),
+    (8192, 128, 7168),
+    (8192, 1024, 7168),
+    (8192, 4608, 512),
+    (8192, 4608, 7168),
+]
+
+MNK_FACTORS_DG = [
+    (128, 128, 128),
+    (128, 512, 512),
+    (128, 128, 7168),
+    (128, 1024, 7168),
+    (128, 4608, 128),
+    (128, 4608, 512),
+    (128, 4608, 7168),
+    (192, 128, 128),
+    (192, 512, 512),
+    (192, 1024, 7168),
+    (192, 4608, 512),
+    (192, 4608, 7168),
+    (1335, 128, 128),
+    (1335, 1024, 7168),
+    (1335, 4608, 512),
+    (1335, 4608, 7168),
+    (2048, 128, 128),
+    (2048, 512, 512),
+    (2048, 128, 7168),
+    (2048, 1024, 7168),
+    (2048, 4608, 128),
+    (2048, 4608, 512),
+    (2048, 4608, 7168),
+]
+
 BLOCK_SIZE = [[128, 128]]
 E = [2, 8, 16]  # [128, 256]
 TOP_KS = [1, 2, 6]

@@ -92,9 +142,12 @@ def setup_cuda():
     torch.set_default_device("cuda")


-@pytest.mark.parametrize(
-    "M,N,K,E,topk,block_size,dtype,seed",
-    itertools.product(M, N, K, E, TOP_KS, BLOCK_SIZE, DTYPES, SEEDS))
+@pytest.mark.parametrize(("M", "N", "K"), MNK_FACTORS)
+@pytest.mark.parametrize("E", E)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("block_size", BLOCK_SIZE)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
 @torch.inference_mode()
 def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed,
                                   monkeypatch):

@@ -166,8 +219,10 @@ def test_w8a8_block_fp8_fused_moe(M, N, K, E, topk, block_size, dtype, seed,
     torch.testing.assert_close(m_out, ref_out, atol=tol, rtol=tol)


-@pytest.mark.parametrize("M,N,K,E,topk,seed",
-                         itertools.product(M_dg, N, K, E, TOP_KS, SEEDS))
+@pytest.mark.parametrize(("M", "N", "K"), MNK_FACTORS_DG)
+@pytest.mark.parametrize("E", E)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.skipif(not dg_available, reason="DeepGemm kernels not available.")
 @torch.inference_mode()
 def test_w8a8_block_fp8_deep_gemm_fused_moe(M, N, K, E, topk, seed,
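For scale: the old itertools.product parametrization swept M (5 values) × N (4) × K (3) = 60 shapes for the fused-MoE test and M_dg (4) × N (4) × K (3) = 48 for the DeepGemm variant, whereas the curated lists cover 27 and 23 hand-picked shapes respectively, before the remaining E/topk/block_size/dtype/seed axes are crossed in by the stacked decorators. Running pytest --collect-only -q on the file is a quick way to confirm the generated case count.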

tests/kernels/moe/test_block_int8.py

Lines changed: 38 additions & 8 deletions
@@ -1,8 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import itertools
-
 import pytest
 import torch

@@ -23,9 +21,38 @@
 vllm_config.scheduler_config.max_model_len = 8192

 DTYPES = [torch.half, torch.bfloat16]
-M = [1, 33, 64, 222]
-N = [128, 1024]
-K = [256, 4096]
+
+MNK_FACTORS = [
+    (1, 128, 128),
+    (1, 512, 512),
+    (1, 128, 7168),
+    (1, 1024, 7168),
+    (1, 4096, 128),
+    (1, 4096, 512),
+    (1, 4096, 7168),
+    (33, 128, 128),
+    (33, 512, 512),
+    (33, 128, 7168),
+    (33, 1024, 7168),
+    (33, 4096, 128),
+    (33, 4096, 512),
+    (33, 4096, 7168),
+    (128, 128, 128),
+    (128, 512, 512),
+    (128, 1024, 7168),
+    (128, 4096, 512),
+    (128, 4096, 7168),
+    (222, 128, 128),
+    (222, 512, 512),
+    (222, 1024, 7168),
+    (222, 4096, 512),
+    (222, 4096, 7168),
+    (2048, 128, 128),
+    (2048, 1024, 7168),
+    (2048, 4096, 512),
+    (2048, 4096, 7168),
+]
+
 E = [8, 24]
 TOP_KS = [2, 6]
 # BLOCK_SIZE = [[64, 64], [64, 128], [128, 64], [128, 128]]

@@ -76,9 +103,12 @@ def setup_cuda():
     torch.set_default_device("cuda")


-@pytest.mark.parametrize(
-    "M, N, K, E, topk, block_size, dtype, seed",
-    itertools.product(M, N, K, E, TOP_KS, BLOCK_SIZE, DTYPES, SEEDS))
+@pytest.mark.parametrize(("M", "N", "K"), MNK_FACTORS)
+@pytest.mark.parametrize("E", E)
+@pytest.mark.parametrize("topk", TOP_KS)
+@pytest.mark.parametrize("block_size", BLOCK_SIZE)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
 @torch.inference_mode()
 def test_w8a8_block_int8_fused_moe(M, N, K, E, topk, block_size, dtype, seed):
     """Tests the fused_moe kernel with W8A8 INT8 block quantization against a

tests/kernels/moe/test_cutlass_moe.py

Lines changed: 2 additions & 12 deletions
@@ -97,18 +97,8 @@ def make_moe_tensors_8bit(m: int, k: int, n: int, e: int,
     n_b_scales = 2 * n if per_out_channel else 1
     k_b_scales = k if per_out_channel else 1
     # Get the right scale for tests.
-    if False:
-        _, a_scale = ops.scaled_fp8_quant(
-            moe_tensors_fp16.a, use_per_token_if_dynamic=per_act_token)
-        a_q, _ = ops.scaled_fp8_quant(
-            moe_tensors_fp16.a,
-            a_scale,
-            use_per_token_if_dynamic=per_act_token)
-    else:
-        a_q, a_scale = ops.scaled_fp8_quant(
-            moe_tensors_fp16.a,
-            None,
-            use_per_token_if_dynamic=per_act_token)
+    a_q, a_scale = ops.scaled_fp8_quant(
+        moe_tensors_fp16.a, None, use_per_token_if_dynamic=per_act_token)

     w1_q = torch.empty((e, 2 * n, k), device="cuda", dtype=q_dtype)
     w2_q = torch.empty((e, k, n), device="cuda", dtype=q_dtype)
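The deleted if False: arm was dead code that first derived a scale dynamically and then re-quantized with it as a static scale; the surviving call passes scale=None, so scaled_fp8_quant computes the scale itself (per token when use_per_token_if_dynamic is set). A rough usage sketch, assuming a CUDA device and vLLM installed (values illustrative):

    import torch
    from vllm import _custom_ops as ops

    a = torch.randn(64, 7168, device="cuda", dtype=torch.half)

    # scale=None -> dynamic quantization: the kernel derives the scale,
    # per token (row) here because use_per_token_if_dynamic=True.
    a_q, a_scale = ops.scaled_fp8_quant(a, None, use_per_token_if_dynamic=True)

    # a_q is an fp8 tensor; a_scale holds one scale per token row.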

tests/kernels/quant_utils.py

Lines changed: 2 additions & 3 deletions
@@ -241,7 +241,6 @@ def per_block_cast_to_fp8(
     return x_scaled_sub, scales


-# TODO: fix this
 def per_block_cast_to_int8(
     x: torch.Tensor,
     block_shape: list[int] = DEFAULT_BLOCK_SHAPE,

@@ -255,9 +254,9 @@ def per_block_cast_to_int8(
     x_padded[:m, :n] = x
     x_view = x_padded.view(-1, block_m, x_padded.size(1) // block_n, block_n)
     x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
-    x_scaled = (x_view * (448.0 / x_amax)).to(torch.int8)
+    x_scaled = (x_view * (256.0 / x_amax)).to(torch.int8)
     x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
-    scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
+    scales = (x_amax / 256.0).view(x_view.size(0), x_view.size(2))
     return x_scaled_sub, scales
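The 448.0 constant was carried over from the fp8 helper (448 is the float8_e4m3 max); the int8 variant now uses 256.0. Whichever constant S is chosen, the quant/dequant pair stays self-consistent, since values are scaled by S / amax before the cast and the returned scales are amax / S. As a worked identity:

    q = x * (S / amax),  scales = amax / S,  so  q * scales ≈ x  (up to rounding and clipping at the integer range)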

vllm/_custom_ops.py

Lines changed: 1 addition & 2 deletions
@@ -1275,8 +1275,7 @@ def scaled_fp8_quant(
         torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
     else:
         # num_token_padding not implemented for this case
-        assert (scale.numel() == 1 and num_token_padding
-                is None), f"{scale.shape} {num_token_padding}"
+        assert scale.numel() == 1, f"{scale.shape}"
         torch.ops._C.static_scaled_fp8_quant(output, input, scale)

     return output, scale
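With the relaxed assert, the static branch only requires that the caller-supplied scale be a single element. A minimal sketch of the static-scale call (assumes CUDA and vLLM; values illustrative):

    import torch
    from vllm import _custom_ops as ops

    x = torch.randn(16, 4096, device="cuda", dtype=torch.half)
    scale = torch.tensor([0.02], device="cuda", dtype=torch.float32)  # numel() == 1

    # Passing an explicit scale routes to static_scaled_fp8_quant.
    x_q, x_scale = ops.scaled_fp8_quant(x, scale)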

vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):

     # DeepEP low-latency kernels are compiled only for certain
     # specific hidden sizes.
-    SUPPORTED_HIDDEN_SIZES = [2560, 4096, 5120, 7168]
+    SUPPORTED_HIDDEN_SIZES = [2048, 2560, 4096, 5120, 7168]

     def __init__(self,
                  buffer: deep_ep.Buffer,
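Adding 2048 lets models with that hidden size take the DeepEP low-latency path. A hypothetical guard of the kind a caller might use (illustrative only, not code from this commit; assumes the deep_ep package is installed so the module imports):

    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (
        DeepEPLLPrepareAndFinalize)

    hidden_size = 2048
    if hidden_size not in DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES:
        raise ValueError(
            f"DeepEP low-latency kernels do not support hidden_size={hidden_size}; "
            f"supported: {DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES}")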

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 0 additions & 1 deletion
@@ -1038,7 +1038,6 @@ def inplace_fused_experts_fake(
     pass


-# TODO: get rid of these? replace with modular op?
 direct_register_custom_op(
     op_name="inplace_fused_experts",
     op_func=inplace_fused_experts,
