
Commit 604ab02

refactoring
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent db773b0 commit 604ab02

18 files changed: +674 −566 lines

tests/kernels/moe/test_batched_moe.py

Lines changed: 1 addition & 82 deletions
@@ -27,6 +27,7 @@
     torch_moe2,
     triton_moe,
     batched_moe,
+    make_test_weights,
 )
 
 NUM_EXPERTS = [8, 64]
@@ -302,27 +303,6 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
     torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)
 
 
-# Move to utils
-def per_block_cast_to_fp8(
-        x: torch.Tensor,
-        block_size_n: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
-    from vllm.utils import cdiv
-    assert x.dim() == 2
-    m, n = x.shape
-    x_padded = torch.zeros(
-        (cdiv(m, 128) * 128,
-         cdiv(n, block_size_n) * block_size_n),
-        dtype=x.dtype,
-        device=x.device)
-    x_padded[:m, :n] = x
-    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, block_size_n)
-    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
-    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
-    x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
-    scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
-    return x_scaled_sub, scales
-
-
 def _make_test_weights(
     e: int,
     n: int,
@@ -370,67 +350,6 @@ def _make_test_weights(
     return w1, w2, w1_s, w2_s, w1_bf16, w2_bf16
 
 
-def make_test_weights(e, n, k, block_shape, dtype):
-    use_fp8_w8a8 = dtype == torch.float8_e4m3fn
-    w_dtype = torch.bfloat16 if use_fp8_w8a8 else dtype
-
-    w1_16 = torch.randn((e, 2 * n, k), device="cuda", dtype=w_dtype) / 15
-    w2_16 = torch.randn((e, k, n), device="cuda", dtype=w_dtype) / 15
-
-    if use_fp8_w8a8:
-        w1_l = [None] * e
-        w2_l = [None] * e
-        w1_s = [None] * e
-        w2_s = [None] * e
-        for idx in range(e):
-            if block_shape is not None:
-                w1_l[idx], w1_s[idx] = per_block_cast_to_fp8(
-                    w1_16[idx],
-                    block_shape[1],
-                )
-                w2_l[idx], w2_s[idx] = per_block_cast_to_fp8(
-                    w2_16[idx],
-                    block_shape[1],
-                )
-            else:
-                tmp, w1_s[idx] = per_token_group_quant_fp8(
-                    w1_16[idx].view(1, -1),
-                    w1_16[idx].numel()
-                )
-                w1_l[idx] = tmp.view(*w1_16[idx].shape)
-
-                tmp, w2_s[idx] = per_token_group_quant_fp8(
-                    w2_16[idx].view(1, -1),
-                    w2_16[idx].numel()
-                )
-                w2_l[idx] = tmp.view(*w2_16[idx].shape)
-
-        w1 = torch.stack(w1_l)
-        w2 = torch.stack(w2_l)
-        w1_s = torch.stack(w1_s)
-        w2_s = torch.stack(w2_s)
-        if w1_s.ndim == 2:
-            assert w1_s.shape[-1] == 1
-            w1_s = w1_s.view(-1, 1, 1)
-            w2_s = w2_s.view(-1, 1, 1)
-
-        if block_shape is not None:
-            block_n, block_k = block_shape
-            n_tiles_w1 = ((2 * n) + block_n - 1) // block_n
-            k_tiles_w1 = (k + block_k - 1) // block_k
-            n_tiles_w2 = (k + block_n - 1) // block_n
-            k_tiles_w2 = (n + block_k - 1) // block_k
-            assert w1_s.shape == (e, n_tiles_w1, k_tiles_w1)
-            assert w2_s.shape == (e, n_tiles_w2, k_tiles_w2)
-    else:
-        w1 = w1_16
-        w2 = w2_16
-        w1_s = None
-        w2_s = None
-
-    return w1, w2, w1_s, w2_s, w1_16, w2_16
-
-
 @pytest.mark.parametrize("m", [32, 45, 64]) #[1, 33, 64, 222])
 @pytest.mark.parametrize("n", [128, 512, 1024, 2048])
 @pytest.mark.parametrize("k", [128, 512, 1024, 2048])
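
The two helpers deleted here now live in tests/kernels/moe/utils.py (see that file's diff below), so both MoE test files share one copy. A minimal usage sketch of the shared make_test_weights helper, not part of the commit, assuming a CUDA device and that it runs from the vLLM repo root so the tests package is importable:

import torch

from tests.kernels.moe.utils import make_test_weights

# Illustrative sizes; dimensions divisible by the 128x128 block shape.
e, n, k = 8, 512, 256
w1, w2, w1_s, w2_s, w1_16, w2_16 = make_test_weights(
    e, n, k, block_shape=[128, 128], dtype=torch.float8_e4m3fn)

assert w1.dtype == torch.float8_e4m3fn
assert w1.shape == (e, 2 * n, k) and w2.shape == (e, k, n)
assert w1_s.shape == (e, (2 * n) // 128, k // 128)  # one scale per weight block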

tests/kernels/moe/test_pplx_moe.py

Lines changed: 8 additions & 60 deletions
@@ -39,6 +39,7 @@
 from tests.kernels.moe.utils import (
     torch_moe2,
     naive_batched_moe,
+    make_test_weights,
 )
 
 
@@ -264,7 +265,7 @@ def pplx_prepare_finalize(
         chunk_topk_ids,
         num_experts,
         None,
-        False,
+        FusedMoEConfig(),
     )
 
     b_a = b_a * 1.5
@@ -583,7 +584,7 @@ def _pplx_moe(
     with set_current_vllm_config(vllm_config), override_config(moe_config):
         topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
         torch_output = torch_moe2(a, w1, w2, topk_weight, topk_ids, w1_s, w2_s,
-                                  use_fp8_w8a8, per_act_token_quant,
+                                  qtype, per_act_token_quant,
                                   block_shape)
         pplx_output = pplx_moe(group_name, pgi.rank, pgi.world_size, dp_size, a,
                                w1, w2, topk_weight, topk_ids, w1_s, w2_s, qtype,
@@ -624,69 +625,16 @@ def test_pplx_moe(
     current_platform.seed_everything(7)
     m, n, k = mnk
     world_size, dp_size = world_dp_size
-    a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
-    w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=torch.bfloat16) / 10
-    w2 = torch.randn((e, k, n), device="cuda", dtype=torch.bfloat16) / 10
-    score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
-
     use_fp8_w8a8 = dtype == torch.float8_e4m3fn
 
     if not use_fp8_w8a8 and per_act_token_quant and block_shape is not None:
         pytest.skip("Skip quantization test for non-quantized type")
 
-    # TODO (bnell): scale setup for different quant strategies?
-    if use_fp8_w8a8:
-        quant_type = torch.float8_e4m3fn
-
-        #finfo = torch.finfo(dtype)
-        #fp8_min = finfo.min
-        #fp8_max = finfo.max
-        #w1 = w1.clamp(min=fp8_min, max=fp8_max).to(dtype)
-        #w2 = w2.clamp(min=fp8_min, max=fp8_max).to(dtype)
-        # block_n, block_k = block_shape[0], block_shape[1]
-        # n_tiles_w1 = (2 * n + block_n - 1) // block_n
-        # n_tiles_w2 = (k + block_n - 1) // block_n
-        # k_tiles_w1 = (k + block_k - 1) // block_k
-        # k_tiles_w2 = (n + block_k - 1) // block_k
-        # factor_for_scale = 1e-2
-        # w1_s = torch.rand(
-        #     (e, n_tiles_w1, k_tiles_w1), dtype=torch.float32,
-        #     device="cuda") * factor_for_scale
-        # w2_s = torch.rand(
-        #     (e, n_tiles_w2, k_tiles_w2), dtype=torch.float32,
-        #     device="cuda") * factor_for_scale
-        w1_l = [None] * e
-        w2_l = [None] * e
-        w1_s = [None] * e
-        w2_s = [None] * e
-        for idx in range(e):
-            w1_l[idx], w1_s[idx] = moe_kernel_quantize_input(
-                w1[idx],
-                None,
-                quant_type,
-                per_act_token_quant,
-                block_shape
-            )
-            w2_l[idx], w2_s[idx] = moe_kernel_quantize_input(
-                w2[idx],
-                None,
-                quant_type,
-                per_act_token_quant,
-                block_shape
-            )
-        w1 = torch.stack(w1_l)
-        w2 = torch.stack(w2_l)
-        w1_s = torch.stack(w1_s)
-        w2_s = torch.stack(w2_s)
-        if w1_s.ndim == 2:
-            assert w1_s.shape[-1] == 1
-            w1_s = w1_s.view(-1, 1, 1)
-            w2_s = w2_s.view(-1, 1, 1)
-    else:
-        quant_type = None
-        w1_s = None
-        w2_s = None
+    a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
+    score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
+
+    w1, w2, w1_s, w2_s, w1_16, w2_16 = make_test_weights(e, n, k, block_shape, dtype)
 
     parallel_launch(world_size, _pplx_moe, dp_size, a, w1, w2, score, topk,
-                    w1_s, w2_s, quant_type, per_act_token_quant, block_shape,
+                    w1_s, w2_s, dtype, per_act_token_quant, block_shape,
                     use_internode)
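
With this change test_pplx_moe builds its weights through the same shared helper as test_batched_moe. A small sketch of the unquantized path that the single call site now also covers, not part of the commit, assuming a CUDA device and the vLLM repo root on the path: for dtype=torch.bfloat16 the weights come back unchanged and the scales are None.

import torch

from tests.kernels.moe.utils import make_test_weights

w1, w2, w1_s, w2_s, w1_16, w2_16 = make_test_weights(
    e=8, n=512, k=256, block_shape=None, dtype=torch.bfloat16)

assert w1_s is None and w2_s is None          # no quantization, no scales
assert w1.dtype == torch.bfloat16 and w1 is w1_16  # bf16 weights returned as-is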

tests/kernels/moe/utils.py

Lines changed: 103 additions & 5 deletions
@@ -11,6 +11,10 @@
     BatchedPrepareAndFinalize,
     BatchedTritonExperts,
     NaiveBatchedExperts)
+from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+    w8a8_block_fp8_matmul,
+    per_token_group_quant_fp8)
+
 from vllm.utils import round_up
 
 from tests.kernels.quant_utils import native_w8a8_block_matmul
@@ -112,6 +116,8 @@ def torch_moe2(
         block_shape
     )
 
+    print(f"XXX {quant_type} {block_shape} {a.shape} {a_scale}")
+
     out = torch.zeros(M * topk,
                       w2.shape[1],
                       dtype=torch.bfloat16,
@@ -129,8 +135,14 @@ def torch_moe2(
             tmp2 = SiluAndMul()(tmp1)
             out[mask] = tmp2 @ w2[i].transpose(0, 1)
         elif block_shape is not None:
-            tmp1 = native_w8a8_block_matmul(a[mask], w1[i], a_scale[mask],
-                                            w1_scale[i], block_shape, out.dtype)
+            tmp1 = native_w8a8_block_matmul(
+                a[mask],
+                w1[i],
+                a_scale[mask],
+                w1_scale[i],
+                block_shape,
+                out.dtype
+            )
 
             #print(f"TORCH INTER[{i}] {tmp1.shape}\n{tmp1}")
             #inters[i, :tmp1.shape[0]] = tmp1
@@ -144,9 +156,14 @@ def torch_moe2(
                per_act_token_quant,
                block_shape)
 
-            out[mask] = native_w8a8_block_matmul(tmp2, w2[i], b_scale,
-                                                 w2_scale[i], block_shape,
-                                                 out.dtype)
+            out[mask] = native_w8a8_block_matmul(
+                tmp2,
+                w2[i],
+                b_scale,
+                w2_scale[i],
+                block_shape,
+                out.dtype
+            )
         else:
             # XXXX need scales here
             compute_type = torch.bfloat16
@@ -237,3 +254,84 @@ def naive_batched_moe(
 
     return fused_experts(a, w1, w2, topk_weight, topk_ids, num_experts)
 
+
+# Move to utils
+def per_block_cast_to_fp8(
+        x: torch.Tensor,
+        block_size_n: int = 128) -> tuple[torch.Tensor, torch.Tensor]:
+    from vllm.utils import cdiv
+    assert x.dim() == 2
+    m, n = x.shape
+    x_padded = torch.zeros(
+        (cdiv(m, 128) * 128,
+         cdiv(n, block_size_n) * block_size_n),
+        dtype=x.dtype,
+        device=x.device)
+    x_padded[:m, :n] = x
+    x_view = x_padded.view(-1, 128, x_padded.size(1) // 128, block_size_n)
+    x_amax = x_view.abs().float().amax(dim=(1, 3), keepdim=True).clamp(1e-4)
+    x_scaled = (x_view * (448.0 / x_amax)).to(torch.float8_e4m3fn)
+    x_scaled_sub = x_scaled.view_as(x_padded)[:m, :n].contiguous()
+    scales = (x_amax / 448.0).view(x_view.size(0), x_view.size(2))
+    return x_scaled_sub, scales
+
+
+def make_test_weights(e, n, k, block_shape, dtype):
+    use_fp8_w8a8 = dtype == torch.float8_e4m3fn
+    w_dtype = torch.bfloat16 if use_fp8_w8a8 else dtype
+
+    w1_16 = torch.randn((e, 2 * n, k), device="cuda", dtype=w_dtype) / 15
+    w2_16 = torch.randn((e, k, n), device="cuda", dtype=w_dtype) / 15
+
+    if use_fp8_w8a8:
+        w1_l = [None] * e
+        w2_l = [None] * e
+        w1_s = [None] * e
+        w2_s = [None] * e
+        for idx in range(e):
+            if block_shape is not None:
+                w1_l[idx], w1_s[idx] = per_block_cast_to_fp8(
+                    w1_16[idx],
+                    block_shape[1],
+                )
+                w2_l[idx], w2_s[idx] = per_block_cast_to_fp8(
+                    w2_16[idx],
+                    block_shape[1],
+                )
+            else:
+                tmp, w1_s[idx] = per_token_group_quant_fp8(
+                    w1_16[idx].view(1, -1),
+                    w1_16[idx].numel()
+                )
+                w1_l[idx] = tmp.view(*w1_16[idx].shape)
+
+                tmp, w2_s[idx] = per_token_group_quant_fp8(
+                    w2_16[idx].view(1, -1),
+                    w2_16[idx].numel()
+                )
+                w2_l[idx] = tmp.view(*w2_16[idx].shape)
+
+        w1 = torch.stack(w1_l)
+        w2 = torch.stack(w2_l)
+        w1_s = torch.stack(w1_s)
+        w2_s = torch.stack(w2_s)
+        if w1_s.ndim == 2:
+            assert w1_s.shape[-1] == 1
+            w1_s = w1_s.view(-1, 1, 1)
+            w2_s = w2_s.view(-1, 1, 1)
+
+        if block_shape is not None:
+            block_n, block_k = block_shape
+            n_tiles_w1 = ((2 * n) + block_n - 1) // block_n
+            k_tiles_w1 = (k + block_k - 1) // block_k
+            n_tiles_w2 = (k + block_n - 1) // block_n
+            k_tiles_w2 = (n + block_k - 1) // block_k
+            assert w1_s.shape == (e, n_tiles_w1, k_tiles_w1)
+            assert w2_s.shape == (e, n_tiles_w2, k_tiles_w2)
+    else:
+        w1 = w1_16
+        w2 = w2_16
+        w1_s = None
+        w2_s = None
+
+    return w1, w2, w1_s, w2_s, w1_16, w2_16
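
A shape-level sketch of the per_block_cast_to_fp8 helper added above (an illustration, not part of the commit; assumes a CUDA device and the vLLM repo root on the path): with the default block_size_n=128 it pads the input to 128x128 tiles, emits one float32 scale per tile, and returns values in torch.float8_e4m3fn.

import torch

from tests.kernels.moe.utils import per_block_cast_to_fp8

x = torch.randn(200, 384, device="cuda", dtype=torch.bfloat16)
x_fp8, scales = per_block_cast_to_fp8(x)

assert x_fp8.dtype == torch.float8_e4m3fn and x_fp8.shape == x.shape
# One scale per 128x128 tile of the padded input: ceil(200/128) x ceil(384/128).
assert scales.shape == (2, 3)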

tests/kernels/quant_utils.py

Lines changed: 2 additions & 6 deletions
@@ -100,7 +100,8 @@ def native_w8a8_block_matmul(
     As: torch.Tensor,
     Bs: torch.Tensor,
     block_size: list[int],
-    output_dtype: torch.dtype
+    output_dtype: torch.dtype,
+    compute_type: torch.dtype = torch.float32,
 ) -> torch.Tensor:
     """This function performs matrix multiplication with block-wise
     quantization using native torch.
@@ -111,11 +112,6 @@
     `Bs` (float32).
     The output is returned in the specified `output_dtype`.
     """
-    if A.dtype.itemsize <= 2:
-        compute_type = torch.bfloat16
-    else:
-        compute_type = torch.float32
-
     A = A.to(compute_type)
     B = B.to(compute_type)
     assert A.shape[-1] == B.shape[-1]
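
The reference matmul no longer infers its accumulation dtype from A's itemsize; it now computes in float32 unless a compute_type is passed. A hedged sketch of the new call, not from the commit, assuming the existing positional order (A, B, As, Bs, ...), a CUDA device, and scales of 1.0 so the quantized inputs are trivially valid:

import torch

from tests.kernels.quant_utils import native_w8a8_block_matmul

M, N, K, blk = 16, 256, 256, 128
A = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)
B = torch.randn(N, K, device="cuda").to(torch.float8_e4m3fn)
As = torch.ones(M, K // blk, device="cuda", dtype=torch.float32)         # per (token, K-block)
Bs = torch.ones(N // blk, K // blk, device="cuda", dtype=torch.float32)  # per (N-block, K-block)

# Default now accumulates in float32 ...
out = native_w8a8_block_matmul(A, B, As, Bs, [blk, blk], torch.bfloat16)
# ... and the old bfloat16 accumulation can still be requested explicitly.
out_bf16 = native_w8a8_block_matmul(A, B, As, Bs, [blk, blk], torch.bfloat16,
                                    compute_type=torch.bfloat16)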

vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py

Lines changed: 8 additions & 3 deletions
@@ -8,6 +8,9 @@
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe.utils import (
     _resize_cache, per_token_group_quant_fp8)
+from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEQuantConfig)
+
 
 logger = init_logger(__name__)
 
@@ -35,9 +38,11 @@ def __init__(
 
         assert self.block_shape == [self.DEEPGEMM_BLOCK_SHAPE, self.DEEPGEMM_BLOCK_SHAPE]
         super().__init__(
-            quant_dtype=torch.float8_e4m3fn,
-            per_act_token_quant=False,
-            block_shape=block_shape,
+            FusedMoEQuantConfig(
+                quant_dtype=torch.float8_e4m3fn,
+                per_act_token_quant=False,
+                block_shape=block_shape,
+            )
         )
         self.max_num_tokens = max_num_tokens
         self.world_size = world_size
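
The quantization settings previously passed as loose keyword arguments are now bundled into a FusedMoEQuantConfig and handed to the experts base class as a single object. A minimal construction sketch using only the fields visible in this diff (any other fields are assumed to keep their defaults):

import torch

from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig

# Same settings the BatchedDeepGemmExperts constructor now wraps up.
quant_config = FusedMoEQuantConfig(
    quant_dtype=torch.float8_e4m3fn,   # fp8 weights and activations
    per_act_token_quant=False,         # no per-token activation quantization
    block_shape=[128, 128],            # DeepGEMM 128x128 block quantization
)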
