
Commit d7bb199

prepare_finalize working

Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent: eab92d3

File tree

4 files changed: +126 −49 lines

tests/kernels/moe/test_batched_moe.py
tests/kernels/moe/test_pplx_moe.py
tests/kernels/quant_utils.py
vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py

tests/kernels/moe/test_batched_moe.py

Lines changed: 6 additions & 3 deletions
@@ -87,9 +87,12 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
 
     use_fp8_w8a8 = dtype == torch.float8_e4m3fn
 
-    if block_shape is not None and not use_fp8_w8a8:
+    if (per_act_token_quant or block_shape is not None) and not use_fp8_w8a8:
         pytest.skip("Don't test blocking for non-quantized types.")
 
+    if per_act_token_quant and block_shape is not None:
+        pytest.skip("Illegal quantization.")
+
     if dtype.itemsize == 1:
         act_dtype = torch.bfloat16
         quant_dtype = dtype
@@ -182,7 +185,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
 
     torch.testing.assert_close(ref_output, q_ref_output, atol=atol, rtol=rtol)
     #torch.testing.assert_close(ref_output, test_output, atol=atol, rtol=rtol)
-    #torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)
 
 
 @pytest.mark.parametrize("m", [1, 32, 45, 64, 222])
@@ -213,7 +216,7 @@ def test_fused_moe_batched_experts(
     if not use_fp8_w8a8 and per_act_token_quant and block_shape is not None:
        pytest.skip("Skip quantization test for non-quantized type")
 
-    if per_act_token_quant and block_shape is not None or topk > e:
+    if (per_act_token_quant and block_shape is not None) or topk > e:
        pytest.skip("Skip illegal quantization test")
 
    a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
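Note on the last hunk: the added parentheses do not change evaluation, since Python's "and" binds more tightly than "or"; they only make the intended grouping explicit. A minimal standalone sketch with illustrative values (not part of the commit):

# Python's "and" already binds tighter than "or", so both forms are equivalent.
per_act_token_quant, block_shape, topk, e = True, None, 6, 4
old_cond = per_act_token_quant and block_shape is not None or topk > e
new_cond = (per_act_token_quant and block_shape is not None) or topk > e
assert old_cond == new_cond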

tests/kernels/moe/test_pplx_moe.py

Lines changed: 71 additions & 27 deletions
@@ -20,6 +20,7 @@
 
 from tests.kernels.moe.utils import make_test_weights, naive_batched_moe
 from tests.kernels.utils import torch_experts
+from tests.kernels.quant_utils import dequant
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe import fused_topk, override_config
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
@@ -39,8 +40,14 @@
     reason="Requires PPLX kernels",
 )
 
-PPLX_PREPARE_COMBOS = [(4, 128, 128), (32, 1024, 512), (64, 1024, 512),
-                       (222, 2048, 1024)]
+PPLX_PREPARE_COMBOS = [
+    # (1, 128, 128),
+    (4, 128, 128),
+    (32, 1024, 512),
+    # (45, 512, 2048),
+    (64, 1024, 512),
+    (222, 2048, 1024),
+]
 
 PPLX_MOE_COMBOS = [
     (1, 128, 128),
@@ -194,18 +201,24 @@ def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
     return t[(r * chunk):(r + 1) * chunk]
 
 
+def dummy_work(a: torch.Tensor) -> torch.Tensor:
+    return a  # * 1.5
+
+
 def pplx_prepare_finalize(
     pgi: ProcessGroupInfo,
     dp_size: int,
     a: torch.Tensor,
-    a_scale: Optional[torch.Tensor],
     topk_weight: torch.Tensor,
     topk_ids: torch.Tensor,
     num_experts: int,
+    quant_dtype: Optional[torch.dtype],
+    block_shape: Optional[list[int]],
+    per_act_token_quant: bool,
     group_name: Optional[str],
 ) -> torch.Tensor:
     from vllm.model_executor.layers.fused_moe.pplx_prepare_finalize import (
-        PplxPrepareAndFinalize)
+        PplxPrepareAndFinalize, pplx_hidden_dim_scale_bytes)
 
     assert torch.cuda.current_device() == pgi.local_rank
 
@@ -214,7 +227,16 @@ def pplx_prepare_finalize(
     device = pgi.device
     rank = pgi.rank
     world_size = pgi.world_size
-    max_num_tokens = rank_chunk(num_tokens, 0, world_size)
+    max_num_tokens = max(rank_chunk(num_tokens, 0, world_size), 1)
+
+    hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
+        max_num_tokens,
+        hidden_dim,
+        a.dtype,
+        quant_dtype,
+        per_act_token_quant=per_act_token_quant,
+        block_shape=block_shape,
+    )
 
     args = dict(
         max_num_tokens=max_num_tokens,
@@ -224,8 +246,8 @@
         world_size=world_size,
         dp_size=dp_size,
         hidden_dim=hidden_dim,
-        hidden_dim_bytes=hidden_dim * a.dtype.itemsize,
-        hidden_dim_scale_bytes=0,
+        hidden_dim_bytes=hidden_dim_bytes,
+        hidden_dim_scale_bytes=scale_bytes,
     )
 
     if group_name is None:
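A back-of-envelope check of the per-token byte budget these buffers imply (illustrative arithmetic only; the exact layout and padding chosen by pplx_hidden_dim_scale_bytes is vllm's implementation detail):

# Assumed example: fp8 activations, hidden_dim = 1024, block_shape = [128, 128].
hidden_dim = 1024
fp8_itemsize = 1                          # bytes per quantized element
scale_itemsize = 4                        # float32 scale entries
blocks_per_row = -(-hidden_dim // 128)    # ceil(1024 / 128) = 8

payload_bytes = hidden_dim * fp8_itemsize           # 1024 bytes of quantized data
min_scale_bytes = blocks_per_row * scale_itemsize   # at least 32 bytes of scales
print(payload_bytes, min_scale_bytes)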
@@ -257,10 +279,17 @@
         num_experts,
         None,
         False,
-        FusedMoEQuantConfig(),
+        FusedMoEQuantConfig(
+            quant_dtype,
+            per_act_token_quant,
+            False,
+            block_shape,
+        ),
     )
 
-    b_a = b_a * 1.5
+    # Do some fake work
+    #print(f"INTER {b_a.shape} {b_a_scale.shape if b_a_scale is not None else None}")
+    b_a = dummy_work(dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype))
 
     out = torch.full(
         (max_num_tokens, hidden_dim),
@@ -290,10 +319,12 @@ def _pplx_prepare_finalize(
     pgi: ProcessGroupInfo,
     dp_size: int,
     a: torch.Tensor,
-    a_scale: Optional[torch.Tensor],
     score: torch.Tensor,
     topk: torch.Tensor,
     num_experts: int,
+    quant_dtype: Optional[torch.dtype],
+    block_shape: Optional[list[int]],
+    per_act_token_quant: bool,
     use_internode: bool,
 ):
     if use_internode:
@@ -307,24 +338,35 @@ def _pplx_prepare_finalize(
         cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
         group_name = cpu_group.group_name
 
-    device = pgi.device
+    #device = pgi.device
 
     topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
-    k = a.shape[1]
+    m, k = a.shape
 
-    a_rep = torch.repeat_interleave(a, topk, dim=0).to(device)
+    a_rep = torch.repeat_interleave(dummy_work(a), topk, dim=0)  #.to(device)
 
-    torch_output = (a_rep.view(-1, topk, k) * 1.5 *
-                    topk_weight.view(-1, topk, 1).to(device)).sum(dim=1).to(
-                        a.dtype)
+    if True:
+        torch_output = (a_rep.view(m, topk, k) *
+                        topk_weight.view(m, topk, 1).to(a_rep.dtype)).sum(dim=1)
+    else:
+        import vllm._custom_ops as ops
+        a_rep = a_rep.view(m, topk, k)
+        a_rep.mul_(topk_weight.view(m, topk, 1).to(a_rep.dtype))
+        torch_output = torch.empty_like(a)
+        ops.moe_sum(a_rep, torch_output)
 
-    pplx_output = pplx_prepare_finalize(pgi, dp_size, a, a_scale, topk_weight, topk_ids,
-                                        num_experts, group_name)
+    pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight, topk_ids,
+                                        num_experts, quant_dtype, block_shape,
+                                        per_act_token_quant, group_name)
 
     torch_output = chunk_by_rank(torch_output, pgi.rank,
                                  pgi.world_size).to(pplx_output.device)
 
-    torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
+    #torch.set_printoptions(profile="full")
+    #print(f"PPLX {pplx_output.shape}\n{pplx_output.shape}")
+    #print(f"TORCH {torch_output.shape}\n{torch_output.shape}")
+
+    torch.testing.assert_close(pplx_output, torch_output, atol=3e-2, rtol=3e-2)
 
     if use_internode:
         nvshmem_finalize()
@@ -333,11 +375,10 @@ def _pplx_prepare_finalize(
 # TODO (bnell): this test point does not work for odd M due to how the test is
 # written, not due to limitations of the pplx kernels. The pplx_moe
 # test below is able to deal with odd M.
-# TODO (bnell) add fp8 tests
 @pytest.mark.parametrize("mnk", PPLX_PREPARE_COMBOS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", [torch.bfloat16])  # torch.float8_e4m3fn,
+@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
 @pytest.mark.parametrize("world_dp_size", [[2, 1]])
 @pytest.mark.parametrize("per_act_token_quant", [False])
 @pytest.mark.parametrize("block_shape", [None, [128, 128]])
@@ -356,28 +397,31 @@ def test_pplx_prepare_finalize(
     if dtype == torch.float8_e4m3fn:
         use_fp8_w8a8 = True
         act_dtype = torch.bfloat16
+        quant_dtype = dtype
     else:
         use_fp8_w8a8 = False
         act_dtype = dtype
+        quant_dtype = None
 
     if not use_fp8_w8a8 and (per_act_token_quant or block_shape is not None):
         pytest.skip("Skip quantization test for non-quantized type")
 
     if per_act_token_quant and block_shape is not None:
-        pytest.skip("Skip illgal quantization combination")
+        pytest.skip("Skip illegal quantization combination")
 
     current_platform.seed_everything(7)
     m, n, k = mnk
     world_size, dp_size = world_dp_size
     device = "cuda"
 
+    #print(f"MNK = {mnk}")
+
     a = torch.randn((m, k), device=device, dtype=act_dtype) / 10
     score = torch.randn((m, e), device=device, dtype=act_dtype)
 
-    a, a_scale = moe_kernel_quantize_input(a, None, dtype, False, block_shape)
-
-    parallel_launch(world_size, _pplx_prepare_finalize, dp_size, a, a_scale, score,
-                    topk, e, use_internode)
+    parallel_launch(world_size, _pplx_prepare_finalize, dp_size,
+                    a, score, topk, e, quant_dtype, block_shape,
+                    per_act_token_quant, use_internode)
 
 
 def pplx_moe(
@@ -661,7 +705,7 @@ def test_pplx_moe(
         pytest.skip("Skip quantization test for non-quantized type")
 
     if per_act_token_quant and block_shape is not None:
-        pytest.skip("Skip illgal quantization combination")
+        pytest.skip("Skip illegal quantization combination")
 
     a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
     score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
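For reference, the torch-side baseline that _pplx_prepare_finalize now checks against reduces to "replicate each token topk times, weight each copy by its routing weight, and sum". A self-contained sketch with made-up shapes (illustrative only, not the committed test code):

import torch

m, k, topk = 4, 8, 2
a = torch.randn(m, k)
topk_weight = torch.rand(m, topk)

a_rep = torch.repeat_interleave(a, topk, dim=0)      # (m * topk, k)
ref = (a_rep.view(m, topk, k) *
       topk_weight.view(m, topk, 1)).sum(dim=1)      # (m, k), mirrors the test
assert ref.shape == (m, k)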

tests/kernels/quant_utils.py

Lines changed: 16 additions & 7 deletions
@@ -236,12 +236,21 @@ def per_block_cast_to_fp8(
     return x_scaled_sub, scales
 
 
-def _dequant(t: torch.Tensor, scale: torch.Tensor, block_shape, per_act_token_quant) -> torch.Tensor:
-    f32 = torch.float32
-    if per_act_token_quant or block_shape is None:
-        return t.to(f32) * scale
+def dequant(
+    t: torch.Tensor,
+    scale: Optional[torch.Tensor],
+    block_shape: Optional[list[int]],
+    per_act_token_quant: bool,
+    out_dtype: Optional[torch.dtype] = torch.float32,
+) -> torch.Tensor:
+    if scale is not None:
+        f32 = torch.float32
+        if per_act_token_quant or block_shape is None:
+            return (t.to(f32) * scale).to(out_dtype)
+        else:
+            return (t.to(f32) * group_broadcast(scale, t.shape)).to(out_dtype)
     else:
-        return t.to(f32) * group_broadcast(scale, t.shape)
+        return t.to(out_dtype)
 
 
 def native_batched_masked_quant_matmul(
@@ -269,8 +278,8 @@ def native_batched_masked_quant_matmul(
             C[e, :num_tokens, :] = tmp[:num_tokens, :]
         elif A.dtype.itemsize == 1 and block_shape is None:
             assert A_scale is not None and B_scale is not None
-            A_dq = _dequant(A[e], A_scale[e], block_shape, per_act_token_quant)
-            B_dq = _dequant(B[e], B_scale[e], block_shape, per_act_token_quant)
+            A_dq = dequant(A[e], A_scale[e], block_shape, per_act_token_quant)
+            B_dq = dequant(B[e], B_scale[e], block_shape, per_act_token_quant)
             C[e, :num_tokens, :] = (A_dq[:num_tokens] @ B_dq.transpose(0, 1)).to(C.dtype)
         else:
             assert A_scale is None
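The per-token/per-tensor branch of the renamed dequant helper is just an upcast-and-multiply; only the block-wise branch needs group_broadcast to expand grouped scales to the tensor shape. A minimal sketch of the per-token path under assumed shapes (illustrative only, not the helper itself):

import torch

t_q = torch.randn(4, 8)              # stand-in for already-quantized fp8 data
scale = torch.rand(4, 1)             # one float32 scale per token (per-act-token)

deq = (t_q.to(torch.float32) * scale).to(torch.bfloat16)   # out_dtype analogue
assert deq.shape == t_q.shape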

vllm/model_executor/layers/fused_moe/pplx_prepare_finalize.py

Lines changed: 33 additions & 12 deletions
@@ -127,6 +127,17 @@ def prepare(
             quant_config.quant_dtype, quant_config.per_act_token_quant,
             quant_config.block_shape)
 
+        if quant_config.quant_dtype is not None:
+            if quant_config.is_per_tensor:
+                assert a1q_scale.numel() == 1
+            elif quant_config.is_per_act_token:
+                assert a1q_scale.numel() == a1.numel()
+                assert a1q_scale.shape == a1.shape
+            else:
+                assert a1q_scale.numel() == a1.shape[0] * cdiv(a1.shape[1], quant_config.block_shape[1])
+                assert a1q_scale.shape == (a1.shape[0], cdiv(a1.shape[1], quant_config.block_shape[1]))
+                #a1q_scale = group_broadcast(scale, a1q.shape)
+
         if a1q_scale is not None:
             scalar_scales = a1q_scale.numel() == 1
 
@@ -138,15 +149,21 @@ def prepare(
             orig_a_scale_block_shape = a1q_scale.shape[-1]
 
             # pad out scales if needed. TODO (bnell): do for non-scalar scales?
-            if False and scalar_scales:
-                print(f"a1q_scale {a1q.shape}, {a1q_scale.shape}")
-                a1q_scale = a1q_scale.repeat(a1q.shape[1],
-                                             4 * torch.float32.itemsize)
+            if False and (scalar_scales or quant_config.is_per_tensor):
+                #print(f"a1q_scale {a1q.shape}, {a1q_scale.shape}")
+                a1q_scale = a1q_scale.repeat(1, 4 * torch.float32.itemsize)
+            else:
+                #a1q_scale = torch.repeat_interleave(a1q_scale, round_up(a1q_scale.shape[1], 16), dim=1)
+                #a1q_scale = torch.nn.functional.pad(a1q_scale, pad=(0, 16-a1q_scale.shape[1]), mode='replicate')
+                pass
 
-            a1q_scale = a1q_scale.repeat(repeat_rows, repeat_cols)
+            if not quant_config.is_grouped:
+                a1q_scale = a1q_scale.repeat(repeat_rows, repeat_cols)
 
        #assert a1_scale is None or a1_scale.shape[0] == a1q.shape[1], f"{a1_scale.shape}, {a1q_scale.shape}"
 
+        #print(f"FINAL SCALE SHAPE {a1q_scale.shape}")
+
        assert a1q_scale is None or a1q_scale.ndim == 2, \
            f"{0 if a1q_scale is None else (a1q_scale.ndim, a1q_scale.shape)}"
 
@@ -173,16 +190,23 @@ def prepare(
         expert_x_scale: Optional[torch.Tensor] = None
         if a1q.dtype.itemsize == 1:
             float32_size = torch.float32.itemsize
-            block_size = (quant_config.block_shape[1] if quant_config.
-                          block_shape is not None else 1) * float32_size
+
+            if quant_config.is_per_act_token:
+                final_dim = expert_x.size(2)
+                assert final_dim % 4 == 0  #?
+            elif quant_config.is_per_tensor:
+                final_dim = 4
+            else:
+                num_blocks = cdiv(expert_x.size(2), quant_config.block_shape[1])
+                final_dim = round_up(num_blocks, 4)
 
             expert_x_scale_shape = (
                 num_local_experts,
                 expert_x.size(1),
-                cdiv(expert_x.size(2), block_size) if not scalar_scales else 1,
+                final_dim,
             )
 
-            print(f"EXPERT_X_SCALE {expert_x_scale_shape}")
+            #print(f"EXPERT_X_SCALE {expert_x_scale_shape}")
 
             expert_x_scale = torch.zeros(
                 expert_x_scale_shape,
@@ -207,9 +231,6 @@ def prepare(
        )
        #print(f"DISPATCH DONE {device}")
 
-        if expert_x_scale is not None:
-            expert_x_scale = expert_x_scale[:, :, 0:1]
-
        if expert_x_scale is not None:
            expert_x_scale = expert_x_scale[:, :, :orig_a_scale_block_shape]
            assert expert_x_scale.ndim == 3