vllm-project
diff --git a/‎tests/kernels/moe/test_batched_moe.py
Lines changed: 18 additions & 41 deletions b/‎tests/kernels/moe/test_batched_moe.py
Lines changed: 18 additions & 41 deletions
diff --git a/‎tests/kernels/moe/test_pplx_moe.py
Lines changed: 42 additions & 90 deletions b/‎tests/kernels/moe/test_pplx_moe.py
Lines changed: 42 additions & 90 deletions
diff --git a/‎tests/kernels/moe/utils.py
Lines changed: 1 addition & 0 deletions b/‎tests/kernels/moe/utils.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎tests/kernels/quant_utils.py
Lines changed: 19 additions & 2 deletions b/‎tests/kernels/quant_utils.py
Lines changed: 19 additions & 2 deletions
diff --git a/‎vllm/model_executor/layers/fused_moe/config.py
Lines changed: 2 additions & 1 deletion b/‎vllm/model_executor/layers/fused_moe/config.py
Lines changed: 2 additions & 1 deletion
@@ -100,8 +100,6 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
         act_dtype = dtype
         quant_dtype = None
 
-    #print(f"TYPES {dtype}, {act_dtype}, {quant_dtype}")
-
     num_expert_tokens = torch.randint(low=0,
                                       high=max_tokens_per_expert,
                                       size=(num_experts, ),
@@ -183,19 +181,13 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
         torch.float32: (1e-2, 1e-2),
     }[test_output.dtype]
 
-    if False:
-        torch.set_printoptions(profile="full")
-        print(f"REF_OUTPUT {q_ref_output.shape}\n{q_ref_output}")
-        print(f"TRITON {test_output.shape}\n{test_output}")
-
     torch.testing.assert_close(ref_output, q_ref_output, atol=atol, rtol=rtol)
-    #torch.testing.assert_close(ref_output, test_output, atol=atol, rtol=rtol)
     torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)
 
 
 @pytest.mark.parametrize("m", [1, 32, 45, 64, 222])
-@pytest.mark.parametrize("n", [128, 512, 1024])#, 2048])
-@pytest.mark.parametrize("k", [128, 512, 1024])#, 2048])
+@pytest.mark.parametrize("n", [128, 512, 1024, 2048])
+@pytest.mark.parametrize("k", [128, 512, 1024, 2048])
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
@@ -221,7 +213,7 @@ def test_fused_moe_batched_experts(
     if not use_fp8_w8a8 and per_act_token_quant and block_shape is not None:
         pytest.skip("Skip quantization test for non-quantized type")
 
-    if (per_act_token_quant and block_shape is not None) or topk > e:
+    if per_act_token_quant and block_shape is not None:
         pytest.skip("Skip illegal quantization test")
 
     a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
@@ -247,46 +239,31 @@ def test_fused_moe_batched_experts(
     with set_current_vllm_config(vllm_config):
         topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
 
-        if True:
-            batched_output = naive_batched_moe(a, w1, w2, topk_weight, topk_ids, w1_s,
-                                               w2_s, quant_dtype, per_act_token_quant,
-                                               block_shape)
-        else:
-            batched_output = naive_batched_moe(a, w1_16, w2_16, topk_weight, topk_ids)
-
-        if True:
-            baseline_output = torch_experts(
-                a,
-                w1,
-                w2,
-                topk_weight,
-                topk_ids,
-                w1_scale=w1_s,
-                w2_scale=w2_s,
-                quant_dtype=quant_dtype,
-                per_act_token_quant=per_act_token_quant,
-                block_shape=block_shape)
-        else:
-            baseline_output = torch_experts(a, w1_16, w2_16, topk_weight, topk_ids)
+        batched_output = naive_batched_moe(a, w1, w2, topk_weight, topk_ids, w1_s,
+                                           w2_s, quant_dtype, per_act_token_quant,
+                                           block_shape)
+
+        baseline_output = torch_experts(
+            a,
+            w1,
+            w2,
+            topk_weight,
+            topk_ids,
+            w1_scale=w1_s,
+            w2_scale=w2_s,
+            quant_dtype=quant_dtype,
+            per_act_token_quant=per_act_token_quant,
+            block_shape=block_shape)
 
         triton_output = triton_moe(a, w1, w2, topk_weight, topk_ids, w1_s,
                                    w2_s, quant_dtype, per_act_token_quant,
                                    block_shape)
 
-    #print(f"TORCH {baseline_output.shape}\n{baseline_output}")
-    #print(f"TRITON {triton_output.shape}\n{triton_output}")
-    #print(f"BATCHED {batched_output.shape}\n{batched_output}")
-
     torch.testing.assert_close(batched_output,
                                baseline_output,
                                atol=3e-2,
                                rtol=2e-2)
 
-    # torch.testing.assert_close(triton_output,
-    #                            baseline_output,
-    #                            atol=2e-2,
-    #                            rtol=2e-2)
-
     torch.testing.assert_close(triton_output,
                                batched_output,
                                atol=2e-2,
 
@@ -20,7 +20,7 @@
 
 from tests.kernels.moe.utils import make_test_weights, naive_batched_moe
 from tests.kernels.utils import torch_experts
-from tests.kernels.quant_utils import dequant
+from tests.kernels.quant_utils import batched_dequant
 from vllm.config import VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe import fused_topk, override_config
 from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig
@@ -41,16 +41,19 @@
 )
 
 PPLX_PREPARE_COMBOS = [
-#    (1, 128, 128),
+    # TODO: figure out why this fails
+    #(1, 128, 128),
+    (2, 128, 512),
+    (3, 1024, 2048),
     (4, 128, 128),
     (32, 1024, 512),
-#    (45, 512, 2048),
+    (45, 512, 2048),
     (64, 1024, 512),
     (222, 2048, 1024),
 ]
 
 PPLX_MOE_COMBOS = [
-    (1, 128, 128),
+#    (1, 128, 128),
     (2, 128, 512),
     (3, 1024, 2048),
     (32, 128, 1024),
@@ -202,7 +205,7 @@ def chunk_by_rank(t: torch.Tensor, r: int, w: int) -> torch.Tensor:
 
 
 def dummy_work(a: torch.Tensor) -> torch.Tensor:
-    return a # * 1.5
+    return a * 1.1
 
 
 def pplx_prepare_finalize(
@@ -270,6 +273,13 @@ def pplx_prepare_finalize(
     chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
     chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
 
+    out = torch.full(
+        (max_num_tokens, hidden_dim),
+        torch.nan,
+        dtype=a.dtype,
+        device=device,
+    )
+
     b_a, b_a_scale, expert_num_tokens, _, _ = prepare_finalize.prepare(
         a_chunk,
         None,
@@ -287,16 +297,10 @@ def pplx_prepare_finalize(
         ),
     )
 
-    # Do some fake work
-    #print(f"INTER {b_a.shape} {b_a_scale.shape if b_a_scale is not None else None}")
-    b_a = dummy_work(dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype))
+    #print(f"B_A_SCALE = {b_a.shape}, {b_a_scale.shape if b_a_scale is not None else None}, {per_act_token_quant} {block_shape}, {a_chunk.shape}")
+    # TOOD: shouldn't need batched_dequant
 
-    out = torch.full(
-        (max_num_tokens, hidden_dim),
-        torch.nan,
-        dtype=a.dtype,
-        device=device,
-    )
+    b_a = dummy_work(batched_dequant(b_a, b_a_scale, block_shape, per_act_token_quant, a.dtype))
 
     prepare_finalize.finalize(
         out,
@@ -338,49 +342,34 @@ def _pplx_prepare_finalize(
         cpu_group = torch.distributed.new_group(group_ranks, backend="gloo")
         group_name = cpu_group.group_name
 
-    #device = pgi.device
-
     topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
     m, k = a.shape
 
-    a_rep = torch.repeat_interleave(dummy_work(a), topk, dim=0) #.to(device)
+    a_rep = torch.repeat_interleave(dummy_work(a), topk, dim=0)
 
-    if True:
-        torch_output = (a_rep.view(m, topk, k) *
-                        topk_weight.view(m, topk, 1).to(a_rep.dtype)).sum(dim=1)
-    else:
-        import vllm._custom_ops as ops
-        a_rep = a_rep.view(m, topk, k)
-        a_rep.mul_(topk_weight.view(m, topk, 1).to(a_rep.dtype))
-        torch_output = torch.empty_like(a)
-        ops.moe_sum(a_rep, torch_output)
+    torch_output = (a_rep.view(m, topk, k) *
+                    topk_weight.view(m, topk, 1).to(a_rep.dtype)).sum(dim=1)
 
     pplx_output = pplx_prepare_finalize(pgi, dp_size, a, topk_weight, topk_ids,
                                         num_experts, quant_dtype, block_shape,
                                         per_act_token_quant, group_name)
 
-    torch_output = chunk_by_rank(torch_output, pgi.rank,
-                                 pgi.world_size).to(pplx_output.device)
-
-    #torch.set_printoptions(profile="full")
-    #print(f"PPLX {pplx_output.shape}\n{pplx_output.shape}")
-    #print(f"TORCH {torch_output.shape}\n{torch_output.shape}")
+    torch_output = chunk_by_rank(torch_output, pgi.rank, pgi.world_size).to(pgi.device)
 
     torch.testing.assert_close(pplx_output, torch_output, atol=3e-2, rtol=3e-2)
 
     if use_internode:
         nvshmem_finalize()
 
 
-# TODO (bnell): this test point does not work for odd M due to how the test is
-# written, not due to limitations of the pplx kernels.  The pplx_moe
-# test below is able to deal with odd M.
+# TODO (bnell): this test point does not work for M==1 due to how the test
+# is written, not due to limitations of the pplx kernels.
 @pytest.mark.parametrize("mnk", PPLX_PREPARE_COMBOS)
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
 @pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
 @pytest.mark.parametrize("world_dp_size", [[2, 1]])
-@pytest.mark.parametrize("per_act_token_quant", [False])
+@pytest.mark.parametrize("per_act_token_quant", [False, True])
 @pytest.mark.parametrize("block_shape", [None, [128, 128]])
 @pytest.mark.parametrize("use_internode", [False])
 @requires_pplx
@@ -414,8 +403,6 @@ def test_pplx_prepare_finalize(
     world_size, dp_size = world_dp_size
     device = "cuda"
 
-    #print(f"MNK = {mnk}")
-
     a = torch.randn((m, k), device=device, dtype=act_dtype) / 10
     score = torch.randn((m, e), device=device, dtype=act_dtype)
 
@@ -508,10 +495,13 @@ def pplx_moe(
     w1_chunk = chunk_by_rank(w1, rank, world_size).to(device)
     w2_chunk = chunk_by_rank(w2, rank, world_size).to(device)
 
-    # TODO scale chunk function
     if w1_scale is not None:
-        w1_scale_chunk = chunk_by_rank(w1_scale, rank, world_size).to(device)
-        w2_scale_chunk = chunk_by_rank(w2_scale, rank, world_size).to(device)
+        if not per_act_token_quant:
+            w1_scale_chunk = w1_scale
+            w2_scale_chunk = w2_scale
+        else:
+            w1_scale_chunk = chunk_by_rank(w1_scale, rank, world_size).to(device)
+            w2_scale_chunk = chunk_by_rank(w2_scale, rank, world_size).to(device)
     else:
         w1_scale_chunk = None
         w2_scale_chunk = None
@@ -562,48 +552,6 @@ def pplx_moe(
     return out
 
 
-def _batched_moe(pgi, dp_size, a, w1, w2, topk_weight, topk_ids):
-    assert torch.cuda.current_device() == pgi.local_rank
-
-    num_experts = w1.shape[0]
-    device = pgi.device
-    rank = pgi.rank
-    world_size = pgi.world_size
-    max_num_tokens = rank_chunk(a.shape[0], 0, world_size)
-
-    prepare_finalize = BatchedPrepareAndFinalize(
-        max_num_tokens=max_num_tokens,
-        world_size=world_size,
-        dp_size=dp_size,
-        rank=rank,
-    )
-
-    experts = NaiveBatchedExperts(max_num_tokens=a.shape[0],
-                                  world_size=1,
-                                  dp_size=1)
-
-    fused_experts = FusedMoEModularKernel(
-        prepare_finalize,
-        experts,
-    )
-
-    # Note: workers with the same dp_rank must use the exact same inputs.
-    a_chunk = chunk_by_rank(a, rank, world_size).to(device)
-    chunk_topk_weight = chunk_by_rank(topk_weight, rank, world_size).to(device)
-    chunk_topk_ids = chunk_by_rank(topk_ids, rank, world_size).to(device)
-
-    out = fused_experts(
-        a_chunk,
-        # Chunking weights like this only works for batched format
-        chunk_by_rank(w1, rank, world_size).to(device),
-        chunk_by_rank(w2, rank, world_size).to(device),
-        chunk_topk_weight,
-        chunk_topk_ids,
-        global_num_experts=num_experts)
-
-    return out
-
-
 def _pplx_moe(
     pgi: ProcessGroupInfo,
     dp_size: int,
@@ -654,18 +602,22 @@ def _pplx_moe(
                                      quant_dtype=qtype,
                                      per_act_token_quant=per_act_token_quant,
                                      block_shape=block_shape)
+
         pplx_output = pplx_moe(group_name, pgi.rank, pgi.world_size, dp_size,
                                a, w1, w2, topk_weight, topk_ids, w1_s, w2_s,
                                qtype, per_act_token_quant, block_shape)
-        # TODO (bnell): fix + re-enable
-        #batched_output = _batched_moe(pgi, dp_size, a, w1, w2, topk_weight,
-        #                              topk_ids)
 
-    torch_output = chunk_by_rank(torch_output, pgi.rank,
-                                 pgi.world_size).to(pplx_output.device)
+        # all reduce on pplx?
+        #torch.distributed.all_reduce(pplx_output)
+
+        batched_output = naive_batched_moe(a, w1, w2, topk_weight,
+            topk_ids, w1_s, w2_s, qtype, per_act_token_quant, block_shape)
+
+    chunked_torch_output = chunk_by_rank(torch_output, pgi.rank,
+                                         pgi.world_size).to(pplx_output.device)
 
-    torch.testing.assert_close(pplx_output, torch_output, atol=2e-2, rtol=0)
-    #torch.testing.assert_close(batched_output, torch_output, atol=2e-2, rtol=0)
+    torch.testing.assert_close(pplx_output, chunked_torch_output, atol=3e-2, rtol=3e-2)
+    #torch.testing.assert_close(batched_output, torch_output, atol=3e-2, rtol=3e-2)
 
     if use_internode:
         nvshmem_finalize()
 
@@ -172,6 +172,7 @@ def make_quantized_test_activations(
     return a, a_q, a_scale
 
 
+# TODO: split this into two calls to a single function
 def make_test_weights(
     e: int,
     n: int,
 
@@ -237,8 +237,8 @@ def per_block_cast_to_fp8(
 
 
 def dequant(
-    t: torch.Tensor, scale:
-    Optional[torch.Tensor],
+    t: torch.Tensor,
+    scale: Optional[torch.Tensor],
     block_shape: Optional[list[int]],
     per_act_token_quant: bool,
     out_dtype: Optional[torch.dtype] = torch.float32,
@@ -253,6 +253,23 @@ def dequant(
         return t.to(out_dtype)
 
 
+def batched_dequant(
+    t: torch.Tensor, scale:
+    Optional[torch.Tensor],
+    block_shape: Optional[list[int]],
+    per_act_token_quant: bool,
+    out_dtype: Optional[torch.dtype] = torch.float32,
+) -> torch.Tensor:
+    if scale is not None:
+        assert t.shape[0] == scale.shape[0]
+        out = torch.empty_like(t, dtype=out_dtype)
+        for e in range(t.shape[0]):
+            out[e] = dequant(t[e], scale[e], block_shape, per_act_token_quant, out_dtype)
+        return out
+
+    return t.to(out_dtype)
+
+
 def native_batched_masked_quant_matmul(
     A: torch.Tensor,
     B: torch.Tensor,
 
@@ -129,7 +129,6 @@ def make(
                                              use_int8_w8a8=use_int8_w8a8,
                                              use_int8_w8a16=use_int8_w8a16,
                                              use_int4_w4a16=use_int4_w4a16)
-
         return FusedMoEQuantConfig(
             quant_dtype,
             per_act_token_quant,
@@ -300,6 +299,8 @@ def __post_init__(self):
             logger.debug("Using FusedMoEConfig::max_num_tokens=%d",
                          self.max_num_tokens)
 
+        assert self.max_num_tokens > 0
+
     @property
     def quant_dtype(self) -> Optional[torch.dtype]:
         if self.quant_config is not None: