
Commit 203dece ("wip test")
Parent: 680ecc5
Signed-off-by: Bill Nell <bnell@redhat.com>

2 files changed (+77, -14 lines)

tests/kernels/moe/test_batched_moe.py

Lines changed: 22 additions & 8 deletions
@@ -133,6 +133,8 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
         act_dtype = dtype
         quant_dtype = None
 
+    #print(f"TYPES {dtype}, {act_dtype}, {quant_dtype}")
+
     num_expert_tokens = torch.randint(low=0,
                                       high=max_tokens_per_expert,
                                       size=(num_experts, ),
@@ -153,7 +155,8 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
         num_experts,
         N // 2,
         K,
-        quant_dtype=dtype,
+        in_dtype=act_dtype,
+        quant_dtype=quant_dtype,
         block_shape=block_shape,
     )
 
@@ -168,6 +171,8 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
         torch.float32: tl.float32
     }[test_output.dtype]
 
+    assert A_q.dtype == B_q.dtype
+
     invoke_moe_batched_triton_kernel(
         A_q,
         B_q,
@@ -185,7 +190,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
         config={
             "BLOCK_SIZE_M": 16,
             "BLOCK_SIZE_N": 16,
-            "BLOCK_SIZE_K": 16
+            "BLOCK_SIZE_K": 16 if dtype.itemsize > 1 else 32
         },
         block_shape=block_shape,
     )
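Note: the hunk above keys BLOCK_SIZE_K off the element size: 16 for two-byte-and-wider dtypes, 32 for one-byte (fp8) dtypes. My assumption for the motivation (not stated in the commit) is that 8-bit matmuls in Triton want a deeper K block; a standalone sketch of the selection, with an illustrative helper name:

import torch

def pick_block_size_k(dtype: torch.dtype) -> int:
    # One-byte dtypes (e.g. torch.float8_e4m3fn) get a deeper K block;
    # wider dtypes keep the original BLOCK_SIZE_K of 16.
    return 16 if dtype.itemsize > 1 else 32

assert pick_block_size_k(torch.bfloat16) == 16
assert pick_block_size_k(torch.float8_e4m3fn) == 32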
@@ -209,7 +214,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
         torch.float32: (1e-2, 1e-2),
     }[test_output.dtype]
 
-    torch.testing.assert_close(ref_output, q_ref_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(ref_output, test_output, atol=atol, rtol=rtol)
     torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)
 
 
@@ -234,7 +239,6 @@ def test_fused_moe_batched_experts(
    current_platform.seed_everything(7)
 
    use_fp8_w8a8 = dtype == torch.float8_e4m3fn
-    quant_type = torch.float8_e4m3fn if use_fp8_w8a8 else None
 
    if not use_fp8_w8a8 and per_act_token_quant and block_shape is not None:
        pytest.skip("Skip quantization test for non-quantized type")
@@ -244,20 +248,30 @@ def test_fused_moe_batched_experts(
 
     a = torch.randn((m, k), device="cuda", dtype=torch.bfloat16) / 10
     score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16)
-    _, w1, w1_s, _, w2, w2_s = make_test_weights(e, n, k, block_shape=block_shape, quant_dtype=dtype)
+
+    if dtype.itemsize == 1:
+        act_dtype = torch.bfloat16
+        quant_dtype = dtype
+    else:
+        act_dtype = dtype
+        quant_dtype = None
+
+    _, w1, w1_s, _, w2, w2_s = make_test_weights(e, n, k, block_shape=block_shape,
+                                                 in_dtype=act_dtype,
+                                                 quant_dtype=quant_dtype)
 
     torch.set_printoptions(profile="full")
 
     with set_current_vllm_config(vllm_config):
         topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
         batched_output = batched_moe(a, w1, w2, topk_weight, topk_ids, w1_s,
-                                     w2_s, quant_type, per_act_token_quant,
+                                     w2_s, quant_dtype, per_act_token_quant,
                                      block_shape)
         baseline_output = torch_moe2(a, w1, w2, topk_weight, topk_ids, w1_s,
-                                     w2_s, quant_type, per_act_token_quant,
+                                     w2_s, quant_dtype, per_act_token_quant,
                                      block_shape)
         triton_output = triton_moe(a, w1, w2, topk_weight, topk_ids, w1_s,
-                                   w2_s, quant_type, per_act_token_quant,
+                                   w2_s, quant_dtype, per_act_token_quant,
                                    block_shape)
 
         torch.testing.assert_close(triton_output,
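Note: both tests in this file now split the parametrized dtype into an activation dtype and an optional quantization dtype, keying off itemsize. A minimal standalone sketch of that dispatch (the helper name is illustrative, not part of the diff):

import torch

def pick_dtypes(dtype: torch.dtype) -> tuple:
    if dtype.itemsize == 1:
        # One-byte dtypes (fp8 variants) are quantization targets;
        # activations are kept in bfloat16.
        return torch.bfloat16, dtype
    # Wider dtypes are used directly, with no quantization.
    return dtype, None

assert pick_dtypes(torch.float8_e4m3fn) == (torch.bfloat16, torch.float8_e4m3fn)
assert pick_dtypes(torch.bfloat16) == (torch.bfloat16, None)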

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 55 additions & 6 deletions
@@ -446,26 +446,73 @@ def prepare(
 
         num_local_experts = num_experts // self.world_size
 
-        assert quant_config.quant_dtype is None, "NYI"
-
         b_type = a1.dtype if quant_config.quant_dtype is None else quant_config.quant_dtype
 
         b_a1 = torch.zeros(
             (num_local_experts, self.max_num_tokens, hidden_dim),
             dtype=b_type,
             device=a1.device)
 
+        if quant_config.quant_dtype is not None:
+            if quant_config.block_shape is not None:
+                _, block_k = quant_config.block_shape
+                k_tiles = (hidden_dim + block_k - 1) // block_k
+                scale_shape = (num_local_experts, self.max_num_tokens, k_tiles)
+            else:
+                if quant_config.per_act_token_quant:
+                    num = self.max_num_tokens
+                else:
+                    num = 1
+                scale_shape = (num_local_experts, num, 1)
+
+            #print(f"SCALE_SHAPE {block_shape} {b_a1.shape} {scale_shape}")
+
+            b_a1_scale = torch.zeros(scale_shape,
+                                     dtype=torch.float32,
+                                     device=a1.device)
+        else:
+            assert a1_scale is None
+            b_a1_scale = None
+
         first_expert = num_local_experts * self.rank
         last_expert = first_expert + num_local_experts
 
         for expert_id in range(first_expert, last_expert):
             topks = torch.any(topk_ids == expert_id, dim=1).flatten()
             rows = torch.count_nonzero(topks.flatten())
-            b_a1[expert_id -
-                 first_expert, :rows, :] = a1[:topks.numel()][topks]
-            tokens_per_expert[expert_id - first_expert] = rows
+            rhs = a1[:topks.numel()][topks]
+            idx = expert_id - first_expert
+            if quant_config.quant_dtype is not None:
+                if a1_scale is not None:
+                    assert False, "NYI"
+                    rhs_a1_scale = a1_scale[:topks.numel()][topks]
+                else:
+                    rhs_a1_scale = None
+                b_a1[idx, :rows, :], b_s = moe_kernel_quantize_input(
+                    rhs,
+                    rhs_a1_scale,
+                    quant_config.quant_dtype,
+                    quant_config.per_act_token_quant,
+                    quant_config.block_shape,
+                )
+                assert b_s is not None
+                if (quant_config.block_shape is None
+                        and not quant_config.per_act_token_quant):
+                    print(f"SCALE {idx}, {b_a1_scale[idx, :].shape} {b_s.shape}")
+                    b_a1_scale[idx, :] = b_s
+                else:
+                    #print(f"XXXXX rhs={rhs.shape} b_s={b_s.shape}")
+                    assert rows == b_s.shape[0] and b_a1_scale.shape[
+                        -1] == b_s.shape[-1]
+                    b_a1_scale[idx, :rows] = b_s
+            else:
+                b_a1[idx, :rows, :] = rhs
 
-        return b_a1, a1_scale, tokens_per_expert, None, None
+            tokens_per_expert[idx] = rows
+
+        assert b_a1_scale is None or b_a1_scale.ndim == 3
+
+        return b_a1, b_a1_scale, tokens_per_expert, None, None
 
     def finalize(
         self,
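Note: the new scale buffer in prepare takes one of three shapes depending on the quantization scheme: blockwise quantization stores one scale per K-tile per token, per-token quantization one scale per token row, and per-tensor quantization a single scale per expert. A minimal sketch of that shape computation, mirroring the diff's logic (ceil_div is an illustrative helper):

def ceil_div(a: int, b: int) -> int:
    return (a + b - 1) // b

def batched_scale_shape(num_local_experts: int, max_num_tokens: int,
                        hidden_dim: int, block_shape, per_act_token_quant: bool):
    if block_shape is not None:
        # Blockwise: one scale per (token, K-tile) pair.
        _, block_k = block_shape
        return (num_local_experts, max_num_tokens, ceil_div(hidden_dim, block_k))
    if per_act_token_quant:
        # Per-token: one scale per token row.
        return (num_local_experts, max_num_tokens, 1)
    # Per-tensor: a single scale per expert batch.
    return (num_local_experts, 1, 1)

assert batched_scale_shape(4, 64, 1024, [128, 128], False) == (4, 64, 8)
assert batched_scale_shape(4, 64, 1024, None, True) == (4, 64, 1)
assert batched_scale_shape(4, 64, 1024, None, False) == (4, 1, 1)

Keeping the expert dimension first means each expert's tokens and scales live in one contiguous slice, matching the layout of b_a1 itself.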
@@ -770,6 +817,8 @@ def apply(
             config=config,
             block_shape=self.block_shape)
 
+        intermediate_cache2.fill_(0)
+
         # TODO: would be nice to use expert_num_tokens here to reduce
         # garbage compute
         self.activation(activation, intermediate_cache2.view(-1, N // 2),
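Note: the lone change to apply zeroes intermediate_cache2 in place before the activation runs. A plausible motivation (an assumption, not stated in the commit) is that rows past each expert's real token count hold uninitialized data, and zeroing keeps that padding from surfacing as NaN/Inf in later full-buffer operations, the same "garbage compute" the adjacent TODO mentions. A toy demonstration of the in-place zeroing pattern:

import torch

# Padded per-expert buffer: only `rows` of `max_tokens` rows are meaningful.
max_tokens, hidden = 8, 4
rows = 3
cache = torch.empty((max_tokens, hidden))  # uninitialized: may hold garbage

cache.fill_(0)                             # in-place zero of the whole buffer
cache[:rows] = torch.randn(rows, hidden)   # only valid rows get real data

# A full-buffer reduction now sees zeros, not garbage, in the padding.
assert torch.isfinite(cache).all()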
