Commit dcf59cf

per_act_token working
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent c52a3a0 commit dcf59cf

3 files changed: +15 −35 lines changed


tests/kernels/moe/test_batched_moe.py

Lines changed: 10 additions & 15 deletions
@@ -77,8 +77,8 @@ def make_tensors(config: BatchedMMConfig):
 @pytest.mark.parametrize(
     "dtype",
     [torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
-@pytest.mark.parametrize("block_shape", [None, [128, 128]]) # [None])#, [128, 128]])
-@pytest.mark.parametrize("per_act_token_quant", [False, True])# [False])# ,True])
+@pytest.mark.parametrize("block_shape", [None, [128, 128]])
+@pytest.mark.parametrize("per_act_token_quant", [False, True])
 def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
                     N: int, dtype: torch.dtype,
                     block_shape: Optional[list[int]],
@@ -141,8 +141,6 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,

     assert A_q.dtype == B_q.dtype

-    #B_scale.fill_(0.5)
-
     invoke_moe_batched_triton_kernel(
         A_q,
         B_q,
@@ -190,7 +188,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
     print(f"REF_OUTPUT {q_ref_output.shape}\n{q_ref_output}")
     print(f"TRITON {test_output.shape}\n{test_output}")

-    #torch.testing.assert_close(ref_output, q_ref_output, atol=atol, rtol=rtol)
+    torch.testing.assert_close(ref_output, q_ref_output, atol=atol, rtol=rtol)
     #torch.testing.assert_close(ref_output, test_output, atol=atol, rtol=rtol)
     torch.testing.assert_close(test_output, q_ref_output, atol=atol, rtol=rtol)

@@ -246,9 +244,6 @@ def test_fused_moe_batched_experts(
         per_act_token_quant=per_act_token_quant,
     )

-    # TODO remove
-    torch.set_printoptions(profile="full")
-
     with set_current_vllm_config(vllm_config):
         topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)

@@ -274,9 +269,9 @@ def test_fused_moe_batched_experts(
     else:
         baseline_output = torch_experts(a, w1_16, w2_16, topk_weight, topk_ids)

-    #triton_output = triton_moe(a, w1, w2, topk_weight, topk_ids, w1_s,
-    #                           w2_s, quant_dtype, per_act_token_quant,
-    #                           block_shape)
+    triton_output = triton_moe(a, w1, w2, topk_weight, topk_ids, w1_s,
+                               w2_s, quant_dtype, per_act_token_quant,
+                               block_shape)

     #print(f"TORCH {baseline_output.shape}\n{baseline_output}")
     #print(f"TRITON {triton_output.shape}\n{triton_output}")
@@ -292,7 +287,7 @@ def test_fused_moe_batched_experts(
     #                            atol=2e-2,
     #                            rtol=2e-2)

-    # torch.testing.assert_close(triton_output,
-    #                            batched_output,
-    #                            atol=2e-2,
-    #                            rtol=2e-2)
+    torch.testing.assert_close(triton_output,
+                               batched_output,
+                               atol=2e-2,
+                               rtol=2e-2)
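
For context, the quantized reference (q_ref_output) these assertions compare against amounts to a dequantized batched matmul over each expert's valid rows. A minimal sketch of that reference (hypothetical names, shapes inferred from the test; not the actual helper):

import torch

def ref_batched_mm(A_q, B_q, a_scale, b_scale, num_expert_tokens):
    # One GEMM per expert over that expert's valid rows; the rest of each
    # (E, max_tokens, K) batch is padding and stays zero.
    # A_q: (E, max_tokens, K) fp8 with per-token scales a_scale,
    # B_q: (E, N, K) fp8 with per-channel scales b_scale.
    E, max_tokens, _ = A_q.shape
    N = B_q.shape[1]
    C = torch.zeros(E, max_tokens, N, dtype=torch.float32)
    for e in range(E):
        n = int(num_expert_tokens[e])
        acc = A_q[e, :n].float() @ B_q[e].float().t()  # accumulate in fp32
        C[e, :n] = acc * a_scale[e, :n].view(n, 1) * b_scale[e].view(1, N)
    return C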

tests/kernels/moe/utils.py

Lines changed: 2 additions & 3 deletions
@@ -162,9 +162,8 @@ def make_quantized_test_activations(
             a_q[e], a_scale[e] = per_token_group_quant_fp8(
                 a[e], block_shape[1])
         else:
-            a_tmp, a_scale[e] = per_token_group_quant_fp8(
-                a[e].view(1, -1), a[e].numel())
-            a_q[e] = a_tmp.view(*a[e].shape)
+            a_q[e], a_scale[e] = ops.scaled_fp8_quant(
+                a[e], None, use_per_token_if_dynamic=per_act_token_quant)
     a_scale = torch.stack(a_scale)

     return a, a_q, a_scale
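
With use_per_token_if_dynamic=True and no precomputed scale, ops.scaled_fp8_quant computes one dynamic scale per row rather than per tensor. A rough plain-PyTorch equivalent of the semantics (a sketch, not the CUDA implementation):

import torch

FP8_MAX = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn

def per_token_fp8_quant(a: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    # One scale per token (row) of a (num_tokens, K) activation tensor.
    amax = a.abs().amax(dim=-1, keepdim=True).float().clamp(min=1e-12)
    scale = amax / FP8_MAX                                # (num_tokens, 1)
    a_q = (a / scale).clamp(-FP8_MAX, FP8_MAX).to(torch.float8_e4m3fn)
    return a_q, scale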

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 3 additions & 17 deletions
@@ -74,19 +74,9 @@ def moe_mmk(
         a_scale_ptrs = a_scale_ptr + offs_m * stride_asm
         a_scale = tl.load(a_scale_ptrs, mask=mask_m, other=0.0)[:, None]

-        b_scale_ptrs = b_scale_ptr + offs_n[None, :] * stride_bsn
+        b_scale_ptrs = b_scale_ptr + offs_bn[None, :] * stride_bsn
         b_scale = tl.load(b_scale_ptrs)

-
-        # Load per-token scale for activations
-        # + (expert_id * stride_ase)??
-        #a_scale_ptrs = a_scale_ptr + offs_m * stride_asm
-        #a_scale = tl.load(a_scale_ptrs, mask=mask_m, other=0.0)[:, None]
-
-        # TODO: probably not correct
-        #b_scale_ptrs = b_scale_ptr + expert_id * stride_bse #+ offs_n[None, :] * stride_bsn
-        #b_scale = tl.load(b_scale_ptrs)
-
     # tensor-wise
     else:
         a_scale = tl.load(a_scale_ptr)
@@ -134,10 +124,6 @@ def moe_mmk(
         a_ptrs += BLOCK_K * stride_ak
         b_ptrs += BLOCK_K * stride_bk

-        if False and per_act_token_quant:
-            a_scale_ptrs += BLOCK_K * stride_ask
-            b_scale_ptrs += BLOCK_K * stride_bsk
-
     if use_w8a16:
         accumulator = (accumulator * b_scale).to(compute_type)
     elif use_w8a8:
@@ -329,9 +315,9 @@ def batched_triton_kernel(
         a_scale_ptr = a_scale_ptr + cta_m_start * stride_asm
         #b_scale_ptr = b_scale_ptr + offs_bn * stride_bsn
         # b group advancement?
-    elif False and per_act_token_quant:
+    elif per_act_token_quant:
         a_scale_ptr = a_scale_ptr + cta_m_start * stride_asm
-        b_scale_ptr = b_scale_ptr + cta_n_start * stride_bsn
+        # b_scale_ptr = b_scale_ptr + cta_n_start * stride_bsn

     expert_triton_kernel(
         a_ptr,
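
The deleted `if False and per_act_token_quant` block advanced the scale pointers along K, but per-act-token (per-row) and per-channel (per-column) scales are constant along the reduction dimension: they are loaded once before the K loop and applied to the accumulator afterwards. A schematic of that structure in plain PyTorch (hypothetical shapes, B stored as (K, N) here for readability; a sketch, not the kernel):

import torch

def mmk_w8a8_per_act_token(a_q, b_q, a_scale, b_scale, BLOCK_K=32):
    # a_q: (M, K) fp8, a_scale: (M,) per-token; b_q: (K, N) fp8, b_scale: (N,).
    M, K = a_q.shape
    N = b_q.shape[1]
    acc = torch.zeros(M, N, dtype=torch.float32)
    for k0 in range(0, K, BLOCK_K):       # the K loop advances only a/b tiles
        acc += a_q[:, k0:k0 + BLOCK_K].float() @ b_q[k0:k0 + BLOCK_K].float()
    # scales applied once, after the reduction
    return acc * a_scale.view(M, 1) * b_scale.view(1, N)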
