Disable buggy fp8 tests and remove the unfinished quantization path from batched MoE prepare

bnellnm · bnellnm · commit 7a9567909615 · 2025-06-18T20:27:54.000Z
Signed-off-by: Bill Nell <bnell@redhat.com>
diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py
@@ -67,8 +67,6 @@ def make_tensors(config: BatchedMMConfig):
                                           device="cuda",
                                           dtype=torch.int32)
 
-
-
         return BatchedMMTensors(A, B, C, num_expert_tokens)
 
 
@@ -111,9 +109,7 @@ def ref_impl(
                          [32, 64, 128, 192, 224, 256, 512])
 @pytest.mark.parametrize("K", [128, 256, 1024])
 @pytest.mark.parametrize("N", [128, 256, 512, 1024])
-@pytest.mark.parametrize(
-    "dtype",
-    [torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("block_shape", [None])
 @pytest.mark.parametrize("per_act_token_quant", [False])
 def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
@@ -223,7 +219,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
 @pytest.mark.parametrize("k", [128, 512, 1024, 2048])
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("per_act_token_quant", [False])
 @pytest.mark.parametrize("block_shape", [None])
 def test_fused_moe_batched_experts(
diff --git a/vllm/model_executor/layers/fused_moe/fused_batched_moe.py b/vllm/model_executor/layers/fused_moe/fused_batched_moe.py
@@ -318,8 +318,8 @@ def invoke_moe_batched_triton_kernel(
         expert_num_tokens: torch.Tensor,  # [E]
         compute_type: tl.dtype,
         # Quantization data
-        A_scale: torch.Tensor, # Optional
-        B_scale: torch.Tensor, # Optional
+        A_scale: Optional[torch.Tensor],
+        B_scale: Optional[torch.Tensor],
         B_zp: torch.Tensor,
         # Quantization schemes
         use_fp8_w8a8: bool,
@@ -453,61 +453,18 @@ def prepare(
             dtype=b_type,
             device=a1.device)
 
-        if quant_config.quant_dtype is not None:
-            if quant_config.block_shape is not None:
-                _, block_k = quant_config.block_shape
-                k_tiles = (hidden_dim + block_k - 1) // block_k
-                scale_shape = (num_local_experts, self.max_num_tokens, k_tiles)
-            else:
-                if quant_config.per_act_token_quant:
-                    num = self.max_num_tokens
-                else:
-                    num = 1
-                scale_shape = (num_local_experts, num, 1)
+        b_a1_scale = None
 
-            #print(f"SCALE_SHAPE {block_shape} {b_a1.shape} {scale_shape}")
-
-            b_a1_scale = torch.zeros(scale_shape,
-                                     dtype=torch.float32,
-                                     device=a1.device)
-        else:
-            assert a1_scale is None
-            b_a1_scale = None
+        assert quant_config.quant_dtype is None, "quantization NYI"
 
         first_expert = num_local_experts * self.rank
         last_expert = first_expert + num_local_experts
 
         for expert_id in range(first_expert, last_expert):
             topks = torch.any(topk_ids == expert_id, dim=1).flatten()
             rows = torch.count_nonzero(topks.flatten())
-            rhs = a1[:topks.numel()][topks]
             idx = expert_id - first_expert
-            if quant_config.quant_dtype is not None:
-                if a1_scale is not None:
-                    assert False, "NYI"
-                    rhs_a1_scale = a1_scale[:topks.numel()][topks]
-                else:
-                    rhs_a1_scale = None
-                b_a1[idx, :rows, :], b_s = moe_kernel_quantize_input(
-                    rhs,
-                    rhs_a1_scale,
-                    quant_config.quant_dtype,
-                    quant_config.per_act_token_quant,
-                    quant_config.block_shape,
-                )
-                assert b_s is not None
-                if (quant_config.block_shape is None
-                        and not quant_config.per_act_token_quant):
-                    print(f"SCALE {idx}, {b_a1_scale[idx, :].shape} {b_s.shape}")
-                    b_a1_scale[idx, :] = b_s
-                else:
-                    #print(f"XXXXX rhs={rhs.shape} b_s={b_s.shape}")
-                    assert rows == b_s.shape[0] and b_a1_scale.shape[
-                        -1] == b_s.shape[-1]
-                    b_a1_scale[idx, :rows] = b_s
-            else:
-                b_a1[idx, :rows, :] = rhs
-
+            b_a1[idx, :rows, :] = a1[:topks.numel()][topks]
             tokens_per_expert[idx] = rows
 
         assert b_a1_scale is None or b_a1_scale.ndim == 3