Commit e59258a

disable buggy fp8 tests

Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 58a5c18

File tree

4 files changed: +11 additions, -82 deletions


tests/kernels/moe/test_batched_moe.py

Lines changed: 2 additions & 6 deletions

@@ -67,8 +67,6 @@ def make_tensors(config: BatchedMMConfig):
                                    device="cuda",
                                    dtype=torch.int32)
 
-
-
     return BatchedMMTensors(A, B, C, num_expert_tokens)
 
 
@@ -111,9 +109,7 @@ def ref_impl(
                          [32, 64, 128, 192, 224, 256, 512])
 @pytest.mark.parametrize("K", [128, 256, 1024])
 @pytest.mark.parametrize("N", [128, 256, 512, 1024])
-@pytest.mark.parametrize(
-    "dtype",
-    [torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.float32, torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("block_shape", [None])
 @pytest.mark.parametrize("per_act_token_quant", [False])
 def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
@@ -223,7 +219,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
 @pytest.mark.parametrize("k", [128, 512, 1024, 2048])
 @pytest.mark.parametrize("e", NUM_EXPERTS)
 @pytest.mark.parametrize("topk", TOP_KS)
-@pytest.mark.parametrize("dtype", [torch.float8_e4m3fn, torch.bfloat16])
+@pytest.mark.parametrize("dtype", [torch.bfloat16])
 @pytest.mark.parametrize("per_act_token_quant", [False])
 @pytest.mark.parametrize("block_shape", [None])
 def test_fused_moe_batched_experts(
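
Both test parametrizations simply drop torch.float8_e4m3fn, matching the commit message "disable buggy fp8 tests". A minimal sketch of an alternative (not what this commit does) that keeps the fp8 case visible in the test report but marked as skipped, using pytest's standard pytest.param/skip machinery:

import pytest
import torch

# Hypothetical parametrization: fp8 stays listed but is skipped with a reason,
# so it shows up as "skipped" instead of silently disappearing from the matrix.
DTYPES = [
    pytest.param(torch.float8_e4m3fn,
                 marks=pytest.mark.skip(reason="batched fp8 path is buggy")),
    torch.float32,
    torch.float16,
    torch.bfloat16,
]


@pytest.mark.parametrize("dtype", DTYPES)
def test_batched_mm_dtypes(dtype: torch.dtype):
    # Placeholder body; the real test builds batched MM inputs for this dtype.
    assert dtype in (torch.float8_e4m3fn, torch.float32, torch.float16,
                     torch.bfloat16)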

tests/kernels/utils.py

Lines changed: 2 additions & 1 deletion

@@ -1063,7 +1063,8 @@ def torch_experts(a: torch.Tensor,
                   expert_map: Optional[torch.Tensor] = None) -> torch.Tensor:
     assert (global_num_experts == -1
             or (global_num_experts == w1.shape[0] and expert_map is None)
-            or global_num_experts == expert_map.shape[0])
+            or (expert_map is not None
+                and global_num_experts == expert_map.shape[0]))
     topk = topk_ids.shape[1]
     B, D = a.shape
     a = a.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
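
The added `expert_map is not None` guard matters because the third disjunct dereferences `expert_map.shape`; without it, calling with `expert_map=None` and a mismatched `global_num_experts` raises AttributeError instead of a clean assertion failure. A small standalone sketch (hypothetical helper and tensors, not the test utility itself) of the corrected check:

from typing import Optional

import torch


def check_expert_counts(global_num_experts: int, w1: torch.Tensor,
                        expert_map: Optional[torch.Tensor]) -> None:
    # Short-circuit evaluation: expert_map.shape is only touched once
    # expert_map is known to be a tensor.
    assert (global_num_experts == -1
            or (global_num_experts == w1.shape[0] and expert_map is None)
            or (expert_map is not None
                and global_num_experts == expert_map.shape[0]))


w1 = torch.empty(8, 16, 32)                     # 8 local experts
check_expert_counts(-1, w1, None)               # ok: "use all experts"
check_expert_counts(8, w1, None)                # ok: matches w1.shape[0]
check_expert_counts(16, w1, torch.arange(16))   # ok: matches expert_map length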

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 5 additions & 48 deletions

@@ -318,8 +318,8 @@ def invoke_moe_batched_triton_kernel(
         expert_num_tokens: torch.Tensor,  # [E]
         compute_type: tl.dtype,
         # Quantization data
-        A_scale: torch.Tensor,  # Optional
-        B_scale: torch.Tensor,  # Optional
+        A_scale: Optional[torch.Tensor],
+        B_scale: Optional[torch.Tensor],
         B_zp: torch.Tensor,
         # Quantization schemes
         use_fp8_w8a8: bool,
@@ -453,61 +453,18 @@ def prepare(
                            dtype=b_type,
                            device=a1.device)
 
-        if quant_config.quant_dtype is not None:
-            if quant_config.block_shape is not None:
-                _, block_k = quant_config.block_shape
-                k_tiles = (hidden_dim + block_k - 1) // block_k
-                scale_shape = (num_local_experts, self.max_num_tokens, k_tiles)
-            else:
-                if quant_config.per_act_token_quant:
-                    num = self.max_num_tokens
-                else:
-                    num = 1
-                scale_shape = (num_local_experts, num, 1)
+        b_a1_scale = None
 
-            #print(f"SCALE_SHAPE {block_shape} {b_a1.shape} {scale_shape}")
-
-            b_a1_scale = torch.zeros(scale_shape,
-                                     dtype=torch.float32,
-                                     device=a1.device)
-        else:
-            assert a1_scale is None
-            b_a1_scale = None
+        assert quant_config.quant_dtype is None, "quantization NYI"
 
         first_expert = num_local_experts * self.rank
         last_expert = first_expert + num_local_experts
 
         for expert_id in range(first_expert, last_expert):
             topks = torch.any(topk_ids == expert_id, dim=1).flatten()
             rows = torch.count_nonzero(topks.flatten())
-            rhs = a1[:topks.numel()][topks]
             idx = expert_id - first_expert
-            if quant_config.quant_dtype is not None:
-                if a1_scale is not None:
-                    assert False, "NYI"
-                    rhs_a1_scale = a1_scale[:topks.numel()][topks]
-                else:
-                    rhs_a1_scale = None
-                b_a1[idx, :rows, :], b_s = moe_kernel_quantize_input(
-                    rhs,
-                    rhs_a1_scale,
-                    quant_config.quant_dtype,
-                    quant_config.per_act_token_quant,
-                    quant_config.block_shape,
-                )
-                assert b_s is not None
-                if (quant_config.block_shape is None
-                        and not quant_config.per_act_token_quant):
-                    print(f"SCALE {idx}, {b_a1_scale[idx, :].shape} {b_s.shape}")
-                    b_a1_scale[idx, :] = b_s
-                else:
-                    #print(f"XXXXX rhs={rhs.shape} b_s={b_s.shape}")
-                    assert rows == b_s.shape[0] and b_a1_scale.shape[
-                        -1] == b_s.shape[-1]
-                    b_a1_scale[idx, :rows] = b_s
-            else:
-                b_a1[idx, :rows, :] = rhs
-
+            b_a1[idx, :rows, :] = a1[:topks.numel()][topks]
             tokens_per_expert[idx] = rows
 
         assert b_a1_scale is None or b_a1_scale.ndim == 3
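
With the quantization path removed, prepare() is left with a plain per-expert scatter of activations into the batched buffer. A self-contained sketch of that scatter with dummy shapes and names (the real method also handles ranks, expert maps, and activation scales):

import torch

num_tokens, hidden_dim = 16, 32
num_local_experts, max_tokens_per_expert, topk = 4, num_tokens, 2

a1 = torch.randn(num_tokens, hidden_dim)
topk_ids = torch.randint(0, num_local_experts, (num_tokens, topk))

b_a1 = torch.zeros(num_local_experts, max_tokens_per_expert, hidden_dim)
tokens_per_expert = torch.zeros(num_local_experts, dtype=torch.int32)

for expert_id in range(num_local_experts):
    # Tokens that routed to this expert in any of their top-k slots.
    topks = torch.any(topk_ids == expert_id, dim=1)
    rows = int(torch.count_nonzero(topks))
    # Pack those tokens at the front of this expert's slice of the batch.
    b_a1[expert_id, :rows, :] = a1[topks]
    tokens_per_expert[expert_id] = rows

print(tokens_per_expert)  # how many rows of each expert slice hold valid tokens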

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 2 additions & 27 deletions

@@ -843,14 +843,14 @@ def try_get_optimal_moe_config_list(
     config = get_default_config(M, E, N, w1_shape[2], top_k, dtype,
                                 is_marlin, block_shape)
 
-    return [
+    return (
         config['BLOCK_SIZE_M'],
         config['BLOCK_SIZE_N'],
         config['BLOCK_SIZE_K'],
         config['GROUP_SIZE_M'],
         config.get('num_warps', 4),
         config.get('num_stages', 3 if not current_platform.is_rocm() else 2),
-    ]
+    )
 
 
 direct_register_custom_op(
@@ -1213,31 +1213,6 @@ def fused_experts(hidden_states: torch.Tensor,
             a2_scale=a2_scale,
             apply_router_weight_on_input=apply_router_weight_on_input,
         )
-    elif True:
-        fn = modular_triton_fused_moe(use_fp8_w8a8=use_fp8_w8a8,
-                                      use_int8_w8a8=use_int8_w8a8,
-                                      use_int8_w8a16=use_int8_w8a16,
-                                      use_int4_w4a16=use_int4_w4a16,
-                                      per_channel_quant=per_channel_quant,
-                                      block_shape=block_shape)
-
-        return fn(
-            hidden_states=hidden_states,
-            w1=w1,
-            w2=w2,
-            topk_weights=topk_weights,
-            topk_ids=topk_ids,
-            activation=activation,
-            apply_router_weight_on_input=apply_router_weight_on_input,
-            global_num_experts=global_num_experts,
-            expert_map=expert_map,
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            w1_zp=w1_zp,
-            w2_zp=w2_zp,
-            a1_scale=a1_scale,
-            a2_scale=a2_scale,
-        )
     else:
         return dispatch_fused_experts_func(inplace)(
             hidden_states=hidden_states,
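
Two cleanups ride along in this file: the kernel config is now returned as a tuple rather than a list, and a debugging `elif True:` arm is deleted. The second change matters for control flow, since an always-true `elif` makes the trailing `else` unreachable. A minimal sketch of that pitfall (hypothetical function, not the vLLM code):

def pick_kernel(use_batched: bool) -> str:
    if use_batched:
        return "batched"
    elif True:
        # This arm matches whenever the first one does not...
        return "modular_triton"
    else:
        # ...so the intended default dispatch path below can never run.
        return "dispatch_default"


assert pick_kernel(False) == "modular_triton"  # never "dispatch_default"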
