Commit 6f9b1e7

wip
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent: c20591f

File tree

11 files changed: +114 -108 lines changed

tests/kernels/moe/test_batched_moe.py

Lines changed: 8 additions & 8 deletions
@@ -56,7 +56,7 @@ class BatchedMMTensors:
 
     @staticmethod
     def make_tensors(config: BatchedMMConfig):
-        if config.in_dtype == torch.torch.float8_e4m3fn:
+        if config.in_dtype == torch.float8_e4m3fn:
             config_in_dtype = torch.bfloat16
         else:
             config_in_dtype = config.in_dtype
@@ -126,13 +126,13 @@ def ref_impl(
 def make_quantized_test_activations(E, m, k, dtype, block_shape, per_act_token):
     assert not per_act_token, "NYI"
 
-    a_type = torch.bfloat16 if dtype == torch.torch.float8_e4m3fn else dtype
+    a_type = torch.bfloat16 if dtype == torch.float8_e4m3fn else dtype
 
     a = torch.randn((E, m, k), device="cuda", dtype=a_type) / 10
     a_q = a
     a_scale = None
 
-    if dtype == torch.torch.float8_e4m3fn:
+    if dtype == torch.float8_e4m3fn:
         a_q = torch.zeros_like(a, dtype=dtype)
         a_scale = [None] * E
         for e in range(E):
@@ -153,13 +153,13 @@ def make_quantized_test_activations(E, m, k, dtype, block_shape, per_act_token):
 @pytest.mark.parametrize("N", [128, 256, 512, 1024])
 @pytest.mark.parametrize(
     "dtype",
-    [torch.torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
+    [torch.float8_e4m3fn, torch.float32, torch.float16, torch.bfloat16])
 @pytest.mark.parametrize("block_shape", [None, [128, 128]])
 def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
                     N: int, dtype: torch.dtype, block_shape: list[int]):
     current_platform.seed_everything(7)
 
-    use_fp8_w8a8 = dtype == torch.torch.float8_e4m3fn
+    use_fp8_w8a8 = dtype == torch.float8_e4m3fn
 
     per_act_token_quant = False
 
@@ -328,7 +328,7 @@ def _make_test_weights(
     n: int,
     k: int,
     block_size: list[int],
-    dtype=torch.torch.float8_e4m3fn,
+    dtype=torch.float8_e4m3fn,
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Return weights w1, w2, w1q, w2q, w1_scale, w2_scale
@@ -371,7 +371,7 @@ def _make_test_weights(
 
 
 def make_test_weights(e, n, k, block_shape, dtype):
-    use_fp8_w8a8 = dtype == torch.torch.float8_e4m3fn
+    use_fp8_w8a8 = dtype == torch.float8_e4m3fn
     w_dtype = torch.bfloat16 if use_fp8_w8a8 else dtype
 
     w1_16 = torch.randn((e, 2 * n, k), device="cuda", dtype=w_dtype) / 15
@@ -456,7 +456,7 @@ def test_fused_moe_batched_experts(
 ):
     current_platform.seed_everything(7)
 
-    use_fp8_w8a8 = dtype == torch.torch.float8_e4m3fn
+    use_fp8_w8a8 = dtype == torch.float8_e4m3fn
     quant_type = torch.float8_e4m3fn if use_fp8_w8a8 else None
 
     if not use_fp8_w8a8 and per_act_token_quant and block_shape is not None:
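
Note: the old spelling torch.torch.float8_e4m3fn only resolves because torch re-exports itself as a module attribute, so torch.torch is the same module object; this commit simply drops the redundant prefix. A minimal check of that equivalence (assumes nothing beyond a stock PyTorch install):

    import torch

    # torch.torch is a self-referential alias of the torch module, so both
    # spellings name the same dtype object.
    assert torch.torch is torch
    assert torch.torch.float8_e4m3fn is torch.float8_e4m3fn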

tests/kernels/moe/test_pplx_cutlass_moe.py

Lines changed: 0 additions & 2 deletions
@@ -118,8 +118,6 @@ def pplx_cutlass_moe(
         pgi.world_size,
         rank,
         dp_size,
-        quant_dtype=torch.float8_e4m3fn,
-        per_act_token=per_act_token,
     )
 
     experts = CutlassExpertsFp8((num_experts + world_size - 1) // world_size,

vllm/model_executor/layers/fused_moe/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -5,8 +5,7 @@
 from typing import Any, Optional
 
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, FusedMoEMethodBase,
-    FusedMoeWeightScaleSupported)
+    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON
 
 _config: Optional[dict[str, Any]] = None

vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py

Lines changed: 1 addition & 1 deletion
@@ -36,8 +36,8 @@ def __init__(
         assert self.block_shape == [self.DEEPGEMM_BLOCK_SHAPE, self.DEEPGEMM_BLOCK_SHAPE]
         super().__init__(
             quant_dtype=torch.float8_e4m3fn,
-            block_shape=block_shape,
             per_act_token_quant=False,
+            block_shape=block_shape,
         )
         self.max_num_tokens = max_num_tokens
         self.world_size = world_size

vllm/model_executor/layers/fused_moe/cutlass_moe.py

Lines changed: 2 additions & 2 deletions
@@ -348,8 +348,8 @@ def cutlass_moe_fp8(
         CutlassExpertsFp8(
             max_experts_per_worker=global_num_experts,
             out_dtype=out_dtype,
-            per_act_token=per_act_token,
-            per_out_ch=per_out_ch,
+            per_act_token_quant=per_act_token,
+            per_out_ch_quant=per_out_ch,
             use_batched_format=False,
         ),
     )

vllm/model_executor/layers/fused_moe/deep_gemm_moe.py

Lines changed: 5 additions & 1 deletion
@@ -67,7 +67,11 @@ def _valid_deep_gemm(hidden_states: torch.Tensor, w1: torch.Tensor,
 class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
 
     def __init__(self):
-        super().__init__(torch.float8_e4m3fn, False, deep_gemm_block_shape())
+        super().__init__(
+            quant_dtype=torch.float8_e4m3fn,
+            per_act_token_quant=False,
+            block_shape=deep_gemm_block_shape()
+        )
 
     def supports_chunking(self) -> bool:
         return True
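
Note: DeepGemmExperts here, like BatchedDeepGemmExperts above, now passes quant_dtype, per_act_token_quant, and block_shape to the base constructor by keyword, which protects against silently swapping look-alike positional arguments. A rough sketch of a base __init__ that this calling convention pairs with (a hypothetical stand-in, not the actual FusedMoEPermuteExpertsUnpermute definition):

    from typing import Optional

    import torch

    class QuantConfigBase:
        # Hypothetical stand-in: stores the three quantization settings that
        # the DeepGEMM and Triton expert classes now pass by keyword.
        def __init__(self,
                     quant_dtype: Optional[torch.dtype] = None,
                     per_act_token_quant: bool = False,
                     block_shape: Optional[list[int]] = None):
            self.quant_dtype = quant_dtype
            self.per_act_token_quant = per_act_token_quant
            self.block_shape = block_shape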

vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py

Lines changed: 17 additions & 14 deletions
@@ -51,13 +51,6 @@ def _get_combine_config(self) -> Optional[deep_ep.Config]:
             return None
         return deep_ep.Buffer.get_combine_config(self.dp_size)
 
-    def _do_quant(self, tokens: torch.Tensor,
-                  token_scales: Optional[torch.Tensor], per_act_token: bool):
-        tokens, token_scales = moe_kernel_quantize_input(
-            tokens, token_scales, self.quant_dtype, per_act_token,
-            self.block_shape)
-        return tokens, token_scales
-
     def _do_dispatch(self, tokens: torch.Tensor,
                      token_scales: Optional[torch.Tensor],
                      rank_topk_ids: torch.Tensor,
@@ -147,19 +140,25 @@ def prepare(
         # Check if there is a block_shape / or if we can infer the quantization
         # schemes from the scales.
         per_token_quant = None
-        if all([x is None for x in [self.block_shape, a1_scale, a2_scale]
-                ]) and self.quant_dtype is not None:
+        if all([x is None for x in [block_shape, a1_scale, a2_scale]
+                ]) and quant_dtype is not None:
             # Quantization required despite none of the inputs suggesting
             # quantization. Fallback to per_token_dynamic quant.
             per_token_quant = True
         else:
-            per_token_quant = ((self.block_shape is not None) or
+            per_token_quant = ((block_shape is not None) or
                                (a1_scale is not None and a1_scale.numel() != 1)
                                or (a2_scale is not None
                                    and a2_scale.numel() != 1))
 
         if per_token_quant:
-            a1q, a1q_scale = self._do_quant(a1, a1_scale, per_act_token=True)
+            a1q, a1q_scale = moe_kernel_quantize_input(
+                a1,
+                a1_scale,
+                quant_dtype=quant_dtype,
+                per_act_token_quant=False,
+                block_shape=block_shape,
+            )
             (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids,
              expert_topk_weights) = self._do_dispatch(
                  tokens=a1q,
@@ -180,9 +179,13 @@ def prepare(
             # quantize now
             expert_x_scale = None
             if expert_x.numel() != 0:
-                expert_x, expert_x_scale = self._do_quant(expert_x,
-                                                          a1_scale,
-                                                          per_act_token=False)
+                expert_x, expert_x_scale = moe_kernel_quantize_input(
+                    expert_x,
+                    a1_scale,
+                    quant_dtype=quant_dtype,
+                    per_act_token=False,
+                    block_shape=block_shape
+                )
 
         return (expert_x, expert_x_scale, expert_num_tokens, expert_topk_ids,
                 expert_topk_weights)
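
Note: with the _do_quant helper removed, prepare() calls moe_kernel_quantize_input directly and reads the quantization scheme from its arguments instead of instance state. The branch that decides between per-token and non-per-token quantization can be summarized as a standalone predicate (hypothetical helper name, same logic as the hunk above):

    from typing import Optional

    import torch

    def infer_per_token_quant(block_shape: Optional[list[int]],
                              a1_scale: Optional[torch.Tensor],
                              a2_scale: Optional[torch.Tensor],
                              quant_dtype: Optional[torch.dtype]) -> bool:
        if block_shape is None and a1_scale is None and a2_scale is None:
            # Nothing in the inputs hints at a scheme, but quantization is
            # still requested: fall back to per-token dynamic quant.
            return quant_dtype is not None
        # Otherwise infer from the inputs: a block shape or non-scalar
        # activation scales imply per-token quantization.
        return (block_shape is not None
                or (a1_scale is not None and a1_scale.numel() != 1)
                or (a2_scale is not None and a2_scale.numel() != 1))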

vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

Lines changed: 43 additions & 29 deletions
@@ -37,20 +37,21 @@ class DeepEPLLPrepareAndFinalize(mk.FusedMoEPrepareAndFinalize):
     # specific hidden sizes.
     SUPPORTED_HIDDEN_SIZES = [2560, 4096, 5120, 7168]
 
-    def __init__(self,
-                 buffer: deep_ep.Buffer,
-                 world_size: int,
-                 dp_size: int,
-                 max_tokens_per_rank: int):
+    def __init__(
+        self,
+        buffer: deep_ep.Buffer,
+        max_tokens_per_rank: int,
+        world_size: int,
+        dp_size: int,
+        use_fp8_w8a8: bool
+    ):
         super().__init__()
 
         self.buffer = buffer
+        self.max_tokens_per_rank = max_tokens_per_rank
         self.world_size = world_size
         self.dp_size = dp_size
-        self.quant_dtype = quant_dtype
-        self.block_shape = block_shape
-        self.max_tokens_per_rank = max_tokens_per_rank
-        self.use_fp8_dispatch = use_fp8_dispatch
+        self.use_fp8_dispatch = use_fp8_w8a8
         # The dispatch function returns a handle that the combine function
         # requires. We store the handle here so it is available to the
         # combine function.
@@ -63,12 +64,17 @@ def topk_indices_dtype(self) -> Optional[torch.dtype]:
         return torch.int64
 
     def _do_quant(
-        self, x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
-        a1_scale: Optional[torch.Tensor], a2_scale: Optional[torch.Tensor],
-        a1_dtype: torch.dtype
+        self,
+        x: Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]],
+        a1_scale: Optional[torch.Tensor],
+        a2_scale: Optional[torch.Tensor],
+        a1_dtype: torch.dtype,
+        quant_dtype: Optional[torch.dtype],
+        per_act_token_quant: bool,
+        block_shape: Optional[list[int]],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
 
-        block_k = self.block_shape[1] if self.block_shape is not None else None
+        block_k = block_shape[1] if block_shape is not None else None
         if self.use_fp8_dispatch:
             if block_k == DEEPEP_QUANT_BLOCK_SIZE:
                 # DeepEP kernels did the quantization for us.
@@ -83,28 +89,33 @@ def _do_quant(
 
         # Check if there is a block_shape / or if we can infer the quantization
        # schemes from the scales.
-        per_token_quant = None
-        if all([v is None for v in [self.block_shape, a1_scale, a2_scale]
-                ]) and self.quant_dtype is not None:
-            # Quantization required despite none of the inputs suggesting
-            # quantization. Fallback to per_token_dynamic quant.
-            per_token_quant = True
-        else:
-            per_token_quant = ((self.block_shape is not None) or
-                               (a1_scale is not None and a1_scale.numel() != 1)
-                               or (a2_scale is not None
-                                   and a2_scale.numel() != 1))
+        # per_token_quant = None
+        # if all([v is None for v in [block_shape, a1_scale, a2_scale]
+        #         ]) and quant_dtype is not None:
+        #     # Quantization required despite none of the inputs suggesting
+        #     # quantization. Fallback to per_token_dynamic quant.
+        #     per_token_quant = True
+        # else:
+        #     per_token_quant = ((block_shape is not None) or
+        #                        (a1_scale is not None and a1_scale.numel() != 1)
+        #                        or (a2_scale is not None
+        #                            and a2_scale.numel() != 1))
+
+        assert per_act_token_quant == ((block_shape is not None) or
                                       (a1_scale is not None and a1_scale.numel() != 1)
                                       or (a2_scale is not None
                                           and a2_scale.numel() != 1))
 
         num_experts, max_tokens, hidden_dim = x.size()
 
         # TODO (varun): Optimization - Use a batched version of quant
         x = x.view((-1, hidden_dim))
-        x, x_scales = moe_kernel_quantize_input(x, a1_scale, self.quant_dtype,
-                                                per_token_quant,
-                                                self.block_shape)
+        x, x_scales = moe_kernel_quantize_input(x, a1_scale, quant_dtype,
+                                                per_act_token_quant,
+                                                block_shape)
         x = x.view((num_experts, -1, hidden_dim))
 
-        if per_token_quant:
+        if per_act_token_quant:
             assert x_scales is not None
             x_scales = x_scales.view(num_experts, max_tokens, -1)
 
@@ -159,7 +170,10 @@ def prepare(
                                        return_recv_hook=False)
 
         expert_x, expert_x_scale = self._do_quant(expert_x, a1_scale, a2_scale,
-                                                  a1.dtype)
+                                                  a1.dtype,
+                                                  quant_dtype,
+                                                  per_act_token_quant,
+                                                  block_shape)
 
         return (expert_x, expert_x_scale, expert_num_tokens, None, None)
 
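
Note: in _do_quant the old inference block is kept only as a comment and replaced by an assertion, so the caller-supplied per_act_token_quant must already agree with what the scales imply. A small illustration of the asserted invariant with hypothetical inputs:

    import torch

    # Hypothetical inputs: no block quantization, one scale per token.
    block_shape = None
    a1_scale = torch.ones(16, 1)  # numel() != 1 -> per-token scales
    a2_scale = None

    implied = (block_shape is not None
               or (a1_scale is not None and a1_scale.numel() != 1)
               or (a2_scale is not None and a2_scale.numel() != 1))
    # The assert in _do_quant requires the caller to pass exactly this value.
    assert implied  # so per_act_token_quant=True must be supplied here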

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 11 additions & 6 deletions
@@ -61,7 +61,6 @@ def moe_mmk(
     if use_w8a8:
         # block-wise
         if group_k > 0 and group_n > 0:
-            # + (expert_id * stride_ase) ??
             a_scale_ptrs = a_scale_ptr + (offs_m * stride_asm) #+ (expert_id * stride_ase)
             offs_bsn = offs_n // group_n
             b_scale_ptrs = (b_scale_ptr + offs_bsn * stride_bsn) + expert_id * stride_bse
@@ -376,12 +375,18 @@ def invoke_moe_batched_triton_kernel(
                 triton.cdiv(B.size(1), BLOCK_N))
 
     assert A_scale is None or A_scale.ndim == 3, f"{0 if A_scale is None else A_scale.shape}"
-    assert B_scale is None or B_scale.ndim == 3, f"{0 if B_scale is None else B_scale.shape}"
+    assert B_scale is None or B_scale.ndim == 1 or B_scale.ndim == 3, f"{0 if B_scale is None else B_scale.shape}"
+    #assert B_scale is None or B_scale.ndim == 3, f"{0 if B_scale is None else (A.shape, B_scale.shape)}"
 
     if B_scale is not None:
-        stride_bse = B_scale.stride(0)
-        stride_bsn = B_scale.stride(1)
-        stride_bsk = B_scale.stride(2)
+        if B_scale.ndim == 1:
+            stride_bse = 1
+            stride_bsn = 0
+            stride_bsk = 0
+        else:
+            stride_bse = B_scale.stride(0)
+            stride_bsn = B_scale.stride(1)
+            stride_bsk = B_scale.stride(2)
     else:
         stride_bse = 0
         stride_bsk = 0
@@ -509,7 +514,7 @@ def prepare(
                              device=a1.device)
 
         if quant_dtype is not None:
-            if self.block_shape is not None:
+            if block_shape is not None:
                 _, block_k = block_shape
                 k_tiles = (hidden_dim + block_k - 1) // block_k
                 scale_shape = (num_local_experts, self.max_num_tokens, k_tiles)
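
Note: the new 1-D branch treats B_scale as a single per-expert scale, so only the expert dimension advances while the N/K tile strides broadcast to zero. A hypothetical helper expressing the same stride selection as the hunk above:

    from typing import Optional

    import torch

    def b_scale_strides(B_scale: Optional[torch.Tensor]) -> tuple[int, int, int]:
        # Returns (stride_bse, stride_bsn, stride_bsk) for the batched kernel.
        if B_scale is None:
            return 0, 0, 0
        if B_scale.ndim == 1:
            # One scale per expert: step once per expert, broadcast over tiles.
            return 1, 0, 0
        # 3-D block-wise scales: use the tensor's own strides.
        return B_scale.stride(0), B_scale.stride(1), B_scale.stride(2)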

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 4 additions & 10 deletions
@@ -1626,9 +1626,9 @@ def __init__(
         )
 
         super().__init__(
-            quant_dtype,
-            per_act_token_quant,
-            block_shape,
+            quant_dtype=quant_dtype,
+            per_act_token_quant=per_act_token_quant,
+            block_shape=block_shape,
         )
 
         self.use_fp8_w8a8 = use_fp8_w8a8
@@ -1762,7 +1762,7 @@ def apply(
         a2q_scale: Optional[torch.Tensor] = None
 
         qintermediate_cache2, a2q_scale = moe_kernel_quantize_input(
-            intermediate_cache2, a2_scale, self.qtype,
+            intermediate_cache2, a2_scale, self.quant_dtype,
             self.per_act_token_quant, self.block_shape)
 
         invoke_fused_moe_kernel(qintermediate_cache2,
@@ -1795,12 +1795,6 @@ def modular_triton_fused_moe(
     per_act_token_quant: bool,
     block_shape: Optional[list[int]] = None,
 ) -> mk.FusedMoEModularKernel:
-    quant_dtype = get_config_quant_dtype(
-        use_fp8_w8a8=use_fp8_w8a8,
-        use_int8_w8a8=use_int8_w8a8,
-        use_int8_w8a16=use_int8_w8a16,
-        use_int4_w4a16=use_int4_w4a16,
-    )
     return mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         TritonExperts(
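
Note: modular_triton_fused_moe no longer computes quant_dtype up front; the dtype selection presumably lives inside TritonExperts, which receives the same use_* flags. A rough sketch of the flag-to-dtype mapping that get_config_quant_dtype performs (an approximation; the exact vLLM logic may differ):

    from typing import Optional

    import torch

    def config_quant_dtype(use_fp8_w8a8: bool,
                           use_int8_w8a8: bool,
                           use_int8_w8a16: bool,
                           use_int4_w4a16: bool) -> Optional[torch.dtype]:
        # Activation quantization dtype implied by the kernel flags (sketch).
        if use_fp8_w8a8:
            return torch.float8_e4m3fn
        if use_int8_w8a8:
            return torch.int8
        # w8a16 / w4a16 variants keep activations unquantized.
        return None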
