Commit c20591f

cleanup ctor args
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent addd937 commit c20591f

18 files changed: +240 −250 lines
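
The common thread across these diffs: quantization settings (quant_dtype, per_act_token_quant, block_shape) stop being constructor arguments of the prepare/finalize objects. They are instead owned by the experts classes, which forward them to the FusedMoEPermuteExpertsUnpermute base constructor, and they are passed into prepare() on each call. A minimal standalone sketch of that ownership model (illustrative only; this is not the real vLLM base class, whose definition is not part of this commit):

    from typing import Optional

    import torch

    class ExpertsQuantConfigBase:
        """Sketch: the quantization config lives on the experts object."""

        def __init__(self,
                     quant_dtype: Optional[torch.dtype] = None,
                     per_act_token_quant: bool = False,
                     block_shape: Optional[list[int]] = None):
            self.quant_dtype = quant_dtype
            self.per_act_token_quant = per_act_token_quant
            self.block_shape = block_shape

Subclasses such as BatchedDeepGemmExperts and BatchedTritonOrDeepGemmExperts now call super().__init__(quant_dtype=..., block_shape=..., per_act_token_quant=...) instead of stashing these values in ad hoc attributes, as the diffs below show.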

tests/kernels/moe/test_pplx_moe.py

Lines changed: 1 addition & 3 deletions
@@ -393,6 +393,7 @@ def pplx_moe(
     max_num_tokens = round_up(rank_chunk(a.shape[0], 0, world_size), 64)

     hidden_dim_bytes, scale_bytes = pplx_hidden_dim_scale_bytes(
+        max_num_tokens,
         hidden_dim,
         a.dtype,
         qtype,
@@ -426,9 +427,6 @@ def pplx_moe(
         world_size,
         rank,
         dp_size,
-        quant_dtype=qtype,
-        per_act_token_quant=per_act_token_quant,
-        block_shape=block_shape,
     )

     experts = BatchedTritonExperts(max_num_tokens=max_num_tokens,

tests/kernels/moe/utils.py

Lines changed: 2 additions & 5 deletions
@@ -204,14 +204,11 @@ def batched_moe(
        BatchedPrepareAndFinalize(max_num_tokens,
                                  world_size=1,
                                  dp_size=1,
-                                 rank=0,
-                                 quant_dtype=qtype,
-                                 block_shape=block_shape,
-                                 per_act_token_quant=per_act_token),
+                                 rank=0),
        BatchedTritonExperts(max_num_tokens=max_num_tokens,
                             dp_size=1,
                             world_size=1,
-                            use_fp8_w8a8=qtype == torch.float8_e4m3fn,
+                            use_fp8_w8a8=qtype==torch.float8_e4m3fn,
                             per_act_token_quant=per_act_token,
                             block_shape=block_shape)
    )
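
For context, a sketch of how the test helper assembles these two pieces after the change; the mk.FusedMoEModularKernel wrapper appears in the other files of this commit, but the import locations are assumptions, not shown in this diff:

    import torch

    import vllm.model_executor.layers.fused_moe.modular_kernel as mk
    from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
        BatchedPrepareAndFinalize, BatchedTritonExperts)  # paths assumed

    def make_batched_kernel(max_num_tokens, qtype, per_act_token, block_shape):
        # Prepare/finalize now carries only topology information.
        prepare_finalize = BatchedPrepareAndFinalize(max_num_tokens,
                                                     world_size=1,
                                                     dp_size=1,
                                                     rank=0)
        # The quantization flags stay on the experts object.
        experts = BatchedTritonExperts(max_num_tokens=max_num_tokens,
                                       dp_size=1,
                                       world_size=1,
                                       use_fp8_w8a8=qtype == torch.float8_e4m3fn,
                                       per_act_token_quant=per_act_token,
                                       block_shape=block_shape)
        return mk.FusedMoEModularKernel(prepare_finalize, experts)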

vllm/model_executor/layers/fused_moe/__init__.py

Lines changed: 1 addition & 2 deletions
@@ -5,7 +5,7 @@
 from typing import Any, Optional

 from vllm.model_executor.layers.fused_moe.layer import (
-    MOE_DP_CHUNK_SIZE, FusedMoE, FusedMoEMethodBase,
+    FusedMoE, FusedMoEMethodBase,
     FusedMoeWeightScaleSupported)
 from vllm.triton_utils import HAS_TRITON

@@ -31,7 +31,6 @@ def get_config() -> Optional[dict[str, Any]]:
     "FusedMoeWeightScaleSupported",
     "override_config",
     "get_config",
-    "MOE_DP_CHUNK_SIZE",
 ]

 if HAS_TRITON:

vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py

Lines changed: 14 additions & 7 deletions
@@ -19,22 +19,29 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
     # The Deep Gemm kernels only support block size of 128
     DEEPGEMM_BLOCK_SHAPE = 128

-    def __init__(self, max_num_tokens: int, world_size: int, dp_size: int,
-                 block_shape: list[int]):
+    def __init__(
+        self,
+        max_num_tokens: int,
+        world_size: int,
+        dp_size: int,
+        block_shape: list[int]
+    ):
         """
         max_num_tokens: Maximum number of tokens from a DP Rank
         world_size: Number of EP ranks
         dp_size: Number of data-parallel ranks
         block_shape: Block quantization block shape
         """
-        super().__init__()
+
+        assert block_shape == [self.DEEPGEMM_BLOCK_SHAPE, self.DEEPGEMM_BLOCK_SHAPE]
+        super().__init__(
+            quant_dtype=torch.float8_e4m3fn,
+            block_shape=block_shape,
+            per_act_token_quant=False,
+        )
         self.max_num_tokens = max_num_tokens
         self.world_size = world_size
         self.dp_size = dp_size
-        self.block_shape = block_shape
-
-        assert (len(self.block_shape) == 2 and all(
-            [v == self.DEEPGEMM_BLOCK_SHAPE for v in self.block_shape]))

     def supports_chunking(self) -> bool:
         return False
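
With the quantization config pushed into the base constructor, building the DeepGemm batched experts looks roughly like the sketch below; the [128, 128] block shape is mandatory (enforced by the assert), and quant_dtype/per_act_token_quant are fixed by the class rather than stored ad hoc:

    from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
        BatchedDeepGemmExperts)

    BLOCK = BatchedDeepGemmExperts.DEEPGEMM_BLOCK_SHAPE  # 128

    experts = BatchedDeepGemmExperts(
        max_num_tokens=256,   # example values, not from this commit
        world_size=8,
        dp_size=1,
        block_shape=[BLOCK, BLOCK],
    )
    # experts.quant_dtype is torch.float8_e4m3fn and
    # experts.per_act_token_quant is False, both set via super().__init__().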

vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py

Lines changed: 49 additions & 39 deletions
@@ -12,57 +12,68 @@

 class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):

-    def __init__(self,
-                 max_num_tokens: int,
-                 world_size: int,
-                 dp_size: int,
-                 use_fp8_w8a8: bool = False,
-                 use_int8_w8a8: bool = False,
-                 use_int8_w8a16: bool = False,
-                 use_int4_w4a16: bool = False,
-                 per_channel_quant: bool = False,
-                 block_shape: Optional[list[int]] = None,
-                 allow_deep_gemm: bool = False):
-        super().__init__()
+    def __init__(
+        self,
+        max_num_tokens: int,
+        world_size: int,
+        dp_size: int,
+        use_fp8_w8a8: bool = False,
+        use_int8_w8a8: bool = False,
+        use_int8_w8a16: bool = False,
+        use_int4_w4a16: bool = False,
+        block_shape: Optional[list[int]] = None,
+        per_act_token_quant: bool = False,
+        allow_deep_gemm: bool = False
+    ):
+        from vllm.model_executor.layers.fused_moe.fused_moe import (
+            get_config_quant_dtype)
+
         assert not use_int8_w8a8, "NYI"
         assert not use_int8_w8a16, "NYI"
         assert not use_int4_w4a16, "NYI"

+        quant_dtype = get_config_quant_dtype(
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a8=use_int8_w8a8,
+            use_int8_w8a16=use_int8_w8a16,
+            use_int4_w4a16=use_int4_w4a16,
+        )
+        super().__init__(
+            quant_dtype=quant_dtype,
+            block_shape=block_shape,
+            per_act_token_quant=per_act_token_quant,
+        )
         self.max_num_tokens = max_num_tokens
         self.world_size = world_size
         self.dp_size = dp_size
-        self.use_fp8_w8a8 = use_fp8_w8a8
-        self.use_int8_w8a8 = use_int8_w8a8
-        self.use_int8_w8a16 = use_int8_w8a16
-        self.use_int4_w4a16 = use_int4_w4a16
-        self.per_channel_quant = per_channel_quant
-        self.block_shape = block_shape
-        self.allow_deep_gemm = allow_deep_gemm
-
-        # BatchedTritonKernel doesn't support block quantization
-        # at the moment.
+
         self.batched_triton_experts = BatchedTritonExperts(
             max_num_tokens=self.max_num_tokens,
-            use_fp8_w8a8=self.use_fp8_w8a8,
-            use_int8_w8a8=self.use_int8_w8a8,
-            use_int8_w8a16=self.use_int8_w8a16,
-            use_int4_w4a16=self.use_int4_w4a16,
-            per_channel_quant=self.per_channel_quant,
-            block_shape=self.block_shape,
             world_size=self.world_size,
-            dp_size=self.dp_size) if self.block_shape is None else None
+            dp_size=self.dp_size,
+            use_fp8_w8a8=use_fp8_w8a8,
+            use_int8_w8a8=use_int8_w8a8,
+            use_int8_w8a16=use_int8_w8a16,
+            use_int4_w4a16=use_int4_w4a16,
+            per_act_token_quant=self.per_act_token_quant,
+            block_shape=self.block_shape,
+        )
+
+        dg_block_shape = [BatchedDeepGemmExperts.DEEPGEMM_BLOCK_SHAPE,
+                          BatchedDeepGemmExperts.DEEPGEMM_BLOCK_SHAPE]
+
+        self.allow_deep_gemm = (allow_deep_gemm and use_fp8_w8a8
+                                and self.block_shape == dg_block_shape)

-        is_fp8_128_block_quantized = (self.use_fp8_w8a8
-                                      and self.block_shape is not None
-                                      and len(self.block_shape) == 2 and all(
-                                          [b == 128
-                                           for b in self.block_shape]))
         self.batched_deep_gemm_experts = BatchedDeepGemmExperts(
             max_num_tokens=self.max_num_tokens,
             world_size=self.world_size,
             dp_size=self.dp_size,
-            block_shape=self.block_shape,  # type: ignore[arg-type]
-        ) if (self.allow_deep_gemm and is_fp8_128_block_quantized) else None
+            block_shape=self.block_shape,
+        ) if self.allow_deep_gemm else None
+
+        assert (self.batched_triton_experts is not None or
+                (self.allow_deep_gemm and self.batched_deep_gemm_experts is not None))

         assert (self.batched_deep_gemm_experts is not None
                 or self.batched_triton_experts is not None)
@@ -86,11 +97,10 @@ def workspace_shapes(
         # Note: the deep gemm workspaces are strictly larger than the triton
         # workspaces so we can be pessimistic here and allocate for DeepGemm
         # even if we fall back to triton later, e.g. if expert maps are set.
-        if self.allow_deep_gemm and self.batched_deep_gemm_experts is not None:
+        if self.allow_deep_gemm:
             return self.batched_deep_gemm_experts.workspace_shapes(
                 a, aq, M, N, K, topk, num_experts)
         else:
-            assert self.batched_triton_experts is not None
             return self.batched_triton_experts.workspace_shapes(
                 a, aq, M, N, K, topk, num_experts)

@@ -118,7 +128,7 @@ def apply(
                                         and self.batched_deep_gemm_experts
                                         is not None)
         experts = (self.batched_deep_gemm_experts
-                   if use_batched_deep_gemm_experts else
+                   if self.allow_deep_gemm else
                    self.batched_triton_experts)
         assert experts is not None
         experts.apply(output, hidden_states, w1, w2, topk_ids, activation,
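
The DeepGemm-vs-Triton decision is now settled once in the constructor, so workspace_shapes() and apply() only test self.allow_deep_gemm. A condensed restatement of the new gating condition (not a copy of the class):

    DEEPGEMM_BLOCK_SHAPE = 128

    def resolve_allow_deep_gemm(allow_deep_gemm: bool, use_fp8_w8a8: bool,
                                block_shape) -> bool:
        dg_block_shape = [DEEPGEMM_BLOCK_SHAPE, DEEPGEMM_BLOCK_SHAPE]
        return (allow_deep_gemm and use_fp8_w8a8
                and block_shape == dg_block_shape)

    assert resolve_allow_deep_gemm(True, True, [128, 128])
    assert not resolve_allow_deep_gemm(True, True, None)         # no block quant
    assert not resolve_allow_deep_gemm(True, False, [128, 128])  # not fp8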

vllm/model_executor/layers/fused_moe/cutlass_moe.py

Lines changed: 4 additions & 8 deletions
@@ -202,8 +202,7 @@ def run_cutlass_moe_fp8(


 # TODO (bnell): split class batched vs. non-batched?
-class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute):
-
+# maybe remove need for passing aq to workspace_shapes
 class CutlassExpertsFp8(mk.FusedMoEPermuteExpertsUnpermute):

     def __init__(
@@ -212,12 +211,13 @@ def __init__(
         out_dtype: Optional[torch.dtype],
         per_act_token_quant: bool,
         per_out_ch_quant: bool,
+        block_shape: Optional[list[int]] = None,
         use_batched_format: bool = False,
     ):
         super().__init__(
             quant_dtype=torch.float8_e4m3fn,
             per_act_token_quant=per_act_token_quant,
-            block_shape=None,
+            block_shape=block_shape,
         )
         self.max_experts_per_worker = max_experts_per_worker
         self.out_dtype = out_dtype
@@ -344,11 +344,7 @@ def cutlass_moe_fp8(
     out_dtype = a.dtype

     fn = mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(
-            quant_dtype=torch.float8_e4m3fn,
-            per_act_token_quant=per_act_token,
-            block_shape=None,
-        ),
+        MoEPrepareAndFinalizeNoEP(),
         CutlassExpertsFp8(
             max_experts_per_worker=global_num_experts,
             out_dtype=out_dtype,
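
The same simplification applies at the call site: MoEPrepareAndFinalizeNoEP is now constructed with no arguments, and the fp8 details (including the new optional block_shape) live entirely on CutlassExpertsFp8. A hedged sketch of the resulting assembly; import paths are assumptions, not shown in this diff:

    import torch

    import vllm.model_executor.layers.fused_moe.modular_kernel as mk
    from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
    from vllm.model_executor.layers.fused_moe.prepare_finalize import (
        MoEPrepareAndFinalizeNoEP)  # path assumed

    def make_cutlass_fp8_kernel(num_experts: int) -> mk.FusedMoEModularKernel:
        return mk.FusedMoEModularKernel(
            MoEPrepareAndFinalizeNoEP(),      # no quant args here any more
            CutlassExpertsFp8(
                max_experts_per_worker=num_experts,
                out_dtype=torch.bfloat16,
                per_act_token_quant=False,
                per_out_ch_quant=False,
                block_shape=None,             # new optional ctor argument
            ),
        )

deep_gemm_moe_fp8 in the next file gets the identical treatment: its MoEPrepareAndFinalizeNoEP(...) call collapses to a bare MoEPrepareAndFinalizeNoEP().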

vllm/model_executor/layers/fused_moe/deep_gemm_moe.py

Lines changed: 1 addition & 3 deletions
@@ -217,9 +217,7 @@ def deep_gemm_moe_fp8(
     - torch.Tensor: The bfloat16 output tensor after applying the MoE layer.
     """
     fn = mk.FusedMoEModularKernel(
-        MoEPrepareAndFinalizeNoEP(quant_dtype=torch.float8_e4m3fn,
-                                  per_act_token_quant=False,
-                                  block_shape=deep_gemm_block_shape()),
+        MoEPrepareAndFinalizeNoEP(),
         DeepGemmExperts(),
     )
     return fn(

vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py

Lines changed: 4 additions & 5 deletions
@@ -20,17 +20,13 @@ def __init__(self,
                  world_size: int,
                  rank: int,
                  dp_size: int,
-                 rank_expert_offset: int,
-                 quant_dtype: Optional[torch.dtype] = None,
-                 block_shape: Optional[list[int]] = None):
+                 rank_expert_offset: int):
         super().__init__()
         self.buffer = buffer
         self.world_size = world_size
         self.rank = rank
         self.dp_size = dp_size
         self.rank_expert_offset = rank_expert_offset
-        self.quant_dtype = quant_dtype
-        self.block_shape = block_shape
         # The dispatch function returns a handle that the combine function
         # requires. We store the handle here so it is available to the
         # combine function.
@@ -135,6 +131,9 @@ def prepare(
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
+        quant_dtype: Optional[torch.dtype],
+        per_act_token_quant: bool,
+        block_shape: Optional[list[int]],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
                Optional[torch.Tensor], Optional[torch.Tensor]]:

vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

Lines changed: 4 additions & 4 deletions
@@ -41,10 +41,7 @@ def __init__(self,
                  buffer: deep_ep.Buffer,
                  world_size: int,
                  dp_size: int,
-                 max_tokens_per_rank: int,
-                 quant_dtype: Optional[torch.dtype] = None,
-                 block_shape: Optional[list[int]] = None,
-                 use_fp8_dispatch: bool = False):
+                 max_tokens_per_rank: int):
         super().__init__()

         self.buffer = buffer
@@ -123,6 +120,9 @@ def prepare(
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
+        quant_dtype: Optional[torch.dtype],
+        per_act_token_quant: bool,
+        block_shape: Optional[list[int]],
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
                Optional[torch.Tensor], Optional[torch.Tensor]]:

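Both DeepEP prepare/finalize classes now receive the quantization parameters on every prepare() call instead of freezing them at construction time. The sketch below restates that interface shape in isolation; it is not the real vLLM prepare/finalize base class, and the leading prepare() parameters (hidden states, router output, and so on) are elided because they are not visible in these hunks:

    from typing import Optional

    import torch

    class TopologyOnlyPrepareFinalize:
        """Sketch: ctor keeps topology only; quant config arrives per call."""

        def __init__(self, world_size: int, rank: int, dp_size: int):
            self.world_size = world_size
            self.rank = rank
            self.dp_size = dp_size

        def prepare(self, *leading_args,
                    quant_dtype: Optional[torch.dtype] = None,
                    per_act_token_quant: bool = False,
                    block_shape: Optional[list[int]] = None):
            # Quantization settings are threaded in per call, so the dispatch
            # path can quantize consistently with the experts' own config.
            raise NotImplementedError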