Commit d1b9b99

MoE refactoring
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 3597b06 commit d1b9b99

15 files changed: +208 -76 lines changed

vllm/model_executor/layers/fused_moe/__init__.py

Lines changed: 26 additions & 2 deletions
@@ -5,7 +5,11 @@
 from typing import Any, Optional
 
 from vllm.model_executor.layers.fused_moe.layer import (
-    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported)
+    FusedMoE, FusedMoEMethodBase, FusedMoeWeightScaleSupported, MoEConfig)
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEPrepareAndFinalize,
+    FusedMoEPermuteExpertsUnpermute,
+    FusedMoEActivationFormat)
 from vllm.triton_utils import HAS_TRITON
 
 _config: Optional[dict[str, Any]] = None
@@ -28,6 +32,10 @@ def get_config() -> Optional[dict[str, Any]]:
     "FusedMoE",
     "FusedMoEMethodBase",
     "FusedMoeWeightScaleSupported",
+    "FusedMoEPermuteExpertsUnpermute",
+    "FusedMoEActivationFormat",
+    "FusedMoEPrepareAndFinalize",
+    "MoEConfig",
     "override_config",
     "get_config",
 ]
@@ -37,10 +45,20 @@ def get_config() -> Optional[dict[str, Any]]:
     import vllm.model_executor.layers.fused_moe.fused_marlin_moe  # noqa
     import vllm.model_executor.layers.fused_moe.fused_moe  # noqa
     from vllm.model_executor.layers.fused_moe.cutlass_moe import (
-        cutlass_moe_fp4, cutlass_moe_fp8)
+        cutlass_moe_fp4, cutlass_moe_fp8, CutlassExpertsFp8)
     from vllm.model_executor.layers.fused_moe.fused_moe import (
         TritonExperts, fused_experts, fused_moe, fused_topk,
         get_config_file_name, grouped_topk)
+    from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+        BatchedTritonExperts)
+    from vllm.model_executor.layers.fused_moe.deep_gemm_moe import (
+        DeepGemmExperts)
+    from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+        BatchedDeepGemmExperts)
+    from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+        TritonOrDeepGemmExperts)
+    from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (
+        BatchedTritonOrDeepGemmExperts)
 
     __all__ += [
         "fused_moe",
@@ -50,5 +68,11 @@ def get_config() -> Optional[dict[str, Any]]:
         "grouped_topk",
         "cutlass_moe_fp8",
         "cutlass_moe_fp4",
+        "CutlassExpertsFp8",
         "TritonExperts",
+        "BatchedTritonExperts",
+        "DeepGemmExperts",
+        "BatchedDeepGemmExperts",
+        "TritonOrDeepGemmExperts",
+        "BatchedTritonOrDeepGemmExperts",
     ]
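
With the widened export list above, the new expert implementations and the MoEConfig / activation-format types are reachable from the package root. A minimal sketch, assuming a CUDA build of vLLM where HAS_TRITON is true so the optional exports are populated:

    # Sketch only; assumes HAS_TRITON so the Triton-backed exports exist.
    from vllm.model_executor.layers.fused_moe import (
        FusedMoEActivationFormat, TritonExperts)

    experts = TritonExperts(use_fp8_w8a8=False,
                            use_int8_w8a8=False,
                            use_int8_w8a16=False,
                            use_int4_w4a16=False,
                            block_shape=None,
                            per_channel_quant=False)

    # Per the property added to fused_moe.py below, unquantized Triton experts
    # consume and produce the Standard activation format.
    assert experts.activation_formats == (FusedMoEActivationFormat.Standard,
                                          FusedMoEActivationFormat.Standard)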

vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py

Lines changed: 9 additions & 0 deletions
@@ -67,6 +67,15 @@ def __init__(self,
         assert (self.batched_deep_gemm_experts is not None
                 or self.batched_triton_experts is not None)
 
+    @property
+    def activation_formats(self) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        if self.batched_triton_experts is not None:
+            assert self.batched_deep_gemm_experts is None or self.batched_deep_gemm_experts.activation_formats == self.batched_triton_experts.activation_formats
+            return self.batched_triton_experts.activation_formats
+        else:
+            assert self.batched_deep_gemm_experts is not None
+            return self.batched_deep_gemm_experts.activation_formats
+
     def supports_chunking(self) -> bool:
         bdge = self.batched_deep_gemm_experts
         bte = self.batched_triton_experts
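
The property above delegates to whichever backend is configured and asserts that both agree when both are present. A hedged sketch of how a caller could use these properties to pair a prepare/finalize object with an experts implementation; the helper name below is invented for illustration and is not part of this commit:

    # Illustrative helper, not vLLM code: checks that the layout produced by a
    # prepare/finalize object matches the layout the experts expect to receive.
    def check_prepare_experts_compat(prepare_finalize, experts) -> None:
        expected_input_format, _output_format = experts.activation_formats
        assert prepare_finalize.activation_format == expected_input_format, (
            "prepare/finalize output layout does not match the experts' input")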

vllm/model_executor/layers/fused_moe/cutlass_moe.py

Lines changed: 5 additions & 0 deletions
@@ -219,6 +219,11 @@ def __init__(
         self.per_out_ch = per_out_ch
         self.use_batched_format = use_batched_format
 
+    @property
+    def activation_formats(self) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        return (mk.FusedMoEActivationFormat.Standard,
+                mk.FusedMoEActivationFormat.Standard)
+
     def supports_chunking(self) -> bool:
         return not self.use_batched_format
 

vllm/model_executor/layers/fused_moe/deep_gemm_moe.py

Lines changed: 5 additions & 0 deletions
@@ -70,6 +70,11 @@ def __init__(self):
         super().__init__()
         self.block_shape = deep_gemm_block_shape()
 
+    @property
+    def activation_formats(self) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        return (mk.FusedMoEActivationFormat.Standard,
+                mk.FusedMoEActivationFormat.Standard)
+
     def supports_chunking(self) -> bool:
         return True
 
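
Every expert implementation touched by this commit reports its formats with the same two enum members. The enum itself lives in modular_kernel.py, which is not part of this diff; a rough approximation of its shape, sufficient for reading the properties above (the member names come from this diff, the values and comments are assumptions):

    # Approximation only; the real definition is in
    # vllm/model_executor/layers/fused_moe/modular_kernel.py.
    import enum

    class FusedMoEActivationFormat(enum.Enum):
        # Activations kept as one contiguous [num_tokens, hidden] tensor.
        Standard = "standard"
        # Activations grouped per expert, as used by the batched kernels.
        BatchedExperts = "batched_experts"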

vllm/model_executor/layers/fused_moe/deepep_ht_prepare_finalize.py

Lines changed: 12 additions & 8 deletions
@@ -39,6 +39,10 @@ def __init__(self,
         # From https://github.com/deepseek-ai/DeepEP/blob/9fe9021f29c9083cd1808ab36b740208524d9f63/deep_ep/buffer.py#L164
         self.available_rank_configs = [2, 4, 8, 16, 24, 32, 64, 128, 144, 160]
 
+    @property
+    def activation_format(self) -> mk.FusedMoEActivationFormat:
+        return mk.FusedMoEActivationFormat.Standard
+
     def max_num_tokens_per_rank(self) -> Optional[int]:
         return None
 
@@ -130,20 +134,20 @@ def prepare(
         a1: torch.Tensor,
         a1_scale: Optional[torch.Tensor],
         a2_scale: Optional[torch.Tensor],
-        rank_topk_weights: torch.Tensor,
-        rank_topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
     ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor],
               Optional[torch.Tensor], Optional[torch.Tensor]]:
 
         if apply_router_weight_on_input:
-            topk = rank_topk_ids.size(1)
+            topk = topk_ids.size(1)
             # TODO: this only works for topK=1, will need to update for topK>1
             assert topk == 1, (
                 "apply_router_weight_on_input is only implemented for topk=1")
-            a1 = a1 * rank_topk_weights.to(a1.dtype)
+            a1 = a1 * topk_weights.to(a1.dtype)
 
         # Check if there is a block_shape / or if we can infer the quantization
         # schemes from the scales.
@@ -165,8 +169,8 @@ def prepare(
              expert_topk_weights) = self._do_dispatch(
                  tokens=a1q,
                  token_scales=a1q_scale,
-                 rank_topk_ids=rank_topk_ids,
-                 rank_topk_weights=rank_topk_weights,
+                 rank_topk_ids=topk_ids,
+                 rank_topk_weights=topk_weights,
                  num_experts=num_experts)
         else:
             # DeepEP kernels only support dispatching per-token-quant
@@ -175,8 +179,8 @@ def prepare(
              expert_topk_weights) = self._do_dispatch(
                  tokens=a1,
                  token_scales=None,
-                 rank_topk_ids=rank_topk_ids,
-                 rank_topk_weights=rank_topk_weights,
+                 rank_topk_ids=topk_ids,
+                 rank_topk_weights=topk_weights,
                  num_experts=num_experts)
         # quantize now
         expert_x_scale = None
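
The rename from rank_topk_weights/rank_topk_ids to topk_weights/topk_ids does not change the apply_router_weight_on_input path: when the router weight is folded into the activations, only topk == 1 is supported and the scaling happens once before dispatch. A small self-contained illustration of that pre-scaling step, with toy shapes:

    # Illustration of the pre-scaling branch above; tensor sizes are made up.
    import torch

    a1 = torch.randn(4, 8)                 # [num_tokens, hidden_size]
    topk_weights = torch.rand(4, 1)        # [num_tokens, topk] with topk == 1
    topk_ids = torch.zeros(4, 1, dtype=torch.int64)

    topk = topk_ids.size(1)
    assert topk == 1, "apply_router_weight_on_input only supports topk=1"
    a1 = a1 * topk_weights.to(a1.dtype)    # router weight applied up front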

vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

Lines changed: 9 additions & 5 deletions
@@ -59,6 +59,10 @@ def __init__(self,
         # combine function.
         self.handle = None
 
+    @property
+    def activation_format(self) -> mk.FusedMoEActivationFormat:
+        return mk.FusedMoEActivationFormat.BatchedExperts
+
     def max_num_tokens_per_rank(self) -> Optional[int]:
         return self.max_tokens_per_rank
 
@@ -118,8 +122,8 @@ def prepare(
         a1: torch.Tensor,
         a1_scale: Optional[torch.Tensor],
         a2_scale: Optional[torch.Tensor],
-        rank_topk_weights: torch.Tensor,
-        rank_topk_ids: torch.Tensor,
+        topk_weights: torch.Tensor,
+        topk_ids: torch.Tensor,
         num_experts: int,
         expert_map: Optional[torch.Tensor],
         apply_router_weight_on_input: bool,
@@ -142,16 +146,16 @@ def prepare(
             "low_latency kernels doesn't support dispatching per-token scales")
 
         if apply_router_weight_on_input:
-            topk = rank_topk_ids.size(1)
+            topk = topk_ids.size(1)
             # TODO: this only works for topK=1, will need to update for topK>1
             assert topk == 1, (
                 "apply_router_weight_on_input is only implemented for topk=1")
-            a1 = a1 * rank_topk_weights.to(a1.dtype)
+            a1 = a1 * topk_weights.to(a1.dtype)
 
         # Dispatch
         expert_x, expert_num_tokens, self.handle, event, hook = \
             self.buffer.low_latency_dispatch(a1,
-                                             rank_topk_ids,
+                                             topk_ids,
                                              self.max_tokens_per_rank,
                                              num_experts,
                                              use_fp8=self.use_fp8_dispatch,

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 14 additions & 0 deletions
@@ -395,6 +395,10 @@ def __init__(self, max_num_tokens: int, world_size: int, dp_size: int,
         self.rank = rank
         self.max_num_tokens = max_num_tokens
 
+    @property
+    def activation_format(self) -> mk.FusedMoEActivationFormat:
+        return mk.FusedMoEActivationFormat.BatchedExperts
+
     def max_num_tokens_per_rank(self) -> Optional[int]:
         return self.max_num_tokens
 
@@ -510,6 +514,11 @@ def __init__(
         self.world_size = world_size
         self.dp_size = dp_size
 
+    @property
+    def activation_formats(self) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        return (mk.FusedMoEActivationFormat.BatchedExperts,
+                mk.FusedMoEActivationFormat.BatchedExperts)
+
     def supports_chunking(self) -> bool:
         return False
 
@@ -615,6 +624,11 @@ def __init__(
         assert not use_int4_w4a16, "NYI"
         assert self.block_shape is None, "NYI"
 
+    @property
+    def activation_formats(self) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        return (mk.FusedMoEActivationFormat.BatchedExperts,
+                mk.FusedMoEActivationFormat.BatchedExperts)
+
     def supports_chunking(self) -> bool:
         return False
 

vllm/model_executor/layers/fused_moe/fused_moe.py

Lines changed: 5 additions & 0 deletions
@@ -1542,6 +1542,11 @@ def __init__(
             use_int4_w4a16=use_int4_w4a16)
         self.per_channel_quant = per_channel_quant
 
+    @property
+    def activation_formats(self) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
+        return (mk.FusedMoEActivationFormat.Standard,
+                mk.FusedMoEActivationFormat.Standard)
+
     def supports_chunking(self) -> bool:
         return True
 

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 18 additions & 18 deletions
@@ -23,6 +23,9 @@
 from vllm.forward_context import ForwardContext, get_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.custom_op import CustomOp
+from .modular_kernel import (FusedMoEModularKernel,
+                             FusedMoEPermuteExpertsUnpermute,
+                             FusedMoEPrepareAndFinalize)
 from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (
     is_rocm_aiter_moe_enabled)
 from vllm.model_executor.layers.quantization.base_config import (
@@ -38,9 +41,6 @@
 if current_platform.is_cuda_alike():
     from .fused_batched_moe import BatchedTritonExperts
     from .fused_moe import TritonExperts, fused_experts
-    from .modular_kernel import (FusedMoEModularKernel,
-                                 FusedMoEPermuteExpertsUnpermute,
-                                 FusedMoEPrepareAndFinalize)
     if has_pplx:
         from .pplx_prepare_finalize import PplxPrepareAndFinalize
     if has_deepep:
@@ -304,9 +304,8 @@ def init_prepare_finalize(self, moe: MoEConfig,
             act_quant_block_size = quant_config.weight_block_size
             quant_dtype = torch.float8_e4m3fn
 
-        prepare_finalize: Optional[Union[PplxPrepareAndFinalize,
-                                         DeepEPHTPrepareAndFinalize,
-                                         DeepEPLLPrepareAndFinalize]] = None
+        prepare_finalize: Optional[FusedMoEPrepareAndFinalize] = None
+
         if moe.use_pplx_kernels:
             all_to_all_args = dict(
                 max_num_tokens=moe.max_num_tokens,
@@ -399,8 +398,10 @@ def init_prepare_finalize(self, moe: MoEConfig,
             )
 
     def select_gemm_impl(
-            self, prepare_finalize: FusedMoEPrepareAndFinalize,
-            moe: Optional[MoEConfig]) -> FusedMoEPermuteExpertsUnpermute:
+        self,
+        prepare_finalize: FusedMoEPrepareAndFinalize,
+        moe: MoEConfig
+    ) -> FusedMoEPermuteExpertsUnpermute:
         # based on the all2all implementation, select the appropriate
         # gemm implementation
         raise NotImplementedError(
@@ -446,23 +447,23 @@ def __init__(self, moe: MoEConfig):
         else:
             self.rocm_aiter_fused_experts = None  # type: ignore
 
-    def select_gemm_impl(self, prepare_finalize: FusedMoEPrepareAndFinalize,
-                         moe: Optional[MoEConfig]):
+    def select_gemm_impl(
+        self,
+        prepare_finalize: FusedMoEPrepareAndFinalize,
+        moe: MoEConfig
+    ) -> FusedMoEPermuteExpertsUnpermute:
 
         assert self.fused_experts == fused_experts
 
         all2all_manager = get_ep_group().device_communicator.all2all_manager
         assert all2all_manager is not None
 
-        experts: Optional[FusedMoEPermuteExpertsUnpermute] = None
-
-        use_batched_experts = prepare_finalize.max_num_tokens_per_rank(
-        ) is not None
-        if use_batched_experts:
+        if prepare_finalize.activation_format == FusedMoEActivationFormat.BatchedExperts:
             logger.debug("BatchedTritonExperts %s", self.moe)
             assert self.moe.dp_size == all2all_manager.dp_world_size
-            experts = BatchedTritonExperts(
+            return BatchedTritonExperts(
                 max_num_tokens=self.moe.max_num_tokens,
+                # TODO (bnell): Fix this mess
                 world_size=all2all_manager.world_size,
                 # dp_size actually means tp_size, bug in pplx kernels
                 dp_size=all2all_manager.tp_group.world_size,
@@ -475,15 +476,14 @@ def select_gemm_impl(self, prepare_finalize: FusedMoEPrepareAndFinalize,
             )
         else:
             logger.debug("TritonExperts %s", self.moe)
-            experts = TritonExperts(
+            return TritonExperts(
                 use_fp8_w8a8=False,
                 use_int8_w8a8=False,
                 use_int8_w8a16=False,
                 use_int4_w4a16=False,
                 block_shape=None,
                 per_channel_quant=False,
             )
-        return experts
 
     def create_weights(self, layer: torch.nn.Module, num_experts: int,
                        hidden_size: int, intermediate_size_per_partition: int,
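
The dispatch above replaces the old max_num_tokens_per_rank() probe with an explicit check of the prepare/finalize object's activation format. A self-contained sketch of the new criterion; DummyPrepareFinalize is a stand-in for illustration, not a vLLM class:

    # Sketch of the new selection criterion in select_gemm_impl.
    from vllm.model_executor.layers.fused_moe import FusedMoEActivationFormat

    class DummyPrepareFinalize:
        # Stand-in exposing only the property the dispatch relies on.
        activation_format = FusedMoEActivationFormat.BatchedExperts

    def wants_batched_experts(prepare_finalize) -> bool:
        # Previously: prepare_finalize.max_num_tokens_per_rank() is not None.
        # Now: ask the prepare/finalize object for its activation layout.
        return (prepare_finalize.activation_format ==
                FusedMoEActivationFormat.BatchedExperts)

    print(wants_batched_experts(DummyPrepareFinalize()))  # True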
