
Commit 911339b
wip
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent c9d0f4f

11 files changed: +33, -41 lines


tests/kernels/moe/test_batched_moe.py

Lines changed: 3 additions & 0 deletions
@@ -150,6 +150,9 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
 
     use_fp8_w8a8 = dtype == torch.float8_e4m3fn
 
+    config = BatchedMMConfig(dtype, num_experts, max_tokens_per_expert, K, N)
+    tensors = BatchedMMTensors.make_tensors(config)
+
     per_act_token_quant = False
 
     if block_shape is not None and not use_fp8_w8a8:
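
The added lines build the test inputs through a small config object plus a tensor factory. Below is a minimal, self-contained sketch of that pattern; the Demo* names, fields, and shapes are assumptions for illustration only, not the actual BatchedMMConfig/BatchedMMTensors helpers in vLLM's test suite.

# Hypothetical sketch of a config + tensor-factory pair for a batched-MM test.
from dataclasses import dataclass

import torch


@dataclass
class DemoBatchedMMConfig:
    dtype: torch.dtype
    num_experts: int
    max_tokens_per_expert: int
    K: int
    N: int


@dataclass
class DemoBatchedMMTensors:
    # One slice of A/B/C per expert, padded to max_tokens_per_expert rows.
    A: torch.Tensor  # (E, max_tokens_per_expert, K)
    B: torch.Tensor  # (E, N, K)
    C: torch.Tensor  # (E, max_tokens_per_expert, N)

    @staticmethod
    def make_tensors(config: DemoBatchedMMConfig) -> "DemoBatchedMMTensors":
        E, M = config.num_experts, config.max_tokens_per_expert
        K, N = config.K, config.N
        # Allocate in float32 here; the real helpers also handle quantized
        # dtypes such as float8_e4m3fn separately.
        A = torch.randn(E, M, K, dtype=torch.float32) / 10
        B = torch.randn(E, N, K, dtype=torch.float32) / 10
        C = torch.zeros(E, M, N, dtype=torch.float32)
        return DemoBatchedMMTensors(A, B, C)


tensors = DemoBatchedMMTensors.make_tensors(
    DemoBatchedMMConfig(torch.float32, num_experts=4,
                        max_tokens_per_expert=16, K=32, N=64))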

tests/kernels/moe/test_pplx_moe.py

Lines changed: 9 additions & 10 deletions
@@ -18,22 +18,21 @@
 except ImportError:
     has_pplx = False
 
-#from tests.kernels.quant_utils import native_w8a8_block_matmul
 from tests.kernels.moe.utils import (make_test_weights, naive_batched_moe,
                                      torch_moe2)
-from tests.pplx_utils import ProcessGroupInfo, parallel_launch
 from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.model_executor.layers.fused_moe import (BatchedTritonExperts,
-                                                  FusedMoEConfig,
-                                                  FusedMoEModularKernel,
-                                                  fused_topk,
-                                                  get_default_config,
-                                                  override_config)
+from vllm.model_executor.layers.fused_moe import override_config
+from vllm.model_executor.layers.fused_moe.fused_moe import get_default_config
+from vllm.model_executor.layers.fused_moe.config import FusedMoEConfig
+from vllm.model_executor.layers.fused_moe.modular_kernel import (
+    FusedMoEModularKernel)
 from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
-    BatchedPrepareAndFinalize, NaiveBatchedExperts)
+    BatchedPrepareAndFinalize, BatchedTritonExperts, NaiveBatchedExperts)
 from vllm.platforms import current_platform
 from vllm.utils import round_up
 
+from .deepep_utils import ProcessGroupInfo, parallel_launch
+
 requires_pplx = pytest.mark.skipif(
     not has_pplx,
     reason="Requires PPLX kernels",

@@ -542,7 +541,7 @@ def _pplx_moe(
     qtype: Optional[torch.dtype] = None,
     per_act_token_quant: bool = False,
     block_shape: Optional[list[int]] = None,
-    use_internode: bool,
+    use_internode: bool = False,
 ):
     if use_internode:
         uid = nvshmem_get_unique_id(
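
Giving use_internode a default of False is the behavioral piece of the second hunk: depending on whether the parameter is ordinary or keyword-only in the full signature, the old form either was invalid (a non-default parameter may not follow defaulted ones) or forced every caller to pass the flag. A tiny hypothetical illustration of what the default buys, not the real _pplx_moe signature:

# Hypothetical trimmed-down driver; callers that never exercise internode
# mode can simply omit the flag.
from typing import Optional


def launch_case(block_shape: Optional[list[int]] = None,
                use_internode: bool = False) -> str:
    return "internode" if use_internode else "intranode"


assert launch_case() == "intranode"            # existing call sites keep working
assert launch_case(use_internode=True) == "internode"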

tests/kernels/moe/utils.py

Lines changed: 1 addition & 1 deletion
@@ -106,7 +106,7 @@ def torch_moe2(
     a, a_scale = moe_kernel_quantize_input(a, None, quant_type,
                                            per_act_token_quant, block_shape)
 
-    print(f"XXX {quant_type} {block_shape} {a.shape} {a_scale}")
+    #print(f"XXX {quant_type} {block_shape} {a.shape} {a_scale}")
 
     out = torch.zeros(M * topk,
                       w2.shape[1],

vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py

Lines changed: 2 additions & 5 deletions
@@ -65,13 +65,9 @@ def __init__(self,
             max_num_tokens=self.max_num_tokens,
             world_size=self.world_size,
             dp_size=self.dp_size,
-            block_shape=self.block_shape,
+            block_shape=self.block_shape,  # type: ignore[arg-type]
         ) if self.allow_deep_gemm else None
 
-        assert (self.batched_triton_experts is not None
-                or (self.allow_deep_gemm
-                    and self.batched_deep_gemm_experts is not None))
-
         assert (self.batched_deep_gemm_experts is not None
                 or self.batched_triton_experts is not None)
 
@@ -96,6 +92,7 @@ def workspace_shapes(
         # workspaces so we can be pessimistic here and allocate for DeepGemm
         # even if we fall back to triton later, e.g. if expert maps are set.
         if self.allow_deep_gemm:
+            assert self.batched_deep_gemm_experts is not None
            return self.batched_deep_gemm_experts.workspace_shapes(
                a, aq, M, N, K, topk, global_num_experts, local_num_experts)
        else:
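
Both hunks tighten the Optional handling: the constructor keeps a single invariant (at least one backend exists), and workspace_shapes asserts the deep-gemm backend right before dereferencing it, which is also what narrows the Optional for a type checker. Below is a hypothetical sketch of that dispatch pattern under the assumption that the deep-gemm backend needs the larger workspace; these are not the vLLM classes.

# Sketch: two optional backends, pessimistic workspace sizing, assert-to-narrow.
from typing import Optional


class _Backend:
    """Stand-in for a batched-experts implementation."""

    def __init__(self, workspace_elems: int):
        self.workspace_elems = workspace_elems

    def workspace_shapes(self, M: int, N: int) -> tuple[int, int]:
        return (M, max(N, self.workspace_elems))


class _Dispatcher:
    def __init__(self, allow_deep_gemm: bool):
        self.allow_deep_gemm = allow_deep_gemm
        # Assumption: the deep-gemm-like backend needs the larger workspace.
        self.deep_gemm: Optional[_Backend] = (
            _Backend(workspace_elems=256) if allow_deep_gemm else None)
        self.triton: Optional[_Backend] = _Backend(workspace_elems=128)
        # One invariant is enough: at least one backend must exist.
        assert self.deep_gemm is not None or self.triton is not None

    def workspace_shapes(self, M: int, N: int) -> tuple[int, int]:
        # Be pessimistic: if deep-gemm is allowed, size for it even if the
        # actual call later falls back to the triton path.
        if self.allow_deep_gemm:
            assert self.deep_gemm is not None  # narrows the Optional for mypy
            return self.deep_gemm.workspace_shapes(M, N)
        assert self.triton is not None
        return self.triton.workspace_shapes(M, N)


print(_Dispatcher(allow_deep_gemm=True).workspace_shapes(64, 128))  # (64, 256)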

vllm/model_executor/layers/fused_moe/cutlass_moe.py

Lines changed: 1 addition & 6 deletions
@@ -347,19 +347,14 @@ def cutlass_moe_fp8(
                     a2_scale.numel() != 1 if a2_scale is not None else False)
     per_out_ch = w1_scale.numel() != w1_q.shape[0]
 
-    out_dtype = a.dtype
-
-    if out_dtype is None:
-        out_dtype = a.dtype
-
     num_experts = global_num_experts if global_num_experts != -1 else w1_q.size(
         0)
 
     fn = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         CutlassExpertsFp8(
             max_experts_per_worker=num_experts,
-            out_dtype=out_dtype,
+            out_dtype=a.dtype,
             per_act_token_quant=per_act_token,
             per_out_ch_quant=per_out_ch,
             use_batched_format=False,

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 6 additions & 6 deletions
@@ -627,11 +627,6 @@ def __init__(
         block_shape: Optional[list[int]] = None,
         per_act_token_quant: bool = False,
     ):
-        super().__init__()
-        assert not use_fp8_w8a8, "NYI"
-        assert not use_int8_w8a8, "NYI"
-        assert not use_int8_w8a16, "NYI"
-        assert not use_int4_w4a16, "NYI"
         super().__init__(
             FusedMoEQuantConfig.make(
                 use_fp8_w8a8=use_fp8_w8a8,

@@ -641,6 +636,10 @@
                 per_act_token_quant=per_act_token_quant,
                 block_shape=block_shape,
             ))
+        assert not use_fp8_w8a8, "NYI"
+        assert not use_int8_w8a8, "NYI"
+        assert not use_int8_w8a16, "NYI"
+        assert not use_int4_w4a16, "NYI"
         self.max_num_tokens = max_num_tokens
         self.world_size = world_size
         self.dp_size = dp_size

@@ -928,7 +927,8 @@ def apply(
         intermediate_cache2 = _resize_cache(workspace2,
                                             (E, max_num_tokens, N // 2))
 
-        intermediate_cache1.fill_(0)
+        if self.use_fp8_w8a8:
+            intermediate_cache1.fill_(0)
 
         #print(f"A1_SCALES {a1q_scale.shape}")
 
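The last hunk stops unconditionally clearing intermediate_cache1 and only zero-fills it on the fp8 path. A hedged sketch of the idea follows: a workspace view carved out of a larger buffer may hold stale data, and the clear is paid only on the path that is assumed to read otherwise-unwritten regions. The names and shapes here are hypothetical, not the vLLM kernel code.

# Sketch: view a slice of a reused workspace and zero it only when needed.
import math

import torch


def prepare_cache(workspace: torch.Tensor, shape: tuple[int, ...],
                  use_fp8_w8a8: bool) -> torch.Tensor:
    """Return a view of `workspace` with `shape`, optionally zeroed."""
    cache = workspace[:math.prod(shape)].view(shape)
    if use_fp8_w8a8:
        # Zeroing costs a full pass over the buffer, so only do it on the
        # path assumed to depend on a clean buffer.
        cache.fill_(0)
    return cache


ws = torch.empty(4 * 16 * 32)
c1 = prepare_cache(ws, (4, 16, 32), use_fp8_w8a8=True)
assert bool((c1 == 0).all())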

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 2 additions & 5 deletions
@@ -93,9 +93,6 @@ def init_prepare_finalize(self, moe: FusedMoEConfig,
             block_shape=moe.block_shape,
         )
 
-        logger.debug("All2All %s, %s = %s/%s", moe.quant_dtype,
-                     moe.block_shape, hidden_dim_bytes, hidden_scale_bytes)
-
         all_to_all_args = dict(
             max_num_tokens=moe.max_num_tokens,
             num_experts=moe.num_experts,

@@ -223,7 +220,8 @@ def select_gemm_impl(
         self,
         prepare_finalize: FusedMoEPrepareAndFinalize,
         moe: FusedMoEConfig
-    ):
+    ) -> FusedMoEPermuteExpertsUnpermute:
+
         assert self.fused_experts == fused_experts
 
         all2all_manager = get_ep_group().device_communicator.all2all_manager

@@ -664,7 +662,6 @@ def __init__(
 
         logger.debug("MODEL DTYPE %s", model_dtype)
 
-        # TODO: put quant info into FusedMoEConifg here
         moe = FusedMoEConfig.make(
             num_experts=self.global_num_experts,
             experts_per_token=top_k,
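
The select_gemm_impl hunk here (and the matching one in compressed_tensors_moe.py below) replaces an unannotated signature with explicit parameter and return types, so a type checker can verify that every override actually returns an experts implementation. A stripped-down hypothetical illustration, not vLLM's class hierarchy:

# Sketch: a precisely typed hook makes "forgot to return" a mypy error.
from abc import ABC, abstractmethod


class ExpertsImpl:
    """Stand-in for FusedMoEPermuteExpertsUnpermute."""


class MoEMethodBase(ABC):
    @abstractmethod
    def select_gemm_impl(self, prepare_finalize: object,
                         moe: object) -> ExpertsImpl:
        ...


class CutlassLikeMethod(MoEMethodBase):
    def select_gemm_impl(self, prepare_finalize: object,
                         moe: object) -> ExpertsImpl:
        # With the declared return type, implicitly returning None would be
        # flagged by mypy instead of surfacing later at runtime.
        return ExpertsImpl()


impl = CutlassLikeMethod().select_gemm_impl(None, None)
assert isinstance(impl, ExpertsImpl)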

vllm/model_executor/layers/fused_moe/modular_kernel.py

Lines changed: 0 additions & 2 deletions
@@ -88,8 +88,6 @@ def _moe_problem_size(
 
 
 # TODO: pass FusedMoEParallelConfig in as ctor parameter?
-
-
 class FusedMoEPrepareAndFinalize(ABC):
     """
     An abstract base class for the [Quantize-Prepare] and [Finalize] steps

vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ def workspace_shapes(
         # Note: the deep gemm workspaces are strictly larger than the triton
         # workspaces so we can be pessimistic here and allocate for DeepGemm
         # even if we fall back to triton later, e.g. if expert maps are set.
-        if (self.allow_deep_gemm and _valid_deep_gemm_shape(M, N, K)):
+        if self.allow_deep_gemm and _valid_deep_gemm_shape(M, N, K):
             return self.deep_gemm_expert.workspace_shapes(
                 a, aq, M, N, K, topk, global_num_experts, local_num_experts)
         else:

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 5 additions & 1 deletion
@@ -581,7 +581,11 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         layer.w13_weight_scale = torch.nn.Parameter(max_w13_scales,
                                                     requires_grad=False)
 
-    def select_gemm_impl(self, prepare_finalize, moe):
+    def select_gemm_impl(
+        self,
+        prepare_finalize: FusedMoEPrepareAndFinalize,
+        moe: MoEConfig,
+    ) -> FusedMoEPermuteExpertsUnpermute:
         from vllm.model_executor.layers.fused_moe.cutlass_moe import (
             CutlassExpertsFp8)
 