
Commit db773b0

fixes
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 6f9b1e7 commit db773b0

8 files changed: +33 -24 lines


tests/kernels/moe/test_deepep_moe.py

Lines changed: 5 additions & 2 deletions

@@ -154,20 +154,23 @@ def make_modular_kernel(pg: ProcessGroup, pgi: ProcessGroupInfo,
                                         deepep_ll_args = ll_args)

    if low_latency_mode:
+       # TODO(bnell): block_shape?
        fused_experts = BatchedTritonExperts(
            max_num_tokens=MAX_TOKENS_PER_RANK,
            world_size=pgi.world_size,
            dp_size=dp_size,
            use_fp8_w8a8=is_quantized,
            use_int8_w8a8=False,
            use_int8_w8a16=False,
-           use_int4_w4a16=False)
+           use_int4_w4a16=False,
+           per_act_token_quant=False)
    else:
+       # TODO(bnell): block_shape?
        fused_experts = TritonExperts(use_fp8_w8a8=is_quantized,
                                      use_int8_w8a8=False,
                                      use_int8_w8a16=False,
                                      use_int4_w4a16=False,
-                                     per_channel_quant=False)
+                                     per_act_token_quant=False)

    mk = FusedMoEModularKernel(prepare_finalize=a2a,
                               fused_experts=fused_experts)
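
Note on the rename above: `per_channel_quant` becomes `per_act_token_quant` at these call sites. As a minimal sketch of what the flag selects (plain torch, not code from this commit; the helper and its clamping details are illustrative assumptions), per-act-token quantization keeps one fp8 scale per activation row, while the default keeps a single per-tensor scale:

    import torch

    def quantize_fp8(a: torch.Tensor, per_act_token_quant: bool):
        finfo = torch.finfo(torch.float8_e4m3fn)
        if per_act_token_quant:
            # One scale per token (row): shape (num_tokens, 1).
            scale = a.abs().amax(dim=-1, keepdim=True).float() / finfo.max
        else:
            # A single per-tensor scale: shape (1, 1).
            scale = a.abs().amax().float().view(1, 1) / finfo.max
        scale = scale.clamp(min=torch.finfo(torch.float32).tiny)
        aq = (a / scale).clamp(finfo.min, finfo.max).to(torch.float8_e4m3fn)
        return aq, scale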

tests/kernels/moe/test_pplx_cutlass_moe.py

Lines changed: 1 addition & 1 deletion

@@ -93,7 +93,7 @@ def pplx_cutlass_moe(
        num_experts=num_experts,
        experts_per_token=topk,
        rank=rank,
-       world_size=pgi.world_size,
+       world_size=world_size,
        dp_size=dp_size,
        hidden_dim=hidden_dim,
        hidden_dim_bytes=hidden_dim,  # because a.dtype.itemsize == 1

tests/kernels/moe/test_pplx_moe.py

Lines changed: 7 additions & 5 deletions

@@ -429,11 +429,13 @@ def pplx_moe(
        dp_size,
    )

-   experts = BatchedTritonExperts(max_num_tokens=max_num_tokens,
-                                  world_size=world_size,
-                                  dp_size=dp_size,
-                                  use_fp8_w8a8=qtype == torch.float8_e4m3fn,
-                                  block_shape=block_shape)
+   experts = BatchedTritonExperts(
+       max_num_tokens=max_num_tokens,
+       world_size=world_size,
+       dp_size=dp_size,
+       use_fp8_w8a8=qtype == torch.float8_e4m3fn,
+       block_shape=block_shape,
+   )

    fused_experts = FusedMoEModularKernel(
        prepare_finalize,

tests/kernels/moe/utils.py

Lines changed: 1 addition & 1 deletion

@@ -206,8 +206,8 @@ def batched_moe(
                             dp_size=1,
                             rank=0),
        BatchedTritonExperts(max_num_tokens=max_num_tokens,
-                            dp_size=1,
                             world_size=1,
+                            dp_size=1,
                             use_fp8_w8a8=qtype == torch.float8_e4m3fn,
                             per_act_token_quant=per_act_token,
                             block_shape=block_shape)

vllm/model_executor/layers/fused_moe/cutlass_moe.py

Lines changed: 9 additions & 5 deletions

@@ -219,6 +219,7 @@ def __init__(
            per_act_token_quant=per_act_token_quant,
            block_shape=block_shape,
        )
+       assert max_experts_per_worker > 0
        self.max_experts_per_worker = max_experts_per_worker
        self.out_dtype = out_dtype
        self.per_out_ch_quant = per_out_ch_quant

@@ -249,7 +250,7 @@ def workspace_shapes(
        workspace1 = (M * topk, max(2 * N, K))
        workspace2 = (M * topk, N)
        output = (M * topk, K)
-       return (workspace1, workspace2, output, self.out_dtype)
+       return (workspace1, workspace2, output, self.out_dtype if self.out_dtype is not None else a.dtype)

    def apply(
        self,

@@ -278,8 +279,9 @@ def apply(
                activation_callable, global_num_experts,
                expert_map, w1_scale, w2_scale, a1q_scale,
                a2_scale, workspace13, workspace2,
-               expert_num_tokens, self.out_dtype,
-               self.per_act_token, self.per_out_ch,
+               expert_num_tokens,
+               self.out_dtype if self.out_dtype is not None else hidden_states.dtype,
+               self.per_act_token_quant, self.per_out_ch_quant,
                self.use_batched_format)

@@ -343,10 +345,12 @@ def cutlass_moe_fp8(
    if out_dtype is None:
        out_dtype = a.dtype

+   num_experts = global_num_experts if global_num_experts != -1 else w1_q.size(0)
+
    fn = mk.FusedMoEModularKernel(
        MoEPrepareAndFinalizeNoEP(),
        CutlassExpertsFp8(
-           max_experts_per_worker=global_num_experts,
+           max_experts_per_worker=num_experts,
            out_dtype=out_dtype,
            per_act_token_quant=per_act_token,
            per_out_ch_quant=per_out_ch,

@@ -362,7 +366,7 @@ def cutlass_moe_fp8(
        topk_ids,
        False,
        activation,
-       global_num_experts if global_num_experts != -1 else w1_q.size(0),
+       num_experts,
        expert_map,
        w1_scale,
        w2_scale,
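
Note: the two `out_dtype` changes above give `None` a defined meaning ("follow the activation dtype") instead of letting `None` flow into the kernel. A minimal sketch of the fallback rule (the helper name is illustrative, not part of the commit):

    from typing import Optional
    import torch

    def resolve_out_dtype(out_dtype: Optional[torch.dtype],
                          hidden_states: torch.Tensor) -> torch.dtype:
        # Mirrors the inline conditionals above: an explicit out_dtype wins,
        # otherwise the output inherits the input activation dtype.
        return out_dtype if out_dtype is not None else hidden_states.dtype

    a = torch.randn(4, 8, dtype=torch.bfloat16)
    assert resolve_out_dtype(None, a) is torch.bfloat16
    assert resolve_out_dtype(torch.float16, a) is torch.float16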

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 6 additions & 6 deletions

@@ -799,12 +799,12 @@ def __init__(
        max_num_tokens: int,
        world_size: int,
        dp_size: int,
-       use_fp8_w8a8: bool,
-       use_int8_w8a8: bool,
-       use_int8_w8a16: bool,
-       use_int4_w4a16: bool,
-       per_act_token_quant: bool,
-       block_shape: Optional[list[int]],
+       use_fp8_w8a8: bool = False,
+       use_int8_w8a8: bool = False,
+       use_int8_w8a16: bool = False,
+       use_int4_w4a16: bool = False,
+       per_act_token_quant: bool = False,
+       block_shape: Optional[list[int]] = None,
    ):
        quant_dtype = get_config_quant_dtype(
            use_fp8_w8a8=use_fp8_w8a8,
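
With these defaults, callers only have to pass the batching and parallelism arguments; the quantization flags can be omitted on the unquantized path (which is what lets the test call sites above drop most keywords). A hedged usage sketch with illustrative values:

    # Unquantized config: the use_* flags default to False,
    # per_act_token_quant to False, and block_shape to None.
    experts = BatchedTritonExperts(
        max_num_tokens=64,
        world_size=1,
        dp_size=1,
    )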

vllm/model_executor/layers/fused_moe/prepare_finalize.py

Lines changed: 3 additions & 3 deletions

@@ -43,9 +43,9 @@ def prepare(
        a1.mul_(topk_weights.to(a1.dtype))

        a1q, a1q_scale = moe_kernel_quantize_input(a1, a1_scale,
-                                                  self.quant_dtype,
-                                                  self.per_act_token_quant,
-                                                  self.block_shape)
+                                                  quant_dtype,
+                                                  per_act_token_quant,
+                                                  block_shape)

        return a1q, a1q_scale, None, None, None
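
Note: `prepare` now reads the quantization parameters from local names rather than attributes stored on the prepare/finalize object; presumably `quant_dtype`, `per_act_token_quant`, and `block_shape` arrive per call (as arguments or unpacked from a quant config), so one object can serve calls with different quantization settings.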

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 1 addition & 1 deletion

@@ -592,7 +592,7 @@ def select_gemm_impl(self, prepare_finalize, moe):

        assert moe is not None

-       # method on prepare_finalize?
+       # method on prepare_finalize? sketchy getting world_size from prepare_finalize
        max_experts_per_worker = (
            (moe.num_experts + prepare_finalize.world_size - 1) //
            prepare_finalize.world_size)
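
The expression under the amended comment is an integer ceiling division: each worker is sized for ceil(num_experts / world_size) experts. A worked example with illustrative numbers:

    import math

    num_experts, world_size = 10, 4
    max_experts_per_worker = (num_experts + world_size - 1) // world_size
    assert max_experts_per_worker == 3 == math.ceil(num_experts / world_size)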
