
Commit 90772e8

review comments + fixes
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent 1b2a672

File tree (5 files changed: +14, -11 lines)

  tests/kernels/moe/test_batched_moe.py
  tests/kernels/moe/test_block_fp8.py
  tests/kernels/moe/test_block_int8.py
  vllm/model_executor/layers/fused_moe/cutlass_moe.py
  vllm/model_executor/layers/fused_moe/layer.py

tests/kernels/moe/test_batched_moe.py

Lines changed: 0 additions & 4 deletions
@@ -95,8 +95,6 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
     act_dtype = dtype
     quant_dtype = None

-    #print(f"TYPES {dtype}, {act_dtype}, {quant_dtype}")
-
     num_expert_tokens = torch.randint(low=0,
                                       high=max_tokens_per_expert,
                                       size=(num_experts, ),
@@ -226,8 +224,6 @@ def test_fused_moe_batched_experts(
                               in_dtype=act_dtype,
                               quant_dtype=quant_dtype)

-    torch.set_printoptions(profile="full")
-
     with set_current_vllm_config(vllm_config):
         topk_weight, topk_ids, _ = fused_topk(a, score, topk, False)
         batched_output = batched_moe(a, w1, w2, topk_weight, topk_ids, w1_s,

tests/kernels/moe/test_block_fp8.py

Lines changed: 0 additions & 1 deletion
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# Adapted from https://github.com/sgl-project/sglang/pull/2575
 import itertools

 import pytest

tests/kernels/moe/test_block_int8.py

Lines changed: 0 additions & 1 deletion
@@ -1,7 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_block_int8.py
 import itertools

 import pytest

vllm/model_executor/layers/fused_moe/cutlass_moe.py

Lines changed: 6 additions & 2 deletions
@@ -231,8 +231,12 @@ def __init__(
     def activation_formats(
             self
     ) -> tuple[mk.FusedMoEActivationFormat, mk.FusedMoEActivationFormat]:
-        return (mk.FusedMoEActivationFormat.Standard,
-                mk.FusedMoEActivationFormat.Standard)
+        if self.use_batched_format:
+            return (mk.FusedMoEActivationFormat.BatchedExperts,
+                    mk.FusedMoEActivationFormat.BatchedExperts)
+        else:
+            return (mk.FusedMoEActivationFormat.Standard,
+                    mk.FusedMoEActivationFormat.Standard)

     def supports_chunking(self) -> bool:
         return not self.use_batched_format
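The behavioural change in this hunk is that `activation_formats` now reports the batched layout when the experts object was constructed with `use_batched_format=True`, instead of always reporting `Standard`. Below is a self-contained sketch of that switch; `_ExpertsSketch` and the locally defined `FusedMoEActivationFormat` enum are simplified stand-ins for the real classes in `cutlass_moe.py` and `mk.FusedMoEActivationFormat`, not the actual implementation.

```python
# Stand-alone sketch only: _ExpertsSketch and this local enum are simplified
# stand-ins for the real cutlass_moe.py classes and mk.FusedMoEActivationFormat.
from enum import Enum


class FusedMoEActivationFormat(Enum):
    Standard = "standard"
    BatchedExperts = "batched_experts"


class _ExpertsSketch:

    def __init__(self, use_batched_format: bool):
        self.use_batched_format = use_batched_format

    @property
    def activation_formats(
        self
    ) -> tuple[FusedMoEActivationFormat, FusedMoEActivationFormat]:
        # Mirrors the change above: batched experts advertise the batched
        # activation layout for both their input and output.
        if self.use_batched_format:
            return (FusedMoEActivationFormat.BatchedExperts,
                    FusedMoEActivationFormat.BatchedExperts)
        else:
            return (FusedMoEActivationFormat.Standard,
                    FusedMoEActivationFormat.Standard)

    def supports_chunking(self) -> bool:
        # Chunking stays limited to the standard (non-batched) layout.
        return not self.use_batched_format


# The batched flavor now reports BatchedExperts instead of Standard.
assert (_ExpertsSketch(True).activation_formats ==
        (FusedMoEActivationFormat.BatchedExperts,
         FusedMoEActivationFormat.BatchedExperts))
assert _ExpertsSketch(False).supports_chunking()
```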

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 8 additions & 3 deletions
@@ -607,7 +607,13 @@ def __init__(
         if params_dtype is None:
            params_dtype = torch.get_default_dtype()
         self.params_dtype = params_dtype
-        all2all_manager = get_ep_group().device_communicator.all2all_manager
+
+        if ep_size is not None:
+            all2all_manager = get_ep_group().device_communicator.all2all_manager
+            world_size = (all2all_manager.world_size
+                          if all2all_manager is not None else 1)
+        else:
+            world_size = 1

         vllm_config = get_current_vllm_config()
         self.moe_parallel_config: FusedMoEParallelConfig = (
@@ -616,8 +622,7 @@ def __init__(
                      get_tensor_model_parallel_world_size()),
            dp_size_=(dp_size if dp_size is not None else
                      get_dp_group().world_size),
-            world_size_=(all2all_manager.world_size
-                         if all2all_manager is not None else 1),
+            world_size_=world_size,
            vllm_parallel_config=vllm_config.parallel_config))

         self.global_num_experts = num_experts
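These hunks move the all2all-manager lookup behind the `ep_size is not None` check, so a construction without an explicit `ep_size` no longer touches `get_ep_group()` and simply uses a world size of 1. A minimal sketch of that fallback in isolation follows; `resolve_world_size` and `_FakeAll2All` are illustrative names introduced here, not part of vLLM.

```python
from typing import Optional


def resolve_world_size(ep_size: Optional[int], all2all_manager) -> int:
    """Sketch of the world-size fallback shown in the diff above.

    The all2all manager is consulted only when an explicit ep_size was
    passed; in every other case the layer falls back to a world size of 1.
    """
    if ep_size is not None:
        return (all2all_manager.world_size
                if all2all_manager is not None else 1)
    return 1


class _FakeAll2All:
    # Stand-in for device_communicator.all2all_manager in a 4-rank setup.
    world_size = 4


assert resolve_world_size(None, None) == 1          # no ep_size -> 1
assert resolve_world_size(2, None) == 1             # ep_size but no manager -> 1
assert resolve_world_size(2, _FakeAll2All()) == 4   # manager present -> its size
```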
