
Commit 8a4f667

more fixes
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent e3dad30 commit 8a4f667

3 files changed (+8 -7 lines)


vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@
 
 # DeepEP kernels quantize dispatch inputs in 128 element chunks.
 DEEPEP_QUANT_BLOCK_SIZE = 128
+DEEPEP_QUANT_BLOCK_SHAPE = [DEEPEP_QUANT_BLOCK_SIZE, DEEPEP_QUANT_BLOCK_SIZE]
 
 
 def dequant_fp8(expert_x_fp8: torch.Tensor,
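The new constant simply pairs the existing 128-element chunk size into a 2D [128, 128] block shape, so callers can compare a quant config's block_shape against it directly. For context, a minimal sketch of 128-element block quantization along the hidden dimension; this is not the DeepEP kernel, and the function name and the choice of torch.float8_e4m3fn are illustrative assumptions:

import torch

DEEPEP_QUANT_BLOCK_SIZE = 128

def quant_fp8_blocked(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    # Quantize (num_tokens, hidden) to fp8 with one scale per 128-element
    # chunk of the hidden dimension, mirroring the comment in the diff.
    num_tokens, hidden = x.shape
    assert hidden % DEEPEP_QUANT_BLOCK_SIZE == 0
    blocks = x.view(num_tokens, hidden // DEEPEP_QUANT_BLOCK_SIZE,
                    DEEPEP_QUANT_BLOCK_SIZE)
    finfo = torch.finfo(torch.float8_e4m3fn)
    scales = blocks.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / finfo.max
    x_fp8 = (blocks / scales).to(torch.float8_e4m3fn).view(num_tokens, hidden)
    return x_fp8, scales.squeeze(-1)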

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 1 addition & 1 deletion
@@ -685,7 +685,7 @@ def workspace_shapes(
         local_num_experts: int,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
         assert a.dim() == 2
-        num_dp = self.world_size // self.dp_size
+        num_dp = self.world_size
         num_experts = local_num_experts
         max_num_tokens = self.max_num_tokens
         workspace13 = (num_experts, max_num_tokens * num_dp, max(K, N))
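The one-line fix widens the workspace: each of the world_size ranks may contribute up to max_num_tokens rows per local expert, so dividing by dp_size under-allocated the buffer. A standalone sketch of the shape arithmetic, with the diff's names lifted out of the class (the helper itself is hypothetical):

def workspace13_shape(world_size: int, local_num_experts: int,
                      max_num_tokens: int, N: int, K: int) -> tuple[int, ...]:
    # After this commit the row budget is world_size * max_num_tokens,
    # not (world_size // dp_size) * max_num_tokens.
    num_dp = world_size
    return (local_num_experts, max_num_tokens * num_dp, max(K, N))

# Example: 8 ranks, 2 local experts, 64 tokens per rank, K=7168, N=4096
# gives (2, 512, 7168).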

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 6 additions & 6 deletions
@@ -44,7 +44,7 @@
     pplx_hidden_dim_scale_bytes)
 if has_deep_ep():
     from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
-    from .deepep_ll_prepare_finalize import (DEEPEP_QUANT_BLOCK_SIZE,
+    from .deepep_ll_prepare_finalize import (DEEPEP_QUANT_BLOCK_SHAPE,
                                              DeepEPLLPrepareAndFinalize)
 else:
     fused_experts = None  # type: ignore
@@ -159,11 +159,11 @@ def init_prepare_finalize(self, moe: FusedMoEConfig,
 
         # Note : We may want to use FP8 dispatch even otherwise just to
         # reduce datamovement
-        assert (moe.quant_config is not None
-                and moe.quant_config.block_shape is not None)
-        use_fp8_dispatch = (
-            moe.quant_config.quant_dtype == current_platform.fp8_dtype()
-            and moe.quant_config.block_shape[1] == DEEPEP_QUANT_BLOCK_SIZE)
+        assert moe.quant_config is not None
+        use_fp8_dispatch = (moe.quant_config.quant_dtype
+                            == current_platform.fp8_dtype()
+                            and moe.quant_config.block_shape
+                            == DEEPEP_QUANT_BLOCK_SHAPE)
 
         # Note (varun): Whether to use FP8 dispatch or not needs some
         # profiling. Turning it off for now.
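Pulled out of the class, the new dispatch decision reduces to a dtype check plus a whole-shape equality. A minimal sketch, assuming torch.float8_e4m3fn stands in for current_platform.fp8_dtype() and with an illustrative helper name:

import torch

DEEPEP_QUANT_BLOCK_SHAPE = [128, 128]

def should_use_fp8_dispatch(quant_dtype: torch.dtype,
                            block_shape: list[int] | None) -> bool:
    # FP8 dispatch only applies when the model already quantizes to the
    # platform's fp8 dtype with exactly the 128x128 block shape the
    # DeepEP kernels use.
    return (quant_dtype == torch.float8_e4m3fn
            and block_shape == DEEPEP_QUANT_BLOCK_SHAPE)

Dropping the old block_shape-is-not-None assert is safe here: a None block_shape compares unequal to DEEPEP_QUANT_BLOCK_SHAPE, so the expression simply evaluates to False.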
