[Bugfix] Allocate less memory in non-batched CUTLASS MoE (#21121)

ElizaWszola · web-flow · commit 4adc66f64d56 · 2025-07-18T18:55:52.000+08:00
Signed-off-by: ElizaWszola <ewszola@redhat.com>
diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py
@@ -283,8 +283,8 @@ def workspace_shapes(
                           (N // 2))
             output = (self.max_experts_per_worker, padded_M, K)
         else:
-            workspace1 = (M * topk, max(2 * N, K))
-            workspace2 = (M * topk, N)
+            workspace1 = (M * topk, max(N, K))
+            workspace2 = (M * topk, N // 2)
             output = (M * topk, K)
         return (workspace1, workspace2, output,
                 self.out_dtype if self.out_dtype is not None else a.dtype)