
Commit 8a4f667

more fixes
Signed-off-by: Bill Nell <bnell@redhat.com>
1 parent e3dad30 commit 8a4f667

3 files changed (+8 -7 lines)


vllm/model_executor/layers/fused_moe/deepep_ll_prepare_finalize.py

Lines changed: 1 addition & 0 deletions
@@ -11,6 +11,7 @@
 
 # DeepEP kernels quantize dispatch inputs in 128 element chunks.
 DEEPEP_QUANT_BLOCK_SIZE = 128
+DEEPEP_QUANT_BLOCK_SHAPE = [DEEPEP_QUANT_BLOCK_SIZE, DEEPEP_QUANT_BLOCK_SIZE]
 
 
 def dequant_fp8(expert_x_fp8: torch.Tensor,
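The new constant simply pairs the existing 128-element chunk size into a 2D [128, 128] block shape, so callers can compare a quant config's block_shape against it directly. For context, a minimal sketch of 128-element block quantization along the hidden dimension; this is not the DeepEP kernel, and the function name and the choice of torch.float8_e4m3fn are illustrative assumptions:

import torch

DEEPEP_QUANT_BLOCK_SIZE = 128

def quant_fp8_blocked(x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
    # Quantize (num_tokens, hidden) to fp8 with one scale per 128-element
    # chunk of the hidden dimension, mirroring the comment in the diff.
    num_tokens, hidden = x.shape
    assert hidden % DEEPEP_QUANT_BLOCK_SIZE == 0
    blocks = x.view(num_tokens, hidden // DEEPEP_QUANT_BLOCK_SIZE,
                    DEEPEP_QUANT_BLOCK_SIZE)
    finfo = torch.finfo(torch.float8_e4m3fn)
    scales = blocks.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / finfo.max
    x_fp8 = (blocks / scales).to(torch.float8_e4m3fn).view(num_tokens, hidden)
    return x_fp8, scales.squeeze(-1)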

vllm/model_executor/layers/fused_moe/fused_batched_moe.py

Lines changed: 1 addition & 1 deletion
@@ -685,7 +685,7 @@ def workspace_shapes(
         local_num_experts: int,
     ) -> tuple[tuple[int, ...], tuple[int, ...], tuple[int, ...], torch.dtype]:
         assert a.dim() == 2
-        num_dp = self.world_size // self.dp_size
+        num_dp = self.world_size
         num_experts = local_num_experts
         max_num_tokens = self.max_num_tokens
         workspace13 = (num_experts, max_num_tokens * num_dp, max(K, N))
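The one-line fix widens the workspace: each of the world_size ranks may contribute up to max_num_tokens rows per local expert, so dividing by dp_size under-allocated the buffer. A standalone sketch of the shape arithmetic, with the diff's names lifted out of the class (the helper itself is hypothetical):

def workspace13_shape(world_size: int, local_num_experts: int,
                      max_num_tokens: int, N: int, K: int) -> tuple[int, ...]:
    # After this commit the row budget is world_size * max_num_tokens,
    # not (world_size // dp_size) * max_num_tokens.
    num_dp = world_size
    return (local_num_experts, max_num_tokens * num_dp, max(K, N))

# Example: 8 ranks, 2 local experts, 64 tokens per rank, K=7168, N=4096
# gives (2, 512, 7168).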

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 6 additions & 6 deletions
@@ -44,7 +44,7 @@
     pplx_hidden_dim_scale_bytes)
 if has_deep_ep():
     from .deepep_ht_prepare_finalize import DeepEPHTPrepareAndFinalize
-    from .deepep_ll_prepare_finalize import (DEEPEP_QUANT_BLOCK_SIZE,
+    from .deepep_ll_prepare_finalize import (DEEPEP_QUANT_BLOCK_SHAPE,
                                              DeepEPLLPrepareAndFinalize)
 else:
     fused_experts = None  # type: ignore
@@ -159,11 +159,11 @@ def init_prepare_finalize(self, moe: FusedMoEConfig,
 
         # Note : We may want to use FP8 dispatch even otherwise just to
         # reduce datamovement
-        assert (moe.quant_config is not None
-                and moe.quant_config.block_shape is not None)
-        use_fp8_dispatch = (
-            moe.quant_config.quant_dtype == current_platform.fp8_dtype()
-            and moe.quant_config.block_shape[1] == DEEPEP_QUANT_BLOCK_SIZE)
+        assert moe.quant_config is not None
+        use_fp8_dispatch = (moe.quant_config.quant_dtype
+                            == current_platform.fp8_dtype()
+                            and moe.quant_config.block_shape
+                            == DEEPEP_QUANT_BLOCK_SHAPE)
 
         # Note (varun): Whether to use FP8 dispatch or not needs some
         # profiling. Turning it off for now.
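Pulled out of the class, the new dispatch decision reduces to a dtype check plus a whole-shape equality. A minimal sketch, assuming torch.float8_e4m3fn stands in for current_platform.fp8_dtype() and with an illustrative helper name:

import torch

DEEPEP_QUANT_BLOCK_SHAPE = [128, 128]

def should_use_fp8_dispatch(quant_dtype: torch.dtype,
                            block_shape: list[int] | None) -> bool:
    # FP8 dispatch only applies when the model already quantizes to the
    # platform's fp8 dtype with exactly the 128x128 block shape the
    # DeepEP kernels use.
    return (quant_dtype == torch.float8_e4m3fn
            and block_shape == DEEPEP_QUANT_BLOCK_SHAPE)

Dropping the old block_shape-is-not-None assert is safe here: a None block_shape compares unequal to DEEPEP_QUANT_BLOCK_SHAPE, so the expression simply evaluates to False.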
