Skip to content

Commit 86d5a87

Browse files
tlrmchlsmth and minpeter
authored and committed
[Bugfix][EP+DP] Use pplx-kernel internode instead of intranode (vllm-project#19034)
Signed-off-by: Tyler Michael Smith <tysmith@redhat.com> Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com> Signed-off-by: minpeter <kali2005611@gmail.com>
1 parent b746dd3 commit 86d5a87

File tree

2 files changed

+9
-1
lines changed

2 files changed

+9
-1
lines changed

vllm/distributed/device_communicators/all2all.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,10 @@ def __init__(self, cpu_group):
8383
assert has_pplx, "pplx_kernels not found. Please follow https://github.com/vllm-project/vllm/blob/main/tools/ep_kernels/README.md to install pplx_kernels." # noqa
8484
super().__init__(cpu_group)
8585

86+
# TODO(tms): Disable pplx-a2a intranode as it fails with the error:
87+
# failed: cuda error /app/pplx/csrc/all_to_all/intranode.cpp:84 'invalid resource handle' # noqa
88+
self.internode = True
89+
8690
if self.internode:
8791
# inter-node communication needs nvshmem,
8892
# intra-node communication uses p2p mapping directly

vllm/model_executor/layers/fused_moe/layer.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -269,9 +269,13 @@ def init_prepare_finalize(self, moe: MoEConfig,
269269
hidden_dim_scale_bytes=(0 if moe.in_dtype.itemsize != 1 else (
270270
(moe.hidden_dim + moe.block_size - 1) // moe.block_size *
271271
torch.float32.itemsize)),
272-
group_name=all2all_manager.cpu_group.group_name,
273272
)
274273

274+
# Intranode pplx a2a takes a group name while internode does not.
275+
if not all2all_manager.internode:
276+
all_to_all_args[
277+
"group_name"] = all2all_manager.cpu_group.group_name
278+
275279
handle = all2all_manager.get_handle(all_to_all_args)
276280

277281
prepare_finalize = PplxPrepareAndFinalize(

0 commit comments

Comments
 (0)