Skip to content

Commit 5a24bd5

Browse files
conroy-cheers authored and jimpang committed
[Fix] Fall back to Gloo when NCCL backend is unavailable (vllm-project#19641)
Signed-off-by: conroy-cheers <conroy@corncheese.org>
1 parent ab2555e commit 5a24bd5

File tree

1 file changed

+7
-0
lines changed

1 file changed

+7
-0
lines changed

vllm/distributed/parallel_state.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -938,6 +938,13 @@ def init_distributed_environment(
938938
assert distributed_init_method is not None, (
939939
"distributed_init_method must be provided when initializing "
940940
"distributed environment")
941+
if not torch.distributed.is_backend_available(backend):
942+
logger.warning(
943+
"Distributed backend %s is not available; "
944+
"falling back to gloo.", backend)
945+
assert torch.distributed.is_gloo_available(), (
946+
"Fallback Gloo backend is not available.")
947+
backend = "gloo"
941948
# this backend is used for WORLD
942949
torch.distributed.init_process_group(
943950
backend=backend,

0 commit comments

Comments
 (0)