Skip to content

Commit 10be209

Browse files
Relics and russellb authored
[Bug Fix] get_distributed_init_method should get the ip from get_ip i… (#20889)
Signed-off-by: Chen Li <lcpingping@gmail.com> Co-authored-by: Russell Bryant <rbryant@redhat.com> Signed-off-by: Russell Bryant <rbryant@redhat.com>
1 parent 19c8630 commit 10be209

File tree

3 files changed

+36
-4
lines changed

3 files changed

+36
-4
lines changed

vllm/envs.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@
139139
VLLM_ROCM_QUICK_REDUCE_CAST_BF16_TO_FP16: bool = True
140140
VLLM_ROCM_QUICK_REDUCE_MAX_SIZE_BYTES_MB: Optional[int] = None
141141
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
142+
VLLM_LOOPBACK_IP: str = ""
142143

143144

144145
def get_default_cache_root():
@@ -964,6 +965,10 @@ def get_vllm_port() -> Optional[int]:
964965
# If set to 1, use the TRTLLM Decode Attention backend in flashinfer.
965966
"VLLM_USE_TRTLLM_DECODE_ATTENTION":
966967
lambda: os.getenv("VLLM_USE_TRTLLM_DECODE_ATTENTION", None),
968+
969+
# If set, this address is used as the loopback IP instead of auto-detecting one
970+
"VLLM_LOOPBACK_IP":
971+
lambda: os.getenv("VLLM_LOOPBACK_IP", ""),
967972
}
968973

969974
# --8<-- [end:env-vars-definition]

vllm/utils/__init__.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -813,6 +813,33 @@ def get_ip() -> str:
813813
return "0.0.0.0"
814814

815815

816+
def test_loopback_bind(address, family):
    """Return whether a UDP socket can be bound to ``address``.

    NOTE: despite the ``test_`` prefix this is a runtime probe, not a unit
    test.

    Args:
        address: IP address to probe, e.g. ``"127.0.0.1"`` or ``"::1"``.
        family: Socket address family matching ``address``, e.g.
            ``socket.AF_INET`` or ``socket.AF_INET6``.

    Returns:
        ``True`` if the socket was created and bound successfully, ``False``
        if any step raised ``OSError``.
    """
    try:
        # The context manager closes the socket even when bind() raises, so a
        # failed probe no longer leaks a file descriptor.
        with socket.socket(family, socket.SOCK_DGRAM) as s:
            s.bind((address, 0))  # Port 0 = auto assign
        return True
    except OSError:
        return False


# The name matches pytest's default ``test_*`` pattern; opt out of collection
# so importing this helper into a test module does not register a bogus test.
test_loopback_bind.__test__ = False
824+
825+
826+
def get_loopback_ip() -> str:
    """Return the loopback address to use for local communication.

    An explicit ``VLLM_LOOPBACK_IP`` setting takes precedence. Otherwise the
    IPv4 loopback is probed first and the IPv6 loopback second, and the first
    address that can be bound is returned.

    Raises:
        RuntimeError: If no override is set and neither loopback address can
            be bound.
    """
    configured_ip = envs.VLLM_LOOPBACK_IP
    if configured_ip:
        return configured_ip

    # No override configured: fall back to probing the standard loopback
    # addresses, preferring IPv4.
    if test_loopback_bind("127.0.0.1", socket.AF_INET):
        return "127.0.0.1"

    if test_loopback_bind("::1", socket.AF_INET6):
        return "::1"

    raise RuntimeError(
        "Neither 127.0.0.1 nor ::1 are bound to a local interface. "
        "Set the VLLM_LOOPBACK_IP environment variable explicitly.")
841+
842+
816843
def is_valid_ipv6_address(address: str) -> bool:
817844
try:
818845
ipaddress.IPv6Address(address)

vllm/v1/executor/multiproc_executor.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,8 @@
3030
from vllm.executor.multiproc_worker_utils import (
3131
_add_prefix, set_multiprocessing_worker_envs)
3232
from vllm.logger import init_logger
33-
from vllm.utils import (get_distributed_init_method, get_mp_context,
34-
get_open_port)
33+
from vllm.utils import (get_distributed_init_method, get_loopback_ip,
34+
get_mp_context, get_open_port)
3535
from vllm.v1.executor.abstract import Executor, FailureCallback
3636
from vllm.v1.outputs import ModelRunnerOutput
3737
from vllm.worker.worker_base import WorkerWrapperBase
@@ -63,9 +63,9 @@ def _init_executor(self) -> None:
6363

6464
# Multiprocessing-based executor does not support multi-node setting.
6565
# Since it only works for single node, we can use the loopback address
66-
# 127.0.0.1 for communication.
66+
# returned by get_loopback_ip() for communication.
6767
distributed_init_method = get_distributed_init_method(
68-
"127.0.0.1", get_open_port())
68+
get_loopback_ip(), get_open_port())
6969

7070
# Initialize worker and set up message queues for SchedulerOutputs
7171
# and ModelRunnerOutputs

0 commit comments

Comments
 (0)