Commit e48f659

bigPYJ1151 authored and minpeter committed
[BugFix][CPU] Fix x86 SHM distributed module initialization (vllm-project#18536)
Signed-off-by: jiang.li <jiang1.li@intel.com>
Signed-off-by: minpeter <kali2005611@gmail.com>
1 parent 9cd00b2 commit e48f659

1 file changed: +6 −2 lines changed


vllm/distributed/device_communicators/cpu_communicator.py

Lines changed: 6 additions & 2 deletions
@@ -22,8 +22,10 @@ def __init__(self,
         super().__init__(cpu_group, device, device_group, unique_name)
         self.dist_module = torch.distributed
 
-        if (current_platform.get_cpu_architecture() == CpuArchEnum.X86) \
-                and hasattr(torch.ops._C, "init_shm_manager"):
+        if (current_platform.get_cpu_architecture()
+                == CpuArchEnum.X86) and hasattr(
+                    torch.ops._C,
+                    "init_shm_manager") and unique_name.startswith("tp"):
             self.dist_module = _CPUSHMDistributed(self)
 
     def all_reduce(self, input_):
@@ -96,6 +98,8 @@ class _CPUSHMDistributed:
 
     def __init__(self, communicator: CpuCommunicator):
         instance_identifier = os.environ["VLLM_DIST_IDENT"]
+        unique_name = communicator.unique_name
+        instance_identifier = f"{instance_identifier}-{unique_name}"
         self.communicator = communicator
 
         group_ranks = [str(rank) for rank in self.communicator.ranks]
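In short, the patch does two things: it routes collectives through the x86 shared-memory (SHM) backend only for tensor-parallel groups (unique_name starting with "tp"), and it suffixes the SHM instance identifier with the group's unique_name so distinct process groups no longer collide on the same shared-memory segment. The standalone sketch below is illustrative only; _is_x86(), _shm_op_available(), and the sample group names are stand-ins, not vLLM APIs.

# Illustrative sketch of the patched logic; the helpers here are stand-ins,
# not vLLM code.
import os
import platform


def _is_x86() -> bool:
    # Stand-in for: current_platform.get_cpu_architecture() == CpuArchEnum.X86
    return platform.machine().lower() in ("x86_64", "amd64", "i386", "i686")


def _shm_op_available() -> bool:
    # Stand-in for: hasattr(torch.ops._C, "init_shm_manager"); the real check
    # passes only when vLLM's CPU extension with the SHM custom ops is loaded.
    return False


def use_shm_backend(unique_name: str) -> bool:
    # After the fix, only tensor-parallel groups on x86 with the custom op
    # available switch to _CPUSHMDistributed; all other groups keep plain
    # torch.distributed.
    return _is_x86() and _shm_op_available() and unique_name.startswith("tp")


def shm_instance_identifier(unique_name: str) -> str:
    # After the fix, the identifier read from VLLM_DIST_IDENT is suffixed with
    # the group's unique_name, giving each group its own SHM segment name.
    base = os.environ.get("VLLM_DIST_IDENT", "vllm")
    return f"{base}-{unique_name}"


if __name__ == "__main__":
    for name in ("tp", "pp", "world"):  # hypothetical group names
        print(name, use_shm_backend(name), shm_instance_identifier(name))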
