Skip to content

Commit d5c9c69

Browse files
committed
Update driver_pytorch.py to fix NCCL_SOCKET_IFNAME issue.
1 parent d85a4b9 commit d5c9c69

File tree

1 file changed

+10
-0
lines changed

1 file changed

+10
-0
lines changed

ads/jobs/templates/driver_pytorch.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -270,6 +270,7 @@ def find_self_ip(self):
270270
ip = snics[0].address
271271
if ipaddress.ip_address(ip) in ipaddress.ip_network(cidr):
272272
logger.info("Node IP address: %s", ip)
273+
# Specify the network interface for NCCL/GLOO
273274
os.environ["GLOO_SOCKET_IFNAME"] = interface
274275
os.environ["NCCL_SOCKET_IFNAME"] = interface
275276
return ip
@@ -579,13 +580,22 @@ def save_deepspeed_env(self):
579580
# as the .deepspeed_env file is parsed line by line.
580581
if not v or "\n" in v:
581582
continue
583+
# Ignore variables that are node specific
584+
# The network interface name for each job run is a unique string, e.g. ens300f0v1604
585+
if k in ["NCCL_SOCKET_IFNAME", "GLOO_SOCKET_IFNAME", "JOB_RUN_OCID"]:
586+
continue
582587
# Quote the value if it contains space
583588
# Environment variable containing space may not be exported correctly when using pdsh
584589
# https://github.com/microsoft/DeepSpeed/blob/v0.9.2/deepspeed/launcher/multinode_runner.py#L79
585590
if " " in v:
586591
v = shlex.quote(v)
587592

588593
f.write(f"{k}={v}\n")
594+
# The following are required for specifying the network interface to be used by NCCL/GLOO
595+
# The value should be the prefix of the expected network interface name
596+
# https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-socket-ifname
597+
f.write("NCCL_SOCKET_IFNAME=ens\n")
598+
f.write("GLOO_SOCKET_IFNAME=ens\n")
589599
logger.debug("Environment variables saved to %s", self.ENV_FILE)
590600
self.run_command(f"cat {self.ENV_FILE}")
591601

0 commit comments

Comments
 (0)