Skip to content

Commit 72ebdc5

Browse files
authored
Fix bugs for PyTorchDistributedRuntime (#271)
2 parents 294c44a + be6334d commit 72ebdc5

File tree

3 files changed

+13
-3
lines changed

3 files changed

+13
-3
lines changed

ads/jobs/builders/runtimes/pytorch_runtime.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -206,7 +206,7 @@ def run(self, dsc_job, **kwargs):
206 | 206 |   envs = {}
207 | 207 |   # Huggingface accelerate requires machine rank
208 | 208 |   envs["RANK"] = str(i)
209 |     | - envs["WORLD_SIZE"] = replicas
    | 209 | + envs["WORLD_SIZE"] = str(replicas)
210 | 210 |   if main_run:
211 | 211 |   envs["MAIN_JOB_RUN_OCID"] = main_run.id
212 | 212 |   name = replica_kwargs.get("display_name")

ads/jobs/templates/driver_pytorch.py

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -270,6 +270,7 @@ def find_self_ip(self):
270 | 270 |   ip = snics[0].address
271 | 271 |   if ipaddress.ip_address(ip) in ipaddress.ip_network(cidr):
272 | 272 |   logger.info("Node IP address: %s", ip)
    | 273 | + # Specify the network interface for NCCL/GLOO
273 | 274 |   os.environ["GLOO_SOCKET_IFNAME"] = interface
274 | 275 |   os.environ["NCCL_SOCKET_IFNAME"] = interface
275 | 276 |   return ip
@@ -579,13 +580,22 @@ def save_deepspeed_env(self):
579 | 580 |   # as the .deepspeed_env file is parsed line by line.
580 | 581 |   if not v or "\n" in v:
581 | 582 |   continue
    | 583 | + # Ignore variables that are node specific
    | 584 | + # The network interface name for each job run is a unique string, e.g. ens300f0v1604
    | 585 | + if k in ["NCCL_SOCKET_IFNAME", "GLOO_SOCKET_IFNAME", "JOB_RUN_OCID"]:
    | 586 | + continue
582 | 587 |   # Quote the value if it contains space
583 | 588 |   # Environment variable containing space may not be exported correctly when using pdsh
584 | 589 |   # https://github.com/microsoft/DeepSpeed/blob/v0.9.2/deepspeed/launcher/multinode_runner.py#L79
585 | 590 |   if " " in v:
586 | 591 |   v = shlex.quote(v)
587 | 592 |
588 | 593 |   f.write(f"{k}={v}\n")
    | 594 | + # The following are required for specifying the network interface to be used by NCCL/GLOO
    | 595 | + # The value should be the prefix of the expected network interface name
    | 596 | + # https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-socket-ifname
    | 597 | + f.write("NCCL_SOCKET_IFNAME=ens\n")
    | 598 | + f.write("GLOO_SOCKET_IFNAME=ens\n")
589 | 599 |   logger.debug("Environment variables saved to %s", self.ENV_FILE)
590 | 600 |   self.run_command(f"cat {self.ENV_FILE}")
591 | 601 |

tests/unitary/default_setup/jobs/test_jobs_pytorch_ddp.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -137,13 +137,13 @@ def test_create_job_runs(self, patched_run, *args):
137 | 137 |   [
138 | 138 |   {
139 | 139 |   "display_name": "None-0",
140 |     | - "environment_variables": {"RANK": "0", "WORLD_SIZE": 2},
    | 140 | + "environment_variables": {"RANK": "0", "WORLD_SIZE": "2"},
141 | 141 |   },
142 | 142 |   {
143 | 143 |   "display_name": "None-1",
144 | 144 |   "environment_variables": {
145 | 145 |   "RANK": "1",
146 |     | - "WORLD_SIZE": 2,
    | 146 | + "WORLD_SIZE": "2",
147 | 147 |   "MAIN_JOB_RUN_OCID": test_ocid,
148 | 148 |   },
149 | 149 |   },

0 commit comments

Comments
 (0)