@@ -270,6 +270,7 @@ def find_self_ip(self):
270
270
ip = snics [0 ].address
271
271
if ipaddress .ip_address (ip ) in ipaddress .ip_network (cidr ):
272
272
logger .info ("Node IP address: %s" , ip )
273
+ # Specify the network interface for NCCL/GLOO
273
274
os .environ ["GLOO_SOCKET_IFNAME" ] = interface
274
275
os .environ ["NCCL_SOCKET_IFNAME" ] = interface
275
276
return ip
@@ -579,13 +580,22 @@ def save_deepspeed_env(self):
579
580
# as the .deepspeed_env file is parsed line by line.
580
581
if not v or "\n " in v :
581
582
continue
583
+ # Ignore variables that are node specific
584
+ # The network interface name for each job run is a unique string, e.g. ens300f0v1604
585
+ if k in ["NCCL_SOCKET_IFNAME" , "GLOO_SOCKET_IFNAME" , "JOB_RUN_OCID" ]:
586
+ continue
582
587
# Quote the value if it contains space
583
588
# Environment variable containing space may not be exported correctly when using pdsh
584
589
# https://github.com/microsoft/DeepSpeed/blob/v0.9.2/deepspeed/launcher/multinode_runner.py#L79
585
590
if " " in v :
586
591
v = shlex .quote (v )
587
592
588
593
f .write (f"{ k } ={ v } \n " )
594
+ # The following are required for specifying the network interface to be used by NCCL/GLOO
595
+ # The value should be the prefix of the expected network interface name
596
+ # https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html#nccl-socket-ifname
597
+ f .write ("NCCL_SOCKET_IFNAME=ens\n " )
598
+ f .write ("GLOO_SOCKET_IFNAME=ens\n " )
589
599
logger .debug ("Environment variables saved to %s" , self .ENV_FILE )
590
600
self .run_command (f"cat { self .ENV_FILE } " )
591
601
0 commit comments