Skip to content

Commit 7257dfb

Browse files
committed
Stop the job run when DeepSpeed pdsh exited with code 1.
1 parent ef75530 commit 7257dfb

File tree

1 file changed

+7
-0
lines changed

1 file changed

+7
-0
lines changed

ads/jobs/templates/driver_utils.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -397,6 +397,7 @@ def run_command(
397 397   shell=True,
398 398   )
399 399   # Stream the outputs
    400 + logger.debug("Streaming command output from subprocess %s", process.pid)
400 401   while True:
401 402   output = process.stdout.readline()
402 403   if process.poll() is not None and output == b"":
@@ -411,9 +412,15 @@ def run_command(
411 412   # logging will add line break
412 413   msg = msg.rstrip("\n")
413 414   logger.log(level=level, msg=msg)
    415 + if "pdsh@" in msg and "ssh exited with exit code 1" in msg:
    416 + print("DeepSpeed Failed.")
    417 + sys.exit(1)
414 418   # Add a small delay so that
415 419   # outputs from the subsequent code will have different timestamp for oci logging
416 420   time.sleep(0.02)
    421 + logger.debug(
    422 + "subprocess %s returned exit code %s", process.pid, process.returncode
    423 + )
417 424   if check and process.returncode != 0:
418 425   # If there is an error, exit the main process with the same return code.
419 426   sys.exit(process.returncode)

0 commit comments

Comments
 (0)