Skip to content

Commit 6ccf776

Browse files
authored
Don't export to collections for all workers with unsupported distributed training (#263)
1 parent 6b114d7 commit 6ccf776

File tree

1 file changed

+13
-5
lines changed

1 file changed

+13
-5
lines changed

smdebug/core/hook.py

Lines changed: 13 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -436,12 +436,13 @@ def _initialize_writers(self, only_initialize_if_missing=False) -> None:
436436
self.first_process = True
437437
self.logger.info(f"Hook is writing from the hook with pid: {os.getpid()}\n")
438438
else:
439+
if self.first_process is None:
440+
self.logger.warn(
441+
f"Unsupported Distributed Training Strategy Detected. \
442+
Sagemaker-Debugger will only write from one process. \
443+
The process with pid: {os.getpid()} will not be writing any data. \n"
444+
)
439445
self.first_process = False
440-
self.logger.warn(
441-
f"Unsupported Distributed Training Strategy Detected.\n\
442-
Sagemaker-Debugger will only write from one process.\n\
443-
The process with pid: {os.getpid()} will not be writing any data. \n"
444-
)
445446
return
446447

447448
if self.save_all_workers is False:
@@ -546,6 +547,13 @@ def set_mode(self, mode):
546547

547548
def export_collections(self):
548549
num_workers = self._get_num_workers()
550+
if num_workers == 1 and self.first_process is False:
551+
self.logger.warn(
552+
f"Unsupported Distributed Training Strategy Detected. \
553+
Sagemaker-Debugger will only write from one process. \
554+
The process with pid: {os.getpid()} will not be writing any data. \n"
555+
)
556+
return
549557
if self.save_all_workers is False:
550558
if self.chief_worker != self.worker:
551559
return

0 commit comments

Comments
 (0)