Commit f5177b8

Do not save checkpoint when data ran out
@fegin said:

> TorchTitan currently doesn't perform force checkpoint if data is depleted. We can fix this but I suggest that we don't do this in this PR. (See pytorch#1238 (comment).)
1 parent 9402673 commit f5177b8

torchtitan/train.py

Lines changed: 3 additions & 8 deletions
@@ -494,14 +494,9 @@ def train(self):
             self.gc_handler.run(self.step)
             data_ran_out = self.train_step(data_iterator)
             if data_ran_out:
-                logger.info(
-                    "Ran out of data; last step was canceled. "
-                    "Saving final checkpoint and exiting."
-                )
-            self.checkpointer.save(
-                self.step,
-                force=(self.step == job_config.training.steps or data_ran_out),
-            )
+                logger.info("Ran out of data; last step was canceled.")
+                break
+            self.checkpointer.save(self.step, force=(self.step == job_config.training.steps))
 
             # signal the profiler that the next profiling step has started
             if torch_profiler:
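To make the new behavior concrete, below is a minimal, self-contained sketch (not TorchTitan's actual code) of the training-loop control flow after this commit: when the data iterator is exhausted, the loop logs a message and breaks out without forcing a checkpoint, and a checkpoint is only forced when the configured final step is reached. The ToyTrainer class, its constructor arguments, and the print-based logging are illustrative assumptions; only the train_step / checkpointer.save(step, force=...) shape comes from the diff above.

# Hypothetical sketch of the post-commit control flow; everything except the
# train_step / checkpointer.save(step, force=...) shape is assumed for illustration.
class ToyTrainer:
    def __init__(self, checkpointer, total_steps):
        self.checkpointer = checkpointer
        self.total_steps = total_steps
        self.step = 0

    def train_step(self, data_iterator):
        """Run one step; return True if the iterator ran out of data."""
        try:
            batch = next(data_iterator)
        except StopIteration:
            return True
        # ... forward / backward / optimizer update on `batch` would go here ...
        return False

    def train(self, data_iterator):
        while self.step < self.total_steps:
            self.step += 1
            data_ran_out = self.train_step(data_iterator)
            if data_ran_out:
                # After this commit: no forced checkpoint on data exhaustion.
                print("Ran out of data; last step was canceled.")
                break
            # A checkpoint is forced only on the configured final step.
            self.checkpointer.save(self.step, force=(self.step == self.total_steps))

With this structure, a run that exhausts its data before reaching the final step exits without a final checkpoint; per @fegin's note quoted above, adding a forced checkpoint on data depletion is deliberately left to a later change.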
