
Commit f4048f8

Make checkpoint fail_fast feature optional (#1310)
While the fail_fast checkpointing feature is useful, it can also waste time and storage when the cluster has already been verified with TorchTitan. This PR makes the fail_fast feature optional, defaulting to False.
1 parent f7084fc commit f4048f8
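
In effect, the previously unconditional step-1 save becomes opt-in. A minimal sketch of the new gating condition, paraphrased from the `_should_save` change in the diff below (the helper name here is hypothetical, not part of the commit):

```python
def save_at_first_step(curr_step: int, enable_first_step_checkpoint: bool) -> bool:
    """Hypothetical paraphrase of the changed step-1 gate in _should_save."""
    # Before this commit the step-1 save was unconditional (curr_step == 1);
    # now it also requires the new flag, which defaults to False.
    return curr_step == 1 and enable_first_step_checkpoint
```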

File tree: 3 files changed, +88 -15 lines

- tests/unit_tests/test_checkpoint.py
- torchtitan/components/checkpoint.py
- torchtitan/config_manager.py

tests/unit_tests/test_checkpoint.py
71 additions, 8 deletions

```diff
@@ -16,6 +16,7 @@
 import torch.nn as nn
 from torch.utils.data import DataLoader
 from torchtitan.components.checkpoint import CheckpointManager, MODEL
+from torchtitan.config_manager import Checkpoint as CheckpointConfig
 
 
 class FakeOptimizersContainer:
@@ -81,7 +82,7 @@ def fake_async_save(*args, **kwargs):
 class DummyJobConfig:
     def __init__(self, job):
         self.job = job
-        self.checkpoint = SimpleNamespace(
+        self.checkpoint = CheckpointConfig(
             enable_checkpoint=True,
             async_mode="disabled",
             folder="",
@@ -112,7 +113,7 @@ def setUp(self):
         self.data_loader = FakeDataLoader()
         self.ft_manager = DummyFTManager()
 
-        ckpt_cfg = SimpleNamespace(
+        ckpt_cfg = CheckpointConfig(
             enable_checkpoint=True,
             async_mode="DISABLED",
             folder="",
@@ -325,17 +326,17 @@ def test_interval_respects_interval(self, mock_load, mock_save, mock_rank):
             ft_manager=self.ft_manager,
         )
         manager.save(curr_step=1)
-        self.assertEqual(mock_save.call_count, 1)
+        self.assertEqual(mock_save.call_count, 0)
         manager.save(curr_step=2)
-        self.assertEqual(mock_save.call_count, 1)
+        self.assertEqual(mock_save.call_count, 0)
         manager.save(curr_step=2, force=True)
-        self.assertEqual(mock_save.call_count, 2)
+        self.assertEqual(mock_save.call_count, 1)
         manager.save(curr_step=3)
-        self.assertEqual(mock_save.call_count, 3)
+        self.assertEqual(mock_save.call_count, 2)
         manager.save(curr_step=4)
-        self.assertEqual(mock_save.call_count, 3)
+        self.assertEqual(mock_save.call_count, 2)
         manager.save(curr_step=4, force=True)
-        self.assertEqual(mock_save.call_count, 4)
+        self.assertEqual(mock_save.call_count, 3)
         manager.close()
 
     @mock.patch("torch.distributed.get_rank", return_value=0)
@@ -471,6 +472,68 @@ def test_ft_async_save_calls_async_wait(
         self.assertIsNotNone(manager.async_future)
         manager.async_future.result.assert_not_called()
 
+    @mock.patch("torch.distributed.get_rank", return_value=0)
+    @mock.patch("torchtitan.components.checkpoint.dcp.save")
+    def test_enable_first_step_checkpoint(self, mock_save, mock_rank):
+        """
+        Test that enable_first_step_checkpoint triggers checkpoint save at step 1.
+        """
+        mock_save.side_effect = self.fake_save
+
+        # Test with enable_first_step_checkpoint=False (default case)
+        cfg = self.job_config.checkpoint
+        cfg.interval = 10  # Set interval to 10 so step 1 wouldn't normally trigger save
+        cfg.keep_latest_k = 0  # Disable purging to avoid confusion
+
+        manager = CheckpointManager(
+            dataloader=self.data_loader,
+            model_parts=self.model_parts,
+            optimizers=self.optimizers,
+            lr_schedulers=self.lr_schedulers,
+            states=self.states,
+            job_config=self.job_config,
+            ft_manager=self.ft_manager,
+        )
+
+        # Step 1 should not trigger save when enable_first_step_checkpoint=False
+        # and not at interval
+        manager.save(curr_step=1)
+        self.assertEqual(mock_save.call_count, 0)
+
+        # Step 10 should trigger save due to interval
+        manager.save(curr_step=10)
+        self.assertEqual(mock_save.call_count, 1)
+
+        manager.close()
+
+        # Test with enable_first_step_checkpoint=True
+        mock_save.reset_mock()
+        cfg.enable_first_step_checkpoint = True
+
+        manager2 = CheckpointManager(
+            dataloader=self.data_loader,
+            model_parts=self.model_parts,
+            optimizers=self.optimizers,
+            lr_schedulers=self.lr_schedulers,
+            states=self.states,
+            job_config=self.job_config,
+            ft_manager=self.ft_manager,
+        )
+
+        # Step 1 should trigger save due to enable_first_step_checkpoint=True
+        manager2.save(curr_step=1)
+        self.assertEqual(mock_save.call_count, 1)
+
+        # Step 2 should not trigger save (not at interval and not forced)
+        manager2.save(curr_step=2)
+        self.assertEqual(mock_save.call_count, 1)
+
+        # Step 10 should trigger save due to interval
+        manager2.save(curr_step=10)
+        self.assertEqual(mock_save.call_count, 2)
+
+        manager2.close()
+
 
 if __name__ == "__main__":
     unittest.main()
```

torchtitan/components/checkpoint.py
9 additions, 7 deletions

```diff
@@ -273,11 +273,19 @@ def load_state_dict(state_dict):
         self.staging_stream = torch.cuda.Stream() if self.enable_staging else None
 
         self.folder = os.path.join(job_config.job.dump_folder, ckpt_config.folder)
+
+        # Checkpoint policy related fields.
         self.initial_load_path = ckpt_config.initial_load_path
         self.initial_load_model_weights_only = (
             ckpt_config.initial_load_model_weights_only
         )
+        self.last_save_model_weights_only = ckpt_config.last_save_model_weights_only
+        self.export_dtype = TORCH_DTYPE_MAP[ckpt_config.export_dtype]
+        self.exclude_from_loading = ckpt_config.exclude_from_loading
         self.interval = ckpt_config.interval
+        self.enable_first_step_checkpoint = ckpt_config.enable_first_step_checkpoint
+
+        # Async checkpoint related fields.
         async_mode = ckpt_config.async_mode.lower()
         if async_mode == AsyncMode.ASYNC or self.ft_manager:
             self.pg = dist.new_group(backend="gloo")
@@ -297,10 +305,6 @@ def load_state_dict(state_dict):
         else:
             self.purge_thread = None
 
-        self.last_save_model_weights_only = ckpt_config.last_save_model_weights_only
-        self.export_dtype = TORCH_DTYPE_MAP[ckpt_config.export_dtype]
-        self.exclude_from_loading = ckpt_config.exclude_from_loading
-
         self.mp = None
         self.async_future = None
         if async_mode == AsyncMode.DISABLED:
@@ -616,9 +620,7 @@ def _should_save(self, curr_step: int, force: bool = False) -> bool:
         if not self.enable_checkpoint:
             return False
 
-        # Force saving a checkpoint at step 1 to fail fast if checkpointer is not
-        # compatible with the cluster.
-        if curr_step == 1:
+        if curr_step == 1 and self.enable_first_step_checkpoint:
             return True
 
         if force:
```
torchtitan/config_manager.py
8 additions, 0 deletions

```diff
@@ -459,6 +459,14 @@ class Checkpoint:
     This will load the model only, excluding the specified keys.
     """
 
+    enable_first_step_checkpoint: bool = False
+    """
+    Enable the checkpoint save at first step. This will save a checkpoint immediately
+    after the first step to ensure checkpointing functions correctly. This is useful
+    when running on a new cluster or storage to verify checkpointing without waiting
+    for many steps or checkpointing too frequently. The default value is False.
+    """
+
 
 @dataclass
 class ActivationCheckpoint:
```
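
Because `Checkpoint` is a dataclass whose fields carry defaults, flipping the new flag is a one-field change. A minimal usage sketch, assuming the dataclass is default-constructible (which holds if every field has a default, as the ones shown here do):

```python
from torchtitan.config_manager import Checkpoint

# The new flag defaults to False, so existing configs keep their behavior.
assert Checkpoint().enable_first_step_checkpoint is False

# Opt back into the pre-#1310 fail-fast behavior explicitly.
ckpt_cfg = Checkpoint(enable_checkpoint=True, enable_first_step_checkpoint=True)
```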
