Commit adae6b6

[RFC] Make last_save_model_weights_only default to True (#1336)
This is a BC-breaking change, but it should be the right way to save the last-step checkpoint.
1 parent d9cc6b4 · commit adae6b6
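Since the new default saves only model weights at the final step, jobs that rely on the last checkpoint to resume training must now opt out explicitly. A minimal sketch of both behaviors, assuming the `Checkpoint` config dataclass from `torchtitan/config_manager.py` shown in the diff below (the import path is assumed):

    from torchtitan.config_manager import Checkpoint  # assumed import path

    cfg = Checkpoint()
    assert cfg.last_save_model_weights_only is True  # new default after this commit

    # To keep the old behavior (a full final checkpoint with model, optimizer,
    # and train_state that can resume training), override the field explicitly:
    cfg = Checkpoint(last_save_model_weights_only=False)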

File tree

7 files changed, +22 −22 lines

tests/unit_tests/test_checkpoint.py

Lines changed: 8 additions & 8 deletions
@@ -329,13 +329,13 @@ def test_interval_respects_interval(self, mock_load, mock_save, mock_rank):
         self.assertEqual(mock_save.call_count, 0)
         manager.save(curr_step=2)
         self.assertEqual(mock_save.call_count, 0)
-        manager.save(curr_step=2, force=True)
+        manager.save(curr_step=2, last_step=True)
         self.assertEqual(mock_save.call_count, 1)
         manager.save(curr_step=3)
         self.assertEqual(mock_save.call_count, 2)
         manager.save(curr_step=4)
         self.assertEqual(mock_save.call_count, 2)
-        manager.save(curr_step=4, force=True)
+        manager.save(curr_step=4, last_step=True)
         self.assertEqual(mock_save.call_count, 3)
         manager.close()

@@ -358,7 +358,7 @@ def test_last_save_model_weights_only_and_initial_load_model_weights_only(
             job_config=self.job_config,
             ft_manager=self.ft_manager,
         )
-        manager1.save(curr_step=1, force=True)
+        manager1.save(curr_step=1, last_step=True)
         path1 = os.path.join(self.test_folder, "step-1")
         self.assertTrue(os.path.isdir(path1))
         # Phase 2: initial load from step-1

@@ -383,7 +383,7 @@ def test_last_save_model_weights_only_and_initial_load_model_weights_only(
         args1, kwargs1 = mock_load.call_args
         self.assertEqual(kwargs1.get("checkpoint_id"), path1)
         # Phase 3: save new step under default folder, then load that
-        manager2.save(curr_step=2, force=True)
+        manager2.save(curr_step=2, last_step=True)
         # Default folder is test_folder, so step-2 under that
         step2_dir = os.path.join(self.test_folder, "step-2")
         self.assertTrue(os.path.isdir(step2_dir))

@@ -419,12 +419,12 @@ def test_async_save_calls_async_wait(self, mock_async_save, mock_new_group):
         )

         # First save schedules async
-        manager.save(curr_step=10, force=False)
+        manager.save(curr_step=10, last_step=False)
         future = manager.async_future
         future.result.assert_not_called()

         # Second save should wait
-        manager.save(curr_step=20, force=False)
+        manager.save(curr_step=20, last_step=False)
         future.result.assert_called_once()

         # New future created

@@ -462,12 +462,12 @@ def test_ft_async_save_calls_async_wait(

         # Initially no future
         self.assertIsNone(manager.async_future)
-        manager.save(curr_step=5, force=False)
+        manager.save(curr_step=5, last_step=False)
         self.assertIsNotNone(manager.async_future)

         manager.async_future.result.assert_not_called()
         prev_future = manager.async_future
-        manager.save(curr_step=6, force=False)
+        manager.save(curr_step=6, last_step=False)
         prev_future.result.assert_called_once()
         self.assertIsNotNone(manager.async_future)
         manager.async_future.result.assert_not_called()

torchtitan/components/checkpoint.py

Lines changed: 7 additions & 7 deletions
@@ -349,17 +349,17 @@ def close(self):
             self.purge_thread.join()

     @torch.no_grad()
-    def save(self, curr_step: int, force: bool = False) -> None:
+    def save(self, curr_step: int, last_step: bool = False) -> None:
         """Save the checkpoint for the current step.

-        This function will save the checkpoint for the current step. If ``force`` is
+        This function will save the checkpoint for the current step. If ``last_step`` is
         true, it will save the checkpoint even if the interval has not been reached.
         This only happens when train_state.step == job_config.training.steps, or
         for initial seed checkpoint.

         Args:
             curr_step (int): The current step.
-            force (bool, optional): Whether to force save the checkpoint. Defaults to False.
+            last_step (bool, optional): Whether this is the last step of training.

         Returns:
             None

@@ -368,7 +368,7 @@ def save(self, curr_step: int, force: bool = False) -> None:
         if self.ft_manager:
             self._ft_save(curr_step)

-        if not self._should_save(curr_step, force):
+        if not self._should_save(curr_step, last_step):
             return

         begin = time.monotonic()

@@ -379,7 +379,7 @@ def save(self, curr_step: int, force: bool = False) -> None:
         # This GC is called for async checkpoint as it is useless to do
         # GC right after async_save -- the CPU memory is not able to be
         # freed until _async_wait()
-        if force:
+        if last_step:
             self._save_last_step(curr_step)
         elif self.async_mode == AsyncMode.ASYNC_WITH_PINNED_MEM:
             GarbageCollection.collect("GC collection invoked by checkpointer.")

@@ -616,14 +616,14 @@ def _save_last_step(self, curr_step: int) -> None:

         save_with_gc(self.states, checkpoint_id=self._create_checkpoint_id(curr_step))

-    def _should_save(self, curr_step: int, force: bool = False) -> bool:
+    def _should_save(self, curr_step: int, last_step: bool = False) -> bool:
         if not self.enable_checkpoint:
             return False

         if curr_step == 1 and self.enable_first_step_checkpoint:
             return True

-        if force:
+        if last_step:
             return True

         if curr_step % self.interval == 0:
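
For reference, a minimal sketch of the renamed call site. It assumes an already-constructed CheckpointManager bound to `checkpointer` and a `total_steps` value from the job config; `train_one_step` is a hypothetical helper. Passing `last_step=True` bypasses the interval check in `_should_save` and routes the save through `_save_last_step`:

    # Sketch only: `checkpointer`, `total_steps`, and `train_one_step`
    # are assumed to exist in the surrounding training script.
    for step in range(1, total_steps + 1):
        train_one_step()  # hypothetical single-step training helper
        checkpointer.save(curr_step=step, last_step=(step == total_steps))
    checkpointer.close()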

torchtitan/config_manager.py

Lines changed: 2 additions & 2 deletions
@@ -404,13 +404,13 @@ class Checkpoint:
     interval: int = 500
     """Checkpointing interval in steps."""

-    last_save_model_weights_only: bool = False
+    last_save_model_weights_only: bool = True
     """
     When last_save_model_weights_only=True, only model weights will be saved at the end of training,
     the last save. With this, checkpoints can be loaded using `torch.load(..., weights_only=True)`
     after conversion. When last_save_model_weights_only=False, the full checkpoint will be saved.
     A full checkpoint includes model, optimizer and train_state, which can be used to resume training.
-    The default value is false.
+    The default value is True.
     """

     export_dtype: Literal["float16", "bfloat16", "float32"] = "float32"
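
As the field's docstring notes, a weights-only final checkpoint can be loaded with `torch.load(..., weights_only=True)` once it has been converted out of DCP format. A minimal sketch, with a hypothetical converted file path and a stand-in model definition:

    import torch
    import torch.nn as nn

    model = nn.Linear(8, 8)  # stand-in for the real model definition
    # Hypothetical path to a checkpoint already converted from DCP to torch.save format:
    state_dict = torch.load("converted_checkpoint.pt", weights_only=True)
    model.load_state_dict(state_dict)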

torchtitan/experiments/flux/train.py

Lines changed: 1 addition & 1 deletion
@@ -221,7 +221,7 @@ def train_step(
         assert (
             config.checkpoint.enable_checkpoint
         ), "Must enable checkpointing when creating a seed checkpoint."
-        trainer.checkpointer.save(curr_step=0, force=True)
+        trainer.checkpointer.save(curr_step=0, last_step=True)
         logger.info("Created seed checkpoint")
     else:
         trainer.train()

torchtitan/experiments/llama4/scripts/convert_hf_to_dcp_with_gpus.py

Lines changed: 1 addition & 1 deletion
@@ -536,7 +536,7 @@ def state_dict(self) -> dict[str, torch.Tensor]:
         trainer.checkpointer.states[MODEL] = DummyModel(state_dict)
         trainer.checkpointer.last_save_model_weights_only = True
         trainer.checkpointer.export_dtype = next(iter(state_dict.values())).dtype
-        trainer.checkpointer.save(curr_step=0, force=True)
+        trainer.checkpointer.save(curr_step=0, last_step=True)
         time.sleep(2)
     finally:
         pass

torchtitan/experiments/llama4/scripts/convert_meta_to_dcp_with_gpus.py

Lines changed: 1 addition & 1 deletion
@@ -531,7 +531,7 @@ def state_dict(self) -> dict[str, torch.Tensor]:
         trainer.checkpointer.states[MODEL] = DummyModel(state_dict)
         trainer.checkpointer.last_save_model_weights_only = True
         trainer.checkpointer.export_dtype = next(iter(state_dict.values())).dtype
-        trainer.checkpointer.save(curr_step=0, force=True)
+        trainer.checkpointer.save(curr_step=0, last_step=True)
         time.sleep(2)
     finally:
         pass

torchtitan/train.py

Lines changed: 2 additions & 2 deletions
@@ -494,7 +494,7 @@ def train(self):
                 logger.warning("Ran out of data; last step was canceled.")
                 break
             self.checkpointer.save(
-                self.step, force=(self.step == job_config.training.steps)
+                self.step, last_step=(self.step == job_config.training.steps)
             )

             # signal the profiler that the next profiling step has started

@@ -547,7 +547,7 @@ def close(self) -> None:
         assert (
             config.checkpoint.enable_checkpoint
         ), "Must enable checkpointing when creating a seed checkpoint."
-        trainer.checkpointer.save(curr_step=0, force=True)
+        trainer.checkpointer.save(curr_step=0, last_step=True)
         logger.info("Created seed checkpoint")
     else:
         trainer.train()
