
Commit 5ae4823

fixes
1 parent f6e0022 commit 5ae4823

File tree

1 file changed (+49, -46 lines)


torchtitan/components/checkpoint.py

Lines changed: 49 additions & 46 deletions
@@ -98,45 +98,6 @@ class SaveDone:
     pass
 
 
-def checkpoint_mp(recv: mp.Queue, send: mp.Queue):
-    """Process to save the checkpoint in the background.
-
-    This is only used when async_checkpoint_with_pinned_memory is enabled.
-
-    Args:
-        recv (mp.Queue): The queue to receive the state_dict and Terminate signal.
-        send (mp.Queue): The queue to send the SaveDone signal.
-    """
-    init_logger()
-    os.environ["MASTER_PORT"] = str(int(os.environ["MASTER_PORT"]) + 2)
-    os.environ["TORCHELASTIC_USE_AGENT_STORE"] = "False"
-    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
-    dist.init_process_group()
-    try:
-        while True:
-            logger.debug("Checkpoint background process is done.")
-            send.put(SaveDone())
-            logger.debug("Wait for the new state_dict.")
-            obj = recv.get()
-            logger.debug("Received the new state_dict.")
-            if isinstance(obj, Terminate):
-                logger.info("Terminating the checkpoint background process.")
-                return
-            assert isinstance(obj, tuple)
-            begin = time.monotonic()
-            state, checkpoint_id = obj
-            save_with_gc(
-                state, checkpoint_id=checkpoint_id, hf_safetensors_format=False
-            )
-            logger.info(
-                "Finish saving the checkpoint in the background process in %.2f seconds.",
-                time.monotonic() - begin,
-            )
-    finally:
-        logger.info("Destroying the process group.")
-        dist.destroy_process_group()
-
-
 @torch.no_grad()
 def dcp_save(
     state_dict: dict[str, Any],
@@ -145,13 +106,13 @@ def dcp_save(
     hf_safetensors_format: bool,
     pg: Optional[dist.ProcessGroup] = None,
 ) -> Optional[Future]:
-    """Save the checkpoint for the current step.
-
-
+    """Save the checkpoint with dcp.
     Args:
         state_dict (dict): The state dict to save.
         checkpoint_id (str): The checkpoint id to save.
         is_async (bool): Whether the checkpoint is async.
+        hf_safetensors_format (bool): Whether to use the HuggingFace safetensors format.
+        pg (Optional[dist.ProcessGroup]): The process group to use.
     """
     if hf_safetensors_format:
         storage_writer = HuggingFaceStorageWriter(path=checkpoint_id, save_sharded=True)
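
For context on the save path touched by this docstring fix: dcp_save wraps torch.distributed.checkpoint (dcp). The sketch below shows roughly what the sync and async branches reduce to, assuming only the stock dcp.save / dcp.async_save entry points; the helper name and defaults are illustrative, not torchtitan's exact code, and the hf_safetensors_format branch above would additionally pass the HuggingFaceStorageWriter via storage_writer=.

# Minimal sketch, assuming stock torch.distributed.checkpoint entry points;
# not torchtitan's exact implementation of dcp_save.
from concurrent.futures import Future
from typing import Any, Optional

import torch.distributed as dist
import torch.distributed.checkpoint as dcp


def dcp_save_sketch(
    state_dict: dict[str, Any],
    checkpoint_id: str,
    is_async: bool = False,
    pg: Optional[dist.ProcessGroup] = None,
) -> Optional[Future]:
    if is_async:
        # async_save stages the state_dict and writes it in the background,
        # returning a Future the caller can wait on before the next save.
        return dcp.async_save(state_dict, checkpoint_id=checkpoint_id, process_group=pg)
    dcp.save(state_dict, checkpoint_id=checkpoint_id, process_group=pg)
    return None
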
@@ -173,12 +134,11 @@ def dcp_save(
 def dcp_load(
     state_dict: dict[str, Any], checkpoint_id: str, hf_safetensors_format: bool
 ) -> None:
-    """Save the checkpoint for the current step.
-
-
+    """Load the checkpoint with dcp.
     Args:
         state_dict (dict): The state dict to load.
         checkpoint_id (str): The checkpoint id to load.
+        hf_safetensors_format (bool): Whether to use the HuggingFace safetensors format.
     """
     if hf_safetensors_format:
         storage_reader = HuggingFaceStorageReader(path=checkpoint_id)
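
Similarly, dcp_load's default path reduces to a single dcp.load call; the tiny sketch below assumes the stock entry point and leaves out the HuggingFaceStorageReader branch shown above (which would go in via storage_reader=).

# Minimal sketch of the load path, assuming stock dcp; not torchtitan's exact code.
from typing import Any

import torch.distributed.checkpoint as dcp


def dcp_load_sketch(state_dict: dict[str, Any], checkpoint_id: str) -> None:
    # dcp.load restores the saved tensors in place into the provided state_dict.
    dcp.load(state_dict, checkpoint_id=checkpoint_id)
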
@@ -200,6 +160,45 @@ def save_with_gc(
     GarbageCollection.collect("GC collection invoked by checkpointer.")
 
 
+def checkpoint_mp(recv: mp.Queue, send: mp.Queue):
+    """Process to save the checkpoint in the background.
+
+    This is only used when async_checkpoint_with_pinned_memory is enabled.
+
+    Args:
+        recv (mp.Queue): The queue to receive the state_dict and Terminate signal.
+        send (mp.Queue): The queue to send the SaveDone signal.
+    """
+    init_logger()
+    os.environ["MASTER_PORT"] = str(int(os.environ["MASTER_PORT"]) + 2)
+    os.environ["TORCHELASTIC_USE_AGENT_STORE"] = "False"
+    torch.cuda.set_device(int(os.environ["LOCAL_RANK"]))
+    dist.init_process_group()
+    try:
+        while True:
+            logger.debug("Checkpoint background process is done.")
+            send.put(SaveDone())
+            logger.debug("Wait for the new state_dict.")
+            obj = recv.get()
+            logger.debug("Received the new state_dict.")
+            if isinstance(obj, Terminate):
+                logger.info("Terminating the checkpoint background process.")
+                return
+            assert isinstance(obj, tuple)
+            begin = time.monotonic()
+            state, checkpoint_id = obj
+            save_with_gc(
+                state, checkpoint_id=checkpoint_id, hf_safetensors_format=False
+            )
+            logger.info(
+                "Finish saving the checkpoint in the background process in %.2f seconds.",
+                time.monotonic() - begin,
+            )
+    finally:
+        logger.info("Destroying the process group.")
+        dist.destroy_process_group()
+
+
 def purge_thread(purge_queue: queue.Queue):
     """Thread to purge the old checkpoints.
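
The relocated checkpoint_mp above is the worker half of a two-queue handshake: it announces readiness by putting SaveDone on its send queue, then blocks on its recv queue for either a (state_dict, checkpoint_id) tuple or a Terminate sentinel. A hedged sketch of the driver side follows; the function and variable names, the spawn context, and the daemon flag are assumptions for illustration, not torchtitan's actual checkpoint manager.

# Illustrative driver-side wiring for checkpoint_mp; everything except
# checkpoint_mp itself is an assumption, not torchtitan's actual manager code.
import torch.multiprocessing as mp

from torchtitan.components.checkpoint import checkpoint_mp


def start_background_checkpointer():
    ctx = mp.get_context("spawn")
    to_worker = ctx.Queue()    # main -> worker: (state_dict, checkpoint_id) or Terminate
    from_worker = ctx.Queue()  # worker -> main: SaveDone readiness signals
    # checkpoint_mp(recv, send): it receives work on its first queue argument
    # and acknowledges on its second.
    proc = ctx.Process(target=checkpoint_mp, args=(to_worker, from_worker), daemon=True)
    proc.start()
    return proc, to_worker, from_worker

On shutdown the trainer would put a Terminate sentinel on to_worker and join() the process, which lines up with the function's finally block destroying its private process group.
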
@@ -466,7 +465,11 @@ def save(self, curr_step: int, last_step: bool = False) -> None:
             )
             GarbageCollection.collect("GC collection invoked by checkpointer.")
         else:
-            self.save_with_gc(self.states, checkpoint_id=checkpoint_id)
+            save_with_gc(
+                self.states,
+                checkpoint_id=checkpoint_id,
+                hf_safetensors_format=self.enable_hf_safetensors_format,
+            )
         self._purge_stale_checkpoints()
 
         logger.info(
