[WIP][RFC] Always flatten model state_dict

fegin · fegin · commit 4f05370640e0 · 2025-06-26T10:51:33.000-07:00
diff --git a/scripts/convert_llama_to_dcp.py b/scripts/convert_llama_to_dcp.py
@@ -125,7 +125,7 @@ def convert_llama_weights(input_dir, output_dir, max_seq_len: int):
     logger.info(f"Writing to DCP at '{output_dir}'")
     output_dir.mkdir(parents=True, exist_ok=True)
     storage_writer = DCP.filesystem.FileSystemWriter(output_dir, thread_count=8)
-    DCP.save({"model": state_dict}, storage_writer=storage_writer)
+    DCP.save(state_dict, storage_writer=storage_writer)
 
 
 if __name__ == "__main__":
diff --git a/scripts/generate/test_generate.py b/scripts/generate/test_generate.py
@@ -141,9 +141,9 @@ def test_generate(
     model.to_empty(device=device_type)
     model.eval()
 
-    state_dict = {"model": model.state_dict()}
+    state_dict = model.state_dict()
     for k in excluded_parameters_for_model_only:
-        state_dict["model"].pop(k, None)
+        state_dict.pop(k, None)
 
     # Checkpoint Loading
     begin = time.monotonic()
diff --git a/tests/unit_tests/test_checkpoint.py b/tests/unit_tests/test_checkpoint.py
@@ -15,7 +15,7 @@
 import torch
 import torch.nn as nn
 from torch.utils.data import DataLoader
-from torchtitan.components.checkpoint import CheckpointManager, MODEL
+from torchtitan.components.checkpoint import CheckpointManager
 from torchtitan.config_manager import Checkpoint as CheckpointConfig
 
 
@@ -105,11 +105,11 @@ def setUp(self):
 
         self.model_part = nn.Linear(2, 2)
         self.model_parts = [self.model_part]
+        self.states = {"trainer": torch.tensor([1.2347])}
         # TODO: Use a real OptimizerContainer here so that we can actually verify
         # some optimizer.state_dict() behavior (e.g., the key being the parameter name.)
         self.optimizers = FakeOptimizersContainer()
         self.lr_schedulers = FakeLRSchedulersContainer()
-        self.states = {}
         self.data_loader = FakeDataLoader()
         self.ft_manager = DummyFTManager()
 
@@ -161,7 +161,7 @@ def fake_load(self, states: dict, checkpoint_id=None):
             if key in states and hasattr(states[key], "load_state_dict"):
                 states[key].load_state_dict(val)
             elif key in states and isinstance(states[key], torch.Tensor):
-                states[key] = val
+                states[key].copy_(val)
 
     @mock.patch("torch.distributed.get_rank", return_value=0)
     @mock.patch("torchtitan.components.checkpoint.dcp.save")
@@ -354,7 +354,7 @@ def test_last_save_model_weights_only_and_initial_load_model_weights_only(
             model_parts=self.model_parts,
             optimizers=self.optimizers,
             lr_schedulers=self.lr_schedulers,
-            states={MODEL: self.model_part},
+            states=self.states,
             job_config=self.job_config,
             ft_manager=self.ft_manager,
         )
@@ -373,7 +373,7 @@ def test_last_save_model_weights_only_and_initial_load_model_weights_only(
             model_parts=self.model_parts,
             optimizers=self.optimizers,
             lr_schedulers=self.lr_schedulers,
-            states={MODEL: self.model_part},
+            states=self.states,
             job_config=self.job_config,
             ft_manager=self.ft_manager,
         )
@@ -449,13 +449,12 @@ def test_ft_async_save_calls_async_wait(
         job_config.checkpoint.async_mode = "disabled"
         ft_manager = mock.Mock()
         ft_manager.enabled = True
-        states = {"trainer": torch.tensor([0])}
         manager = CheckpointManager(
             dataloader=self.data_loader,
             model_parts=self.model_parts,
             optimizers=self.optimizers,
             lr_schedulers=self.lr_schedulers,
-            states=states,
+            states=self.states,
             job_config=job_config,
             ft_manager=ft_manager,
         )
@@ -569,14 +568,13 @@ def __init__(self):
         self.assertEqual(mock_save.call_count, 1)
         checkpoint_path = os.path.join(self.test_folder, "step-1", "state_dict.pt")
         saved_data = torch.load(checkpoint_path, weights_only=False)
-        model_state_dict = saved_data[MODEL]
 
         # Verify that freqs_cis is NOT in the saved state dict
-        self.assertNotIn("freqs_cis", model_state_dict)
+        self.assertNotIn("freqs_cis", saved_data)
         # Verify that other parameters ARE in the saved state dict
-        self.assertIn("weight", model_state_dict)
-        self.assertIn("bias", model_state_dict)
-        self.assertIn("other_param", model_state_dict)
+        self.assertIn("weight", saved_data)
+        self.assertIn("bias", saved_data)
+        self.assertIn("other_param", saved_data)
 
         manager.close()
 
diff --git a/torchtitan/components/checkpoint.py b/torchtitan/components/checkpoint.py
@@ -392,11 +392,15 @@ def save(self, curr_step: int, last_step: bool = False) -> None:
             elif self.async_mode == AsyncMode.ASYNC:
                 GarbageCollection.collect("GC collection invoked by checkpointer.")
                 self.async_future = dcp.async_save(
-                    self.states, checkpoint_id=checkpoint_id, process_group=self.pg
+                    self._flattend_model_states_sd(),
+                    checkpoint_id=checkpoint_id,
+                    process_group=self.pg,
                 )
                 GarbageCollection.collect("GC collection invoked by checkpointer.")
             else:
-                save_with_gc(self.states, checkpoint_id=checkpoint_id)
+                save_with_gc(
+                    self._flattend_model_states_sd(), checkpoint_id=checkpoint_id
+                )
             self._purge_stale_checkpoints()
 
             logger.info(
@@ -559,6 +563,19 @@ def _ft_load(self) -> None:
             f"Finished loading the ft checkpoint in {time.monotonic() - begin:.2f} seconds."
         )
 
+    def _flattend_model_states_sd(
+        self, state_dict: dict[str, Any] | None = None
+    ) -> dict[str, Any]:
+        """Flatten the model states into a single dictionary.
+
+        Note that other states, such as optimizer states, are not flattened.
+        """
+        states = state_dict if state_dict is not None else self.states
+        sd = {k: v for k, v in states.items() if k != MODEL}
+        if MODEL in states:
+            sd.update(states[MODEL].state_dict())
+        return sd
+
     def _states_to_load(self, model_only: bool) -> dict[str, Any]:
         """Determines which states to load for the given step.
 
@@ -573,8 +590,7 @@ def _states_to_load(self, model_only: bool) -> dict[str, Any]:
         """
         # For the first step, we will only load the model weights.
         if model_only:
-            sd = self.states[MODEL].state_dict()
-            return sd
+            return self.states[MODEL].state_dict()
 
         for exclude_key in self.exclude_from_loading:
             if exclude_key not in self.states:
@@ -584,6 +600,8 @@ def _states_to_load(self, model_only: bool) -> dict[str, Any]:
             k: v for k, v in self.states.items() if k not in self.exclude_from_loading
         }
 
+        states_to_load = self._flattend_model_states_sd(states_to_load)
+
         if self.ft_manager:
             states_to_load.pop(DATALOADER)