Commit b7c7ed7
[SimpleFSDP] Add support for SimpleFSDP DCP (#1273)
As titled, this PR adds support for SimpleFSDP's DCP (Distributed Checkpointing) composability. The code is based on previous implementations from @fmassa, @awgu, and @yf225. In the runs below the losses match perfectly; the checkpoints are loaded from step 110.

(1) [dp:4] --> [dp:4]
![loss curves, dp:4 save and dp:4 load](https://github.com/user-attachments/assets/a0e49c0f-fdbc-4ea8-82bd-573f0c9015a8)

(2) [dp:2, tp:2] --> [dp:2, tp:2] & [dp:2, pp:2]
![loss curves, resharded loads](https://github.com/user-attachments/assets/920a9898-fc75-4dfe-bf28-d597be398226)
1 parent bc5ebb7 commit b7c7ed7
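
For orientation, here is a minimal sketch (not code from this PR) of the save/reload round trip the curves above validate, assuming a recent PyTorch with `torch.distributed.checkpoint`, an initialized process group, and models whose parameters are DTensors; the function names and checkpoint path are illustrative:

```python
import torch.nn as nn
import torch.distributed.checkpoint as dcp
from torch.distributed.checkpoint.state_dict import (
    get_model_state_dict,
    set_model_state_dict,
)


def save_model(model: nn.Module, ckpt_dir: str) -> None:
    # DCP records every DTensor shard along with its global shape and
    # placement, which is what makes resharding on load possible.
    dcp.save({"model": get_model_state_dict(model)}, checkpoint_id=ckpt_dir)


def load_model(model: nn.Module, ckpt_dir: str) -> None:
    # The freshly built (possibly differently parallelized) model defines
    # the target sharding; dcp.load redistributes saved shards onto it.
    state_dict = {"model": get_model_state_dict(model)}
    dcp.load(state_dict, checkpoint_id=ckpt_dir)
    set_model_state_dict(model, state_dict["model"])
```

Saving under [dp:4] and calling `load_model` on a [dp:2, tp:2] instance is exactly the kind of round trip exercised here.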

File tree: 3 files changed (+89, −57 lines)


torchtitan/experiments/simple_fsdp/README.md
Lines changed: 1 addition & 1 deletion

```diff
@@ -24,7 +24,7 @@ Some of the features require the updates from PyTorch, with which we are working
 |Tensor Parallelism| ✅ |
 |Context Parallelism| ✅ |
 |Pipeline Parallelism| ✅ |
-|Distributed Checkpointing| 🚧 |
+|Distributed Checkpointing| ✅ |
 |Float8 Training| 🚧 |
```
torchtitan/experiments/simple_fsdp/simple_fsdp.py
Lines changed: 49 additions & 15 deletions

```diff
@@ -7,7 +7,7 @@
 from collections.abc import Sequence
 from contextlib import contextmanager
 from dataclasses import dataclass
-from typing import Optional
+from typing import List, Optional
 
 import torch
 import torch.nn as nn
```
```diff
@@ -124,6 +124,26 @@ def _distribute_dtensor(
     )
 
 
+def _register_parametrization(
+    module: nn.Module, param_names: List[str], parametrization: nn.Module
+):
+    """
+    This works with state_dict without incurring parametrization calls because
+    state_dict accesses parameters directly from self._parameters, not from getters.
+    https://github.com/pytorch/pytorch/blob/main/torch/nn/modules/module.py#L2141
+    """
+    param_name_to_property = {
+        param_name: property(lambda self, param_name=param_name: parametrization(self._parameters[param_name]))
+        for param_name in param_names
+    }
+    module_cls = type(
+        f"FSDP{module.__class__.__name__}",
+        (module.__class__,),
+        param_name_to_property,
+    )
+    module.__class__ = module_cls
+
+
 def fsdp_policy():
     def _fsdp_recomp_policy():
         def _custom_policy(ctx, func, *args, **kwargs):
```
```diff
@@ -263,18 +283,32 @@ def data_parallel(
                     distribute_tensor_func(p, device_mesh, param_sharding)
                 ),
             )
-            nn.utils.parametrize.register_parametrization(
-                mod,
-                p_name,
-                ReplicateComputation(
-                    device_mesh,
-                    param_sharding,
-                    mode,
-                    regional_ac,
-                    mp_policy=mp_policy,
-                    tp_mesh=tp_mesh,
-                ),
-                unsafe=True,
-            )
+        # to be compatible with DCP, we use a customized _register_parametrization
+        # instead of nn.utils.parametrize.register_parametrization here
+        # nn.utils.parametrize.register_parametrization(
+        #     mod,
+        #     p_name,
+        #     ReplicateComputation(
+        #         device_mesh,
+        #         param_sharding,
+        #         mode,
+        #         regional_ac,
+        #         mp_policy=mp_policy,
+        #         tp_mesh=tp_mesh,
+        #     ),
+        #     unsafe=True,
+        # )
+
+        _register_parametrization(
+            mod,
+            list(params_dict.keys()),
+            ReplicateComputation(
+                device_mesh,
+                param_sharding,
+                mode,
+                regional_ac,
+                mp_policy=mp_policy,
+                tp_mesh=tp_mesh,
+            ),
+        )
     return model
```
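
The docstring above is the crux of DCP compatibility: `nn.utils.parametrize.register_parametrization` moves the original tensor to `module.parametrizations.<name>.original`, which renames state_dict keys, while the class-swap trick leaves `module._parameters` untouched, so `state_dict()` still sees plain, correctly named parameters. Below is a self-contained sketch of the same technique, with a hypothetical `Scale` transform standing in for `ReplicateComputation` (note the `name=name` default argument, which pins the loop variable each property reads):

```python
import torch
import torch.nn as nn


class Scale(nn.Module):
    # Hypothetical stand-in for ReplicateComputation: maps a raw
    # parameter to the tensor actually used in forward.
    def forward(self, p: torch.Tensor) -> torch.Tensor:
        return p * 2.0


def register_property_parametrization(module, param_names, parametrization):
    # One property per parameter name; `name=name` binds the loop variable
    # at definition time so each property reads its own _parameters entry.
    props = {
        name: property(
            lambda self, name=name: parametrization(self._parameters[name])
        )
        for name in param_names
    }
    # Swap in a dynamic subclass carrying the properties. Properties are
    # data descriptors, so `module.weight` resolves through them before
    # nn.Module.__getattr__ ever consults _parameters.
    module.__class__ = type(
        f"Patched{type(module).__name__}", (type(module),), props
    )


lin = nn.Linear(2, 2, bias=False)
raw = lin.weight.detach().clone()
register_property_parametrization(lin, ["weight"], Scale())

assert torch.equal(lin.weight, 2 * raw)  # attribute access is parametrized
assert torch.equal(lin.state_dict()["weight"], raw)  # state_dict stays raw
```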

torchtitan/experiments/simple_fsdp/tests/integration_tests.py
Lines changed: 39 additions & 41 deletions

```diff
@@ -75,20 +75,19 @@ def build_test_list():
         #     "2D async TP",
         #     "2d_asynctp",
         # ),
-        # TODO: Adds back after DCP is supported by SimpleFSDP
-        # OverrideDefinitions(
-        #     [
-        #         [
-        #             "--checkpoint.enable_checkpoint",
-        #         ],
-        #         [
-        #             "--checkpoint.enable_checkpoint",
-        #             "--training.steps 20",
-        #         ],
-        #     ],
-        #     "Checkpoint Integration Test - Save Load Full Checkpoint",
-        #     "full_checkpoint",
-        # ),
+        OverrideDefinitions(
+            [
+                [
+                    "--checkpoint.enable_checkpoint",
+                ],
+                [
+                    "--checkpoint.enable_checkpoint",
+                    "--training.steps 20",
+                ],
+            ],
+            "Checkpoint Integration Test - Save Load Full Checkpoint",
+            "full_checkpoint",
+        ),
         OverrideDefinitions(
             [
                 [
```

```diff
@@ -179,33 +178,32 @@ def build_test_list():
             "fsdp+tp+cp",
             ngpu=8,
         ),
-        # TODO: Adds back after DCP is supported by SimpleFSDP
-        # OverrideDefinitions(
-        #     [
-        #         [
-        #             "--checkpoint.enable_checkpoint",
-        #             "--training.steps 10",
-        #         ],
-        #         # Save at [dp:4] and load at [dp:2, tp:2]. Note that the dataloader should be
-        #         # excluded during loading to avoid errors caused by mismatched dp_degree.
-        #         [
-        #             "--checkpoint.enable_checkpoint",
-        #             "--checkpoint.exclude_from_loading lr_scheduler,dataloader,optimizer",
-        #             "--parallelism.tensor_parallel_degree 2",
-        #             "--training.steps 20",
-        #         ],
-        #         # load at [tp:4].
-        #         [
-        #             "--checkpoint.enable_checkpoint",
-        #             "--checkpoint.exclude_from_loading lr_scheduler,dataloader,optimizer",
-        #             "--parallelism.tensor_parallel_degree 4",
-        #             "--training.steps 30",
-        #         ],
-        #     ],
-        #     "Optional checkpoint",
-        #     "optional_checkpoint",
-        #     ngpu=4,
-        # ),
+        OverrideDefinitions(
+            [
+                [
+                    "--checkpoint.enable_checkpoint",
+                    "--training.steps 10",
+                ],
+                # Save at [dp:4] and load at [dp:2, tp:2]. Note that the dataloader should be
+                # excluded during loading to avoid errors caused by mismatched dp_degree.
+                [
+                    "--checkpoint.enable_checkpoint",
+                    "--checkpoint.exclude_from_loading lr_scheduler,dataloader,optimizer",
+                    "--parallelism.tensor_parallel_degree 2",
+                    "--training.steps 20",
+                ],
+                # load at [tp:4].
+                [
+                    "--checkpoint.enable_checkpoint",
+                    "--checkpoint.exclude_from_loading lr_scheduler,dataloader,optimizer",
+                    "--parallelism.tensor_parallel_degree 4",
+                    "--training.steps 30",
+                ],
+            ],
+            "Optional checkpoint",
+            "optional_checkpoint",
+            ngpu=4,
+        ),
     ]
     return integration_tests_flavors
```
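
The `--checkpoint.exclude_from_loading` overrides above are what make the cross-layout reloads safe: components whose state depends on dp_degree (dataloader, optimizer, lr_scheduler) are skipped, and only the resharded model weights are restored. A toy illustration of the key filtering involved, not torchtitan's actual checkpoint manager:

```python
# Saved under [dp:4]; reloading under [dp:2, tp:2] keeps only the
# layout-agnostic component. Toy filter for illustration only.
saved_components = ["model", "optimizer", "lr_scheduler", "dataloader"]
exclude_from_loading = {"lr_scheduler", "dataloader", "optimizer"}

to_load = [k for k in saved_components if k not in exclude_from_loading]
print(to_load)  # ['model']
```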
