Commit f0ce21b
[SimpleFSDP] Add support for hsdp+tp (#1343)
As titled, this PR adds support for HSDP + TP in SimpleFSDP.

The profiler trace below shows the three streams for FSDP's all-gather/reduce-scatter, DDP's all-reduce, and TP's communications.

![Profiler trace showing the FSDP all-gather/reduce-scatter, DDP all-reduce, and TP communication streams](https://github.com/user-attachments/assets/4d9a7561-8895-48e6-b728-bcc1f95d8f2c)

The loss curves below show that SimpleFSDP's and FSDP2's losses match under HSDP + TP (seed=42).

![Loss curves showing SimpleFSDP and FSDP2 matching under HSDP + TP](https://github.com/user-attachments/assets/b497c1f9-a766-4a00-9e42-cbc8ce1ca1b8)
1 parent acd5ba8 commit f0ce21b
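For context, here is a minimal sketch (not part of this commit) of the 3-D device mesh that HSDP + TP composes: an outer replicate dim, a sharded dim, and an inner TP dim. The 2×2×2 shape matches the 8-GPU test re-enabled below; the mesh dim names are illustrative rather than torchtitan's exact internal names.

```python
# Minimal sketch of an HSDP + TP device mesh; run under torchrun with 8 ranks.
# The dim names below are illustrative.
from torch.distributed.device_mesh import init_device_mesh

mesh = init_device_mesh(
    "cuda",
    (2, 2, 2),  # (replicate degree, shard degree, TP degree) -> 8 GPUs
    mesh_dim_names=("dp_replicate", "dp_shard", "tp"),
)

# SimpleFSDP's replicate_compute checks that the inner-most mesh dim is "tp".
tp_mesh = mesh["tp"]
dp_mesh = mesh["dp_replicate", "dp_shard"]
print(tp_mesh, dp_mesh)
```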

File tree: 2 files changed (+40, −25 lines)

torchtitan/experiments/simple_fsdp/simple_fsdp.py

Lines changed: 28 additions & 12 deletions
@@ -58,7 +58,7 @@ def _distribute_dtensor(
     Below are experimental enhancements to distribute a DTensor.
     This helps enable Simple FSDP + TP, in which
         inner spec/mesh is TP spec/mesh
-        outer spec/mesh is FSDP spec/mesh
+        outer spec/mesh is FSDP/DDP/HSDP spec/mesh
     The logic follows
         https://github.com/pytorch/pytorch/blob/main/torch/distributed/_composable/fsdp/_fsdp_param.py#L261
     """
@@ -78,24 +78,40 @@ def _distribute_dtensor(
     submesh_names = outer_mesh.mesh_dim_names + inner_mesh.mesh_dim_names
     spanned_mesh = outer_global_mesh[submesh_names]
 
-    if placements[0].is_shard():
-        # for FSDP + TP dtensor placement
-        shard_dim = placements[0].dim
+    if len(placements) == 1:
+        assert placements[0].is_replicate() or placements[0].is_shard()
+        if placements[0].is_shard():
+            # For FSDP + TP dtensor placement
+            shard_dim = placements[0].dim
+            split_factor = inner_spec.num_shards_map[shard_dim]
+            tensor_placement = (
+                (
+                    _StridedShard(shard_dim, split_factor=split_factor)
+                    if split_factor > 1
+                    else placements[0]
+                ),
+                inner_spec.placements[0],
+            )
+        else:
+            # For DDP + TP dtensor placement
+            tensor_placement = (placements[0], inner_spec.placements[0])
+    elif len(placements) == 2:
+        assert placements[0].is_replicate() and placements[1].is_shard()
+        # For HSDP + TP dtensor placement
+        shard_dim = placements[1].dim
         split_factor = inner_spec.num_shards_map[shard_dim]
         tensor_placement = (
+            placements[0],
             (
                 _StridedShard(shard_dim, split_factor=split_factor)
                 if split_factor > 1
-                else placements[0]
+                else placements[1]
             ),
             inner_spec.placements[0],
         )
-    elif placements[0].is_replicate():
-        # for DDP + TP dtensor placement
-        tensor_placement = (placements[0], inner_spec.placements[0])
     else:
         raise ValueError(
-            f"Unsupported placement {placements[0]} for distributing DTensor {tensor}"
+            f"Unsupported placement {placements} for distributing DTensor {tensor}"
         )
 
     current_spec = DTensorSpec(
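To make the branch structure above concrete, here is a small sketch (not from this commit) of the placement tuples each case produces, built from the public DTensor placement types. The shard dims are illustrative, and the `_StridedShard` variant is omitted since it only applies when `split_factor > 1`.

```python
# Sketch of the tensor_placement tuples produced by the three branches above.
# Shard dims are illustrative; the _StridedShard case (split_factor > 1) is omitted.
from torch.distributed.tensor import Replicate, Shard

tp_placement = Shard(0)  # stands in for inner_spec.placements[0]

# len(placements) == 1 and Shard     -> FSDP + TP: (outer shard, TP placement)
fsdp_tp = (Shard(0), tp_placement)

# len(placements) == 1 and Replicate -> DDP + TP: (outer replicate, TP placement)
ddp_tp = (Replicate(), tp_placement)

# len(placements) == 2, (Replicate, Shard) -> HSDP + TP: spans three mesh dims
hsdp_tp = (Replicate(), Shard(0), tp_placement)

print(fsdp_tp, ddp_tp, hsdp_tp, sep="\n")
```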
@@ -105,7 +121,7 @@ def _distribute_dtensor(
     )
     target_spec = DTensorSpec(
         mesh=outer_mesh,
-        placements=(placements[0],),
+        placements=(placements[-1],),
         tensor_meta=inner_spec.tensor_meta,
     )
     result_tensor = redistribute_local_tensor(
@@ -188,9 +204,9 @@ def replicate_compute(self, x):
         # the gradients are partial tensors that needs to perform reduction
         # (i.e. DDP: allreduce, FSDP: reduce_scatter, HSDP: mix of both)
 
-        # support for FSDP/DDP + TP (assuming TP shards the inner-most dim)
+        # support for FSDP/DDP/HSDP + TP (assuming TP shards the inner-most dim)
         if x._spec.mesh.mesh_dim_names[-1] == "tp":
-            dp_placement, tp_placement = x._spec.placements
+            tp_placement = x._spec.placements[-1]
             # TODO: remove tp_mesh as an input arg to data_parallel API and use x._spec.mesh["tp"]
             # after DeviceMesh supports slicing a non-root mesh
             # dp_mesh, tp_mesh = self.device_mesh, x._spec.mesh["tp"]
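A short sketch (also not from this commit) of why `replicate_compute` switches from the two-way unpack to `placements[-1]`: under HSDP + TP the spec carries three placements, one per mesh dim, so only the inner-most (TP) entry can be indexed uniformly across FSDP/DDP/HSDP + TP. The concrete shard dims are illustrative.

```python
# One placement per mesh dim, ordered (dp_replicate, dp_shard, tp); dims illustrative.
from torch.distributed.tensor import Replicate, Shard

hsdp_tp_placements = (Replicate(), Shard(0), Shard(1))

try:
    dp_placement, tp_placement = hsdp_tp_placements  # old path: assumes exactly 2 dims
except ValueError as err:
    print(f"two-way unpack fails under HSDP + TP: {err}")

tp_placement = hsdp_tp_placements[-1]  # new path: TP is always the inner-most dim
print(tp_placement)
```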

torchtitan/experiments/simple_fsdp/tests/integration_tests.py

Lines changed: 12 additions & 13 deletions
@@ -130,19 +130,18 @@ def build_test_list():
             "hsdp",
             ngpu=4,
         ),
-        # TODO: Adds back after HSDP+TP is supported by SimpleFSDP
-        # OverrideDefinitions(
-        #     [
-        #         [
-        #             "--parallelism.data_parallel_shard_degree=2",
-        #             "--parallelism.data_parallel_replicate_degree=2",
-        #             "--parallelism.tensor_parallel_degree=2",
-        #         ]
-        #     ],
-        #     "HSDP+TP",
-        #     "hsdp+tp",
-        #     ngpu=8,
-        # ),
+        OverrideDefinitions(
+            [
+                [
+                    "--parallelism.data_parallel_shard_degree=2",
+                    "--parallelism.data_parallel_replicate_degree=2",
+                    "--parallelism.tensor_parallel_degree=2",
+                ]
+            ],
+            "HSDP+TP",
+            "hsdp+tp",
+            ngpu=8,
+        ),
         OverrideDefinitions(
             [
                 [
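As a quick sanity check on the re-enabled test's GPU requirement (a trivial sketch, assuming the world size equals the product of the parallel degrees):

```python
# Degrees overridden by the new HSDP+TP test entry above.
dp_shard, dp_replicate, tp = 2, 2, 2
assert dp_shard * dp_replicate * tp == 8  # hence ngpu=8
```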

0 commit comments