Commit 5e09057

[llama4]: flatten dp_mesh to enable data parallel replicate (#1279)
One-liner fix to enable training with HSDP. When DP Replicate is enabled, dp_mesh contains `(dp_shard_cp, dp_replicate)`, and `dp_mesh.get_group()` throws an error because the mesh has multiple axes. So I flatten it the same way it is done in the `build_device_mesh()` function, with the same name, just to be explicit. Note: I should be able to use `world_mesh['dp_cp']` here, as it is already flattened when world_mesh is built. However, for some reason that flattened mesh is not available inside this function when it executes.
1 parent 6bffbfc commit 5e09057
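
For context, here is a minimal standalone sketch of the failure mode and of the flatten workaround. It is not code from this repository: the 2x2 CPU mesh, the dim names, and the torchrun launch are assumptions chosen to mirror the commit message, and DeviceMesh._flatten is a private PyTorch API used here purely for illustration.

# Illustrative sketch only. Run with: torchrun --nproc_per_node=4 sketch.py
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh

# Dim names mirror the commit message; a real torchtitan world mesh has more axes.
mesh = init_device_mesh(
    "cpu", (2, 2), mesh_dim_names=("dp_replicate", "dp_shard_cp")
)

try:
    # A 2-D mesh with no mesh_dim argument cannot pick a single process group.
    mesh.get_group()
except RuntimeError as err:
    print(f"get_group() on a multi-axis mesh fails: {err}")

# Flatten both axes into one named dim, the same way build_device_mesh() does,
# so get_group() can return the single process group spanning all of them.
dp_cp_mesh = mesh._flatten(mesh_dim_name="dp_cp")
dist.all_reduce(torch.ones(1), group=dp_cp_mesh.get_group())

dist.destroy_process_group()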

File tree

3 files changed (+9, -8 lines)

torchtitan/distributed/parallel_dims.py

Lines changed: 4 additions & 0 deletions

@@ -120,6 +120,10 @@ def dp_shard_enabled(self):
     def cp_enabled(self):
         return self.cp > 1
 
+    @property
+    def dp_cp_enabled(self):
+        return self.dp_enabled or self.cp_enabled
+
     @property
     def tp_enabled(self):
         return self.tp > 1
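
As a quick illustration of what the new property composes, here is a stripped-down, hypothetical stand-in for ParallelDims. The definition of dp_enabled below (true when either dp_replicate or dp_shard is greater than 1) is an assumption; cp_enabled and dp_cp_enabled follow the diff.

# Hypothetical sketch, not the real ParallelDims from torchtitan/distributed/parallel_dims.py.
from dataclasses import dataclass

@dataclass
class ParallelDimsSketch:
    dp_replicate: int = 1
    dp_shard: int = 1
    cp: int = 1

    @property
    def dp_enabled(self) -> bool:
        # Assumption: data parallelism is active if either DP axis is > 1.
        return self.dp_replicate > 1 or self.dp_shard > 1

    @property
    def cp_enabled(self) -> bool:
        return self.cp > 1

    @property
    def dp_cp_enabled(self) -> bool:
        # Mirrors the property added in this commit: true whenever any
        # data-parallel or context-parallel axis is active, i.e. whenever a
        # flattened "dp_cp" group is meaningful.
        return self.dp_enabled or self.cp_enabled

# e.g. HSDP with 2-way replicate and 4-way shard, no context parallelism:
assert ParallelDimsSketch(dp_replicate=2, dp_shard=4).dp_cp_enabled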

torchtitan/experiments/llama4/infra/parallelize_llama.py

Lines changed: 4 additions & 2 deletions

@@ -118,13 +118,15 @@ def parallelize_llama(
     )
 
     # for MoE auxiliary-loss-free load balancing
-    if dp_mesh is not None:
+    if parallel_dims.dp_cp_enabled is not None:
         # NOTE: Currently this sync is blocking (thus exposed) and happens on the
         # default compute stream. Need to assess if this is OK performance-wise.
+        dp_cp_mesh = world_mesh["dp_cp"]
+
         def _sync_tokens_per_expert(module, *_):
             assert isinstance(module, MoE)
             torch.distributed.all_reduce(
-                module.tokens_per_expert, group=dp_mesh.get_group()
+                module.tokens_per_expert, group=dp_cp_mesh.get_group()
             )
 
         for transformer_block in model.layers.values():
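
The hunk ends just before the hook is attached to the MoE blocks. As a rough, hypothetical sketch of the surrounding pattern: the MoE stand-in's implementation and the choice of register_full_backward_hook below are assumptions, while the tokens_per_expert buffer and the all_reduce over dp_cp_mesh.get_group() come from the diff.

# Hypothetical sketch; the actual registration code lies outside this hunk.
import torch
import torch.distributed as dist
import torch.nn as nn

class MoE(nn.Module):
    # Stand-in for torchtitan's MoE layer: tracks how many tokens each expert received.
    def __init__(self, num_experts: int):
        super().__init__()
        self.register_buffer("tokens_per_expert", torch.zeros(num_experts))

def make_sync_hook(dp_cp_mesh):
    # Closure mirroring _sync_tokens_per_expert from the diff: sum the per-expert
    # token counts across every rank in the flattened dp_cp group.
    def _sync_tokens_per_expert(module, *_):
        assert isinstance(module, MoE)
        dist.all_reduce(module.tokens_per_expert, group=dp_cp_mesh.get_group())
    return _sync_tokens_per_expert

# One plausible way to attach it per MoE block (assumed, not from the diff):
# moe_block.register_full_backward_hook(make_sync_hook(dp_cp_mesh))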

torchtitan/train.py

Lines changed: 1 addition & 6 deletions

@@ -442,12 +442,7 @@ def train_step(
         if not self.metrics_processor.should_log(self.step):
             return
 
-        if (
-            parallel_dims.dp_replicate_enabled
-            or parallel_dims.dp_shard_enabled
-            or parallel_dims.cp_enabled
-            or self.ft_manager.enabled
-        ):
+        if parallel_dims.dp_cp_enabled or self.ft_manager.enabled:
             loss = loss.detach()
             # Skip ft manager communication when using semi sync training
             use_ft_pg = (
