Commit aae7323
support torchft streaming diloco (#1302)
Summary:
- update to use the changed api to create diloco (we need to pass in model parts)
- add configuration options for streaming diloco

Test Plan:
```
$ NGPU=2 ./run_train.sh --fault_tolerance.enable --fault_tolerance.group_size=1 --fault_tolerance.semi_sync_method=diloco --fault_tolerance.sync_steps=2 --fault_tolerance.replica_id=0 --fault_tolerance.fragment_sync_delay=1 --fault_tolerance.fragment_update_alpha=0.0
[rank0]:[titan] 2025-06-16 09:39:08,893 - root - INFO - Model llama3 debugmodel size: 6,270,208 total parameters
[rank0]:[titan] 2025-06-16 09:39:08,894 - root - INFO - Applied selective activation checkpointing to the model
[rank0]:[titan] 2025-06-16 09:39:08,952 - root - INFO - Applied FSDP to the model
[rank0]:[titan] 2025-06-16 09:39:09,375 - root - WARNING - Peak flops undefined for: NVIDIA PG509-210, fallback to A100
[rank0]:[titan] 2025-06-16 09:39:09,376 - root - INFO - Peak FLOPS used for computing MFU: 3.120e+14
[rank0]:[titan] 2025-06-16 09:39:09,376 - root - INFO - CUDA memory usage for model: 0.03GiB(0.04%)
[rank0]:[titan] 2025-06-16 09:39:09,377 - root - INFO - Trainer is initialized with local batch size 8, global batch size 16, gradient accumulation steps 1, sequence length 2048, total steps 10 (warmup 2).
[rank0]:[titan] 2025-06-16 09:39:09,377 - root - INFO - Training starts at step 1.
[rank0]:[titan] 2025-06-16 09:39:10,325 - root - INFO - step: 1 loss: 8.1934 memory: 1.26GiB(1.59%) tps: 11,442 tflops: 0.82 mfu: 0.26%
[rank0]:[titan] 2025-06-16 09:39:10,325 - root - INFO - Synchronizing and adjusting timeout for all ProcessGroups to 0:01:40
[rank0]:[titan] 2025-06-16 09:39:10,431 - root - INFO - step: 2 loss: 8.1507 memory: 1.35GiB(1.71%) tps: 154,916 tflops: 11.14 mfu: 3.57%
[rank0]:[titan] 2025-06-16 09:39:10,524 - root - INFO - step: 3 loss: 8.0737 memory: 1.35GiB(1.71%) tps: 177,405 tflops: 12.76 mfu: 4.09%
[rank0]:[titan] 2025-06-16 09:39:10,623 - root - INFO - step: 4 loss: 7.8865 memory: 1.35GiB(1.71%) tps: 167,289 tflops: 12.03 mfu: 3.86%
[rank0]:[titan] 2025-06-16 09:39:10,714 - root - INFO - step: 5 loss: 7.7620 memory: 1.35GiB(1.71%) tps: 179,656 tflops: 12.92 mfu: 4.14%
[rank0]:[titan] 2025-06-16 09:39:10,808 - root - INFO - step: 6 loss: 7.5449 memory: 1.35GiB(1.71%) tps: 175,901 tflops: 12.65 mfu: 4.05%
[rank0]:[titan] 2025-06-16 09:39:10,911 - root - INFO - step: 7 loss: 7.3452 memory: 1.35GiB(1.71%) tps: 159,859 tflops: 11.49 mfu: 3.68%
[rank0]:[titan] 2025-06-16 09:39:11,005 - root - INFO - step: 8 loss: 7.2973 memory: 1.35GiB(1.71%) tps: 175,980 tflops: 12.65 mfu: 4.06%
[rank0]:[titan] 2025-06-16 09:39:11,096 - root - INFO - step: 9 loss: 7.1333 memory: 1.35GiB(1.71%) tps: 179,903 tflops: 12.94 mfu: 4.15%
[rank0]:[titan] 2025-06-16 09:39:11,186 - root - INFO - step: 10 loss: 7.0747 memory: 1.35GiB(1.71%) tps: 184,628 tflops: 13.28 mfu: 4.26%
[rank0]:[titan] 2025-06-16 09:39:11,186 - root - INFO - Sleeping 2 seconds for other ranks to complete
[rank0]:[titan] 2025-06-16 09:39:13,186 - root - INFO - Training completed
[rank0]:[titan] 2025-06-16 09:39:13,489 - root - INFO - Process group destroyed.
```
1 parent 6b0fd66 commit aae7323
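For orientation, here is a minimal sketch of the updated calling convention, assuming only the names visible in the diffs below (`ft.maybe_semi_sync_training`, the `fault_tolerance` config fields). The trainer objects and the inner loop are illustrative placeholders, not torchtitan's actual `Trainer`:

```python
# Sketch only: the context manager now receives the list of model parts and
# reads sync_steps plus the new streaming DiLoCo knobs from
# job_config.fault_tolerance itself (there is no separate sync_every argument).
from torchtitan.components import ft


def run_with_semi_sync(job_config, ft_manager, model_parts, optimizers, data_iterator, train_step):
    # Effectively a no-op wrapper unless fault_tolerance.enable and
    # fault_tolerance.semi_sync_method are set; otherwise it wraps the loop
    # in torchft's DiLoCo (or LocalSGD) semi-synchronous training.
    with ft.maybe_semi_sync_training(
        job_config,
        ft_manager=ft_manager,
        model_parts=model_parts,   # e.g. one module per pipeline stage
        optimizer=optimizers,
    ):
        for batch in data_iterator:
            train_step(batch)      # inner steps; fragments sync every sync_steps
```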

3 files changed: +36 -10 lines


torchtitan/components/ft.py

Lines changed: 12 additions & 8 deletions
```diff
@@ -170,15 +170,15 @@ def ft_clip_grad_norm_util(total_norm: DTensor) -> torch.Tensor:
 def maybe_semi_sync_training(
     config: JobConfig,
     ft_manager: FTManager,
-    model: torch.nn.Module,
+    model_parts: list[torch.nn.Module],
     optimizer: torch.optim.Optimizer,
-    sync_every: int,
 ) -> ContextManager[Union["local_sgd.DiLoCo", "local_sgd.LocalSGD", None]]:
     """
     If TorchFT is enabled and the config is set, use semi_sync_method
     """
-    semi_sync_method = config.fault_tolerance.semi_sync_method
-    torchft_enabled = config.fault_tolerance.enable
+    ft_config = config.fault_tolerance
+    semi_sync_method = ft_config.semi_sync_method
+    torchft_enabled = ft_config.enable
     if torchft_enabled and semi_sync_method is not None:
         from torchft import local_sgd

@@ -195,17 +195,21 @@ def maybe_semi_sync_training(

             return local_sgd.DiLoCo(
                 manager=ft_manager._manager,
-                model=model,
+                model_fragments=model_parts,
                 inner_optimizer=optimizer,
                 outer_optimizer=outer_optimizer,
-                sync_every=sync_every,
+                sync_every=ft_config.sync_steps,
+                should_quantize=ft_config.should_quantize,
+                fragment_sync_delay=ft_config.fragment_sync_delay,
+                fragment_update_alpha=ft_config.fragment_update_alpha,
             )
         elif semi_sync_method.lower() == "local_sgd":
+            assert len(model_parts) == 1
             return local_sgd.LocalSGD(
                 manager=ft_manager._manager,
-                model=model,
+                model=model_parts[0],
                 optimizer=optimizer,
-                sync_every=sync_every,
+                sync_every=ft_config.sync_steps,
             )
         else:
             raise ValueError(
```

torchtitan/config_manager.py

Lines changed: 23 additions & 0 deletions
```diff
@@ -584,6 +584,29 @@ class FaultTolerance:
     is set.
     """

+    should_quantize: bool = False
+    """
+    Whether to quantize the gradients before allreduce.
+
+    This is only used when "semi_sync_method" is set.
+    """
+
+    fragment_sync_delay: int = 0
+    """
+    Controls the number of inner steps to wait before blocking on a
+    model fragment's synchronization. This is the "tao" parameter in
+    the Streaming DiLoCo paper.
+
+    This is only used when "semi_sync_method" is set.
+    """
+
+    fragment_update_alpha: float = 0.0
+    """
+    Determines how to mix the local and global optimized parameters
+
+    This is only used when "semi_sync_method" is set.
+    """
+

 @dataclass
 class Experimental:
```
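For concreteness, with the flags used in the Test Plan above, these new fields end up configuring torchft's streaming DiLoCo roughly as follows. This is a sketch based on the `ft.py` diff above; `ft_manager`, `model_parts`, `optimizer`, and `outer_optimizer` are placeholders for the trainer's real objects:

```python
# Illustrative values mirroring the test-plan CLI flags; the keyword names are
# exactly the ones passed in the ft.py diff above. ft_manager, model_parts,
# optimizer, and outer_optimizer come from the trainer (placeholders here).
from torchft import local_sgd

semi_sync_ctx = local_sgd.DiLoCo(
    manager=ft_manager._manager,
    model_fragments=model_parts,     # each model part becomes a DiLoCo fragment
    inner_optimizer=optimizer,
    outer_optimizer=outer_optimizer,
    sync_every=2,                    # fault_tolerance.sync_steps
    should_quantize=False,           # quantize gradients before allreduce (default)
    fragment_sync_delay=1,           # inner steps before blocking on a fragment sync
    fragment_update_alpha=0.0,       # how to mix local and global optimized parameters
)
```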

torchtitan/train.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -480,9 +480,8 @@ def train(self):
             ft.maybe_semi_sync_training(
                 job_config,
                 ft_manager=self.ft_manager,
-                model=self.model_parts[0],
+                model_parts=self.model_parts,
                 optimizer=self.optimizers,
-                sync_every=job_config.fault_tolerance.sync_steps,
             ),
         ):
             data_iterator = self.batch_generator(self.dataloader)
```
