Commit 0b44d4c

support AMP for DDP / single-device training (#1303)
There have been several requests, questions, and attempts around this feature; see #630, #700, #1278, and #1293. Hence this PR, even though AMP does not provide full support when `fully_shard` is not used. My local testing shows:
- Under TP, throughput drops with AMP.
- Under PP, there is no way to wrap the forward pass only (cc @H-Huang); if forward+backward is wrapped, the program hangs.
- It works fine under DDP / single-device training.

This PR also adds logging on whether mixed precision is enabled and, if so, under which mechanism: `fully_shard` or AMP.
1 parent 2887250 commit 0b44d4c
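
For readers unfamiliar with the pattern, the core of this change is wrapping only the forward pass and loss computation in torch.autocast while the backward pass runs outside the context. Below is a minimal, hedged sketch of that recipe for single-device bfloat16 training, using a hypothetical toy model and optimizer rather than torchtitan's actual trainer:

import torch
import torch.nn as nn

# Hypothetical toy setup; torchtitan builds these from its job config.
model = nn.Linear(1024, 1024).cuda()
loss_fn = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

inputs = torch.randn(8, 1024, device="cuda")
labels = torch.randn(8, 1024, device="cuda")

# AMP: only the forward pass and loss computation run under autocast.
with torch.autocast("cuda", dtype=torch.bfloat16):
    pred = model(inputs)
    loss = loss_fn(pred, labels)

# Backward and optimizer step stay outside the autocast context.
loss.backward()
optimizer.step()
optimizer.zero_grad()

Since bfloat16 has the same exponent range as float32, no GradScaler is needed; the diff to torchtitan/train.py below applies this same with-block pattern inside forward_backward_step.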

File tree: 4 files changed, +41 / -4 lines

torchtitan/config_manager.py
torchtitan/distributed/parallel_dims.py
torchtitan/distributed/utils.py
torchtitan/train.py

torchtitan/config_manager.py

Lines changed: 4 additions & 2 deletions

@@ -220,8 +220,10 @@ class Training:
 
     mixed_precision_param: Literal["bfloat16", "float32"] = "bfloat16"
     """
-    torch dtype to use for parameters when applying mixed precision via FSDP.
-    This feature only takes effect when data_parallel_shard_degree > 1
+    torch dtype to use for parameters when applying mixed precision via fully_shard or torch.autocast.
+    This feature takes effect via fully_shard when data_parallel_shard_degree > 1 or
+    context_parallel_degree > 1; it takes effect via torch.autocast when data_replicate_degree >= 1
+    and no other parallelism is enabled, i.e. under DDP or single-device training.
     """
 
     mixed_precision_reduce: Literal["float32"] = "float32"
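
To illustrate how this string-valued option becomes an actual dtype, here is a hedged sketch of what a mapping like TORCH_DTYPE_MAP (imported in the torchtitan/distributed/utils.py diff below) plausibly looks like; the exact contents in torchtitan may differ:

import torch

# Assumed shape of the config-string -> dtype mapping; the real TORCH_DTYPE_MAP
# lives in torchtitan/config_manager.py and may contain more entries.
TORCH_DTYPE_MAP = {
    "float16": torch.float16,
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
}

mixed_precision_param = "bfloat16"  # value of Training.mixed_precision_param
param_dtype = TORCH_DTYPE_MAP[mixed_precision_param]
assert param_dtype == torch.bfloat16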

torchtitan/distributed/parallel_dims.py

Lines changed: 4 additions & 0 deletions

@@ -124,6 +124,10 @@ def cp_enabled(self):
     def dp_cp_enabled(self):
         return self.dp_enabled or self.cp_enabled
 
+    @property
+    def fsdp_enabled(self):
+        return self.dp_shard_enabled or self.cp_enabled
+
     @property
     def tp_enabled(self):
         return self.tp > 1
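
The new property folds the two cases where fully_shard owns mixed precision into a single flag. As a hedged illustration of how it composes with the neighboring properties (assuming, by analogy with tp_enabled above, that dp_shard_enabled and cp_enabled are simple degree checks; the real ParallelDims class has more fields):

class ParallelDimsSketch:
    """Simplified stand-in for torchtitan's ParallelDims, for illustration only."""

    def __init__(self, dp_shard: int, cp: int, tp: int, pp: int):
        self.dp_shard = dp_shard
        self.cp = cp
        self.tp = tp
        self.pp = pp

    @property
    def dp_shard_enabled(self) -> bool:
        return self.dp_shard > 1

    @property
    def cp_enabled(self) -> bool:
        return self.cp > 1

    @property
    def fsdp_enabled(self) -> bool:
        # fully_shard owns mixed precision whenever sharded data parallel or context parallel is on.
        return self.dp_shard_enabled or self.cp_enabled

# Single-device / DDP-style run: fully_shard is not involved, so AMP can take over.
assert not ParallelDimsSketch(dp_shard=1, cp=1, tp=1, pp=1).fsdp_enabled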

torchtitan/distributed/utils.py

Lines changed: 25 additions & 0 deletions

@@ -18,6 +18,8 @@
 from torch.distributed.tensor import DTensor
 from torch.nn.attention import SDPBackend
 
+from torchtitan.config_manager import TORCH_DTYPE_MAP
+from torchtitan.distributed.parallel_dims import ParallelDims
 from torchtitan.models.attention import ScaledDotProductAttention
 from torchtitan.tools.logging import logger
 from torchtitan.tools.utils import device_module, device_type

@@ -202,6 +204,29 @@ def context(cp_context: Generator[None, None, None] | None = None):
     return context
 
 
+def maybe_enable_amp(
+    parallel_dims: ParallelDims, mixed_precision_param: str, device_type: torch.device
+) -> Generator[None, None, None]:
+    if parallel_dims.fsdp_enabled:
+        # FSDP handles mixed precision internally
+        logger.info("Mixed precision training is handled by fully_shard")
+        return contextlib.nullcontext()
+    else:
+        if parallel_dims.tp_enabled or parallel_dims.pp_enabled:
+            logger.warning(
+                "Mixed precision training with TP or PP is only supported when FSDP/HSDP/CP is enabled."
+            )
+            logger.info("Mixed precision training is disabled")
+            return contextlib.nullcontext()
+        else:
+            # the following code will only be executed for DDP or single-device training
+            logger.info("Mixed precision training is handled by AMP")
+            return torch.autocast(
+                device_type,
+                dtype=TORCH_DTYPE_MAP[mixed_precision_param],
+            )
+
+
 def init_distributed(job_config):
     def _warn_overwrite_env(env, val):
         if env in os.environ:
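
As a quick, standalone sanity check of what the returned torch.autocast context does (not torchtitan code; CPU is used so it runs anywhere), activations computed inside the context come out in bfloat16 while the module's parameters stay in float32:

import torch
import torch.nn as nn

linear = nn.Linear(16, 16)  # parameters are created in float32
x = torch.randn(4, 16)

with torch.autocast("cpu", dtype=torch.bfloat16):
    y = linear(x)

print(linear.weight.dtype)  # torch.float32 -- master weights are untouched
print(y.dtype)              # torch.bfloat16 -- the matmul ran in reduced precision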

torchtitan/train.py

Lines changed: 8 additions & 2 deletions

@@ -313,6 +313,11 @@ def __init__(self, job_config: JobConfig):
             parallel_dims.loss_parallel_enabled,
             parallelism_config.enable_compiled_autograd,
         )
+        self.maybe_enable_amp = dist_utils.maybe_enable_amp(
+            parallel_dims,
+            job_config.training.mixed_precision_param,
+            device_type,
+        )
 
         logger.info(
             "Trainer is initialized with "

@@ -400,8 +405,9 @@ def forward_backward_step(
         # Non-PP forward / backward
         with self.train_context(optional_context_parallel_ctx):
             assert len(model_parts) == 1
-            pred = model_parts[0](inputs)
-            loss = self.loss_fn(pred, labels)
+            with self.maybe_enable_amp:
+                pred = model_parts[0](inputs)
+                loss = self.loss_fn(pred, labels)
             # need to free to before bwd to avoid peaking memory
             del pred
             loss.backward()
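
One design note on the diff above: because maybe_enable_amp returns contextlib.nullcontext() whenever AMP should not be active, the training step can always enter the same with block without branching. A minimal sketch of that pattern with hypothetical names (not torchtitan code):

import contextlib
import torch

def pick_precision_context(use_amp: bool):
    # Mirrors the shape of maybe_enable_amp: a real autocast context or a no-op.
    if use_amp:
        return torch.autocast("cpu", dtype=torch.bfloat16)
    return contextlib.nullcontext()

ctx = pick_precision_context(use_amp=False)
with ctx:  # no-op here, so the matmul below runs in full precision
    out = torch.randn(2, 2) @ torch.randn(2, 2)
print(out.dtype)  # torch.float32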
