
Commit 2843388

fix torchtitan + float8 + delayed + compile (#1334)
Summary: At some point torchtitan + delayed scaling + compile broke; this PR fixes it by switching to functional collectives for the amax all-reduce. It would be great to add a local repro; will follow up offline on what could be missing in our current test coverage.

Test Plan:

```
// torchtitan run which is fixed by this PR
with-proxy CONFIG_FILE="./train_configs/debug_model.toml" ./run_llama_train.sh --float8.enable_float8_linear --training.compile --float8.scaling_type_input delayed --float8.scaling_type_weight delayed --float8.scaling_type_grad_output delayed

// error message without this PR: https://gist.github.com/vkuzo/dbf54cf4027fd49bfb8095d518c618af
```

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent b2e42ff commit 2843388

File tree

1 file changed: +6 −1 lines changed

torchao/float8/float8_utils.py

Lines changed: 6 additions & 1 deletion
```diff
@@ -8,6 +8,7 @@
 
 import torch
 import torch.distributed as dist
+from torch.distributed._functional_collectives import AsyncCollectiveTensor, all_reduce
 
 from torchao.float8.config import Float8TypeConfig, ScalingGranularity
 
@@ -109,7 +110,11 @@ def tensor_to_amax(
     # happen elsewhere.
     if reduce_amax and dist.is_initialized():
         pg = device_mesh.get_group() if device_mesh is not None else None
-        dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=pg)
+        # dist.all_reduce(amax, op=dist.ReduceOp.MAX, group=pg)
+        group = list(range(dist.get_world_size())) if pg is None else pg
+        amax = all_reduce(amax, "MAX", group)
+        if isinstance(amax, AsyncCollectiveTensor):
+            amax = amax.wait()
 
     return amax
 
```
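
For context, the key difference is that `torch.distributed._functional_collectives.all_reduce` returns a result tensor (possibly an `AsyncCollectiveTensor`) instead of mutating its input in place, which is the form torch.compile can trace. Below is a minimal standalone sketch of the same pattern, not the torchao implementation; it assumes a `torchrun` launch, and the names `reduced_amax` and `amax_sketch.py` are illustrative.

```python
# Minimal sketch of the functional-collectives amax reduction pattern.
# Assumption: launched via `torchrun --nproc_per_node=2 amax_sketch.py`;
# `reduced_amax` is an illustrative name, not a torchao API.
import os

import torch
import torch.distributed as dist
from torch.distributed._functional_collectives import AsyncCollectiveTensor, all_reduce


def reduced_amax(x: torch.Tensor) -> torch.Tensor:
    # Local absolute max of the tensor.
    amax = torch.max(torch.abs(x))
    # Functional collective: returns a new tensor instead of mutating `amax`
    # in place, so the op can be traced by torch.compile.
    group = list(range(dist.get_world_size()))
    amax = all_reduce(amax, "MAX", group)
    if isinstance(amax, AsyncCollectiveTensor):
        # In eager mode the result is async; wait for it before using it.
        amax = amax.wait()
    return amax


if __name__ == "__main__":
    dist.init_process_group(backend="gloo")
    torch.manual_seed(int(os.environ["RANK"]))  # different data per rank
    x = torch.randn(16)
    print("eager:", reduced_amax(x))
    print("compiled:", torch.compile(reduced_amax)(x))
    dist.destroy_process_group()
```

Both calls should print the same cross-rank max on every rank; the compiled call is the configuration (delayed scaling + compile) that the in-place `dist.all_reduce` was breaking.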
