1 file changed, 2 insertions(+), 2 deletions(-)

@@ -59,7 +59,7 @@ def precompute_float8_dynamic_scale_for_fsdp(module: nn.Module) -> None:
         return
 
     # inf-norm is equivalent to max(abs(w))
-    max_weights = torch._foreach_norm(weights, ord=math.inf, dtype=torch.float32)  # Partial
+    max_weights = torch._foreach_norm(weights, ord=math.inf)  # Partial
     amax_tensor = torch.stack(max_weights)  # Partial
     # clamp is dispatched through DTensor
     # it will issue a single all-reduce
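(Aside, not from the PR: a quick sanity check of the comment's claim and of the dtype behavior that motivates removing dtype=torch.float32 here. Without an explicit dtype, torch._foreach_norm returns results in the inputs' dtype; the bf16 weight shapes below are an illustrative assumption.)

import math
import torch

# hypothetical bf16 shards standing in for the FSDP local weights
weights = [torch.randn(8, 8, dtype=torch.bfloat16) for _ in range(3)]

# inf-norm is equivalent to max(abs(w)) per tensor
max_weights = torch._foreach_norm(weights, ord=math.inf)
for m, w in zip(max_weights, weights):
    assert torch.equal(m, w.abs().max())

# without dtype=torch.float32, the result keeps the weights' dtype,
# which is why the second hunk casts the stored scale back to fp32
assert all(m.dtype == torch.bfloat16 for m in max_weights)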
@@ -69,7 +69,7 @@ def precompute_float8_dynamic_scale_for_fsdp(module: nn.Module) -> None:
         scale_tensor = torch.clamp(scale_tensor, max=torch.finfo(torch.float16).max)
     local_scale_tensor = scale_tensor.to_local()
     for i, float8_linear in enumerate(float8_linears):
-        float8_linear.weight._local_tensor._precomputed_scale = local_scale_tensor[i]
+        float8_linear.weight._local_tensor._precomputed_scale = local_scale_tensor[i].to(torch.float32)
 
 
 # FSDP pads its local tensor on dim-0. The subclass should be preserved such
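Taken together, the two hunks move the fp32 cast from the norm computation to the final scale assignment: the amax and scale math can now run in the weights' dtype, and only the stored _precomputed_scale is cast to fp32. A minimal sketch of that scale path under assumed values (the EPS constant and the bf16 amax below are illustrative stand-ins for what the surrounding file computes):

import torch

EPS = 1e-12  # assumed epsilon; stands in for the clamp floor in the real file

# amax as _foreach_norm now produces it: in the weights' dtype (bf16 here)
amax = torch.tensor(3.0, dtype=torch.bfloat16)
amax = torch.clamp(amax, EPS)

# scale = max representable float8 e4m3 value / amax, still bf16 at this point
scale = torch.finfo(torch.float8_e4m3fn).max / amax
assert scale.dtype == torch.bfloat16

# the second hunk's fix: cast back to fp32 when storing the precomputed scale
precomputed_scale = scale.to(torch.float32)
assert precomputed_scale.dtype == torch.float32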