Commit dfe72c4

[float8] Bug fix: do not override requires_grad=False when enable_float8_all_gather=True (#1873)
1 parent 711fa08 commit dfe72c4
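
For context on the root cause: torch.nn.Parameter defaults requires_grad to True regardless of the flag on the tensor it wraps, so re-wrapping a frozen weight during float8 conversion silently makes it trainable again unless the flag is forwarded explicitly. A minimal sketch of that behavior (toy names, plain PyTorch, no torchao involved):

import torch
import torch.nn as nn

lin = nn.Linear(8, 8)
lin.weight.requires_grad = False  # the user freezes the weight

# Re-wrapping without forwarding the flag falls back to the default True,
# which is the bug: the frozen weight silently becomes trainable again.
rewrapped = nn.Parameter(lin.weight.detach())
assert rewrapped.requires_grad is True

# The fix forwards the original flag when constructing the new Parameter.
rewrapped = nn.Parameter(lin.weight.detach(), requires_grad=lin.weight.requires_grad)
assert rewrapped.requires_grad is False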

3 files changed: +30, -5 lines changed

test/float8/test_fsdp2/test_fsdp2.py

Lines changed: 17 additions & 2 deletions

@@ -67,7 +67,10 @@ def init_multi_module(self) -> nn.Module:
         return module
 
     def init_transformer(
-        self, weight_tying: bool, dtype: Optional[torch.dtype] = None
+        self,
+        weight_tying: bool,
+        dtype: Optional[torch.dtype] = None,
+        requires_grad: bool = True,
     ) -> nn.Module:
         torch.manual_seed(42)
         args = ModelArgs(
@@ -81,6 +84,13 @@ def init_transformer(
         module = Transformer(args).cuda()
         if dtype is not None:
             module = module.to(dtype=dtype)
+
+        # if requires_grad=False, just set requires_grad to False
+        # in the first layer to ensure we still train some params.
+        if requires_grad is False:
+            for param in module.layers[0].parameters():
+                param.requires_grad = requires_grad
+
         self.broadcast_module(module)
         return module
 
@@ -107,6 +117,7 @@ def test_transformer_parity(self):
                 ],
                 "compile_transformer_block": [False, True],
                 "dtype": [torch.float32, torch.bfloat16],
+                "requires_grad": [True, False],
             },
             self._test_transformer_parity,
         )
@@ -117,6 +128,7 @@ def _test_transformer_parity(
         precompute: bool,
         scaling_type_weight: ScalingType,
         compile_transformer_block: bool,
+        requires_grad: bool,
         dtype: Optional[torch.dtype] = None,
     ):
         if not enable_fsdp_float8_all_gather and precompute:
@@ -127,7 +139,10 @@ def _test_transformer_parity(
         # latter uses fp8 compute. With fp8 all-gather, FSDP would pre-cast to
         # fp8 for that tied weight, incorrectly using fp8 for the embedding.
         weight_tying = not enable_fsdp_float8_all_gather
-        module = self.init_transformer(weight_tying=weight_tying, dtype=dtype)
+        module = self.init_transformer(
+            weight_tying=weight_tying, dtype=dtype, requires_grad=requires_grad
+        )
+
         ref_module = copy.deepcopy(module)
         float8_linear_config1 = Float8LinearConfig(
             cast_config_weight=CastConfig(scaling_type=scaling_type_weight),
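
The added init_transformer(requires_grad=False) path freezes only the first transformer block so the test still trains some parameters. A standalone sketch of that partial-freeze pattern, using a toy nn.Sequential in place of the test's Transformer:

import torch.nn as nn

# Toy stand-in for the test's Transformer: a stack of blocks.
module = nn.Sequential(*[nn.Linear(8, 8) for _ in range(3)])

# Freeze only the first block, mirroring init_transformer(requires_grad=False).
for param in module[0].parameters():
    param.requires_grad = False

frozen = [n for n, p in module.named_parameters() if not p.requires_grad]
trainable = [n for n, p in module.named_parameters() if p.requires_grad]
assert frozen and trainable  # mixed frozen and trainable params, as in the test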

torchao/float8/float8_linear.py

Lines changed: 2 additions & 1 deletion

@@ -416,7 +416,8 @@ def from_float(
                         new_mod.weight,
                         new_mod.linear_mm_config,
                         new_mod.config.cast_config_weight.target_dtype,
-                    )
+                    ),
+                    requires_grad=new_mod.weight.requires_grad,
                 )
 
         return new_mod
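
A hedged sketch of how the fixed behavior surfaces through torchao's float8 conversion path, which calls from_float under the hood. The entry point convert_to_float8_training, its import path, and the toy model are assumptions and may differ across torchao versions; Float8LinearConfig and enable_fsdp_float8_all_gather come from the test above. Assumes a CUDA device:

import torch.nn as nn

# Assumed public API; verify against the installed torchao version.
from torchao.float8 import Float8LinearConfig, convert_to_float8_training

model = nn.Sequential(nn.Linear(64, 64), nn.Linear(64, 64)).cuda()
for param in model[0].parameters():
    param.requires_grad = False  # freeze the first linear only

config = Float8LinearConfig(enable_fsdp_float8_all_gather=True)
convert_to_float8_training(model, config=config)

# With this commit, the float8 all-gather weight wrapper preserves the flag.
assert not model[0].weight.requires_grad
assert model[1].weight.requires_grad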

torchao/testing/float8/fsdp2_utils.py

Lines changed: 11 additions & 2 deletions

@@ -22,6 +22,14 @@ def check_parity_no_mp(
     precompute: bool = False,
     compile_transformer_block: bool = False,
 ):
+    # check that requires_grad matches ref module
+    for ref_param, fsdp_param in zip(ref_model.parameters(), fsdp_model.parameters()):
+        test_cls.assertEqual(
+            ref_param.requires_grad,
+            fsdp_param.requires_grad,
+            msg=f"ref_param.requires_grad: {ref_param.requires_grad}, fsdp_param.requires_grad: {fsdp_param.requires_grad}",
+        )
+
     # TODO(before land): reorder args and make config not optional
     for iter_idx in range(10):
         losses: List[torch.Tensor] = []
@@ -31,8 +39,9 @@ def check_parity_no_mp(
             losses[-1].backward()
             if model is ref_model:
                 for param in model.parameters():
-                    dist.all_reduce(param.grad)
-                    param.grad.div_(dist.get_world_size())
+                    if param.requires_grad:
+                        dist.all_reduce(param.grad)
+                        param.grad.div_(dist.get_world_size())
 
             optim.step()
             if (
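
The new if param.requires_grad: guard in the reference-model path is needed because frozen parameters never accumulate a gradient, so param.grad is None and all-reducing it would fail. A small standalone illustration:

import torch
import torch.nn as nn

m = nn.Linear(4, 4)
m.weight.requires_grad = False  # frozen parameter

m(torch.randn(2, 4)).sum().backward()

assert m.weight.grad is None     # frozen params get no gradient, so
assert m.bias.grad is not None   # dist.all_reduce(param.grad) must skip them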
