
Commit 03c850a

make float8 training's force_recompute_fp8_weight_in_bwd flag do nothing (#2356)

Summary: This PR makes the `Float8LinearConfig.force_recompute_fp8_weight_in_bwd` flag do nothing and marks it for future deprecation. Now that PyTorch Core can handle this logic automatically, we no longer need the workaround. Please see #2251 for more context.

Test Plan:

```
./test/float8/test_everything.sh
```
1 parent dd22777 · commit 03c850a

3 files changed: +15 −42 lines
benchmarks/float8/training/torchtitan_benchmark.sh

Lines changed: 1 addition & 1 deletion
```
@@ -29,7 +29,7 @@ fi
 # validate recipe name
 if [ -n "${FLOAT8_RECIPE_WITH_BEST_SETTINGS}" ]; then
     if [ "${FLOAT8_RECIPE_WITH_BEST_SETTINGS}" == "tensorwise" ]; then
-        FLOAT8_ARGS="--model.converters="float8" --float8.enable_fsdp_float8_all_gather --float8.precompute_float8_dynamic_scale_for_fsdp --float8.force_recompute_fp8_weight_in_bwd"
+        FLOAT8_ARGS="--model.converters="float8" --float8.enable_fsdp_float8_all_gather --float8.precompute_float8_dynamic_scale_for_fsdp"
     else
         FLOAT8_ARGS="--model.converters="float8" --float8.recipe_name=${FLOAT8_RECIPE_WITH_BEST_SETTINGS}"
     fi
```

torchao/float8/config.py

Lines changed: 5 additions & 20 deletions
```
@@ -192,20 +192,9 @@ class Float8LinearConfig:
     # If True, emulation is used instead of hardware accelerated gemm
     emulate: bool = False
 
-    # If the option is enabled, fp8_weight will always be re-computed in backward.
-    # It's recommended to enable this flag when using FSDP.
-    # Otherwise, the entire fp8_weight, instead of the sharded weight may be saved.
-    # If using outer activation checkpointing context or SAC, you may disable this option
-    # and handle the recomputation of fp8 weight in your customized AC context.
-    #
-    # Details:
-    # When using float8 training with FSDP, the original weight is sharded; fp8_weight (in forward) and fp8_weight_transpose (in backward) are used by the model.
-    # However, when partitioning the forward_backward graph, torch.compile may decide to
-    # save the fp8_weight_transpose for backward, which is an un-sharded weight and costs a high memory utilization.
-    # The longer-term solution is to let compile decide how to partition the graph with optimal computation and memory savings.
-    # For now, we use the checkpointing api to force the recomputation of fp8 weight in backward.
-    # TODO(future PR): either enable by default or have a warning and set up the
-    # tests so that the warning does not spam the CI stdout.
+    # This flag is deprecated and currently has no effect. It will be removed
+    # in a future release. Please see https://github.com/pytorch/ao/issues/2251
+    # for more context.
     force_recompute_fp8_weight_in_bwd: bool = False
 
     # If this option is enabled, the scaling factor used for float8 quantization
@@ -278,13 +267,9 @@ def __post_init__(self):
                 f"{operand_name} must be cast to the same dtype in both matmuls it's used in"
             )
 
-        # See the comments around `force_recompute_fp8_weight_in_bwd` for more details of this warning.
-        if (
-            self.enable_fsdp_float8_all_gather
-            and not self.force_recompute_fp8_weight_in_bwd
-        ):
+        if self.force_recompute_fp8_weight_in_bwd:
             logger.warning(
-                "When using FSDP, it's recommended to enable config.force_recompute_fp8_weight_in_bwd."
+                "`config.force_recompute_fp8_weight_in_bwd` is deprecated and will be removed in a future release. Please see https://github.com/pytorch/ao/issues/2251 for more details."
             )
 
     @staticmethod
```
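To illustrate what the deprecated flag now does (a minimal sketch based on the hunk above, not code from this commit): constructing a config with the flag set only logs the new deprecation warning and changes nothing else.

```
# Minimal sketch (not part of this commit): after this change, setting the
# deprecated flag only triggers the warning added in __post_init__ above.
from torchao.float8.config import Float8LinearConfig

config = Float8LinearConfig(force_recompute_fp8_weight_in_bwd=True)
# Expected log output, per the warning added in this commit:
#   `config.force_recompute_fp8_weight_in_bwd` is deprecated and will be
#   removed in a future release. Please see
#   https://github.com/pytorch/ao/issues/2251 for more details.
```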

torchao/float8/float8_linear.py

Lines changed: 9 additions & 21 deletions
```
@@ -10,7 +10,6 @@
 from typing import Optional
 
 import torch
-import torch.utils.checkpoint as checkpoint
 
 from torchao.float8.config import Float8LinearConfig, ScalingGranularity, ScalingType
 from torchao.float8.distributed_utils import tensor_already_casted_to_fp8
@@ -325,29 +324,18 @@ def forward(self, input: torch.Tensor) -> torch.Tensor:
         # TODO(future PR): check for axiswise scaling for input, weight,
         # grad_output separately instead of together
         if not has_any_axiswise_scaling:
-            # If force_recompute_fp8_weight_in_bwd, we only recompute the fp8 weight,
-            # weight_scale should be saved.
+            # TODO(future PR): now that `force_recompute_fp8_weight_in_bwd` is
+            # deprecated, we can simplify the below code and unify the per-tensor
+            # and per-axis paths further.
             weight_scale = _get_weight_scale(
                 self.weight, self.scaling_type_weight, self.config
             )
-
-            if self.config.force_recompute_fp8_weight_in_bwd:
-                weight_fp8_t = checkpoint.checkpoint(
-                    _cast_weight_to_float8_t,
-                    self.weight,
-                    self.config,
-                    self.linear_mm_config,
-                    weight_scale,
-                )
-            else:
-                weight_fp8_t = _cast_weight_to_float8_t(
-                    self.weight,
-                    self.config,
-                    self.linear_mm_config,
-                    weight_scale,
-                )
-
-            weight_maybe_fp8_t = weight_fp8_t
+            weight_maybe_fp8_t = _cast_weight_to_float8_t(
+                self.weight,
+                self.config,
+                self.linear_mm_config,
+                weight_scale,
+            )
 
         output = matmul_with_hp_or_float8_args.apply(
             input,
```
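For context on the workaround being deleted above (a generic, self-contained sketch, not torchao code): wrapping a cast in `torch.utils.checkpoint.checkpoint` tells autograd to recompute the cast output during backward instead of saving it, which is how the old `force_recompute_fp8_weight_in_bwd=True` path kept the casted weight from being stashed for backward.

```
# Generic sketch of the removed pattern; `cast_weight` is a stand-in for
# `_cast_weight_to_float8_t` and is not real torchao code.
import torch
import torch.utils.checkpoint as checkpoint

def cast_weight(w: torch.Tensor) -> torch.Tensor:
    # Some cast whose output we do not want autograd to save for backward.
    return w.to(torch.bfloat16).t()

w = torch.randn(128, 128, requires_grad=True)
x = torch.randn(4, 128, dtype=torch.bfloat16)

# Old-style workaround: the cast output is recomputed during backward rather
# than saved, trading a small amount of compute for lower peak memory.
w_cast_t = checkpoint.checkpoint(cast_weight, w, use_reentrant=False)
out = x @ w_cast_t
out.sum().backward()
```

Per the commit summary, this manual recompute is no longer needed because PyTorch Core now handles the decision automatically (see #2251).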
