
Commit b65e513

float8 training: move module attribute setting to sync function (#1341)
Summary:

This PR moves the setting of the `is_amax_initialized` flag on `Float8Linear` to the `sync_float8_amax_and_scale_history` function. There are two reasons for this:

1. The current logic does not work with torchtitan + delayed scaling + AC, failing with https://gist.github.com/vkuzo/70819a2cffb9346bf44ecd9079b8bf51 .
2. In general, stateful logic such as changing module attributes adds complexity. Even if we fix (1) in compile land, something else could break.

The `sync_float8_amax_and_scale_history` function is already called outside of the main model forward/backward, it is already required to be called at every iteration, and it does not need to know about AC, which makes it a good place to keep logic that is not easily compilable, such as this init code.

After this PR, the `enable_amax_init` and `enable_pre_and_post_forward` config options are no-ops. A future PR should add a deprecation warning and eventually remove them.

Test Plan:

```
// this repo
./test/float8/test_everything.sh

// torchtitan
with-proxy CONFIG_FILE="./train_configs/llama3_8b.toml" ./run_llama_train.sh --float8.enable_float8_linear --training.compile --float8.scaling_type_input delayed --float8.scaling_type_weight delayed --float8.scaling_type_grad_output delayed
```

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent 8b1b168 commit b65e513
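
To make the new call pattern concrete, here is a hedged sketch of a delayed-scaling training step along the lines the commit message describes: the sync call already runs once per iteration, in eager mode, outside the (possibly compiled and/or activation-checkpointed) forward and backward, which is what makes it a convenient home for one-time init logic. The import path and the use of `torch.utils.checkpoint` below are illustrative assumptions, not part of this commit.

```python
import torch
from torchao.float8 import sync_float8_amax_and_scale_history  # assumed import path

def train_step(model, optimizer, x):
    # forward + backward; in the failing torchtitan case this region runs under
    # torch.compile and activation checkpointing
    y = torch.utils.checkpoint.checkpoint(model, x, use_reentrant=False)
    y.sum().backward()

    # delayed scaling only: runs once per iteration, in eager mode; after this PR,
    # the first call also sets `is_amax_initialized` on every Float8Linear
    sync_float8_amax_and_scale_history(model)

    optimizer.step()
    optimizer.zero_grad()
```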

File tree

5 files changed, +18 -17 lines


test/float8/test_base.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -265,13 +265,14 @@ def _test_linear_impl(
             config,
         )
         for _ in range(2):
-            if linear_requires_sync(config):
-                sync_float8_amax_and_scale_history(m_fp8)
             if use_ac:
                 y_fp8 = torch.utils.checkpoint.checkpoint(m_fp8, x, use_reentrant=False)
             else:
                 y_fp8 = m_fp8(x)
             y_fp8.sum().backward()
+            if linear_requires_sync(config):
+                sync_float8_amax_and_scale_history(m_fp8)
+
             if use_ac:
                 y_ref = torch.utils.checkpoint.checkpoint(m_ref, x, use_reentrant=False)
             else:
```

torchao/float8/README.md

Lines changed: 5 additions & 4 deletions
```diff
@@ -95,8 +95,6 @@ config = Float8LinearConfig(
     cast_config_input=CastConfig(scaling_type=ScalingType.DELAYED),
     cast_config_weight=CastConfig(scaling_type=ScalingType.DELAYED),
     cast_config_grad_output=CastConfig(scaling_type=ScalingType.DELAYED),
-    # enable_amax_init=False,  # only needed for autocast + compile + FSDP + float8 delayed
-    # enable_pre_and_post_forward=False  # only needed for autocast + compile + FSDP + float8 delayed
 )

 # convert all `torch.nn.Linear` modules to `Float8Linear`, specifying custom scaling behavior
@@ -111,8 +109,11 @@ for _ in range(10):
     y = m(x)
     y.sum().backward()

-    # specific to float8 with delayed scaling: separate step to sync scales/amaxes
-    # in the future, this may move to a context manager
+    # Specific to delayed scaling: separate step to sync scales/amaxes.
+    # On the first call, this function also sets the `is_amax_initialized` flag to
+    # mark the amax and scale buffers as initialized.
+    # Make sure you run this after every model forward+backward pass.
+    # In the future, this may move to a context manager.
     sync_float8_amax_and_scale_history(m)

     optimizer.step()
```
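
As a follow-on to the README comment about the first sync call marking buffers as initialized, a hypothetical check (not part of the README or this PR) might look like the sketch below; the `Float8Linear` import path is an assumption.

```python
from torchao.float8.float8_linear import Float8Linear  # assumed import path

def assert_amax_initialized(model):
    # After the first sync_float8_amax_and_scale_history(model) call, every
    # Float8Linear child is expected to report is_amax_initialized == True.
    for module in model.modules():
        if isinstance(module, Float8Linear):
            assert module.is_amax_initialized
```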

torchao/float8/config.py

Lines changed: 4 additions & 7 deletions
```diff
@@ -180,15 +180,12 @@ class Float8LinearConfig:
     # Per-linear configuration
     #

-    # If True, on the first iteration of Float8Linear the amaxes will be
-    # initialized with the incoming data. As of 2023-12-30, this doesn't work
-    # with autocast + torch.compile + FSDP. Enabling this option is nice for
-    # testing, but this is not necessary for real training jobs.
+    # This configuration option is deprecated and no longer has an effect. It may
+    # be removed in a future release.
     enable_amax_init: bool = True

-    # If True, pre-forward and post-forward functions are run. As of 2023-12-30,
-    # this doesn't work with autocast + torch.compile + FSDP. Enabling this
-    # option is useful for safety, but not strictly necessary.
+    # This configuration option is deprecated and no longer has an effect. It may
+    # be removed in a future release.
     enable_pre_and_post_forward: bool = True

     # If True, then uses a tensor subclass for the float8 linear module's weight that
```
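
The commit message notes that a deprecation warning for these two options should land in a future PR. A hypothetical sketch of what that might look like (not part of this change; the dataclass below is a stand-in, not the real `Float8LinearConfig`):

```python
import warnings
from dataclasses import dataclass

@dataclass
class Float8LinearConfigSketch:
    # both options are now no-ops and may be removed in a future release
    enable_amax_init: bool = True
    enable_pre_and_post_forward: bool = True

    def __post_init__(self):
        if not self.enable_amax_init or not self.enable_pre_and_post_forward:
            warnings.warn(
                "enable_amax_init and enable_pre_and_post_forward are deprecated "
                "no-ops and may be removed in a future release",
                FutureWarning,
            )
```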

torchao/float8/float8_linear.py

Lines changed: 0 additions & 1 deletion
```diff
@@ -545,7 +545,6 @@ def float8_post_forward(self):
         # config setting
         if not self.enable_pre_and_post_forward:
             return
-        self.is_amax_initialized = True

     def forward_fp8_matmul(self, input: torch.Tensor) -> torch.Tensor:
         has_any_axiswise_scaling = (
```

torchao/float8/float8_linear_utils.py

Lines changed: 6 additions & 3 deletions
```diff
@@ -193,6 +193,9 @@ def sync_float8_amax_and_scale_history(model: torch.nn.Module, fp8_layers=None)
     and we loop over all fp8_layers to sync and update amax scale histories.
     Users can use get_float8_layers to get all fp8 layers.
     """
+    # TODO(future): consider adding a flag to control setting the `is_amax_initialized`
+    # flag only on the first iteration.
+
     if fp8_layers is None:
         fp8_layers = get_float8_layers(model)

@@ -309,10 +312,10 @@ def inner_func():
             child.fp8_scale_weight.copy_(new_weight_scales[idx])
             child.fp8_scale_grad_output.copy_(new_grad_output_scales[idx])

-    # This allows for the compile to succede on the inner func and fail on the graph breaks
+    # This allows for the compile to succeed on the inner func and fail on the graph breaks
     # at the beginning and and of syncing
    inner_func()

    for child in fp8_layers:
-        # Set a flag to signal amaxes/scales are ready
-        child.amax_and_scale_synced = True
+        # Set a flag to signal that initialization is done
+        child.is_amax_initialized = True
```
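
The comment about compiling `inner_func` reflects a general pattern: keep tensor-only buffer updates inside the compiled region and do Python attribute mutation in eager mode around it. Below is a self-contained toy sketch of that split, not the actual torchao implementation; the buffer name and class are made up for illustration.

```python
import torch

class ToyFp8Layer(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.register_buffer("fp8_scale_weight", torch.ones(1))
        self.is_amax_initialized = False

def toy_sync(fp8_layers, new_weight_scales):
    @torch.compile
    def inner_func():
        # tensor-only work: copying freshly computed scales into buffers
        for layer, new_scale in zip(fp8_layers, new_weight_scales):
            layer.fp8_scale_weight.copy_(new_scale)

    inner_func()

    # stateful Python work stays in eager mode, outside the compiled region
    for layer in fp8_layers:
        layer.is_amax_initialized = True

layers = [ToyFp8Layer() for _ in range(2)]
toy_sync(layers, [torch.tensor([2.0]), torch.tensor([3.0])])
```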
