@@ -11,11 +11,13 @@
 import torch
 
 from torchao.float8.config import Float8LinearConfig
-from torchao.float8.float8_linear import manual_float8_matmul_with_args_in_float8
 from torchao.float8.float8_tensor import GemmInputRole, LinearMMConfig, ScaledMMConfig
 from torchao.prototype.float8nocompile.float8nocompile_scaling_utils import (
-    Float8NoCompileConversionFunc,
-    NoopFwToFloat8NoCompileBwDynamic,
+    ToFP8ColumnMajor,
+    ToFP8ColumnMajorT,
+    ToFP8RowAndColumnMajor,
+    ToFP8RowMajor,
+    ToFP8RowMajorT,
 )
 from torchao.prototype.float8nocompile.kernels.fp8_dynamic_tensorwise import (
     KernelAlgorithm,
@@ -69,53 +71,14 @@ def __init__(self, *args, **kwargs):
 
     def forward(self, input: torch.Tensor) -> torch.Tensor:
         # TODO(danielvegamyhre): support for FSDP once dependencies are implemented
-        input_fp8 = self.cast_input_to_float8(input)
-        weight_fp8_t = self.cast_weight_to_float8_t(self.weight)
-
-        # compute fp8 matmul
-        output = manual_float8_matmul_with_args_in_float8.apply(input_fp8, weight_fp8_t)
-
-        # cast grad_output to float8_e5m2 during backward
-        return self.cast_output_to_float8_in_bw(output)
-
-    def cast_input_to_float8(self, input: torch.Tensor) -> torch.Tensor:
-        # Duplicate the autocast logic for F.linear, so that the output
-        # of our module has the right original precision
-        if torch.is_autocast_enabled():
-            # For now, hardcode to GPU's autocast dtype
-            # if we need CPU support in the future, we can add it
-            autocast_dtype = torch.get_autocast_gpu_dtype()
-            input = input.to(autocast_dtype)
-
-        return Float8NoCompileConversionFunc.apply(
+        output = matmul_with_args_in_hp.apply(
             input,
-            self.config.cast_config_input.target_dtype,
-            self.linear_mm_config,
-            GemmInputRole.INPUT,
-            self.kernel_algo,
-        )
-
-    def cast_weight_to_float8_t(
-        self,
-        weight: torch.Tensor,
-    ) -> torch.Tensor:
-        weight_fp8 = Float8NoCompileConversionFunc.apply(
-            weight,
-            self.config.cast_config_weight.target_dtype,
-            self.linear_mm_config,
-            GemmInputRole.WEIGHT,
-            self.kernel_algo,
-        )
-        return weight_fp8.t()
-
-    def cast_output_to_float8_in_bw(self, output: torch.Tensor) -> torch.Tensor:
-        # casts grad_output to float8_e5m2 for backward
-        return NoopFwToFloat8NoCompileBwDynamic.apply(
-            output,
-            self.config.cast_config_grad_output.target_dtype,
+            self.weight,
+            self.config,
             self.linear_mm_config,
             self.kernel_algo,
         )
+        return output
 
     @classmethod
     def from_float(cls, mod, kernel_algo: KernelAlgorithm = KernelAlgorithm.ATOMIC_MAX):
@@ -140,3 +103,68 @@ def from_float(cls, mod, kernel_algo: KernelAlgorithm = KernelAlgorithm.ATOMIC_MAX):
 
         # TODO(danielvegamyhre): support for FSDP once dependencies are implemented
         return new_mod
+
+
+class matmul_with_args_in_hp(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, input_hp, weight_hp, config, linear_mm_config, kernel_algo):
+        # output = input @ weight_t
+        input_fp8_row_major, input_fp8_col_major = ToFP8RowAndColumnMajor.apply(
+            input_hp,
+            config.cast_config_input.target_dtype,
+            linear_mm_config,
+            GemmInputRole.INPUT,
+            kernel_algo,
+        )
+        weight_t_fp8_col_major = ToFP8ColumnMajorT.apply(
+            weight_hp,
+            config.cast_config_weight.target_dtype,
+            linear_mm_config,
+            GemmInputRole.WEIGHT,
+            kernel_algo,
+        )
+        output = torch.mm(input_fp8_row_major, weight_t_fp8_col_major)
+
+        # save data for backward before returning
+        ctx.save_for_backward(input_fp8_col_major, weight_hp)
+        ctx.config = config
+        ctx.linear_mm_config = linear_mm_config
+        ctx.kernel_algo = kernel_algo
+
+        return output
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        input_fp8_col_major, weight_hp = ctx.saved_tensors
+
+        # cast grad output to float8_e5m2 for backward
+        grad_output_fp8_row_major = ToFP8RowMajor.apply(
+            grad_output,
+            ctx.config.cast_config_grad_output.target_dtype,
+            ctx.linear_mm_config,
+            GemmInputRole.GRAD_OUTPUT,
+            ctx.kernel_algo,
+        )
+
+        # grad_input = grad_output @ weight
+        weight_fp8_col_major = ToFP8ColumnMajor.apply(
+            weight_hp,
+            ctx.config.cast_config_weight.target_dtype,
+            ctx.linear_mm_config,
+            GemmInputRole.WEIGHT,
+            ctx.kernel_algo,
+        )
+        grad_input = torch.mm(grad_output_fp8_row_major, weight_fp8_col_major)
+
+        # grad_weight = grad_output_t @ input
+        # apparently this variant is slightly faster than `grad_weight_t = input_t @ grad_output`
+        # source: https://github.com/pytorch/ao/blob/fe5f11b2c58b452e01ba9ec7359629928b143619/torchao/float8/float8_linear.py#L84-L85
+        grad_output_t_row_major = ToFP8RowMajorT.apply(
+            grad_output,
+            ctx.config.cast_config_grad_output.target_dtype,
+            ctx.linear_mm_config,
+            GemmInputRole.GRAD_OUTPUT,
+            ctx.kernel_algo,
+        )
+        grad_weight = torch.mm(grad_output_t_row_major, input_fp8_col_major)
+        return grad_input, grad_weight, None, None, None
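For reference, a quick high-precision sanity check of the gradient formulas the new backward implements for output = input @ weight.t(): grad_input = grad_output @ weight and grad_weight = grad_output.t() @ input. This sketch uses plain PyTorch with no float8 casting, and the shapes and variable names are illustrative only, not taken from this PR.

# Sanity check (plain PyTorch, no float8): the same matmul shapes as
# matmul_with_args_in_hp, verified against autograd.
import torch

M, K, N = 16, 32, 64
input = torch.randn(M, K, requires_grad=True)
weight = torch.randn(N, K, requires_grad=True)  # nn.Linear-style (out, in) layout

output = torch.mm(input, weight.t())
grad_output = torch.randn_like(output)
output.backward(grad_output)

# grad_input = grad_output @ weight, grad_weight = grad_output.t() @ input
assert torch.allclose(input.grad, grad_output @ weight, atol=1e-5)
assert torch.allclose(weight.grad, grad_output.t() @ input, atol=1e-5)

The fp8 version above computes the same two products, but feeds torch.mm operands that were pre-cast into the row-major / column-major layouts each GEMM expects, which is why the input is saved in column-major form for the grad_weight matmul.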