
Commit 72bc113 (1 parent: 00417b8)

Add support for float8 activation for Int4GroupwisePreshuffleTensor

Summary:
Added basic op support (linear and bmm). Both float8 and bf16 activation are handled by the same Tensor subclass, since the weight dtype is the same; the only difference is whether the activation is quantized or not. There are some differences in implementation, though:

bf16 activation:
* group_scale
* group_zero

fp8 activation:
* group_scale
* row_scale

Test Plan:
python test/dtypes/test_float8_activation_int4_groupwise_preshuffle.py

Reviewers:

Subscribers:

Tasks:

Tags:

stack-info: PR: #2437, branch: jerryzh168/stack/4
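As a rough usage sketch of what this commit enables (based on the test configs in the diff below; the variable names are illustrative, and running it needs a CUDA device with the fbgemm kernels available), the activation path is selected purely through FbgemmConfig.input_dtype:

# Sketch only: mirrors the configs used in the tests in this commit; exact
# import paths and flags are taken from this diff and may change later.
import torch
from torchao.float8.config import e4m3_dtype
from torchao.quantization import FbgemmConfig, quantize_

linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")

# bf16 activation: int4 groupwise preshuffled weight, activation stays bf16
bf16_act_config = FbgemmConfig(
    input_dtype=torch.bfloat16,
    weight_dtype=torch.int4,
    output_dtype=torch.bfloat16,
    block_size=[1, 128],
    preshuffle=True,
)

# fp8 activation: same weight layout, but the activation is quantized to e4m3
fp8_act_config = FbgemmConfig(
    input_dtype=e4m3_dtype,
    weight_dtype=torch.int4,
    output_dtype=torch.bfloat16,
    block_size=[1, 128],
    preshuffle=True,
)

quantize_(linear, fp8_act_config)
out = linear(torch.randn(1, 128, dtype=torch.bfloat16, device="cuda"))

In both cases the weight ends up as the same Int4GroupwisePreshuffleTensor; only the stored scales differ (group_scale/group_zero vs group_scale/row_scale), per the summary above.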

5 files changed: +246 additions, -77 deletions

test/quantization/quantize_/test_int4_groupwise_preshuffle.py renamed to test/quantization/quantize_/int4/test_int4_groupwise_preshuffle_tensor.py

Lines changed: 71 additions & 24 deletions
@@ -4,14 +4,18 @@
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
 
+import tempfile
 import unittest
 
 import torch
 from torch.testing._internal.common_utils import (
     TestCase,
+    instantiate_parametrized_tests,
+    parametrize,
     run_tests,
 )
 
+from torchao.float8.config import e4m3_dtype
 from torchao.quantization import (
     FbgemmConfig,
     quantize_,
@@ -23,6 +27,45 @@
     is_sm_at_least_90,
 )
 
+if TORCH_VERSION_AT_LEAST_2_8:
+    BF16_ACT_CONFIG = FbgemmConfig(
+        input_dtype=torch.bfloat16,
+        weight_dtype=torch.int4,
+        output_dtype=torch.bfloat16,
+        block_size=[1, 128],
+        preshuffle=True,
+    )
+
+    BF16_ACT_BMM_CONFIG = FbgemmConfig(
+        input_dtype=torch.bfloat16,
+        weight_dtype=torch.int4,
+        output_dtype=torch.bfloat16,
+        block_size=[1, 1, 128],
+        preshuffle=True,
+    )
+
+    FP8_ACT_CONFIG = FbgemmConfig(
+        input_dtype=e4m3_dtype,
+        weight_dtype=torch.int4,
+        output_dtype=torch.bfloat16,
+        block_size=[1, 128],
+        preshuffle=True,
+    )
+
+    FP8_ACT_BMM_CONFIG = FbgemmConfig(
+        input_dtype=e4m3_dtype,
+        weight_dtype=torch.int4,
+        output_dtype=torch.bfloat16,
+        block_size=[1, 1, 128],
+        preshuffle=True,
+    )
+
+else:
+    BF16_ACT_CONFIG = None
+    BF16_ACT_BMM_CONFIG = None
+    FP8_ACT_CONFIG = None
+    FP8_ACT_BMM_CONFIG = None
+
 
 @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_8, "Need pytorch 2.8+")
 @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
@@ -32,33 +75,23 @@
 )
 class TestInt4GroupwisePreshuffleTensor(TestCase):
     def setUp(self):
-        self.config = FbgemmConfig(
-            input_dtype=torch.bfloat16,
-            weight_dtype=torch.int4,
-            output_dtype=torch.bfloat16,
-            block_size=[1, 128],
-            preshuffle=True,
-        )
-        self.bmm_config = FbgemmConfig(
-            input_dtype=torch.bfloat16,
-            weight_dtype=torch.int4,
-            output_dtype=torch.bfloat16,
-            block_size=[1, 1, 128],
-            preshuffle=True,
-        )
         self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []
 
-    def test_linear(self):
+    @parametrize("config", [BF16_ACT_CONFIG, FP8_ACT_CONFIG])
+    def test_linear(self, config):
         dtype = torch.bfloat16
         device = "cuda"
         input = torch.randn(1, 128, dtype=dtype, device=device)
         linear = torch.nn.Linear(128, 256, dtype=dtype, device=device)
         original = linear(input)
-        quantize_(linear, self.config)
+        quantize_(linear, config)
         quantized = linear(input)
         self.assertTrue(compute_error(original, quantized) > 20)
 
-    def test_bmm(self):
+    # Note: this order will error out: `Got bad cuda status: an illegal memory access was encountered at line: 449`
+    # @parametrize("bmm_config", [BF16_ACT_BMM_CONFIG, FP8_ACT_BMM_CONFIG])
+    @parametrize("bmm_config", [FP8_ACT_BMM_CONFIG, BF16_ACT_BMM_CONFIG])
+    def test_bmm(self, bmm_config):
         class M(torch.nn.Module):
             def __init__(self, weight):
                 super().__init__()
@@ -74,32 +107,46 @@ def forward(self, x):
         m = M(weight).eval()
         original = m(input)
         m.weight = torch.nn.Parameter(m.weight.transpose(1, 2).contiguous())
-        quantize_(m, self.bmm_config, filter_fn=lambda x, fqn: True)
+        quantize_(m, bmm_config, filter_fn=lambda x, fqn: True)
         quantized = m(input)
         self.assertTrue(compute_error(original, quantized) > 18)
 
-    def test_to_device(self):
+    @parametrize("config", [BF16_ACT_CONFIG, FP8_ACT_CONFIG])
+    def test_to_device(self, config):
         for device in self.GPU_DEVICES:
             linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, self.config)
+            quantize_(linear, config)
             linear.to(device)
 
             linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, self.config)
+            quantize_(linear, config)
             linear.to(device=device)
 
             linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-            quantize_(linear, self.config)
+            quantize_(linear, config)
             linear.to(device)
 
-    def test_module_path(self):
+    @parametrize("config", [BF16_ACT_CONFIG, FP8_ACT_CONFIG])
+    def test_module_path(self, config):
         linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
-        quantize_(linear, self.config)
+        quantize_(linear, config)
         self.assertEqual(
             str(type(linear.weight)),
             "<class 'torchao.quantization.Int4GroupwisePreshuffleTensor'>",
         )
 
+        with tempfile.NamedTemporaryFile() as f:
+            torch.save(linear.state_dict(), f)
+            f.seek(0)
+            state_dict = torch.load(f)
+            self.assertEqual(
+                str(type(state_dict["weight"])),
+                "<class 'torchao.quantization.Int4GroupwisePreshuffleTensor'>",
+            )
+
+
+instantiate_parametrized_tests(TestInt4GroupwisePreshuffleTensor)
+
 
 if __name__ == "__main__":
     run_tests()

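The renamed test module still calls run_tests() under __main__, so an assumed invocation (requires CUDA with SM90+; the pytest -k filter is just one way to select a single parametrized case):

python test/quantization/quantize_/int4/test_int4_groupwise_preshuffle_tensor.py
pytest test/quantization/quantize_/int4/test_int4_groupwise_preshuffle_tensor.py -k test_linear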
torchao/quantization/quant_api.py

Lines changed: 19 additions & 1 deletion
@@ -2040,6 +2040,8 @@ class FbgemmConfig(AOBaseConfig):
         weight_dtype (torch.dtype): weight dtype of the kernel
         output_dtype (torch.dtype): output dtype of the kernel
         group_size (int): The group size for weight
+        preshuffle (bool): whether preshuffle the weights or not
+        activation_dtype_for_int4 (str): the dtype for activation for int4 weight, either bf16 or fp8
     """
 
     input_dtype: torch.dtype
@@ -2067,7 +2069,9 @@ def _(module: torch.nn.Module, config: FbgemmConfig) -> torch.nn.Module:
     ):
         if config.preshuffle:
             weight = Int4GroupwisePreshuffleTensor.from_float(
-                module.weight, config.block_size
+                module.weight,
+                config.block_size,
+                activation_dtype="bf16",
             )
         else:
             weight = to_fbgemm_int4(
@@ -2077,6 +2081,20 @@ def _(module: torch.nn.Module, config: FbgemmConfig) -> torch.nn.Module:
         module.weight = torch.nn.Parameter(weight, requires_grad=False)
         module.extra_repr = types.MethodType(_linear_extra_repr, module)
         return module
+    if (
+        (config.input_dtype == e4m3_dtype)
+        and (config.weight_dtype == torch.int4)
+        and (config.output_dtype == torch.bfloat16)
+    ):
+        if config.preshuffle:
+            weight = Int4GroupwisePreshuffleTensor.from_float(
+                module.weight,
+                config.block_size,
+                activation_dtype="fp8",
+            )
+            module.weight = torch.nn.Parameter(weight, requires_grad=False)
+            module.extra_repr = types.MethodType(_linear_extra_repr, module)
+            return module
     elif (
         (config.input_dtype == e4m3_dtype)
         and (config.weight_dtype == e4m3_dtype)

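The handler above ultimately just calls Int4GroupwisePreshuffleTensor.from_float on the module weight with an activation_dtype flag; the dispatch on config.input_dtype (torch.bfloat16 vs e4m3_dtype) decides whether "bf16" or "fp8" is passed. A minimal sketch of that call, assuming the (weight, block_size, activation_dtype) signature shown in this diff and a CUDA device:

# Sketch, assuming the from_float signature used by the handler above
# (weight, block_size, activation_dtype); not an independently documented API.
import torch
from torchao.quantization import Int4GroupwisePreshuffleTensor  # public path per the tests

weight = torch.randn(256, 128, dtype=torch.bfloat16, device="cuda")

# bf16 activation path: stores group_scale / group_zero (per the commit summary)
w_bf16_act = Int4GroupwisePreshuffleTensor.from_float(
    weight, [1, 128], activation_dtype="bf16"
)

# fp8 activation path: stores group_scale / row_scale instead
w_fp8_act = Int4GroupwisePreshuffleTensor.from_float(
    weight, [1, 128], activation_dtype="fp8"
)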
torchao/quantization/quantize_/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-from .int4_groupwise_preshuffle_tensor import (
+from .int4 import (
     Int4GroupwisePreshuffleTensor,
 )

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+from .int4_groupwise_preshuffle_tensor import (
+    Int4GroupwisePreshuffleTensor,
+)
+
+__all__ = [
+    "Int4GroupwisePreshuffleTensor",
+]

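The last two hunks move the subclass under an int4 subpackage (the unnamed new file above is presumably that subpackage's __init__.py) while re-exporting the same name, so existing import paths should keep working. A quick check, assuming the re-exports shown above:

# Sketch: both paths should resolve to the same class after the re-exports above.
from torchao.quantization import Int4GroupwisePreshuffleTensor
from torchao.quantization.quantize_ import Int4GroupwisePreshuffleTensor as FromQuantizeSubpackage

assert Int4GroupwisePreshuffleTensor is FromQuantizeSubpackage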