Add support for Int4GroupwisePreshuffleTensor for fbgemm

jerryzh168 · jerryzh168 · commit 69a215e4b6a3 · 2025-06-27T12:50:43.000-07:00
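Summary: Adds Int4GroupwisePreshuffleTensor, selected via `FbgemmConfig(preshuffle=True)`. Note: slicing is not working yet (`test_slice` is skipped); the other tests pass. Test Plan: python test/quantization/quantize_/test_int4_groupwise_preshuffle.py Reviewers: Subscribers: Tasks: Tags: stack-info: PR: #2421, branch: jerryzh168/stack/1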
diff --git a/test/quantization/quantize_/test_int4_groupwise_preshuffle.py b/test/quantization/quantize_/test_int4_groupwise_preshuffle.py
@@ -0,0 +1,191 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from torch.testing._internal.common_utils import (
+    TestCase,
+    run_tests,
+)
+
+from torchao.quantization import (
+    FbgemmConfig,
+    quantize_,
+)
+from torchao.quantization.utils import compute_error
+from torchao.utils import (
+    TORCH_VERSION_AT_LEAST_2_8,
+    _is_fbgemm_genai_gpu_available,
+    is_sm_at_least_90,
+)
+
+
+@unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_8, "Need pytorch 2.8+")
+@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+@unittest.skipIf(not is_sm_at_least_90(), "Need sm90+")
+@unittest.skipIf(
+    not _is_fbgemm_genai_gpu_available(), "Requires fbgemm-gpu-genai >= 1.2.0"
+)
+class TestInt4GroupwisePreshuffleTensor(TestCase):
+    """Tests for int4 weights quantized via FbgemmConfig(preshuffle=True)."""
+
+    def setUp(self):
+        # 2D block size for linear weights, 3D block size for bmm weights.
+        self.config = FbgemmConfig(
+            input_dtype=torch.bfloat16,
+            weight_dtype=torch.int4,
+            output_dtype=torch.bfloat16,
+            block_size=[1, 128],
+            preshuffle=True,
+        )
+        self.bmm_config = FbgemmConfig(
+            input_dtype=torch.bfloat16,
+            weight_dtype=torch.int4,
+            output_dtype=torch.bfloat16,
+            block_size=[1, 1, 128],
+            preshuffle=True,
+        )
+        self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []
+
+    def test_linear(self):
+        """Quantized linear output stays close to the bf16 reference."""
+        dtype = torch.bfloat16
+        device = "cuda"
+        input = torch.randn(1, 128, dtype=dtype, device=device)
+        linear = torch.nn.Linear(128, 256, dtype=dtype, device=device)
+        original = linear(input)
+        quantize_(linear, self.config)
+        quantized = linear(input)
+        self.assertTrue(compute_error(original, quantized) > 20)
+
+    @unittest.skip("WIP: this doesn't work yet")
+    def test_slice(self):
+        """Narrowing a quantized weight should narrow its constituent tensors."""
+        dtype = torch.bfloat16
+        device = "cuda"
+        dummy = torch.nn.Linear(256, 256, bias=False, dtype=dtype, device=device)
+        dummy1 = torch.nn.Linear(256, 64, bias=False, dtype=dtype, device=device)
+        dummy1.weight = torch.nn.Parameter(
+            dummy.weight.narrow(0, 0, 64), requires_grad=False
+        )
+        dummy2 = torch.nn.Linear(128, 256, dtype=dtype, device=device)
+        dummy2.weight = torch.nn.Parameter(
+            dummy.weight.narrow(1, 0, 128), requires_grad=False
+        )
+
+        quantize_(dummy, self.config)
+        weight1 = dummy.weight.narrow(0, 0, 64)
+        weight2 = dummy.weight.narrow(1, 0, 128)
+        # check the slicing operation is correctly performed on the constituent Tensors
+        self.assertEqual(
+            weight1.packed_weight, dummy.weight.packed_weight.narrow(0, 0, 64)
+        )
+        self.assertEqual(weight1.group_scale, dummy.weight.group_scale.narrow(1, 0, 64))
+        self.assertEqual(
+            weight2.packed_weight, dummy.weight.packed_weight.narrow(1, 0, 64)
+        )
+        self.assertEqual(weight2.group_scale, dummy.weight.group_scale.narrow(0, 0, 1))
+
+        # check for 1. sliced bf16 weight 2. sliced quantized weight
+        # can produce similar results doing matmul on the same input Tensor
+
+        input = torch.randn(2, 256, dtype=dtype, device=device)
+        res_ref = dummy1(input)
+        dummy.weight = torch.nn.Parameter(weight1, requires_grad=False)
+        res = dummy(input)
+        sqnr = compute_error(res, res_ref)
+        assert sqnr > 20, f"Got: {sqnr}"
+
+        input = torch.randn(2, 128, dtype=dtype, device=device)
+        res_ref = dummy2(input)
+        dummy.weight = torch.nn.Parameter(weight2, requires_grad=False)
+        res = dummy(input)
+        sqnr = compute_error(res, res_ref)
+        assert sqnr > 15, f"Got: {sqnr}"
+
+    def test_slice_and_copy_(self):
+        """copy_ into a narrowed view must write through to the parameter."""
+        linear = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
+        linear.weight = torch.nn.Parameter(
+            torch.zeros(1024, 1024, dtype=torch.bfloat16, device="cuda")
+        )
+        quantize_(linear, self.config)
+        param = linear.weight
+        param_data = param.data
+        param_data = param_data.narrow(0, 0, 512)
+        # the narrowed tensor must share storage with the original parameter
+        assert (
+            param.data.packed_weight.data_ptr() == param_data.packed_weight.data_ptr()
+        )
+        assert param.data.group_scale.data_ptr() == param_data.group_scale.data_ptr()
+        assert param.data.group_zero.data_ptr() == param_data.group_zero.data_ptr()
+        orig_value = param.data.packed_weight[0][0].item()
+
+        # dummy_l has random weights (shouldn't be 0)
+        dummy_l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
+        quantize_(dummy_l, self.config)
+        quantized = dummy_l.weight
+        quantized = quantized.narrow(0, 0, 512)
+
+        param_data.copy_(quantized)
+
+        # making sure param.data is updated
+        assert param.data.packed_weight[0][0] != orig_value
+
+    def test_bmm(self):
+        """Quantized bmm output stays close to the bf16 reference."""
+
+        class M(torch.nn.Module):
+            def __init__(self, weight):
+                super().__init__()
+                self.weight = weight
+
+            def forward(self, x):
+                return torch.bmm(x, self.weight)
+
+        dtype = torch.bfloat16
+        device = "cuda"
+        input = torch.randn(10, 32, 128, dtype=dtype, device=device)
+        weight = torch.randn(10, 128, 256, dtype=dtype, device=device)
+        m = M(weight).eval()
+        original = m(input)
+        # quantize the weight in transposed, contiguous layout
+        m.weight = torch.nn.Parameter(m.weight.transpose(1, 2).contiguous())
+        quantize_(m, self.bmm_config, filter_fn=lambda x, fqn: True)
+        quantized = m(input)
+        self.assertTrue(compute_error(original, quantized) > 18)
+
+    def test_to_device(self):
+        """Quantized modules can be moved with each supported `.to` call form."""
+        for device in self.GPU_DEVICES:
+            # positional device string
+            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
+            quantize_(linear, self.config)
+            linear.to(device)
+
+            # keyword device
+            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
+            quantize_(linear, self.config)
+            linear.to(device=device)
+
+            # torch.device object
+            linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
+            quantize_(linear, self.config)
+            linear.to(torch.device(device))
+
+    def test_module_path(self):
+        """The tensor subclass reports the public torchao.quantization path."""
+        linear = torch.nn.Linear(128, 256, dtype=torch.bfloat16)
+        quantize_(linear, self.config)
+        self.assertEqual(
+            str(type(linear.weight)),
+            "<class 'torchao.quantization.Int4GroupwisePreshuffleTensor'>",
+        )
+
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/torchao/dtypes/__init__.py b/torchao/dtypes/__init__.py
@@ -69,4 +69,5 @@
     "to_fbgemm_fp8",
     "FbgemmFp8Tensor",
     "Int8DynamicActInt4WeightCPULayout",
+    "Int4GroupwisePreshuffleTensor",
 ]
diff --git a/torchao/quantization/__init__.py b/torchao/quantization/__init__.py
@@ -87,6 +87,9 @@
     dequantize_affine,
     quantize_affine,
 )
+from .quantize_ import (
+    Int4GroupwisePreshuffleTensor,
+)
 from .smoothquant import (
     SmoothFakeDynamicallyQuantizedLinear,
     SmoothFakeDynQuantMixin,
@@ -149,6 +152,8 @@
     "AOPerModuleConfig",
     "ModuleFqnToConfig",
     "FbgemmConfig",
+    # tensor subclasses
+    "Int4GroupwisePreshuffleTensor",
     # smooth quant - subject to change
     "get_scale",
     "SmoothFakeDynQuantMixin",
diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py
@@ -15,7 +15,6 @@
 and mixed GEMM kernels
 """
 
-import importlib.util
 import logging
 import types
 import warnings
@@ -68,6 +67,9 @@
     LinearActivationWeightObservedTensor,
 )
 from torchao.quantization.observer import AffineQuantizedObserverBase, get_block_size
+from torchao.quantization.quantize_ import (
+    Int4GroupwisePreshuffleTensor,
+)
 from torchao.quantization.transform_module import (
     _QUANTIZE_CONFIG_HANDLER,
     register_quantize_module_handler,
@@ -79,7 +81,7 @@
     TORCH_VERSION_AT_LEAST_2_4,
     TORCH_VERSION_AT_LEAST_2_5,
     TORCH_VERSION_AT_LEAST_2_6,
-    is_fbcode,
+    _is_fbgemm_genai_gpu_available,
     is_MI300,
     is_sm_at_least_89,
     is_sm_at_least_90,
@@ -2046,18 +2048,12 @@ class FbgemmConfig(AOBaseConfig):
     block_size: Optional[List[int]] = None
     activation_scale_ub: Optional[float] = None
     transpose_input: bool = False
+    preshuffle: bool = False
 
 
 @register_quantize_module_handler(FbgemmConfig)
 def _(module: torch.nn.Module, config: FbgemmConfig) -> torch.nn.Module:
-    # TODO: use is_package_at_least("fbgemm_gpu", "1.2.0") when
-    # https://github.com/pytorch/FBGEMM/issues/4198 is fixed
-    if importlib.util.find_spec("fbgemm_gpu") is None:
-        raise ImportError("Requires fbgemm-gpu-genai >= 1.2.0")
-
-    import fbgemm_gpu.experimental.gen_ai  # noqa: F401
-
-    if not is_fbcode() and fbgemm_gpu.__version__ < "1.2.0":
+    if not _is_fbgemm_genai_gpu_available():
         raise ImportError("Requires fbgemm-gpu-genai >= 1.2.0")
 
     _SUPPORTED_DTYPES = {
@@ -2070,11 +2066,16 @@ def _(module: torch.nn.Module, config: FbgemmConfig) -> torch.nn.Module:
         and (config.weight_dtype == torch.int4)
         and (config.output_dtype == torch.bfloat16)
     ):
-        weight = to_fbgemm_int4(
-            module.weight,
-            config.block_size,
-            config.transpose_input,
-        )
+        if config.preshuffle:
+            weight = Int4GroupwisePreshuffleTensor.from_float(
+                module.weight, config.block_size
+            )
+        else:
+            weight = to_fbgemm_int4(
+                module.weight,
+                config.block_size,
+                config.transpose_input,
+            )
         module.weight = torch.nn.Parameter(weight, requires_grad=False)
         module.extra_repr = types.MethodType(_linear_extra_repr, module)
         return module
diff --git a/torchao/quantization/quantize_/__init__.py b/torchao/quantization/quantize_/__init__.py
@@ -0,0 +1,9 @@
+from .int4_groupwise_preshuffle_tensor import (
+    Int4GroupwisePreshuffleTensor,
+)
+
+Int4GroupwisePreshuffleTensor.__module__ = "torchao.quantization"  # public import path
+
+__all__ = [
+    "Int4GroupwisePreshuffleTensor",
+]
diff --git a/torchao/quantization/quantize_/int4_groupwise_preshuffle_tensor.py b/torchao/quantization/quantize_/int4_groupwise_preshuffle_tensor.py
diff --git a/torchao/utils.py b/torchao/utils.py

Original file line number	Diff line number	Diff line change
`@@ -69,4 +69,5 @@`
`69`	`69`	`"to_fbgemm_fp8",`
`70`	`70`	`"FbgemmFp8Tensor",`
`71`	`71`	`"Int8DynamicActInt4WeightCPULayout",`
	`72`	`+ "Int4GroupwisePreshuffleTensor",`
`72`	`73`	`]`