
Commit 4a3c77e

Add all fbgemm kernel Tensors into Int4WeightOnlyConfig and Float8DynamicActivationInt4WeightConfig
Summary: att, we will deprecate FbgemmConfig since it's a single kernel. We'd like to categorize things by derived dtype + packed format.
Test Plan: python test/quantization/quantize_/test_int4_groupwise_preshuffle.py
Reviewers:
Subscribers:
Tasks:
Tags:
stack-info: PR: #2474, branch: jerryzh168/stack/10
1 parent 76ba25d commit 4a3c77e
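
Usage after this change, as a minimal sketch based on the updated tests in this commit (the toy linear layer and its shapes are hypothetical; per the new handler, Float8ActivationInt4WeightConfig currently only supports use_preshuffle=True with the fbgemm kernel):

    import torch
    from torchao.quantization import (
        Float8ActivationInt4WeightConfig,
        Int4WeightOnlyConfig,
        quantize_,
    )

    # hypothetical bf16 linear layer on CUDA (sm90+ with fbgemm-gpu-genai installed)
    m = torch.nn.Linear(128, 256, dtype=torch.bfloat16, device="cuda")

    # bf16 activation + int4 weight with preshuffled packing, replacing the old FbgemmConfig
    quantize_(m, Int4WeightOnlyConfig(group_size=128, use_preshuffle=True))

    # alternatively, fp8 activation + int4 weight with preshuffled packing:
    # quantize_(m, Float8ActivationInt4WeightConfig(group_size=128, use_preshuffle=True))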

File tree

10 files changed, +109 -88 lines changed

test/integration/test_serialization_bc.py

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@
 
 _MODEL_NAMES = [
     "torchao-testing/opt-125m-float8dq-row-fbgemm",
+    "torchao-testing/opt-125m-int4wo-preshuffle",
 ]
 
 

test/quantization/quantize_/workflows/int4/test_int4_preshuffled_tensor.py

Lines changed: 12 additions & 41 deletions
@@ -15,9 +15,9 @@
     run_tests,
 )
 
-from torchao.float8.config import e4m3_dtype
 from torchao.quantization import (
-    FbgemmConfig,
+    Float8ActivationInt4WeightConfig,
+    Int4WeightOnlyConfig,
     quantize_,
 )
 from torchao.quantization.utils import compute_error
@@ -27,44 +27,15 @@
     is_sm_at_least_90,
 )
 
-if TORCH_VERSION_AT_LEAST_2_8:
-    BF16_ACT_CONFIG = FbgemmConfig(
-        input_dtype=torch.bfloat16,
-        weight_dtype=torch.int4,
-        output_dtype=torch.bfloat16,
-        block_size=[1, 128],
-        preshuffle=True,
-    )
-
-    BF16_ACT_BMM_CONFIG = FbgemmConfig(
-        input_dtype=torch.bfloat16,
-        weight_dtype=torch.int4,
-        output_dtype=torch.bfloat16,
-        block_size=[1, 1, 128],
-        preshuffle=True,
-    )
-
-    FP8_ACT_CONFIG = FbgemmConfig(
-        input_dtype=e4m3_dtype,
-        weight_dtype=torch.int4,
-        output_dtype=torch.bfloat16,
-        block_size=[1, 128],
-        preshuffle=True,
-    )
-
-    FP8_ACT_BMM_CONFIG = FbgemmConfig(
-        input_dtype=e4m3_dtype,
-        weight_dtype=torch.int4,
-        output_dtype=torch.bfloat16,
-        block_size=[1, 1, 128],
-        preshuffle=True,
-    )
-
-else:
-    BF16_ACT_CONFIG = None
-    BF16_ACT_BMM_CONFIG = None
-    FP8_ACT_CONFIG = None
-    FP8_ACT_BMM_CONFIG = None
+BF16_ACT_CONFIG = Int4WeightOnlyConfig(
+    group_size=128,
+    use_preshuffle=True,
+)
+
+FP8_ACT_CONFIG = Float8ActivationInt4WeightConfig(
+    group_size=128,
+    use_preshuffle=True,
+)
 
 
 @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_8, "Need pytorch 2.8+")
@@ -90,7 +61,7 @@ def test_linear(self, config):
 
     # Note: this order will error out: `Got bad cuda status: an illegal memory access was encountered at line: 449`
     # @parametrize("bmm_config", [BF16_ACT_BMM_CONFIG, FP8_ACT_BMM_CONFIG])
-    @parametrize("bmm_config", [FP8_ACT_BMM_CONFIG, BF16_ACT_BMM_CONFIG])
+    @parametrize("bmm_config", [FP8_ACT_CONFIG, BF16_ACT_CONFIG])
     def test_bmm(self, bmm_config):
         class M(torch.nn.Module):
             def __init__(self, weight):

test/dtypes/test_fbgemm_int4.py renamed to test/quantization/quantize_/workflows/int4/test_int4_tensor.py

Lines changed: 7 additions & 14 deletions
@@ -13,7 +13,7 @@
 )
 
 from torchao.quantization import (
-    FbgemmConfig,
+    Int4WeightOnlyConfig,
     quantize_,
 )
 from torchao.quantization.utils import compute_error
@@ -26,19 +26,12 @@
 @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_8, "Need pytorch 2.8+")
 @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
 @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
-class TestFbgemmInt4Tensor(TestCase):
+class TestInt4Tensor(TestCase):
     def setUp(self):
-        self.config = FbgemmConfig(
-            input_dtype=torch.bfloat16,
-            weight_dtype=torch.int4,
-            output_dtype=torch.bfloat16,
-            block_size=[1, 128],
-        )
-        self.bmm_config = FbgemmConfig(
-            input_dtype=torch.bfloat16,
-            weight_dtype=torch.int4,
-            output_dtype=torch.bfloat16,
-            block_size=[1, 1, 128],
+        self.config = Int4WeightOnlyConfig(
+            group_size=128,
+            use_preshuffle=False,
+            gemm_kernel_choice="fbgemm",
         )
         self.GPU_DEVICES = ["cuda"] if torch.cuda.is_available() else []
 
@@ -135,7 +128,7 @@ def forward(self, x):
         original = m(input)
         # we need to transpose the weight first for bmm
         m.weight = torch.nn.Parameter(m.weight.transpose(1, 2).contiguous())
-        quantize_(m, self.bmm_config, filter_fn=lambda x, fqn: True)
+        quantize_(m, self.config, filter_fn=lambda x, fqn: True)
         quantized = m(input)
         self.assertTrue(compute_error(original, quantized) > 18)
 

torchao/dtypes/__init__.py

Lines changed: 0 additions & 3 deletions
@@ -9,7 +9,6 @@
     to_affine_quantized_intx_static,
 )
 from .fbgemm_fp8_tensor import FbgemmFp8Tensor, to_fbgemm_fp8
-from .fbgemm_int4_tensor import FbgemmInt4Tensor, to_fbgemm_int4
 from .floatx import (
     CutlassSemiSparseLayout,
     Float8Layout,
@@ -64,8 +63,6 @@
     "PackedLinearInt8DynamicActivationIntxWeightLayout",
     "to_affine_quantized_packed_linear_int8_dynamic_activation_intx_weight",
     "Int4XPULayout",
-    "to_fbgemm_int4",
-    "FbgemmInt4Tensor",
     "to_fbgemm_fp8",
     "FbgemmFp8Tensor",
     "Int8DynamicActInt4WeightCPULayout",

torchao/quantization/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -44,6 +44,7 @@
 from .quant_api import (
     CutlassInt4PackedLayout,
     FbgemmConfig,
+    Float8ActivationInt4WeightConfig,
     Float8DynamicActivationFloat8SemiSparseWeightConfig,
     Float8DynamicActivationFloat8WeightConfig,
     Float8MMConfig,
@@ -90,6 +91,7 @@
 from .quantize_.workflows import (
     Float8Tensor,
     Int4PreshuffledTensor,
+    Int4Tensor,
 )
 from .smoothquant import (
     SmoothFakeDynamicallyQuantizedLinear,
@@ -141,6 +143,7 @@
     "Int8DynamicActivationInt8WeightConfig",
     "Int8DynamicActivationIntxWeightConfig",
     "Int4WeightOnlyConfig",
+    "Float8ActivationInt4WeightConfig",
     "Int8WeightOnlyConfig",
     "Float8WeightOnlyConfig",
     "Float8DynamicActivationFloat8WeightConfig",
@@ -154,6 +157,7 @@
     "ModuleFqnToConfig",
     "FbgemmConfig",
     # tensor subclasses
+    "Int4Tensor",
     "Int4PreshuffledTensor",
     "Float8Tensor",
     # smooth quant - subject to change

torchao/quantization/quant_api.py

Lines changed: 53 additions & 3 deletions
@@ -49,7 +49,6 @@
     to_affine_quantized_floatx_static,
     to_affine_quantized_intx,
     to_fbgemm_fp8,
-    to_fbgemm_int4,
     to_marlinqqq_quantized_intx,
 )
 from torchao.dtypes.uintx.packed_linear_int8_dynamic_activation_intx_weight_layout import (
@@ -72,6 +71,7 @@
 from torchao.quantization.quantize_.workflows import (
     Float8Tensor,
     Int4PreshuffledTensor,
+    Int4Tensor,
 )
 from torchao.quantization.transform_module import (
     _QUANTIZE_CONFIG_HANDLER,
@@ -1116,6 +1116,7 @@ class Int4WeightOnlyConfig(AOBaseConfig):
     zero_point_domain: Optional[ZeroPointDomain] = ZeroPointDomain.NONE
     set_inductor_config: bool = True
     preserve_zero: Optional[bool] = None
+    use_preshuffle: bool = False
 
 
 # for BC
@@ -1133,15 +1134,31 @@ def _int4_weight_only_quantize_tensor(weight, config):
     layout = config.layout
     use_hqq = config.use_hqq
     zero_point_domain = config.zero_point_domain
+    use_preshuffle = config.use_preshuffle
 
     if weight.shape[-1] % group_size != 0:
         logger.info(
             f"Skipping quantizing weight with int4 weight only quantization because the shape of weight {weight.shape} is not compatible with group_size {group_size}"
         )
         return weight
 
+    block_size = tuple([1 for _ in range(weight.ndim - 1)] + [group_size])
+
+    if use_preshuffle:
+        new_weight = Int4PreshuffledTensor.from_float(
+            weight,
+            block_size,
+            activation_dtype="bf16",
+        )
+        return new_weight
+    else:
+        new_weight = Int4Tensor.from_float(
+            weight,
+            block_size,
+        )
+        return new_weight
+
     mapping_type = MappingType.ASYMMETRIC
-    block_size = tuple([1 for _ in range(weight.dim() - 1)] + [group_size])
     target_dtype = torch.int32
     quant_min = 0
     quant_max = 15
@@ -1213,6 +1230,39 @@ def _int4_weight_only_transform(
     return module
 
 
+@dataclass
+class Float8ActivationInt4WeightConfig(AOBaseConfig):
+    group_size: int = 128
+    use_preshuffle: bool = False
+    kernel: str = "fbgemm"
+
+
+@register_quantize_module_handler(Float8ActivationInt4WeightConfig)
+def _(module: torch.nn.Module, config: Int4WeightOnlyConfig) -> torch.nn.Module:
+    assert hasattr(module, "weight"), (
+        "applying int8 weight only quant requires module to have weight attribute"
+        + " but {module} does not have one"
+    )
+    group_size = config.group_size
+    use_preshuffle = config.use_preshuffle
+    kernel = config.kernel
+
+    assert use_preshuffle, (
+        f"only use_preshuffle == True is supported right now, got: {use_preshuffle}"
+    )
+    assert kernel == "fbgemm", f"only fbgemm kernel is supported, got: {kernel}"
+    weight = module.weight
+    block_size = tuple([1 for _ in range(weight.ndim - 1)] + [group_size])
+    new_weight = Int4PreshuffledTensor.from_float(
+        module.weight,
+        block_size,
+        activation_dtype="fp8",
+    )
+    module.weight = torch.nn.Parameter(new_weight, requires_grad=False)
+    module.extra_repr = types.MethodType(_linear_extra_repr, module)
+    return module
+
+
 @dataclass
 class Int8WeightOnlyConfig(AOBaseConfig):
     """
@@ -2067,7 +2117,7 @@ def _(module: torch.nn.Module, config: FbgemmConfig) -> torch.nn.Module:
             activation_dtype=torch.bfloat16,
        )
     else:
-        weight = to_fbgemm_int4(
+        weight = Int4Tensor.from_float(
            module.weight,
            config.block_size,
        )
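
For reference, the block_size that the new Int4WeightOnlyConfig path derives from group_size is groupwise along the last weight dimension only; a small sketch of what the expression yields (weight shapes are made up for illustration):

    import torch

    group_size = 128
    weight_2d = torch.randn(256, 512, dtype=torch.bfloat16)     # e.g. a linear weight
    weight_3d = torch.randn(8, 256, 512, dtype=torch.bfloat16)  # e.g. stacked weights for bmm

    # same expression as in _int4_weight_only_quantize_tensor above
    block_size_2d = tuple([1 for _ in range(weight_2d.ndim - 1)] + [group_size])
    block_size_3d = tuple([1 for _ in range(weight_3d.ndim - 1)] + [group_size])

    print(block_size_2d)  # (1, 128)    -- matches the old FbgemmConfig block_size=[1, 128]
    print(block_size_3d)  # (1, 1, 128) -- matches the old block_size=[1, 1, 128]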

torchao/quantization/quantize_/workflows/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -3,12 +3,15 @@
 )
 from .int4 import (
     Int4PreshuffledTensor,
+    Int4Tensor,
 )
 
+Int4Tensor.__module__ = "torchao.quantization"
 Int4PreshuffledTensor.__module__ = "torchao.quantization"
 Float8Tensor.__module__ = "torchao.quantization"
 
 __all__ = [
+    "Int4Tensor",
     "Int4PreshuffledTensor",
     "Float8Tensor",
 ]
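
With Int4Tensor exported here and its __module__ rewritten to the public package, the subclass can be referenced from the top level; a quick sanity check, assuming a build that includes this commit:

    from torchao.quantization import Int4PreshuffledTensor, Int4Tensor

    # both tensor subclasses now present themselves under torchao.quantization
    assert Int4Tensor.__module__ == "torchao.quantization"
    assert Int4PreshuffledTensor.__module__ == "torchao.quantization"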

torchao/quantization/quantize_/workflows/int4/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -1,5 +1,7 @@
 from .int4_preshuffled_tensor import Int4PreshuffledTensor
+from .int4_tensor import Int4Tensor
 
 __all__ = [
     "Int4PreshuffledTensor",
+    "Int4Tensor",
 ]

torchao/quantization/quantize_/workflows/int4/int4_preshuffled_tensor.py

Lines changed: 1 addition & 1 deletion
@@ -194,7 +194,7 @@ def from_float(
         if quantize_int4_preshuffle is None:
             raise ImportError("Requires fbgemm-gpu-genai >= 1.2.0")
 
-        assert all(x == 1 for x in block_size[:-1]), (
+        assert all(x == 1 for x in block_size[:-1]) and block_size[-1] != 1, (
             "Only groupwise quant is supported right now"
         )
         group_size = block_size[-1]
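
The tightened assertion now also rejects a trailing group of 1, not just non-1 leading blocks; a standalone sketch of which block_size values pass the updated check (values are illustrative):

    def is_groupwise(block_size):
        # mirrors the updated check in Int4PreshuffledTensor.from_float
        return all(x == 1 for x in block_size[:-1]) and block_size[-1] != 1

    print(is_groupwise((1, 128)))     # True  -- groupwise quant over the last dim
    print(is_groupwise((1, 1, 128)))  # True  -- bmm-style 3D weight
    print(is_groupwise((1, 1)))       # False -- newly rejected: last-dim group of 1
    print(is_groupwise((2, 128)))     # False -- non-1 leading block, rejected as before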
