Commit 2cf8fda
Enable the CPU int4 with HQQ quant (#1824)

* Enable the CPU int4 with HQQ quant
* ruff check
* format code

1 parent b6db962 · commit 2cf8fda
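
For context, a minimal sketch of the user-facing call this commit enables, mirroring the updated tests below. It assumes PyTorch >= 2.6 and the torchao APIs that appear in this diff; the model and shapes are illustrative only.

import torch
from torchao.dtypes import Int4CPULayout
from torchao.quantization import int4_weight_only, quantize_

# Toy model; int4 weight-only quantization applies to the Linear weights.
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024)).eval().to(torch.bfloat16)

# use_hqq=True enables calibration-free HQQ (Half-Quadratic Quantization) for
# the int4 weight-only config; Int4CPULayout selects the CPU int4 packing.
quantize_(model, int4_weight_only(group_size=32, layout=Int4CPULayout(), use_hqq=True))

with torch.no_grad():
    out = model(torch.randn(1, 1024, dtype=torch.bfloat16))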

File tree (3 files changed: +28 −9 lines)

test/integration/test_integration.py
test/quantization/test_quant_api.py
torchao/dtypes/affine_quantized_tensor.py

test/integration/test_integration.py

Lines changed: 18 additions & 6 deletions

@@ -145,13 +145,15 @@ def _int8da_int8w_api(
         change_linear_weights_to_int8_dqtensors(mod)
 
 
-def _int4wo_api(mod):
+def _int4wo_api(mod, use_hqq=False):
     if (
         is_device(next(mod.parameters()).device.type, "cpu")
         and TORCH_VERSION_AT_LEAST_2_6
     ):
         quantize_(
-            mod, int4_weight_only(layout=Int4CPULayout()), set_inductor_config=False
+            mod,
+            int4_weight_only(layout=Int4CPULayout(), use_hqq=use_hqq),
+            set_inductor_config=False,
         )
         unwrap_tensor_subclass(mod)
     elif TORCH_VERSION_AT_LEAST_2_4:

@@ -1049,8 +1051,6 @@ def test_int8_weight_only_quant_with_freeze(self, device, dtype):
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_3, "int4 requires torch nightly.")
     # @unittest.skipIf(TORCH_VERSION_AT_LEAST_2_5, "int4 skipping 2.5+ for now")
     def test_int4_weight_only_quant_subclass_api(self, device, dtype):
-        if device == "cpu":
-            self.skipTest(f"Temporarily skipping for {device}")
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
         for test_shape in [(16, 1024, 16)] + (

@@ -1060,6 +1060,20 @@ def test_int4_weight_only_quant_subclass_api(self, device, dtype):
                 _int4wo_api, device, 15, test_shape=test_shape, test_dtype=dtype
             )
 
+    @parameterized.expand(COMMON_DEVICE_DTYPE)
+    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_6, "int4 hqq requires torch nightly.")
+    def test_int4_weight_only_hqq_quant_subclass_api(self, device, dtype):
+        if dtype != torch.bfloat16:
+            self.skipTest(f"Fails for {dtype}")
+        for test_shape in [(16, 1024, 16), (1, 1024, 256)]:
+            api = partial(
+                _int4wo_api,
+                use_hqq=True,
+            )
+            self._test_lin_weight_subclass_api_impl(
+                api, device, 15, test_shape=test_shape, test_dtype=dtype
+            )
+
     @parameterized.expand(COMMON_DEVICE_DTYPE)
     @unittest.skipIf(
         not TORCH_VERSION_AT_LEAST_2_5, "gemlite tests needs torch 2.5 or greater"

@@ -1111,8 +1125,6 @@ def test_gemlite_layout(self, device, dtype):
     # @unittest.skipIf(TORCH_VERSION_AT_LEAST_2_5, "int4 skipping 2.5+ for now")
     @skip_if_rocm("ROCm enablement in progress")
     def test_int4_weight_only_quant_subclass_api_grouped(self, device, dtype):
-        if device == "cpu":
-            self.skipTest(f"Temporarily skipping for {device}")
         if dtype != torch.bfloat16:
             self.skipTest(f"Fails for {dtype}")
         layout_list = []
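
One detail worth noting in the new test: functools.partial binds the flag so the shared harness can keep calling the API with a single module argument. A standalone illustration (the _int4wo_api body is elided; the signature is as in the diff above):

from functools import partial

def _int4wo_api(mod, use_hqq=False):
    ...  # body elided; see the diff above

api = partial(_int4wo_api, use_hqq=True)
# api(mod) is now equivalent to _int4wo_api(mod, use_hqq=True), i.e. the
# one-argument callable that _test_lin_weight_subclass_api_impl expects.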

test/quantization/test_quant_api.py

Lines changed: 8 additions & 2 deletions

@@ -782,7 +782,8 @@ def reset_memory():
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_6, "Test only enabled for 2.6+")
     @common_utils.parametrize("dtype", [torch.float, torch.bfloat16, torch.half])
     @common_utils.parametrize("x_dim", [2, 3])
-    def test_int4wo_cpu(self, dtype, x_dim):
+    @common_utils.parametrize("use_hqq", [True, False])
+    def test_int4wo_cpu(self, dtype, x_dim, use_hqq):
         from torchao.dtypes import Int4CPULayout
 
         device = "cpu"

@@ -792,7 +793,12 @@ def test_int4wo_cpu(self, dtype, x_dim):
             example_inputs = (example_inputs[0].unsqueeze(0),)
 
         with torch.no_grad():
-            quantize_(m, int4_weight_only(group_size=32, layout=Int4CPULayout()))
+            quantize_(
+                m,
+                int4_weight_only(
+                    group_size=32, layout=Int4CPULayout(), use_hqq=use_hqq
+                ),
+            )
             # ensure the expected op is in the code
             _, code = torch._inductor.utils.run_and_get_code(
                 torch.compile(m, fullgraph=True, dynamic=True),
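
The unchanged tail of this test captures the Inductor output to check that the CPU int4 kernel is actually hit. A sketch of that pattern, reusing m and example_inputs from the test above; the op-name string is an assumption for illustration, not quoted from this diff:

import torch
from torch._inductor.utils import run_and_get_code

compiled = torch.compile(m, fullgraph=True, dynamic=True)
# run_and_get_code returns the call's result plus the generated code strings.
_, code = run_and_get_code(compiled, *example_inputs)
assert "_weight_int4pack_mm_for_cpu" in code[0]  # hypothetical op-name check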

torchao/dtypes/affine_quantized_tensor.py

Lines changed: 2 additions & 1 deletion

@@ -224,6 +224,7 @@ def from_hp_to_intx(
             else input_float.dtype
         )
         device = input_float.device
+        from torchao.dtypes import Int4CPULayout
         from torchao.dtypes.uintx import TensorCoreTiledLayout
 
         data, scale, zero_point, _ = choose_qparams_and_quantize_affine_hqq(

@@ -235,7 +236,7 @@ def from_hp_to_intx(
             device=device,
             verbose=False,
             raw_output=not isinstance(
-                _layout, (TensorCoreTiledLayout, PlainLayout)
+                _layout, (TensorCoreTiledLayout, PlainLayout, Int4CPULayout)
             ),
             # raw_output=False is basically the 'convert to TensorCoreTiledLayout zero_point version' option (add scale*midpoint)
             # note in choose_qparams_affine, preserve_zero = False does this same thing while also controlling whether
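
To restate the dispatch this last hunk changes: the raw_output flag passed to HQQ is computed from the layout, and before this commit Int4CPULayout was not in the tuple, so the CPU int4 path received HQQ's raw qparams rather than the converted zero_point (per the inline comment above, raw_output=False adds scale * midpoint to it). A sketch of the predicate in isolation, using the same imports as the diff:

from torchao.dtypes import Int4CPULayout, PlainLayout
from torchao.dtypes.uintx import TensorCoreTiledLayout

def wants_raw_hqq_output(_layout) -> bool:
    # Layouts in this tuple get the converted (non-raw) zero_point from HQQ;
    # this commit adds Int4CPULayout so the CPU int4 path gets it too.
    return not isinstance(_layout, (TensorCoreTiledLayout, PlainLayout, Int4CPULayout))

assert wants_raw_hqq_output(Int4CPULayout()) is False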
