
Commit f24f37b

Xia-Weiwen authored and pytorchmergebot committed
[CPU] Add concat-linear fusion pass for da8w4 (#2476)
**Summary** This PR adds a concat-linear fusion pass for da8w4 on CPU. The pass fuses the following pattern ``` da8w4_linear_cpu(x, ..., w1, ...) -- y1 / x --da8w4_linear_cpu(x, ..., w2, ...) -- y2 \... da8w4_linear_cpu(x, ..., wN, ...) -- yN ``` to ``` x -- da8w4_linear_cpu(x, ..., w_concat, ...) -- y_concat -- split -- (y1, y2, yN) ``` The fusion pass is registered as a custom post_grad pass in Inductor. The pass takes effect only when `torch._inductor.config.cpp.enable_concat_linear` is true. Benchmarks show that total CPU time of linear is reduced by >5% with concat linear when running Llama3.1-8B with 32 cores on a 6th gen of Intel(R) Xeon(R). **Test plan** ``` pytest test/quantization/test_da8w4_cpu.py -k test_8da4w_concat_linear_cpu ``` Pull Request resolved: #2476 Approved by: https://github.com/leslie-fang-intel, https://github.com/CaoE, https://github.com/jerryzh168
1 parent a45b1f7 commit f24f37b
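For context, here is a minimal sketch of how the fusion can be exercised, modeled on the new `test_8da4w_concat_linear_cpu` test below. The module, shapes, and dtype are illustrative placeholders; the config knobs `freezing` and `cpp.enable_concat_linear` are taken from the test, and the torchao CPU kernels plus PyTorch 2.8+ are assumed to be available.

```python
# Illustrative sketch based on test_8da4w_concat_linear_cpu; not part of this PR.
import torch

from torchao import quantize_
from torchao.dtypes import Int8DynamicActInt4WeightCPULayout
from torchao.quantization.quant_api import Int8DynamicActivationInt4WeightConfig
from torchao.quantization.quant_primitives import MappingType


class ThreeLinears(torch.nn.Module):
    # Three linears consuming the same input: the pattern the pass fuses.
    def __init__(self, k=128, n=64):
        super().__init__()
        self.linear1 = torch.nn.Linear(k, n, bias=False)
        self.linear2 = torch.nn.Linear(k, n, bias=False)
        self.linear3 = torch.nn.Linear(k, n, bias=False)

    def forward(self, x):
        return self.linear1(x) + self.linear2(x) + self.linear3(x)


m = ThreeLinears().eval().to(torch.bfloat16)
x = torch.rand(2, 128, dtype=torch.bfloat16)

with torch.no_grad():
    # Quantize to da8w4 with the CPU layout so da8w4_linear_cpu is used.
    quantize_(
        m,
        Int8DynamicActivationInt4WeightConfig(
            group_size=32,
            layout=Int8DynamicActInt4WeightCPULayout(),
            act_mapping_type=MappingType.SYMMETRIC,
        ),
    )
    # Freezing is needed for the pattern to appear in the post-grad graph;
    # cpp.enable_concat_linear turns the concat-linear fusion on.
    with torch._inductor.config.patch(
        {"freezing": True, "cpp.enable_concat_linear": True}
    ):
        y = torch.compile(m, fullgraph=True, dynamic=True)(x)
```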

File tree

6 files changed: +413 -70 lines changed

test/quantization/test_da8w4_cpu.py

Lines changed: 182 additions & 0 deletions
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

import copy
import unittest

import torch
from torch.testing._internal import common_utils
from torch.testing._internal.common_utils import (
    TestCase,
    run_tests,
)

from torchao import quantize_
from torchao.dtypes import (
    Int8DynamicActInt4WeightCPULayout,
    PlainLayout,
)
from torchao.quantization.quant_api import (
    Int8DynamicActivationInt4WeightConfig,
)
from torchao.quantization.quant_primitives import MappingType
from torchao.utils import (
    TORCH_VERSION_AT_LEAST_2_7,
    TORCH_VERSION_AT_LEAST_2_8,
)


class ToyLinearModel(torch.nn.Module):
    def __init__(self, m=64, n=32, k=64, bias=False):
        super().__init__()
        self.linear1 = torch.nn.Linear(m, n, bias=bias).to(torch.float)
        self.linear2 = torch.nn.Linear(n, k, bias=bias).to(torch.float)

    def example_inputs(self, batch_size=1, dtype=torch.float, device="cpu"):
        return (
            torch.randn(
                batch_size, self.linear1.in_features, dtype=dtype, device=device
            ),
        )

    def forward(self, x):
        x = self.linear1(x)
        x = self.linear2(x)
        return x


class TestDa8w4Cpu(TestCase):
    @unittest.skipIf(
        "CPU" not in torch._C._dispatch_dump("torchao::da8w4_linear_cpu"),
        reason="cpp kernels not built",
    )
    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_7, "Test only enabled for 2.7+")
    @common_utils.parametrize("dtype", [torch.float, torch.bfloat16, torch.half])
    @common_utils.parametrize("x_dim", [2, 3])
    @common_utils.parametrize("bias", [True, False])
    @common_utils.parametrize("bs", [1, 160])
    @common_utils.parametrize("sym_quant_a", [True, False])
    def test_8da4w_cpu(self, dtype, x_dim, bias, bs, sym_quant_a):
        if sym_quant_a and not TORCH_VERSION_AT_LEAST_2_8:
            # not supported until PT 2.8
            return
        device = "cpu"
        m = ToyLinearModel(bias=bias).eval().to(dtype).to(device)
        m2 = copy.deepcopy(m)
        example_inputs = m.example_inputs(batch_size=bs, dtype=dtype, device=device)
        if x_dim == 3:
            example_inputs = (example_inputs[0].unsqueeze(0),)

        with torch.no_grad():
            # Currently, the difference between Int8DynamicActInt4WeightCPULayout and PlainLayout
            # is that the former packs two int4 weights into one int8, while the latter does not.
            quantize_(
                m,
                Int8DynamicActivationInt4WeightConfig(
                    group_size=32,
                    layout=Int8DynamicActInt4WeightCPULayout(),
                    act_mapping_type=MappingType.SYMMETRIC
                    if sym_quant_a
                    else MappingType.ASYMMETRIC,
                ),
            )
            y, code = torch._inductor.utils.run_and_get_code(
                torch.compile(m, fullgraph=True, dynamic=True),
                *example_inputs,
            )
            # ensure the expected op is in the code
            assert "torch.ops.torchao.da8w4_linear_cpu.default" in code[0]
            quantize_(
                m2,
                Int8DynamicActivationInt4WeightConfig(
                    group_size=32,
                    layout=PlainLayout(),
                    act_mapping_type=MappingType.SYMMETRIC
                    if sym_quant_a
                    else MappingType.ASYMMETRIC,
                ),
            )
            torch._dynamo.reset()  # may segfault without this
            y2 = torch.compile(m2, fullgraph=True, dynamic=True)(*example_inputs)
            atol, rtol = 4e-7, 1e-5
            if dtype == torch.bfloat16:
                atol, rtol = 1e-2, 3e-3
            elif dtype == torch.half:
                atol, rtol = 6e-3, 2e-3
            assert torch.allclose(y, y2, atol=atol, rtol=rtol)
            # Test get_plain by dequantize()
            dqw1 = m.linear1.weight.original_weight_tensor.dequantize()
            dqw2 = m.linear2.weight.original_weight_tensor.dequantize()
            dqw1_ref = m2.linear1.weight.original_weight_tensor.dequantize()
            dqw2_ref = m2.linear2.weight.original_weight_tensor.dequantize()
            assert torch.allclose(dqw1, dqw1_ref)
            assert torch.allclose(dqw2, dqw2_ref)

    @unittest.skipIf(
        "CPU" not in torch._C._dispatch_dump("torchao::da8w4_linear_cpu"),
        reason="cpp kernels not built",
    )
    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_8, "Test only enabled for 2.8+")
    @common_utils.parametrize("x_dim", [2, 3])
    @common_utils.parametrize("bias", [True, False])
    def test_8da4w_concat_linear_cpu(self, x_dim, bias):
        N, K = 64, 128

        class Mod(torch.nn.Module):
            def __init__(self, bias):
                super().__init__()
                self.linear1 = torch.nn.Linear(K, N, bias=bias)
                self.linear2 = torch.nn.Linear(K, N, bias=bias)
                self.linear3 = torch.nn.Linear(K, N, bias=bias)

            def forward(self, x):
                a = self.linear1(x)
                b = self.linear2(x)
                c = self.linear3(x)
                return a + b + c

        dtype = torch.bfloat16
        device = "cpu"
        m = Mod(bias).eval().to(dtype).to(device)
        x_shape = [2] * x_dim
        x_shape[-1] = K
        x = torch.rand(x_shape, dtype=dtype, device=device)
        with torch.no_grad():
            quantize_(
                m,
                Int8DynamicActivationInt4WeightConfig(
                    group_size=32,
                    layout=Int8DynamicActInt4WeightCPULayout(),
                    act_mapping_type=MappingType.SYMMETRIC,
                ),
            )
            # Need to turn on freezing to get the pattern
            # set enable_concat_linear to true to enable the fusion
            with torch._inductor.config.patch(
                {"freezing": True, "cpp.enable_concat_linear": True}
            ):
                y, code = torch._inductor.utils.run_and_get_code(
                    torch.compile(m, fullgraph=True, dynamic=True),
                    x,
                )
            # ensure the expected op occurs only once in the code after fusion
            # The trailing "(" is to avoid matching the op in the comment
            assert code[0].count("torch.ops.torchao.da8w4_linear_cpu.default(") == 1
            with torch._inductor.config.patch(
                {"freezing": True, "cpp.enable_concat_linear": False}
            ):
                y_ref, code = torch._inductor.utils.run_and_get_code(
                    torch.compile(m, fullgraph=True, dynamic=True),
                    x,
                )
            assert torch.allclose(y, y_ref)


common_utils.instantiate_parametrized_tests(TestDa8w4Cpu)


if __name__ == "__main__":
    run_tests()
```

test/quantization/test_quant_api.py

Lines changed: 0 additions & 68 deletions
```diff
@@ -29,7 +29,6 @@
     AffineQuantizedTensor,
     Int4CPULayout,
     Int4XPULayout,
-    Int8DynamicActInt4WeightCPULayout,
     PlainLayout,
     QDQLayout,
     TensorCoreTiledLayout,
@@ -71,7 +70,6 @@
     TORCH_VERSION_AT_LEAST_2_4,
     TORCH_VERSION_AT_LEAST_2_5,
     TORCH_VERSION_AT_LEAST_2_6,
-    TORCH_VERSION_AT_LEAST_2_7,
     TORCH_VERSION_AT_LEAST_2_8,
     is_sm_at_least_89,
     is_sm_at_least_90,
@@ -699,72 +697,6 @@ def test_int4wo_cpu(self, dtype, x_dim, use_hqq):
         assert "_weight_int4pack_mm_for_cpu" in code[0]
         assert "aten.mm.default" not in code[0]
 
-    @unittest.skipIf(
-        "CPU" not in torch._C._dispatch_dump("torchao::da8w4_linear_cpu"),
-        reason="cpp kernels not built",
-    )
-    @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_7, "Test only enabled for 2.7+")
-    @common_utils.parametrize("dtype", [torch.float, torch.bfloat16, torch.half])
-    @common_utils.parametrize("x_dim", [2, 3])
-    @common_utils.parametrize("bias", [True, False])
-    @common_utils.parametrize("bs", [1, 160])
-    @common_utils.parametrize("sym_quant_a", [True, False])
-    def test_8da4w_cpu(self, dtype, x_dim, bias, bs, sym_quant_a):
-        if sym_quant_a and not TORCH_VERSION_AT_LEAST_2_8:
-            # not supported until PT 2.8
-            return
-        device = "cpu"
-        m = ToyLinearModel(bias=bias).eval().to(dtype).to(device)
-        m2 = copy.deepcopy(m)
-        example_inputs = m.example_inputs(batch_size=bs, dtype=dtype, device=device)
-        if x_dim == 3:
-            example_inputs = (example_inputs[0].unsqueeze(0),)
-
-        with torch.no_grad():
-            # Currently, the difference between Int8DynamicActInt4WeightCPULayout and PlainLayout
-            # is that the former packs two int4 weights into one int8, while the latter does not.
-            quantize_(
-                m,
-                Int8DynamicActivationInt4WeightConfig(
-                    group_size=32,
-                    layout=Int8DynamicActInt4WeightCPULayout(),
-                    act_mapping_type=MappingType.SYMMETRIC
-                    if sym_quant_a
-                    else MappingType.ASYMMETRIC,
-                ),
-            )
-            y, code = torch._inductor.utils.run_and_get_code(
-                torch.compile(m, fullgraph=True, dynamic=True),
-                *example_inputs,
-            )
-            # ensure the expected op is in the code
-            assert "torch.ops.torchao.da8w4_linear_cpu.default" in code[0]
-            quantize_(
-                m2,
-                int8_dynamic_activation_int4_weight(
-                    group_size=32,
-                    layout=PlainLayout(),
-                    act_mapping_type=MappingType.SYMMETRIC
-                    if sym_quant_a
-                    else MappingType.ASYMMETRIC,
-                ),
-            )
-            torch._dynamo.reset()  # may segfault without this
-            y2 = torch.compile(m2, fullgraph=True, dynamic=True)(*example_inputs)
-            atol, rtol = 4e-7, 1e-5
-            if dtype == torch.bfloat16:
-                atol, rtol = 1e-2, 3e-3
-            elif dtype == torch.half:
-                atol, rtol = 6e-3, 2e-3
-            assert torch.allclose(y, y2, atol=atol, rtol=rtol)
-            # Test get_plain by dequantize()
-            dqw1 = m.linear1.weight.original_weight_tensor.dequantize()
-            dqw2 = m.linear2.weight.original_weight_tensor.dequantize()
-            dqw1_ref = m2.linear1.weight.original_weight_tensor.dequantize()
-            dqw2_ref = m2.linear2.weight.original_weight_tensor.dequantize()
-            assert torch.allclose(dqw1, dqw1_ref)
-            assert torch.allclose(dqw2, dqw2_ref)
-
     # TODO(#1690): move to new config names
     @unittest.skipIf(not TORCH_VERSION_AT_LEAST_2_4, "Test only enabled for 2.4+")
     @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
```
torchao/csrc/cpu/da8w4_linear.cpp

Lines changed: 3 additions & 2 deletions
```diff
@@ -65,6 +65,7 @@ da8w4_linear_prepack_impl(
   at::Tensor blocked_scales = new_scales.view({Nc, block_n, G}).permute({0, 2, 1}).contiguous();
   at::Tensor blocked_qzeros = new_qzeros.view({Nc, block_n, G}).permute({0, 2, 1}).contiguous();
   // Compensation = Σ(k)(W[k][n] - ZP[n]) for each block.
+  // Reorder compensation to [N/block_n, K/block_k, block_n]
   auto weight_sub_qzero = weight.view({Nc, block_n, G, -1}).to(at::kInt) - new_qzeros.view({Nc, block_n, G, -1});
   weight_sub_qzero = weight_sub_qzero.view({Nc, block_n, Kc, block_k});
   at::Tensor compensation = weight_sub_qzero.sum(-1);
@@ -622,9 +623,9 @@ void _da8w4_linear_impl(
     } else if (M < 64) {
       return 32;
     } else if (M < 96) {
-      return 48;
-    } else {
       return 64;
+    } else {
+      return 128;
     }
   }();
   int64_t Mc = (M + block_m - 1) / block_m;
```
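For readers less familiar with the C++, the compensation term in the first hunk is the per-block sum over K of (weight minus zero point). A rough PyTorch rendering of the same computation, following the views in the code above; sizes are illustrative and the final reorder follows the comment added in this PR:

```python
import torch

# Illustrative sizes; the kernel chooses block_n/block_k internally.
N, K, block_n, block_k, group_size = 64, 128, 32, 64, 32
Nc, Kc, G = N // block_n, K // block_k, K // group_size

weight = torch.randint(0, 16, (N, K), dtype=torch.int8)  # unpacked int4 values
new_qzeros = torch.randint(0, 16, (N, G), dtype=torch.int8)  # per-group zero points

# Compensation = sum over k of (W[k][n] - ZP[n]) within each [block_k] slice.
weight_sub_qzero = weight.view(Nc, block_n, G, -1).to(torch.int) - new_qzeros.view(
    Nc, block_n, G, -1
)
weight_sub_qzero = weight_sub_qzero.view(Nc, block_n, Kc, block_k)
compensation = weight_sub_qzero.sum(-1)  # [Nc, block_n, Kc]
# Reorder to [N / block_n, K / block_k, block_n] per the comment added above.
compensation = compensation.permute(0, 2, 1).contiguous()
```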

torchao/dtypes/uintx/dyn_int8_act_int4_wei_cpu_layout.py

Lines changed: 10 additions & 0 deletions
```diff
@@ -124,6 +124,10 @@ def from_plain(
         if zero_point.dim() == 1:
             zero_point.unsqueeze_(-1)
 
+        # Pack weight from [N, K] to [N / block_n, K / block_k, block_k, block_n].
+        # Pack the inner blocks [block_k, block_n] to VNNI layout if AMX is available.
+        # Pack scales/qzeros from [N, num_groups] to [N / block_n, num_groups, block_n].
+        # Compensation shape = [N / block_n, K / block_k, block_n].
         weight_int4, scales, qzeros, compensation = (
             torch.ops.torchao.da8w4_linear_prepack_cpu(int_data, scale, zero_point)
         )
@@ -310,3 +314,9 @@ def _linear_int8_act_int4_weight_cpu_impl(input_tensor, weight_tensor, bias):
     y = y.reshape(*orig_act_size[:-1], orig_out_features)
 
     return y.to(orig_dtype)
+
+
+# Register the concat linear fusion pass
+from ...prototype.inductor.fx_passes import register_da8w4_concat_linear_cpu_pass
+
+register_da8w4_concat_linear_cpu_pass()
```
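The blocked layouts described in the new comments can be visualized with plain reshapes and permutes. A sketch under illustrative sizes; it ignores the VNNI repacking of the inner [block_k, block_n] tiles (applied only when AMX is available) and the packing of two int4 values per int8:

```python
import torch

# Illustrative sizes only; the actual blocking is chosen by the CPU kernel.
N, K, block_n, block_k, num_groups = 64, 128, 32, 64, 4

weight = torch.randint(-8, 8, (N, K), dtype=torch.int8)  # int4 values in int8 storage
scales = torch.rand(N, num_groups)

# Weight: [N, K] -> [N / block_n, K / block_k, block_k, block_n]
blocked_weight = (
    weight.view(N // block_n, block_n, K // block_k, block_k)
    .permute(0, 2, 3, 1)
    .contiguous()
)

# Scales (and qzeros): [N, num_groups] -> [N / block_n, num_groups, block_n]
blocked_scales = (
    scales.view(N // block_n, block_n, num_groups).permute(0, 2, 1).contiguous()
)
```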
torchao/prototype/inductor/fx_passes/__init__.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -1,5 +1,7 @@
+from .da8w4_concat_linear_fusion_cpu import register_da8w4_concat_linear_cpu_pass
 from .int8_sdpa_fusion import _int8_sdpa_init
 
 __all__ = [
     "_int8_sdpa_init",
+    "register_da8w4_concat_linear_cpu_pass",
 ]
```
