Commit ed3ac7c

[Compressor] Update packed compressor to support zp packing (#296)

* update packed compressor
* update
* fix packing conditions
* update condition
* update
* Delete src/compressed_tensors/load_weights.py
* clean-up condition; add error message for decompression
* update
* add test, fix condition
* fix dtype

1 parent 3cbe247 · commit ed3ac7c

File tree: 4 files changed, +164 −25 lines

src/compressed_tensors/compressors/model_compressors/model_compressor.py

Lines changed: 1 addition & 0 deletions

@@ -382,6 +382,7 @@ def compress(
             compressed_state_dict = self.quantization_compressor.compress(
                 state_dict, names_to_scheme=quantized_modules_to_args
             )
+
             if self.quantization_config.format != CompressionFormat.dense.value:
                 self.quantization_config.quantization_status = (
                     QuantizationStatus.COMPRESSED

src/compressed_tensors/compressors/quantized_compressors/base.py

Lines changed: 16 additions & 3 deletions

@@ -18,7 +18,7 @@

 import torch
 from compressed_tensors.compressors.base import BaseCompressor
-from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from compressed_tensors.utils import (
     get_nested_mappings_from_state_dict,
     get_nested_weight_mappings,

@@ -132,8 +132,10 @@ def compress(
                         compressed_dict[merge_names(prefix, key)] = value
                 else:
                     compressed_dict[name] = value.to("cpu")
-            # only save if asym
-            elif is_weight_zp and quant_args_zp.symmetric:
+            # only save zp if asym and not packed zp
+            elif is_weight_zp and (
+                quant_args_zp.symmetric or self._check_if_zp_pack_quantized(quant_args)
+            ):
                 continue
             # only save if asym
             elif is_input_zp and input_args_zp.symmetric:

@@ -145,6 +147,17 @@ def compress(

         return compressed_dict

+    def _check_if_zp_pack_quantized(self, quant_args):
+        from compressed_tensors.compressors import PackedQuantizationCompressor
+
+        if isinstance(self, PackedQuantizationCompressor):
+            if not quant_args.symmetric and quant_args.strategy in [
+                QuantizationStrategy.GROUP.value,
+                QuantizationStrategy.CHANNEL.value,
+            ]:
+                return True
+        return False
+
     def decompress(
         self,
         path_to_model_or_tensors: Union[str, Path, Dict[str, Any]],
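
The helper added above returns True exactly when the packed compressor emits a packed weight_zero_point itself (asymmetric group or channel quantization), so the raw zero point in the state dict can be skipped just as a symmetric one is. A minimal self-contained sketch of that predicate, using a hypothetical ZPArgs stand-in for QuantizationArgs that models only the two fields involved (the real helper additionally requires the compressor to be the packed one):

from dataclasses import dataclass

@dataclass
class ZPArgs:
    symmetric: bool
    strategy: str  # e.g. "tensor", "channel", "group"

def zp_is_pack_quantized(args: ZPArgs) -> bool:
    # Pack (and therefore keep) the weight zero point only for asymmetric
    # group/channel quantization; symmetric schemes skip it entirely.
    return not args.symmetric and args.strategy in ("group", "channel")

assert zp_is_pack_quantized(ZPArgs(symmetric=False, strategy="group"))
assert not zp_is_pack_quantized(ZPArgs(symmetric=True, strategy="group"))
assert not zp_is_pack_quantized(ZPArgs(symmetric=False, strategy="tensor"))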

src/compressed_tensors/compressors/quantized_compressors/pack_quantized.py

Lines changed: 88 additions & 21 deletions

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from typing import Dict, Optional, Tuple
+from typing import Dict, Literal, Optional, Tuple, Union

 import numpy as np
 import torch

@@ -21,7 +21,7 @@
     BaseQuantizationCompressor,
 )
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
 from compressed_tensors.quantization.lifecycle.forward import dequantize, quantize
 from compressed_tensors.quantization.utils import can_quantize
 from torch import Tensor

@@ -65,10 +65,26 @@ def compression_param_info(
         """
         pack_factor = 32 // quantization_args.num_bits
         packed_size = math.ceil(weight_shape[1] / pack_factor)
-        return {
+        packed_size_zp = math.ceil(weight_shape[0] / pack_factor)
+        output = {
             "weight_packed": (torch.Size((weight_shape[0], packed_size)), torch.int32),
             "weight_shape": (torch.Size((2,)), torch.int32),
         }
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            zp_factor = (
+                quantization_args.group_size
+                if quantization_args.strategy == QuantizationStrategy.GROUP.value
+                else weight_shape[-1]
+            )
+
+            output["weight_zero_point"] = (
+                torch.Size((packed_size_zp, weight_shape[-1] // zp_factor)),
+                torch.int32,
+            )
+        return output

     def compress_weight(
         self,
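
As a concrete check of the new weight_zero_point entry, the shape arithmetic for an assumed 1024x1024 weight quantized to 4 bits works out as follows: rows are packed eight-per-int32, and the column count is the number of groups (or 1 for channel-wise quantization):

import math

rows, cols = 1024, 1024                      # hypothetical weight_shape
num_bits, group_size = 4, 128
pack_factor = 32 // num_bits                 # 8 zero points per int32
packed_size_zp = math.ceil(rows / pack_factor)   # 128 packed rows

zp_factor_group = group_size                 # group: one zp per group of columns
zp_factor_channel = cols                     # channel: one zp per row
print((packed_size_zp, cols // zp_factor_group))    # -> (128, 8)
print((packed_size_zp, cols // zp_factor_channel))  # -> (128, 1)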

@@ -104,6 +120,7 @@ def compress_weight(
             quantized_weight = weight

         packed_weight = pack_to_int32(quantized_weight, quantization_args.num_bits)
+
         weight_shape = torch.tensor(weight.shape)
         if device is not None:
             packed_weight = packed_weight.to(device)

@@ -112,6 +129,15 @@ def compress_weight(
         compressed_dict["weight_shape"] = weight_shape
         compressed_dict["weight_packed"] = packed_weight

+        # We typically don't compress zp; apart from when using the packed_compressor and when storing group/channel zp
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            packed_zp = pack_to_int32(
+                zero_point, quantization_args.num_bits, packed_dim=0
+            )
+            compressed_dict["weight_zero_point"] = packed_zp
         return compressed_dict

     def decompress_weight(
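
A usage sketch of the packing call added above. The import path and the precondition that the zero point is already an int8 tensor in the signed 4-bit range are assumptions, not shown in this diff:

import torch
from compressed_tensors.compressors.quantized_compressors.pack_quantized import (
    pack_to_int32,
)

# 1024 output channels, 8 groups -> one 4-bit zero point per (row, group)
zero_point = torch.randint(-8, 8, (1024, 8), dtype=torch.int8)
packed_zp = pack_to_int32(zero_point, num_bits=4, packed_dim=0)
print(packed_zp.shape, packed_zp.dtype)  # torch.Size([128, 8]) torch.int32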

@@ -133,14 +159,33 @@ def decompress_weight(
         original_shape = torch.Size(compressed_data["weight_shape"])
         num_bits = quantization_args.num_bits
         unpacked = unpack_from_int32(weight, num_bits, original_shape)
+
+        # NOTE: this will fail decompression as we don't currently handle packed zp on decompression
+        if not quantization_args.symmetric and quantization_args.strategy in [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]:
+            raise ValueError(
+                "Decompression of packed zero points is currently not supported"
+            )
+            assert zero_point is not None
+            original_zp_shape = (original_shape[0], scale.shape[-1])
+            zero_point = unpack_from_int32(
+                zero_point, num_bits, original_zp_shape, packed_dim=0
+            )
+
         decompressed_weight = dequantize(
             x_q=unpacked, scale=scale, zero_point=zero_point, g_idx=g_idx
         )

         return decompressed_weight


-def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
+def pack_to_int32(
+    value: torch.Tensor,
+    num_bits: int,
+    packed_dim: Union[Literal[0], Literal[1]] = 1,
+) -> torch.Tensor:
     """
     Packs a tensor of quantized weights stored in int8 into int32s with padding


@@ -176,22 +221,30 @@ def pack_to_int32(value: torch.Tensor, num_bits: int) -> torch.Tensor:
     pack_factor = 32 // num_bits

     # pad input tensor and initialize packed output
-    packed_size = math.ceil(value.shape[1] / pack_factor)
-    padding = packed_size * pack_factor - value.shape[1]
+    packed_size = math.ceil(value.shape[packed_dim] / pack_factor)
+    padding = packed_size * pack_factor - value.shape[packed_dim]
     value = np.pad(value, pad_width=[(0, 0), (0, padding)], constant_values=0)

     # pack values
-    packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
-    for i in range(pack_factor):
-        packed |= value[:, i::pack_factor] << num_bits * i
+    if packed_dim == 1:
+        packed = np.zeros((value.shape[0], packed_size), dtype=np.uint32)
+        for i in range(pack_factor):
+            packed |= value[:, i::pack_factor] << num_bits * i
+    else:
+        packed = np.zeros((packed_size, value.shape[1]), dtype=np.uint32)
+        for i in range(pack_factor):
+            packed |= value[i::pack_factor, :] << num_bits * i

     # convert back to signed and torch
     packed = np.ascontiguousarray(packed).view(np.int32)
     return torch.from_numpy(packed)


 def unpack_from_int32(
-    value: torch.Tensor, num_bits: int, shape: torch.Size
+    value: torch.Tensor,
+    num_bits: int,
+    shape: torch.Size,
+    packed_dim: Union[Literal[0], Literal[1]] = 1,
 ) -> torch.Tensor:
     """
     Unpacks a tensor of packed int32 weights into individual int8s, maintaining the
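
The packing loop itself is easiest to see on a toy example. The following self-contained snippet (plain Python, not the library function) shows eight 4-bit values being OR-ed into one 32-bit word; packed_dim only decides whether that happens along rows or columns:

num_bits = 4
pack_factor = 32 // num_bits        # 8 values per int32
vals = list(range(8))               # eight 4-bit values: 0..7

packed = 0
for i in range(pack_factor):
    packed |= vals[i] << (num_bits * i)   # value i occupies bits [4*i, 4*i + 4)

print(hex(packed))                  # 0x76543210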

@@ -216,17 +269,31 @@ def unpack_from_int32(

     # unpack
     mask = (1 << num_bits) - 1
-    unpacked = torch.zeros(
-        (value.shape[0], value.shape[1] * pack_factor),
-        device=value.device,
-        dtype=torch.int32,
-    )
-    for i in range(pack_factor):
-        unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
-
-    # remove padding
-    original_row_size = int(shape[1])
-    unpacked = unpacked[:, :original_row_size]
+
+    if packed_dim == 1:
+        unpacked = torch.zeros(
+            (value.shape[0], value.shape[1] * pack_factor),
+            device=value.device,
+            dtype=torch.int32,
+        )
+        for i in range(pack_factor):
+            unpacked[:, i::pack_factor] = (value >> (num_bits * i)) & mask
+
+        # remove padding
+        original_row_size = int(shape[1])
+        unpacked = unpacked[:, :original_row_size]
+    else:
+        unpacked = torch.zeros(
+            (value.shape[0] * pack_factor, value.shape[1]),
+            device=value.device,
+            dtype=torch.int32,
+        )
+        for i in range(pack_factor):
+            unpacked[i::pack_factor, :] = (value >> (num_bits * i)) & mask
+
+        # remove padding
+        original_row_size = int(shape[0])
+        unpacked = unpacked[:original_row_size, :]

     # bits are packed in unsigned format, reformat to signed
     # update the value range from unsigned to signed
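
A round-trip sketch of the new packed_dim=0 path. The import path and the final cast to int8 are assumptions, since the signed/unsigned conversion at the end of unpack_from_int32 is not shown in this hunk:

import torch
from compressed_tensors.compressors.quantized_compressors.pack_quantized import (
    pack_to_int32,
    unpack_from_int32,
)

original = torch.randint(-8, 8, (64, 16), dtype=torch.int8)  # 4-bit values, rows divisible by 8
packed = pack_to_int32(original, num_bits=4, packed_dim=0)   # -> (8, 16) int32
restored = unpack_from_int32(packed, 4, original.shape, packed_dim=0)
assert torch.equal(restored.to(torch.int8), original)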

tests/test_compressors/quantized_compressors/test_pack_quant.py

Lines changed: 59 additions & 1 deletion

@@ -29,6 +29,7 @@
     QuantizationConfig,
     QuantizationScheme,
     QuantizationStatus,
+    QuantizationStrategy,
     apply_quantization_config,
 )
 from compressed_tensors.quantization.lifecycle.forward import fake_quantize

@@ -76,7 +77,7 @@ def test_quant_format(shape):
     dense_state_dict = {
         "dummy.weight": torch.rand(shape),
         "dummy.weight_scale": torch.tensor(0.01, dtype=torch.float32),
-        "dummy.weight_zero_point": torch.tensor(0, dtype=torch.int32),
+        "dummy.weight_zero_point": torch.tensor(0, dtype=torch.int8),
     }
     quant_config = get_dummy_quant_config()

@@ -203,6 +204,63 @@ def test_reload_match(tmp_path, num_bits):
     shutil.rmtree(tmp_path)


+@pytest.mark.parametrize(
+    "strategy",
+    {QuantizationStrategy.GROUP, QuantizationStrategy.CHANNEL},
+)
+def test_asymmetric_packed_support(strategy):
+    shape = (1024, 1024)
+
+    group_size = None
+    if strategy == QuantizationStrategy.GROUP:
+        group_size = 128
+
+    if strategy == QuantizationStrategy.CHANNEL:
+        expected_shape = (shape[0], 1)
+    elif strategy == QuantizationStrategy.GROUP:
+        num_groups = shape[1] // group_size
+        expected_shape = (shape[0], max(num_groups, 1))
+
+    dense_state_dict = {
+        "dummy.weight": torch.rand(shape),
+        "dummy.weight_scale": torch.rand(expected_shape).to(torch.float32),
+        "dummy.weight_zero_point": torch.rand(expected_shape).to(torch.int8),
+    }
+
+    quant_config = get_dummy_quant_config(
+        strategy=strategy.value, symmetric=False, group_size=group_size
+    )
+
+    compressor = PackedQuantizationCompressor(config=quant_config)
+    quantized_modules_to_args = {"dummy": quant_config.config_groups["group_1"].weights}
+    compressed_state_dict = compressor.compress(
+        dense_state_dict, names_to_scheme=quantized_modules_to_args
+    )
+
+    # compressed state_dict adds one entry for shape
+    assert len(dense_state_dict) + 1 == len(compressed_state_dict)
+    assert compressed_state_dict["dummy.weight_packed"].dtype == torch.int32
+    assert compressed_state_dict["dummy.weight_zero_point"].dtype == torch.int32
+    assert compressed_state_dict["dummy.weight_scale"].dtype == torch.float32
+
+    # check weight compressed and packed
+    expected_rows = shape[0]
+    expected_columns = math.ceil(shape[1] / 8)  # round each row up to nearest int32
+    assert compressed_state_dict["dummy.weight_packed"].shape == (
+        expected_rows,
+        expected_columns,
+    )
+    assert torch.equal(compressed_state_dict["dummy.weight_shape"], torch.tensor(shape))
+
+    # check zp compressed and packed
+    packed_size_zp = math.ceil(shape[0] / 8)
+    zp_factor = group_size if strategy == QuantizationStrategy.GROUP else shape[-1]
+    assert compressed_state_dict["dummy.weight_zero_point"].shape == (
+        packed_size_zp,
+        shape[-1] // zp_factor,
+    )
+
+
 @pytest.mark.parametrize(
     "actorder",
     [
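
The asserted shapes follow from the same packing arithmetic; assuming the dummy config quantizes weights to 4 bits (the pack factor of 8 used in the test implies this), the group case works out to:

import math

shape = (1024, 1024)
pack_factor = 8                    # 32 bits / 4-bit dummy weights
group_size = 128

weight_packed_shape = (shape[0], math.ceil(shape[1] / pack_factor))            # (1024, 128)
zp_packed_shape = (math.ceil(shape[0] / pack_factor), shape[1] // group_size)  # (128, 8)
print(weight_packed_shape, zp_packed_shape)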
