
Commit c1b06de

reduce memory requirements, clarify map_modules_to_quant_scheme
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent 4574747 commit c1b06de

4 files changed: +103 -92 lines changed
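The memory reduction comes from compressing the state dict in place: instead of accumulating a second `compressed_dict` alongside the original, the compressors now mutate the input dict directly, deleting each uncompressed tensor as soon as its replacement is written. A minimal sketch of the pattern (illustrative only, not the library code; `compress_one` is a hypothetical stand-in for the real per-tensor compression):

from typing import Dict

import torch


def compress_one(value: torch.Tensor) -> torch.Tensor:
    # hypothetical stand-in: the real code quantizes/packs the tensor
    return value


def compress_in_place(state_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
    # snapshot the keys so entries can be deleted/replaced while iterating
    for name in list(state_dict.keys()):
        value = state_dict.pop(name)
        # at most one uncompressed tensor is alive alongside its compressed
        # form, rather than a full second dict holding every compressed tensor
        state_dict[name] = compress_one(value).to("cpu")
    return state_dict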

src/compressed_tensors/compressors/model_compressors/model_compressor.py

Lines changed: 14 additions & 22 deletions
@@ -35,6 +35,7 @@
 from compressed_tensors.quantization import (
     DEFAULT_QUANTIZATION_METHOD,
     QuantizationConfig,
+    QuantizationScheme,
     QuantizationStatus,
     apply_quantization_config,
     load_pretrained_quantization,
@@ -61,7 +62,7 @@
 from transformers.file_utils import CONFIG_NAME
 
 
-__all__ = ["ModelCompressor", "map_modules_to_quant_args"]
+__all__ = ["ModelCompressor", "map_modules_to_quant_scheme"]
 
 _LOGGER: logging.Logger = logging.getLogger(__name__)
 
@@ -372,15 +373,13 @@ def compress(
         if state_dict is None:
             state_dict = model.state_dict()
 
-        compressed_state_dict = state_dict
-
-        quantized_modules_to_args: Dict[
-            str, QuantizationArgs
-        ] = map_modules_to_quant_args(model)
+        module_to_scheme: Dict[str, QuantizationScheme] = map_modules_to_quant_scheme(
+            model
+        )
 
         if self.quantization_compressor is not None:
-            compressed_state_dict = self.quantization_compressor.compress(
-                state_dict, names_to_scheme=quantized_modules_to_args
+            state_dict = self.quantization_compressor.compress(
+                state_dict, names_to_scheme=module_to_scheme
             )
             if self.quantization_config.format != CompressionFormat.dense.value:
                 self.quantization_config.quantization_status = (
@@ -393,8 +392,8 @@ def compress(
                 targets=self.sparsity_config.targets,
                 ignore=self.sparsity_config.ignore,
             )
-            compressed_state_dict = self.sparsity_compressor.compress(
-                compressed_state_dict,
+            state_dict = self.sparsity_compressor.compress(
+                state_dict,
                 compression_targets=sparse_compression_targets,
             )
 
@@ -403,7 +402,7 @@ def compress(
         # https://github.com/huggingface/transformers/pull/30488
         transformers.modeling_utils.dtype_byte_size = new_dtype_byte_size
 
-        return compressed_state_dict
+        return state_dict
 
     def decompress(self, model_path: str, model: Module):
         """
@@ -522,9 +521,9 @@ def _replace_weights(self, dense_weight_generator, model: Module):
             update_parameter_data(module, data, param_name)
 
 
-def map_modules_to_quant_args(
+def map_modules_to_quant_scheme(
     model: Module,
-) -> Dict[str, Union[QuantizationArgs, Tuple[QuantizationArgs, QuantizationArgs]]]:
+) -> Dict[str, QuantizationScheme]:
     """
     Given a pytorch model, map out the submodule name (usually linear layers)
     to the weight QuantizationArgs. If running input activation quantization, will also
@@ -535,15 +534,8 @@ def map_modules_to_quant_args(
     quantized_modules_to_args = {}
    for name, submodule in iter_named_leaf_modules(model):
        if is_module_quantized(submodule):
-            if submodule.quantization_scheme.weights is not None:
-                name = fix_fsdp_module_name(name)
-                quantized_modules_to_args[name] = submodule.quantization_scheme.weights
-            if submodule.quantization_scheme.input_activations is not None:
-                weight_args = quantized_modules_to_args.get(name)
-                quantized_modules_to_args[name] = (
-                    weight_args,
-                    submodule.quantization_scheme.input_activations,
-                )
+            name = fix_fsdp_module_name(name)
+            quantized_modules_to_args[name] = submodule.quantization_scheme
 
     return quantized_modules_to_args
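Callers that previously received a per-module `QuantizationArgs` (or a `(weight_args, input_args)` tuple) now receive the whole `QuantizationScheme` and select the field they need. A hedged usage sketch, assuming a model already quantized with compressed-tensors (the import path is taken from the file above; the summary format is illustrative):

from typing import Dict

from torch.nn import Module

from compressed_tensors.compressors.model_compressors.model_compressor import (
    map_modules_to_quant_scheme,
)


def summarize_schemes(model: Module) -> Dict[str, str]:
    # assumes leaf modules carry a `quantization_scheme` attribute,
    # as set up by apply_quantization_config
    summary = {}
    for name, scheme in map_modules_to_quant_scheme(model).items():
        weight_args = scheme.weights            # QuantizationArgs or None
        input_args = scheme.input_activations   # QuantizationArgs or None
        summary[name] = f"weights={weight_args}, inputs={input_args}"
    return summary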

src/compressed_tensors/compressors/quantized_compressors/base.py

Lines changed: 81 additions & 69 deletions
@@ -18,11 +18,12 @@
 
 import torch
 from compressed_tensors.compressors.base import BaseCompressor
-from compressed_tensors.quantization import QuantizationArgs
+from compressed_tensors.quantization import QuantizationScheme
 from compressed_tensors.utils import (
     get_nested_mappings_from_state_dict,
     get_nested_weight_mappings,
     merge_names,
+    remove_suffix,
 )
 from safetensors import safe_open
 from torch import Tensor
@@ -69,95 +70,82 @@ class BaseQuantizationCompressor(BaseCompressor):
     def compress(
         self,
         model_state: Dict[str, Tensor],
-        names_to_scheme: Dict[str, QuantizationArgs],
+        names_to_scheme: Dict[str, QuantizationScheme],
         **kwargs,
     ) -> Dict[str, Tensor]:
         """
         Compresses a dense state dict
 
-        :param model_state: state dict of uncompressed model
+        :param model_state: state dict of uncompressed model, consumed during
+            compression
         :param names_to_scheme: quantization args for each quantized weight, needed for
             quantize function to calculate bit depth
         :return: compressed state dict
         """
-        compressed_dict = {}
-        weight_suffix = ".weight"
-        input_zp_suffix = ".input_zero_point"
-        weight_zp_suffix = ".weight_zero_point"
-        _LOGGER.debug(
-            f"Compressing model with {len(model_state)} parameterized layers..."
-        )
+        save_device = "cpu"
+
+        uncompressed_names = list(model_state.keys())
+        for name in tqdm(uncompressed_names, desc="Compressing with quantization"):
+            value = model_state[name]
+
+            # compress weights
+            if name.endswith(".weight"):
+                prefix = remove_suffix(name, ".weight")
 
-        for name, value in tqdm(model_state.items(), desc="Quantized Compression"):
-            # check if the parameter we're compressing is the weight zp
-            # or the input zp
-            is_weight_zp = name.endswith(weight_zp_suffix)
-            is_input_zp = name.endswith(input_zp_suffix)
-
-            # if we're saving the weight zp, fetch weight quant args
-            if is_weight_zp:
-                quant_args_zp = names_to_scheme.get(name[: -(len(weight_zp_suffix))])
-                if isinstance(quant_args_zp, tuple):
-                    # If tuple, first value is weight args, second is input args
-                    quant_args_zp = quant_args_zp[0]
-
-            # if we're saving the input zp, fetch input quant args
-            if is_input_zp:
-                input_args_zp = names_to_scheme.get(name[: -(len(input_zp_suffix))])
-                if isinstance(input_args_zp, tuple):
-                    # If tuple, first value is weight args, second is input args
-                    input_args_zp = input_args_zp[-1]
-
-            if name.endswith(weight_suffix):
-                prefix = name[: -(len(weight_suffix))]
+                # gather qparams
                 scale = model_state.get(merge_names(prefix, "weight_scale"), None)
-                zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
                 g_idx = model_state.get(merge_names(prefix, "weight_g_idx"), None)
-                if scale is not None:
-                    # weight is quantized, compress it
-                    if isinstance(names_to_scheme[prefix], tuple):
-                        quant_args = names_to_scheme[prefix][0]
-                    else:
-                        quant_args = names_to_scheme[prefix]
-
-                    compressed_data = self.compress_weight(
-                        weight=value,
-                        scale=scale,
-                        zero_point=zp,
-                        g_idx=g_idx,
-                        quantization_args=quant_args,
-                        device="cpu",
-                    )
-                    for key, value in compressed_data.items():
-                        compressed_dict[merge_names(prefix, key)] = value
-                else:
-                    compressed_dict[name] = value.to("cpu")
-            # only save if asym
-            elif is_weight_zp and quant_args_zp.symmetric:
-                continue
-            # only save if asym
-            elif is_input_zp and input_args_zp.symmetric:
-                continue
-            elif name.endswith("g_idx") and torch.any(value <= -1):
-                continue
+                zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
+
+                # if the scale does not exist, then the weight cannot be compressed
+                if scale is None:
+                    model_state[name] = value.to(save_device)
+                    continue
+
+                # compress values on cpu. TODO: experiment with different devices
+                quant_args = names_to_scheme[prefix].weights
+                compressed_values = self.compress_weight(
+                    weight=value,
+                    scale=scale,
+                    zero_point=zp,
+                    g_idx=g_idx,
+                    quantization_args=quant_args,
+                    device="cpu",
+                )
+
+                # update state dict
+                del model_state[name]
+                for key, value in compressed_values.items():
+                    model_state[merge_names(prefix, key)] = value.to(save_device)
+
             else:
-                compressed_dict[name] = value.to("cpu")
+                # omit saving zero points for symmetric quantization
+                if name.endswith("zero_point") and _is_symmetric(name, names_to_scheme):
+                    del model_state[name]
 
-        return compressed_dict
+                # omit saving g_idx if uninitialized
+                # TODO: does this case actually occur?
+                elif name.endswith("g_idx") and torch.any(value <= -1):
+                    del model_state[name]
+
+                else:
+                    model_state[name] = value.to(save_device)
+
+        return model_state
 
     def decompress(
         self,
         path_to_model_or_tensors: Union[str, Path, Dict[str, Any]],
-        names_to_scheme: Dict[str, QuantizationArgs],
-        device: str = "cpu",
+        names_to_scheme: Dict[str, QuantizationScheme],
+        device: torch.device = "cpu",
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a compressed state dict located at path_to_model_or_tensors
         and returns a generator for sequentially decompressing back to a
         dense state dict
         :param path_to_model_or_tensors: path to compressed safetensors model (directory
             with one or more safetensors files) or compressed tensors file
-        :param names_to_scheme: quantization args for each quantized weight
+        :param names_to_scheme: quantization scheme for each quantized weight
         :param device: optional device to load intermediate weights into
         :return: compressed state dict
         """
@@ -171,7 +159,12 @@ def decompress(
             path_to_model_or_tensors, names_to_scheme
         )
 
-    def _decompress_from_path(self, path_to_model, names_to_scheme, device):
+    def _decompress_from_path(
+        self,
+        path_to_model: Union[str, Path, Dict[str, Any]],
+        names_to_scheme: Dict[str, QuantizationScheme],
+        device: torch.device,
+    ):
         weight_mappings = get_nested_weight_mappings(
             path_to_model, self.compression_param_names
         )
@@ -182,13 +175,17 @@ def _decompress_from_path(self, path_to_model, names_to_scheme, device):
             with safe_open(safe_path, framework="pt", device=device) as f:
                 weight_data[param_name] = f.get_tensor(full_name)
             if "weight_scale" in weight_data:
-                quant_args = names_to_scheme[weight_name]
+                quant_args = names_to_scheme[weight_name].weights
                 decompressed = self.decompress_weight(
                     compressed_data=weight_data, quantization_args=quant_args
                 )
                 yield merge_names(weight_name, "weight"), decompressed
 
-    def _decompress_from_state_dict(self, state_dict, names_to_scheme):
+    def _decompress_from_state_dict(
+        self,
+        state_dict: Dict[str, torch.Tensor],
+        names_to_scheme: Dict[str, QuantizationScheme],
+    ):
         weight_mappings = get_nested_mappings_from_state_dict(
             state_dict, self.compression_param_names
         )
@@ -198,8 +195,23 @@ def _decompress_from_state_dict(self, state_dict, names_to_scheme):
                 weight_data[param_name] = param_value
 
             if "weight_scale" in weight_data:
-                quant_args = names_to_scheme[weight_name]
+                quant_args = names_to_scheme[weight_name].weights
                 decompressed = self.decompress_weight(
                     compressed_data=weight_data, quantization_args=quant_args
                 )
                 yield merge_names(weight_name, "weight"), decompressed
+
+
+def _is_symmetric(name: str, names_to_scheme: Dict[str, QuantizationScheme]) -> bool:
+    weight_name, zp_name = name.rsplit(".", 1)
+    scheme = names_to_scheme[weight_name]
+
+    if zp_name == "weight_zero_point":
+        quant_args = scheme.weights
+    if zp_name == "input_zero_point":
+        quant_args = scheme.input_activations
+    if zp_name == "output_zero_point":
+        quant_args = scheme.output_activations
+
+    assert quant_args is not None
+    return quant_args.symmetric
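For the new `_is_symmetric` helper, note that `names_to_scheme` is keyed on the module prefix, so `rsplit(".", 1)` separates the module path from the zero-point suffix, which in turn selects which args to consult. A worked example tracing the lookup (the module path is illustrative, and `_is_symmetric` is a private helper, so this only runs in the scope of base.py itself):

from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme

name = "model.layers.0.mlp.down_proj.weight_zero_point"
module_name, zp_name = name.rsplit(".", 1)
assert module_name == "model.layers.0.mlp.down_proj"
assert zp_name == "weight_zero_point"  # selects scheme.weights

# symmetric weight quantization: the zero point is identically zero,
# so compress() deletes it from the state dict instead of saving it
scheme = QuantizationScheme(
    targets=["Linear"],
    weights=QuantizationArgs(num_bits=4, symmetric=True),
)
assert _is_symmetric(name, {module_name: scheme})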

src/compressed_tensors/compressors/sparse_compressors/base.py

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ def compress(
         _LOGGER.debug(
             f"Compressing model with {len(model_state)} parameterized layers..."
         )
-        for name, value in tqdm(model_state.items(), desc="Compressing model"):
+        for name, value in tqdm(model_state.items(), desc="Compressing with sparsity"):
             if not self.should_compress(name, compression_targets):
                 compressed_dict[name] = value
                 continue

src/compressed_tensors/utils/helpers.py

Lines changed: 7 additions & 0 deletions
@@ -38,6 +38,7 @@
     "shard_tensor",
     "pack_bitmasks",
     "unpack_bitmasks",
+    "remove_suffix",
 ]
 
 FSDP_WRAPPER_NAME = "_fsdp_wrapped_module"
@@ -328,3 +329,9 @@ def unpack_bitmasks(
     )
 
     return unpacked_bitmasks_torch
+
+
+def remove_suffix(value: str, suffix: str) -> str:
+    # can replace with str.removesuffix in python3.9+
+    assert value.endswith(suffix)
+    return value[: -len(suffix)]
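A quick usage note on `remove_suffix`: unlike `str.removesuffix`, which is a no-op when the suffix is absent, this helper asserts the suffix is present, matching how the compressor only calls it on names already known to end in `.weight`:

from compressed_tensors.utils import remove_suffix

assert remove_suffix("model.layers.0.q_proj.weight", ".weight") == "model.layers.0.q_proj"

# absent suffix: str.removesuffix returns the input unchanged,
# while remove_suffix raises AssertionError instead
# remove_suffix("model.layers.0.q_proj.bias", ".weight")  # AssertionError

One edge case worth noting: an empty suffix passes the assertion but returns `value[:0]`, i.e. the empty string, so callers should only pass non-empty suffixes.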
