
Commit dfef94d

Merge remote-tracking branch 'origin' into kylesayrs/reduce-quantized-compression-memory

2 parents: b5374ae + 16e6435

File tree: 12 files changed, +247 -97 lines

src/compressed_tensors/compressors/base.py

Lines changed: 6 additions & 1 deletion
@@ -19,6 +19,7 @@
 from compressed_tensors.config import SparsityCompressionConfig
 from compressed_tensors.quantization import QuantizationArgs, QuantizationConfig
 from compressed_tensors.registry import RegistryMixin
+from compressed_tensors.utils import has_offloaded_params
 from torch import Tensor
 from torch.nn import Module
@@ -169,6 +170,10 @@ def decompress_module(self, module: Module):
         :param module: PyTorch module to decompress
         :return: tensor of the decompressed weight, or None if module is not quantized
         """
+
+        params_device = next(module.parameters()).device
+        device = "cpu" if has_offloaded_params(module) else params_device
+
         if not hasattr(module, "quantization_scheme"):
             return None  # module is not quantized
         quantization_scheme = module.quantization_scheme
@@ -182,7 +187,7 @@ def decompress_module(self, module: Module):

         return self.decompress_weight(
             compressed_data=compressed_data, quantization_args=quantization_args
-        )
+        ).to(device)

     def decompress_weight(
         self, compressed_data: Dict[str, Tensor], **kwargs
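A note on the device handling introduced here: decompress_module now resolves a target device up front, falling back to CPU when the module's parameters are offloaded, since offloaded modules expose their parameters on the meta device and the decompressed weight cannot be materialized there. Below is a minimal, self-contained sketch of the pattern; has_offloaded_params is a simplified stand-in assuming accelerate's _hf_hook convention, not the actual library implementation.

import torch


def has_offloaded_params(module: torch.nn.Module) -> bool:
    # Simplified stand-in: accelerate attaches an `_hf_hook` to modules whose
    # parameters are offloaded; the real helper in compressed_tensors.utils
    # also checks the hook type. This is an assumption for illustration.
    hook = getattr(module, "_hf_hook", None)
    return hook is not None and getattr(hook, "offload", False)


def resolve_decompress_device(module: torch.nn.Module) -> torch.device:
    # Offloaded modules report their parameters on the meta device, so the
    # decompressed weight must land on CPU instead of the reported device.
    if has_offloaded_params(module):
        return torch.device("cpu")
    return next(module.parameters()).device


linear = torch.nn.Linear(8, 8)
print(resolve_decompress_device(linear))  # the module's param device, e.g. cpu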

src/compressed_tensors/compressors/model_compressors/model_compressor.py

Lines changed: 89 additions & 7 deletions
@@ -42,6 +42,7 @@
     SPARSITY_CONFIG_NAME,
 )
 from compressed_tensors.compressors.base import BaseCompressor
+from compressed_tensors.compressors.sparse_compressors import DenseCompressor
 from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
 from compressed_tensors.linear.compressed_linear import CompressedLinear
 from compressed_tensors.quantization import (
@@ -50,7 +51,7 @@
     QuantizationScheme,
     QuantizationStatus,
     apply_quantization_config,
-    load_pretrained_quantization,
+    load_pretrained_quantization_parameters,
 )
 from compressed_tensors.quantization.lifecycle import expand_target_names
 from compressed_tensors.quantization.utils import (
@@ -59,8 +60,10 @@
 )
 from compressed_tensors.utils import (
     get_safetensors_folder,
+    has_offloaded_params,
     merge_names,
     module_replace_dfs,
+    register_offload_parameter,
     update_parameter_data,
 )
 from compressed_tensors.utils.helpers import (
@@ -448,6 +451,13 @@ def decompress(self, model_path: str, model: Module):

         :param model_path: path to compressed weights
         :param model: pytorch model to load decompressed weights into
+
+        Note: decompress makes use of both _replace_sparsity_weights and
+        _replace_weights. The split reflects subtle differences between the
+        sparsity and quantization compressors: quantization compressors return
+        not just the decompressed weight but also its quantization parameters
+        (e.g. scales, zero_point), whereas sparsity compressors return only
+        the decompressed weight.
+
         """
         model_path = get_safetensors_folder(model_path)
         sparse_decompressed = False
@@ -456,9 +466,16 @@ def decompress(self, model_path: str, model: Module):
             self.sparsity_compressor is not None
             and self.sparsity_config.format != CompressionFormat.dense.value
         ):
+            params_to_ignore = None
+            if self.quantization_compressor is not None:
+                params_to_ignore = self.quantization_compressor.compression_param_names
             # Sparse decompression is applied on the model_path
-            dense_gen = self.sparsity_compressor.decompress(model_path)
-            self._replace_weights(dense_gen, model)
+            # The compressor will also try to load quantization parameters;
+            # params_to_skip_load keeps them from being loaded here
+            dense_gen = self.sparsity_compressor.decompress(
+                model_path, params_to_skip_load=params_to_ignore
+            )
+            self._replace_sparsity_weights(dense_gen, model)
             setattr(model, SPARSITY_CONFIG_NAME, self.sparsity_compressor.config)
             sparse_decompressed = True
@@ -467,13 +484,27 @@ def decompress(self, model_path: str, model: Module):
             # quantization during apply_quantization_config. This ensures
             # that the dtypes of the weights are not unintentionally updated.
             # The status is restored after quantization params are loaded.
+
             with override_quantization_status(
                 self.quantization_config, QuantizationStatus.FROZEN
             ):
+
                 names_to_scheme = apply_quantization_config(
                     model, self.quantization_config
                 )
-                load_pretrained_quantization(model, model_path)
+                # Load activation scales/zp or any other quantization parameters.
+                # Conditionally load the weight quantization parameters if we
+                # have a dense compressor or if a sparsity compressor has
+                # already been applied
+                load_pretrained_quantization_parameters(
+                    model,
+                    model_path,
+                    # TODO: all weight quantization params will be moved to the
+                    # compressor in a follow-up, including initialization
+                    load_weight_quantization=(
+                        sparse_decompressed
+                        or isinstance(self.quantization_compressor, DenseCompressor)
+                    ),
+                )

             model_path_or_state_dict = (
                 model.state_dict() if sparse_decompressed else model_path
@@ -482,6 +513,8 @@ def decompress(self, model_path: str, model: Module):
             dense_gen = self.quantization_compressor.decompress(
                 model_path_or_state_dict, names_to_scheme=names_to_scheme
             )
+            # TODO: all weight quantization params will be moved to the compressor
+            # to prevent duplicate parameter updates in update_parameter_data
             self._replace_weights(dense_gen, model)

             def freeze_quantization_status(module):
@@ -537,7 +570,7 @@ def update_config(self, save_directory: str):
         with open(config_file_path, "w") as config_file:
             json.dump(config_data, config_file, indent=2, sort_keys=True)

-    def _replace_weights(self, dense_weight_generator, model: Module):
+    def _replace_sparsity_weights(self, dense_weight_generator, model: Module):
         """
         Replace the weights of the model with the
         provided dense weights.
@@ -552,11 +585,60 @@ def _replace_weights(self, dense_weight_generator, model: Module):
         :param model: The model whose weights are to be updated.
         """
         for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
+
             split_name = name.split(".")
             prefix, param_name = ".".join(split_name[:-1]), split_name[-1]
             module = operator.attrgetter(prefix)(model)
-            if hasattr(module, param_name):
-                update_parameter_data(module, data, param_name)
+
+            params_device = next(module.parameters()).device
+            device = "cpu" if has_offloaded_params(module) else params_device
+            delattr(module, param_name)
+            requires_grad = data.dtype in (torch.float16, torch.float32, torch.bfloat16)
+            param = torch.nn.Parameter(data.to(device), requires_grad=requires_grad)
+            register_offload_parameter(module, param_name, param)
+
+    def _replace_weights(self, dense_weight_generator, model: Module):
+        """
+        Replace the weights of the model with the
+        provided dense weights.
+
+        This method iterates over the dense_weight_generator and
+        updates the corresponding weights in the model. If a parameter
+        name does not exist in the model, it will be skipped.
+
+        :param dense_weight_generator (generator): A generator that yields
+            tuples of (name, data), where 'name' is the parameter name and
+            'data' is the updated param data
+        :param model: The model whose weights are to be updated.
+        """
+
+        for name, data in tqdm(dense_weight_generator, desc="Decompressing model"):
+            module = operator.attrgetter(name)(model)
+
+            params_device = next(module.parameters()).device
+            device = "cpu" if has_offloaded_params(module) else params_device
+
+            for param_name, param_data in data.items():
+                if hasattr(module, param_name):
+                    # If compressed, will have an incorrect dtype for transformers >4.49
+                    # TODO: we could also skip initialization of scales/zp during
+                    # decompression in init, to be consistent with loading, which
+                    # happens later as well. However, update_data does a good
+                    # shape check - should be moved to the compressor
+                    if param_name == "weight":
+                        delattr(module, param_name)
+                        requires_grad = param_data.dtype in (
+                            torch.float16,
+                            torch.float32,
+                            torch.bfloat16,
+                        )
+                        param = torch.nn.Parameter(
+                            param_data.to(device), requires_grad=requires_grad
+                        )
+                        register_offload_parameter(module, param_name, param)
+                    else:
+                        # Should already be registered to the correct device
+                        # for scales/zero-points
+                        update_parameter_data(module, param_data, param_name)


 def map_module_to_scheme(model: Module) -> Dict[str, QuantizationScheme]:
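As the new docstring note explains, the two replace methods exist because the sparsity and quantization decompressors yield differently shaped items. A toy sketch of the two generator contracts and how each replace path consumes them (illustrative names, not the library API):

import torch


def sparsity_gen():
    # Sparsity decompressors yield (fully-qualified param name, dense tensor).
    yield "layer.weight", torch.randn(4, 4)


def quantization_gen():
    # Quantization decompressors yield (module name, dict of params),
    # including the quantization parameters alongside the weight.
    yield "layer", {
        "weight": torch.randn(4, 4),
        "weight_scale": torch.tensor([0.1]),
        "weight_zero_point": torch.tensor([0]),
    }


model = torch.nn.ModuleDict({"layer": torch.nn.Linear(4, 4, bias=False)})

# _replace_sparsity_weights-style consumption: split off the param name.
for name, data in sparsity_gen():
    prefix, _, param_name = name.rpartition(".")
    module = model[prefix]
    setattr(module, param_name, torch.nn.Parameter(data))

# _replace_weights-style consumption: the name addresses the module itself,
# and each entry in the dict is handled per parameter.
for name, data in quantization_gen():
    module = model[name]
    for param_name, param_data in data.items():
        if param_name == "weight":
            module.weight = torch.nn.Parameter(param_data)
        # scales/zero-points would be updated in place via update_parameter_data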

src/compressed_tensors/compressors/quantized_compressors/base.py

Lines changed: 35 additions & 38 deletions
@@ -14,7 +14,7 @@

 import logging
 from pathlib import Path
-from typing import Any, Dict, Generator, Tuple, Union
+from typing import Any, Dict, Generator, Optional, Tuple, Union

 import torch
 from compressed_tensors.compressors.base import BaseCompressor
@@ -121,29 +121,46 @@ def compress(

             else:
                 # omit saving zero points for symmetric quantization
-                if name.endswith("zero_point") and _is_symmetric(name, names_to_scheme):
+                if name.endswith("zero_point") and not self._should_save_zp(
+                    name, names_to_scheme
+                ):
                     continue

                 # omit saving for g_idx if uninitialized
                 # TODO: does this case actually occur?
                 elif name.endswith("g_idx") and torch.any(value <= -1):
                     continue

-                else:
-                    compressed_dict[name] = value.to(save_device)
+                compressed_dict[name] = value.to(save_device)

         return compressed_dict

-    def _check_if_zp_pack_quantized(self, quant_args):
+    def _should_save_zp(
+        self, name: str, names_to_scheme: Dict[str, QuantizationScheme]
+    ) -> bool:
         from compressed_tensors.compressors import PackedQuantizationCompressor

-        if isinstance(self, PackedQuantizationCompressor):
-            if not quant_args.symmetric and quant_args.strategy in [
-                QuantizationStrategy.GROUP.value,
-                QuantizationStrategy.CHANNEL.value,
-            ]:
-                return True
-        return False
+        module_name, zp_name = name.rsplit(".", 1) if "." in name else ("", name)
+        scheme = names_to_scheme[module_name]
+
+        if zp_name == "weight_zero_point":
+            args = scheme.weights
+        if zp_name == "input_zero_point":
+            args = scheme.input_activations
+        if zp_name == "output_zero_point":
+            args = scheme.output_activations
+
+        symmetric = args.symmetric
+        packable_strats = [
+            QuantizationStrategy.GROUP.value,
+            QuantizationStrategy.CHANNEL.value,
+        ]
+        packed = (
+            isinstance(self, PackedQuantizationCompressor)
+            and args.strategy in packable_strats
+        )
+
+        return not symmetric and not packed

     def decompress(
         self,
@@ -191,13 +208,10 @@ def _decompress_from_path(
             decompressed = self.decompress_weight(
                 compressed_data=weight_data, quantization_args=quant_args
             )
-            yield merge_names(weight_name, "weight"), decompressed
+            weight_data["weight"] = decompressed
+            yield weight_name, weight_data

-    def _decompress_from_state_dict(
-        self,
-        state_dict: Dict[str, torch.Tensor],
-        names_to_scheme: Dict[str, QuantizationScheme],
-    ):
+    def _decompress_from_state_dict(self, state_dict, names_to_scheme):
         weight_mappings = get_nested_mappings_from_state_dict(
             state_dict, self.compression_param_names
         )
@@ -207,26 +221,9 @@ def _decompress_from_state_dict(
                 weight_data[param_name] = param_value

             if "weight_scale" in weight_data:
-                quant_args = names_to_scheme[weight_name].weights
+                quant_args = names_to_scheme[weight_name]
                 decompressed = self.decompress_weight(
                     compressed_data=weight_data, quantization_args=quant_args
                 )
-                yield merge_names(weight_name, "weight"), decompressed
-
-
-def _is_symmetric(name: str, names_to_scheme: Dict[str, QuantizationScheme]) -> bool:
-    try:
-        weight_name, zp_name = name.rsplit(".", 1) if "." in name else ("", name)
-    except:
-        breakpoint()
-    scheme = names_to_scheme[weight_name]
-
-    if zp_name == "weight_zero_point":
-        quant_args = scheme.weights
-    if zp_name == "input_zero_point":
-        quant_args = scheme.input_activations
-    if zp_name == "output_zero_point":
-        quant_args = scheme.output_activations
-
-    assert quant_args is not None
-    return quant_args.symmetric
+                weight_data["weight"] = decompressed
+                yield weight_name, weight_data
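The refactor of _is_symmetric and _check_if_zp_pack_quantized into _should_save_zp merges two rules: a zero point is persisted only when quantization is asymmetric, and not when a packed compressor already folds it into a group or channel layout. A condensed sketch of that decision, with is_packed_compressor standing in for the isinstance check against PackedQuantizationCompressor:

def should_save_zp(symmetric: bool, strategy: str, is_packed_compressor: bool) -> bool:
    # Group/channel strategies can pack the zero point into the weight layout.
    packable_strategies = {"group", "channel"}
    packed = is_packed_compressor and strategy in packable_strategies
    return not symmetric and not packed


# Symmetric schemes never store a zero point:
assert not should_save_zp(symmetric=True, strategy="group", is_packed_compressor=False)
# Asymmetric + packed group quantization packs the zp instead of saving it:
assert not should_save_zp(symmetric=False, strategy="group", is_packed_compressor=True)
# Asymmetric + unpacked (e.g. tensor strategy) saves the zero point:
assert should_save_zp(symmetric=False, strategy="tensor", is_packed_compressor=False)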

src/compressed_tensors/compressors/sparse_compressors/base.py

Lines changed: 21 additions & 4 deletions
@@ -98,7 +98,11 @@ def compress(
         return compressed_dict

     def decompress(
-        self, path_to_model_or_tensors: str, device: str = "cpu", **kwargs
+        self,
+        path_to_model_or_tensors: str,
+        device: str = "cpu",
+        params_to_skip_load: Optional[Tuple] = None,
+        **kwargs,
     ) -> Generator[Tuple[str, Tensor], None, None]:
         """
         Reads a bitmask compressed state dict located
@@ -108,6 +112,11 @@ def decompress(
         :param model_path: path to compressed safetensors model (directory with
             one or more safetensors files) or compressed tensors file
         :param device: device to load decompressed weights onto
+        :param params_to_skip_load: a list of non-sparsity parameters (e.g.
+            quantization parameters) that we want to skip loading. As the
+            sparsity compressor does not handle quantized decompression, this
+            should contain any quantization parameters when decompressing
+            stacked compressors; those parameters are handled by the
+            quantization decompressor instead
         :return: iterator for generating decompressed weights
         """
         weight_mappings, ignored_params = get_nested_weight_mappings(
@@ -121,13 +130,21 @@ def decompress(
                 full_name = merge_names(weight_name, param_name)
                 with safe_open(safe_path, framework="pt", device=device) as f:
                     weight_data[param_name] = f.get_tensor(full_name)
+
             decompressed = self.decompress_weight(weight_data)
             yield merge_names(weight_name, "weight"), decompressed

         for ignored_param_name, safe_path in ignored_params.items():
-            with safe_open(safe_path, framework="pt", device=device) as f:
-                value = f.get_tensor(ignored_param_name)
-            yield ignored_param_name, value
+            should_skip = False
+            if params_to_skip_load is not None:
+                for param_to_skip in params_to_skip_load:
+                    if param_to_skip in ignored_param_name:
+                        should_skip = True
+
+            if not should_skip:
+                with safe_open(safe_path, framework="pt", device=device) as f:
+                    value = f.get_tensor(ignored_param_name)
+                yield ignored_param_name, value

     @staticmethod
     def should_compress(name: str, expanded_targets: Optional[Set[str]] = None) -> bool:
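The skip logic added above is a plain substring match over the ignored parameter names. A minimal self-contained sketch under that assumption (parameter names below are illustrative):

from typing import Iterable, List, Optional, Tuple


def filter_ignored_params(
    ignored_params: Iterable[str],
    params_to_skip_load: Optional[Tuple[str, ...]] = None,
) -> List[str]:
    kept = []
    for name in ignored_params:
        # Substring match, mirroring `if param_to_skip in ignored_param_name`
        # in the decompress loop above.
        if params_to_skip_load and any(skip in name for skip in params_to_skip_load):
            continue
        kept.append(name)
    return kept


params = ["model.layers.0.self_attn.q_proj.weight_scale", "model.norm.weight"]
print(filter_ignored_params(params, ("weight_scale", "weight_zero_point")))
# -> ['model.norm.weight']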
