Commit 3f1cf36

wip: writing sparse decompress_from_state_dict

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent cfb698c commit 3f1cf36

5 files changed: +147 −42 lines

src/compressed_tensors/compressors/model_compressors/model_compressor.py

Lines changed: 19 additions & 8 deletions

@@ -398,7 +398,7 @@ def compress_model(self, model: Module):
                     show_progress=False,
                 )

-                # remove any exist parameters
+                # remove any existing parameters
                 for name, _ in list(module.named_parameters()):
                     delattr(module, name)

@@ -418,34 +418,45 @@ def decompress_model(self, model: Module):
             ignore=self.sparsity_config.ignore if self.sparsity_config else [],
         )

+        # because decompressors only generate new values (rather than new values
+        # plus unused passthrough values), we must explicitly pass the list of
+        # keys which are unused here but consumed by subsequent decompressors
+        params_to_ignore = None
+        if self.quantization_compressor is not None:
+            params_to_ignore = self.quantization_compressor.compression_param_names
+
         for prefix, module in model.named_modules():
             if prefix in module_to_scheme or prefix in sparse_compression_targets:
                 state_dict = module.state_dict(prefix=f"{prefix}.")
-                decompressed = dict()
                 # sparsity first
                 if prefix in sparse_compression_targets:
+                    # sparse_compression_targets are automatically inferred by this fn
                     generator = self.sparsity_compressor.decompress_from_state_dict(
                         state_dict,
-                        names_to_scheme=module_to_scheme,
+                        params_to_ignore=params_to_ignore,
                     )
-                    for _module_name, decompressed_data in generator:
+                    decompressed = dict()
+                    for _, decompressed_data in generator:
                         decompressed.update(decompressed_data)
+                    state_dict = decompressed

                 # quantization second
                 if prefix in module_to_scheme:
                     generator = self.quantization_compressor.decompress_from_state_dict(
                         state_dict,
                         names_to_scheme=module_to_scheme,
                     )
-                    for _module_name, decompressed_data in generator:
+                    decompressed = dict()
+                    for _, decompressed_data in generator:
                         decompressed.update(decompressed_data)
+                    state_dict = decompressed

-                # remove any exist parameters
+                # remove any existing parameters
                 for name, _ in list(module.named_parameters()):
                     delattr(module, name)

                 # replace with decompressed parameters
-                for name, value in decompressed.items():
+                for name, value in state_dict.items():
                     param = torch.nn.Parameter(value, requires_grad=False)
                     register_offload_parameter(module, name, param)

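The loop above establishes the chaining contract for in-memory decompression: each stage's generator yields only the values it produced, its output dict replaces state_dict before the next stage runs, and anything a later stage still needs (hence params_to_ignore) has to be forwarded explicitly. A condensed, self-contained sketch of that pattern, with the stage functions standing in for the two decompress_from_state_dict generators:

def chain_decompressors(state_dict, stages):
    # e.g. stages = (sparsity_stage, quantization_stage)
    for stage in stages:
        decompressed = {}
        for _, decompressed_data in stage(state_dict):
            decompressed.update(decompressed_data)
        state_dict = decompressed  # this stage's output feeds the next stage
    return state_dict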

src/compressed_tensors/compressors/quantized_compressors/base.py

Lines changed: 9 additions & 9 deletions

@@ -197,18 +197,18 @@ def decompress_from_state_dict(
         weight_mappings = get_nested_mappings_from_state_dict(
             state_dict, self.compression_param_names
         )
-        for module_name in weight_mappings.keys():
+        for module_path in weight_mappings.keys():
             weight_data = {}
-            for param_name, param_value in weight_mappings[module_name].items():
+            for param_name, param_value in weight_mappings[module_path].items():
                 weight_data[param_name] = param_value

             if "weight_scale" in weight_data:
-                quant_args = names_to_scheme[module_name].weights
+                quant_args = names_to_scheme[module_path].weights
                 decompressed = self.decompress_weight(
                     compressed_data=weight_data, quantization_args=quant_args
                 )
                 weight_data["weight"] = decompressed
-            yield module_name, weight_data
+            yield module_path, weight_data

     def _decompress_from_path(
         self,
@@ -219,16 +219,16 @@ def _decompress_from_path(
         weight_mappings = get_nested_weight_mappings(
             path_to_model, self.compression_param_names
         )
-        for module_name in weight_mappings.keys():
+        for module_path in weight_mappings.keys():
             weight_data = {}
-            for param_name, safe_path in weight_mappings[module_name].items():
-                full_name = merge_names(module_name, param_name)
+            for param_name, safe_path in weight_mappings[module_path].items():
+                full_name = merge_names(module_path, param_name)
                 with safe_open(safe_path, framework="pt", device=device) as f:
                     weight_data[param_name] = f.get_tensor(full_name)
             if "weight_scale" in weight_data:
-                quant_args = names_to_scheme[module_name].weights
+                quant_args = names_to_scheme[module_path].weights
                 decompressed = self.decompress_weight(
                     compressed_data=weight_data, quantization_args=quant_args
                 )
                 weight_data["weight"] = decompressed
-            yield module_name, weight_data
+            yield module_path, weight_data
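Both generators yield (module_path, weight_data) pairs, where weight_data maps bare parameter names to tensors. A minimal consumption sketch that flattens the output back into a fully-qualified state dict, assuming a compressor instance and a module_to_scheme mapping are in scope (the layer path in the comment is only an example):

decompressed_state_dict = {}
generator = compressor.decompress_from_state_dict(
    state_dict, names_to_scheme=module_to_scheme
)
for module_path, weight_data in generator:
    for param_name, value in weight_data.items():
        # e.g. merge_names("model.layers.0.self_attn.q_proj", "weight")
        decompressed_state_dict[merge_names(module_path, param_name)] = value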

src/compressed_tensors/compressors/sparse_compressors/base.py

Lines changed: 33 additions & 7 deletions

@@ -16,7 +16,11 @@
 from typing import Dict, Generator, Optional, Set, Tuple

 from compressed_tensors.compressors.base import BaseCompressor
-from compressed_tensors.utils import get_nested_weight_mappings, merge_names
+from compressed_tensors.utils import (
+    get_nested_mappings_from_state_dict,
+    get_nested_weight_mappings,
+    merge_names,
+)
 from safetensors import safe_open
 from torch import Tensor
 from tqdm import tqdm
@@ -129,15 +129,15 @@ def decompress(
             self.compression_param_names,
             return_unmatched_params=True,
         )
-        for weight_name in weight_mappings.keys():
+        for module_path in weight_mappings.keys():
             weight_data = {}
-            for param_name, safe_path in weight_mappings[weight_name].items():
-                full_name = merge_names(weight_name, param_name)
+            for param_name, safe_path in weight_mappings[module_path].items():
+                full_name = merge_names(module_path, param_name)
                 with safe_open(safe_path, framework="pt", device=device) as f:
                     weight_data[param_name] = f.get_tensor(full_name)

             decompressed = self.decompress_weight(weight_data)
-            yield merge_names(weight_name, "weight"), decompressed
+            yield merge_names(module_path, "weight"), decompressed

         for ignored_param_name, safe_path in ignored_params.items():
             should_skip = False
@@ -151,8 +151,35 @@ def decompress(
                     value = f.get_tensor(ignored_param_name)
                 yield ignored_param_name, value

-    def decompress_from_state_dict(self, state_dict, names_to_scheme):
-        exit(0)
+    def decompress_from_state_dict(
+        self,
+        state_dict: Dict[str, Tensor],
+        params_to_ignore: Optional[Tuple] = None,
+    ) -> Generator[Tuple[str, Dict[str, Tensor]], None, None]:
+        """
+        Mirrors the pattern of `decompress` above, but reads parameter values
+        from an in-memory state dict rather than from safetensors files on disk
+        """
+        weight_mappings, ignored_params = get_nested_mappings_from_state_dict(
+            state_dict, self.compression_param_names, return_unmatched_params=True
+        )
+
+        for module_path in weight_mappings.keys():
+            weight_data = {}
+            for param_name, param_value in weight_mappings[module_path].items():
+                weight_data[param_name] = param_value
+
+            yield module_path, self.decompress_weight(weight_data)
+
+        for ignored_param_name, ignored_param_value in ignored_params.items():
+            should_skip = False
+            if params_to_ignore is not None:
+                for param_to_skip in params_to_ignore:
+                    if param_to_skip in ignored_param_name:
+                        should_skip = True
+
+            if not should_skip:
+                yield ignored_param_name, ignored_param_value

     @staticmethod
     def should_compress(name: str, expanded_targets: Optional[Set[str]] = None) -> bool:
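The unmatched-parameter filter in decompress_from_state_dict is a plain substring match: any key containing one of the params_to_ignore fragments is withheld from the yield stream, and everything else passes through untouched. An equivalent standalone check (the parameter names below are hypothetical):

def is_ignored(param_name, params_to_ignore):
    # mirrors the substring check in the ignored_params loop above
    return any(fragment in param_name for fragment in (params_to_ignore or ()))

assert is_ignored("layers.0.attn.weight_scale", ("weight_scale", "weight_zero_point"))
assert not is_ignored("layers.0.attn.bias", ("weight_scale", "weight_zero_point"))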

src/compressed_tensors/utils/safetensors_load.py

Lines changed: 11 additions & 2 deletions

@@ -35,6 +35,7 @@
     "is_quantization_param",
 ]

+NestedStateDictType = Dict[str, Dict[str, Tensor]]
 WeightMappingType = Dict[str, str]
 NestedWeightMappingType = Dict[str, WeightMappingType]

@@ -249,8 +250,10 @@ def get_nested_weight_mappings(


 def get_nested_mappings_from_state_dict(
-    state_dict, params_to_nest: Iterable[str]
-) -> NestedWeightMappingType:
+    state_dict: Dict[str, Tensor],
+    params_to_nest: Iterable[str],
+    return_unmatched_params: bool = False,
+) -> Union[NestedStateDictType, Tuple[NestedStateDictType, Dict[str, Tensor]]]:
     """
     Takes a state dict and returns a nested mapping from uncompressed
     parameterized layer names to the value of
@@ -269,13 +270,24 @@ def get_nested_mappings_from_state_dict(
     each layer's compression parameters.
+    :param return_unmatched_params: if True, also return a flat mapping of the
+        state dict keys which did not match any entry in params_to_nest
     """
     nested_weight_mappings = {}
+    unmatched_params = {}
+
     for key in state_dict.keys():
+        matched = False
         for param_name in params_to_nest:
             dense_param = match_param_name(key, param_name)
             if dense_param:
                 if dense_param not in nested_weight_mappings:
                     nested_weight_mappings[dense_param] = {}
                 nested_weight_mappings[dense_param][param_name] = state_dict[key]
+                matched = True
+        if return_unmatched_params and not matched:
+            unmatched_params[key] = state_dict[key]
+
+    if return_unmatched_params:
+        return nested_weight_mappings, unmatched_params
     return nested_weight_mappings
tests/test_compressors/model_compressors/test_model_compressor.py

Lines changed: 75 additions & 16 deletions
@@ -16,6 +16,7 @@
 from copy import deepcopy
 from pathlib import Path

+from compressed_tensors.config.sparse_24_bitmask import Sparse24BitMaskConfig
 import pytest
 import torch
 import torch.nn as nn
@@ -370,31 +371,28 @@ def _get_combined_config(s_config, q_config):


 @pytest.mark.parametrize(
-    "model_stub,comp_stub,q_format,s_format",
+    "model_stub,q_format,s_config",
     [
         (
             "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed",
-            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-compressed",
             "float-quantized",
             None,
         ),
         (
             "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed",
-            "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
             None,
             "sparse-24-bitmask",
         ),
         (
             "nm-testing/llama2.c-stories42M-gsm8k-stacked-uncompressed",
-            "nm-testing/llama2.c-stories42M-gsm8k-stacked-compressed",
             "float-quantized",
-            "sparse-24-bitmask",
+            Sparse24BitMaskConfig(targets=["Linear"]),
         ),
     ],
 )
-def test_compress_decompress_model(model_stub, comp_stub, q_format, s_format):
+def test_compress_decompress_model(model_stub, q_format, s_config, tmpdir):
     model = AutoModelForCausalLM.from_pretrained(model_stub, torch_dtype=torch.float32)
-    compressor = ModelCompressor.from_pretrained_model(model, s_format, q_format)
+    compressor = ModelCompressor.from_pretrained_model(model, s_config, q_format)

     # compress model by eagerly compressing state dict
     true_compressed = dict(compressor.compress(model))
@@ -415,29 +413,88 @@ def test_compress_decompress_model(model_stub, comp_stub, q_format, s_format):
     # -- decompress -- #

     # reinstantiate compressor to mimic LLM Compressor flows
-    compressor = ModelCompressor.from_pretrained_model(model, s_format, q_format)
+    model.save_pretrained(tmpdir)
+    model = AutoModelForCausalLM.from_pretrained(tmpdir, torch_dtype=torch.float32)
+    compressor = ModelCompressor.from_pretrained_model(model, s_config, q_format)
+
+    true_decompressed_model = AutoModelForCausalLM.from_pretrained(
+        model_stub, torch_dtype=torch.float32
+    )
+    compressor.decompress(tmpdir, true_decompressed_model)
+    true_decompressed = dict(true_decompressed_model.state_dict())
+
+    # decompress model
+    compressor.decompress_model(model)
+    decompressed = dict(model.state_dict())
+
+    # equivalent to decompressing from disk
+    assert decompressed.keys() == true_decompressed.keys()
+    for key in decompressed.keys():
+        assert torch.allclose(
+            decompressed[key], true_decompressed[key], rtol=1e-3, atol=1e-5
+        ), f"{key}"
+    del true_decompressed
+
+
+@pytest.mark.parametrize(
+    "comp_stub,q_format,s_config",
+    [
+        # (
+        #     "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-compressed",
+        #     "float-quantized",
+        #     None,
+        # ),
+        # (
+        #     "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
+        #     None,
+        #     "sparse-24-bitmask",
+        # ),
+        (
+            "nm-testing/llama2.c-stories42M-gsm8k-stacked-compressed",
+            "float-quantized",
+            Sparse24BitMaskConfig(targets=["Linear"]),
+        ),
+    ],
+)
+def test_decompress_model(comp_stub, q_format, s_config):
+    # NOTE: transformers adds extra zero points if run_compressed=False or w/ sparsity
+    # https://github.com/huggingface/transformers/blob/main/src/transformers/quantizers/quantizer_compressed_tensors.py#L131-L133
+    # however, decompression does not add zero points in non-asymmetric cases
+    # in order to normalize for this effect in this test, we remove empty weight zps

-    # decompress model from disk # TODO try also using a model saved from prev step
     from transformers.utils.quantization_config import CompressedTensorsConfig

+    # decompress from disk
     true_decompressed_model = AutoModelForCausalLM.from_pretrained(
         comp_stub,
         quantization_config=CompressedTensorsConfig(run_compressed=False),
         torch_dtype=torch.float32,
     )
     true_decompressed = dict(true_decompressed_model.state_dict())
-    true_decompressed = {
-        name: value
-        for name, value in true_decompressed.items()
-        if not name.endswith("zero_point")
-    }  # ignore zero points
+    true_decompressed = remove_empty_weight_zero_points(true_decompressed)  # see above

-    # decompress model
+    # decompress from memory
+    model = AutoModelForCausalLM.from_pretrained(
+        comp_stub,
+        quantization_config=CompressedTensorsConfig(run_compressed=True),
+        torch_dtype=torch.float32,
+    )
+    compressor = ModelCompressor.from_pretrained_model(model, s_config, q_format)
     compressor.decompress_model(model)
     decompressed = dict(model.state_dict())
+    if "sparse" in str(s_config):
+        decompressed = remove_empty_weight_zero_points(decompressed)  # see above

     # equivalent to decompressing from disk
     assert decompressed.keys() == true_decompressed.keys()
     for key in decompressed.keys():
         assert torch.allclose(decompressed[key], true_decompressed[key]), f"{key}"
     del true_decompressed
+
+
+def remove_empty_weight_zero_points(state_dict):
+    return {
+        name: value
+        for name, value in state_dict.items()
+        if not (name.endswith("weight_zero_point") and torch.all(value == 0))
+    }
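For reference, remove_empty_weight_zero_points drops only zero points that are entirely zero (the symmetric case being normalized away above); genuinely asymmetric zero points survive the filter. A quick sketch of the expected behavior:

import torch

state_dict = {
    "layer.weight": torch.randn(4, 4),
    "layer.weight_zero_point": torch.zeros(4),  # all-zero -> dropped
    "other.weight_zero_point": torch.ones(4),   # nonzero -> kept
}
filtered = remove_empty_weight_zero_points(state_dict)
assert set(filtered) == {"layer.weight", "other.weight_zero_point"}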