Commit dbc104d

wip: decompression works except for zero points
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
1 parent 0036e21 commit dbc104d

File tree

4 files changed: +112 −48 lines

src/compressed_tensors/compressors/model_compressors/model_compressor.py

Lines changed: 42 additions & 2 deletions
@@ -382,12 +382,15 @@ def compress_model(self, model: Module):
         for prefix, module in model.named_modules():
             if prefix in module_to_scheme or prefix in sparse_compression_targets:
                 state_dict = module.state_dict(prefix=f"{prefix}.")
+                # quantization first
                 if prefix in module_to_scheme:
                     state_dict = self.quantization_compressor.compress(
                         state_dict,
                         names_to_scheme=module_to_scheme,
                         show_progress=False,
                     )
+
+                # sparsity second
                 if prefix in sparse_compression_targets:
                     state_dict = self.sparsity_compressor.compress(
                         state_dict,
@@ -407,9 +410,46 @@ def compress_model(self, model: Module):
 
                 module.quantization_status = QuantizationStatus.COMPRESSED
 
-    def decompress_model(model: Module):
+    def decompress_model(self, model: Module):
+        module_to_scheme = map_module_to_scheme(model)
+        sparse_compression_targets: Set[str] = expand_target_names(
+            model=model,
+            targets=self.sparsity_config.targets if self.sparsity_config else [],
+            ignore=self.sparsity_config.ignore if self.sparsity_config else [],
+        )
+
+        for prefix, module in model.named_modules():
+            if prefix in module_to_scheme or prefix in sparse_compression_targets:
+                state_dict = module.state_dict(prefix=f"{prefix}.")
+                decompressed = dict()
+                # sparsity first
+                if prefix in sparse_compression_targets:
+                    generator = self.sparsity_compressor.decompress_from_state_dict(
+                        state_dict,
+                        names_to_scheme=module_to_scheme,
+                    )
+                    for _module_name, decompressed_data in generator:
+                        decompressed.update(decompressed_data)
+
+                # quantization second
+                if prefix in module_to_scheme:
+                    generator = self.quantization_compressor.decompress_from_state_dict(
+                        state_dict,
+                        names_to_scheme=module_to_scheme,
+                    )
+                    for _module_name, decompressed_data in generator:
+                        decompressed.update(decompressed_data)
+
+                # remove any exist parameters
+                for name, _ in list(module.named_parameters()):
+                    delattr(module, name)
 
-        pass
+                # replace with decompressed parameters
+                for name, value in decompressed.items():
+                    param = torch.nn.Parameter(value, requires_grad=False)
+                    register_offload_parameter(module, name, param)
+
+                module.quantization_status = QuantizationStatus.FROZEN
 
         # apparently we only have logic for decompressing from a file...
 
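
To see how the two in-place paths are intended to fit together, here is a minimal usage sketch of the compress/decompress round trip this commit is building toward. The import path and the model stub / format strings are assumptions borrowed from the test file further down, not part of this diff.

from transformers import AutoModelForCausalLM
from compressed_tensors.compressors import ModelCompressor  # import path assumed

# load an uncompressed checkpoint (placeholder stub taken from the tests below)
model = AutoModelForCausalLM.from_pretrained(
    "nm-testing/llama2.c-stories42M-gsm8k-stacked-uncompressed"
)

# compress in place: quantization first, then sparsity
compressor = ModelCompressor.from_pretrained_model(
    model, "sparse-24-bitmask", "float-quantized"
)
compressor.compress_model(model)

# decompress in place: sparsity first, then quantization
compressor.decompress_model(model)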

src/compressed_tensors/compressors/quantized_compressors/base.py

Lines changed: 27 additions & 23 deletions
@@ -185,46 +185,50 @@ def decompress(
             )
 
         else:
-            yield from self._decompress_from_state_dict(
+            yield from self.decompress_from_state_dict(
                 path_to_model_or_tensors, names_to_scheme
             )
 
-    def _decompress_from_path(
+    def decompress_from_state_dict(
         self,
-        path_to_model: Union[str, Path, Dict[str, Any]],
+        state_dict: Dict[str, torch.Tensor],
         names_to_scheme: Dict[str, QuantizationScheme],
-        device: str,
-    ):
-        weight_mappings = get_nested_weight_mappings(
-            path_to_model, self.compression_param_names
+    ) -> Generator[Tuple[str, Dict[str, torch.Tensor]], None, None]:
+        weight_mappings = get_nested_mappings_from_state_dict(
+            state_dict, self.compression_param_names
         )
-        for weight_name in weight_mappings.keys():
+        for module_name in weight_mappings.keys():
             weight_data = {}
-            for param_name, safe_path in weight_mappings[weight_name].items():
-                full_name = merge_names(weight_name, param_name)
-                with safe_open(safe_path, framework="pt", device=device) as f:
-                    weight_data[param_name] = f.get_tensor(full_name)
+            for param_name, param_value in weight_mappings[module_name].items():
+                weight_data[param_name] = param_value
+
             if "weight_scale" in weight_data:
-                quant_args = names_to_scheme[weight_name].weights
+                quant_args = names_to_scheme[module_name]
                 decompressed = self.decompress_weight(
                     compressed_data=weight_data, quantization_args=quant_args
                 )
                 weight_data["weight"] = decompressed
-            yield weight_name, weight_data
+            yield module_name, weight_data
 
-    def _decompress_from_state_dict(self, state_dict, names_to_scheme):
-        weight_mappings = get_nested_mappings_from_state_dict(
-            state_dict, self.compression_param_names
+    def _decompress_from_path(
+        self,
+        path_to_model: Union[str, Path, Dict[str, Any]],
+        names_to_scheme: Dict[str, QuantizationScheme],
+        device: str,
+    ):
+        weight_mappings = get_nested_weight_mappings(
+            path_to_model, self.compression_param_names
         )
-        for weight_name in weight_mappings.keys():
+        for module_name in weight_mappings.keys():
             weight_data = {}
-            for param_name, param_value in weight_mappings[weight_name].items():
-                weight_data[param_name] = param_value
-
+            for param_name, safe_path in weight_mappings[module_name].items():
+                full_name = merge_names(module_name, param_name)
+                with safe_open(safe_path, framework="pt", device=device) as f:
+                    weight_data[param_name] = f.get_tensor(full_name)
             if "weight_scale" in weight_data:
-                quant_args = names_to_scheme[weight_name]
+                quant_args = names_to_scheme[module_name].weights
                 decompressed = self.decompress_weight(
                     compressed_data=weight_data, quantization_args=quant_args
                 )
                 weight_data["weight"] = decompressed
-            yield weight_name, weight_data
+            yield module_name, weight_data
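
For orientation, a toy illustration of the nested mapping that the promoted decompress_from_state_dict iterates over. The layer name, parameter names, and tensor shapes below are made up for illustration, assuming the {module_name: {param_name: tensor}} layout implied by the loop body.

import torch

# hypothetical flat state dict for one compressed linear layer
state_dict = {
    "model.layers.0.self_attn.q_proj.weight_packed": torch.zeros(64, 16, dtype=torch.int32),
    "model.layers.0.self_attn.q_proj.weight_scale": torch.ones(64, 1),
}

# get_nested_mappings_from_state_dict(...) groups the entries by module, roughly:
weight_mappings = {
    "model.layers.0.self_attn.q_proj": {
        "weight_packed": state_dict["model.layers.0.self_attn.q_proj.weight_packed"],
        "weight_scale": state_dict["model.layers.0.self_attn.q_proj.weight_scale"],
    }
}

# each module whose entries include "weight_scale" is then decompressed and
# yielded as (module_name, weight_data) with a dense "weight" tensor added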

src/compressed_tensors/compressors/sparse_compressors/base.py

Lines changed: 3 additions & 0 deletions
@@ -151,6 +151,9 @@ def decompress(
                     value = f.get_tensor(ignored_param_name)
                     yield ignored_param_name, value
 
+    def decompress_from_state_dict(self, state_dict, names_to_scheme):
+        exit(0)
+
     @staticmethod
     def should_compress(name: str, expanded_targets: Optional[Set[str]] = None) -> bool:
         """

tests/test_compressors/model_compressors/test_model_compressor.py

Lines changed: 40 additions & 23 deletions
@@ -370,52 +370,69 @@ def _get_combined_config(s_config, q_config):
 
 
 @pytest.mark.parametrize(
-    "model_stub,q_format,s_format",
+    "model_stub,comp_stub,q_format,s_format",
     [
         (
             "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed",
+            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-compressed",
             "float-quantized",
             None,
         ),
         (
             "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed",
+            "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
            None,
            "sparse-24-bitmask",
        ),
        (
            "nm-testing/llama2.c-stories42M-gsm8k-stacked-uncompressed",
+            "nm-testing/llama2.c-stories42M-gsm8k-stacked-compressed",
            "float-quantized",
            "sparse-24-bitmask",
        ),
    ],
 )
-def test_compress_model(model_stub, q_format, s_format):
+def test_compress_decompress_model(model_stub, comp_stub, q_format, s_format):
     model = AutoModelForCausalLM.from_pretrained(model_stub)
     compressor = ModelCompressor.from_pretrained_model(model, s_format, q_format)
-    original_compressed_state_dict = dict(compressor.compress(model))
-    original_compressed_state_dict = {
-        key: value.clone() for key, value in original_compressed_state_dict.items()
-    }
 
-    compressor.compress_model(model)
+    # compress model by eagerly compressing state dict
+    true_compressed = dict(compressor.compress(model))
+    true_compressed = {key: value.clone() for key, value in true_compressed.items()}
 
-    for module in model.modules():
-        # scheme <=> CompressedLinear
-        has_scheme = hasattr(module, "quantization_scheme")
-        is_compressed = (
-            getattr(module, "quantization_status", None)
-            == QuantizationStatus.COMPRESSED
-        )
-        # assert has_scheme == is_compressed
+    # compress model directly
+    compressor.compress_model(model)
+    compressed = dict(model.state_dict())
 
     # equivalent to eagerly compressing state dict
-    compressed_state_dict = dict(model.state_dict())
-    assert compressed_state_dict.keys() == original_compressed_state_dict.keys()
-    for key in compressed_state_dict.keys():
-        assert torch.all(
-            compressed_state_dict[key] == original_compressed_state_dict[key]
-        ), f"{key}"
-
-    # decompress
+    assert compressed.keys() == true_compressed.keys()
+    for key in compressed.keys():
+        assert torch.all(compressed[key] == true_compressed[key]), f"{key}"
+
+    del compressed
+    del true_compressed
+
+    # -- decompress -- #
+
+    # reinstantiate compressor to mimic LLM Compressor flows
     compressor = ModelCompressor.from_pretrained_model(model, s_format, q_format)
+
+    # decompress model from disk  # TODO try also using a model saved from prev step
+    true_decompressed_model = AutoModelForCausalLM.from_pretrained(
+        model_stub, device_map="meta"
+    )
+    compressor.decompress(comp_stub, true_decompressed_model)
+    true_decompressed = dict(true_decompressed_model.state_dict())
+
+    # decompress model
     compressor.decompress_model(model)
+    decompressed = dict(model.state_dict())
+
+    # equivalent to decompressing from disk
+    breakpoint()
+    assert decompressed.keys() == true_decompressed.keys()
+    for key in decompressed.keys():
+        assert torch.all(decompressed[key] == true_decompressed[key]), f"{key}"
+    del true_decompressed
+
+    exit(0)
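
To exercise just this round-trip test, the standard pytest entry point can be invoked from the CLI or from Python; a small sketch of the latter is below. The file path and -k expression come from the diff above; the flags are ordinary pytest options.

import pytest

# run only the compress/decompress round-trip test, stopping at the first failure
pytest.main([
    "tests/test_compressors/model_compressors/test_model_compressor.py",
    "-k", "test_compress_decompress_model",
    "-x",
])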
