don't use compressedlinear

kylesayrs · kylesayrs · commit 16f9f1f09bb5 · 2025-05-02T14:58:44.000-04:00
Signed-off-by: Kyle Sayers &lt;kylesayrs@gmail.com&gt;
diff --git a/src/compressed_tensors/compressors/model_compressors/model_compressor.py b/src/compressed_tensors/compressors/model_compressors/model_compressor.py
@@ -370,12 +370,19 @@ def get_unexpected_file_keys(self, model: Module) -> List[str]:
         return list(unexpected_keys)
 
     def apply_compression_status(self, model: Module):
+        # sparsity compression
         if self.quantization_config is None:
             for module in model.modules():
                 module.quantization_status = QuantizationStatus.COMPRESSED
-            return
 
-        quantization_format = self.quantization_config.format
+            # hack: compress state dict upfront, since CompressedLinear doesn't have
+            # support for sparsified models
+            model_state_dict = self.compress(model)
+            def state_dict_hook(module, prefix, keep_vars):
+                return model_state_dict if prefix == "" else {}
+            model.register_state_dict_pre_hook(state_dict_hook)
+
+            return
 
         def replace_with_compressed(module: Module) -> Module:
             scheme = getattr(module, "quantization_scheme", None)
@@ -385,25 +392,26 @@ def replace_with_compressed(module: Module) -> Module:
                     with disable_hf_hook(module):
                         unwrap_module_forward_quantized(module)
 
-                module = CompressedLinear.from_linear(
-                    module,
-                    quantization_scheme=scheme,
-                    quantization_format=quantization_format,
-                )
-                state_dict = module.compressor.compress(
-                    module.state_dict(), {"": scheme}
-                )  # added by compressed linear
+                state_dict = self.compress(module, show_progress=False)
+
+                # CompressedLinear initializes qparams which have to be deleted
+                # TODO: CompressedLinear should not initialize qparams
+                for name, _ in list(module.named_parameters()):
+                    delattr(module, name)
 
                 for name, value in state_dict.items():
-                    update_offload_parameter(module, name, value)
+                    param = torch.nn.Parameter(value, requires_grad=False)
+                    register_offload_parameter(module, name, param)
+
+                module.quantization_status = QuantizationStatus.COMPRESSED
 
             return module
 
-        progress = tqdm(total=len(list(model.modules())))
+        progress = tqdm(desc="Compressing modules", total=len(list(model.modules())))
         module_map_replace(model, replace_with_compressed, progress=progress)
 
     def compress(
-        self, model: Module, state_dict: Optional[Dict[str, Tensor]] = None
+        self, model: Module, state_dict: Optional[Dict[str, Tensor]] = None, show_progress: bool = False
     ) -> Dict[str, Tensor]:
         """
         Compresses a dense state dict or model with sparsity and/or quantization
@@ -419,7 +427,7 @@ def compress(
         if self.quantization_compressor is not None:
             module_to_scheme = map_module_to_scheme(model)
             state_dict = self.quantization_compressor.compress(
-                state_dict, names_to_scheme=module_to_scheme
+                state_dict, names_to_scheme=module_to_scheme, show_progress=False
             )
 
             # TODO: consider sparse compression to also be compression
diff --git a/src/compressed_tensors/compressors/quantized_compressors/base.py b/src/compressed_tensors/compressors/quantized_compressors/base.py
@@ -71,6 +71,7 @@ def compress(
         self,
         model_state: Dict[str, Tensor],
         names_to_scheme: Dict[str, QuantizationScheme],
+        show_progress: bool = False,
         **kwargs,
     ) -> Dict[str, Tensor]:
         """
@@ -79,13 +80,16 @@ def compress(
         :param model_state: state dict of uncompressed model
         :param names_to_scheme: quantization args for each quantized weight, needed for
             quantize function to calculate bit depth
+        :param show_progress: whether to show tqdm progress
         :return: compressed state dict
         """
+        uncompressed_names = list(model_state.keys())
         compressed_dict = {}
         save_device = "cpu"
 
-        uncompressed_names = list(model_state.keys())
-        for name in tqdm(uncompressed_names, desc="Compressing with quantization"):
+        # compress values
+        desc = "Compressing with quantization"
+        for name in tqdm(uncompressed_names, desc=desc, disable=(not show_progress)):
             value = model_state[name]
 
             # compress weights
diff --git a/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py b/src/compressed_tensors/compressors/sparse_quantized_compressors/marlin_24.py
@@ -125,6 +125,7 @@ def compress(
         self,
         model_state: Dict[str, Tensor],
         names_to_scheme: Dict[str, QuantizationScheme],
+        show_progress: bool = False,
         **kwargs,
     ) -> Dict[str, Tensor]:
         """
@@ -134,6 +135,7 @@ def compress(
         :param model_state: state dict of uncompressed model
         :param names_to_scheme: quantization scheme for each quantized weight, needed
             for quantize function to calculate bit depth
+        :param show_progress: whether to show tqdm progress
         :return: compressed state dict
         """
         self.validate_quant_compatability(names_to_scheme)
@@ -144,7 +146,7 @@ def compress(
             f"Compressing model with {len(model_state)} parameterized layers..."
         )
 
-        for name, value in tqdm(model_state.items(), desc="Compressing model"):
+        for name, value in tqdm(model_state.items(), desc="Compressing model", disable=(not show_progress)):
             if name.endswith(weight_suffix):
                 prefix = name[: -(len(weight_suffix))]
                 scale = model_state.get(merge_names(prefix, "weight_scale"), None)
diff --git a/tests/test_compressors/model_compressors/dict_keys(['model.embed_tokens.lua b/tests/test_compressors/model_compressors/dict_keys(['model.embed_tokens.lua
@@ -0,0 +1,186 @@
+  + odict_keys(['model.embed_tokens.weight'
+ 'model.layers.0.self_attn.q_proj.weight_scale'
+ 'model.layers.0.self_attn.q_proj.weight_zero_point'
+ 'model.layers.0.self_attn.q_proj.weight'
+ 'model.layers.0.self_attn.k_proj.weight_scale'
+ 'model.layers.0.self_attn.k_proj.weight_zero_point'
+ 'model.layers.0.self_attn.k_proj.weight'
+ 'model.layers.0.self_attn.v_proj.weight_scale'
+ 'model.layers.0.self_attn.v_proj.weight_zero_point'
+ 'model.layers.0.self_attn.v_proj.weight'
+ 'model.layers.0.self_attn.o_proj.weight_scale'
+ 'model.layers.0.self_attn.o_proj.weight_zero_point'
+ 'model.layers.0.self_attn.o_proj.weight'
+ 'model.layers.0.mlp.gate_proj.weight_scale'
+ 'model.layers.0.mlp.gate_proj.weight_zero_point'
+ 'model.layers.0.mlp.gate_proj.weight'
+ 'model.layers.0.mlp.up_proj.weight_scale'
+ 'model.layers.0.mlp.up_proj.weight_zero_point'
+ 'model.layers.0.mlp.up_proj.weight'
+ 'model.layers.0.mlp.down_proj.weight_scale'
+ 'model.layers.0.mlp.down_proj.weight_zero_point'
+ 'model.layers.0.mlp.down_proj.weight'
+ 'model.layers.0.input_layernorm.weight'
+ 'model.layers.0.post_attention_layernorm.weight'
+ 'model.layers.1.self_attn.q_proj.weight_scale'
+ 'model.layers.1.self_attn.q_proj.weight_zero_point'
+ 'model.layers.1.self_attn.q_proj.weight'
+ 'model.layers.1.self_attn.k_proj.weight_scale'
+ 'model.layers.1.self_attn.k_proj.weight_zero_point'
+ 'model.layers.1.self_attn.k_proj.weight'
+ 'model.layers.1.self_attn.v_proj.weight_scale'
+ 'model.layers.1.self_attn.v_proj.weight_zero_point'
+ 'model.layers.1.self_attn.v_proj.weight'
+ 'model.layers.1.self_attn.o_proj.weight_scale'
+ 'model.layers.1.self_attn.o_proj.weight_zero_point'
+ 'model.layers.1.self_attn.o_proj.weight'
+ 'model.layers.1.mlp.gate_proj.weight_scale'
+ 'model.layers.1.mlp.gate_proj.weight_zero_point'
+ 'model.layers.1.mlp.gate_proj.weight'
+ 'model.layers.1.mlp.up_proj.weight_scale'
+ 'model.layers.1.mlp.up_proj.weight_zero_point'
+ 'model.layers.1.mlp.up_proj.weight'
+ 'model.layers.1.mlp.down_proj.weight_scale'
+ 'model.layers.1.mlp.down_proj.weight_zero_point'
+ 'model.layers.1.mlp.down_proj.weight'
+ 'model.layers.1.input_layernorm.weight'
+ 'model.layers.1.post_attention_layernorm.weight'
+ 'model.layers.2.self_attn.q_proj.weight_scale'
+ 'model.layers.2.self_attn.q_proj.weight_zero_point'
+ 'model.layers.2.self_attn.q_proj.weight'
+ 'model.layers.2.self_attn.k_proj.weight_scale'
+ 'model.layers.2.self_attn.k_proj.weight_zero_point'
+ 'model.layers.2.self_attn.k_proj.weight'
+ 'model.layers.2.self_attn.v_proj.weight_scale'
+ 'model.layers.2.self_attn.v_proj.weight_zero_point'
+ 'model.layers.2.self_attn.v_proj.weight'
+ 'model.layers.2.self_attn.o_proj.weight_scale'
+ 'model.layers.2.self_attn.o_proj.weight_zero_point'
+ 'model.layers.2.self_attn.o_proj.weight'
+ 'model.layers.2.mlp.gate_proj.weight_scale'
+ 'model.layers.2.mlp.gate_proj.weight_zero_point'
+ 'model.layers.2.mlp.gate_proj.weight'
+ 'model.layers.2.mlp.up_proj.weight_scale'
+ 'model.layers.2.mlp.up_proj.weight_zero_point'
+ 'model.layers.2.mlp.up_proj.weight'
+ 'model.layers.2.mlp.down_proj.weight_scale'
+ 'model.layers.2.mlp.down_proj.weight_zero_point'
+ 'model.layers.2.mlp.down_proj.weight'
+ 'model.layers.2.input_layernorm.weight'
+ 'model.layers.2.post_attention_layernorm.weight'
+ 'model.layers.3.self_attn.q_proj.weight_scale'
+ 'model.layers.3.self_attn.q_proj.weight_zero_point'
+ 'model.layers.3.self_attn.q_proj.weight'
+ 'model.layers.3.self_attn.k_proj.weight_scale'
+ 'model.layers.3.self_attn.k_proj.weight_zero_point'
+ 'model.layers.3.self_attn.k_proj.weight'
+ 'model.layers.3.self_attn.v_proj.weight_scale'
+ 'model.layers.3.self_attn.v_proj.weight_zero_point'
+ 'model.layers.3.self_attn.v_proj.weight'
+ 'model.layers.3.self_attn.o_proj.weight_scale'
+ 'model.layers.3.self_attn.o_proj.weight_zero_point'
+ 'model.layers.3.self_attn.o_proj.weight''model.layers.3.mlp.gate_proj.weight_scale'
+ 'model.layers.3.mlp.gate_proj.weight_zero_point'
+ 'model.layers.3.mlp.gate_proj.weight'
+ 'model.layers.3.mlp.up_proj.weight_scale'
+ 'model.layers.3.mlp.up_proj.weight_zero_point'
+ 'model.layers.3.mlp.up_proj.weight'
+ 'model.layers.3.mlp.down_proj.weight_scale'
+ 'model.layers.3.mlp.down_proj.weight_zero_point'
+ 'model.layers.3.mlp.down_proj.weight'
+ 'model.layers.3.input_layernorm.weight'
+ 'model.layers.3.post_attention_layernorm.weight'
+ 'model.layers.4.self_attn.q_proj.weight_scale'
+ 'model.layers.4.self_attn.q_proj.weight_zero_point'
+ 'model.layers.4.self_attn.q_proj.weight'
+ 'model.layers.4.self_attn.k_proj.weight_scale'
+ 'model.layers.4.self_attn.k_proj.weight_zero_point'
+ 'model.layers.4.self_attn.k_proj.weight'
+ 'model.layers.4.self_attn.v_proj.weight_scale'
+ 'model.layers.4.self_attn.v_proj.weight_zero_point'
+ 'model.layers.4.self_attn.v_proj.weight'
+ 'model.layers.4.self_attn.o_proj.weight_scale'
+ 'model.layers.4.self_attn.o_proj.weight_zero_point'
+ 'model.layers.4.self_attn.o_proj.weight'
+ 'model.layers.4.mlp.gate_proj.weight_scale'
+ 'model.layers.4.mlp.gate_proj.weight_zero_point'
+ 'model.layers.4.mlp.gate_proj.weight'
+ 'model.layers.4.mlp.up_proj.weight_scale'
+ 'model.layers.4.mlp.up_proj.weight_zero_point'
+ 'model.layers.4.mlp.up_proj.weight'
+ 'model.layers.4.mlp.down_proj.weight_scale'
+ 'model.layers.4.mlp.down_proj.weight_zero_point'
+ 'model.layers.4.mlp.down_proj.weight'
+ 'model.layers.4.input_layernorm.weight'
+ 'model.layers.4.post_attention_layernorm.weight'
+ 'model.layers.5.self_attn.q_proj.weight_scale'
+ 'model.layers.5.self_attn.q_proj.weight_zero_point'
+ 'model.layers.5.self_attn.q_proj.weight'
+ 'model.layers.5.self_attn.k_proj.weight_scale'
+ 'model.layers.5.self_attn.k_proj.weight_zero_point'
+ 'model.layers.5.self_attn.k_proj.weight'
+ 'model.layers.5.self_attn.v_proj.weight_scale'
+ 'model.layers.5.self_attn.v_proj.weight_zero_point'
+ 'model.layers.5.self_attn.v_proj.weight'
+ 'model.layers.5.self_attn.o_proj.weight_scale'
+ 'model.layers.5.self_attn.o_proj.weight_zero_point'
+ 'model.layers.5.self_attn.o_proj.weight'
+ 'model.layers.5.mlp.gate_proj.weight_scale'
+ 'model.layers.5.mlp.gate_proj.weight_zero_point'
+ 'model.layers.5.mlp.gate_proj.weight'
+ 'model.layers.5.mlp.up_proj.weight_scale'
+ 'model.layers.5.mlp.up_proj.weight_zero_point'
+ 'model.layers.5.mlp.up_proj.weight'
+ 'model.layers.5.mlp.down_proj.weight_scale'
+ 'model.layers.5.mlp.down_proj.weight_zero_point'
+ 'model.layers.5.mlp.down_proj.weight'
+ 'model.layers.5.input_layernorm.weight'
+ 'model.layers.5.post_attention_layernorm.weight'
+ 'model.layers.6.self_attn.q_proj.weight_scale'
+ 'model.layers.6.self_attn.q_proj.weight_zero_point'
+ 'model.layers.6.self_attn.q_proj.weight'
+ 'model.layers.6.self_attn.k_proj.weight_scale'
+ 'model.layers.6.self_attn.k_proj.weight_zero_point'
+ 'model.layers.6.self_attn.k_proj.weight'
+ 'model.layers.6.self_attn.v_proj.weight_scale'
+ 'model.layers.6.self_attn.v_proj.weight_zero_point'
+ 'model.layers.6.self_attn.v_proj.weight'
+ 'model.layers.6.self_attn.o_proj.weight_scale'
+ 'model.layers.6.self_attn.o_proj.weight_zero_point'
+ 'model.layers.6.self_attn.o_proj.weight'
+ 'model.layers.6.mlp.gate_proj.weight_scale'
+ 'model.layers.6.mlp.gate_proj.weight_zero_point'
+ 'model.layers.6.mlp.gate_proj.weight'
+ 'model.layers.6.mlp.up_proj.weight_scale'
+ 'model.layers.6.mlp.up_proj.weight_zero_point'
+ 'model.layers.6.mlp.up_proj.weight'
+ 'model.layers.6.mlp.down_proj.weight_scale'
+ 'model.layers.6.mlp.down_proj.weight_zero_point'
+ 'model.layers.6.mlp.down_proj.weight'
+ 'model.layers.6.input_layernorm.weight'
+ 'model.layers.6.post_attention_layernorm.weight'
+ 'model.layers.7.self_attn.q_proj.weight_scale'
+ 'model.layers.7.self_attn.q_proj.weight_zero_point'
+ 'model.layers.7.self_attn.q_proj.weight'
+ 'model.layers.7.self_attn.k_proj.weight_scale'
+ 'model.layers.7.self_attn.k_proj.weight_zero_point'
+ 'model.layers.7.self_attn.k_proj.weight'
+ 'model.layers.7.self_attn.v_proj.weight_scale'
+ 'model.layers.7.self_attn.v_proj.weight_zero_point'
+ 'model.layers.7.self_attn.v_proj.weight'
+ 'model.layers.7.self_attn.o_proj.weight_scale'
+ 'model.layers.7.self_attn.o_proj.weight_zero_point'
+ 'model.layers.7.self_attn.o_proj.weight'
+ 'model.layers.7.mlp.gate_proj.weight_scale'
+ 'model.layers.7.mlp.gate_proj.weight_zero_point'
+ 'model.layers.7.mlp.gate_proj.weight'
+ 'model.layers.7.mlp.up_proj.weight_scale'
+ 'model.layers.7.mlp.up_proj.weight_zero_point'
+ 'model.layers.7.mlp.up_proj.weight'
+ 'model.layers.7.mlp.down_proj.weight_scale'
+ 'model.layers.7.mlp.down_proj.weight_zero_point'
+ 'model.layers.7.mlp.down_proj.weight'
+ 'model.layers.7.input_layernorm.weight'
+ 'model.layers.7.post_attention_layernorm.weight'
+ 'model.norm.weight'
+ 'lm_head.weight'])
diff --git a/tests/test_compressors/model_compressors/test_model_compressor.py b/tests/test_compressors/model_compressors/test_model_compressor.py
@@ -22,7 +22,7 @@
 from compressed_tensors.compressors import ModelCompressor
 from compressed_tensors.config import SparsityCompressionConfig
 from compressed_tensors.linear.compressed_linear import CompressedLinear
-from compressed_tensors.quantization import QuantizationConfig
+from compressed_tensors.quantization import QuantizationConfig, QuantizationStatus
 from safetensors.torch import save_file
 from tests.testing_utils import induce_sparsity, requires_hf_quantizer
 from transformers import AutoModelForCausalLM
@@ -392,13 +392,22 @@ def _get_combined_config(s_config, q_config):
 def test_apply_compression_status(model_stub, q_format, s_format):
     model = AutoModelForCausalLM.from_pretrained(model_stub)
     compressor = ModelCompressor.from_pretrained_model(model, s_format, q_format)
+    original_compressed_state_dict = dict(compressor.compress(model))
+    original_compressed_state_dict = {key: value.clone() for key, value in original_compressed_state_dict.items()}
+
     compressor.apply_compression_status(model)
 
     for module in model.modules():
         # scheme <=> CompressedLinear
         has_scheme = hasattr(module, "quantization_scheme")
-        is_compressed = isinstance(module, CompressedLinear)
-        assert has_scheme == is_compressed
+        is_compressed = getattr(module, "quantization_status", None) == QuantizationStatus.COMPRESSED
+        #assert has_scheme == is_compressed
+
+    # equivalent to eagerly compressing state dict
+    compressed_state_dict = dict(model.state_dict())
+    assert compressed_state_dict.keys() == original_compressed_state_dict.keys()
+    for key in compressed_state_dict.keys():
+        assert torch.all(compressed_state_dict[key] == original_compressed_state_dict[key]), f"{key}"
 
     # can run to completion
-    model(**model.dummy_inputs)
+    #model(**model.dummy_inputs)