
Commit 0272c1c (1 parent: f2898df)

add unwrapping, tests

Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>

File tree: 4 files changed (+62, -4 lines)

src/compressed_tensors/compressors/model_compressors/model_compressor.py
Lines changed: 18 additions & 3 deletions

@@ -33,13 +33,15 @@
 from compressed_tensors.compressors.base import BaseCompressor
 from compressed_tensors.compressors.sparse_compressors import DenseCompressor
 from compressed_tensors.config import CompressionFormat, SparsityCompressionConfig
+from compressed_tensors.linear.compressed_linear import CompressedLinear
 from compressed_tensors.quantization import (
     DEFAULT_QUANTIZATION_METHOD,
     QuantizationConfig,
     QuantizationScheme,
     QuantizationStatus,
     apply_quantization_config,
     load_pretrained_quantization_parameters,
+    unwrap_module_forward_quantized,
 )
 from compressed_tensors.quantization.lifecycle import expand_target_names
 from compressed_tensors.quantization.utils import (
@@ -58,7 +60,7 @@
     fix_fsdp_module_name,
     is_compressed_tensors_config,
 )
-from compressed_tensors.utils.offload import update_offload_parameter
+from compressed_tensors.utils.offload import disable_hf_hook, update_offload_parameter
 from torch import Tensor
 from torch.nn import Module
 from tqdm import tqdm
@@ -100,6 +102,9 @@ class ModelCompressor:
     :param quantization_config: config specifying quantization compression parameters
     """

+    sparsity_config: Optional[SparsityCompressionConfig] = None
+    quantization_config: Optional[QuantizationConfig] = None
+
     @classmethod
     def from_pretrained(
         cls,
@@ -364,12 +369,22 @@ def get_unexpected_file_keys(self, model: Module) -> List[str]:

         return list(unexpected_keys)

-    def apply_compression_status(self, model: Module) -> Module:
+    def apply_compression_status(self, model: Module):
+        if self.quantization_config is None:
+            for module in model.modules():
+                module.quantization_status = QuantizationStatus.COMPRESSED
+            return
+
         quantization_format = self.quantization_config.format

         def replace_with_compressed(module: Module) -> Module:
             scheme = getattr(module, "quantization_scheme", None)
             if isinstance(module, torch.nn.Linear) and scheme is not None:
+                # TODO: after refactored into hook, just remove hook
+                if hasattr(module, "quantization_status"):
+                    with disable_hf_hook(module):
+                        unwrap_module_forward_quantized(module)
+
                 module = CompressedLinear.from_linear(
                     module,
                     quantization_scheme=scheme,
@@ -385,7 +400,7 @@ def replace_with_compressed(module: Module) -> Module:
             return module

         progress = tqdm(total=len(list(model.modules())))
-        return module_map_replace(model, replace_with_compressed, progress=progress)
+        module_map_replace(model, replace_with_compressed, progress=progress)

     def compress(
         self, model: Module, state_dict: Optional[Dict[str, Tensor]] = None
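After this change, apply_compression_status mutates the model in place instead of returning a replacement module. A minimal usage sketch, mirroring the new test at the bottom of this commit (the model stub is one of the test fixtures, not a requirement):

    from transformers import AutoModelForCausalLM
    from compressed_tensors.compressors import ModelCompressor

    model = AutoModelForCausalLM.from_pretrained(
        "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed"
    )
    # argument order per the test below: (model, sparsity_format, quantization_format)
    compressor = ModelCompressor.from_pretrained_model(model, None, "float-quantized")
    compressor.apply_compression_status(model)  # in place; no return value after this commit

After the call, each Linear module carrying a quantization_scheme has been replaced by a CompressedLinear.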

src/compressed_tensors/quantization/lifecycle/forward.py
Lines changed: 5 additions & 0 deletions

@@ -37,6 +37,7 @@
     "dequantize",
     "fake_quantize",
     "wrap_module_forward_quantized",
+    "unwrap_module_forward_quantized",
     "forward_quantize",
 ]

@@ -312,6 +313,10 @@ def wrapped_forward(self, *args, **kwargs):
     setattr(module, "forward", bound_wrapped_forward)


+def unwrap_module_forward_quantized(module: Module):
+    delattr(module, "forward")  # revert to class implementation
+
+
 def forward_quantize(
     module: Module, value: torch.Tensor, base_name: str, args: "QuantizationArgs"
 ) -> torch.Tensor:
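The one-line unwrap works because wrap_module_forward_quantized installs the wrapped forward as an instance attribute (the setattr in the context above), which shadows the class's forward; deleting that attribute makes attribute lookup fall back to the class implementation. A self-contained sketch of the underlying Python mechanics (the Toy class is illustrative only, not part of the library):

    import types

    class Toy:
        def forward(self):
            return "class forward"

    toy = Toy()
    # install a bound wrapper as an instance attribute, shadowing the class method
    toy.forward = types.MethodType(lambda self: "wrapped forward", toy)
    assert toy.forward() == "wrapped forward"

    delattr(toy, "forward")  # lookup falls back to Toy.forward
    assert toy.forward() == "class forward"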

src/compressed_tensors/utils/helpers.py
Lines changed: 2 additions & 1 deletion

@@ -14,10 +14,11 @@

 import warnings
 from functools import wraps
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union

 import numpy
 import torch
+import tqdm
 from transformers import AutoConfig

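The new tqdm import suggests that module_map_replace, called from ModelCompressor above, lives in this file. A minimal sketch of what such a recursive map-and-replace helper might look like, inferred only from the call site module_map_replace(model, replace_with_compressed, progress=progress); the name module_map_replace_sketch and the progress parameter's type are assumptions, not the library's actual implementation:

    from typing import Callable, Optional

    import torch
    import tqdm

    def module_map_replace_sketch(
        module: torch.nn.Module,
        func: Callable[[torch.nn.Module], torch.nn.Module],
        progress: Optional[tqdm.tqdm] = None,  # hypothetical: the call site passes a tqdm instance
    ) -> torch.nn.Module:
        # depth-first: map children first, then the parent itself
        for name, child in module.named_children():
            setattr(module, name, module_map_replace_sketch(child, func, progress))
        if progress is not None:
            progress.update(1)
        return func(module)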

tests/test_compressors/model_compressors/test_model_compressor.py
Lines changed: 37 additions & 0 deletions

@@ -21,9 +21,11 @@
 import torch.nn as nn
 from compressed_tensors.compressors import ModelCompressor
 from compressed_tensors.config import SparsityCompressionConfig
+from compressed_tensors.linear.compressed_linear import CompressedLinear
 from compressed_tensors.quantization import QuantizationConfig
 from safetensors.torch import save_file
 from tests.testing_utils import induce_sparsity, requires_hf_quantizer
+from transformers import AutoModelForCausalLM


 def sparsity_config():
@@ -365,3 +367,38 @@ def _get_combined_config(s_config, q_config):
     combined["sparsity_config"] = s_config

     return combined
+
+
+@pytest.mark.parametrize(
+    "model_stub,q_format,s_format",
+    [
+        (
+            "nm-testing/llama2.c-stories42M-gsm8k-quantized-only-uncompressed",
+            "float-quantized",
+            None,
+        ),
+        (
+            "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-uncompressed",
+            None,
+            "sparse-24-bitmask",
+        ),
+        (
+            "nm-testing/llama2.c-stories42M-gsm8k-stacked-uncompressed",
+            "float-quantized",
+            "sparse-24-bitmask",
+        ),
+    ],
+)
+def test_apply_compression_status(model_stub, q_format, s_format):
+    model = AutoModelForCausalLM.from_pretrained(model_stub)
+    compressor = ModelCompressor.from_pretrained_model(model, s_format, q_format)
+    compressor.apply_compression_status(model)
+
+    for module in model.modules():
+        # scheme <=> CompressedLinear
+        has_scheme = hasattr(module, "quantization_scheme")
+        is_compressed = isinstance(module, CompressedLinear)
+        assert has_scheme == is_compressed
+
+    # can run to completion
+    model(**model.dummy_inputs)
