 import torch
 from compressed_tensors.compressors.base import BaseCompressor
 from compressed_tensors.config import CompressionFormat
-from compressed_tensors.quantization import QuantizationArgs, QuantizationStrategy
+from compressed_tensors.quantization import (
+    QuantizationArgs,
+    QuantizationScheme,
+    QuantizationStrategy,
+)
 from compressed_tensors.quantization.lifecycle.forward import quantize
 from compressed_tensors.utils import (
     get_permutations_24,
@@ -44,19 +48,25 @@ class Marlin24Compressor(BaseCompressor):

     @staticmethod
     def validate_quant_compatability(
-        model_quant_args: Dict[str, QuantizationArgs]
+        names_to_scheme: Dict[str, QuantizationScheme]
     ) -> bool:
         """
         Checks if every quantized module in the model is compatible with Marlin24
         compression. Quantization must be channel or group strategy with group_size
         of 128. Only symmetric quantization is supported

-        :param model_quant_args: dictionary of mapping module names to their
-            quantization configuration
+        :param names_to_scheme: dictionary of mapping module names to their
+            quantization schemes
         :return: True if all modules are compatible with Marlin24 compression, raises
             a ValueError otherwise
         """
-        for name, quant_args in model_quant_args.items():
+        for name, scheme in names_to_scheme.items():
+            quant_args = scheme.weights
+            if quant_args is None:
+                raise ValueError(
+                    "Marlin24 Compressor is only valid for weight quantization schemes"
+                )
+
             strategy = quant_args.strategy
             group_size = quant_args.group_size
             symmetric = quant_args.symmetric
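For reference, a minimal sketch of how the new signature could be exercised, assuming `Marlin24Compressor` is importable from `compressed_tensors.compressors`; the module name, targets, and argument values below are illustrative only, not taken from this PR:

```python
from compressed_tensors.compressors import Marlin24Compressor
from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme

# Hypothetical mapping: each quantized module gets a scheme whose `weights`
# args satisfy the Marlin24 constraints (group strategy, group_size=128, symmetric).
names_to_scheme = {
    "model.layers.0.self_attn.q_proj": QuantizationScheme(
        targets=["Linear"],
        weights=QuantizationArgs(
            num_bits=4,
            strategy="group",
            group_size=128,
            symmetric=True,
        ),
    ),
}

# Returns True when every scheme is compatible; raises ValueError otherwise,
# including the new case where a scheme carries no weight quantization args.
Marlin24Compressor.validate_quant_compatability(names_to_scheme)
```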
@@ -114,16 +124,16 @@ def compression_param_names(self) -> Tuple[str]:
     def compress(
         self,
         model_state: Dict[str, Tensor],
-        names_to_scheme: Dict[str, QuantizationArgs],
+        names_to_scheme: Dict[str, QuantizationScheme],
         **kwargs,
     ) -> Dict[str, Tensor]:
         """
         Compresses a quantized state_dict with 2:4 sparsity structure for inference
         with the Marlin24 kernel

         :param model_state: state dict of uncompressed model
-        :param names_to_scheme: quantization args for each quantized weight, needed for
-            quantize function to calculate bit depth
+        :param names_to_scheme: quantization scheme for each quantized weight, needed
+            for quantize function to calculate bit depth
         :return: compressed state dict
         """
         self.validate_quant_compatability(names_to_scheme)
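A rough usage sketch for the updated `compress` signature; the `model` object and the `names_to_scheme` mapping from the previous example are assumptions for illustration, not part of this PR:

```python
# Hypothetical call site: compress a calibrated, uncompressed state dict
# into the Marlin24 packed format using the per-module schemes.
compressor = Marlin24Compressor()
compressed_state_dict = compressor.compress(
    model_state=model.state_dict(),
    names_to_scheme=names_to_scheme,
)
```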
@@ -146,7 +156,7 @@ def compress(
             value = value.to(torch.float16)

             # quantize weight, keeping it as a float16 for now
-            quant_args = names_to_scheme[prefix]
+            quant_args = names_to_scheme[prefix].weights
             value = quantize(
                 x=value, scale=scale, zero_point=zp, args=quant_args
             )
@@ -215,7 +225,7 @@ def pack_weight_24(
     weight: Tensor,
     quantization_args: QuantizationArgs,
     tile: int = 16,
-):
+) -> torch.Tensor:
     size_k = weight.shape[0]
     size_n = weight.shape[1]
     num_bits = quantization_args.num_bits
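As background for the new `-> torch.Tensor` return annotation: the function returns weights packed into 32-bit words. Below is a generic nibble-packing sketch showing why `32 // num_bits` values fit per word; it is an illustration only and omits the tile and permutation reordering the real Marlin24 packing applies:

```python
import torch

def pack_int4_rows(q: torch.Tensor) -> torch.Tensor:
    """Pack unsigned 4-bit integers along dim 0 into int32 words (8 per word).

    Generic illustration only -- not the Marlin24 layout.
    """
    pack_factor = 32 // 4
    assert q.shape[0] % pack_factor == 0, "rows must divide evenly into words"
    q = q.to(torch.int32)
    packed = torch.zeros(q.shape[0] // pack_factor, q.shape[1], dtype=torch.int32)
    for i in range(pack_factor):
        # OR each nibble into its bit position within the 32-bit word
        packed |= q[i::pack_factor] << (4 * i)
    return packed

# Example: a 16x4 matrix of 4-bit values packs into a 2x4 int32 tensor
print(pack_int4_rows(torch.randint(0, 16, (16, 4))).shape)  # torch.Size([2, 4])
```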
@@ -236,7 +246,9 @@ def pack_weight_24(
     return q_packed


-def pack_scales_24(scales, quantization_args, w_shape):
+def pack_scales_24(
+    scales: torch.Tensor, quantization_args: QuantizationArgs, w_shape: torch.Size
+) -> torch.Tensor:
     size_k = w_shape[0]
     size_n = w_shape[1]
     num_bits = quantization_args.num_bits