Add check for the case where no weights are quantized

kylesayrs · kylesayrs · commit 7d5f7c920387 · 2025-07-09T22:18:03.000-04:00
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
diff --git a/src/llmcompressor/transformers/compression/quantization_format.py b/src/llmcompressor/transformers/compression/quantization_format.py
@@ -48,57 +48,57 @@ def infer_quantization_format(
     """
     if quantization_format is not None:
         return quantization_format
-    
-    if not save_compressed:
-        # format will be inferred from config
-        return None
-    
+
     weight_args, input_args = _get_unique_quant_args(model)
 
     # no quantization format if no weights are quantized
     if len(weight_args) <= 0:
         return None
-    
-    is_24_structure = (
-        SparsityStructure(sparsity_structure) == SparsityStructure.TWO_FOUR
-    )
-    is_weight_only = len(input_args) == 0 and len(weight_args) > 0
-
-    if (
-        weight_args[0].num_bits == 4
-        and weight_args[0].type == QuantizationType.FLOAT.value
-    ):
-        return CompressionFormat.nvfp4_pack_quantized
-
-    if is_weight_only:  # w4a16 and w8a16
-        is_valid_pack = all(
-            weight_arg.num_bits in [4, 8]
-            and weight_arg.type == QuantizationType.INT.value
-            for weight_arg in weight_args
+
+    if save_compressed:
+        is_24_structure = (
+            SparsityStructure(sparsity_structure) == SparsityStructure.TWO_FOUR
         )
-        if not is_valid_pack:  # packing only valid for int4 and int 8
-            return CompressionFormat.naive_quantized
-        if is_24_structure:
-            for arg in weight_args:
+        is_weight_only = len(input_args) == 0 and len(weight_args) > 0
+
+        if (
+            weight_args[0].num_bits == 4
+            and weight_args[0].type == QuantizationType.FLOAT.value
+        ):
+            return CompressionFormat.nvfp4_pack_quantized
+
+        if is_weight_only:  # w4a16 and w8a16
+            is_valid_pack = all(
+                weight_arg.num_bits in [4, 8]
+                and weight_arg.type == QuantizationType.INT.value
+                for weight_arg in weight_args
+            )
+            if not is_valid_pack:  # packing only valid for int4 and int 8
+                return CompressionFormat.naive_quantized
+            if is_24_structure:
+                for arg in weight_args:
+                    if (
+                        arg.strategy is not QuantizationStrategy.CHANNEL.value
+                        and arg.strategy is not QuantizationStrategy.GROUP.value
+                    ):
+                        # marlin24 kernel only applicable for channel/group quantization
+                        return CompressionFormat.pack_quantized
+                return CompressionFormat.marlin_24
+            return CompressionFormat.pack_quantized
+        else:  # w8a8 float and int
+            if len(weight_args) == 1:
                 if (
-                    arg.strategy is not QuantizationStrategy.CHANNEL.value
-                    and arg.strategy is not QuantizationStrategy.GROUP.value
+                    weight_args[0].type == QuantizationType.FLOAT.value
+                    and weight_args[0].num_bits == 8
                 ):
-                    # marlin24 kernel only applicable for channel/group quantization
-                    return CompressionFormat.pack_quantized
-            return CompressionFormat.marlin_24
-        return CompressionFormat.pack_quantized
-    else:  # w8a8 float and int
-        if len(weight_args) == 1:
-            if (
-                weight_args[0].type == QuantizationType.FLOAT.value
-                and weight_args[0].num_bits == 8
-            ):
-                return CompressionFormat.float_quantized
-            if weight_args[0].type == QuantizationType.INT.value:
-                return CompressionFormat.int_quantized
-
-        return CompressionFormat.naive_quantized
+                    return CompressionFormat.float_quantized
+                if weight_args[0].type == QuantizationType.INT.value:
+                    return CompressionFormat.int_quantized
+
+            return CompressionFormat.naive_quantized
+    else:
+        # format will be inferred from config
+        return None
 
 
 def _get_unique_quant_args(model):