Commit 51c7968

Merge branch 'main' into provide_moe_calibration_mode
2 parents db57734 + 70f93d3 commit 51c7968

4 files changed: 78 additions & 4 deletions

setup.py

Lines changed: 1 addition & 1 deletion
@@ -119,7 +119,7 @@ def localversion_func(version: ScmVersion) -> str:
         "tqdm>=4.0.0",
         # torch 1.10 and 1.11 do not support quantized onnx export
         "torch>=1.7.0,!=1.10,!=1.11",
-        "transformers>4.0,<4.53.0",
+        "transformers>4.0",
         "datasets",
         "accelerate>=0.20.3,!=1.1.0",
         "pynvml",

src/llmcompressor/entrypoints/oneshot.py

Lines changed: 3 additions & 2 deletions
@@ -307,8 +307,9 @@ def oneshot(
     """
 
     # pass all args directly into Oneshot
-    local_args = locals()
-    local_args.pop("kwargs")
+    local_args = {
+        k: v for k, v in locals().items() if k not in ("local_args", "kwargs")
+    }
     one_shot = Oneshot(**local_args, **kwargs)
     one_shot()
 
src/llmcompressor/entrypoints/utils.py

Lines changed: 1 addition & 1 deletion
@@ -241,7 +241,7 @@ def initialize_processor_from_path(
         )
 
     except ValueError as exception:
-        if "trust_remote_code=True" in exception.value:
+        if any("trust_remote_code=True" in arg for arg in exception.args):
             raise ValueError(
                 f"The repository for {processor_src} contains custom code which must "
                 "be executed to correctly load the tokenizer/processor. You can "
Lines changed: 73 additions & 0 deletions
@@ -0,0 +1,73 @@
+import pytest
+import torch
+from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme
+from torch.nn import Linear, Module, ReLU
+
+from llmcompressor.pytorch.utils import ModuleSparsificationInfo
+
+
+class FakeQuantizedModel(Module):
+    def __init__(self):
+        super().__init__()
+        self.fc1 = Linear(8, 16, bias=True)  # Quantized
+        self.fc2 = Linear(16, 4, bias=True)  # Unquantized
+        self.relu = ReLU()
+
+        self.fc1.quantization_scheme = QuantizationScheme(
+            targets=["model.fc1"],
+            weights=QuantizationArgs(
+                precision=8,
+                granularity="per_tensor",
+                algorithm="gptq",
+                blocksize=128,
+            ),
+        )
+
+
+def test_module_quantization_info():
+    model = FakeQuantizedModel()
+    state = model.state_dict()
+
+    # Simulate quantized weights: replace float32 weights with int8
+    state["fc1.weight"] = torch.randint(
+        -128, 127, state["fc1.weight"].shape, dtype=torch.int8
+    )
+
+    # Keep fc1.bias, fc2.weight, fc2.bias all as float32
+    info = ModuleSparsificationInfo(model, state_dict=state)
+
+    # fc1 (quantized): 8 * 16 weights + 16 biases = 144 parameters.
+    # fc2 (not quantized): 16 * 4 weights + 4 biases = 68 parameters.
+    # Total parameters: 144 + 68 = 212.
+    # Quantized percentage: (144 / 212) * 100 ≈ 67.92%.
+    percent = info.params_quantized_percent
+
+    assert percent == pytest.approx(67.92, abs=1e-2)
+
+
+class FakeSparsedModel(Module):
+    def __init__(self):
+        super().__init__()
+        self.linear_dense = Linear(10, 10, bias=True)  # no sparsity
+        self.linear_sparse = Linear(10, 10, bias=True)  # sparse layer
+
+        # Inject sparsity into linear_sparse.weight (50% zeros)
+        with torch.no_grad():
+            weight = self.linear_sparse.weight
+            weight.view(-1)[:50] = 0.0
+
+
+def test_module_sparsity_info():
+    model = FakeSparsedModel()
+    state = model.state_dict()
+
+    info = ModuleSparsificationInfo(model, state_dict=state)
+
+    # linear_dense: 10 * 10 weights + 10 biases = 110 parameters.
+    # linear_sparse: 10 * 10 weights + 10 biases = 110 parameters.
+    # Total parameters: 110 + 110 = 220.
+    # Number of sparse (zero) parameters: 50 (from linear_sparse.weight).
+    # Sparsity percentage: (50 / 220) * 100 ≈ 22.73%.
+    percent = info.params_sparse_percent
+
+    assert percent == pytest.approx(22.73, abs=1e-2)
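The expected percentages asserted in the new tests follow from plain parameter counts; a quick sketch reproducing them without llmcompressor:

quantized = 8 * 16 + 16             # fc1 weights + biases = 144
total = quantized + (16 * 4 + 4)    # plus fc2 = 212
print(round(quantized / total * 100, 2))  # 67.92

zeros = 50                          # zeroed entries in linear_sparse.weight
total = 2 * (10 * 10 + 10)          # two Linear(10, 10) layers = 220
print(round(zeros / total * 100, 2))  # 22.73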
