
Commit 3262d85

Make observer args configurable (#1492)
SUMMARY:

There were two ways to pass in arguments:

1. Initialize when calling the observer. See example usage [here](https://github.com/vllm-project/llm-compressor/blob/030a5bee05c7e319350b6cab204a09f47d0ee552/src/llmcompressor/modifiers/quantization/gptq/gptq_quantize.py#L100):

   ```python
   observer = Observer.load_from_registry(
       quant_args.observer,
       quantization_args=quant_args,
       averaging_constant=1.0,  # ignore moving average
   )
   ```

   The unpacking logic moves to `calibration.py`, which extracts the kwargs directly and then passes them into `load_from_registry`.

2. Defined in the recipe and parsed in the quantization args. These override other sources, except `averaging_constant` when it is being ignored. Example usage:

   ```yaml
   config_groups:
     group_0:
       weights: {num_bits: 8, type: int, symmetric: true, strategy: tensor, observer: mse, observer_kwargs: {patience: 5}}
       input_activations: {num_bits: 8, type: int, symmetric: true, strategy: tensor}
       targets: [Linear]
   ```

TEST PLAN:

Tested locally.

---------

Signed-off-by: shanjiaz <zsjwpianpian@gmail.com>
1 parent c052d2c commit 3262d85

File tree

4 files changed: +114 −4 lines changed


docs/observers.md

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
# Observers Overview

An `Observer` in `llm-compressor` is a utility class responsible for analyzing tensors (e.g., weights, activations) and producing quantization parameters such as `scale` and `zero_point`. These observers are used by quantization modifiers to compute the statistics necessary for transforming tensors into lower-precision formats.

Observers are designed to be flexible and support a variety of quantization strategies, including per-tensor, per-group, per-channel, and per-token quantization.

## Base Class

### [Observer](../src/llmcompressor/observers/base.py)
Base class for all observers. Subclasses must implement the `calculate_qparams` method to define how quantization parameters are computed.

The base class handles:
- Group-wise scale/zero_point computation
- Token-wise and channel-wise quantization logic
- Optional support for `g_idx` (group index mappings)
- Recording observed tokens for logging and analysis
- Resetting internal state during lifecycle transitions

This class is not used directly but provides the scaffolding for all custom observers.
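To make this concrete, below is a minimal sketch of what a custom observer could look like. It assumes the registry exposes a `register` decorator, that the base class stores `quantization_args` on the instance, and that `calculate_qparams` receives the observed tensor and returns a `(scale, zero_point)` pair; check `base.py` for the exact signature, as this is illustrative rather than a drop-in implementation.

```python
import torch

from compressed_tensors.quantization.quant_args import QuantizationArgs
from llmcompressor.observers import Observer


@Observer.register("absmax")  # hypothetical name; decorator assumed from the registry pattern
class AbsMaxObserver(Observer):
    """Toy observer: symmetric scale derived from the absolute maximum of the tensor."""

    def __init__(self, quantization_args: QuantizationArgs, **kwargs):
        super().__init__(quantization_args=quantization_args)

    def calculate_qparams(self, observed: torch.Tensor, *args, **kwargs):
        # Assumes the base class exposes self.quantization_args.
        q_max = 2 ** (self.quantization_args.num_bits - 1) - 1
        scale = observed.abs().amax().clamp(min=1e-8) / q_max
        zero_point = torch.zeros_like(scale, dtype=torch.int8)  # symmetric: zero point stays 0
        return scale, zero_point
```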
## Implemented Observers

### [MinMax](../src/llmcompressor/observers/min_max.py)
Computes `scale` and `zero_point` by tracking the minimum and maximum of the observed tensor. This is the simplest and most common observer, and it works well for both symmetric and asymmetric quantization.

Best used for:
- Int8 or Int4 symmetric quantization
- Channel-wise or group-wise strategies
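The arithmetic behind this observer is straightforward; the standalone snippet below (not the library code) shows how an asymmetric int8 scale and zero point fall out of a tensor's min/max values.

```python
import torch

x = torch.randn(64, 512)
min_val, max_val = x.amin(), x.amax()

# Map [min_val, max_val] onto the signed int8 range [-128, 127].
q_min, q_max = -128, 127
scale = (max_val - min_val) / (q_max - q_min)
zero_point = (q_min - min_val / scale).round().clamp(q_min, q_max).to(torch.int8)
```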
### [MSE](../src/llmcompressor/observers/mse.py)
Computes quantization parameters by minimizing the Mean Squared Error (MSE) between the original and quantized tensor. Optionally maintains a moving average of min/max values for smoother convergence.

Best used when:
- Calibration accuracy is critical
- Quantization error needs to be tightly controlled
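The standalone sketch below mirrors the idea behind the MSE search and shows how `maxshrink`, `grid`, `norm`, and `patience` interact; it is a simplified illustration, not the observer's actual implementation.

```python
import torch


def mse_clip_search(x, num_bits=8, maxshrink=0.2, grid=100.0, norm=2.4, patience=5):
    """Shrink the clipping range step by step, keeping the scale with the lowest error."""
    q_max = 2 ** (num_bits - 1) - 1
    abs_max = x.abs().amax()
    best_err, best_scale, misses = float("inf"), abs_max / q_max, 0

    for i in range(int(maxshrink * grid)):
        shrink = 1.0 - i / grid                                   # candidate clipping factor
        scale = (shrink * abs_max) / q_max
        q = (x / scale).round().clamp(-q_max - 1, q_max) * scale  # fake-quantize
        err = (q - x).abs().pow(norm).mean()                      # `norm` controls the error metric
        if err < best_err:
            best_err, best_scale, misses = err, scale, 0
        else:
            misses += 1
            if misses >= patience:                                # stop after non-improving steps
                break
    return best_scale


scale = mse_clip_search(torch.randn(64, 512))
```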
## Quantization Strategies

Observers support multiple quantization strategies via the `QuantizationArgs.strategy` field:

- `TENSOR`: Global scale and zero_point across the entire tensor.
- `GROUP`, `TENSOR_GROUP`: Slice the tensor into equal-sized groups along columns.
- `CHANNEL`: Per-channel quantization (e.g., across output dimensions).
- `TOKEN`: Quantize activations along token or sequence dimensions.
- `BLOCK`: *(Not yet implemented)* Placeholder for block-wise quantization.
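As a rough illustration of the group strategies (not the library's internal code), a weight matrix can be viewed as equal-sized column slices, each carrying its own scale:

```python
import torch

weight = torch.randn(64, 512)
group_size = 128

# View the columns as groups of `group_size` and compute one symmetric scale per group.
grouped = weight.reshape(weight.shape[0], -1, group_size)  # [rows, num_groups, group_size]
scales = grouped.abs().amax(dim=-1) / 127                  # one int8 scale per (row, group)
```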
## Observer Configuration Parameters

Observers can be configured with optional keyword arguments that control their behavior. These are passed through the `QuantizationArgs.observer_kwargs` dictionary and parsed internally when the observer is initialized.

Below are the supported configuration parameters and their default values:

| Argument             | Default Value |
|----------------------|---------------|
| `maxshrink`          | `0.20`        |
| `patience`           | `5`           |
| `averaging_constant` | `0.01`        |
| `grid`               | `100.0`       |
| `norm`               | `2.4`         |
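These kwargs can also be set programmatically. The snippet below assumes `QuantizationArgs` exposes `observer` and `observer_kwargs` fields (as they are used in recipes) and simply forwards them to the registry, mirroring what the calibration flow does, with defaults applied for anything omitted.

```python
from compressed_tensors.quantization.quant_args import QuantizationArgs
from llmcompressor.observers import Observer

args = QuantizationArgs(
    num_bits=8,
    strategy="tensor",
    observer="mse",
    observer_kwargs={"maxshrink": 0.1, "patience": 10},
)

# Unpack the configured kwargs and hand them to the registry.
observer = Observer.load_from_registry(
    args.observer,
    quantization_args=args,
    **(args.observer_kwargs or {}),
)
```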
## Example Usage

```python
import torch

from llmcompressor.observers import Observer
from compressed_tensors.quantization.quant_args import QuantizationArgs

args = QuantizationArgs(num_bits=4, strategy="group", group_size=128)
observer = Observer.load_from_registry("minmax", quantization_args=args)

x = torch.randn(64, 512)
scale, zero_point = observer(x)
```
## Example YAML Usage

```yaml
quantization_stage:
  quantization_modifiers:
    GPTQModifier:
      weights:
        observer: mse
        observer_kwargs:
          maxshrink: 0.1
          patience: 10
          averaging_constant: 0.05
          grid: 128.0
          norm: 2.0
        num_bits: 4
        type: int
        symmetric: true
        strategy: channel
      targets:
        - Linear
```
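For context, a recipe like the one above is typically consumed through the `oneshot` calibration entrypoint. The model id, dataset name, and recipe path below are placeholders for illustration; adapt them to your setup.

```python
from transformers import AutoModelForCausalLM

from llmcompressor import oneshot

# Placeholder model/recipe/dataset, shown only to illustrate how a recipe that
# carries observer_kwargs is picked up during calibration.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
oneshot(
    model=model,
    recipe="recipe.yaml",
    dataset="open_platypus",
    num_calibration_samples=64,
    max_seq_length=512,
)
```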

src/llmcompressor/modifiers/quantization/calibration.py

Lines changed: 15 additions & 0 deletions
@@ -18,6 +18,12 @@
 from llmcompressor.observers import Observer
 from llmcompressor.utils.helpers import getattr_chain
 
+DEFAULT_MAXSHRINK = 0.20
+DEFAULT_PATIENCE = 5
+DEFAULT_AVERAGING_CONSTANT = 0.01
+DEFAULT_GRID = 100.0
+DEFAULT_NORM = 2.4
+
 __all__ = [
     "initialize_observer",
     "update_weight_zp_scale",
@@ -60,9 +66,18 @@ def initialize_observer(
         False,
         DynamicType.LOCAL,
     ):
+        observer_kwargs = quantization_args.observer_kwargs or {}
         observer = Observer.load_from_registry(
             quantization_args.observer,
             quantization_args=quantization_args,
+            averaging_constant=observer_kwargs.get(
+                "averaging_constant", DEFAULT_AVERAGING_CONSTANT
+            ),
+            # used by mse observer only, will be ignored by minmax observer
+            maxshrink=observer_kwargs.get("maxshrink", DEFAULT_MAXSHRINK),
+            patience=observer_kwargs.get("patience", DEFAULT_PATIENCE),
+            grid=observer_kwargs.get("grid", DEFAULT_GRID),
+            norm=observer_kwargs.get("norm", DEFAULT_NORM)
         )
         module.register_module(f"{base_name}_observer", observer)

src/llmcompressor/observers/min_max.py

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ def __init__(
         self,
         quantization_args: QuantizationArgs,
         averaging_constant: float = 0.01,
+        **kwargs,
     ):
         super().__init__(quantization_args=quantization_args)
src/llmcompressor/observers/mse.py

Lines changed: 5 additions & 4 deletions
@@ -20,18 +20,19 @@ class MovingAverageMSEObserver(Observer):
     def __init__(
         self,
         quantization_args: QuantizationArgs,
+        maxshrink: float = 0.2,
+        patience: int = 5,
         averaging_constant: float = 0.01,
         grid: float = 100.0,
         norm: float = 2.4,
+        **kwargs,
     ):
         super().__init__(quantization_args=quantization_args)
 
-        kwargs = quantization_args.observer_kwargs or {}
-        self.maxshrink = kwargs.get("maxshrink", 0.20)
-        self.patience = kwargs.get("patience", 5)
-
         self.min_val = {}
         self.max_val = {}
+        self.maxshrink = maxshrink
+        self.patience = patience
         self.averaging_constant = averaging_constant
         self.grid = grid
         self.norm = norm
