TorchAO new observers (#2508)

rohansjoshi · web-flow · commit 19c009d692aa · 2025-07-09T13:04:33.000-07:00
TorchAO new observers (#2508) Summary: Added two observers, AffineQuantizedFixedQParamObserver (which allows manual range setting) and AffineQuantizedMSEObserver (which implements MSE range setting during the first forward pass) Bugfix in quant_primitives Reviewed By: jerryzh168 Differential Revision: D77906174
diff --git a/test/quantization/test_observer.py b/test/quantization/test_observer.py
@@ -14,20 +14,14 @@
 from torch.testing._internal import common_utils
 from torch.testing._internal.common_utils import TestCase
 
-from torchao.quantization.granularity import (
-    PerAxis,
-    PerTensor,
-)
+from torchao.quantization.granularity import PerAxis, PerTensor
 from torchao.quantization.observer import (
+    AffineQuantizedFixedQParamObserver,
     AffineQuantizedMinMaxObserver,
+    AffineQuantizedMSEObserver,
 )
-from torchao.quantization.quant_api import (
-    insert_observers_,
-)
-from torchao.quantization.quant_primitives import (
-    MappingType,
-    ZeroPointDomain,
-)
+from torchao.quantization.quant_api import insert_observers_
+from torchao.quantization.quant_primitives import MappingType, ZeroPointDomain
 
 
 class TestQuantFlow(TestCase):
@@ -145,6 +139,56 @@ def test_block_size_row_errors(self):
             for example_input in example_inputs:
                 obs(example_input)
 
+    def test_mse_observer(self):
+        obs = AffineQuantizedMSEObserver(
+            MappingType.SYMMETRIC,
+            torch.int8,
+            granularity=PerAxis(0),
+            eps=torch.finfo(torch.float32).eps,
+            scale_dtype=torch.float,
+            zero_point_dtype=torch.int,
+            zero_point_domain=ZeroPointDomain.NONE,
+            steps=100,
+            run_once=True,
+        )
+        example_input = torch.randn(10, 2048)
+        obs(example_input)
+
+        scale, zero_point = obs.calculate_qparams()
+        self.assertIsNone(zero_point)
+
+        minmax_obs = AffineQuantizedMinMaxObserver(
+            MappingType.SYMMETRIC,
+            torch.int8,
+            granularity=PerAxis(0),
+            eps=torch.finfo(torch.float32).eps,
+            scale_dtype=torch.float,
+            zero_point_dtype=torch.int,
+            zero_point_domain=ZeroPointDomain.NONE,
+        )
+        minmax_obs(example_input)
+        min_val, max_val = minmax_obs.min_val, minmax_obs.max_val
+        assert torch.all(
+            obs.loss_fn(example_input, obs.min_val, obs.max_val)
+            <= obs.loss_fn(example_input, min_val, max_val) + 1e6
+        )
+
+    def test_fixed_qparams_observer(self):
+        obs = AffineQuantizedFixedQParamObserver(
+            MappingType.SYMMETRIC,
+            torch.float8_e4m3fn,
+            granularity=PerAxis(0),
+            eps=torch.finfo(torch.float32).eps,
+            scale_dtype=torch.float,
+            zero_point_dtype=torch.int,
+            zero_point_domain=ZeroPointDomain.NONE,
+        )
+        example_input = torch.randn(10, 2048)
+        obs(example_input)
+        obs.set_qparams(torch.ones(2048))
+        scale, zero_point = obs.calculate_qparams()
+        self.assertTrue(torch.allclose(scale, torch.ones(2048)))
+
 
 class TestLinearObserver(TestCase):
     @common_utils.parametrize("observe_weight", [True, False])
diff --git a/torchao/quantization/observer.py b/torchao/quantization/observer.py
@@ -10,6 +10,7 @@
 
 import torch
 
+from torchao.quantization.quant_primitives import _fake_quantize_affine
 from torchao.utils import TORCH_VERSION_AT_LEAST_2_5
 
 from .granularity import (
@@ -193,6 +194,185 @@ def calculate_qparams(self) -> Tuple[torch.Tensor, torch.Tensor]:
         )
 
 
+class AffineQuantizedFixedQParamObserver(AffineQuantizedObserverBase):
+    """
+    Observer that allows manual setting of fixed quantization parameters.
+    """
+
+    def __init__(
+        self,
+        mapping_type: MappingType,
+        target_dtype: torch.dtype,
+        granularity: Granularity,
+        quant_min: Optional[int] = None,
+        quant_max: Optional[int] = None,
+        eps: Optional[float] = None,
+        scale_dtype: Optional[torch.dtype] = None,
+        zero_point_dtype: Optional[torch.dtype] = None,
+        preserve_zero: bool = True,
+        zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT,
+        scale: Optional[torch.Tensor] = None,
+        zero_point: Optional[torch.Tensor] = None,
+    ):
+        super().__init__(
+            mapping_type,
+            target_dtype,
+            granularity,
+            quant_min,
+            quant_max,
+            eps,
+            scale_dtype,
+            zero_point_dtype,
+            preserve_zero,
+            zero_point_domain,
+        )
+        if not scale:
+            scale = torch.Tensor([1])
+        if not zero_point:
+            zero_point = torch.zeros_like(scale)
+        self.register_buffer("scale", scale.to(dtype=scale_dtype))
+        self.register_buffer("zero_point", zero_point.to(dtype=zero_point_dtype))
+
+    def set_qparams(self, scale, zero_point=None):
+        if not zero_point:
+            zero_point = torch.zeros_like(scale)
+        self.scale = scale.to(dtype=self.scale_dtype)
+        self.zero_point = zero_point.to(dtype=self.zero_point_dtype)
+
+    def forward(self, input):
+        return input
+
+    def calculate_qparams(self):
+        return self.scale, self.zero_point
+
+
+class AffineQuantizedMSEObserver(AffineQuantizedObserverBase):
+    """
+    Minimize quantization loss caused by outlier via linear search. More details can be found at https://arxiv.org/pdf/2209.13325
+    """
+
+    def __init__(
+        self,
+        mapping_type: MappingType,
+        target_dtype: torch.dtype,
+        granularity: Granularity,
+        quant_min: Optional[int] = None,
+        quant_max: Optional[int] = None,
+        eps: Optional[float] = None,
+        scale_dtype: Optional[torch.dtype] = None,
+        zero_point_dtype: Optional[torch.dtype] = None,
+        preserve_zero: bool = True,
+        zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT,
+        steps: int = 100,
+        run_once: bool = False,
+    ):
+        super().__init__(
+            mapping_type,
+            target_dtype,
+            granularity,
+            quant_min,
+            quant_max,
+            eps,
+            scale_dtype,
+            zero_point_dtype,
+            preserve_zero,
+            zero_point_domain,
+        )
+        self.steps = steps
+        self.calibrated = False
+        self.run_once = run_once
+
+    def mse(self, pred, expect, block_size):
+        loss = (pred - expect).abs().pow(2)
+        shape_for_reduction, reduction_dims = _get_reduction_params(
+            block_size, loss.size()
+        )
+        loss = loss.view(shape_for_reduction)
+        return torch.mean(loss, dim=reduction_dims, keepdim=False)
+
+    def loss_fn(self, x, new_min, new_max):
+        block_size = get_block_size(x.shape, self.granularity)
+        scale, zero_point = choose_qparams_affine_with_min_max(
+            new_min,
+            new_max,
+            self.mapping_type,
+            [],
+            self.target_dtype,
+            self.quant_min,
+            self.quant_max,
+            self.eps,
+            self.scale_dtype,
+            self.zero_point_dtype,
+            self.preserve_zero,
+            self.zero_point_domain,
+        )
+        x_q = _fake_quantize_affine(
+            x,
+            block_size,
+            scale,
+            zero_point,
+            self.target_dtype,
+            self.quant_min,
+            self.quant_max,
+            self.zero_point_domain,
+        )
+        return self.mse(x_q, x, block_size)
+
+    def line_search(self, input):
+        if input.numel() == 0:
+            return input
+
+        input_detached = input.detach()
+        assert self.granularity is not None, "granularity is None"
+        block_size = get_block_size(input_detached.shape, self.granularity)
+
+        shape_for_reduction, reduction_dims = _get_reduction_params(
+            block_size, input_detached.size()
+        )
+        input_detached = input_detached.view(shape_for_reduction)
+        min_val = torch.amin(input_detached, dim=reduction_dims, keepdim=False)
+        max_val = torch.amax(input_detached, dim=reduction_dims, keepdim=False)
+
+        range_val = torch.max(min_val.abs(), max_val)
+        optimal_loss = torch.zeros_like(min_val) + 1e9
+
+        # check which clip range could produce smallest loss
+        for i in range(1, self.steps + 1):
+            thres = range_val / self.steps * i
+            current_loss = self.loss_fn(input, -thres, thres)
+            min_val = torch.where(current_loss < optimal_loss, -thres, min_val)
+            max_val = torch.where(current_loss < optimal_loss, thres, max_val)
+            optimal_loss = torch.min(current_loss, optimal_loss)
+
+        return min_val, max_val
+
+    def forward(self, input):
+        if not (self.run_once and self.calibrated):
+            self.min_val, self.max_val = self.line_search(input)
+            self.calibrated = True
+
+        return input
+
+    def calculate_qparams(self):
+        assert hasattr(self, "min_val") and hasattr(self, "max_val"), (
+            "Expecting the observer has min_val and max_val, please run the observer before calling calculate_qparams"
+        )
+        return choose_qparams_affine_with_min_max(
+            self.min_val,
+            self.max_val,
+            self.mapping_type,
+            [],
+            self.target_dtype,
+            self.quant_min,
+            self.quant_max,
+            self.eps,
+            self.scale_dtype,
+            self.zero_point_dtype,
+            self.preserve_zero,
+            self.zero_point_domain,
+        )
+
+
 if TORCH_VERSION_AT_LEAST_2_5:
     # Allow a model with LinearActivationQuantizedTensor weights to be loaded with `weights_only=True`
     torch.serialization.add_safe_globals([PerRow, PerTensor])
diff --git a/torchao/quantization/quant_primitives.py b/torchao/quantization/quant_primitives.py
@@ -1172,7 +1172,7 @@ def _do_fake_quantize_affine(
     elif zero_point_domain == ZeroPointDomain.FLOAT:
         _quantize_affine = _quantize_affine_tinygemm_no_dtype_cast
         _dequantize_affine = _dequantize_affine_tinygemm_no_dtype_check
-    elif ZeroPointDomain == ZeroPointDomain.NONE:
+    elif zero_point_domain == ZeroPointDomain.NONE:
         _quantize_affine = _quantize_affine_no_zero_point_no_dtype_cast
         _dequantize_affine = _dequantize_affine_no_zero_point_no_dtype_check
     else: