
Commit 45b39b1

Set eps in end-to-end QAT flow (#2180)

* Set eps in end-to-end QAT flow

**Summary:** This commit does two things:

(1) Allow users to set eps in `FakeQuantizeConfig`

(2) For other parts of the QAT flow, set eps to `torch.finfo(torch.float32).eps` for input linear activations, to match the existing hardcoded input activation scale dtype (which is fp32)

The motivation is to support users who wish to lower their models to XNNPACK. That requires the following combination of dtypes during training for an end-to-end numerical match:

- input activations: bf16
- input activation scales: fp32
- input activation eps: `torch.finfo(torch.float32).eps`
- weight: bf16
- weight scales: bf16
- weight eps: `torch.finfo(torch.bfloat16).eps`

However, today there is no way to specify the above in any of the QAT flows. For the recommended `FakeQuantizeConfig` flow, we always use `torch.finfo(x.dtype).eps`, where x is bf16 in this case, and there is no way for users to configure this. This is resolved by (1).

For the legacy `Int8DynActInt4WeightQATQuantizer` flow, we hardcoded input activation scales to always use fp32 in #2085, but did not set the corresponding eps. Today, this flow also uses `torch.finfo(x.dtype).eps` by default, where x is bf16, so it uses the wrong eps value. This is resolved by (2).

**Test Plan:**

python test/quantization/test_qat.py -k test_fake_quantize_config_eps
python test/quantization/test_qat.py -k test_qat_8da4w_eps

* up

---------

Co-authored-by: Scott Roy <161522778+metascroy@users.noreply.github.com>
1 parent b95cf18 commit 45b39b1
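For readers targeting the XNNPACK lowering described above, here is a minimal sketch of how the new `eps` argument expresses that dtype combination in the recommended `FakeQuantizeConfig` flow. The int4 weight dtype, `group_size=32`, and the split into two standalone configs are illustrative assumptions rather than part of this commit, and sub-byte dtypes such as `torch.int4` require a recent PyTorch.

```python
import torch
from torchao.quantization.qat import FakeQuantizeConfig

# Input activations: int8 per-token with fp32 scales and fp32 eps, matching the
# hardcoded fp32 input activation scale dtype mentioned in the commit message.
activation_config = FakeQuantizeConfig(
    torch.int8,
    "per_token",
    is_symmetric=False,
    scale_precision=torch.float32,
    zero_point_precision=torch.float32,
    eps=torch.finfo(torch.float32).eps,
)

# Weights: bf16 scales and bf16 eps. The int4 dtype and group_size=32 are an
# illustrative choice, not something this commit prescribes.
weight_config = FakeQuantizeConfig(
    torch.int4,
    group_size=32,
    is_symmetric=True,
    scale_precision=torch.bfloat16,
    zero_point_precision=torch.bfloat16,
    eps=torch.finfo(torch.bfloat16).eps,
)
```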

9 files changed: +107 −6 lines changed

test/quantization/test_qat.py

Lines changed: 78 additions & 0 deletions
@@ -1513,6 +1513,84 @@ def test_qat_8da4w_prepare_vs_convert(self, dtype: torch.dtype):
         )
         self.assertEqual(len(non_inf_sqnr), 0, fail_message)

+    @unittest.skipIf(
+        not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
+    )
+    def test_fake_quantize_config_eps(self):
+        """
+        Test that users can set arbitrary eps value in `FakeQuantizeConfig`.
+        """
+        eps = 0.00123
+        x = torch.randn(2, 3).to(torch.float32)
+        scale, zp = choose_qparams_affine(
+            x,
+            mapping_type=MappingType.ASYMMETRIC,
+            block_size=(1, 3),
+            target_dtype=torch.int8,
+            quant_min=-128,
+            quant_max=127,
+            eps=eps,
+        )
+        expected_out = _fake_quantize_per_token(x, scale, zp, -128, 127)
+        config = FakeQuantizeConfig(
+            torch.int8,
+            "per_token",
+            is_symmetric=False,
+            eps=eps,
+        )
+        fake_quantizer = FakeQuantizer(config)
+        actual_out = fake_quantizer(x)
+        torch.testing.assert_close(expected_out, actual_out, atol=0, rtol=0)
+
+    @unittest.skipIf(
+        not TORCH_VERSION_AT_LEAST_2_4, "skipping when torch version is 2.4 or lower"
+    )
+    def test_qat_8da4w_eps(self):
+        """
+        Test that the 8da4w QAT flow uses the expected eps.
+        """
+        from torchao.quantization.qat import Int8DynActInt4WeightQATQuantizer
+        from torchao.quantization.utils import per_token_dynamic_quant
+
+        group_size = 16
+        torch.manual_seed(self.SEED)
+        m = M()
+        quantizer = Int8DynActInt4WeightQATQuantizer(groupsize=group_size)
+
+        # prepare
+        prepared_model = quantizer.prepare(m)
+        self.assertEqual(
+            prepared_model.linear1.activation_fake_quantizer.config.eps,
+            torch.finfo(torch.float32).eps,
+        )
+
+        # convert
+        converted_model = quantizer.convert(m)
+        x = m.example_inputs()[0]
+        _input = per_token_dynamic_quant(
+            x,
+            scale_dtype=torch.float32,
+            zero_point_dtype=torch.float32,
+            eps=torch.finfo(torch.float32).eps,
+        )
+        _weight_dq = dequantize_affine(
+            converted_model.linear1.weight,
+            (1, group_size),
+            converted_model.linear1.scales,
+            converted_model.linear1.zeros,
+            torch.int8,
+            quant_min=-8,
+            quant_max=7,
+            output_dtype=torch.float32,
+        )
+        expected_out = torch.nn.functional.linear(
+            _input,
+            _weight_dq,
+            converted_model.linear1.bias,
+        )
+        actual_out = converted_model.linear1(x)
+        torch.testing.assert_close(expected_out, actual_out, atol=0, rtol=0)
+

 if __name__ == "__main__":
     unittest.main()

torchao/experimental/quant_passes.py

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ def _get_q_dq_linear_patterns_replacements_and_filters(
     glbs["a_quant_max"] = None
     glbs["a_mapping_type"] = "ASYMMETRIC"
     glbs["a_scale_dtype"] = torch.float32
-    glbs["a_eps"] = None
+    glbs["a_eps"] = torch.finfo(torch.float32).eps

     lcls = {}

torchao/experimental/tests/test_int8_dynamic_activation_intx_weight.py

Lines changed: 1 addition & 1 deletion
@@ -361,7 +361,7 @@ def test_export_QDQLayout(self):
         self.assertTrue(torch.allclose(eager_results, exported_results))

         expected_lines = [
-            "torch.ops.torchao.choose_qparams_affine.default(input_1, 'ASYMMETRIC', [1, 512], torch.int8, None, None, None, torch.float32, torch.int8)",
+            "torch.ops.torchao.choose_qparams_affine.default(input_1, 'ASYMMETRIC', [1, 512], torch.int8, None, None, 1.1920928955078125e-07, torch.float32, torch.int8)",
             "torch.ops.torchao.quantize_affine.default(input_1, [1, 512], getitem, getitem_1, torch.int8)",
             "torch.ops.torchao.dequantize_affine.default(quantize_affine, [1, 512], getitem, getitem_1, torch.int8)",
             "torch.ops.torchao.dequantize_affine.default",

torchao/quantization/GPTQ.py

Lines changed: 4 additions & 1 deletion
@@ -938,7 +938,10 @@ def linear_forward_8da4w(
     # TODO: in future add ability to specify activation_scale_dtype to PTQ configs
     # and enable similar change here
     x = per_token_dynamic_quant(
-        x, scale_dtype=torch.float32, zero_point_dtype=torch.float32
+        x,
+        scale_dtype=torch.float32,
+        zero_point_dtype=torch.float32,
+        eps=torch.finfo(torch.float32).eps,
     )

     # TODO: verify and remove following reshape code

torchao/quantization/qat/api.py

Lines changed: 3 additions & 0 deletions
@@ -85,6 +85,7 @@ class FakeQuantizeConfig:
     zero_point_domain: ZeroPointDomain
     is_dynamic: bool = True
     range_learning: bool = False
+    eps: Optional[float] = None

     def __init__(
         self,
@@ -96,6 +97,7 @@ def __init__(
         zero_point_domain: ZeroPointDomain = ZeroPointDomain.INT,
         is_dynamic: bool = True,
         range_learning: bool = False,
+        eps: Optional[float] = None,
         *,
         group_size: Optional[int] = None,
         is_symmetric: Optional[bool] = None,
@@ -110,6 +112,7 @@ def __init__(
         self.zero_point_domain = zero_point_domain
         self.is_dynamic = is_dynamic
         self.range_learning = range_learning
+        self.eps = eps

         # Validate dtype
         all_dtypes = [torch.int8, torch.uint8]
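A minimal usage sketch of the new field (the bf16 input tensor and its shape are illustrative, not from the commit): leaving `eps` as `None` keeps the previous behavior of deriving eps from the input dtype, while setting it pins the minimum scale explicitly.

```python
import torch
from torchao.quantization.qat import FakeQuantizeConfig
from torchao.quantization.qat.fake_quantizer import FakeQuantizer

# Fake-quantize bf16 activations while clamping scales with fp32 eps, as needed
# for the XNNPACK lowering described in the commit message.
config = FakeQuantizeConfig(
    torch.int8,
    "per_token",
    is_symmetric=False,
    eps=torch.finfo(torch.float32).eps,  # eps=None would fall back to torch.finfo(x.dtype).eps
)
fake_quantizer = FakeQuantizer(config)
x = torch.randn(2, 3, dtype=torch.bfloat16)
y = fake_quantizer(x)
```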

torchao/quantization/qat/fake_quantizer.py

Lines changed: 3 additions & 0 deletions
@@ -81,6 +81,7 @@ def _per_token_forward(self, x: torch.Tensor):
             target_dtype=self.config.dtype,
             quant_min=qmin,
             quant_max=qmax,
+            eps=self.config.eps,
             scale_dtype=self.config.scale_precision,
             zero_point_dtype=self.config.zero_point_precision,
         )
@@ -117,13 +118,15 @@ def _per_channel_or_group_forward(self, x: torch.Tensor):
                 bit_width,
                 group_size,
                 scale_precision,
+                eps=self.config.eps,
             )
         else:
             (self.scale, self.zero_point) = get_groupwise_affine_qparams(
                 x,
                 bit_width,
                 group_size,
                 scale_precision,
+                eps=self.config.eps,
             )
         self.zero_point = self.zero_point.to(zero_point_precision)

torchao/quantization/qat/linear.py

Lines changed: 7 additions & 1 deletion
@@ -177,6 +177,8 @@ def __init__(
         self.padding_allowed: bool = padding_allowed
         self.precision: torch.dtype = precision
         self.scales_precision: torch.dtype = scales_precision
+        # TODO: generalize this
+        self.activation_scales_precision = torch.float32

     def prepare(
         self, model: torch.nn.Module, *args: Any, **kwargs: Any
@@ -247,7 +249,7 @@ def _convert_qat_linear_8da4w(self, module: torch.nn.Module):
                 self._convert_qat_linear_8da4w(child)

     def get_activation_fake_quantize_config(self) -> Optional[FakeQuantizeConfig]:
-        return _get_8da4w_activation_config(self.scales_precision)
+        return _get_8da4w_activation_config(self.activation_scales_precision)

     def get_weight_fake_quantize_config(self) -> Optional[FakeQuantizeConfig]:
         return _get_8da4w_weight_config(self.groupsize, self.scales_precision)
@@ -280,6 +282,7 @@ def __init__(
     ) -> None:
         # Use torch.float32 to match torchao.quantization.quant_api._int8_asymm_per_token_quant,
         # which is used in PTQ routines
+        # TODO: generalize this
         activation_config = _get_8da4w_activation_config(torch.float32)
         weight_config = _get_8da4w_weight_config(groupsize, scales_precision)
         super().__init__(
@@ -320,13 +323,16 @@ def _get_8da4w_activation_config(qparams_precision: torch.dtype) -> FakeQuantize
     """
     Return the activation `FakeQuantizeConfig` for `Int8DynActInt4WeightQATQuantizer`.
     """
+    # TODO: generalize this
+    assert qparams_precision == torch.float32
     return FakeQuantizeConfig(
         dtype=torch.int8,
         granularity="per_token",
         is_symmetric=False,
         is_dynamic=True,
         scale_precision=qparams_precision,
         zero_point_precision=qparams_precision,
+        eps=torch.finfo(qparams_precision).eps,
     )
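As `test_qat_8da4w_eps` above verifies, the legacy quantizer now pairs its fp32 input activation scales with fp32 eps. A rough sketch of what this looks like after `prepare`; the toy module and `groupsize=16` are illustrative stand-ins for the test's `M()`, not part of the commit.

```python
import torch
from torchao.quantization.qat import Int8DynActInt4WeightQATQuantizer

# Toy stand-in for the test's M(); only the `linear1` attribute name matters here.
class TinyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(64, 64)

    def forward(self, x):
        return self.linear1(x)

quantizer = Int8DynActInt4WeightQATQuantizer(groupsize=16)
prepared = quantizer.prepare(TinyModel())

# After this commit, the input activation fake quantizer carries fp32 eps to
# match its (already fp32) scale dtype.
cfg = prepared.linear1.activation_fake_quantizer.config
assert cfg.scale_precision == torch.float32
assert cfg.eps == torch.finfo(torch.float32).eps
```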

torchao/quantization/quant_api.py

Lines changed: 2 additions & 0 deletions
@@ -627,13 +627,15 @@ def _int8_asymm_per_token_quant(x: torch.Tensor) -> torch.Tensor:
     mapping_type = MappingType.ASYMMETRIC
     target_dtype = torch.int8
     scale_dtype = torch.float32
+    eps = torch.finfo(torch.float32).eps
     zero_point_dtype = torch.int8
     if TORCH_VERSION_AT_LEAST_2_6:
         return to_affine_quantized_intx(
             x,
             mapping_type,
             _get_per_token_block_size(x),
             target_dtype,
+            eps=eps,
             scale_dtype=scale_dtype,
             zero_point_dtype=zero_point_dtype,
         )

torchao/quantization/utils.py

Lines changed: 8 additions & 2 deletions
@@ -324,6 +324,7 @@ def get_groupwise_affine_qparams(
     dtype=torch.bfloat16,
     zero_point_domain=ZeroPointDomain.FLOAT,
     preserve_zero=False,
+    eps=None,
 ):
     if groupsize > w.shape[-1]:
         groupsize = w.shape[-1]
@@ -337,7 +338,8 @@ def get_groupwise_affine_qparams(
     block_size = (1, groupsize)
     quant_min = 0
     quant_max = 2**n_bit - 1
-    eps = 1e-6
+    if eps is None:
+        eps = 1e-6
     scale_dtype = dtype
     zero_point_dtype = (
         dtype if zero_point_domain != ZeroPointDomain.INT else torch.int32
@@ -530,6 +532,7 @@ def get_group_qparams_symmetric(
     groupsize=128,
     precision=torch.float32,
     mapping_type=MappingType.SYMMETRIC,
+    eps=None,
 ):
     # needed for GPTQ with padding
     if groupsize > w.shape[-1]:
@@ -540,7 +543,8 @@ def get_group_qparams_symmetric(
     assert n_bit <= 8, f"unsupported n_bit: {n_bit}"

     block_size = (1, groupsize)
-    eps = torch.finfo(w.dtype).eps
+    if eps is None:
+        eps = torch.finfo(w.dtype).eps
     ranges = {}
     ranges[1] = (-1, 0)
     # generating ranges for bit 2 to 8
@@ -591,6 +595,7 @@ def per_token_dynamic_quant(
     input: torch.Tensor,
     scale_dtype: torch.dtype = torch.float32,
     zero_point_dtype: torch.dtype = torch.float32,
+    eps: Optional[float] = None,
 ) -> torch.Tensor:
     mapping_type = MappingType.ASYMMETRIC
     block_size = _get_per_token_block_size(input)
@@ -608,6 +613,7 @@ def per_token_dynamic_quant(
         quant_max,
         scale_dtype=scale_dtype,
         zero_point_dtype=zero_point_dtype,
+        eps=eps,
     )
     q = quantize_affine(
         input,
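A small sketch of the new `eps` pass-through on `per_token_dynamic_quant` (the input shape and bf16 dtype are illustrative). With `eps=None` the function behaves as before, deriving eps from the input dtype; the explicit fp32 value below matches the 8da4w convert path exercised by `test_qat_8da4w_eps`.

```python
import torch
from torchao.quantization.utils import per_token_dynamic_quant

x = torch.randn(8, 64, dtype=torch.bfloat16)

# Previous behavior: eps is derived from x.dtype (bf16 here).
y_default = per_token_dynamic_quant(x)

# New: fp32 scales/zero points with fp32 eps, matching the converted 8da4w linear.
y_fp32_eps = per_token_dynamic_quant(
    x,
    scale_dtype=torch.float32,
    zero_point_dtype=torch.float32,
    eps=torch.finfo(torch.float32).eps,
)
```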
