
Commit 55e5d40

[quant] Prototype for unified quantization API
Summary: To reduce the mental overhead for users, we would like to unify the quantization flow across different quantization methods. If we offer a separate API for each method, the APIs become hard to learn and use, since users need to be familiar with every one of them and know how to drive each flow, e.g.

- dynamic quant: API flow 1
- weight-only quant: API flow 2
- GPTQ: API flow 3
- static quant: API flow 4
- QAT: API flow 5

If each method has its own flow (some are one-line APIs, others take multiple steps with calibration or training), the UX suffers: users have to remember every flow, or keep going back to the tutorial, to use these APIs.

Instead, we'd like a unified quantization API and flow. Initial plan:

```python
class Quantizer():
    def quantize(self, model: torch.nn.Module) -> torch.nn.Module:
        pass


class TwoStepQuantizer():
    # Note: each Quantizer will have their own implementation for prepare and convert
    def prepare(self, model: torch.nn.Module) -> torch.nn.Module:
        # implementation 1
        # model = prepare_pt2e(model, self)
        # implementation 2, module swap, modifying weights with tensor subclass etc.
        # model = ...
        ...
        return model

    def convert(self, model: torch.nn.Module) -> torch.nn.Module:
        # implementation 1
        # model = convert_pt2e(model, self)
        ...
        # implementation 2
        # model = ...
        return model

    def save(self, model: torch.nn.Module, *args, **kwargs) -> None:
        pass

    def load(self, *args, **kwargs) -> torch.nn.Module:
        pass


class ExportQuantizer(TwoStepQuantizer):
    ...
    def annotate(self, model: fx.GraphModule) -> None:
        # [optional] used only in export based flow
        ...

    def prepare(self, model: fx.GraphModule) -> fx.GraphModule:
        ...


class XNNPACKQuantizer(ExportQuantizer):
    def annotate(...):
        ...

captured_model = capture(eager_model)
quantizer = XNNPACKQuantizer(captured_model)
model = quantizer.prepare(model)
model = quantizer.convert(model)

captured_model = capture(eager_model)
quantizer = Quantizer(captured_model, is_qat=True)
model = quantizer.prepare(model)
model = quantizer.convert(model)


class GPTQQuantizer(Quantizer):
    ...
    def quantize(...):
        ...
    def convert(...):
        ...

quantizer = GPTQQuantizer()
model = quantizer.quantize(eager_model)
torch.save(model.state_dict(), "gptq_weights.pt")

quantizer = GPTQQuantizer(load_time=True)
model = quantizer.quantize(eager_model)
model.load_state_dict(torch.load("gptq_weights.pt"))


class DynamicQuantizer(Quantizer):
    ...
    def quantize(...):
        ...

quantizer = DynamicQuantizer()
model = quantizer.quantize(eager_model)


class WeightOnlyQuantizer(Quantizer):
    ...
    def quantize(...):
        ...

quantizer = WeightOnlyQuantizer()
model = quantizer.quantize(eager_model)
```

Test Plan: python test/quantization/test_quant_api.py

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: e34a9d7
Pull Request resolved: #17
1 parent c9b397d commit 55e5d40
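
To make the one-step flow above concrete, here is a minimal sketch of what a `Quantizer` subclass could look like when implemented as a plain module swap. It assumes only the `Quantizer` base class that this commit adds to `torchao/quantization/quant_api.py`; the `Int8WeightOnlyLinear` wrapper, its quantization scheme, and the `SimpleWeightOnlyQuantizer` name are hypothetical and not part of the commit:

```python
import torch
from torchao.quantization.quant_api import Quantizer  # base class added in this commit


class Int8WeightOnlyLinear(torch.nn.Module):
    """Hypothetical wrapper: stores an int8 weight plus a per-output-channel scale."""

    def __init__(self, linear: torch.nn.Linear):
        super().__init__()
        w = linear.weight.detach()
        scale = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 127.0
        self.register_buffer("int_weight", torch.round(w / scale).to(torch.int8))
        self.register_buffer("scale", scale)
        self.bias = linear.bias

    def forward(self, x):
        # dequantize on the fly; a real backend would use a fused int8 kernel instead
        w = self.int_weight.to(x.dtype) * self.scale
        return torch.nn.functional.linear(x, w, self.bias)


class SimpleWeightOnlyQuantizer(Quantizer):
    """One-step flow: a single quantize() call, no calibration or training needed."""

    def quantize(self, model: torch.nn.Module) -> torch.nn.Module:
        for name, child in model.named_children():
            if isinstance(child, torch.nn.Linear):
                setattr(model, name, Int8WeightOnlyLinear(child))
            else:
                self.quantize(child)  # recurse into nested submodules
        return model


# usage follows the unified API:
# model = SimpleWeightOnlyQuantizer().quantize(eager_model)
```

A real implementation would more likely rely on tensor subclasses or fused kernels rather than eager dequantization, as hinted in the `TwoStepQuantizer` comments above.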

2 files changed: +152 -1 lines changed

test/quantization/test_quant_api.py

Lines changed: 133 additions & 0 deletions
@@ -0,0 +1,133 @@
```python
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# mypy: ignore-errors
import unittest
import torch
from torch._export import capture_pre_autograd_graph
from torch.ao.quantization.quantize_pt2e import (
    prepare_pt2e,
    convert_pt2e,
)
from torch.ao.quantization.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)

from torchao.quantization.quant_api import _replace_with_custom_fn_if_matches_filter
from torchao.quantization.quant_api import apply_dynamic_quant
from torchao.quantization.quant_api import (
    Quantizer,
    TwoStepQuantizer,
)

def dynamic_quant(model, example_inputs):
    m = capture_pre_autograd_graph(model, example_inputs)
    quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config(is_dynamic=True))
    m = prepare_pt2e(m, quantizer)
    m = convert_pt2e(m)
    return m

def _apply_dynamic_quant(model):
    """
    Applies dynamic symmetric per-token activation and per-channel weight
    quantization to all linear layers in the given model using
    module swaps.
    """
    _replace_with_custom_fn_if_matches_filter(
        model,
        lambda linear_mod: dynamic_quant(linear_mod, (torch.randn(1, linear_mod.in_features),)),
        lambda mod, fqn: isinstance(mod, torch.nn.Linear),
    )
    return model


def capture_and_prepare(model, example_inputs):
    m = capture_pre_autograd_graph(model, example_inputs)
    quantizer = XNNPACKQuantizer().set_global(get_symmetric_quantization_config(is_dynamic=True))
    m = prepare_pt2e(m, quantizer)
    # TODO: we can run the weight observer in convert_pt2e so that users don't need to run this
    m(*example_inputs)
    return m

class XNNPackDynamicQuantizer(TwoStepQuantizer):

    def prepare(self, model: torch.nn.Module) -> torch.nn.Module:
        _replace_with_custom_fn_if_matches_filter(
            model,
            lambda linear_mod: capture_and_prepare(linear_mod, (torch.randn(1, linear_mod.in_features),)),
            lambda mod, fqn: isinstance(mod, torch.nn.Linear),
        )
        return model

    def convert(self, model: torch.nn.Module) -> torch.nn.Module:
        _replace_with_custom_fn_if_matches_filter(
            model,
            lambda linear_mod: convert_pt2e(linear_mod),
            lambda mod, fqn: isinstance(mod, torch.fx.GraphModule),
        )
        return model

class TorchCompileDynamicQuantizer(Quantizer):
    def quantize(self, model: torch.nn.Module) -> torch.nn.Module:
        apply_dynamic_quant(model)
        return model

class M(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(5, 5).to(torch.float)
        self.linear2 = torch.nn.Linear(5, 5).to(torch.float)

    def forward(self, x):
        x = self.linear1(x)
        x = self.linear2(x)
        return x

class TestQuantFlow(unittest.TestCase):
    def test_dynamic_quant_gpu_singleline(self):
        m = M().eval()
        m = _apply_dynamic_quant(m)
        example_inputs = (torch.randn(1, 5).to(dtype=torch.float32),)
        quantized = m(*example_inputs)
        # AssertionError: Expecting input to have dtype torch.float32, but got dtype: torch.float64
        # While executing %choose_qparams_tensor_1 : [num_users=2] = call_function[target=torch.ops.quantized_decomposed.choose_qparams.tensor](args = (%arg0_3, -128, 127, 0.000244140625, torch.int8), kwargs = {})
        # m = torch.compile(m, mode="max-autotune")
        # print(example_inputs[0].dtype)
        # compiled = m(*example_inputs)
        # torch.testing.assert_close(quantized, compiled, atol=0, rtol=0)

    @unittest.skip("skipping for now due to torch.compile error")
    def test_dynamic_quant_gpu_unified_api_unified_impl(self):
        quantizer = XNNPackDynamicQuantizer()
        m = M().eval()
        m = quantizer.prepare(m)
        m = quantizer.convert(m)
        example_inputs = (torch.randn(1, 5).to(dtype=torch.float32),)
        quantized = m(*example_inputs)
        # AssertionError: Expecting input to have dtype torch.float32, but got dtype: torch.float64
        # While executing %choose_qparams_tensor_1 : [num_users=2] = call_function[target=torch.ops.quantized_decomposed.choose_qparams.tensor](args = (%arg0_3, -128, 127, 0.000244140625, torch.int8), kwargs = {})
        m = torch.compile(m, mode="max-autotune")
        # print(example_inputs[0].dtype)
        compiled = m(*example_inputs)
        torch.testing.assert_close(quantized, compiled, atol=0, rtol=0)

    def test_dynamic_quant_gpu_unified_api_eager_mode_impl(self):
        quantizer = TorchCompileDynamicQuantizer()
        m = M().eval()
        m = quantizer.quantize(m)
        example_inputs = (torch.randn(1, 5).to(dtype=torch.float32),)
        quantized = m(*example_inputs)
        m = torch.compile(m, mode="max-autotune")
        compiled = m(*example_inputs)
        torch.testing.assert_close(quantized, compiled, atol=0, rtol=0)

    def test_gptq(self):
        # should be similar to TorchCompileDynamicQuantizer
        pass

if __name__ == "__main__":
    unittest.main()
```
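
For the `test_gptq` placeholder above, a rough sketch of what the test could eventually become, following the GPTQQuantizer save/load flow described in the commit summary. `GPTQQuantizer` and its `load_time` argument are only planned in that summary and are not implemented in this commit:

```python
# hypothetical sketch, not part of this diff
def test_gptq(self):
    # first run: quantize with GPTQ and persist the quantized state_dict
    quantizer = GPTQQuantizer()
    m = quantizer.quantize(M().eval())
    torch.save(m.state_dict(), "gptq_weights.pt")

    # later run: rebuild the quantized model structure, then load the saved weights
    quantizer = GPTQQuantizer(load_time=True)
    m2 = quantizer.quantize(M().eval())
    m2.load_state_dict(torch.load("gptq_weights.pt"))

    example_inputs = (torch.randn(1, 5),)
    torch.testing.assert_close(m(*example_inputs), m2(*example_inputs))
```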

torchao/quantization/quant_api.py

Lines changed: 19 additions & 1 deletion
```diff
@@ -35,9 +35,27 @@
     "change_linear_weights_to_int8_dqtensors",
     "change_linear_weights_to_int8_woqtensors",
     "change_linear_weights_to_int4_woqtensors",
-    "swap_conv2d_1x1_to_linear"
+    "swap_conv2d_1x1_to_linear",
+    "Quantizer",
+    "TwoStepQuantizer",
 ]
 
+############################# Unified Quantization APIs ##############################
+# API 1, single quantize call to create a quantized model with quantized state_dict
+class Quantizer:
+    def quantize(self, model: torch.nn.Module, *args, **kwargs) -> torch.nn.Module:
+        pass
+
+
+# API 2, flow that needs calibration or training
+class TwoStepQuantizer:
+    def prepare(self, model: torch.nn.Module) -> torch.nn.Module:
+        pass
+
+    def convert(self, model: torch.nn.Module) -> torch.nn.Module:
+        pass
+
+############################# Unified Quantization APIs ##############################
 
 def _replace_with_custom_fn_if_matches_filter(
     model, replacement_fn, filter_fn, cur_fqn=""
```
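
The `prepare` and `convert` stubs added here are intentionally empty; each backend supplies its own implementation. As a hedged sketch of how a two-step (calibration-based) flow is meant to be driven, where `MyStaticQuantizer` and `calibration_loader` are placeholders and not part of this commit:

```python
import torch

# placeholder subclass; any TwoStepQuantizer implementation is driven the same way
quantizer = MyStaticQuantizer()

model = quantizer.prepare(model)             # step 1: insert observers / capture the graph

with torch.no_grad():
    for example_inputs in calibration_loader:  # placeholder calibration data
        model(*example_inputs)                  # run calibration to collect statistics

model = quantizer.convert(model)             # step 2: fold observed qparams into quantized ops
```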
