
Commit 4413f75

Autoquant
Summary: there is currently an issue where, for models with multiple linear layers, dynamic quantization gives very slow results on the later linear layers; it is unclear why.

Test Plan: python test/test.py -k "autoquant"

<class 'torchao.quantization.autoquant.DefaultLinear'> (torch.Size([65536, 1280]), torch.Size([3840, 1280]), torch.Size([3840]))
187.4432 0
AUTOTUNE addmm(65536x3840, 65536x1280, 1280x3840)
  bias_addmm 2.9764 ms 100.0%
  triton_mm_1 3.6858 ms 80.8%
  triton_mm_2 3.7502 ms 79.4%
  addmm 3.7887 ms 78.6%
  triton_mm_3 4.1547 ms 71.6%
  triton_mm_4 4.2022 ms 70.8%
  triton_mm_0 4.7970 ms 62.0%
  triton_mm_8 4.9596 ms 60.0%
  triton_mm_7 5.4343 ms 54.8%
  triton_mm_10 6.9352 ms 42.9%
SingleProcess AUTOTUNE takes 5.6320 seconds
<torch.utils.benchmark.utils.common.Measurement object at 0x7f98800eb760>
  f(*args, **kwargs)  3.08 ms  1 measurement, 20 runs, 1 thread
<class 'torchao.quantization.autoquant.DefaultLinear'> 3.07677136734128
1311.548416 0
<class 'torchao.quantization.subclass.Int8WeightOnlyQuantizedLinearWeight'> (torch.Size([65536, 1280]), torch.Size([3840, 1280]), torch.Size([3840]))
1311.548416 0
AUTOTUNE mixed_mm(65536x1280, 1280x3840)
  fallback_mixed_mm 2.5089 ms 100.0%
  triton_mm_13 6.4153 ms 39.1%
  triton_mm_14 6.6832 ms 37.5%
  triton_mm_12 7.0896 ms 35.4%
  triton_mm_16 7.5022 ms 33.4%
  triton_mm_15 7.8426 ms 32.0%
  triton_mm_19 9.5269 ms 26.3%
  triton_mm_20 11.2033 ms 22.4%
  triton_mm_17 13.1675 ms 19.1%
  triton_mm_18 13.8004 ms 18.2%
SingleProcess AUTOTUNE takes 2.4977 seconds
<torch.utils.benchmark.utils.common.Measurement object at 0x7f986ff12050>
  f(*args, **kwargs)  3.68 ms  1 measurement, 20 runs, 1 thread
<torch.utils.benchmark.utils.common.Measurement object at 0x7f986ff27b80>
  f(*args, **kwargs)  3.10 ms  1 measurement, 20 runs, 1 thread
<class 'torchao.quantization.subclass.Int8WeightOnlyQuantizedLinearWeight'> 3.6846738075837493 3.1023880932480097
2144.447488 25
<class 'torchao.quantization.subclass.Int8DynamicallyQuantizedLinearWeight'> (torch.Size([65536, 1280]), torch.Size([3840, 1280]), torch.Size([3840]))
2144.447488 25
AUTOTUNE int_mm(65536x1280, 1280x3840, 65536x3840)
  triton_mm_43 2.0319 ms 100.0%
  triton_mm_35 2.8135 ms 72.2%
  triton_mm_42 3.1552 ms 64.4%
  triton_mm_36 3.1754 ms 64.0%
  triton_mm_44 3.3460 ms 60.7%
  triton_mm_41 3.4036 ms 59.7%
  triton_mm_37 3.5030 ms 58.0%
  triton_mm_34 3.6553 ms 55.6%
  triton_mm_38 3.9232 ms 51.8%
  triton_mm_40 9.1934 ms 22.1%
SingleProcess AUTOTUNE takes 8.1948 seconds
<torch.utils.benchmark.utils.common.Measurement object at 0x7f9892843f40>
  f(*args, **kwargs)  3.13 ms  1 measurement, 20 runs, 1 thread
<torch.utils.benchmark.utils.common.Measurement object at 0x7f986cfd33a0>
  f(*args, **kwargs)  2.21 ms  1 measurement, 20 runs, 1 thread
<class 'torchao.quantization.subclass.Int8DynamicallyQuantizedLinearWeight'> 3.1286065466701984 2.210085652768612
2144.447488 22
<class 'torchao.quantization.autoquant.DefaultLinear'> (torch.Size([65536, 3840]), torch.Size([1280, 3840]), torch.Size([1280]))
2144.447488 22
AUTOTUNE addmm(65536x1280, 65536x3840, 3840x1280)
  bias_addmm 2.7966 ms 100.0%
  addmm 3.0447 ms 91.9%
  triton_mm_57 3.5612 ms 78.5%
  triton_mm_58 3.6919 ms 75.7%
  triton_mm_59 4.1908 ms 66.7%
  triton_mm_60 4.2350 ms 66.0%
  triton_mm_56 4.7210 ms 59.2%
  triton_mm_64 4.9001 ms 57.1%
  triton_mm_63 5.5218 ms 50.6%
  triton_mm_66 7.1417 ms 39.2%
SingleProcess AUTOTUNE takes 6.3734 seconds
<torch.utils.benchmark.utils.common.Measurement object at 0x7f9888dd2b30>
  f(*args, **kwargs)  3.33 ms  1 measurement, 20 runs, 1 thread
<class 'torchao.quantization.autoquant.DefaultLinear'> 3.329739556647837
2228.913664 39
<class 'torchao.quantization.subclass.Int8WeightOnlyQuantizedLinearWeight'> (torch.Size([65536, 3840]), torch.Size([1280, 3840]), torch.Size([1280]))
2228.913664 39
AUTOTUNE mixed_mm(65536x3840, 3840x1280)
  fallback_mixed_mm 2.3987 ms 100.0%
  triton_mm_70 6.9153 ms 34.7%
  triton_mm_72 7.1634 ms 33.5%
  triton_mm_69 7.3164 ms 32.8%
  triton_mm_68 7.5070 ms 32.0%
  triton_mm_71 7.5631 ms 31.7%
  triton_mm_76 10.7759 ms 22.3%
  triton_mm_75 11.0692 ms 21.7%
  triton_mm_73 12.8898 ms 18.6%
  triton_mm_77 13.3715 ms 17.9%
SingleProcess AUTOTUNE takes 6.2342 seconds
<torch.utils.benchmark.utils.common.Measurement object at 0x7f9880133fd0>
  f(*args, **kwargs)  3.48 ms  1 measurement, 20 runs, 1 thread
<torch.utils.benchmark.utils.common.Measurement object at 0x7f988175b610>
  f(*args, **kwargs)  3.22 ms  1 measurement, 20 runs, 1 thread
<class 'torchao.quantization.subclass.Int8WeightOnlyQuantizedLinearWeight'> 3.4762858413159847 3.2240213360637426
2228.913664 38
<class 'torchao.quantization.subclass.Int8DynamicallyQuantizedLinearWeight'> (torch.Size([65536, 3840]), torch.Size([1280, 3840]), torch.Size([1280]))
2228.913664 38
AUTOTUNE int_mm(65536x3840, 3840x1280, 65536x1280)
  triton_mm_99 1.4307 ms 100.0%
  triton_mm_100 1.9041 ms 75.1%
  triton_mm_91 2.6079 ms 54.9%
  triton_mm_98 2.6363 ms 54.3%
  triton_mm_92 2.6691 ms 53.6%
  triton_mm_93 3.0178 ms 47.4%
  triton_mm_97 3.0233 ms 47.3%
  triton_mm_94 3.1872 ms 44.9%
  triton_mm_90 3.6072 ms 39.7%
  triton_mm_96 8.4695 ms 16.9%
SingleProcess AUTOTUNE takes 8.1095 seconds
<torch.utils.benchmark.utils.common.Measurement object at 0x7f9881782f80>
  f(*args, **kwargs)  145.38 ms  1 measurement, 20 runs, 1 thread
<torch.utils.benchmark.utils.common.Measurement object at 0x7f9892843f70>
  f(*args, **kwargs)  143.98 ms  1 measurement, 20 runs, 1 thread
<class 'torchao.quantization.subclass.Int8DynamicallyQuantizedLinearWeight'> 145.37517526187003 143.98446583654732
2230.364672 79

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: 67787a1
Pull Request resolved: #38
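For reference, the new TestAutoQuant.test_auto_quant case exercises the API roughly as follows (a minimal sketch distilled from the diff below; the inductor flags are the ones the test sets):

import torch
from torchao.quantization.quant_api import do_autoquant

# Inductor settings used by the new test; they enable the mixed_mm and
# fused int_mm paths that the autoquant candidates are benchmarked against.
torch._inductor.config.epilogue_fusion = False
torch._inductor.config.use_mixed_mm = True
torch._inductor.config.force_fuse_int_mm_with_mul = True
torch._inductor.config.coordinate_descent_tuning = True

model = torch.nn.Sequential(
    torch.nn.Linear(1280, 3840),
    torch.nn.ReLU(),
    torch.nn.Linear(3840, 1280),
).to("cuda").to(torch.bfloat16)
example_input = torch.randn(65536, 1280, device="cuda", dtype=torch.bfloat16)

# Runs the model once, benchmarks every candidate weight subclass per linear
# shape, then swaps the fastest option into each layer.
do_autoquant(model, example_input)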
1 parent 969038f commit 4413f75

File tree

6 files changed: +344 lines, -2 lines


test/test.py

Lines changed: 16 additions & 0 deletions

@@ -24,6 +24,7 @@
     change_linear_weights_to_int8_woqtensors,
     change_linear_weights_to_int4_woqtensors,
     _replace_with_custom_fn_if_matches_filter,
+    do_autoquant
 )
 from torchao.quantization.quant_primitives import (
     dequantize_per_channel,
@@ -1195,6 +1196,21 @@ def test_on_dummy_distilbert(self):
         print("sqnr_pt_quant", sqnr_pt_quant)
         self.assertTrue(sqnr_sq >= 8.0)
 
+class TestAutoQuant(unittest.TestCase):
+    def test_auto_quant(self):
+        model = torch.nn.Sequential(
+            # torch.nn.Linear(5120,1280),
+            # torch.nn.ReLU(),
+            torch.nn.Linear(1280,3840),
+            torch.nn.ReLU(),
+            torch.nn.Linear(3840,1280),
+        ).to("cuda").to(torch.bfloat16)
+        example_input = torch.randn(65536,1280, device="cuda", dtype=torch.bfloat16)
+        torch._inductor.config.epilogue_fusion = False
+        torch._inductor.config.use_mixed_mm = True
+        torch._inductor.config.force_fuse_int_mm_with_mul = True
+        torch._inductor.config.coordinate_descent_tuning = True
+        do_autoquant(model, example_input)
 
 if __name__ == "__main__":
     unittest.main()

test/test_autoquant.py

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+# mypy: ignore-errors
+import copy
+import unittest
+
+import torch
+import torch.nn as nn
+from torchao.quantization.quant_api import (
+    change_linears_to_autoquantizable,
+    change_autoquantizable_to_quantized
+)
+from torchao.quantization.quant_api import do_autoquant
+from torch._dynamo import config
+torch.manual_seed(0)
+config.cache_size_limit = 100
+
+
+class AutoquantTests(unittest.TestCase):
+    def test_autoquant_e2e(self):
+        model = torch.nn.Sequential(torch.nn.Linear(32,32), torch.nn.ReLU(), torch.nn.Linear(32,32)).cuda().to(torch.bfloat16)
+        print(model, model[0].weight)
+        example_input = torch.randn((1,64,32), dtype=torch.bfloat16, device="cuda")
+        out = model(example_input)
+        print(out.sum())
+        do_autoquant(model, example_input)
+        print(model, model[0].weight)
+        print(model(example_input).sum())
+
+if __name__ == "__main__":
+    unittest.main()

torchao/quantization/autoquant.py

Lines changed: 200 additions & 0 deletions

@@ -0,0 +1,200 @@
+import torch
+
+from .subclass import ( # noqa
+    Int8DynamicallyQuantizedLinearWeight,
+    Int8WeightOnlyQuantizedLinearWeight,
+    QuantizedLinearWeightBase,
+)
+from torch.utils._python_dispatch import return_and_correct_aliasing
+from .utils import benchmark
+
+aten = torch.ops.aten
+
+AUTOQUANT_CACHE = {}
+
+def check_cache(shape, cls):
+    if shape in AUTOQUANT_CACHE:
+        return AUTOQUANT_CACHE[shape].get(cls, None)
+    else:
+        return None
+
+def update_cache(shape, cls, res):
+    if not shape in AUTOQUANT_CACHE:
+        AUTOQUANT_CACHE[shape] = {}
+    AUTOQUANT_CACHE[shape][cls] = res
+
+class AutoQuantizableLinearWeight(torch.Tensor):
+    """
+    When run, finds the best type of quantization for this tensor and swaps itself with that.
+    """
+    @staticmethod
+    def __new__(cls, weight, qtensor_class_list, *args, **kwargs):
+        kwargs["device"] = weight.device
+        kwargs["layout"] = (
+            kwargs.get("layout") if kwargs.get("layout", False) else weight.layout
+        )
+        kwargs["dtype"] = (
+            kwargs.get("dtype") if kwargs.get("dtype", False) else weight.dtype
+        )
+        kwargs["requires_grad"] = False
+        shape = kwargs.pop("shape", weight.shape)
+        return torch.Tensor._make_wrapper_subclass(cls, shape, **kwargs)  # type: ignore[attr-defined]
+
+    def __init__(self, weight, qtensor_class_list, *args, **kwargs):
+        self.weight = weight
+        self.qtensor_class_list = qtensor_class_list
+        self.cache_shape = None
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}(data={self.weight}, shape={self.shape}, "
+            f"device={self.device}, dtype={self.dtype}, qtensor_class_list={self.qtensor_class_list})"
+        )
+
+    @staticmethod
+    def tune_autoquant(act_mat, w_autoquant, bias):
+        orig_shape = act_mat.shape
+        act_mat = act_mat.reshape(-1, act_mat.shape[-1])
+        cache_shape = (act_mat.shape, w_autoquant.shape, None if bias is None else bias.shape)
+        w_autoquant.cache_shape = cache_shape
+        for cur_cls in w_autoquant.qtensor_class_list:
+            if check_cache(cache_shape, cur_cls) is None:
+                with torch.no_grad():
+                    print(cur_cls, cache_shape)
+                    print(torch.cuda.max_memory_allocated()/1e6, torch.cuda.memory_usage())
+                    res = cur_cls._autoquant_test(act_mat.clone(), w_autoquant.weight.clone(), None if bias is None else bias.clone())
+                    update_cache(cache_shape, cur_cls, res)
+                    print(torch.cuda.max_memory_allocated()/1e6, torch.cuda.memory_usage())
+        y = torch.mm(act_mat, w_autoquant.weight.t())
+        y = y.reshape(*orig_shape[:-1], y.shape[-1])
+        if bias is not None:
+            y += bias
+        return y
+
+    def to_quantized(self):
+        if self.cache_shape is None or self.cache_shape not in AUTOQUANT_CACHE:
+            raise RuntimeError("must run module normally to find best quantization option")
+        best_time = torch.inf
+        best_cls = None
+        for cur_cls in self.qtensor_class_list:
+            cls_res = AUTOQUANT_CACHE[self.cache_shape].get(cur_cls, torch.inf)
+            if best_time >= cls_res:
+                best_time = cls_res
+                best_cls = cur_cls
+        # need to handle random cls args/kwargs?
+        self = best_cls.from_float(self.weight)
+        return self
+
+    def _apply_fn_to_data(self, fn):
+        return self.__class__(
+            fn(self.weight), self.qtensor_class_list, dtype=self.dtype
+        )
+
+    def __tensor_flatten__(self):
+        return ["weight"], [self.qtensor_class_list, self.dtype, self.shape]
+
+    @classmethod
+    def __tensor_unflatten__(cls, tensor_data_dict, tensor_attributes, outer_size=None, outer_stride=None):
+        weight = tensor_data_dict["weight"]
+        qtensor_class_list, dtype, shape = tensor_attributes[0]
+        return cls(weight, qtensor_class_list, shape=shape if outer_size is None else outer_size, dtype=dtype, strides=outer_stride)
+
+    @classmethod
+    def from_float(cls, weight, qtensor_class_list):
+        return cls(weight, qtensor_class_list)
+
+    @classmethod
+    def __torch_function__(cls, func, types, args=(), kwargs=None):
+        kwargs = {} if kwargs is None else kwargs
+
+        if func is torch.nn.functional.linear:
+            mat1, w_autoquant, bias = (
+                args[0],
+                args[1],
+                args[2] if len(args) > 2 else None
+            )
+            return cls.tune_autoquant(mat1, w_autoquant, bias)
+
+        try:
+            with torch._C.DisableTorchFunctionSubclass():
+                return func(*args, **kwargs)
+        except:
+            print(f"ERR: subclass doesn't implement {func}")
+
+    @classmethod
+    def __torch_dispatch__(cls, func, types, args, kwargs):
+        if func is aten.detach.default:
+            return return_and_correct_aliasing(func, args, kwargs, args[0]._apply_fn_to_data(torch.detach))
+
+
+class DefaultLinear(torch.Tensor):
+    """
+    A class to be used in concert with AutoQuantizableLinearWeight to provide a
+    default/non-quantized option. Only implements the bare minimum needed to work with the
+    AutoQuantizableLinearWeight class using the same interfaces that would normally be
+    used by QTensor subclasses but for a default linear op instead.
+    """
+    def __init__(self):
+        super().__init__()
+
+    @classmethod
+    def _autoquant_test(cls, act_mat, weight, bias):
+        w_qtensor = cls.from_float(weight)
+        q_c_op = torch.compile(cls._quantized_op, mode="max-autotune")
+        with torch.no_grad():
+            res = benchmark(q_c_op, act_mat, w_qtensor, bias)
+        print(cls, res)
+        return res
+
+    @staticmethod
+    def _quantized_op(act_mat, w_qtensor, bias):
+        return torch.nn.functional.linear(act_mat, w_qtensor, bias)
+
+    @classmethod
+    def from_float(cls, weight):
+        return weight
+
+DEFAULT_CLASS_LIST = [
+    DefaultLinear,
+    Int8WeightOnlyQuantizedLinearWeight,
+    Int8DynamicallyQuantizedLinearWeight,
+]
+
+if False:
+    # def _get_to_kwargs(self, *args, **kwargs):
+    #     device, dtype, _, memory_format = torch._C._nn._parse_to(*args, **kwargs)
+    #     device = self.device if device is None else device
+    #     dtype = self.dtype if dtype is None else dtype
+    #     memory_format = (
+    #         memory_format if memory_format is not None else torch.preserve_format
+    #     )
+    #     kwargs = {
+    #         "device": device,
+    #         "dtype": dtype,
+    #         "memory_format": memory_format,
+    #     }
+    #     return kwargs
+
+    # def to(self, *args, **kwargs):
+    #     kwargs = self._get_to_kwargs(*args, **kwargs)
+    #     return self.__class__(
+    #         self.int_data.to(kwargs["device"]),
+    #         self.q_scales.to(kwargs["device"]),
+    #         self.transposed,
+    #         self.shape,
+    #         **kwargs,
+    #     )
+
+    # def _apply_fn_to_data(self, fn):
+    #     return self.__class__(
+    #         fn(self.int_data), fn(self.q_scales), self.transposed, self.shape, dtype=self.dtype
+    #     )
+
+    # def _change_shape(self, shape):
+    #     return self.__class__(
+    #         self.int_data, self.q_scales, self.transposed, shape, dtype=self.dtype
+    #     )
+
+    # def half(self):
+    #     return self.to(torch.float16)
+    pass
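The DefaultLinear docstring above spells out the interface a candidate class needs in order to take part in autoquant: from_float, _quantized_op, and _autoquant_test. As a rough sketch of how an extra candidate could be plugged in (the class and list names below are hypothetical, not part of this commit):

import torch
from torchao.quantization.autoquant import DEFAULT_CLASS_LIST
from torchao.quantization.utils import benchmark  # same helper autoquant.py imports

class HypotheticalLinearWeight(torch.Tensor):
    # Illustrative candidate: mirrors DefaultLinear, so it benchmarks the plain
    # high-precision linear; a real candidate would quantize the weight in from_float.

    @classmethod
    def from_float(cls, weight):
        return weight

    @staticmethod
    def _quantized_op(act_mat, w_qtensor, bias):
        return torch.nn.functional.linear(act_mat, w_qtensor, bias)

    @classmethod
    def _autoquant_test(cls, act_mat, weight, bias):
        # The returned timing is cached per (activation, weight, bias) shape and
        # compared in AutoQuantizableLinearWeight.to_quantized().
        w_qtensor = cls.from_float(weight)
        q_c_op = torch.compile(cls._quantized_op, mode="max-autotune")
        with torch.no_grad():
            return benchmark(q_c_op, act_mat, w_qtensor, bias)

# Hypothetical extended list, passed to do_autoquant via qtensor_class_list=...
MY_CLASS_LIST = DEFAULT_CLASS_LIST + [HypotheticalLinearWeight]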

torchao/quantization/quant_api.py

Lines changed: 37 additions & 1 deletion

@@ -28,6 +28,7 @@
 from .weight_only import (
     WeightOnlyInt8QuantLinear,
 )
+from .autoquant import AutoQuantizableLinearWeight, DEFAULT_CLASS_LIST
 
 __all__ = [
     "apply_weight_only_int8_quant",
@@ -95,9 +96,11 @@ def apply_dynamic_quant(model, filter_fn=None):
 
 
 def _get_subclass_inserter(cls, **kwargs):
+    method = kwargs.pop("method", "from_float")
     def insert_subclass(lin):
         lin.weight = torch.nn.Parameter(
-            cls.from_float(lin.weight, **kwargs), requires_grad=False
+            # cls.from_float(...)
+            getattr(cls, method)(lin.weight, **kwargs), requires_grad=False
         )
         return lin
 
@@ -153,6 +156,39 @@ def change_linear_weights_to_int4_woqtensors(model, **kwargs):
         filter_fn,
     )
 
+
+def change_linears_to_autoquantizable(model, **kwargs):
+    filter_fn = kwargs.pop("filter_fn", _is_linear)
+    _replace_with_custom_fn_if_matches_filter(
+        model,
+        _get_subclass_inserter(AutoQuantizableLinearWeight, **kwargs),
+        filter_fn if filter_fn is not None else _is_linear,
+    )
+
+def change_autoquantizable_to_quantized(model, **kwargs):
+    filter_fn = kwargs.pop(
+        "filter_fn",
+        lambda mod, *args:
+            _is_linear(mod, *args) and
+            isinstance(mod.weight, AutoQuantizableLinearWeight)
+    )
+    _replace_with_custom_fn_if_matches_filter(
+        model,
+        _get_subclass_inserter(
+            AutoQuantizableLinearWeight, method="to_quantized", **kwargs
+        ),
+        filter_fn,
+    )
+
+@torch.no_grad()
+def do_autoquant(model, example_input, qtensor_class_list=DEFAULT_CLASS_LIST, filter_fn=_is_linear):
+    change_linears_to_autoquantizable(model, filter_fn=filter_fn, qtensor_class_list=qtensor_class_list)
+    if not isinstance(example_input, (tuple, list)):
+        example_input = [example_input]
+    model(*example_input)
+    change_autoquantizable_to_quantized(model)
+    return model
+
 def swap_conv2d_1x1_to_linear(model, filter_fn=None):
     """
     Changes all conv2d 1x1 modules to equivalent linear modules so that they can then be quantized.
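do_autoquant is a thin wrapper around the two helpers added above; a sketch of the equivalent manual flow (assuming a CUDA model and a single example input, as in the tests):

import torch
from torchao.quantization.autoquant import DEFAULT_CLASS_LIST
from torchao.quantization.quant_api import (
    change_linears_to_autoquantizable,
    change_autoquantizable_to_quantized,
)

@torch.no_grad()
def manual_autoquant(model, example_input):
    # 1) Wrap every linear weight in an AutoQuantizableLinearWeight that knows
    #    which candidate classes to try.
    change_linears_to_autoquantizable(model, qtensor_class_list=DEFAULT_CLASS_LIST)
    # 2) Run the model once; each wrapper benchmarks its candidates for the
    #    observed (activation, weight, bias) shapes and caches the timings.
    model(example_input)
    # 3) Swap each wrapper for the fastest candidate found in the cache.
    change_autoquantizable_to_quantized(model)
    return model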

torchao/quantization/subclass.py

Lines changed: 43 additions & 1 deletion

@@ -13,8 +13,11 @@
     groupwise_affine_quantize_tensor,
     quant_int8_dynamic_per_token_linear,
     unpack_tinygemm_scales_and_zeros,
+    quantize_activation_per_token_absmax,
+    quant_int8_per_token_matmul,
+    safe_int_mm,
 )
-from .utils import find_multiple
+from .utils import find_multiple, benchmark
 import warnings
 
 
@@ -197,6 +200,25 @@ def _quantized_op(act_mat, w_qtensor, bias):
             act_mat, w_qtensor.int_data, w_qtensor.q_scales, bias, act_mat.dtype
         )
 
+    @classmethod
+    def _autoquant_test(cls, act_mat, weight, bias):
+        w_qtensor = cls.from_float(weight)
+        q_c_op = torch.compile(cls._quantized_op, mode="max-autotune")
+        with torch.no_grad():
+            res = benchmark(q_c_op, act_mat, w_qtensor, bias)
+        x_vals_int8, x_scales = quantize_activation_per_token_absmax(
+            act_mat.reshape(-1, act_mat.shape[-1])
+        )
+        quantized_matmul = (
+            lambda x_vals_int8, x_scales, w_vals_int8:
+                safe_int_mm(x_vals_int8, w_vals_int8) * x_scales
+        )
+        q_c_matmul = torch.compile(quantized_matmul, mode="max-autotune")
+        with torch.no_grad():
+            res2 = benchmark(q_c_matmul, x_vals_int8, x_scales, w_qtensor.int_data)
+        print(cls, res, res2)
+        return (res + res2) / 2
+
     def dequantize(self, dtype=None):
         """
         Obtain the dequantized version of the quantized tensor subclass
@@ -292,6 +314,26 @@ def _quantized_op(act_mat, w_qtensor, bias):
         y += bias
         return y.to(orig_dtype)
 
+    @classmethod
+    def _autoquant_test(cls, act_mat, weight, bias):
+        w_qtensor = cls.from_float(weight)
+        q_c_op = torch.compile(cls._quantized_op, mode="max-autotune")
+        with torch.no_grad():
+            res = benchmark(q_c_op, act_mat, w_qtensor, bias)
+
+        quantized_matmul = (
+            lambda act_mat, w_vals_int8:
+                torch.mm(act_mat, w_vals_int8.to(act_mat.dtype))
+        )
+        q_c_matmul = torch.compile(quantized_matmul, mode="max-autotune")
+        with torch.no_grad():
+            res2 = benchmark(
+                q_c_matmul,
+                act_mat.reshape(-1, act_mat.shape[-1]),
+                w_qtensor.int_data,
+            )
+        print(cls, res, res2)
+        return (res + res2) / 2
 
 class Int4WeightOnlyQuantizedLinearWeight(QuantizedLinearWeightBase):
     """
