)
from torch.utils._python_dispatch import return_and_correct_aliasing
from .utils import benchmark
+from .quant_primitives import (
+    quantize_activation_per_token_absmax,
+    safe_int_mm,
+)
+import torch.nn.functional as F

aten = torch.ops.aten

@@ -70,23 +75,30 @@ def tune_autoquant(self, q_cls):
        with torch.no_grad():
            act_mat = torch.randn(act_shape, dtype=self.logged_dtype, device=self.device)
            bias = None if bias_shape is None else torch.randn(bias_shape, dtype=self.logged_dtype, device=self.device)
-            print(q_cls, self.logged_shape, self.logged_dtype)
-            print("mem", torch.cuda.max_memory_allocated()/1e6, torch.cuda.memory_usage())
            res = q_cls._autoquant_test(act_mat, self.weight, bias)
            update_cache(q_cls, self.logged_shape, self.logged_dtype, res)

-    def to_quantized(self):
-        if self.logged_shape is None or self.logged_dtype is None:
+    def to_quantized(self, error_on_unseen, **kwargs):
+        if error_on_unseen and (self.logged_shape is None or self.logged_dtype is None):
            raise RuntimeError("must run module normally to get shape, dtype info for autoquant")
+        elif (self.logged_shape is None or self.logged_dtype is None) and not error_on_unseen:
+            # default back to non-quantized weight if not seen
+            self = AQFloatLinearWeight.from_float(self.weight)
+            return self
        best_time = torch.inf
        best_cls = None
+        do_print = False
        for q_cls in self.qtensor_class_list:
            if check_cache(q_cls, self.logged_shape, self.logged_dtype) is None:
+                do_print = True
                self.tune_autoquant(q_cls)
+                torch._dynamo.reset()
            cls_res = AUTOQUANT_CACHE.get((q_cls, self.logged_shape, self.logged_dtype), torch.inf)
            if best_time >= cls_res:
                best_time = cls_res
                best_cls = q_cls
+        if do_print:
+            print(f"shape={self.logged_shape}, dtype={self.logged_dtype}, best_cls={best_cls}")
        # TODO handle random cls args/kwargs? or should they be curried
        self = best_cls.from_float(self.weight)
        return self
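The updated `to_quantized` benchmarks each candidate subclass once per logged (shape, dtype), caches the timing, and keeps the fastest; when nothing was logged and `error_on_unseen` is False it falls back to the float weight. A standalone sketch of that select-by-cached-benchmark pattern (illustrative names only, not the repo's API):

```python
import torch

AUTOQUANT_CACHE = {}  # (cls, shape, dtype) -> measured milliseconds

def pick_best(candidates, shape, dtype, run_benchmark, fallback):
    # Mirror of the loop in to_quantized: benchmark uncached candidates,
    # then take the argmin over cached timings, defaulting to `fallback`.
    if shape is None or dtype is None:
        return fallback
    best_time, best_cls = torch.inf, fallback
    for cls in candidates:
        key = (cls, shape, dtype)
        if key not in AUTOQUANT_CACHE:
            AUTOQUANT_CACHE[key] = run_benchmark(cls)
        if AUTOQUANT_CACHE[key] < best_time:
            best_time, best_cls = AUTOQUANT_CACHE[key], cls
    return best_cls
```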
@@ -132,26 +144,93 @@ def __torch_dispatch__(cls, func, types, args, kwargs):
        if func is aten.detach.default:
            return return_and_correct_aliasing(func, args, kwargs, args[0]._apply_fn_to_data(torch.detach))

-
-class DefaultLinear(torch.Tensor):
+class AQMixin():
    """
-    An class to be used in concert with AutoQuantizableLinearWeight to provide a
-    default/non-quantized option. Only implements the bare minimum needed to work with the
-    AutoQuantizableLinearWeight class using the same interfaces that would normally be
-    used by QTensor subclasses but for a default linear op instead.
+    Mixin to turn normal quantized subclasses into autoquantizable ones
    """
-    def __init__(self):
-        super().__init__()
-
    @classmethod
    def _autoquant_test(cls, act_mat, weight, bias):
        w_qtensor = cls.from_float(weight)
-        q_c_op = torch.compile(cls._quantized_op, mode="max-autotune")
+        func = lambda act_mat, w_qtensor, bias: F.relu(cls._quantized_op(F.relu(act_mat), w_qtensor, bias))
+        q_c_op = torch.compile(func, mode="max-autotune")
+        # q_c_op = torch.compile(cls._quantized_op, mode="max-autotune")
        with torch.no_grad():
-            res = benchmark(q_c_op, act_mat, w_qtensor, bias)
+            torch.cuda.synchronize()
+            res = benchmark(q_c_op, act_mat, w_qtensor, bias)
        print(cls, res)
        return res
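Wrapping `_quantized_op` between two `F.relu` calls means the compiled benchmark also sees the pointwise ops a linear typically sits between, so candidates that fuse well with their neighbors score better. The `benchmark` helper comes from `.utils` and is not shown in this diff; a minimal CUDA-event timer in the same spirit might look like this (a sketch, assuming a CUDA device; the real helper may differ):

```python
import torch

def time_cuda_ms(fn, *args, warmup=3, iters=10):
    # Warm-up runs so torch.compile / autotuning cost is excluded from timing.
    for _ in range(warmup):
        fn(*args)
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    torch.cuda.synchronize()
    start.record()
    for _ in range(iters):
        fn(*args)
    end.record()
    torch.cuda.synchronize()
    return start.elapsed_time(end) / iters  # average milliseconds per call
```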

+class AQInt8DynamicallyQuantizedLinearWeight(AQMixin, Int8DynamicallyQuantizedLinearWeight):
+    """
+    AutoQuantizable version of Int8DynamicallyQuantizedLinearWeight
+    """
+    @classmethod
+    def _autoquant_test(cls, act_mat, weight, bias):
+        res = super()._autoquant_test(act_mat, weight, bias)
+        w_qtensor = cls.from_float(weight)
+        x_vals_int8, x_scales = quantize_activation_per_token_absmax(
+            act_mat.reshape(-1, act_mat.shape[-1])
+        )
+        quantized_matmul = (
+            lambda x_vals_int8, x_scales, w_vals_int8:
+                safe_int_mm(x_vals_int8, w_vals_int8) * x_scales
+        )
+        q_c_matmul = torch.compile(quantized_matmul, mode="max-autotune")
+        with torch.no_grad():
+            res2 = benchmark(q_c_matmul, x_vals_int8, x_scales, w_qtensor.int_data)
+        print(cls, "matmul", res2)
+        # for SAM best is between .458-.499, SDXL .45=3.094 .47=2.880 .48=3.036 .5=2.930
+        return res
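The extra `res2` measurement times only the int8 matmul (`safe_int_mm` plus the scale multiply), separating GEMM cost from the activation-quantization overhead that is included in `res`. For reference, per-token absmax quantization of the activations reduces to the following (an illustrative re-implementation; the actual `quantize_activation_per_token_absmax` may differ in dtype and edge-case handling):

```python
import torch

def per_token_absmax_quantize(x, eps=1e-5):
    # One scale per token (row): absmax / 127, then round-and-clamp into int8.
    scales = x.abs().amax(dim=-1, keepdim=True).clamp(min=eps) / 127.0
    x_int8 = torch.clamp(torch.round(x / scales), -128, 127).to(torch.int8)
    return x_int8, scales
```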
+
+
+class AQWeightOnlyQuantizedLinearWeight(Int8WeightOnlyQuantizedLinearWeight, AQMixin):
+    """
+    AutoQuantizable version of Int8WeightOnlyQuantizedLinearWeight
+    """
+
+class AQWeightOnlyQuantizedLinearWeight2(Int8WeightOnlyQuantizedLinearWeight, AQMixin):
+    """
+    AutoQuantizable version of Int8WeightOnlyQuantizedLinearWeight that
+    uses a different kernel
+    """
+    @staticmethod
+    def _quantized_op(act_mat, w_qtensor, bias):
+        orig_dtype = act_mat.dtype
+        orig_shape = act_mat.shape
+        act_mat = act_mat.reshape(-1, act_mat.shape[-1], 1)
+        y = (act_mat * w_qtensor.int_data.unsqueeze(0)).sum(dim=-2)
+        y = y.reshape(*orig_shape[:-1], y.shape[-1])
+        if bias is not None:
+            y += bias
+        return y.to(orig_dtype)
+
+    @classmethod
+    def _autoquant_test(cls, act_mat, weight, bias):
+        # if act_mat has batchsize>2 don't use this kernel
+        if act_mat.reshape(-1, act_mat.shape[-1]).shape[0] > 2:
+            return torch.inf
+        return super()._autoquant_test(act_mat, weight, bias)
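The "different kernel" in `AQWeightOnlyQuantizedLinearWeight2` replaces the GEMM with a broadcasted elementwise multiply followed by a reduction, which can win only for very small batch sizes (hence the `> 2` cutoff above). The two formulations compute the same product, as this small check illustrates (scales and int8 storage omitted for brevity):

```python
import torch

a = torch.randn(2, 8)      # a couple of activation rows
w = torch.randn(8, 16)     # stand-in for the stored weight matrix
y_mm = a @ w
y_bcast = (a.reshape(-1, 8, 1) * w.unsqueeze(0)).sum(dim=-2)
assert torch.allclose(y_mm, y_bcast, atol=1e-5)
```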
+
+class AQWeightOnlyQuantizedLinearWeight3(Int8WeightOnlyQuantizedLinearWeight, AQMixin):
+    def _quantized_op(act_mat, w_qtensor, bias):
+        orig_shape = act_mat.shape
+        y = torch.mm(act_mat.reshape(-1, orig_shape[-1]), w_qtensor.int_data * w_qtensor.q_scales)
+        y = y.reshape(*orig_shape[:-1], y.shape[-1])
+        if bias is not None:
+            y += bias
+        return y
+
+
+class AQFloatLinearWeight(torch.Tensor, AQMixin):
+    """
+    A class to be used in concert with AutoQuantizableLinearWeight to provide a
+    default/non-quantized option. Only implements the bare minimum needed to work with the
+    AutoQuantizableLinearWeight class using the same interfaces that would normally be
+    used by QTensor subclasses but for a default linear op instead.
+    """
+    def __init__(self):
+        super().__init__()
+
    @staticmethod
    def _quantized_op(act_mat, w_qtensor, bias):
        return torch.nn.functional.linear(act_mat, w_qtensor, bias)
@@ -161,10 +240,11 @@ def from_float(cls, weight):
        return weight

DEFAULT_CLASS_LIST = [
-    Int8DynamicallyQuantizedLinearWeight,
-    DefaultLinear,
-    Int8WeightOnlyQuantizedLinearWeight,
-
+    AQFloatLinearWeight,
+    AQInt8DynamicallyQuantizedLinearWeight,
+    AQWeightOnlyQuantizedLinearWeight,
+    AQWeightOnlyQuantizedLinearWeight2,
+    AQWeightOnlyQuantizedLinearWeight3,
]

if False: