
Commit 7f4150f

Update on "Autoquant"
Summary: There is currently an issue where, for models with multiple linear layers, dynamic quant benchmarks very slowly on the later linear layers; it is unclear why. (See the last Int8DynamicallyQuantizedLinearWeight block below: its int_mm kernel autotunes to ~1.4 ms, yet the measured op takes ~145 ms.)

Test Plan: python test/test.py -k "autoquant"

<class 'torchao.quantization.autoquant.DefaultLinear'> (torch.Size([65536, 1280]), torch.Size([3840, 1280]), torch.Size([3840])) 187.4432 0
AUTOTUNE addmm(65536x3840, 65536x1280, 1280x3840)
  bias_addmm 2.9764 ms 100.0%
  triton_mm_1 3.6858 ms 80.8%
  triton_mm_2 3.7502 ms 79.4%
  addmm 3.7887 ms 78.6%
  triton_mm_3 4.1547 ms 71.6%
  triton_mm_4 4.2022 ms 70.8%
  triton_mm_0 4.7970 ms 62.0%
  triton_mm_8 4.9596 ms 60.0%
  triton_mm_7 5.4343 ms 54.8%
  triton_mm_10 6.9352 ms 42.9%
SingleProcess AUTOTUNE takes 5.6320 seconds
<torch.utils.benchmark.utils.common.Measurement object at 0x7f98800eb760>
f(*args, **kwargs)  3.08 ms  1 measurement, 20 runs, 1 thread
<class 'torchao.quantization.autoquant.DefaultLinear'> 3.07677136734128 1311.548416 0

<class 'torchao.quantization.subclass.Int8WeightOnlyQuantizedLinearWeight'> (torch.Size([65536, 1280]), torch.Size([3840, 1280]), torch.Size([3840])) 1311.548416 0
AUTOTUNE mixed_mm(65536x1280, 1280x3840)
  fallback_mixed_mm 2.5089 ms 100.0%
  triton_mm_13 6.4153 ms 39.1%
  triton_mm_14 6.6832 ms 37.5%
  triton_mm_12 7.0896 ms 35.4%
  triton_mm_16 7.5022 ms 33.4%
  triton_mm_15 7.8426 ms 32.0%
  triton_mm_19 9.5269 ms 26.3%
  triton_mm_20 11.2033 ms 22.4%
  triton_mm_17 13.1675 ms 19.1%
  triton_mm_18 13.8004 ms 18.2%
SingleProcess AUTOTUNE takes 2.4977 seconds
<torch.utils.benchmark.utils.common.Measurement object at 0x7f986ff12050>
f(*args, **kwargs)  3.68 ms  1 measurement, 20 runs, 1 thread
<torch.utils.benchmark.utils.common.Measurement object at 0x7f986ff27b80>
f(*args, **kwargs)  3.10 ms  1 measurement, 20 runs, 1 thread
<class 'torchao.quantization.subclass.Int8WeightOnlyQuantizedLinearWeight'> 3.6846738075837493 3.1023880932480097 2144.447488 25

<class 'torchao.quantization.subclass.Int8DynamicallyQuantizedLinearWeight'> (torch.Size([65536, 1280]), torch.Size([3840, 1280]), torch.Size([3840])) 2144.447488 25
AUTOTUNE int_mm(65536x1280, 1280x3840, 65536x3840)
  triton_mm_43 2.0319 ms 100.0%
  triton_mm_35 2.8135 ms 72.2%
  triton_mm_42 3.1552 ms 64.4%
  triton_mm_36 3.1754 ms 64.0%
  triton_mm_44 3.3460 ms 60.7%
  triton_mm_41 3.4036 ms 59.7%
  triton_mm_37 3.5030 ms 58.0%
  triton_mm_34 3.6553 ms 55.6%
  triton_mm_38 3.9232 ms 51.8%
  triton_mm_40 9.1934 ms 22.1%
SingleProcess AUTOTUNE takes 8.1948 seconds
<torch.utils.benchmark.utils.common.Measurement object at 0x7f9892843f40>
f(*args, **kwargs)  3.13 ms  1 measurement, 20 runs, 1 thread
<torch.utils.benchmark.utils.common.Measurement object at 0x7f986cfd33a0>
f(*args, **kwargs)  2.21 ms  1 measurement, 20 runs, 1 thread
<class 'torchao.quantization.subclass.Int8DynamicallyQuantizedLinearWeight'> 3.1286065466701984 2.210085652768612 2144.447488 22

<class 'torchao.quantization.autoquant.DefaultLinear'> (torch.Size([65536, 3840]), torch.Size([1280, 3840]), torch.Size([1280])) 2144.447488 22
AUTOTUNE addmm(65536x1280, 65536x3840, 3840x1280)
  bias_addmm 2.7966 ms 100.0%
  addmm 3.0447 ms 91.9%
  triton_mm_57 3.5612 ms 78.5%
  triton_mm_58 3.6919 ms 75.7%
  triton_mm_59 4.1908 ms 66.7%
  triton_mm_60 4.2350 ms 66.0%
  triton_mm_56 4.7210 ms 59.2%
  triton_mm_64 4.9001 ms 57.1%
  triton_mm_63 5.5218 ms 50.6%
  triton_mm_66 7.1417 ms 39.2%
SingleProcess AUTOTUNE takes 6.3734 seconds
<torch.utils.benchmark.utils.common.Measurement object at 0x7f9888dd2b30>
f(*args, **kwargs)  3.33 ms  1 measurement, 20 runs, 1 thread
<class 'torchao.quantization.autoquant.DefaultLinear'> 3.329739556647837 2228.913664 39

<class 'torchao.quantization.subclass.Int8WeightOnlyQuantizedLinearWeight'> (torch.Size([65536, 3840]), torch.Size([1280, 3840]), torch.Size([1280])) 2228.913664 39
AUTOTUNE mixed_mm(65536x3840, 3840x1280)
  fallback_mixed_mm 2.3987 ms 100.0%
  triton_mm_70 6.9153 ms 34.7%
  triton_mm_72 7.1634 ms 33.5%
  triton_mm_69 7.3164 ms 32.8%
  triton_mm_68 7.5070 ms 32.0%
  triton_mm_71 7.5631 ms 31.7%
  triton_mm_76 10.7759 ms 22.3%
  triton_mm_75 11.0692 ms 21.7%
  triton_mm_73 12.8898 ms 18.6%
  triton_mm_77 13.3715 ms 17.9%
SingleProcess AUTOTUNE takes 6.2342 seconds
<torch.utils.benchmark.utils.common.Measurement object at 0x7f9880133fd0>
f(*args, **kwargs)  3.48 ms  1 measurement, 20 runs, 1 thread
<torch.utils.benchmark.utils.common.Measurement object at 0x7f988175b610>
f(*args, **kwargs)  3.22 ms  1 measurement, 20 runs, 1 thread
<class 'torchao.quantization.subclass.Int8WeightOnlyQuantizedLinearWeight'> 3.4762858413159847 3.2240213360637426 2228.913664 38

<class 'torchao.quantization.subclass.Int8DynamicallyQuantizedLinearWeight'> (torch.Size([65536, 3840]), torch.Size([1280, 3840]), torch.Size([1280])) 2228.913664 38
AUTOTUNE int_mm(65536x3840, 3840x1280, 65536x1280)
  triton_mm_99 1.4307 ms 100.0%
  triton_mm_100 1.9041 ms 75.1%
  triton_mm_91 2.6079 ms 54.9%
  triton_mm_98 2.6363 ms 54.3%
  triton_mm_92 2.6691 ms 53.6%
  triton_mm_93 3.0178 ms 47.4%
  triton_mm_97 3.0233 ms 47.3%
  triton_mm_94 3.1872 ms 44.9%
  triton_mm_90 3.6072 ms 39.7%
  triton_mm_96 8.4695 ms 16.9%
SingleProcess AUTOTUNE takes 8.1095 seconds
<torch.utils.benchmark.utils.common.Measurement object at 0x7f9881782f80>
f(*args, **kwargs)  145.38 ms  1 measurement, 20 runs, 1 thread
<torch.utils.benchmark.utils.common.Measurement object at 0x7f9892843f70>
f(*args, **kwargs)  143.98 ms  1 measurement, 20 runs, 1 thread
<class 'torchao.quantization.subclass.Int8DynamicallyQuantizedLinearWeight'> 145.37517526187003 143.98446583654732 2230.364672 79

Reviewers:
Subscribers:
Tasks:
Tags:

[ghstack-poisoned]
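For orientation, the test plan above boils down to the following repro. This is a sketch, not part of the commit: it assumes a CUDA device, and the import path for do_autoquant (defined in torchao/quantization/quant_api.py in this stack) is an assumption.

```python
import torch
from torchao.quantization.quant_api import do_autoquant  # assumed import path

# Two stacked linears, mirroring the updated test in this commit.
model = torch.nn.Sequential(
    torch.nn.Linear(1280, 3840),
    torch.nn.ReLU(),
    torch.nn.Linear(3840, 1280),
    torch.nn.ReLU(),
).to("cuda").to(torch.bfloat16)
example_input = torch.randn(65536, 1280, device="cuda", dtype=torch.bfloat16)

# The second Linear is where dynamic quant benchmarks at ~145 ms even though
# its int_mm kernel autotunes to ~1.4 ms.
do_autoquant(model, example_input)
```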
1 parent 73ad4a0 commit 7f4150f

File tree: 5 files changed (+68, −38)

test/test.py

Lines changed: 4 additions & 1 deletion
@@ -1204,12 +1204,15 @@ def test_auto_quant(self):
             torch.nn.Linear(1280,3840),
             torch.nn.ReLU(),
             torch.nn.Linear(3840,1280),
+            torch.nn.ReLU(),
         ).to("cuda").to(torch.bfloat16)
-        example_input = torch.randn(65536,1280, device="cuda", dtype=torch.bfloat16)
+        example_input = torch.randn(65536, 1280, device="cuda", dtype=torch.bfloat16)
         torch._inductor.config.epilogue_fusion = False
         torch._inductor.config.use_mixed_mm = True
         torch._inductor.config.force_fuse_int_mm_with_mul = True
         torch._inductor.config.coordinate_descent_tuning = True
+        torch._dynamo.config.automatic_dynamic_shapes = False
+        torch._dynamo.reset() # TODO use in autoquantizer
         do_autoquant(model, example_input)

 if __name__ == "__main__":
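The two new `_dynamo` lines are the interesting part of this hunk. A minimal illustration of what they do, as I read the intent (the TODO says this should eventually move into the autoquantizer itself):

```python
import torch

# Keep dynamo from promoting shapes to dynamic after the first few compiles:
# candidate kernels should be compiled and timed at the exact static shapes
# that were logged.
torch._dynamo.config.automatic_dynamic_shapes = False

# Drop previously compiled graphs so each candidate subclass is compiled and
# benchmarked fresh, rather than reusing or guarding against stale graphs.
torch._dynamo.reset()
```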

torchao/quantization/autoquant.py

Lines changed: 36 additions & 29 deletions
@@ -12,16 +12,11 @@

 AUTOQUANT_CACHE = {}

-def check_cache(shape, cls):
-    if shape in AUTOQUANT_CACHE:
-        return AUTOQUANT_CACHE[shape].get(cls, None)
-    else:
-        return None
+def check_cache(cls, shape, dtype):
+    return AUTOQUANT_CACHE.get((cls, shape, dtype), None)

-def update_cache(shape, cls, res):
-    if not shape in AUTOQUANT_CACHE:
-        AUTOQUANT_CACHE[shape] = {}
-    AUTOQUANT_CACHE[shape][cls] = res
+def update_cache(cls, shape, dtype, res):
+    AUTOQUANT_CACHE[(cls, shape, dtype)] = res

 class AutoQuantizableLinearWeight(torch.Tensor):
     """
@@ -43,7 +38,8 @@ def __new__(cls, weight, qtensor_class_list, *args, **kwargs):
     def __init__(self, weight, qtensor_class_list, *args, **kwargs):
         self.weight = weight
         self.qtensor_class_list = qtensor_class_list
-        self.cache_shape = None
+        self.logged_shape = None
+        self.logged_dtype = None

     def __repr__(self):
         return (
@@ -52,36 +48,46 @@ def __repr__(self):
         )

     @staticmethod
-    def tune_autoquant(act_mat, w_autoquant, bias):
+    def log_shape(act_mat, w_autoquant, bias):
         orig_shape = act_mat.shape
         act_mat = act_mat.reshape(-1, act_mat.shape[-1])
-        cache_shape = (act_mat.shape, w_autoquant.shape, None if bias is None else bias.shape)
-        w_autoquant.cache_shape = cache_shape
-        for cur_cls in w_autoquant.qtensor_class_list:
-            if check_cache(cache_shape, cur_cls) is None:
-                with torch.no_grad():
-                    print(cur_cls, cache_shape)
-                    print(torch.cuda.max_memory_allocated()/1e6, torch.cuda.memory_usage())
-                    res = cur_cls._autoquant_test(act_mat.clone(), w_autoquant.weight.clone(), None if bias is None else bias.clone())
-                    update_cache(cache_shape, cur_cls, res)
-                    print(torch.cuda.max_memory_allocated()/1e6, torch.cuda.memory_usage())
+        logged_shape = (act_mat.shape, w_autoquant.shape, None if bias is None else bias.shape)
+        logged_dtype = act_mat.dtype
+        w_autoquant.logged_shape = logged_shape
+        w_autoquant.logged_dtype = logged_dtype
+        for q_cls in w_autoquant.qtensor_class_list:
+            if check_cache(q_cls, logged_shape, logged_dtype) is None:
+                update_cache(q_cls, logged_shape, logged_dtype, None)
         y = torch.mm(act_mat, w_autoquant.weight.t())
         y = y.reshape(*orig_shape[:-1], y.shape[-1])
         if bias is not None:
             y += bias
         return y

+    def tune_autoquant(self, q_cls):
+        act_shape, w_shape, bias_shape = self.logged_shape
+        if check_cache(q_cls, self.logged_shape, self.logged_dtype) is None:
+            with torch.no_grad():
+                act_mat = torch.randn(act_shape, dtype=self.logged_dtype, device=self.device)
+                bias = None if bias_shape is None else torch.randn(bias_shape, dtype=self.logged_dtype, device=self.device)
+                print(q_cls, self.logged_shape, self.logged_dtype)
+                print("mem", torch.cuda.max_memory_allocated()/1e6, torch.cuda.memory_usage())
+                res = q_cls._autoquant_test(act_mat, self.weight, bias)
+                update_cache(q_cls, self.logged_shape, self.logged_dtype, res)
+
     def to_quantized(self):
-        if self.cache_shape is None or self.cache_shape not in AUTOQUANT_CACHE:
-            raise RuntimeError("must run module normally to find best quantization option")
+        if self.logged_shape is None or self.logged_dtype is None:
+            raise RuntimeError("must run module normally to get shape, dtype info for autoquant")
         best_time = torch.inf
         best_cls = None
-        for cur_cls in self.qtensor_class_list:
-            cls_res = AUTOQUANT_CACHE[self.cache_shape].get(cur_cls, torch.inf)
+        for q_cls in self.qtensor_class_list:
+            if check_cache(q_cls, self.logged_shape, self.logged_dtype) is None:
+                self.tune_autoquant(q_cls)
+            cls_res = AUTOQUANT_CACHE.get((q_cls, self.logged_shape, self.logged_dtype), torch.inf)
             if best_time >= cls_res:
                 best_time = cls_res
-                best_cls = cur_cls
-        # need to handle random cls args/kwargs?
+                best_cls = q_cls
+        # TODO handle random cls args/kwargs? or should they be curried
         self = best_cls.from_float(self.weight)
         return self

@@ -113,7 +119,7 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
                 args[1],
                 args[2] if len(args)>2 else None
             )
-            return cls.tune_autoquant(mat1, w_autoquant, bias)
+            return cls.log_shape(mat1, w_autoquant, bias)

         try:
             with torch._C.DisableTorchFunctionSubclass():
@@ -155,9 +161,10 @@ def from_float(cls, weight):
         return weight

 DEFAULT_CLASS_LIST = [
+    Int8DynamicallyQuantizedLinearWeight,
     DefaultLinear,
     Int8WeightOnlyQuantizedLinearWeight,
-    Int8DynamicallyQuantizedLinearWeight,
+
 ]

 if False:
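The net effect of this diff is a two-phase split: during the model's real forward pass, `log_shape` only records `(shape, dtype)` keys, and the expensive benchmarking now happens lazily in `to_quantized` / `tune_autoquant` on freshly generated random tensors instead of clones of the live activations. A condensed sketch of that flow, with hypothetical stand-in names for everything outside this file:

```python
import torch

AUTOQUANT_CACHE = {}  # (q_cls, shape, dtype) -> measured time, or None

def log_shape_only(q_cls_list, act, weight, bias):
    # Phase 1, inside the real forward: record keys, benchmark nothing.
    shape = (act.shape, weight.shape, None if bias is None else bias.shape)
    for q_cls in q_cls_list:
        AUTOQUANT_CACHE.setdefault((q_cls, shape, act.dtype), None)

def tune_pending(benchmark_one):
    # Phase 2, at to_quantized() time: time each un-measured candidate on
    # synthetic tensors rebuilt from the logged shape/dtype.
    # benchmark_one is a hypothetical callback standing in for _autoquant_test.
    for (q_cls, shape, dtype), res in list(AUTOQUANT_CACHE.items()):
        if res is None:
            act_shape, w_shape, bias_shape = shape
            act = torch.randn(act_shape, dtype=dtype)
            AUTOQUANT_CACHE[(q_cls, shape, dtype)] = benchmark_one(q_cls, act)
```

Deferring the benchmark also removes the `act_mat.clone()` / `weight.clone()` calls from the old `tune_autoquant`, which should lower peak memory during the logging forward pass.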

torchao/quantization/quant_api.py

Lines changed: 1 addition & 0 deletions
@@ -184,6 +184,7 @@ def change_autoquantizable_to_quantized(model, **kwargs):
 def do_autoquant(model, example_input, qtensor_class_list=DEFAULT_CLASS_LIST, filter_fn=_is_linear):
     change_linears_to_autoquantizable(model, filter_fn=filter_fn, qtensor_class_list=qtensor_class_list)
     if not isinstance(example_input, (tuple, list)):
+        assert isinstance(example_input, torch.Tensor)
         example_input = [example_input]
     model(*example_input)
     change_autoquantizable_to_quantized(model)
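The added assert tightens the contract of `do_autoquant`: a non-tuple/list input must be a bare tensor. A usage sketch under that assumption (CUDA device and import path assumed):

```python
import torch
from torchao.quantization.quant_api import do_autoquant  # assumed import path

model = torch.nn.Sequential(torch.nn.Linear(64, 64)).to("cuda").to(torch.bfloat16)

# A bare tensor is checked, then wrapped in a list before the calibration run.
example_input = torch.randn(128, 64, device="cuda", dtype=torch.bfloat16)
do_autoquant(model, example_input)

# Passing any other non-tuple/list (e.g. a dict) now fails fast at the assert.
```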

torchao/quantization/subclass.py

Lines changed: 4 additions & 2 deletions
@@ -206,6 +206,7 @@ def _autoquant_test(cls, act_mat, weight, bias):
         q_c_op = torch.compile(cls._quantized_op, mode="max-autotune")
         with torch.no_grad():
             res=benchmark(q_c_op, act_mat, w_qtensor, bias)
+
         x_vals_int8, x_scales = quantize_activation_per_token_absmax(
             act_mat.reshape(-1, act_mat.shape[-1])
         )
@@ -217,6 +218,7 @@ def _autoquant_test(cls, act_mat, weight, bias):
         with torch.no_grad():
             res2=benchmark(q_c_matmul, x_vals_int8, x_scales, w_qtensor.int_data)
         print(cls, res, res2)
+        breakpoint()
         return (res+res2)/2

     def dequantize(self, dtype=None):
@@ -331,8 +333,8 @@ def _autoquant_test(cls, act_mat, weight, bias):
             q_c_matmul,
             act_mat.reshape(-1, act_mat.shape[-1]),
             w_qtensor.int_data)
-        print(cls, res, res2
-        )
+
+        print(cls, res, res2)
         return (res+res2)/2

 class Int4WeightOnlyQuantizedLinearWeight(QuantizedLinearWeightBase):
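Both `_autoquant_test` implementations score a candidate as the average of two timings: `res`, the full compiled quantized op, and `res2`, just the core matmul on pre-quantized inputs. In effect (a simplified reading of the diff, not the literal code):

```python
def autoquant_score(res_ms: float, res2_ms: float) -> float:
    # res_ms:  full compiled op (quantize activations + matmul + rescale)
    # res2_ms: core matmul alone, with inputs already quantized
    # The mean splits the difference between the two measurements.
    return (res_ms + res2_ms) / 2
```

(The `breakpoint()` added in the second hunk reads as leftover debugging.)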

torchao/quantization/utils.py

Lines changed: 23 additions & 6 deletions
@@ -88,14 +88,31 @@ def get_model_size_in_bytes(model):
         s += b.nelement() * b.element_size()
     return s

+import time
+
+def benchmark_torch_function(iters, f, *args, **kwargs):
+    f(*args, **kwargs)
+    f(*args, **kwargs)
+    f(*args, **kwargs)
+    if torch.cuda.is_available():
+        torch.cuda.synchronize()
+        start_event = torch.cuda.Event(enable_timing=True)
+        end_event = torch.cuda.Event(enable_timing=True)
+        start_event.record()
+    else:
+        t0 = time.time()
+    for i in range(iters):
+        f(*args, **kwargs)
+    if torch.cuda.is_available():
+        end_event.record()
+        torch.cuda.synchronize()
+        return start_event.elapsed_time(end_event)
+    else:
+        return (time.time() - t0)
+
 def benchmark(f, *args, **kwargs):
     t0 = Timer(
         stmt="f(*args, **kwargs)", globals={"args": args, "kwargs": kwargs, "f": f}
     )
     # warmup
-    t0.timeit(10).median
-    t0.blocked_autorange()
-    res = t0.timeit(20)
-    print(res)
-
-    return res.median * 1e3
+    return benchmark_torch_function(10, f, *args, **kwargs)
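One thing to watch, reading the diff as committed: the CUDA path returns total elapsed time in milliseconds across all `iters` (`Event.elapsed_time` reports ms), while the CPU fallback returns seconds, and `benchmark` now reports the total for 10 iterations where it previously reported the median per-call time in ms. A usage sketch under those assumptions, on a CUDA device:

```python
import torch
from torchao.quantization.utils import benchmark_torch_function

x = torch.randn(1024, 1024, device="cuda")
total_ms = benchmark_torch_function(10, torch.mm, x, x)  # CUDA events -> ms for 10 iters
print(total_ms / 10, "ms per torch.mm call")
```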
