Skip to content

Commit 1ae9a9a

Browse files
committed
testing autoquant
Summary: improves throughput from 19.70 to 19.76 img/sec. Test Plan: sh run.sh. Reviewers: Subscribers: Tasks: Tags: ghstack-source-id: 9c6098e Pull Request resolved: #114
1 parent 387488b commit 1ae9a9a

File tree

3 files changed

+19
-3
lines changed

3 files changed

+19
-3
lines changed

experiments/eval_combo.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
 from data import build_data, setup_coco_img_ids
 import math
 import segment_anything_fast
+import torchao
 
 torch._dynamo.config.cache_size_limit = 50000
 
@@ -289,7 +290,7 @@ def run(
     profile_top=False,
     memory_path=None,
     use_local_sam_fork=False,
-    use_compiler_settings=False,
+    use_compiler_settings=True,
 ):
     from torch._inductor import config as inductorconfig
     inductorconfig.triton.unique_kernel_names = True
@@ -298,6 +299,7 @@ def run(
     if use_compiler_settings:
         # inductorconfig.fx_graph_cache = True # seems to slow performance
         inductorconfig.epilogue_fusion = False
+        torch._dynamo.config.automatic_dynamic_shapes = False
         inductorconfig.coordinate_descent_tuning = True
         inductorconfig.coordinate_descent_check_all_directions = True
 
@@ -336,7 +338,12 @@ def run(
     for block in predictor.model.image_encoder.blocks:
         block.attn.use_rel_pos = use_rel_pos
 
-    if compress == "dynamic_quant":
+    if compress == "autoquant":
+        example_input = torch.randn((batch_size, 3, 1024, 1024), dtype=use_half, device="cuda")
+        inductorconfig.force_fuse_int_mm_with_mul = True
+        inductorconfig.use_mixed_mm = True
+        torchao.autoquant(predictor.model.image_encoder, example_input, mode=["interpolate", .5])
+    elif compress == "dynamic_quant":
         from torchao.quantization import apply_dynamic_quant
         apply_dynamic_quant(predictor.model.image_encoder)
         inductorconfig.force_fuse_int_mm_with_mul = True

experiments/run.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
+SEGMENT_ANYTHING_FAST_USE_FLASH_4=0 python run_experiments.py 16 vit_h \
+    ~/local/pytorch ~/local/segment-anything ~/local/sam_data \
+    --run-experiments --local_fork_only \
+    --num-workers 32 --capture_output False

experiments/run_experiments.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def run_experiment(experiments_data,
                    extra_args=None,
                    print_header=False,
                    capture_output=True,
-                   limit=None,
+                   limit=1024,
                    profile_path=None,
                    profile_top=False,
                    memory_path=None):
@@ -181,6 +181,11 @@ def run(batch_size,
     rt("sparse", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=True, compress="sparse")
 
     if run_experiments:
+        # rexp("base", "local-fork", use_half="bfloat16")
+        # rexp("compile", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), print_header=print_header)
+        # rexp("int8", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="dynamic_quant")
+        rexp("autoquant", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="autoquant")
+        return
     if local_fork_only:
         rexp("fp32", "local-fork", print_header=print_header)
         rexp("bf16", "local-fork", use_half="bfloat16")

0 commit comments

Comments (0)