From c44f0d0ab22d9bdf1ebf00b659a5b9108c8f63b7 Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Thu, 29 Feb 2024 22:44:33 -0800
Subject: [PATCH 1/4] testing autoquant
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:

improves throughput from 19.70 img/sec (dynamic_quant) to 19.76 img/sec (autoquant)

❯ one sh run.sh
  0%|                                                                                                                                                                                                              | 0/64 [00:00<?, ?it/s]/home/cdhernandez/local/pytorch/torch/nested/__init__.py:166: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at /home/cdhernandez/local/pytorch/aten/src/ATen/NestedTensorImpl.cpp:177.)
  return _nested.nested_tensor(
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [06:32<00:00,  6.14s/it]
sam_model_type,batch_size,memory(MiB),memory(%),img_s(avg),batch_ms(avg)/batch_size,mIoU,use_compile,use_half,compress,epilogue_fusion_first,use_compile_decoder,use_nested_tensor,use_rel_pos,pad_input_image_batch,num_workers,num_batches,num_images,profile_path,memory_path
vit_h,16,14532,17,18.861125832244333,53.01910442113876,0.5865236891447146,max-autotune,torch.bfloat16,None,False,False,True,True,True,32,64,1024,None,None
  0%|                                                                                                                                                                                                              | 0/64 [00:00<?, ?it/s]/home/cdhernandez/local/pytorch/torch/nested/__init__.py:166: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at /home/cdhernandez/local/pytorch/aten/src/ATen/NestedTensorImpl.cpp:177.)
  return _nested.nested_tensor(
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [07:08<00:00,  6.70s/it]
vit_h,16,14395,17,19.70834741975898,50.73992145061493,0.5875230894143607,max-autotune,torch.bfloat16,dynamic_quant,False,False,True,True,True,32,64,1024,None,None
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 3.850527899339795
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.3931088875979185
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.3931088875979185 3.190660197287798
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 4.768232116475701
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 3.8598313461989164
shape=(torch.Size([78400, 1280]), torch.Size([3840, 1280]), torch.Size([3840])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 1.4865157660096884
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.8800818361341953
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.8800818361341953 1.179535873234272
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 1.7427184619009497
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 1.4965661568567157
shape=(torch.Size([78400, 1280]), torch.Size([1280, 1280]), torch.Size([1280])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQFloatLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 4.215262923389673
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.661373794078827
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.661373794078827 3.485689079388976
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 5.220260447822511
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 4.2220821138471365
shape=(torch.Size([65536, 1280]), torch.Size([5120, 1280]), torch.Size([5120])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 4.666170105338097
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.113288130611181
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.113288130611181 2.626298717223108
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 4.855024302378297
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 4.674202110618353
shape=(torch.Size([65536, 5120]), torch.Size([1280, 5120]), torch.Size([1280])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 3.2269158866256475
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 3.7462301552295685
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 3.7462301552295685 2.6572815608233213
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 3.9978391956537966
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 3.2370124012231827
shape=(torch.Size([65536, 1280]), torch.Size([3840, 1280]), torch.Size([3840])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 1.2530277017503977
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.5717314090579748
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.5717314090579748 0.9894231799989939
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 1.5166664496064186
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 1.2606457574293017
shape=(torch.Size([65536, 1280]), torch.Size([1280, 1280]), torch.Size([1280])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQFloatLinearWeight'>
  0%|                                                                                                                                                                                                              | 0/64 [00:00<?, ?it/s]/home/cdhernandez/local/pytorch/torch/nested/__init__.py:166: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at /home/cdhernandez/local/pytorch/aten/src/ATen/NestedTensorImpl.cpp:177.)
  return _nested.nested_tensor(
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [02:15<00:00,  2.12s/it]
vit_h,16,14463,17,19.76190752324237,50.602402567863464,0.5875653903095147,max-autotune,torch.bfloat16,auto_quant,False,False,True,True,True,32,64,1024,None,None

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]
---
 experiments/eval_combo.py      | 11 +++++++++--
 experiments/run.sh             |  4 ++++
 experiments/run_experiments.py |  6 +++++-
 3 files changed, 18 insertions(+), 3 deletions(-)
 create mode 100644 experiments/run.sh
diff --git a/experiments/eval_combo.py b/experiments/eval_combo.py
index 30f6edc..46e8a49 100644
--- a/experiments/eval_combo.py
+++ b/experiments/eval_combo.py
@@ -289,7 +289,7 @@ def run(
     profile_top=False,
     memory_path=None,
     use_local_sam_fork=False,
-    use_compiler_settings=False,
+    use_compiler_settings=True,
 ):
     from torch._inductor import config as inductorconfig
     inductorconfig.triton.unique_kernel_names = True
@@ -298,6 +298,7 @@ def run(
     if use_compiler_settings:
         # inductorconfig.fx_graph_cache = True # seems to slow performance
         inductorconfig.epilogue_fusion = False
+        torch._dynamo.config.automatic_dynamic_shapes = False
         inductorconfig.coordinate_descent_tuning = True
         inductorconfig.coordinate_descent_check_all_directions = True
 
@@ -336,7 +337,13 @@ def run(
     for block in predictor.model.image_encoder.blocks:
         block.attn.use_rel_pos = use_rel_pos
 
-    if compress == "dynamic_quant":
+    if compress == "auto_quant":
+        from torchao.quantization.quant_api import do_autoquant
+        example_input = torch.randn((16, 3, 1024, 1024), dtype=use_half, device="cuda")
+        inductorconfig.force_fuse_int_mm_with_mul = True
+        inductorconfig.use_mixed_mm = True
+        do_autoquant(predictor.model.image_encoder, example_input)
+    elif compress == "dynamic_quant":
         from torchao.quantization import apply_dynamic_quant
         apply_dynamic_quant(predictor.model.image_encoder)
         inductorconfig.force_fuse_int_mm_with_mul = True
diff --git a/experiments/run.sh b/experiments/run.sh
new file mode 100644
index 0000000..81bb987
--- /dev/null
+++ b/experiments/run.sh
@@ -0,0 +1,4 @@
+SEGMENT_ANYTHING_FAST_USE_FLASH_4=0 python run_experiments.py 16 vit_h \
+    ~/local/pytorch ~/local/segment-anything ~/local/sam_data \
+    --run-experiments --local_fork_only \
+    --num-workers 32  --capture_output False
diff --git a/experiments/run_experiments.py b/experiments/run_experiments.py
index ece8f1b..2c77765 100755
--- a/experiments/run_experiments.py
+++ b/experiments/run_experiments.py
@@ -42,7 +42,7 @@ def run_experiment(experiments_data,
                    extra_args=None,
                    print_header=False,
                    capture_output=True,
-                   limit=None,
+                   limit=1024,
                    profile_path=None,
                    profile_top=False,
                    memory_path=None):
@@ -181,6 +181,10 @@ def run(batch_size,
         rt("sparse",         "local-fork",   use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=True, compress="sparse")
 
     if run_experiments:
+        rexp("compile",  "local-fork",     use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=(batch_size > 1), print_header=print_header)
+        rexp("int8",         "local-fork",   use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="dynamic_quant")
+        rexp("auto_quant",         "local-fork",   use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="auto_quant")
+        return
         if local_fork_only:
             rexp("fp32",     "local-fork",     print_header=print_header)
             rexp("bf16",     "local-fork",     use_half="bfloat16")

From 36d43be2e014f766282e80e40d96c5987064c68a Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Tue, 5 Mar 2024 15:47:19 -0800
Subject: [PATCH 2/4] Update on "testing autoquant"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:

improves throughput from 19.70 img/sec (dynamic_quant) to 19.76 img/sec (autoquant)

❯ one sh run.sh
  0%|                                                                                                                                                                                                              | 0/64 [00:00<?, ?it/s]/home/cdhernandez/local/pytorch/torch/nested/__init__.py:166: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at /home/cdhernandez/local/pytorch/aten/src/ATen/NestedTensorImpl.cpp:177.)
  return _nested.nested_tensor(
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [06:32<00:00,  6.14s/it]
sam_model_type,batch_size,memory(MiB),memory(%),img_s(avg),batch_ms(avg)/batch_size,mIoU,use_compile,use_half,compress,epilogue_fusion_first,use_compile_decoder,use_nested_tensor,use_rel_pos,pad_input_image_batch,num_workers,num_batches,num_images,profile_path,memory_path
vit_h,16,14532,17,18.861125832244333,53.01910442113876,0.5865236891447146,max-autotune,torch.bfloat16,None,False,False,True,True,True,32,64,1024,None,None
  0%|                                                                                                                                                                                                              | 0/64 [00:00<?, ?it/s]/home/cdhernandez/local/pytorch/torch/nested/__init__.py:166: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at /home/cdhernandez/local/pytorch/aten/src/ATen/NestedTensorImpl.cpp:177.)
  return _nested.nested_tensor(
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [07:08<00:00,  6.70s/it]
vit_h,16,14395,17,19.70834741975898,50.73992145061493,0.5875230894143607,max-autotune,torch.bfloat16,dynamic_quant,False,False,True,True,True,32,64,1024,None,None
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 3.850527899339795
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.3931088875979185
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.3931088875979185 3.190660197287798
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 4.768232116475701
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 3.8598313461989164
shape=(torch.Size([78400, 1280]), torch.Size([3840, 1280]), torch.Size([3840])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 1.4865157660096884
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.8800818361341953
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.8800818361341953 1.179535873234272
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 1.7427184619009497
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 1.4965661568567157
shape=(torch.Size([78400, 1280]), torch.Size([1280, 1280]), torch.Size([1280])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQFloatLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 4.215262923389673
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.661373794078827
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.661373794078827 3.485689079388976
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 5.220260447822511
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 4.2220821138471365
shape=(torch.Size([65536, 1280]), torch.Size([5120, 1280]), torch.Size([5120])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 4.666170105338097
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.113288130611181
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.113288130611181 2.626298717223108
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 4.855024302378297
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 4.674202110618353
shape=(torch.Size([65536, 5120]), torch.Size([1280, 5120]), torch.Size([1280])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 3.2269158866256475
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 3.7462301552295685
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 3.7462301552295685 2.6572815608233213
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 3.9978391956537966
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 3.2370124012231827
shape=(torch.Size([65536, 1280]), torch.Size([3840, 1280]), torch.Size([3840])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 1.2530277017503977
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.5717314090579748
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.5717314090579748 0.9894231799989939
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 1.5166664496064186
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 1.2606457574293017
shape=(torch.Size([65536, 1280]), torch.Size([1280, 1280]), torch.Size([1280])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQFloatLinearWeight'>
  0%|                                                                                                                                                                                                              | 0/64 [00:00<?, ?it/s]/home/cdhernandez/local/pytorch/torch/nested/__init__.py:166: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at /home/cdhernandez/local/pytorch/aten/src/ATen/NestedTensorImpl.cpp:177.)
  return _nested.nested_tensor(
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [02:15<00:00,  2.12s/it]
vit_h,16,14463,17,19.76190752324237,50.602402567863464,0.5875653903095147,max-autotune,torch.bfloat16,auto_quant,False,False,True,True,True,32,64,1024,None,None

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]
---
 experiments/run_experiments.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/experiments/run_experiments.py b/experiments/run_experiments.py
index 2c77765..7f02834 100755
--- a/experiments/run_experiments.py
+++ b/experiments/run_experiments.py
@@ -181,9 +181,9 @@ def run(batch_size,
         rt("sparse",         "local-fork",   use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=True, compress="sparse")
 
     if run_experiments:
-        rexp("compile",  "local-fork",     use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=(batch_size > 1), print_header=print_header)
+        # rexp("compile",  "local-fork",     use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=(batch_size > 1), print_header=print_header)
         rexp("int8",         "local-fork",   use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="dynamic_quant")
-        rexp("auto_quant",         "local-fork",   use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="auto_quant")
+        # rexp("auto_quant",         "local-fork",   use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="auto_quant")
         return
         if local_fork_only:
             rexp("fp32",     "local-fork",     print_header=print_header)

From b7e84395a0d2b1982eb4a1cf4cb01e7623053fb1 Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Tue, 19 Mar 2024 15:14:40 -0700
Subject: [PATCH 3/4] Update on "testing autoquant"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Summary:

improves throughput from 19.70 img/sec (dynamic_quant) to 19.76 img/sec (autoquant)

❯ one sh run.sh
  0%|                                                                                                                                                                                                              | 0/64 [00:00<?, ?it/s]/home/cdhernandez/local/pytorch/torch/nested/__init__.py:166: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at /home/cdhernandez/local/pytorch/aten/src/ATen/NestedTensorImpl.cpp:177.)
  return _nested.nested_tensor(
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [06:32<00:00,  6.14s/it]
sam_model_type,batch_size,memory(MiB),memory(%),img_s(avg),batch_ms(avg)/batch_size,mIoU,use_compile,use_half,compress,epilogue_fusion_first,use_compile_decoder,use_nested_tensor,use_rel_pos,pad_input_image_batch,num_workers,num_batches,num_images,profile_path,memory_path
vit_h,16,14532,17,18.861125832244333,53.01910442113876,0.5865236891447146,max-autotune,torch.bfloat16,None,False,False,True,True,True,32,64,1024,None,None
  0%|                                                                                                                                                                                                              | 0/64 [00:00<?, ?it/s]/home/cdhernandez/local/pytorch/torch/nested/__init__.py:166: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at /home/cdhernandez/local/pytorch/aten/src/ATen/NestedTensorImpl.cpp:177.)
  return _nested.nested_tensor(
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [07:08<00:00,  6.70s/it]
vit_h,16,14395,17,19.70834741975898,50.73992145061493,0.5875230894143607,max-autotune,torch.bfloat16,dynamic_quant,False,False,True,True,True,32,64,1024,None,None
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 3.850527899339795
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.3931088875979185
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.3931088875979185 3.190660197287798
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 4.768232116475701
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 3.8598313461989164
shape=(torch.Size([78400, 1280]), torch.Size([3840, 1280]), torch.Size([3840])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 1.4865157660096884
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.8800818361341953
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.8800818361341953 1.179535873234272
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 1.7427184619009497
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 1.4965661568567157
shape=(torch.Size([78400, 1280]), torch.Size([1280, 1280]), torch.Size([1280])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQFloatLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 4.215262923389673
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.661373794078827
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.661373794078827 3.485689079388976
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 5.220260447822511
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 4.2220821138471365
shape=(torch.Size([65536, 1280]), torch.Size([5120, 1280]), torch.Size([5120])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 4.666170105338097
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.113288130611181
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.113288130611181 2.626298717223108
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 4.855024302378297
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 4.674202110618353
shape=(torch.Size([65536, 5120]), torch.Size([1280, 5120]), torch.Size([1280])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 3.2269158866256475
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 3.7462301552295685
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 3.7462301552295685 2.6572815608233213
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 3.9978391956537966
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 3.2370124012231827
shape=(torch.Size([65536, 1280]), torch.Size([3840, 1280]), torch.Size([3840])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'>
<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 1.2530277017503977
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.5717314090579748
<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.5717314090579748 0.9894231799989939
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 1.5166664496064186
<class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 1.2606457574293017
shape=(torch.Size([65536, 1280]), torch.Size([1280, 1280]), torch.Size([1280])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQFloatLinearWeight'>
  0%|                                                                                                                                                                                                              | 0/64 [00:00<?, ?it/s]/home/cdhernandez/local/pytorch/torch/nested/__init__.py:166: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at /home/cdhernandez/local/pytorch/aten/src/ATen/NestedTensorImpl.cpp:177.)
  return _nested.nested_tensor(
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [02:15<00:00,  2.12s/it]
vit_h,16,14463,17,19.76190752324237,50.602402567863464,0.5875653903095147,max-autotune,torch.bfloat16,auto_quant,False,False,True,True,True,32,64,1024,None,None

Test Plan:

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]
---
 experiments/eval_combo.py      | 8 ++++----
 experiments/run_experiments.py | 5 +++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/experiments/eval_combo.py b/experiments/eval_combo.py
index 46e8a49..5c7c9f7 100644
--- a/experiments/eval_combo.py
+++ b/experiments/eval_combo.py
@@ -5,6 +5,7 @@
 from data import build_data, setup_coco_img_ids
 import math
 import segment_anything_fast
+import torchao
 
 torch._dynamo.config.cache_size_limit = 50000
 
@@ -337,12 +338,11 @@ def run(
     for block in predictor.model.image_encoder.blocks:
         block.attn.use_rel_pos = use_rel_pos
 
-    if compress == "auto_quant":
-        from torchao.quantization.quant_api import do_autoquant
-        example_input = torch.randn((16, 3, 1024, 1024), dtype=use_half, device="cuda")
+    if compress == "autoquant":
+        example_input = torch.randn((batch_size, 3, 1024, 1024), dtype=use_half, device="cuda")
         inductorconfig.force_fuse_int_mm_with_mul = True
         inductorconfig.use_mixed_mm = True
-        do_autoquant(predictor.model.image_encoder, example_input)
+        torchao.autoquant(predictor.model.image_encoder, example_input, mode=["interpolate", .5])
     elif compress == "dynamic_quant":
         from torchao.quantization import apply_dynamic_quant
         apply_dynamic_quant(predictor.model.image_encoder)
diff --git a/experiments/run_experiments.py b/experiments/run_experiments.py
index 7f02834..4cbb689 100755
--- a/experiments/run_experiments.py
+++ b/experiments/run_experiments.py
@@ -181,9 +181,10 @@ def run(batch_size,
         rt("sparse",         "local-fork",   use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=True, compress="sparse")
 
     if run_experiments:
+        # rexp("base",  "local-fork",     use_half="bfloat16")
         # rexp("compile",  "local-fork",     use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=(batch_size > 1), print_header=print_header)
-        rexp("int8",         "local-fork",   use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="dynamic_quant")
-        # rexp("auto_quant",         "local-fork",   use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="auto_quant")
+        # rexp("int8",         "local-fork",   use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="dynamic_quant")
+        rexp("autoquant",         "local-fork",   use_half="bfloat16",  use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="autoquant")
         return
         if local_fork_only:
             rexp("fp32",     "local-fork",     print_header=print_header)

From bbd94ac284e9a70e04a8ffac6d203d885a3f87b1 Mon Sep 17 00:00:00 2001
From: HDCharles <charlesdavidhernandez@gmail.com>
Date: Tue, 19 Mar 2024 15:14:47 -0700
Subject: [PATCH 4/4] Update on "testing autoquant"

Summary:

improves throughput from 19.70 img/sec (dynamic_quant) to 19.76 img/sec (autoquant)

Test Plan: sh run.sh

Reviewers:

Subscribers:

Tasks:

Tags:

[ghstack-poisoned]