Skip to content

Commit 19bd380

Browse files
committed
Testing autoquant: switching compression from dynamic_quant to auto_quant improves runtime from 19.70 to 19.76 img/sec
Summary: improves runtime by 19.70 -> 19.76 img/sec ❯ one sh run.sh 0%| | 0/64 [00:00<?, ?it/s]/home/cdhernandez/local/pytorch/torch/nested/__init__.py:166: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at /home/cdhernandez/local/pytorch/aten/src/ATen/NestedTensorImpl.cpp:177.) return _nested.nested_tensor( 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [06:32<00:00, 6.14s/it] sam_model_type,batch_size,memory(MiB),memory(%),img_s(avg),batch_ms(avg)/batch_size,mIoU,use_compile,use_half,compress,epilogue_fusion_first,use_compile_decoder,use_nested_tensor,use_rel_pos,pad_input_image_batch,num_workers,num_batches,num_images,profile_path,memory_path vit_h,16,14532,17,18.861125832244333,53.01910442113876,0.5865236891447146,max-autotune,torch.bfloat16,None,False,False,True,True,True,32,64,1024,None,None 0%| | 0/64 [00:00<?, ?it/s]/home/cdhernandez/local/pytorch/torch/nested/__init__.py:166: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at /home/cdhernandez/local/pytorch/aten/src/ATen/NestedTensorImpl.cpp:177.) 
return _nested.nested_tensor( 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [07:08<00:00, 6.70s/it] vit_h,16,14395,17,19.70834741975898,50.73992145061493,0.5875230894143607,max-autotune,torch.bfloat16,dynamic_quant,False,False,True,True,True,32,64,1024,None,None <class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 3.850527899339795 <class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.3931088875979185 <class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.3931088875979185 3.190660197287798 <class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 4.768232116475701 <class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 3.8598313461989164 shape=(torch.Size([78400, 1280]), torch.Size([3840, 1280]), torch.Size([3840])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> <class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 1.4865157660096884 <class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.8800818361341953 <class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.8800818361341953 1.179535873234272 <class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 1.7427184619009497 <class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 1.4965661568567157 shape=(torch.Size([78400, 1280]), torch.Size([1280, 1280]), torch.Size([1280])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> <class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 4.215262923389673 <class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.661373794078827 <class 
'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.661373794078827 3.485689079388976 <class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 5.220260447822511 <class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 4.2220821138471365 shape=(torch.Size([65536, 1280]), torch.Size([5120, 1280]), torch.Size([5120])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> <class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 4.666170105338097 <class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.113288130611181 <class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 4.113288130611181 2.626298717223108 <class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 4.855024302378297 <class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 4.674202110618353 shape=(torch.Size([65536, 5120]), torch.Size([1280, 5120]), torch.Size([1280])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> <class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 3.2269158866256475 <class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 3.7462301552295685 <class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 3.7462301552295685 2.6572815608233213 <class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 3.9978391956537966 <class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 3.2370124012231827 shape=(torch.Size([65536, 1280]), torch.Size([3840, 1280]), torch.Size([3840])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> <class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 1.2530277017503977 <class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 
1.5717314090579748 <class 'torchao.quantization.autoquant.AQInt8DynamicallyQuantizedLinearWeight'> 1.5717314090579748 0.9894231799989939 <class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight'> 1.5166664496064186 <class 'torchao.quantization.autoquant.AQWeightOnlyQuantizedLinearWeight3'> 1.2606457574293017 shape=(torch.Size([65536, 1280]), torch.Size([1280, 1280]), torch.Size([1280])), dtype=torch.bfloat16, best_cls=<class 'torchao.quantization.autoquant.AQFloatLinearWeight'> 0%| | 0/64 [00:00<?, ?it/s]/home/cdhernandez/local/pytorch/torch/nested/__init__.py:166: UserWarning: The PyTorch API of nested tensors is in prototype stage and will change in the near future. (Triggered internally at /home/cdhernandez/local/pytorch/aten/src/ATen/NestedTensorImpl.cpp:177.) return _nested.nested_tensor( 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [02:15<00:00, 2.12s/it] vit_h,16,14463,17,19.76190752324237,50.602402567863464,0.5875653903095147,max-autotune,torch.bfloat16,auto_quant,False,False,True,True,True,32,64,1024,None,None Test Plan: Reviewers: Subscribers: Tasks: Tags: ghstack-source-id: a9134d6 Pull Request resolved: #114
1 parent 387488b commit 19bd380

File tree

3 files changed

+18
-3
lines changed

3 files changed

+18
-3
lines changed

experiments/eval_combo.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -289,7 +289,7 @@ def run(
289289
profile_top=False,
290290
memory_path=None,
291291
use_local_sam_fork=False,
292-
use_compiler_settings=False,
292+
use_compiler_settings=True,
293293
):
294294
from torch._inductor import config as inductorconfig
295295
inductorconfig.triton.unique_kernel_names = True
@@ -298,6 +298,7 @@ def run(
298298
if use_compiler_settings:
299299
# inductorconfig.fx_graph_cache = True # seems to slow performance
300300
inductorconfig.epilogue_fusion = False
301+
torch._dynamo.config.automatic_dynamic_shapes = False
301302
inductorconfig.coordinate_descent_tuning = True
302303
inductorconfig.coordinate_descent_check_all_directions = True
303304

@@ -336,7 +337,13 @@ def run(
336337
for block in predictor.model.image_encoder.blocks:
337338
block.attn.use_rel_pos = use_rel_pos
338339

339-
if compress == "dynamic_quant":
340+
if compress == "auto_quant":
341+
from torchao.quantization.quant_api import do_autoquant
342+
example_input = torch.randn((16, 3, 1024, 1024), dtype=use_half, device="cuda")
343+
inductorconfig.force_fuse_int_mm_with_mul = True
344+
inductorconfig.use_mixed_mm = True
345+
do_autoquant(predictor.model.image_encoder, example_input)
346+
elif compress == "dynamic_quant":
340347
from torchao.quantization import apply_dynamic_quant
341348
apply_dynamic_quant(predictor.model.image_encoder)
342349
inductorconfig.force_fuse_int_mm_with_mul = True

experiments/run.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
SEGMENT_ANYTHING_FAST_USE_FLASH_4=0 python run_experiments.py 16 vit_h \
2+
~/local/pytorch ~/local/segment-anything ~/local/sam_data \
3+
--run-experiments --local_fork_only \
4+
--num-workers 32 --capture_output False

experiments/run_experiments.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def run_experiment(experiments_data,
4242
extra_args=None,
4343
print_header=False,
4444
capture_output=True,
45-
limit=None,
45+
limit=1024,
4646
profile_path=None,
4747
profile_top=False,
4848
memory_path=None):
@@ -181,6 +181,10 @@ def run(batch_size,
181181
rt("sparse", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=True, compress="sparse")
182182

183183
if run_experiments:
184+
# rexp("compile", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), print_header=print_header)
185+
rexp("int8", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="dynamic_quant")
186+
# rexp("auto_quant", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="auto_quant")
187+
return
184188
if local_fork_only:
185189
rexp("fp32", "local-fork", print_header=print_header)
186190
rexp("bf16", "local-fork", use_half="bfloat16")

0 commit comments

Comments
 (0)