Skip to content

Commit 1ae9a9a

Browse files
committed
testing autoquant
Summary: improves throughput from 19.70 to 19.76 img/sec. Test Plan: sh run.sh. Reviewers: Subscribers: Tasks: Tags: ghstack-source-id: 9c6098e Pull Request resolved: #114
1 parent 387488b commit 1ae9a9a

File tree

3 files changed

+19
-3
lines changed

3 files changed

+19
-3
lines changed

experiments/eval_combo.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
 from data import build_data, setup_coco_img_ids
 import math
 import segment_anything_fast
+import torchao
 
 torch._dynamo.config.cache_size_limit = 50000
 
@@ -289,7 +290,7 @@ def run(
     profile_top=False,
     memory_path=None,
     use_local_sam_fork=False,
-    use_compiler_settings=False,
+    use_compiler_settings=True,
 ):
     from torch._inductor import config as inductorconfig
     inductorconfig.triton.unique_kernel_names = True
@@ -298,6 +299,7 @@ def run(
     if use_compiler_settings:
         # inductorconfig.fx_graph_cache = True # seems to slow performance
         inductorconfig.epilogue_fusion = False
+        torch._dynamo.config.automatic_dynamic_shapes = False
         inductorconfig.coordinate_descent_tuning = True
         inductorconfig.coordinate_descent_check_all_directions = True
 
@@ -336,7 +338,12 @@ def run(
     for block in predictor.model.image_encoder.blocks:
         block.attn.use_rel_pos = use_rel_pos
 
-    if compress == "dynamic_quant":
+    if compress == "autoquant":
+        example_input = torch.randn((batch_size, 3, 1024, 1024), dtype=use_half, device="cuda")
+        inductorconfig.force_fuse_int_mm_with_mul = True
+        inductorconfig.use_mixed_mm = True
+        torchao.autoquant(predictor.model.image_encoder, example_input, mode=["interpolate", .5])
+    elif compress == "dynamic_quant":
         from torchao.quantization import apply_dynamic_quant
         apply_dynamic_quant(predictor.model.image_encoder)
         inductorconfig.force_fuse_int_mm_with_mul = True

experiments/run.sh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
+SEGMENT_ANYTHING_FAST_USE_FLASH_4=0 python run_experiments.py 16 vit_h \
+    ~/local/pytorch ~/local/segment-anything ~/local/sam_data \
+    --run-experiments --local_fork_only \
+    --num-workers 32 --capture_output False

experiments/run_experiments.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ def run_experiment(experiments_data,
                    extra_args=None,
                    print_header=False,
                    capture_output=True,
-                   limit=None,
+                   limit=1024,
                    profile_path=None,
                    profile_top=False,
                    memory_path=None):
@@ -181,6 +181,11 @@ def run(batch_size,
     rt("sparse", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=True, compress="sparse")
 
     if run_experiments:
+        # rexp("base", "local-fork", use_half="bfloat16")
+        # rexp("compile", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), print_header=print_header)
+        # rexp("int8", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="dynamic_quant")
+        rexp("autoquant", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="autoquant")
+        return
     if local_fork_only:
         rexp("fp32", "local-fork", print_header=print_header)
         rexp("bf16", "local-fork", use_half="bfloat16")

0 commit comments

Comments (0)