diff --git a/experiments/eval_combo.py b/experiments/eval_combo.py index 30f6edc..5c7c9f7 100644 --- a/experiments/eval_combo.py +++ b/experiments/eval_combo.py @@ -5,6 +5,7 @@ from data import build_data, setup_coco_img_ids import math import segment_anything_fast +import torchao torch._dynamo.config.cache_size_limit = 50000 @@ -289,7 +290,7 @@ def run( profile_top=False, memory_path=None, use_local_sam_fork=False, - use_compiler_settings=False, + use_compiler_settings=True, ): from torch._inductor import config as inductorconfig inductorconfig.triton.unique_kernel_names = True @@ -298,6 +299,7 @@ def run( if use_compiler_settings: # inductorconfig.fx_graph_cache = True # seems to slow performance inductorconfig.epilogue_fusion = False + torch._dynamo.config.automatic_dynamic_shapes = False inductorconfig.coordinate_descent_tuning = True inductorconfig.coordinate_descent_check_all_directions = True @@ -336,7 +338,12 @@ def run( for block in predictor.model.image_encoder.blocks: block.attn.use_rel_pos = use_rel_pos - if compress == "dynamic_quant": + if compress == "autoquant": + example_input = torch.randn((batch_size, 3, 1024, 1024), dtype=use_half, device="cuda") + inductorconfig.force_fuse_int_mm_with_mul = True + inductorconfig.use_mixed_mm = True + torchao.autoquant(predictor.model.image_encoder, example_input, mode=["interpolate", .5]) + elif compress == "dynamic_quant": from torchao.quantization import apply_dynamic_quant apply_dynamic_quant(predictor.model.image_encoder) inductorconfig.force_fuse_int_mm_with_mul = True diff --git a/experiments/run.sh b/experiments/run.sh new file mode 100644 index 0000000..81bb987 --- /dev/null +++ b/experiments/run.sh @@ -0,0 +1,4 @@ +SEGMENT_ANYTHING_FAST_USE_FLASH_4=0 python run_experiments.py 16 vit_h \ + ~/local/pytorch ~/local/segment-anything ~/local/sam_data \ + --run-experiments --local_fork_only \ + --num-workers 32 --capture_output False diff --git a/experiments/run_experiments.py b/experiments/run_experiments.py index ece8f1b..4cbb689 100755 --- a/experiments/run_experiments.py +++ b/experiments/run_experiments.py @@ -42,7 +42,7 @@ def run_experiment(experiments_data, extra_args=None, print_header=False, capture_output=True, - limit=None, + limit=1024, profile_path=None, profile_top=False, memory_path=None): @@ -181,6 +181,11 @@ def run(batch_size, rt("sparse", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=True, compress="sparse") if run_experiments: + # rexp("base", "local-fork", use_half="bfloat16") + # rexp("compile", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), print_header=print_header) + # rexp("int8", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="dynamic_quant") + rexp("autoquant", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="autoquant") + return if local_fork_only: rexp("fp32", "local-fork", print_header=print_header) rexp("bf16", "local-fork", use_half="bfloat16")