@@ -289,7 +289,7 @@ def run(
289
289
profile_top = False ,
290
290
memory_path = None ,
291
291
use_local_sam_fork = False ,
292
- use_compiler_settings = False ,
292
+ use_compiler_settings = True ,
293
293
):
294
294
from torch ._inductor import config as inductorconfig
295
295
inductorconfig .triton .unique_kernel_names = True
@@ -298,6 +298,7 @@ def run(
298
298
if use_compiler_settings :
299
299
# inductorconfig.fx_graph_cache = True # seems to slow performance
300
300
inductorconfig .epilogue_fusion = False
301
+ torch ._dynamo .config .automatic_dynamic_shapes = False
301
302
inductorconfig .coordinate_descent_tuning = True
302
303
inductorconfig .coordinate_descent_check_all_directions = True
303
304
@@ -336,7 +337,13 @@ def run(
336
337
for block in predictor .model .image_encoder .blocks :
337
338
block .attn .use_rel_pos = use_rel_pos
338
339
339
- if compress == "dynamic_quant" :
340
+ if compress == "auto_quant" :
341
+ from torchao .quantization .quant_api import do_autoquant
342
+ example_input = torch .randn ((16 , 3 , 1024 , 1024 ), dtype = use_half , device = "cuda" )
343
+ inductorconfig .force_fuse_int_mm_with_mul = True
344
+ inductorconfig .use_mixed_mm = True
345
+ do_autoquant (predictor .model .image_encoder , example_input )
346
+ elif compress == "dynamic_quant" :
340
347
from torchao .quantization import apply_dynamic_quant
341
348
apply_dynamic_quant (predictor .model .image_encoder )
342
349
inductorconfig .force_fuse_int_mm_with_mul = True
0 commit comments