From c44f0d0ab22d9bdf1ebf00b659a5b9108c8f63b7 Mon Sep 17 00:00:00 2001 From: HDCharles Date: Thu, 29 Feb 2024 22:44:33 -0800 Subject: [PATCH 1/4] testing autoquant MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: improves runtime by 19.70 -> 19.76 img/sec ❯ one sh run.sh 0%| | 0/64 [00:00 3.850527899339795 4.3931088875979185 4.3931088875979185 3.190660197287798 4.768232116475701 3.8598313461989164 shape=(torch.Size([78400, 1280]), torch.Size([3840, 1280]), torch.Size([3840])), dtype=torch.bfloat16, best_cls= 1.4865157660096884 1.8800818361341953 1.8800818361341953 1.179535873234272 1.7427184619009497 1.4965661568567157 shape=(torch.Size([78400, 1280]), torch.Size([1280, 1280]), torch.Size([1280])), dtype=torch.bfloat16, best_cls= 4.215262923389673 4.661373794078827 4.661373794078827 3.485689079388976 5.220260447822511 4.2220821138471365 shape=(torch.Size([65536, 1280]), torch.Size([5120, 1280]), torch.Size([5120])), dtype=torch.bfloat16, best_cls= 4.666170105338097 4.113288130611181 4.113288130611181 2.626298717223108 4.855024302378297 4.674202110618353 shape=(torch.Size([65536, 5120]), torch.Size([1280, 5120]), torch.Size([1280])), dtype=torch.bfloat16, best_cls= 3.2269158866256475 3.7462301552295685 3.7462301552295685 2.6572815608233213 3.9978391956537966 3.2370124012231827 shape=(torch.Size([65536, 1280]), torch.Size([3840, 1280]), torch.Size([3840])), dtype=torch.bfloat16, best_cls= 1.2530277017503977 1.5717314090579748 1.5717314090579748 0.9894231799989939 1.5166664496064186 1.2606457574293017 shape=(torch.Size([65536, 1280]), torch.Size([1280, 1280]), torch.Size([1280])), dtype=torch.bfloat16, best_cls= 0%| | 0/64 [00:00 1), print_header=print_header) + rexp("int8", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="dynamic_quant") + rexp("auto_quant", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="auto_quant") + return if local_fork_only: rexp("fp32", "local-fork", print_header=print_header) rexp("bf16", "local-fork", use_half="bfloat16") From 36d43be2e014f766282e80e40d96c5987064c68a Mon Sep 17 00:00:00 2001 From: HDCharles Date: Tue, 5 Mar 2024 15:47:19 -0800 Subject: [PATCH 2/4] Update on "testing autoquant" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: improves runtime by 19.70 -> 19.76 img/sec ❯ one sh run.sh 0%| | 0/64 [00:00 3.850527899339795 4.3931088875979185 4.3931088875979185 3.190660197287798 4.768232116475701 3.8598313461989164 shape=(torch.Size([78400, 1280]), torch.Size([3840, 1280]), torch.Size([3840])), dtype=torch.bfloat16, best_cls= 1.4865157660096884 1.8800818361341953 1.8800818361341953 1.179535873234272 1.7427184619009497 1.4965661568567157 shape=(torch.Size([78400, 1280]), torch.Size([1280, 1280]), torch.Size([1280])), dtype=torch.bfloat16, best_cls= 4.215262923389673 4.661373794078827 4.661373794078827 3.485689079388976 5.220260447822511 4.2220821138471365 shape=(torch.Size([65536, 1280]), torch.Size([5120, 1280]), torch.Size([5120])), dtype=torch.bfloat16, best_cls= 4.666170105338097 4.113288130611181 4.113288130611181 2.626298717223108 4.855024302378297 4.674202110618353 shape=(torch.Size([65536, 5120]), torch.Size([1280, 5120]), torch.Size([1280])), dtype=torch.bfloat16, best_cls= 3.2269158866256475 3.7462301552295685 3.7462301552295685 2.6572815608233213 3.9978391956537966 3.2370124012231827 shape=(torch.Size([65536, 1280]), torch.Size([3840, 1280]), torch.Size([3840])), dtype=torch.bfloat16, best_cls= 1.2530277017503977 1.5717314090579748 1.5717314090579748 0.9894231799989939 1.5166664496064186 1.2606457574293017 shape=(torch.Size([65536, 1280]), torch.Size([1280, 1280]), torch.Size([1280])), dtype=torch.bfloat16, best_cls= 0%| | 0/64 [00:00 1), print_header=print_header) + # rexp("compile", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), print_header=print_header) rexp("int8", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="dynamic_quant") - rexp("auto_quant", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="auto_quant") + # rexp("auto_quant", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="auto_quant") return if local_fork_only: rexp("fp32", "local-fork", print_header=print_header) From b7e84395a0d2b1982eb4a1cf4cb01e7623053fb1 Mon Sep 17 00:00:00 2001 From: HDCharles Date: Tue, 19 Mar 2024 15:14:40 -0700 Subject: [PATCH 3/4] Update on "testing autoquant" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: improves runtime by 19.70 -> 19.76 img/sec ❯ one sh run.sh 0%| | 0/64 [00:00 3.850527899339795 4.3931088875979185 4.3931088875979185 3.190660197287798 4.768232116475701 3.8598313461989164 shape=(torch.Size([78400, 1280]), torch.Size([3840, 1280]), torch.Size([3840])), dtype=torch.bfloat16, best_cls= 1.4865157660096884 1.8800818361341953 1.8800818361341953 1.179535873234272 1.7427184619009497 1.4965661568567157 shape=(torch.Size([78400, 1280]), torch.Size([1280, 1280]), torch.Size([1280])), dtype=torch.bfloat16, best_cls= 4.215262923389673 4.661373794078827 4.661373794078827 3.485689079388976 5.220260447822511 4.2220821138471365 shape=(torch.Size([65536, 1280]), torch.Size([5120, 1280]), torch.Size([5120])), dtype=torch.bfloat16, best_cls= 4.666170105338097 4.113288130611181 4.113288130611181 2.626298717223108 4.855024302378297 4.674202110618353 shape=(torch.Size([65536, 5120]), torch.Size([1280, 5120]), torch.Size([1280])), dtype=torch.bfloat16, best_cls= 3.2269158866256475 3.7462301552295685 3.7462301552295685 2.6572815608233213 3.9978391956537966 3.2370124012231827 shape=(torch.Size([65536, 1280]), torch.Size([3840, 1280]), torch.Size([3840])), dtype=torch.bfloat16, best_cls= 1.2530277017503977 1.5717314090579748 1.5717314090579748 0.9894231799989939 1.5166664496064186 1.2606457574293017 shape=(torch.Size([65536, 1280]), torch.Size([1280, 1280]), torch.Size([1280])), dtype=torch.bfloat16, best_cls= 0%| | 0/64 [00:00 1), print_header=print_header) - rexp("int8", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="dynamic_quant") - # rexp("auto_quant", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="auto_quant") + # rexp("int8", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="dynamic_quant") + rexp("autoquant", "local-fork", use_half="bfloat16", use_compile="max-autotune", use_nested_tensor=(batch_size > 1), compress="autoquant") return if local_fork_only: rexp("fp32", "local-fork", print_header=print_header) From bbd94ac284e9a70e04a8ffac6d203d885a3f87b1 Mon Sep 17 00:00:00 2001 From: HDCharles Date: Tue, 19 Mar 2024 15:14:47 -0700 Subject: [PATCH 4/4] Update on "testing autoquant" Summary: improves runtime by 19.70 -> 19.76 img/sec Test Plan: sh run.sh Reviewers: Subscribers: Tasks: Tags: [ghstack-poisoned]