@@ -27,3 +27,124 @@ This directory contains the implementation of blockwise quantization introduced
27
27
28
28
For detailed motivations and technical specifications, please refer to the original paper:
29
29
- [DeepSeek Blockwise Quantization Paper](https://arxiv.org/html/2412.19437v1)
30
+
31
+ ## Benchmarks
32
+
33
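+ Below are performance benchmarks comparing FP8 blockwise GEMM latency against an FP16 baseline on a single H100 GPU. The `blockwise_speedup` column is the FP16 latency divided by the blockwise latency, so values above 1 mean the blockwise FP8 kernel is faster.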
34
+ These benchmarks can be reproduced using this [benchmarking script](https://github.com/pytorch/ao/blob/main/benchmarks/benchmark_blockwise_scaled_linear_triton.py).
35
+
36
+ | m | k | n | block_size | dtype | fp16_latency (µs) | blockwise_latency (µs) | blockwise_speedup |
37
+ | -----:| ------:| ------:| -------------:| :--------------------| --------------------:| -------------------------:| --------------------:|
38
+ | 1 | 8192 | 8192 | 128 | torch.float8_e4m3fn | 83.744 | 52.224 | 1.60355 |
39
+ | 1 | 8192 | 10240 | 128 | torch.float8_e4m3fn | 99.52 | 61.12 | 1.62827 |
40
+ | 1 | 8192 | 57344 | 128 | torch.float8_e4m3fn | 436.608 | 234 | 1.86585 |
41
+ | 1 | 28672 | 8192 | 128 | torch.float8_e4m3fn | 233.568 | 131.168 | 1.78068 |
42
+ | 1 | 8192 | 8192 | 128 | torch.float8_e5m2 | 84.896 | 52.736 | 1.60983 |
43
+ | 1 | 8192 | 10240 | 128 | torch.float8_e5m2 | 100.224 | 60.96 | 1.64409 |
44
+ | 1 | 8192 | 57344 | 128 | torch.float8_e5m2 | 441.152 | 233.968 | 1.88552 |
45
+ | 1 | 28672 | 8192 | 128 | torch.float8_e5m2 | 233.28 | 130.816 | 1.78327 |
46
+ | 2 | 8192 | 8192 | 128 | torch.float8_e4m3fn | 83.392 | 53.664 | 1.55397 |
47
+ | 2 | 8192 | 10240 | 128 | torch.float8_e4m3fn | 100.192 | 61.632 | 1.62565 |
48
+ | 2 | 8192 | 57344 | 128 | torch.float8_e4m3fn | 432.384 | 233.664 | 1.85045 |
49
+ | 2 | 28672 | 8192 | 128 | torch.float8_e4m3fn | 233.648 | 133.6 | 1.74886 |
50
+ | 2 | 8192 | 8192 | 128 | torch.float8_e5m2 | 83.232 | 53.6 | 1.55284 |
51
+ | 2 | 8192 | 10240 | 128 | torch.float8_e5m2 | 100.608 | 61.664 | 1.63155 |
52
+ | 2 | 8192 | 57344 | 128 | torch.float8_e5m2 | 432.32 | 235.152 | 1.83847 |
53
+ | 2 | 28672 | 8192 | 128 | torch.float8_e5m2 | 233.824 | 136.256 | 1.71606 |
54
+ | 4 | 8192 | 8192 | 128 | torch.float8_e4m3fn | 84.16 | 52.928 | 1.59008 |
55
+ | 4 | 8192 | 10240 | 128 | torch.float8_e4m3fn | 100.544 | 61.728 | 1.62882 |
56
+ | 4 | 8192 | 57344 | 128 | torch.float8_e4m3fn | 432.768 | 234.944 | 1.842 |
57
+ | 4 | 28672 | 8192 | 128 | torch.float8_e4m3fn | 234.432 | 134.432 | 1.74387 |
58
+ | 4 | 8192 | 8192 | 128 | torch.float8_e5m2 | 83.872 | 53.408 | 1.5704 |
59
+ | 4 | 8192 | 10240 | 128 | torch.float8_e5m2 | 99.84 | 62.24 | 1.60411 |
60
+ | 4 | 8192 | 57344 | 128 | torch.float8_e5m2 | 433.376 | 238.272 | 1.81883 |
61
+ | 4 | 28672 | 8192 | 128 | torch.float8_e5m2 | 235.584 | 134.08 | 1.75704 |
62
+ | 8 | 8192 | 8192 | 128 | torch.float8_e4m3fn | 83.648 | 53.472 | 1.56433 |
63
+ | 8 | 8192 | 10240 | 128 | torch.float8_e4m3fn | 100.704 | 62.432 | 1.61302 |
64
+ | 8 | 8192 | 57344 | 128 | torch.float8_e4m3fn | 439.104 | 238.208 | 1.84336 |
65
+ | 8 | 28672 | 8192 | 128 | torch.float8_e4m3fn | 234.272 | 135.072 | 1.73442 |
66
+ | 8 | 8192 | 8192 | 128 | torch.float8_e5m2 | 84.128 | 53.728 | 1.56581 |
67
+ | 8 | 8192 | 10240 | 128 | torch.float8_e5m2 | 100.512 | 62.976 | 1.59604 |
68
+ | 8 | 8192 | 57344 | 128 | torch.float8_e5m2 | 439.36 | 238.496 | 1.84221 |
69
+ | 8 | 28672 | 8192 | 128 | torch.float8_e5m2 | 235.04 | 135.424 | 1.73559 |
70
+ | 16 | 8192 | 8192 | 128 | torch.float8_e4m3fn | 83.808 | 53.664 | 1.56172 |
71
+ | 16 | 8192 | 10240 | 128 | torch.float8_e4m3fn | 99.584 | 63.104 | 1.57809 |
72
+ | 16 | 8192 | 57344 | 128 | torch.float8_e4m3fn | 444 | 244.192 | 1.81824 |
73
+ | 16 | 28672 | 8192 | 128 | torch.float8_e4m3fn | 235.52 | 133.792 | 1.76034 |
74
+ | 16 | 8192 | 8192 | 128 | torch.float8_e5m2 | 83.488 | 53.568 | 1.55854 |
75
+ | 16 | 8192 | 10240 | 128 | torch.float8_e5m2 | 101.216 | 63.232 | 1.60071 |
76
+ | 16 | 8192 | 57344 | 128 | torch.float8_e5m2 | 444.608 | 245.936 | 1.80782 |
77
+ | 16 | 28672 | 8192 | 128 | torch.float8_e5m2 | 235.36 | 133.152 | 1.7676 |
78
+ | 32 | 8192 | 8192 | 128 | torch.float8_e4m3fn | 83.872 | 53.312 | 1.57323 |
79
+ | 32 | 8192 | 10240 | 128 | torch.float8_e4m3fn | 102.688 | 63.264 | 1.62317 |
80
+ | 32 | 8192 | 57344 | 128 | torch.float8_e4m3fn | 441.792 | 243.04 | 1.81777 |
81
+ | 32 | 28672 | 8192 | 128 | torch.float8_e4m3fn | 237.12 | 133.632 | 1.77443 |
82
+ | 32 | 8192 | 8192 | 128 | torch.float8_e5m2 | 86.08 | 53.216 | 1.61756 |
83
+ | 32 | 8192 | 10240 | 128 | torch.float8_e5m2 | 102.032 | 63.2 | 1.61443 |
84
+ | 32 | 8192 | 57344 | 128 | torch.float8_e5m2 | 439.168 | 245.184 | 1.79118 |
85
+ | 32 | 28672 | 8192 | 128 | torch.float8_e5m2 | 238.016 | 134.336 | 1.7718 |
86
+ | 64 | 8192 | 8192 | 128 | torch.float8_e4m3fn | 85.888 | 53.632 | 1.60143 |
87
+ | 64 | 8192 | 10240 | 128 | torch.float8_e4m3fn | 93.632 | 63.936 | 1.46446 |
88
+ | 64 | 8192 | 57344 | 128 | torch.float8_e4m3fn | 471.44 | 245.2 | 1.92268 |
89
+ | 64 | 28672 | 8192 | 128 | torch.float8_e4m3fn | 240 | 137.424 | 1.74642 |
90
+ | 64 | 8192 | 8192 | 128 | torch.float8_e5m2 | 85.984 | 54.016 | 1.59182 |
91
+ | 64 | 8192 | 10240 | 128 | torch.float8_e5m2 | 93.376 | 64.032 | 1.45827 |
92
+ | 64 | 8192 | 57344 | 128 | torch.float8_e5m2 | 471.36 | 244.576 | 1.92725 |
93
+ | 64 | 28672 | 8192 | 128 | torch.float8_e5m2 | 242.4 | 136.096 | 1.7811 |
94
+ | 128 | 8192 | 8192 | 128 | torch.float8_e4m3fn | 91.008 | 57.184 | 1.59149 |
95
+ | 128 | 8192 | 10240 | 128 | torch.float8_e4m3fn | 96.608 | 67.936 | 1.42204 |
96
+ | 128 | 8192 | 57344 | 128 | torch.float8_e4m3fn | 449.6 | 292.48 | 1.5372 |
97
+ | 128 | 28672 | 8192 | 128 | torch.float8_e4m3fn | 247.84 | 147.232 | 1.68333 |
98
+ | 128 | 8192 | 8192 | 128 | torch.float8_e5m2 | 89.152 | 57.248 | 1.55729 |
99
+ | 128 | 8192 | 10240 | 128 | torch.float8_e5m2 | 96.64 | 68.784 | 1.40498 |
100
+ | 128 | 8192 | 57344 | 128 | torch.float8_e5m2 | 450.048 | 284.16 | 1.58378 |
101
+ | 128 | 28672 | 8192 | 128 | torch.float8_e5m2 | 246.88 | 148.064 | 1.66739 |
102
+ | 256 | 8192 | 8192 | 128 | torch.float8_e4m3fn | 85.984 | 62.368 | 1.37866 |
103
+ | 256 | 8192 | 10240 | 128 | torch.float8_e4m3fn | 101.216 | 104.896 | 0.964918 |
104
+ | 256 | 8192 | 57344 | 128 | torch.float8_e4m3fn | 477.984 | 452.832 | 1.05554 |
105
+ | 256 | 28672 | 8192 | 128 | torch.float8_e4m3fn | 260.224 | 215.392 | 1.20814 |
106
+ | 256 | 8192 | 8192 | 128 | torch.float8_e5m2 | 86.432 | 62.048 | 1.39299 |
107
+ | 256 | 8192 | 10240 | 128 | torch.float8_e5m2 | 101.024 | 103.904 | 0.972282 |
108
+ | 256 | 8192 | 57344 | 128 | torch.float8_e5m2 | 475.568 | 433.792 | 1.0963 |
109
+ | 256 | 28672 | 8192 | 128 | torch.float8_e5m2 | 261.824 | 207.968 | 1.25896 |
110
+ | 512 | 8192 | 8192 | 128 | torch.float8_e4m3fn | 117.952 | 112.992 | 1.0439 |
111
+ | 512 | 8192 | 10240 | 128 | torch.float8_e4m3fn | 151.504 | 166.08 | 0.912235 |
112
+ | 512 | 8192 | 57344 | 128 | torch.float8_e4m3fn | 836.848 | 881.312 | 0.949548 |
113
+ | 512 | 28672 | 8192 | 128 | torch.float8_e4m3fn | 442.528 | 402.464 | 1.09955 |
114
+ | 512 | 8192 | 8192 | 128 | torch.float8_e5m2 | 121.184 | 114.592 | 1.05753 |
115
+ | 512 | 8192 | 10240 | 128 | torch.float8_e5m2 | 151.424 | 163.296 | 0.927298 |
116
+ | 512 | 8192 | 57344 | 128 | torch.float8_e5m2 | 837.312 | 873.664 | 0.958391 |
117
+ | 512 | 28672 | 8192 | 128 | torch.float8_e5m2 | 437.664 | 400.928 | 1.09163 |
118
+ | 1024 | 8192 | 8192 | 128 | torch.float8_e4m3fn | 227.008 | 224.384 | 1.01169 |
119
+ | 1024 | 8192 | 10240 | 128 | torch.float8_e4m3fn | 289.28 | 283.872 | 1.01905 |
120
+ | 1024 | 8192 | 57344 | 128 | torch.float8_e4m3fn | 1672.13 | 1673.34 | 0.999273 |
121
+ | 1024 | 28672 | 8192 | 128 | torch.float8_e4m3fn | 800 | 769.152 | 1.04011 |
122
+ | 1024 | 8192 | 8192 | 128 | torch.float8_e5m2 | 224.48 | 223.456 | 1.00458 |
123
+ | 1024 | 8192 | 10240 | 128 | torch.float8_e5m2 | 289.408 | 283.424 | 1.02111 |
124
+ | 1024 | 8192 | 57344 | 128 | torch.float8_e5m2 | 1649.58 | 1626.88 | 1.01396 |
125
+ | 1024 | 28672 | 8192 | 128 | torch.float8_e5m2 | 805.392 | 768.416 | 1.04812 |
126
+ | 2048 | 8192 | 8192 | 128 | torch.float8_e4m3fn | 449.344 | 458.272 | 0.980518 |
127
+ | 2048 | 8192 | 10240 | 128 | torch.float8_e4m3fn | 569.888 | 586.224 | 0.972134 |
128
+ | 2048 | 8192 | 57344 | 128 | torch.float8_e4m3fn | 3275.84 | 3251.9 | 1.00736 |
129
+ | 2048 | 28672 | 8192 | 128 | torch.float8_e4m3fn | 1614.37 | 1555.68 | 1.03772 |
130
+ | 2048 | 8192 | 8192 | 128 | torch.float8_e5m2 | 450.624 | 461.712 | 0.975985 |
131
+ | 2048 | 8192 | 10240 | 128 | torch.float8_e5m2 | 575.36 | 582.016 | 0.988564 |
132
+ | 2048 | 8192 | 57344 | 128 | torch.float8_e5m2 | 3363.3 | 3213.31 | 1.04668 |
133
+ | 2048 | 28672 | 8192 | 128 | torch.float8_e5m2 | 1574.32 | 1525.66 | 1.03189 |
134
+ | 4096 | 8192 | 8192 | 128 | torch.float8_e4m3fn | 915.216 | 964.592 | 0.948812 |
135
+ | 4096 | 8192 | 10240 | 128 | torch.float8_e4m3fn | 1157.18 | 1196.42 | 0.967209 |
136
+ | 4096 | 8192 | 57344 | 128 | torch.float8_e4m3fn | 6409.98 | 6638.3 | 0.965606 |
137
+ | 4096 | 28672 | 8192 | 128 | torch.float8_e4m3fn | 3173.76 | 3247.23 | 0.977374 |
138
+ | 4096 | 8192 | 8192 | 128 | torch.float8_e5m2 | 898.432 | 949.36 | 0.946355 |
139
+ | 4096 | 8192 | 10240 | 128 | torch.float8_e5m2 | 1170.62 | 1188.45 | 0.985002 |
140
+ | 4096 | 8192 | 57344 | 128 | torch.float8_e5m2 | 6751.25 | 6573.71 | 1.02701 |
141
+ | 4096 | 28672 | 8192 | 128 | torch.float8_e5m2 | 3155.9 | 3179.38 | 0.992617 |
142
+ | 8192 | 8192 | 8192 | 128 | torch.float8_e4m3fn | 1868.64 | 2022.27 | 0.92403 |
143
+ | 8192 | 8192 | 10240 | 128 | torch.float8_e4m3fn | 2336.26 | 2621.18 | 0.891298 |
144
+ | 8192 | 8192 | 57344 | 128 | torch.float8_e4m3fn | 13004 | 13990.6 | 0.929482 |
145
+ | 8192 | 28672 | 8192 | 128 | torch.float8_e4m3fn | 6781.49 | 6722.82 | 1.00873 |
146
+ | 8192 | 8192 | 8192 | 128 | torch.float8_e5m2 | 1865.25 | 1983.23 | 0.940509 |
147
+ | 8192 | 8192 | 10240 | 128 | torch.float8_e5m2 | 2296.66 | 2523.1 | 0.91025 |
148
+ | 8192 | 8192 | 57344 | 128 | torch.float8_e5m2 | 13170.9 | 14029.6 | 0.938792 |
149
+ | 8192 | 28672 | 8192 | 128 | torch.float8_e5m2 | 6688.51 | 6699.65 | 0.998338 |
150
+
0 commit comments