@@ -125,14 +125,14 @@ typical autotuning session produces output similar to:
```
[0s] Starting DifferentialEvolutionSearch with population=40, generations=20, crossover_rate=0.8
- [20s] Initial population: failed=10 min=0.9677 mid=3.0013 max=22.1430 best=Config(block_sizes=[[64, 32], [32]], loop_orders=[[1, 0]], num_warps=2, num_stages=2, indexing='pointer', l2_grouping=1, use_yz_grid=False)
- [52s] Generation 2: replaced=16 min=0.7731 mid=1.7203 max=3.1227 best=Config(block_sizes=[[32, 128], [16]], loop_orders=[[0, 1]], num_warps=4, num_stages=4, indexing='block_ptr', l2_grouping=16)
- [85s] Generation 3: replaced=19 min=0.6256 mid=1.3916 max=2.7868 best=Config(block_sizes=[[64, 128], [16]], loop_orders=[[0, 1]], num_warps=4, num_stages=4, indexing='block_ptr', l2_grouping=16)
+ [20s] Initial population: failed=4 min=0.0266 mid=0.1577 max=1.2390 best=Config(block_sizes=[64, 32, 64], loop_orders=[[1, 0]], l2_groupings=[8], range_unroll_factors=[3, 1], range_warp_specializes=[True, False], range_num_stages=[1, 0], range_multi_buffers=[True, True], range_flattens=[None, False], num_warps=4, num_stages=7, indexing='block_ptr', pid_type='persistent_blocked')
+ [51s] Generation 2: replaced=17 min=0.0266 mid=0.0573 max=0.1331 best=Config(block_sizes=[64, 32, 64], loop_orders=[[1, 0]], l2_groupings=[8], range_unroll_factors=[3, 1], range_warp_specializes=[True, False], range_num_stages=[1, 0], range_multi_buffers=[True, True], range_flattens=[None, False], num_warps=4, num_stages=7, indexing='block_ptr', pid_type='persistent_blocked')
+ [88s] Generation 3: replaced=18 min=0.0225 mid=0.0389 max=0.1085 best=Config(block_sizes=[64, 64, 16], loop_orders=[[0, 1]], l2_groupings=[4], range_unroll_factors=[0, 1], range_warp_specializes=[None, None], range_num_stages=[0, 0], range_multi_buffers=[None, False], range_flattens=[None, None], num_warps=4, num_stages=6, indexing='pointer', pid_type='flat')
...
- [593s] Generation 19: replaced=7 min=0.6072 mid=0.6626 max=0.7496 best=Config(block_sizes=[[64, 128], [16]], loop_orders=[[1, 0]], num_warps=4, num_stages=3, indexing='block_ptr', l2_grouping=32)
- [593s] Autotuning complete in 593.1s after searching 1520 configs.
+ [586s] Generation 19: replaced=3 min=0.0184 mid=0.0225 max=0.0287 best=Config(block_sizes=[64, 64, 64], loop_orders=[[0, 1]], l2_groupings=[4], range_unroll_factors=[0, 1], range_warp_specializes=[None, False], range_num_stages=[0, 3], range_multi_buffers=[None, False], range_flattens=[None, None], num_warps=8, num_stages=6, indexing='block_ptr', pid_type='flat')
+ [586s] Autotuning complete in 586.6s after searching 1520 configs.
```

One can hardcode the best config and skip autotuning with:

```python
- @helion.kernel(config=helion.Config(block_sizes=[[64, 128], [16]], loop_orders=[[1, 0]], num_warps=4, num_stages=3, indexing='block_ptr', l2_grouping=32))
+ @helion.kernel(config=helion.Config(block_sizes=[64, 64, 64], loop_orders=[[0, 1]], l2_groupings=[4], range_unroll_factors=[0, 1], range_warp_specializes=[None, False], range_num_stages=[0, 3], range_multi_buffers=[None, False], range_flattens=[None, None], num_warps=8, num_stages=6, indexing='block_ptr', pid_type='flat'))
```
Because autotuning can be time-consuming (around 10 minutes in the above
@@ -141,26 +141,21 @@ autotuning to avoid repeated tuning:
```python
@helion.kernel(config=helion.Config(
-    block_sizes=[[64, 128], [16]],
-    loop_orders=[[1, 0]],
-    num_warps=4,
-    num_stages=3,
+    block_sizes=[64, 64, 64],
+    loop_orders=[[0, 1]],
+    l2_groupings=[4],
+    range_unroll_factors=[0, 1],
+    range_warp_specializes=[None, False],
+    range_num_stages=[0, 3],
+    range_multi_buffers=[None, False],
+    range_flattens=[None, None],
+    num_warps=8,
+    num_stages=6,
    indexing='block_ptr',
-    l2_grouping=32
+    pid_type='flat'
))
def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
-    m, k = x.size()
-    k2, n = y.size()
-    assert k == k2, f"size mismatch {k} != {k2}"
-    out = torch.empty([m, n], dtype=x.dtype, device=x.device)
-
-    for tile_m, tile_n in hl.tile([m, n]):
-        acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
-        for tile_k in hl.tile(k):
-            acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
-        out[tile_m, tile_n] = acc
-
-    return out
+    ...
```
This explicit configuration skips autotuning on subsequent runs.
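
Since `helion.Config` is built from plain keyword arguments, one way to keep tuned settings out of the source file is to round-trip those kwargs through JSON. The sketch below is not a built-in Helion feature: the file path, the `DEFAULT_KWARGS` values, and the `load_matmul_config` helper are illustrative assumptions.

```python
# Sketch only (not a Helion API): persist tuned Config kwargs as JSON and
# rebuild the Config at import time. The values shown in the tuned configs
# above (ints, nested lists, strings, booleans, None) all survive a JSON
# round-trip unchanged.
import json
from pathlib import Path

import helion

CONFIG_PATH = Path("matmul_config.json")  # hypothetical location

# Fallback kwargs used when no saved file exists; a subset of the config above.
DEFAULT_KWARGS = {"block_sizes": [64, 64, 64], "num_warps": 8, "num_stages": 6}


def load_matmul_config() -> helion.Config:
    """Load saved kwargs if present, otherwise fall back to the defaults."""
    if CONFIG_PATH.exists():
        return helion.Config(**json.loads(CONFIG_PATH.read_text()))
    return helion.Config(**DEFAULT_KWARGS)
```

The decorator then becomes `@helion.kernel(config=load_matmul_config())`, so retuning only requires regenerating the JSON file rather than editing the kernel source.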
@@ -174,18 +169,7 @@ a more lightweight autotuning process:
    helion.Config(block_sizes=[[64, 64], [32]], num_warps=8),
])
def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
-    m, k = x.size()
-    k2, n = y.size()
-    assert k == k2, f"size mismatch {k} != {k2}"
-    out = torch.empty([m, n], dtype=x.dtype, device=x.device)
-
-    for tile_m, tile_n in hl.tile([m, n]):
-        acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
-        for tile_k in hl.tile(k):
-            acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
-        out[tile_m, tile_n] = acc
-
-    return out
+    ...
```
In this case, Helion evaluates the provided configurations and selects the fastest one.
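
For context, the `helion.Config(...)` line in the hunk above is one entry of a list passed to the decorator's `configs=` argument. Reconstructed, the decorator has roughly this shape; the first `Config` entry is an illustrative placeholder (only the second appears in the diff), and the kernel body is elided as in the new docs.

```python
import torch

import helion


# Restricting the search space to a few hand-picked configs gives a much
# cheaper autotune: Helion benchmarks each entry and keeps the fastest.
@helion.kernel(configs=[
    helion.Config(block_sizes=[[32, 32], [16]], num_warps=4),  # hypothetical entry
    helion.Config(block_sizes=[[64, 64], [32]], num_warps=8),  # from the diff
])
def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
    ...
```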