|
| 1 | +# kernel execution |
| 2 | + |
# Suggest how many threads and blocks `kernel`, invoked with `args`, should be launched
# with on `backend` in order to fully saturate the GPU.
#
# `elements` is the total number of elements that need to be processed, while
# `elements_per_thread` is the number of elements this kernel can process per thread
# (more than 1 for e.g. a grid-stride kernel, 1 otherwise).
#
# This generic fallback ignores all of its inputs and returns a fixed configuration.
# Back-ends should specialize it, ideally using an API for maximizing the occupancy of
# the launch configuration (like CUDA's occupancy API).
function launch_heuristic(backend::Backend, kernel, args...;
                          elements::Int, elements_per_thread::Int)
    threads, blocks = 256, 32
    return (; threads, blocks)
end
| 15 | + |
# Determine how many threads and blocks to actually launch for `elements` elements, using
# `heuristic` (any object with `threads` and `blocks` properties) as upper limits.
#
# Returns a named tuple `(; threads, blocks, elements_per_thread)`. The returned
# `elements_per_thread` is always 1 unless the caller indicated, by passing
# `elements_per_thread > 1`, that the kernel can handle several elements per thread *and*
# one element per thread would need more blocks than the heuristic suggests.
function launch_configuration(backend::Backend, heuristic;
                              elements::Int, elements_per_thread::Int)
    # never use more threads per block than there are elements, but always at least one
    threads = clamp(elements, 1, heuristic.threads)
    # number of blocks needed when every thread processes exactly one element
    needed = max(cld(elements, threads), 1)

    # one element per thread is enough, or it is all the kernel supports
    if elements_per_thread <= 1 || needed <= heuristic.blocks
        return (; threads, blocks=needed, elements_per_thread=1)
    end

    # covering the input one element per thread would take more blocks than the heuristic
    # suggests, so let each thread process several elements instead
    ## try to stick to the number of blocks that the heuristic suggested
    blocks = heuristic.blocks
    per_thread = cld(elements, blocks * threads)
    ## only bump the number of blocks if the per-thread limit would otherwise be exceeded
    if per_thread > elements_per_thread
        per_thread = elements_per_thread
        blocks = cld(elements, per_thread * threads)
    end
    return (; threads, blocks, elements_per_thread=per_thread)
end
0 commit comments