@@ -162,28 +162,27 @@ struct HostKernel{F,TT} <: AbstractKernel{F,TT}
162
162
end
163
163
164
164
function launch_configuration (kernel:: HostKernel{F,TT} ) where {F,TT}
165
- # XXX : have the user pass in a global size to clamp against
166
- # maxGroupSizeX/Y/Z?
167
-
168
- # XXX : shrink until a multiple of preferredGroupSize?
165
+ # Level Zero's zeKernelSuggestGroupSize provides a launch configuration
166
+ # that exactly covers the input size. This can result in very awkward
167
+ # configurations, so roll our own version that behaves like CUDA's
168
+ # occupancy API and assumes the kernel still does bounds checking.
169
169
170
170
# once the MAX_GROUP_SIZE extension is implemented, we can use it here
171
171
kernel_props = oneL0. properties (kernel. fun)
172
- if kernel_props. maxGroupSize != = missing
173
- return kernel_props. maxGroupSize
172
+ group_size = if kernel_props. maxGroupSize != = missing
173
+ kernel_props. maxGroupSize
174
+ else
175
+ dev = kernel. fun. mod. device
176
+ compute_props = oneL0. compute_properties (dev)
177
+ max_size = compute_props. maxTotalGroupSize
178
+
179
+ # # when the kernel uses many registers (which we can't query without
180
+ # # extensions that landed _after_ MAX_GROUP_SIZE, so don't bother)
181
+ # # the groupsize should be halved
182
+ group_size = max_size ÷ 2
174
183
end
175
184
176
- # otherwise, we'd use `zeKernelSuggestGroupSize` but it's been observed
177
- # to return really bad configs (JuliaGPU/oneAPI.jl#430)
178
-
179
- # so instead, calculate it ourselves based on the device properties
180
- dev = kernel. fun. mod. device
181
- compute_props = oneL0. compute_properties (dev)
182
- max_size = compute_props. maxTotalGroupSize
183
- # # when the kernel uses many registers (which we can't query without
184
- # # extensions that landed _after_ MAX_GROUP_SIZE, so don't bother)
185
- # # the groupsize should be halved
186
- group_size = max_size ÷ 2
185
+ # TODO : align the group size based on preferredGroupSize
187
186
188
187
return group_size
189
188
end
0 commit comments