
Commit fd03f9a

Roll our own launch configuration API.
1 parent e0303f2 commit fd03f9a

4 files changed: +30 -6 lines

src/compiler/execution.jl

Lines changed: 27 additions & 0 deletions
@@ -161,6 +161,33 @@ struct HostKernel{F,TT} <: AbstractKernel{F,TT}
     fun::ZeKernel
 end
 
+function launch_configuration(kernel::HostKernel{F,TT}) where {F,TT}
+    # XXX: have the user pass in a global size to clamp against
+    #      maxGroupSizeX/Y/Z?
+
+    # XXX: shrink until a multiple of preferredGroupSize?
+
+    # once the MAX_GROUP_SIZE extension is implemented, we can use it here
+    kernel_props = oneL0.properties(kernel.fun)
+    if kernel_props.maxGroupSize !== missing
+        return kernel_props.maxGroupSize
+    end
+
+    # otherwise, we'd use `zeKernelSuggestGroupSize`, but it has been observed
+    # to return really bad configs (JuliaGPU/oneAPI.jl#430)
+
+    # so instead, calculate it ourselves based on the device properties
+    dev = kernel.fun.mod.device
+    compute_props = oneL0.compute_properties(dev)
+    max_size = compute_props.maxTotalGroupSize
+    ## when the kernel uses many registers (which we can't query without
+    ## extensions that landed _after_ MAX_GROUP_SIZE, so don't bother),
+    ## the group size should be halved
+    group_size = max_size ÷ 2
+
+    return group_size
+end
+
 
 ## host-side API
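
For reference, this is roughly how the new API is consumed from the host side: compile with launch=false, query the configuration, then launch. A minimal sketch; the vadd kernel, the array sizes, and the cld-based group count are illustrative and not part of this commit:

    using oneAPI

    # illustrative element-wise kernel (not from this commit)
    function vadd(a, b, c)
        i = get_global_id()
        if i <= length(c)
            @inbounds c[i] = a[i] + b[i]
        end
        return
    end

    a = oneArray(rand(Float32, 4096))
    b = oneArray(rand(Float32, 4096))
    c = similar(a)

    kernel = @oneapi launch=false vadd(a, b, c)
    items = oneAPI.launch_configuration(kernel)  # group size from the code above
    groups = cld(length(c), items)               # enough groups to cover c
    kernel(a, b, c; items, groups)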

src/gpuarrays.jl

Lines changed: 1 addition & 3 deletions
@@ -16,9 +16,7 @@ struct oneKernelContext <: AbstractKernelContext end
                         elements::Int, elements_per_thread::Int) where {F,N}
     kernel = @oneapi launch=false f(oneKernelContext(), args...)
 
-    items = suggest_groupsize(kernel.fun, elements).x
-    # XXX: the z dimension of the suggested group size is often non-zero.
-    #      preserve this in GPUArrays?
+    items = launch_configuration(kernel)
     # XXX: how many groups is a good number? the API doesn't tell us.
     #      measured on a low-end IGP, 32 blocks seems like a good sweet spot.
     # note that this only matters for grid-stride kernels, like broadcast.
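
The group-count heuristic only matters because broadcast is a grid-stride kernel: each work-item strides over the whole range, so any group count produces correct results and the number only affects parallelism. A minimal sketch of the pattern, assuming oneAPI.jl's OpenCL-style work-item intrinsics; the kernel name is illustrative:

    # each work-item handles every stride-th element, so the launch covers
    # the full array no matter how many groups are chosen
    function gridstride_scale!(a, s)
        i = get_global_id()
        stride = get_global_size()
        while i <= length(a)
            @inbounds a[i] *= s
            i += stride
        end
        return
    end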

src/mapreduce.jl

Lines changed: 1 addition & 1 deletion
@@ -146,7 +146,7 @@ function GPUArrays.mapreducedim!(f::F, op::OP, R::oneWrappedArray{T},
     kernel_args = kernel_convert.(args)
     kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
     kernel = zefunction(partial_mapreduce_device, kernel_tt)
-    reduce_items = compute_items(suggest_groupsize(kernel.fun, wanted_items).x)
+    reduce_items = launch_configuration(kernel)
 
     # how many groups should we launch?
     #
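
Note that the old call passed wanted_items so the suggestion could be tailored to the problem size, while launch_configuration only reflects kernel and device limits (the clamping XXX in execution.jl hints at restoring that). A caller that needs the old behaviour could clamp manually; a hedged one-liner, not part of this commit:

    # clamp the device-wide group size against the problem size
    reduce_items = min(wanted_items, launch_configuration(kernel))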

src/oneAPIKernels.jl

Lines changed: 1 addition & 2 deletions
@@ -90,8 +90,7 @@ function (obj::KA.Kernel{oneAPIBackend})(args...; ndrange=nothing, workgroupsize
 
     # figure out the optimal workgroupsize automatically
     if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing
-        items = oneAPI.suggest_groupsize(kernel.fun, prod(ndrange)).x
-        # XXX: the z dimension of the suggested group size is often non-zero. use this?
+        items = oneAPI.launch_configuration(kernel)
         workgroupsize = threads_to_workgroupsize(items, ndrange)
     iterspace, dynamic = KA.partition(obj, ndrange, workgroupsize)
     ctx = KA.mkcontext(obj, ndrange, iterspace)
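
From a user's perspective, this branch is taken whenever a KernelAbstractions kernel is launched without an explicit workgroupsize. A minimal sketch of such a launch; the scale! kernel and the synchronize call are standard KernelAbstractions usage, not code from this commit:

    using KernelAbstractions, oneAPI

    @kernel function scale!(a, s)
        i = @index(Global)
        @inbounds a[i] *= s
    end

    a = oneArray(ones(Float32, 4096))
    backend = oneAPIBackend()

    # no workgroupsize given, so the code patched above derives one from
    # oneAPI.launch_configuration via threads_to_workgroupsize
    scale!(backend)(a, 2f0; ndrange=length(a))
    KernelAbstractions.synchronize(backend)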
