Commit fc151cd

CuArrayBackend -> CUDABackend

1 parent: 9590be3

3 files changed: +16 -12 lines

src/CUDA.jl

Lines changed: 8 additions & 5 deletions
```diff
@@ -83,6 +83,11 @@ include("compiler/execution.jl")
 include("compiler/exceptions.jl")
 include("compiler/reflection.jl")

+# KernelAbstractions
+include("CUDAKernels.jl")
+import .CUDAKernels: CUDABackend, KA
+export CUDABackend
+
 # array implementation
 include("gpuarrays.jl")
 include("utilities.jl")
@@ -111,6 +116,9 @@ export CUBLAS, CUSPARSE, CUSOLVER, CUFFT, CURAND
 const has_cusolvermg = CUSOLVER.has_cusolvermg
 export has_cusolvermg

+# KA Backend Definition
+KA.get_backend(::CUSPARSE.AbstractCuSparseArray) = CUDABackend()
+
 # random depends on CURAND
 include("random.jl")

@@ -119,11 +127,6 @@ include("../lib/nvml/NVML.jl")
 const has_nvml = NVML.has_nvml
 export NVML, has_nvml

-# KernelAbstractions
-include("CUDAKernels.jl")
-import .CUDAKernels: CUDABackend
-export CUDABackend
-
 # StaticArrays is still a direct dependency, so directly include the extension
 include("../ext/StaticArraysExt.jl")

```
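The net effect in src/CUDA.jl: the KernelAbstractions backend is wired up earlier in the package, and `CUDABackend` remains exported. A minimal usage sketch, not part of this commit (the `scale!` kernel and array sizes are illustrative):

```julia
using CUDA, KernelAbstractions
const KA = KernelAbstractions

# an illustrative KA kernel, not from this commit
@kernel function scale!(y, @Const(x), a)
    i = @index(Global)
    @inbounds y[i] = a * x[i]
end

backend = CUDABackend()                  # exported by CUDA.jl itself after this change
x = KA.ones(backend, Float32, (1024,))   # KA.ones/KA.zeros methods live in src/CUDAKernels.jl
y = KA.zeros(backend, Float32, (1024,))
scale!(backend)(y, x, 2f0; ndrange=length(y))
KA.synchronize(backend)
```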

src/CUDAKernels.jl

Lines changed: 0 additions & 1 deletion
```diff
@@ -25,7 +25,6 @@ KA.zeros(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.zeros(T, dims)
 KA.ones(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.ones(T, dims)

 KA.get_backend(::CuArray) = CUDABackend()
-KA.get_backend(::CUSPARSE.AbstractCuSparseArray) = CUDABackend()
 KA.synchronize(::CUDABackend) = synchronize()

 Adapt.adapt_storage(::CUDABackend, a::Array) = Adapt.adapt(CuArray, a)
```
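The deleted method didn't vanish: it moved to src/CUDA.jl (see above), since src/CUDAKernels.jl is now included before the CUSPARSE submodule exists. A quick sketch of what these `KA.get_backend` methods buy you, assuming a working CUDA setup:

```julia
using CUDA, CUDA.CUSPARSE, KernelAbstractions
const KA = KernelAbstractions

A = CUDA.rand(Float32, 4, 4)
S = CuSparseMatrixCSR(A)   # dense-to-sparse conversion

KA.get_backend(A)   # CUDABackend(), via the method kept in this file
KA.get_backend(S)   # CUDABackend(), via the method relocated to src/CUDA.jl
```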

src/gpuarrays.jl

Lines changed: 8 additions & 6 deletions
```diff
@@ -1,20 +1,22 @@
 # GPUArrays.jl interface

-import KernelAbstractions
-import KernelAbstractions: Backend
-
 #
 # Device functionality
 #


 ## execution

-struct CuArrayBackend <: Backend end

-@inline function GPUArrays.launch_heuristic(::CuArrayBackend, f::F, args::Vararg{Any,N};
+@inline function GPUArrays.launch_heuristic(::CUDABackend, f::F, args::Vararg{Any,N};
                                             elements::Int, elements_per_thread::Int) where {F,N}
-    kernel = @cuda launch=false f(CuKernelContext(), args...)
+
+    ndrange, workgroupsize, iterspace, dynamic = KA.launch_config(obj, nothing,
+                                                                  nothing)
+
+    # this might not be the final context, since we may tune the workgroupsize
+    ctx = KA.mkcontext(obj, ndrange, iterspace)
+    kernel = @cuda launch=false f(ctx, args...)

     # launching many large blocks) lowers performance, as observed with broadcast, so cap
     # the block size if we don't have a grid-stride kernel (which would keep the grid small)
```
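`launch_heuristic` now builds a KernelAbstractions context instead of GPUArrays' old `CuKernelContext` (`KA.launch_config` and `KA.mkcontext` are KernelAbstractions internals; `obj`, a KA kernel object, is bound outside the lines shown). The `@cuda launch=false` idiom it rests on is public CUDA.jl API: compile the kernel without launching, then ask the occupancy API for a good configuration. A self-contained sketch of that pattern, with a hypothetical `vadd!` kernel:

```julia
using CUDA

# hypothetical device function, for illustration only
function vadd!(c, a, b)
    i = (blockIdx().x - 1) * blockDim().x + threadIdx().x
    if i <= length(c)
        @inbounds c[i] = a[i] + b[i]
    end
    return nothing
end

a = CUDA.rand(Float32, 10_000)
b = CUDA.rand(Float32, 10_000)
c = similar(a)

# compile without launching, then query the occupancy API for a launch configuration
kernel = @cuda launch=false vadd!(c, a, b)
config = launch_configuration(kernel.fun)
threads = min(length(c), config.threads)
blocks = cld(length(c), threads)
kernel(c, a, b; threads, blocks)
```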
