
Commit 1e46794

vchuravy and leios authored and committed
Transition GPUArrays to KernelAbstractions
1 parent 693a4d3 commit 1e46794

File tree

9 files changed (+80, -177 lines)


Project.toml

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ version = "10.0.2"
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"

src/GPUArrays.jl

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ include("device/indexing.jl")
 include("device/memory.jl")
 include("device/synchronization.jl")

+using KernelAbstractions
 # host abstractions
 include("host/abstractarray.jl")
 include("host/construction.jl")

src/device/execution.jl

Lines changed: 6 additions & 108 deletions
@@ -1,110 +1,8 @@
 # kernel execution

-export AbstractGPUBackend, AbstractKernelContext, gpu_call
-
-abstract type AbstractGPUBackend end
-
-abstract type AbstractKernelContext end
-
-import GPUArraysCore: backend
-
-"""
-    gpu_call(kernel::Function, arg0, args...; kwargs...)
-
-Executes `kernel` on the device that backs `arg` (see [`backend`](@ref)), passing along any
-arguments `args`. Additionally, the kernel will be passed the kernel execution context (see
-[`AbstractKernelContext`]), so its signature should be `(ctx::AbstractKernelContext, arg0,
-args...)`.
-
-The keyword arguments `kwargs` are not passed to the function, but are interpreted on the
-host to influence how the kernel is executed. The following keyword arguments are supported:
-
-- `target::AbstractArray`: specify which array object to use for determining execution
-  properties (defaults to the first argument `arg0`).
-- `elements::Int`: how many elements will be processed by this kernel. In most
-  circumstances, this will correspond to the total number of threads that needs to be
-  launched, unless the kernel supports a variable number of elements to process per
-  iteration. Defaults to the length of `arg0` if no other keyword arguments that influence
-  the launch configuration are specified.
-- `threads::Int` and `blocks::Int`: configure exactly how many threads and blocks are
-  launched. This cannot be used in combination with the `elements` argument.
-- `name::String`: inform the back end about the name of the kernel to be executed. This can
-  be used to emit better diagnostics, and is useful with anonymous kernels.
-"""
-function gpu_call(kernel::F, args::Vararg{Any,N};
-                  target::AbstractArray=first(args),
-                  elements::Union{Int,Nothing}=nothing,
-                  threads::Union{Int,Nothing}=nothing,
-                  blocks::Union{Int,Nothing}=nothing,
-                  name::Union{String,Nothing}=nothing) where {F,N}
-    # non-trivial default values for launch configuration
-    if elements===nothing && threads===nothing && blocks===nothing
-        elements = length(target)
-    elseif elements===nothing
-        if threads === nothing
-            threads = 1
-        end
-        if blocks === nothing
-            blocks = 1
-        end
-    elseif threads!==nothing || blocks!==nothing
-        error("Cannot specify both elements and threads/blocks configuration")
-    end
-
-    # the number of elements to process needs to be passed to the kernel somehow, so there's
-    # no easy way to do this without passing additional arguments or changing the context.
-    # both are expensive, so require manual use of `launch_heuristic` for those kernels.
-    elements_per_thread = 1
-
-    if elements !== nothing
-        @assert elements > 0
-        heuristic = launch_heuristic(backend(target), kernel, args...;
-                                     elements, elements_per_thread)
-        config = launch_configuration(backend(target), heuristic;
-                                      elements, elements_per_thread)
-        gpu_call(backend(target), kernel, args, config.threads, config.blocks; name=name)
-    else
-        @assert threads > 0
-        @assert blocks > 0
-        gpu_call(backend(target), kernel, args, threads, blocks; name=name)
-    end
-end
-
-# how many threads and blocks `kernel` needs to be launched with, passing arguments `args`,
-# to fully saturate the GPU. `elements` indicates the number of elements that needs to be
-# processed, while `elements_per_threads` indicates the number of elements this kernel can
-# process (i.e. if it's a grid-stride kernel, or 1 if otherwise).
-#
-# this heuristic should be specialized for the back-end, ideally using an API for maximizing
-# the occupancy of the launch configuration (like CUDA's occupancy API).
-function launch_heuristic(backend::AbstractGPUBackend, kernel, args...;
-                          elements::Int, elements_per_thread::Int)
-    return (threads=256, blocks=32)
-end
-
-# determine how many threads and blocks to actually launch given upper limits.
-# returns a tuple of blocks, threads, and elements_per_thread (which is always 1
-# unless specified that the kernel can handle a number of elements per thread)
-function launch_configuration(backend::AbstractGPUBackend, heuristic;
-                              elements::Int, elements_per_thread::Int)
-    threads = clamp(elements, 1, heuristic.threads)
-    blocks = max(cld(elements, threads), 1)
-
-    if elements_per_thread > 1 && blocks > heuristic.blocks
-        # we want to launch more blocks than required, so prefer a grid-stride loop instead
-        ## try to stick to the number of blocks that the heuristic suggested
-        blocks = heuristic.blocks
-        nelem = cld(elements, blocks*threads)
-        ## only bump the number of blocks if we really need to
-        if nelem > elements_per_thread
-            nelem = elements_per_thread
-            blocks = cld(elements, nelem*threads)
-        end
-        (; threads, blocks, elements_per_thread=nelem)
-    else
-        (; threads, blocks, elements_per_thread=1)
-    end
-end
-
-gpu_call(backend::AbstractGPUBackend, kernel, args, threads::Int, blocks::Int; kwargs...) =
-    error("Not implemented") # COV_EXCL_LINE
+# TODO:
+#  - Rename KA device to backend
+#  - Who owns `AbstractGPUBackend`?
+#    a; KernelAbstractions
+#    b; GPUArraysCore
+backend(a) = KernelAbstractions.get_device(a)
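
For orientation (illustrative, not part of this diff): the old `gpu_call(kernel, args...; elements=n)` entry point is replaced by instantiating a KernelAbstractions kernel for an array's backend and launching it with an `ndrange`. Below is a minimal sketch of that pattern on the CPU backend, using a hypothetical `sketch_fill!` kernel; details such as `get_device` vs. `get_backend` and the synchronization model depend on the KernelAbstractions version in use.

using KernelAbstractions

# Hypothetical kernel: @kernel/@index replace the old
# (ctx::AbstractKernelContext, args...) signature and linear_index(ctx).
@kernel function sketch_fill!(a, val)
    i = @index(Global, Linear)
    @inbounds a[i] = val
end

A = zeros(Float32, 1024)
kernel = sketch_fill!(CPU())         # host code picks a backend; for GPU arrays this
                                     # commit would use `backend(A)` instead of CPU()
kernel(A, 1f0; ndrange=length(A))    # `ndrange` replaces the old `elements` keyword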

src/host/abstractarray.jl

Lines changed: 10 additions & 18 deletions
@@ -173,13 +173,12 @@ for (D, S) in ((AnyGPUArray, Array),
 end

 # kernel-based variant for copying between wrapped GPU arrays
-
-function linear_copy_kernel!(ctx::AbstractKernelContext, dest, dstart, src, sstart, n)
-    i = linear_index(ctx)-1
+# TODO: Add `@Const` to `src`
+@kernel function linear_copy_kernel!(dest, dstart, src, sstart, n)
+    i = @index(Global, Linear) - 1
     if i < n
         @inbounds dest[dstart+i] = src[sstart+i]
     end
-    return
 end

 function Base.copyto!(dest::AnyGPUArray, dstart::Integer,
@@ -189,10 +188,8 @@ function Base.copyto!(dest::AnyGPUArray, dstart::Integer,
     destinds, srcinds = LinearIndices(dest), LinearIndices(src)
     (checkbounds(Bool, destinds, dstart) && checkbounds(Bool, destinds, dstart+n-1)) || throw(BoundsError(dest, dstart:dstart+n-1))
     (checkbounds(Bool, srcinds, sstart) && checkbounds(Bool, srcinds, sstart+n-1)) || throw(BoundsError(src, sstart:sstart+n-1))
-
-    gpu_call(linear_copy_kernel!,
-             dest, dstart, src, sstart, n;
-             elements=n)
+    kernel = linear_copy_kernel!(backend(dest))
+    kernel(dest, dstart, src, sstart, n; ndrange=elements)
     return dest
 end

@@ -242,13 +239,9 @@ end

 ## generalized blocks of heterogeneous memory

-function cartesian_copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets, shape, length)
-    i = linear_index(ctx)
-    if i <= length
-        idx = CartesianIndices(shape)[i]
-        @inbounds dest[idx + dest_offsets] = src[idx + src_offsets]
-    end
-    return
+@kernel function cartesian_copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets)
+    I = @index(Global, Cartesian)
+    @inbounds dest[I + dest_offsets] = src[I + src_offsets]
 end

 function Base.copyto!(dest::AnyGPUArray{<:Any, N}, destcrange::CartesianIndices{N},
@@ -262,9 +255,8 @@ function Base.copyto!(dest::AnyGPUArray{<:Any, N}, destcrange::CartesianIndices{

     dest_offsets = first(destcrange) - oneunit(CartesianIndex{N})
     src_offsets = first(srccrange) - oneunit(CartesianIndex{N})
-    gpu_call(cartesian_copy_kernel!,
-             dest, dest_offsets, src, src_offsets, shape, len;
-             elements=len)
+    kernel = cartesian_copy_kernel!(backend(dest))
+    kernel(dest, dest_offsets, src, src_offsets; ndrange=shape)
     dest
 end

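
Illustrative sketch (not part of this diff) of the offset-copy idiom used by `cartesian_copy_kernel!`: the launch's `ndrange` supplies the Cartesian iteration space, so the explicit bounds check and `CartesianIndices(shape)[i]` lookup from the old kernel are no longer needed. The names below are hypothetical and the example runs on the CPU backend.

using KernelAbstractions

@kernel function sketch_copy!(dest, dest_off, src, src_off)
    I = @index(Global, Cartesian)            # index within the launch's ndrange
    @inbounds dest[I + dest_off] = src[I + src_off]
end

src  = reshape(collect(1.0:16.0), 4, 4)
dest = zeros(4, 4)
kernel = sketch_copy!(CPU())
# copy src[1:2, 1:2] into dest[2:3, 2:3]
kernel(dest, CartesianIndex(1, 1), src, CartesianIndex(0, 0); ndrange=(2, 2))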

src/host/base.jl

Lines changed: 10 additions & 10 deletions
@@ -26,14 +26,13 @@ end
 # benchmark faster by having fewer read operations and avoiding the costly division
 # operation. Additionally, when repeating over the trailing dimension. `inner=(ones..., n)`,
 # data access can be contiguous during both the read and write operations.
-function repeat_inner_src_kernel!(
-    ctx::AbstractKernelContext,
+@kernel function repeat_inner_src_kernel!(
     xs::AbstractArray{<:Any, N},
     inner::NTuple{N, Int},
     out::AbstractArray{<:Any, N}
 ) where {N}
     # Get single element from src
-    idx = @cartesianidx xs
+    idx = @index(Global, Cartesian)
     @inbounds val = xs[idx]

     # Loop over "repeat" indices of inner
@@ -44,7 +43,6 @@ function repeat_inner_src_kernel!(
         end
         @inbounds out[CartesianIndex(odx)] = val
     end
-    return nothing
 end

 function repeat_inner(xs::AnyGPUArray, inner)
@@ -64,23 +62,24 @@ function repeat_inner(xs::AnyGPUArray, inner)
     # relevant benchmarks.
     if argmax(inner) == firstindex(inner)
         # Parallelize over the destination array
-        gpu_call(repeat_inner_dst_kernel!, xs, inner, out; elements=prod(size(out)))
+        kernel = repeat_inner_dst_kernel!(backend(out))
+        kernel(xs, inner, out; ndrange=size(out))
     else
         # Parallelize over the source array
-        gpu_call(repeat_inner_src_kernel!, xs, inner, out; elements=prod(size(xs)))
+        kernel = repeat_inner_src_kernel!(backend(xs))
+        kernel(xs, inner, out; ndrange=size(xs))
     end
     return out
 end

-function repeat_outer_kernel!(
-    ctx::AbstractKernelContext,
+@kernel function repeat_outer_kernel!(
     xs::AbstractArray{<:Any, N},
     xssize::NTuple{N},
     outer::NTuple{N},
     out::AbstractArray{<:Any, N}
 ) where {N}
     # Get index to input element
-    idx = @cartesianidx xs
+    idx = @index(Global, Cartesian)
     @inbounds val = xs[idx]

     # Loop over repeat indices, copying val to out
@@ -98,7 +97,8 @@ end
 function repeat_outer(xs::AnyGPUArray, outer)
     out = similar(xs, eltype(xs), outer .* size(xs))
     any(==(0), size(out)) && return out # consistent with `Base.repeat`
-    gpu_call(repeat_outer_kernel!, xs, size(xs), outer, out; elements=length(xs))
+    kernel = repeat_outer_kernel!(backend(xs))
+    kernel(xs, size(xs), outer, out; ndrange=size(xs))
     return out
 end

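
For reference, the `inner`/`outer` semantics these kernels reproduce on the device match `Base.repeat` on the host: `inner` duplicates each element along a dimension, `outer` tiles the whole array. A small host-side example:

A = [1 2; 3 4]
repeat(A, inner=(2, 1))   # each row duplicated:  [1 2; 1 2; 3 4; 3 4]
repeat(A, outer=(1, 2))   # whole array tiled:    [1 2 1 2; 3 4 3 4]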

src/host/construction.jl

Lines changed: 12 additions & 9 deletions
@@ -11,29 +11,30 @@ Base.convert(::Type{T}, a::AbstractArray) where {T<:AbstractGPUArray} = a isa T

 function Base.fill!(A::AnyGPUArray{T}, x) where T
     length(A) == 0 && return A
-    gpu_call(A, convert(T, x)) do ctx, a, val
-        idx = @linearidx(a)
+    @kernel fill!(a, val)
+        idx = @index(Linear, Global)
         @inbounds a[idx] = val
-        return
     end
+    kernel = fill!(backend(A))
+    kernel(A, x)
     A
 end


 ## identity matrices

-function identity_kernel(ctx::AbstractKernelContext, res::AbstractArray{T}, stride, val) where T
-    i = linear_index(ctx)
+@kernel function identity_kernel(ctx::AbstractKernelContext, res::AbstractArray{T}, stride, val) where T
+    i = @index(Global, Linear)
     ilin = (stride * (i - 1)) + i
     ilin > length(res) && return
     @inbounds res[ilin] = val
-    return
 end

 function (T::Type{<: AnyGPUArray{U}})(s::UniformScaling, dims::Dims{2}) where {U}
     res = similar(T, dims)
     fill!(res, zero(U))
-    gpu_call(identity_kernel, res, size(res, 1), s.λ; elements=minimum(dims))
+    kernel = identity_kernel(backend(res))
+    kernel(res, size(res, 1), s.λ; ndrange=minimum(dims))
     res
 end

@@ -43,7 +44,8 @@ end

 function Base.copyto!(A::AbstractGPUMatrix{T}, s::UniformScaling) where T
     fill!(A, zero(T))
-    gpu_call(identity_kernel, A, size(A, 1), s.λ; elements=minimum(size(A)))
+    kernel = identity_kernel(backend(A))
+    kernel(A, size(A, 1), s.λ; ndrange=minimum(size(A)))
     A
 end

@@ -52,7 +54,8 @@ function _one(unit::T, x::AbstractGPUMatrix) where {T}
     m==n || throw(DimensionMismatch("multiplicative identity defined only for square matrices"))
     I = similar(x, T)
     fill!(I, zero(T))
-    gpu_call(identity_kernel, I, m, unit; elements=m)
+    kernel = identity_kernel(backend(I))
+    kernel(I, m, unit; ndrange=m)
     I
 end

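
A host-side check (illustrative, not part of this diff) of the index arithmetic in `identity_kernel`: with column-major storage and `stride = size(res, 1)`, the i-th diagonal element sits at linear index `stride*(i - 1) + i`.

res = zeros(Int, 3, 3)
n = size(res, 1)                         # the `stride` argument passed by the callers
for i in 1:minimum(size(res))
    res[n*(i - 1) + i] = 1               # same indexing the kernel performs per work-item
end
res == [1 0 0; 0 1 0; 0 0 1]             # true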

src/host/random.jl

Lines changed: 12 additions & 9 deletions
@@ -84,29 +84,32 @@ function Random.seed!(rng::RNG, seed::Vector{UInt32})
 end

 function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: Number
-    gpu_call(A, rng.state) do ctx, a, randstates
-        idx = linear_index(ctx)
-        idx > length(a) && return
+    @kernel rand!(a, randstate)
+        idx = @index(Global, Linear)
         @inbounds a[idx] = gpu_rand(T, ctx, randstates)
-        return
     end
+    kernel = rand!(backend(A))
+    kernel(A, rng.state)
     A
 end

 function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Number
     threads = (length(A) - 1) ÷ 2 + 1
     length(A) == 0 && return
-    gpu_call(A, rng.state; elements = threads) do ctx, a, randstates
-        idx = 2*(linear_index(ctx) - 1) + 1
+    @kernel randn!(a, randstates)
+        i = @index(Global, Linear)
+        idx = 2*(i - 1) + 1
         U1 = gpu_rand(T, ctx, randstates)
         U2 = gpu_rand(T, ctx, randstates)
         Z0 = sqrt(T(-2.0)*log(U1))*cos(T(2pi)*U2)
         Z1 = sqrt(T(-2.0)*log(U1))*sin(T(2pi)*U2)
         @inbounds a[idx] = Z0
-        idx + 1 > length(a) && return
-        @inbounds a[idx + 1] = Z1
-        return
+        if idx + 1 <= length(a)
+            @inbounds a[idx + 1] = Z1
+        end
     end
+    kernel = randn!(backend(A))
+    kernel(A, rng.states; ndrange=threads)
     A
 end
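
The `randn!` kernel applies the Box-Muller transform: one pair of uniform draws yields two independent standard normals, which is why each work-item writes two outputs and the launch only needs `(length(A) - 1) ÷ 2 + 1` work-items. A host-side sketch of that step (illustrative only):

U1 = 1 - rand()                          # keep U1 in (0, 1] so log(U1) stays finite
U2 = rand()
Z0 = sqrt(-2 * log(U1)) * cos(2pi * U2)
Z1 = sqrt(-2 * log(U1)) * sin(2pi * U2)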
