
Commit 1e46794

vchuravy and leios authored and committed
Transition GPUArrays to KernelAbstractions
1 parent 693a4d3 commit 1e46794

File tree

9 files changed (+80, -177 lines)


Project.toml

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ version = "10.0.2"
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"

src/GPUArrays.jl

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ include("device/indexing.jl")
 include("device/memory.jl")
 include("device/synchronization.jl")

+using KernelAbstractions
 # host abstractions
 include("host/abstractarray.jl")
 include("host/construction.jl")

src/device/execution.jl

Lines changed: 6 additions & 108 deletions
@@ -1,110 +1,8 @@
 # kernel execution

-export AbstractGPUBackend, AbstractKernelContext, gpu_call
-
-abstract type AbstractGPUBackend end
-
-abstract type AbstractKernelContext end
-
-import GPUArraysCore: backend
-
-"""
-    gpu_call(kernel::Function, arg0, args...; kwargs...)
-
-Executes `kernel` on the device that backs `arg` (see [`backend`](@ref)), passing along any
-arguments `args`. Additionally, the kernel will be passed the kernel execution context (see
-[`AbstractKernelContext`]), so its signature should be `(ctx::AbstractKernelContext, arg0,
-args...)`.
-
-The keyword arguments `kwargs` are not passed to the function, but are interpreted on the
-host to influence how the kernel is executed. The following keyword arguments are supported:
-
-- `target::AbstractArray`: specify which array object to use for determining execution
-  properties (defaults to the first argument `arg0`).
-- `elements::Int`: how many elements will be processed by this kernel. In most
-  circumstances, this will correspond to the total number of threads that needs to be
-  launched, unless the kernel supports a variable number of elements to process per
-  iteration. Defaults to the length of `arg0` if no other keyword arguments that influence
-  the launch configuration are specified.
-- `threads::Int` and `blocks::Int`: configure exactly how many threads and blocks are
-  launched. This cannot be used in combination with the `elements` argument.
-- `name::String`: inform the back end about the name of the kernel to be executed. This can
-  be used to emit better diagnostics, and is useful with anonymous kernels.
-"""
-function gpu_call(kernel::F, args::Vararg{Any,N};
-                  target::AbstractArray=first(args),
-                  elements::Union{Int,Nothing}=nothing,
-                  threads::Union{Int,Nothing}=nothing,
-                  blocks::Union{Int,Nothing}=nothing,
-                  name::Union{String,Nothing}=nothing) where {F,N}
-    # non-trivial default values for launch configuration
-    if elements===nothing && threads===nothing && blocks===nothing
-        elements = length(target)
-    elseif elements===nothing
-        if threads === nothing
-            threads = 1
-        end
-        if blocks === nothing
-            blocks = 1
-        end
-    elseif threads!==nothing || blocks!==nothing
-        error("Cannot specify both elements and threads/blocks configuration")
-    end
-
-    # the number of elements to process needs to be passed to the kernel somehow, so there's
-    # no easy way to do this without passing additional arguments or changing the context.
-    # both are expensive, so require manual use of `launch_heuristic` for those kernels.
-    elements_per_thread = 1
-
-    if elements !== nothing
-        @assert elements > 0
-        heuristic = launch_heuristic(backend(target), kernel, args...;
-                                     elements, elements_per_thread)
-        config = launch_configuration(backend(target), heuristic;
-                                      elements, elements_per_thread)
-        gpu_call(backend(target), kernel, args, config.threads, config.blocks; name=name)
-    else
-        @assert threads > 0
-        @assert blocks > 0
-        gpu_call(backend(target), kernel, args, threads, blocks; name=name)
-    end
-end
-
-# how many threads and blocks `kernel` needs to be launched with, passing arguments `args`,
-# to fully saturate the GPU. `elements` indicates the number of elements that needs to be
-# processed, while `elements_per_threads` indicates the number of elements this kernel can
-# process (i.e. if it's a grid-stride kernel, or 1 if otherwise).
-#
-# this heuristic should be specialized for the back-end, ideally using an API for maximizing
-# the occupancy of the launch configuration (like CUDA's occupancy API).
-function launch_heuristic(backend::AbstractGPUBackend, kernel, args...;
-                          elements::Int, elements_per_thread::Int)
-    return (threads=256, blocks=32)
-end
-
-# determine how many threads and blocks to actually launch given upper limits.
-# returns a tuple of blocks, threads, and elements_per_thread (which is always 1
-# unless specified that the kernel can handle a number of elements per thread)
-function launch_configuration(backend::AbstractGPUBackend, heuristic;
-                              elements::Int, elements_per_thread::Int)
-    threads = clamp(elements, 1, heuristic.threads)
-    blocks = max(cld(elements, threads), 1)
-
-    if elements_per_thread > 1 && blocks > heuristic.blocks
-        # we want to launch more blocks than required, so prefer a grid-stride loop instead
-        ## try to stick to the number of blocks that the heuristic suggested
-        blocks = heuristic.blocks
-        nelem = cld(elements, blocks*threads)
-        ## only bump the number of blocks if we really need to
-        if nelem > elements_per_thread
-            nelem = elements_per_thread
-            blocks = cld(elements, nelem*threads)
-        end
-        (; threads, blocks, elements_per_thread=nelem)
-    else
-        (; threads, blocks, elements_per_thread=1)
-    end
-end
-
-gpu_call(backend::AbstractGPUBackend, kernel, args, threads::Int, blocks::Int; kwargs...) =
-    error("Not implemented") # COV_EXCL_LINE
+# TODO:
+#  - Rename KA device to backend
+#  - Who owns `AbstractGPUBackend`?
+#    a; KernelAbstractions
+#    b; GPUArraysCore
+backend(a) = KernelAbstractions.get_device(a)
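
For orientation (illustrative, not part of this diff): the old `gpu_call(kernel, args...; elements=n)` entry point is replaced by instantiating a KernelAbstractions kernel for an array's backend and launching it with an `ndrange`. Below is a minimal sketch of that pattern on the CPU backend, using a hypothetical `sketch_fill!` kernel; details such as `get_device` vs. `get_backend` and the synchronization model depend on the KernelAbstractions version in use.

using KernelAbstractions

# Hypothetical kernel: @kernel/@index replace the old
# (ctx::AbstractKernelContext, args...) signature and linear_index(ctx).
@kernel function sketch_fill!(a, val)
    i = @index(Global, Linear)
    @inbounds a[i] = val
end

A = zeros(Float32, 1024)
kernel = sketch_fill!(CPU())         # host code picks a backend; for GPU arrays this
                                     # commit would use `backend(A)` instead of CPU()
kernel(A, 1f0; ndrange=length(A))    # `ndrange` replaces the old `elements` keyword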

src/host/abstractarray.jl

Lines changed: 10 additions & 18 deletions
@@ -173,13 +173,12 @@ for (D, S) in ((AnyGPUArray, Array),
 end

 # kernel-based variant for copying between wrapped GPU arrays
-
-function linear_copy_kernel!(ctx::AbstractKernelContext, dest, dstart, src, sstart, n)
-    i = linear_index(ctx)-1
+# TODO: Add `@Const` to `src`
+@kernel function linear_copy_kernel!(dest, dstart, src, sstart, n)
+    i = @index(Global, Linear) - 1
     if i < n
         @inbounds dest[dstart+i] = src[sstart+i]
     end
-    return
 end

 function Base.copyto!(dest::AnyGPUArray, dstart::Integer,
@@ -189,10 +188,8 @@ function Base.copyto!(dest::AnyGPUArray, dstart::Integer,
     destinds, srcinds = LinearIndices(dest), LinearIndices(src)
     (checkbounds(Bool, destinds, dstart) && checkbounds(Bool, destinds, dstart+n-1)) || throw(BoundsError(dest, dstart:dstart+n-1))
     (checkbounds(Bool, srcinds, sstart) && checkbounds(Bool, srcinds, sstart+n-1)) || throw(BoundsError(src, sstart:sstart+n-1))
-
-    gpu_call(linear_copy_kernel!,
-             dest, dstart, src, sstart, n;
-             elements=n)
+    kernel = linear_copy_kernel!(backend(dest))
+    kernel(dest, dstart, src, sstart, n; ndrange=elements)
     return dest
 end

@@ -242,13 +239,9 @@ end

 ## generalized blocks of heterogeneous memory

-function cartesian_copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets, shape, length)
-    i = linear_index(ctx)
-    if i <= length
-        idx = CartesianIndices(shape)[i]
-        @inbounds dest[idx + dest_offsets] = src[idx + src_offsets]
-    end
-    return
+@kernel function cartesian_copy_kernel!(ctx::AbstractKernelContext, dest, dest_offsets, src, src_offsets)
+    I = @index(Global, Cartesian)
+    @inbounds dest[I + dest_offsets] = src[I + src_offsets]
 end

 function Base.copyto!(dest::AnyGPUArray{<:Any, N}, destcrange::CartesianIndices{N},
@@ -262,9 +255,8 @@ function Base.copyto!(dest::AnyGPUArray{<:Any, N}, destcrange::CartesianIndices{

     dest_offsets = first(destcrange) - oneunit(CartesianIndex{N})
     src_offsets = first(srccrange) - oneunit(CartesianIndex{N})
-    gpu_call(cartesian_copy_kernel!,
-             dest, dest_offsets, src, src_offsets, shape, len;
-             elements=len)
+    kernel = cartesian_copy_kernel!(backend(dest))
+    kernel(dest, dest_offsets, src, src_offsets; ndrange=shape)
     dest
 end

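
Illustrative sketch (not part of this diff) of the offset-copy idiom used by `cartesian_copy_kernel!`: the launch's `ndrange` supplies the Cartesian iteration space, so the explicit bounds check and `CartesianIndices(shape)[i]` lookup from the old kernel are no longer needed. The names below are hypothetical and the example runs on the CPU backend.

using KernelAbstractions

@kernel function sketch_copy!(dest, dest_off, src, src_off)
    I = @index(Global, Cartesian)            # index within the launch's ndrange
    @inbounds dest[I + dest_off] = src[I + src_off]
end

src  = reshape(collect(1.0:16.0), 4, 4)
dest = zeros(4, 4)
kernel = sketch_copy!(CPU())
# copy src[1:2, 1:2] into dest[2:3, 2:3]
kernel(dest, CartesianIndex(1, 1), src, CartesianIndex(0, 0); ndrange=(2, 2))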

src/host/base.jl

Lines changed: 10 additions & 10 deletions
@@ -26,14 +26,13 @@ end
 # benchmark faster by having fewer read operations and avoiding the costly division
 # operation. Additionally, when repeating over the trailing dimension. `inner=(ones..., n)`,
 # data access can be contiguous during both the read and write operations.
-function repeat_inner_src_kernel!(
-    ctx::AbstractKernelContext,
+@kernel function repeat_inner_src_kernel!(
     xs::AbstractArray{<:Any, N},
     inner::NTuple{N, Int},
     out::AbstractArray{<:Any, N}
 ) where {N}
     # Get single element from src
-    idx = @cartesianidx xs
+    idx = @index(Global, Cartesian)
     @inbounds val = xs[idx]

     # Loop over "repeat" indices of inner
@@ -44,7 +43,6 @@ function repeat_inner_src_kernel!(
         end
         @inbounds out[CartesianIndex(odx)] = val
     end
-    return nothing
 end

 function repeat_inner(xs::AnyGPUArray, inner)
@@ -64,23 +62,24 @@ function repeat_inner(xs::AnyGPUArray, inner)
     # relevant benchmarks.
     if argmax(inner) == firstindex(inner)
         # Parallelize over the destination array
-        gpu_call(repeat_inner_dst_kernel!, xs, inner, out; elements=prod(size(out)))
+        kernel = repeat_inner_dst_kernel!(backend(out))
+        kernel(xs, inner, out; ndrange=size(out))
     else
         # Parallelize over the source array
-        gpu_call(repeat_inner_src_kernel!, xs, inner, out; elements=prod(size(xs)))
+        kernel = repeat_inner_src_kernel!(backend(xs))
+        kernel(xs, inner, out; ndrange=size(xs))
     end
     return out
 end

-function repeat_outer_kernel!(
-    ctx::AbstractKernelContext,
+@kernel function repeat_outer_kernel!(
     xs::AbstractArray{<:Any, N},
     xssize::NTuple{N},
     outer::NTuple{N},
     out::AbstractArray{<:Any, N}
 ) where {N}
     # Get index to input element
-    idx = @cartesianidx xs
+    idx = @index(Global, Cartesian)
     @inbounds val = xs[idx]

     # Loop over repeat indices, copying val to out
@@ -98,7 +97,8 @@ end
 function repeat_outer(xs::AnyGPUArray, outer)
     out = similar(xs, eltype(xs), outer .* size(xs))
     any(==(0), size(out)) && return out # consistent with `Base.repeat`
-    gpu_call(repeat_outer_kernel!, xs, size(xs), outer, out; elements=length(xs))
+    kernel = repeat_outer_kernel!(backend(xs))
+    kernel(xs, size(xs), outer, out; ndrange=size(xs))
     return out
 end

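
For reference, the `inner`/`outer` semantics these kernels reproduce on the device match `Base.repeat` on the host: `inner` duplicates each element along a dimension, `outer` tiles the whole array. A small host-side example:

A = [1 2; 3 4]
repeat(A, inner=(2, 1))   # each row duplicated:  [1 2; 1 2; 3 4; 3 4]
repeat(A, outer=(1, 2))   # whole array tiled:    [1 2 1 2; 3 4 3 4]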

src/host/construction.jl

Lines changed: 12 additions & 9 deletions
@@ -11,29 +11,30 @@ Base.convert(::Type{T}, a::AbstractArray) where {T<:AbstractGPUArray} = a isa T

 function Base.fill!(A::AnyGPUArray{T}, x) where T
     length(A) == 0 && return A
-    gpu_call(A, convert(T, x)) do ctx, a, val
-        idx = @linearidx(a)
+    @kernel fill!(a, val)
+        idx = @index(Linear, Global)
         @inbounds a[idx] = val
-        return
     end
+    kernel = fill!(backend(A))
+    kernel(A, x)
     A
 end


 ## identity matrices

-function identity_kernel(ctx::AbstractKernelContext, res::AbstractArray{T}, stride, val) where T
-    i = linear_index(ctx)
+@kernel function identity_kernel(ctx::AbstractKernelContext, res::AbstractArray{T}, stride, val) where T
+    i = @index(Global, Linear)
     ilin = (stride * (i - 1)) + i
     ilin > length(res) && return
     @inbounds res[ilin] = val
-    return
 end

 function (T::Type{<: AnyGPUArray{U}})(s::UniformScaling, dims::Dims{2}) where {U}
     res = similar(T, dims)
     fill!(res, zero(U))
-    gpu_call(identity_kernel, res, size(res, 1), s.λ; elements=minimum(dims))
+    kernel = identity_kernel(backend(res))
+    kernel(res, size(res, 1), s.λ; ndrange=minimum(dims))
     res
 end

@@ -43,7 +44,8 @@ end

 function Base.copyto!(A::AbstractGPUMatrix{T}, s::UniformScaling) where T
     fill!(A, zero(T))
-    gpu_call(identity_kernel, A, size(A, 1), s.λ; elements=minimum(size(A)))
+    kernel = identity_kernel(backend(A))
+    kernel(A, size(A, 1), s.λ; ndrange=minimum(size(A)))
     A
 end

@@ -52,7 +54,8 @@ function _one(unit::T, x::AbstractGPUMatrix) where {T}
     m==n || throw(DimensionMismatch("multiplicative identity defined only for square matrices"))
     I = similar(x, T)
     fill!(I, zero(T))
-    gpu_call(identity_kernel, I, m, unit; elements=m)
+    kernel = identity_kernel(backend(I))
+    kernel(I, m, unit; ndrange=m)
     I
 end

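
A host-side check (illustrative, not part of this diff) of the index arithmetic in `identity_kernel`: with column-major storage and `stride = size(res, 1)`, the i-th diagonal element sits at linear index `stride*(i - 1) + i`.

res = zeros(Int, 3, 3)
n = size(res, 1)                         # the `stride` argument passed by the callers
for i in 1:minimum(size(res))
    res[n*(i - 1) + i] = 1               # same indexing the kernel performs per work-item
end
res == [1 0 0; 0 1 0; 0 0 1]             # true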

src/host/random.jl

Lines changed: 12 additions & 9 deletions
@@ -84,29 +84,32 @@ function Random.seed!(rng::RNG, seed::Vector{UInt32})
 end

 function Random.rand!(rng::RNG, A::AnyGPUArray{T}) where T <: Number
-    gpu_call(A, rng.state) do ctx, a, randstates
-        idx = linear_index(ctx)
-        idx > length(a) && return
+    @kernel rand!(a, randstate)
+        idx = @index(Global, Linear)
         @inbounds a[idx] = gpu_rand(T, ctx, randstates)
-        return
     end
+    kernel = rand!(backend(A))
+    kernel(A, rng.state)
     A
 end

 function Random.randn!(rng::RNG, A::AnyGPUArray{T}) where T <: Number
     threads = (length(A) - 1) ÷ 2 + 1
     length(A) == 0 && return
-    gpu_call(A, rng.state; elements = threads) do ctx, a, randstates
-        idx = 2*(linear_index(ctx) - 1) + 1
+    @kernel randn!(a, randstates)
+        i = @index(Global, Linear)
+        idx = 2*(i - 1) + 1
         U1 = gpu_rand(T, ctx, randstates)
         U2 = gpu_rand(T, ctx, randstates)
         Z0 = sqrt(T(-2.0)*log(U1))*cos(T(2pi)*U2)
         Z1 = sqrt(T(-2.0)*log(U1))*sin(T(2pi)*U2)
         @inbounds a[idx] = Z0
-        idx + 1 > length(a) && return
-        @inbounds a[idx + 1] = Z1
-        return
+        if idx + 1 <= length(a)
+            @inbounds a[idx + 1] = Z1
+        end
     end
+    kernel = randn!(backend(A))
+    kernel(A, rng.states; ndrange=threads)
     A
 end
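
The `randn!` kernel applies the Box-Muller transform: one pair of uniform draws yields two independent standard normals, which is why each work-item writes two outputs and the launch only needs `(length(A) - 1) ÷ 2 + 1` work-items. A host-side sketch of that step (illustrative only):

U1 = 1 - rand()                          # keep U1 in (0, 1] so log(U1) stays finite
U2 = rand()
Z0 = sqrt(-2 * log(U1)) * cos(2pi * U2)
Z1 = sqrt(-2 * log(U1)) * sin(2pi * U2)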
