
Commit 349865a
Update KernelAbstractions to use CUDA 1.0

1 parent: cf8ec8e
18 files changed: +161 / -178 lines

Project.toml
Lines changed: 4 additions & 7 deletions

```diff
@@ -5,19 +5,16 @@ version = "0.2.6"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
-CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
-CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 Cassette = "7057c7e9-c182-5462-911a-8362d720325c"
+LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 
 [compat]
-Adapt = "0.4, 1.0"
-CUDAapi = "4.0"
-CUDAdrv = "6.3"
-CUDAnative = "3.0"
+Adapt = "0.4, 1.0, 2.0"
+CUDA = "1.0"
 Cassette = "0.3.2"
 MacroTools = "0.5"
 SpecialFunctions = "0.10"
```
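For downstream users, the practical effect of this manifest change is that the split GPU stack (CUDAapi, CUDAdrv, CUDAnative, plus CuArrays in the examples) is replaced by the single CUDA package. A minimal sketch of the corresponding environment update; the Pkg calls are standard, but running them is an assumption about user setups, not part of this commit:

```julia
using Pkg

# Hypothetical migration for a user environment: remove the split CUDA
# stack and add the merged CUDA.jl package this commit depends on.
foreach(Pkg.rm, ["CUDAapi", "CUDAdrv", "CUDAnative", "CuArrays"])
Pkg.add(Pkg.PackageSpec(name = "CUDA", version = "1.0"))
```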

docs/src/index.md
Lines changed: 2 additions & 2 deletions

```diff
@@ -55,11 +55,11 @@ all(A .== 2.0)
 1. Functions inside kernels are forcefully inlined, except when marked with `@noinline`.
 2. Floating-point multiplication, addition, subtraction are marked contractable.
 
-## Important differences to CUDAnative
+## Important differences to CUDA
 
 1. The kernels are automatically bounds-checked against either the dynamic or statically
    provided `ndrange`.
-2. Functions like `Base.sin` are mapped to `CUDAnative.sin`.
+2. Functions like `Base.sin` are mapped to `CUDA.sin`.
 
 ## Important differences to GPUifyLoops
 
```
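Point 2 above is the user-facing side of the rename: kernels are written against plain `Base` math, and the GPU path substitutes device intrinsics. A minimal sketch using the `@kernel` API from this repo; the kernel name is hypothetical:

```julia
using KernelAbstractions

# Hypothetical kernel: `sin` is spelled as Base.sin, and on the CUDA
# backend the Cassette pass rewrites it to CUDA.sin as documented above.
@kernel function sin_kernel!(A, @Const(B))
    I = @index(Global)
    @inbounds A[I] = sin(B[I])
end
```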
examples/matmul.jl
Lines changed: 5 additions & 5 deletions

```diff
@@ -1,7 +1,7 @@
-using KernelAbstractions, Test, CUDAapi
-if CUDAapi.has_cuda_gpu()
-    using CuArrays
-    CuArrays.allowscalar(false)
+using KernelAbstractions, Test, CUDA
+
+if has_cuda_gpu()
+    CUDA.allowscalar(false)
 end
 
 # Simple kernel for matrix multiplication
@@ -26,7 +26,7 @@ function matmul!(a, b, c)
     if isa(a, Array)
         kernel! = matmul_kernel!(CPU(),4)
     else
-        kernel! = matmul_kernel!(CUDA(),256)
+        kernel! = matmul_kernel!(CUDAGPU(),256)
     end
     kernel!(a, b, c, ndrange=size(c))
 end
```
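Calling the updated example is unchanged apart from the device name. A sketch of the CPU path, assuming (as in the other examples here) that the kernel launch returns an event; the array sizes are arbitrary:

```julia
using KernelAbstractions, Test

a = rand(Float32, 64, 32)
b = rand(Float32, 32, 64)
c = zeros(Float32, 64, 64)

# matmul! selects CPU() for Arrays and CUDAGPU() otherwise; wait on the
# returned event before reading c.
event = matmul!(a, b, c)
wait(event)
@test c ≈ a * b
```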

examples/memcopy.jl
Lines changed: 3 additions & 4 deletions

```diff
@@ -1,5 +1,5 @@
 using KernelAbstractions
-using CUDAapi
+using CUDA
 using Test
 
 @kernel function copy_kernel!(A, @Const(B))
@@ -21,15 +21,14 @@ wait(event)
 
 
 if has_cuda_gpu()
-    using CuArrays
 
     function mycopy!(A::CuArray, B::CuArray)
         @assert size(A) == size(B)
-        copy_kernel!(CUDA(), 256)(A, B, ndrange=length(A))
+        copy_kernel!(CUDAGPU(), 256)(A, B, ndrange=length(A))
    end
 
     A = CuArray{Float32}(undef, 1024)
-    B = CuArrays.ones(Float32, 1024)
+    B = CUDA.ones(Float32, 1024)
     event = mycopy!(A, B)
     wait(event)
     @test A == B
```
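The hunk preserves the two-step launch pattern used throughout these examples; spelled out with the new device name (a restatement of the code above, not new API):

```julia
# 1. Instantiate the kernel for a device and a workgroup size.
kernel! = copy_kernel!(CUDAGPU(), 256)

# 2. Launch over the dynamic ndrange; the call returns an event.
event = kernel!(A, B, ndrange = length(A))
wait(event)
```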

examples/memcopy_static.jl
Lines changed: 3 additions & 4 deletions

```diff
@@ -1,5 +1,5 @@
 using KernelAbstractions
-using CUDAapi
+using CUDA
 using Test
 
 @kernel function copy_kernel!(A, @Const(B))
@@ -20,16 +20,15 @@ wait(event)
 @test A == B
 
 if has_cuda_gpu()
-    using CuArrays
 
     function mycopy_static!(A::CuArray, B::CuArray)
         @assert size(A) == size(B)
-        kernel = copy_kernel!(CUDA(), 32, size(A)) # if size(A) varies this will cause recompilation
+        kernel = copy_kernel!(CUDAGPU(), 32, size(A)) # if size(A) varies this will cause recompilation
         kernel(A, B, ndrange=size(A))
     end
 
     A = CuArray{Float32}(undef, 1024)
-    B = CuArrays.ones(Float32, 1024)
+    B = CUDA.ones(Float32, 1024)
     event = mycopy_static!(A, B)
     wait(event)
     @test A == B
```
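As the inline comment warns, the static `ndrange` is baked in at construction time, so each distinct `size(A)` triggers a recompile. One hypothetical way to amortize that is to cache one kernel per shape; the cache below is an illustration, not part of this commit:

```julia
# Hypothetical per-shape cache for the static-ndrange kernel.
const KERNEL_CACHE = Dict{Dims, Any}()

function cached_copy_kernel!(A::CuArray)
    get!(KERNEL_CACHE, size(A)) do
        copy_kernel!(CUDAGPU(), 32, size(A))
    end
end
```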

examples/mpi.jl
Lines changed: 7 additions & 6 deletions

```diff
@@ -1,15 +1,16 @@
 # EXCLUDE FROM TESTING
 using KernelAbstractions
-using CUDAapi
-if CUDAapi.has_cuda_gpu()
-    using CuArrays
-    CuArrays.allowscalar(false)
+using CUDA
+
+if has_cuda_gpu()
+    CUDA.allowscalar(false)
 else
     exit()
 end
+
 using MPI
 
-device(A) = typeof(A) <: Array ? CPU() : CUDA()
+device(A) = typeof(A) <: Array ? CPU() : CUDAGPU()
 
 function mpiyield()
     MPI.Iprobe(MPI.MPI_ANY_SOURCE, MPI.MPI_ANY_TAG, MPI.COMM_WORLD)
@@ -62,7 +63,7 @@ function main()
     T = Int64
     M = 10
 
-    d_recv_buf = CuArrays.zeros(T, M)
+    d_recv_buf = CUDA.zeros(T, M)
     h_send_buf = zeros(T, M)
     h_recv_buf = zeros(T, M)
 
```
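The `device(A)` helper keeps the rest of the MPI example backend-agnostic: the launch site derives the device from the buffer's type. A sketch; `pack_kernel!` is hypothetical:

```julia
# Hypothetical launch that works for host and device buffers alike:
# device(A) yields CPU() for an Array and CUDAGPU() for a CuArray.
kernel! = pack_kernel!(device(d_recv_buf), 256)
event = kernel!(d_recv_buf, ndrange = length(d_recv_buf))
wait(event)
```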
examples/naive_transpose.jl
Lines changed: 6 additions & 6 deletions

```diff
@@ -1,7 +1,7 @@
-using KernelAbstractions, Test, CUDAapi
-if CUDAapi.has_cuda_gpu()
-    using CuArrays
-    CuArrays.allowscalar(false)
+using KernelAbstractions, Test, CUDA
+
+if has_cuda_gpu()
+    CUDA.allowscalar(false)
 end
 
 @kernel function naive_transpose_kernel!(a, b)
@@ -19,7 +19,7 @@ function naive_transpose!(a, b)
     if isa(a, Array)
         kernel! = naive_transpose_kernel!(CPU(),4)
     else
-        kernel! = naive_transpose_kernel!(CUDA(),256)
+        kernel! = naive_transpose_kernel!(CUDAGPU(),256)
     end
     kernel!(a, b, ndrange=size(a))
 end
@@ -39,7 +39,7 @@ wait(event)
 # beginning GPU tests
 if has_cuda_gpu()
     d_a = CuArray(a)
-    d_b = CuArrays.zeros(Float32, res, res)
+    d_b = CUDA.zeros(Float32, res, res)
 
     ev = naive_transpose!(d_a, d_b)
     wait(ev)
```
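The GPU branch mirrors the CPU test earlier in the file; a sketch of the check that plausibly follows the waited event (`a` and `d_b` come from the hunk context, the comparison itself is an assumption about the rest of the file):

```julia
# Hypothetical tail of the GPU test: copy the result back to the host
# and compare against the CPU transpose.
@test Array(d_b) == transpose(a)
```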

examples/performance.jl
Lines changed: 8 additions & 13 deletions

```diff
@@ -1,12 +1,7 @@
 using KernelAbstractions
-using CUDAapi
+using CUDA
 
-CUDAapi.has_cuda_gpu() || exit()
-
-using CuArrays
-using CUDAdrv
-using CUDAnative
-using CUDAnative.NVTX
+has_cuda_gpu() || exit()
 
 @kernel function transpose_kernel_naive!(b, a)
     i, j = @index(Global, NTuple)
@@ -24,12 +19,12 @@ const nreps = 1
 NVTX.@range "Naive transpose ($block_dim, $block_dim)" let
     a = CuArray(rand(T, shape))
     b = similar(a, shape[2], shape[1])
-    kernel! = transpose_kernel_naive!(CUDA(), (block_dim, block_dim), size(b))
+    kernel! = transpose_kernel_naive!(CUDAGPU(), (block_dim, block_dim), size(b))
 
     event = kernel!(b, a)
     wait(event)
     @assert Array(b) == Array(a)'
-    @CUDAdrv.profile begin
+    CUDA.@profile begin
         for rep in 1:nreps
             event = kernel!(b, a, dependencies=(event,))
         end
@@ -40,12 +35,12 @@ end
 NVTX.@range "Naive transpose ($(block_dim^2), 1)" let
     a = CuArray(rand(T, shape))
     b = similar(a, shape[2], shape[1])
-    kernel! = transpose_kernel_naive!(CUDA(), (block_dim*block_dim, 1), size(b))
+    kernel! = transpose_kernel_naive!(CUDAGPU(), (block_dim*block_dim, 1), size(b))
 
     event = kernel!(b, a)
     wait(event)
     @assert Array(b) == Array(a)'
-    @CUDAdrv.profile begin
+    CUDA.@profile begin
         for rep in 1:nreps
             event = kernel!(b, a, dependencies=(event,))
         end
@@ -56,12 +51,12 @@ end
 NVTX.@range "Naive transpose (1, $(block_dim^2))" let
     a = CuArray(rand(T, shape))
     b = similar(a, shape[2], shape[1])
-    kernel! = transpose_kernel_naive!(CUDA(), (1, block_dim*block_dim), size(b))
+    kernel! = transpose_kernel_naive!(CUDAGPU(), (1, block_dim*block_dim), size(b))
 
     event = kernel!(b, a)
     wait(event)
     @assert Array(b) == Array(a)'
-    @CUDAdrv.profile begin
+    CUDA.@profile begin
         for rep in 1:nreps
             event = kernel!(b, a, dependencies=(event,))
         end
```
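One subtlety of the import cleanup: the old `using CUDAnative.NVTX` is deleted, yet the hunks still call `NVTX.@range` unqualified, which assumes `NVTX` is reachable after `using CUDA`. If it were not exported, an explicit binding would restore it; this is an assumption about CUDA.jl's module layout, not something shown in the diff:

```julia
using CUDA

# Assumption: CUDA.jl carries the NVTX wrapper as a submodule; bind it
# explicitly so the NVTX.@range annotations keep working.
const NVTX = CUDA.NVTX
```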

src/KernelAbstractions.jl
Lines changed: 2 additions & 2 deletions

```diff
@@ -2,7 +2,7 @@ module KernelAbstractions
 
 export @kernel
 export @Const, @localmem, @private, @uniform, @synchronize, @index, groupsize, @print
-export Device, GPU, CPU, CUDA, Event, MultiEvent, NoneEvent
+export Device, GPU, CPU, CUDAGPU, Event, MultiEvent, NoneEvent
 export async_copy!
 
 
@@ -303,7 +303,7 @@ abstract type Device end
 abstract type GPU <: Device end
 
 struct CPU <: Device end
-struct CUDA <: GPU end
+struct CUDAGPU <: GPU end
 # struct AMD <: GPU end
 # struct Intel <: GPU end
 
```
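The rename is presumably what makes the rest of this commit possible: the package now does `using CUDA`, so an exported `CUDA` device type would clash with the package name itself. Downstream device dispatch changes only in the type name; a minimal sketch, with hypothetical helper names:

```julia
using KernelAbstractions

# Hypothetical backend selection mirroring the examples in this commit:
# dispatch on the array type, with CUDAGPU replacing the old CUDA device.
backend(::Array) = CPU()
backend(A) = CUDAGPU()   # assume anything else is a device array here

launch!(A) = some_kernel!(backend(A), 256)(A, ndrange = length(A))  # some_kernel! is hypothetical
```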
