
Commit 59b9010: cleanup examples
1 parent 2239640

11 files changed, +215 -157 lines


docs/make.jl (4 additions, 0 deletions)

```diff
@@ -11,6 +11,10 @@ makedocs(
         "Writing kernels" => "kernels.md",
         "Examples" => [
             "examples/memcopy.md"
+            "examples/memcopy_static.md"
+            "examples/naive_transpose.md"
+            "examples/performance.md"
+            "examples/matmul.md"
         ],
         "API" => "api.md",
         "Extras" => [
```

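For orientation, the `pages` layout in `docs/make.jl` now reads roughly as follows; only the "Examples" entries are taken from the hunk above, the surrounding call is an assumption sketched for context:

```julia
using Documenter  # assumption: a standard Documenter-based docs build

makedocs(
    # ... sitename and other keyword arguments not shown in the hunk ...
    pages = [
        "Writing kernels" => "kernels.md",
        "Examples" => [
            "examples/memcopy.md"
            "examples/memcopy_static.md"
            "examples/naive_transpose.md"
            "examples/performance.md"
            "examples/matmul.md"
        ],
        "API" => "api.md",
        # "Extras" => [ ... ] continues past the end of the hunk
    ],
)
```
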
docs/src/examples/matmul.md (new file, 13 additions)

`````markdown
# Matmul

````@eval
using Markdown
using KernelAbstractions
path = joinpath(dirname(pathof(KernelAbstractions)), "..", "examples/matmul.jl")
Markdown.parse("""
```julia
$(read(path, String))
```
""")
````
`````
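
Each of these example pages uses a Documenter.jl `@eval` block: at documentation build time it locates the example's source file relative to the installed package and splices it into the page as a fenced `julia` block, so the rendered docs always track the code under `examples/`.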

docs/src/examples/memcopy_static.md (new file, 15 additions)

`````markdown
# Memcopy with static NDRange

Like the first example, this one simply copies memory from `A` to `B`. In contrast to the previous example, however, it uses a fully static kernel configuration, specializing the kernel on the iteration range itself.

````@eval
using Markdown
using KernelAbstractions
path = joinpath(dirname(pathof(KernelAbstractions)), "..", "examples/memcopy_static.jl")
Markdown.parse("""
```julia
$(read(path, String))
```
""")
````
`````
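
The "fully static" configuration referred to above fixes the iteration range when the kernel object is created, not just at call time. A minimal sketch of the contrast, reusing `copy_kernel!` from the example file shown further below (the `dynamic!` and `static!` names are ours):

```julia
using KernelAbstractions

@kernel function copy_kernel!(A, @Const(B))
    I = @index(Global)
    @inbounds A[I] = B[I]
end

A = zeros(128, 128)
B = ones(128, 128)

# Dynamic: only the workgroup size is fixed; the ndrange varies per call.
dynamic! = copy_kernel!(CPU(), 32)
wait(dynamic!(A, B, ndrange=size(A)))

# Static: the iteration range is baked into the kernel object, letting the
# compiler specialize on it, at the cost of recompiling if size(A) changes.
static! = copy_kernel!(CPU(), 32, size(A))
wait(static!(A, B, ndrange=size(A)))
```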

docs/src/examples/naive_transpose.md (new file, 12 additions)

`````markdown
# Naive Transpose

````@eval
using Markdown
using KernelAbstractions
path = joinpath(dirname(pathof(KernelAbstractions)), "..", "examples/naive_transpose.jl")
Markdown.parse("""
```julia
$(read(path, String))
```
""")
````
`````

docs/src/examples/performance.md (new file, 81 additions)

`````markdown
# Measuring performance

Run under the Nsight Compute CLI (`nv-nsight-cu-cli`):

```sh
nv-nsight-cu-cli --nvtx --profile-from-start=off --section=SpeedOfLight julia --project=examples examples/performance.jl
```

## Results

Collated results on a V100:

| Kernel          | Time   | Speed of Light Mem % |
| --------------- | ------ | -------------------- |
| naive (32, 32)  | 1.19ms | 65.06%               |
| naive (1024, 1) | 1.79ms | 56.13%               |
| naive (1, 1024) | 3.03ms | 60.02%               |

### Full output

```
==PROF== 0: Naive transpose (32, 32)
  Section: GPU Speed Of Light
  ---------------------------------------------- --------------- ---------------
  Memory Frequency                                cycle/usecond           878.88
  SOL FB                                          %                        38.16
  Elapsed Cycles                                  cycle                1,447,874
  SM Frequency                                    cycle/nsecond             1.23
  Memory [%]                                      %                        65.93
  Duration                                        msecond                   1.17
  SOL L2                                          %                        19.08
  SOL TEX                                         %                        66.19
  SM Active Cycles                                cycle             1,440,706.40
  SM [%]                                          %                        23.56
  ---------------------------------------------- --------------- ---------------

ptxcall___gpu_transpose_kernel_naive__430_2, 2020-Feb-20 22:42:24, Context 1, Stream 23

==PROF== 0: Naive transpose (1024, 1)
  Section: GPU Speed Of Light
  ---------------------------------------------- --------------- ---------------
  Memory Frequency                                cycle/usecond           877.69
  SOL FB                                          %                        22.40
  Elapsed Cycles                                  cycle                2,473,141
  SM Frequency                                    cycle/nsecond             1.23
  Memory [%]                                      %                        51.17
  Duration                                        msecond                   2.00
  SOL L2                                          %                        50.17
  SOL TEX                                         %                        51.27
  SM Active Cycles                                cycle             2,465,610.06
  SM [%]                                          %                        11.68
  ---------------------------------------------- --------------- ---------------

ptxcall___gpu_transpose_kernel_naive__430_3, 2020-Feb-20 22:42:28, Context 1, Stream 25

==PROF== 0: Naive transpose (1, 1024)
  Section: GPU Speed Of Light
  ---------------------------------------------- --------------- ---------------
  Memory Frequency                                cycle/usecond           876.69
  SOL FB                                          %                        17.88
  Elapsed Cycles                                  cycle                3,737,127
  SM Frequency                                    cycle/nsecond             1.24
  Memory [%]                                      %                        60.02
  Duration                                        msecond                   3.02
  SOL L2                                          %                        60.02
  SOL TEX                                         %                        45.65
  SM Active Cycles                                cycle             3,732,591.59
  SM [%]                                          %                        12.56
  ---------------------------------------------- --------------- ---------------
```

## Code

````@eval
using Markdown
using KernelAbstractions
path = joinpath(dirname(pathof(KernelAbstractions)), "..", "examples/performance.jl")
Markdown.parse("""
```julia
$(read(path, String))
```
""")
````
`````
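
The three configurations in the results table differ only in the workgroup shape the kernel is instantiated with. A hypothetical harness for such a sweep might look as follows; `transpose_kernel_naive!` is a stand-in for the kernel in `examples/performance.jl`, and timing with `Base.@elapsed` is our assumption (the numbers above come from Nsight Compute, not from Julia-side timing):

```julia
using KernelAbstractions
using CuArrays

# Stand-in for the naive transpose kernel in examples/performance.jl.
@kernel function transpose_kernel_naive!(b, @Const(a))
    I = @index(Global, Cartesian)
    @inbounds b[I[2], I[1]] = a[I[1], I[2]]
end

a = CuArrays.rand(Float32, 4096, 4096)
b = similar(a)

for groupsize in ((32, 32), (1024, 1), (1, 1024))
    kernel! = transpose_kernel_naive!(CUDA(), groupsize)
    wait(kernel!(b, a, ndrange=size(a)))  # warm-up launch forces compilation
    t = Base.@elapsed wait(kernel!(b, a, ndrange=size(a)))
    println("naive $groupsize: $(round(t * 1e3, digits=2)) ms")
end
```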

examples/Project.toml (new file, 7 additions)

```toml
[deps]
CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
```
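
This project file pins the GPU stack of the time: CUDAapi for capability detection, CUDAdrv and CUDAnative for the driver and the Julia GPU compiler, and CuArrays for device arrays. It is the environment that the `julia --project=examples` invocation in the performance example activates.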

examples/matmul.jl (20 additions, 33 deletions)

```diff
@@ -5,17 +5,11 @@ if CUDAapi.has_cuda_gpu()
 end
 
 # Simple kernel for matrix multiplication
-@kernel function matmul!(a, b, c)
-    if size(a)[2] != size(b)[1]
-        # here, we need a CPU / GPU generic print statement, like...
-        # CUDAnative.@cuprintf("Matrix size mismatch!")
-        return nothing
-    end
+@kernel function matmul_kernel!(a, b, c)
     cI = @index(Global, Cartesian)
 
     # creating a temporary sum variable for matrix multiplication
-    tmp_sum = 0
-
+    tmp_sum = zero(eltype(c))
     for i = 1:size(a)[2]
         tmp_sum += a[cI[1],i] * b[i,cI[2]]
     end
@@ -24,44 +18,37 @@ end
 end
 
 # Creating a wrapper kernel for launching with error checks
-function launch_matmul!(a, b, c)
+function matmul!(a, b, c)
     if size(a)[2] != size(b)[1]
         println("Matrix size mismatch!")
         return nothing
     end
     if isa(a, Array)
-        kernel! = matmul!(CPU(),4)
+        kernel! = matmul_kernel!(CPU(),4)
     else
-        kernel! = matmul!(CUDA(),256)
+        kernel! = matmul_kernel!(CUDA(),256)
     end
     kernel!(a, b, c, ndrange=size(c))
 end
 
-function check()
-    a = rand(256,123)
-    b = rand(123, 45)
-    c = zeros(256, 45)
+a = rand(256,123)
+b = rand(123, 45)
+c = zeros(256, 45)
 
-    # beginning CPU tests, returns event
-    ev = launch_matmul!(a,b,c)
-    wait(ev)
+# beginning CPU tests, returns event
+ev = matmul!(a,b,c)
+wait(ev)
 
-    println("Testing CPU matrix multiplication...")
-    @test isapprox(a*b, c)
+@test isapprox(c, a*b)
 
-    # beginning GPU tests
-    if has_cuda_gpu()
-        d_a = CuArray(a)
-        d_b = CuArray(b)
-        d_c = CuArray(c)
+# beginning GPU tests
+if has_cuda_gpu()
+    d_a = CuArray(a)
+    d_b = CuArray(b)
+    d_c = CuArray(c)
 
-        ev = launch_matmul!(d_a, d_b, d_c)
-        wait(ev)
-        c = a*b
+    ev = matmul!(d_a, d_b, d_c)
+    wait(ev)
 
-        println("Testing GPU matrix multiplication...")
-        @test isapprox(Array(d_c), c)
-    end
+    @test isapprox(Array(d_c), a*b)
 end
-
-check()
```
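
Two changes here go beyond cleanup: the kernel itself is renamed to `matmul_kernel!` so the public name `matmul!` can go to the wrapper that checks sizes and selects a backend, and the accumulator is now initialized with `zero(eltype(c))` instead of the integer literal `0`, which keeps the accumulation type stable (an integer `0` would change type on the first floating-point addition, an instability that is especially costly inside GPU kernels).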

examples/memcopy.jl (0 additions, 21 deletions)

```diff
@@ -13,22 +13,12 @@ function mycopy!(A::Array, B::Array)
     kernel(A, B, ndrange=length(A))
 end
 
-function mycopy_static!(A::Array, B::Array)
-    @assert size(A) == size(B)
-    kernel = copy_kernel!(CPU(), 32, size(A)) # if size(A) varies this will cause recompilation
-    kernel(A, B, ndrange=size(A))
-end
-
 A = zeros(128, 128)
 B = ones(128, 128)
 event = mycopy!(A, B)
 wait(event)
 @test A == B
 
-A = zeros(128, 128)
-event = mycopy_static!(A, B)
-wait(event)
-@test A == B
 
 if has_cuda_gpu()
     using CuArrays
@@ -38,20 +28,9 @@ if has_cuda_gpu()
         copy_kernel!(CUDA(), 256)(A, B, ndrange=length(A))
     end
 
-    function mycopy_static!(A::CuArray, B::CuArray)
-        @assert size(A) == size(B)
-        kernel = copy_kernel!(CUDA(), 32, size(A)) # if size(A) varies this will cause recompilation
-        kernel(A, B, ndrange=size(A))
-    end
-
    A = CuArray{Float32}(undef, 1024)
    B = CuArrays.ones(Float32, 1024)
    event = mycopy!(A, B)
    wait(event)
    @test A == B
-
-    A = CuArray{Float32}(undef, 1024)
-    event = mycopy_static!(A, B)
-    wait(event)
-    @test A == B
 end
```
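
The static-NDRange variants removed here are not deleted outright; they move, essentially unchanged, into the new standalone `examples/memcopy_static.jl` below.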

examples/memcopy_static.jl (new file, 36 additions)

```julia
using KernelAbstractions
using CUDAapi
using Test

@kernel function copy_kernel!(A, @Const(B))
    I = @index(Global)
    @inbounds A[I] = B[I]
end

function mycopy_static!(A::Array, B::Array)
    @assert size(A) == size(B)
    kernel = copy_kernel!(CPU(), 32, size(A)) # if size(A) varies this will cause recompilation
    kernel(A, B, ndrange=size(A))
end

A = zeros(128, 128)
B = ones(128, 128)
event = mycopy_static!(A, B)
wait(event)
@test A == B

if has_cuda_gpu()
    using CuArrays

    function mycopy_static!(A::CuArray, B::CuArray)
        @assert size(A) == size(B)
        kernel = copy_kernel!(CUDA(), 32, size(A)) # if size(A) varies this will cause recompilation
        kernel(A, B, ndrange=size(A))
    end

    A = CuArray{Float32}(undef, 1024)
    B = CuArrays.ones(Float32, 1024)
    event = mycopy_static!(A, B)
    wait(event)
    @test A == B
end
```
