
Commit 59b9010: cleanup examples
1 parent 2239640

11 files changed, +215 -157 lines


docs/make.jl (4 additions, 0 deletions)

```diff
@@ -11,6 +11,10 @@ makedocs(
         "Writing kernels" => "kernels.md",
         "Examples" => [
             "examples/memcopy.md"
+            "examples/memcopy_static.md"
+            "examples/naive_transpose.md"
+            "examples/performance.md"
+            "examples/matmul.md"
         ],
         "API" => "api.md",
         "Extras" => [
```

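For orientation, the `pages` layout in `docs/make.jl` now reads roughly as follows; only the "Examples" entries are taken from the hunk above, the surrounding call is an assumption sketched for context:

```julia
using Documenter  # assumption: a standard Documenter-based docs build

makedocs(
    # ... sitename and other keyword arguments not shown in the hunk ...
    pages = [
        "Writing kernels" => "kernels.md",
        "Examples" => [
            "examples/memcopy.md"
            "examples/memcopy_static.md"
            "examples/naive_transpose.md"
            "examples/performance.md"
            "examples/matmul.md"
        ],
        "API" => "api.md",
        # "Extras" => [ ... ] continues past the end of the hunk
    ],
)
```
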
docs/src/examples/matmul.md (new file, 13 additions)

`````markdown
# Matmul

````@eval
using Markdown
using KernelAbstractions
path = joinpath(dirname(pathof(KernelAbstractions)), "..", "examples/matmul.jl")
Markdown.parse("""
```julia
$(read(path, String))
```
""")
````
`````
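
Each of these example pages uses a Documenter.jl `@eval` block: at documentation build time it locates the example's source file relative to the installed package and splices it into the page as a fenced `julia` block, so the rendered docs always track the code under `examples/`.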

docs/src/examples/memcopy_static.md (new file, 15 additions)

`````markdown
# Memcopy with static NDRange

Like the first example, this one simply copies memory from `A` to `B`. In contrast to the previous example, however, it uses a fully static kernel configuration, specializing the kernel on the iteration range itself.

````@eval
using Markdown
using KernelAbstractions
path = joinpath(dirname(pathof(KernelAbstractions)), "..", "examples/memcopy_static.jl")
Markdown.parse("""
```julia
$(read(path, String))
```
""")
````
`````
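
The "fully static" configuration referred to above fixes the iteration range when the kernel object is created, not just at call time. A minimal sketch of the contrast, reusing `copy_kernel!` from the example file shown further below (the `dynamic!` and `static!` names are ours):

```julia
using KernelAbstractions

@kernel function copy_kernel!(A, @Const(B))
    I = @index(Global)
    @inbounds A[I] = B[I]
end

A = zeros(128, 128)
B = ones(128, 128)

# Dynamic: only the workgroup size is fixed; the ndrange varies per call.
dynamic! = copy_kernel!(CPU(), 32)
wait(dynamic!(A, B, ndrange=size(A)))

# Static: the iteration range is baked into the kernel object, letting the
# compiler specialize on it, at the cost of recompiling if size(A) changes.
static! = copy_kernel!(CPU(), 32, size(A))
wait(static!(A, B, ndrange=size(A)))
```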

docs/src/examples/naive_transpose.md (new file, 12 additions)

`````markdown
# Naive Transpose

````@eval
using Markdown
using KernelAbstractions
path = joinpath(dirname(pathof(KernelAbstractions)), "..", "examples/naive_transpose.jl")
Markdown.parse("""
```julia
$(read(path, String))
```
""")
````
`````

docs/src/examples/performance.md (new file, 81 additions)

`````markdown
# Measuring performance

Run under the Nsight Compute CLI (`nv-nsight-cu-cli`):

```sh
nv-nsight-cu-cli --nvtx --profile-from-start=off --section=SpeedOfLight julia --project=examples examples/performance.jl
```

## Results

Collated results on a V100:

| Kernel          | Time   | Speed of Light Mem % |
| --------------- | ------ | -------------------- |
| naive (32, 32)  | 1.19ms | 65.06%               |
| naive (1024, 1) | 1.79ms | 56.13%               |
| naive (1, 1024) | 3.03ms | 60.02%               |

### Full output

```
==PROF== 0: Naive transpose (32, 32)
  Section: GPU Speed Of Light
  ---------------------------------------------- --------------- ---------------
  Memory Frequency                                cycle/usecond           878.88
  SOL FB                                          %                        38.16
  Elapsed Cycles                                  cycle                1,447,874
  SM Frequency                                    cycle/nsecond             1.23
  Memory [%]                                      %                        65.93
  Duration                                        msecond                   1.17
  SOL L2                                          %                        19.08
  SOL TEX                                         %                        66.19
  SM Active Cycles                                cycle             1,440,706.40
  SM [%]                                          %                        23.56
  ---------------------------------------------- --------------- ---------------

ptxcall___gpu_transpose_kernel_naive__430_2, 2020-Feb-20 22:42:24, Context 1, Stream 23

==PROF== 0: Naive transpose (1024, 1)
  Section: GPU Speed Of Light
  ---------------------------------------------- --------------- ---------------
  Memory Frequency                                cycle/usecond           877.69
  SOL FB                                          %                        22.40
  Elapsed Cycles                                  cycle                2,473,141
  SM Frequency                                    cycle/nsecond             1.23
  Memory [%]                                      %                        51.17
  Duration                                        msecond                   2.00
  SOL L2                                          %                        50.17
  SOL TEX                                         %                        51.27
  SM Active Cycles                                cycle             2,465,610.06
  SM [%]                                          %                        11.68
  ---------------------------------------------- --------------- ---------------

ptxcall___gpu_transpose_kernel_naive__430_3, 2020-Feb-20 22:42:28, Context 1, Stream 25

==PROF== 0: Naive transpose (1, 1024)
  Section: GPU Speed Of Light
  ---------------------------------------------- --------------- ---------------
  Memory Frequency                                cycle/usecond           876.69
  SOL FB                                          %                        17.88
  Elapsed Cycles                                  cycle                3,737,127
  SM Frequency                                    cycle/nsecond             1.24
  Memory [%]                                      %                        60.02
  Duration                                        msecond                   3.02
  SOL L2                                          %                        60.02
  SOL TEX                                         %                        45.65
  SM Active Cycles                                cycle             3,732,591.59
  SM [%]                                          %                        12.56
  ---------------------------------------------- --------------- ---------------
```

## Code

````@eval
using Markdown
using KernelAbstractions
path = joinpath(dirname(pathof(KernelAbstractions)), "..", "examples/performance.jl")
Markdown.parse("""
```julia
$(read(path, String))
```
""")
````
`````
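
The three configurations in the results table differ only in the workgroup shape the kernel is instantiated with. A hypothetical harness for such a sweep might look as follows; `transpose_kernel_naive!` is a stand-in for the kernel in `examples/performance.jl`, and timing with `Base.@elapsed` is our assumption (the numbers above come from Nsight Compute, not from Julia-side timing):

```julia
using KernelAbstractions
using CuArrays

# Stand-in for the naive transpose kernel in examples/performance.jl.
@kernel function transpose_kernel_naive!(b, @Const(a))
    I = @index(Global, Cartesian)
    @inbounds b[I[2], I[1]] = a[I[1], I[2]]
end

a = CuArrays.rand(Float32, 4096, 4096)
b = similar(a)

for groupsize in ((32, 32), (1024, 1), (1, 1024))
    kernel! = transpose_kernel_naive!(CUDA(), groupsize)
    wait(kernel!(b, a, ndrange=size(a)))  # warm-up launch forces compilation
    t = Base.@elapsed wait(kernel!(b, a, ndrange=size(a)))
    println("naive $groupsize: $(round(t * 1e3, digits=2)) ms")
end
```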

examples/Project.toml (new file, 7 additions)

```toml
[deps]
CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3"
CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde"
CUDAnative = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17"
CuArrays = "3a865a2d-5b23-5a0f-bc46-62713ec82fae"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
```
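
This project file pins the GPU stack of the time: CUDAapi for capability detection, CUDAdrv and CUDAnative for the driver and the Julia GPU compiler, and CuArrays for device arrays. It is the environment that the `julia --project=examples` invocation in the performance example activates.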

examples/matmul.jl (20 additions, 33 deletions)

```diff
@@ -5,17 +5,11 @@ if CUDAapi.has_cuda_gpu()
 end
 
 # Simple kernel for matrix multiplication
-@kernel function matmul!(a, b, c)
-    if size(a)[2] != size(b)[1]
-        # here, we need a CPU / GPU generic print statement, like...
-        # CUDAnative.@cuprintf("Matrix size mismatch!")
-        return nothing
-    end
+@kernel function matmul_kernel!(a, b, c)
     cI = @index(Global, Cartesian)
 
     # creating a temporary sum variable for matrix multiplication
-    tmp_sum = 0
-
+    tmp_sum = zero(eltype(c))
     for i = 1:size(a)[2]
         tmp_sum += a[cI[1],i] * b[i,cI[2]]
     end
@@ -24,44 +18,37 @@ end
 end
 
 # Creating a wrapper kernel for launching with error checks
-function launch_matmul!(a, b, c)
+function matmul!(a, b, c)
     if size(a)[2] != size(b)[1]
         println("Matrix size mismatch!")
         return nothing
     end
     if isa(a, Array)
-        kernel! = matmul!(CPU(),4)
+        kernel! = matmul_kernel!(CPU(),4)
     else
-        kernel! = matmul!(CUDA(),256)
+        kernel! = matmul_kernel!(CUDA(),256)
     end
     kernel!(a, b, c, ndrange=size(c))
 end
 
-function check()
-    a = rand(256,123)
-    b = rand(123, 45)
-    c = zeros(256, 45)
+a = rand(256,123)
+b = rand(123, 45)
+c = zeros(256, 45)
 
-    # beginning CPU tests, returns event
-    ev = launch_matmul!(a,b,c)
-    wait(ev)
+# beginning CPU tests, returns event
+ev = matmul!(a,b,c)
+wait(ev)
 
-    println("Testing CPU matrix multiplication...")
-    @test isapprox(a*b, c)
+@test isapprox(c, a*b)
 
-    # beginning GPU tests
-    if has_cuda_gpu()
-        d_a = CuArray(a)
-        d_b = CuArray(b)
-        d_c = CuArray(c)
+# beginning GPU tests
+if has_cuda_gpu()
+    d_a = CuArray(a)
+    d_b = CuArray(b)
+    d_c = CuArray(c)
 
-        ev = launch_matmul!(d_a, d_b, d_c)
-        wait(ev)
-        c = a*b
+    ev = matmul!(d_a, d_b, d_c)
+    wait(ev)
 
-        println("Testing GPU matrix multiplication...")
-        @test isapprox(Array(d_c), c)
-    end
+    @test isapprox(Array(d_c), a*b)
 end
-
-check()
```
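
Two changes here go beyond cleanup: the kernel itself is renamed to `matmul_kernel!` so the public name `matmul!` can go to the wrapper that checks sizes and selects a backend, and the accumulator is now initialized with `zero(eltype(c))` instead of the integer literal `0`, which keeps the accumulation type stable (an integer `0` would change type on the first floating-point addition, an instability that is especially costly inside GPU kernels).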

examples/memcopy.jl (0 additions, 21 deletions)

```diff
@@ -13,22 +13,12 @@ function mycopy!(A::Array, B::Array)
     kernel(A, B, ndrange=length(A))
 end
 
-function mycopy_static!(A::Array, B::Array)
-    @assert size(A) == size(B)
-    kernel = copy_kernel!(CPU(), 32, size(A)) # if size(A) varies this will cause recompilation
-    kernel(A, B, ndrange=size(A))
-end
-
 A = zeros(128, 128)
 B = ones(128, 128)
 event = mycopy!(A, B)
 wait(event)
 @test A == B
 
-A = zeros(128, 128)
-event = mycopy_static!(A, B)
-wait(event)
-@test A == B
 
 if has_cuda_gpu()
     using CuArrays
@@ -38,20 +28,9 @@ if has_cuda_gpu()
         copy_kernel!(CUDA(), 256)(A, B, ndrange=length(A))
     end
 
-    function mycopy_static!(A::CuArray, B::CuArray)
-        @assert size(A) == size(B)
-        kernel = copy_kernel!(CUDA(), 32, size(A)) # if size(A) varies this will cause recompilation
-        kernel(A, B, ndrange=size(A))
-    end
-
    A = CuArray{Float32}(undef, 1024)
    B = CuArrays.ones(Float32, 1024)
    event = mycopy!(A, B)
    wait(event)
    @test A == B
-
-    A = CuArray{Float32}(undef, 1024)
-    event = mycopy_static!(A, B)
-    wait(event)
-    @test A == B
 end
```
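
The static-NDRange variants removed here are not deleted outright; they move, essentially unchanged, into the new standalone `examples/memcopy_static.jl` below.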

examples/memcopy_static.jl (new file, 36 additions)

```julia
using KernelAbstractions
using CUDAapi
using Test

@kernel function copy_kernel!(A, @Const(B))
    I = @index(Global)
    @inbounds A[I] = B[I]
end

function mycopy_static!(A::Array, B::Array)
    @assert size(A) == size(B)
    kernel = copy_kernel!(CPU(), 32, size(A)) # if size(A) varies this will cause recompilation
    kernel(A, B, ndrange=size(A))
end

A = zeros(128, 128)
B = ones(128, 128)
event = mycopy_static!(A, B)
wait(event)
@test A == B

if has_cuda_gpu()
    using CuArrays

    function mycopy_static!(A::CuArray, B::CuArray)
        @assert size(A) == size(B)
        kernel = copy_kernel!(CUDA(), 32, size(A)) # if size(A) varies this will cause recompilation
        kernel(A, B, ndrange=size(A))
    end

    A = CuArray{Float32}(undef, 1024)
    B = CuArrays.ones(Float32, 1024)
    event = mycopy_static!(A, B)
    wait(event)
    @test A == B
end
```
