Commit efcfe39

Replace the shared memory macros with regular functions.
1 parent 2b2d030 commit efcfe39

14 files changed: 82 additions, 70 deletions
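
For downstream users the change is a mechanical rename: the deprecated macros now forward to the new functions, so existing code keeps working but emits a deprecation warning. A minimal before/after sketch inside a kernel (the argument values are illustrative, not taken from this commit):

```julia
# before (deprecated, still works via the forwarding macros):
shared = @cuStaticSharedMem(Float32, (16, 16))
buffer = @cuDynamicSharedMem(Float32, blockDim().x)

# after:
shared = CuStaticSharedArray(Float32, (16, 16))
buffer = CuDynamicSharedArray(Float32, blockDim().x)
```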

docs/src/api/kernel.md
Lines changed: 2 additions & 2 deletions

@@ -35,8 +35,8 @@ CUDA.Const
 ### Shared memory
 
 ```@docs
-@cuStaticSharedMem
-@cuDynamicSharedMem
+CuStaticSharedArray
+CuDynamicSharedArray
 ```
 
 ### Texture memory

examples/pairwise.jl
Lines changed: 1 addition & 1 deletion

@@ -52,7 +52,7 @@ function pairwise_dist_kernel(lat::CuDeviceVector{Float32}, lon::CuDeviceVector{
 
     if i <= n && j <= n
         # store to shared memory
-        shmem = @cuDynamicSharedMem(Float32, 2*blockDim().x + 2*blockDim().y)
+        shmem = CuDynamicSharedArray(Float32, 2*blockDim().x + 2*blockDim().y)
         if threadIdx().y == 1
             shmem[threadIdx().x] = lat[i]
             shmem[blockDim().x + threadIdx().x] = lon[i]

perf/volumerhs.jl
Lines changed: 3 additions & 3 deletions

@@ -91,9 +91,9 @@ function volumerhs!(rhs, Q, vgeo, gravity, D, nelem)
 
     Nq = N + 1
 
-    s_D = @cuStaticSharedMem eltype(D) (Nq, Nq)
-    s_F = @cuStaticSharedMem eltype(Q) (Nq, Nq, _nstate)
-    s_G = @cuStaticSharedMem eltype(Q) (Nq, Nq, _nstate)
+    s_D = CuStaticSharedArray(eltype(D), (Nq, Nq))
+    s_F = CuStaticSharedArray(eltype(Q), (Nq, Nq, _nstate))
+    s_G = CuStaticSharedArray(eltype(Q), (Nq, Nq, _nstate))
 
     r_rhsρ = MArray{Tuple{Nq}, eltype(rhs)}(undef)
     r_rhsU = MArray{Tuple{Nq}, eltype(rhs)}(undef)

src/accumulate.jl
Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     thread = threadIdx().x
     block = blockIdx().x
 
-    temp = @cuDynamicSharedMem(T, (2*threads,))
+    temp = CuDynamicSharedArray(T, (2*threads,))
 
     # iterate the main dimension using threads and the first block dimension
     i = (blockIdx().x-1) * blockDim().x + threadIdx().x

src/device/intrinsics/memory_shared.jl
Lines changed: 22 additions & 10 deletions

@@ -1,25 +1,32 @@
 # Shared Memory (part of B.2)
 
-export @cuStaticSharedMem, @cuDynamicSharedMem
+export @cuStaticSharedMem, @cuDynamicSharedMem, CuStaticSharedArray, CuDynamicSharedArray
 
 """
-    @cuStaticSharedMem(T::Type, dims) -> CuDeviceArray{T,AS.Shared}
+    CuStaticSharedArray(T::Type, dims) -> CuDeviceArray{T,AS.Shared}
 
 Get an array of type `T` and dimensions `dims` (either an integer length or tuple shape)
 pointing to a statically-allocated piece of shared memory. The type should be statically
 inferable and the dimensions should be constant, or an error will be thrown and the
 generator function will be called dynamically.
 """
+@inline function CuStaticSharedArray(::Type{T}, dims) where {T}
+    len = prod(dims)
+    # NOTE: this relies on const-prop to forward the literal length to the generator.
+    # maybe we should include the size in the type, like StaticArrays does?
+    ptr = emit_shmem(T, Val(len))
+    CuDeviceArray(dims, ptr)
+end
+
 macro cuStaticSharedMem(T, dims)
+    Base.depwarn("@cuStaticSharedMem is deprecated, please use the CuStaticSharedArray function", :CuStaticSharedArray)
     quote
-        len = prod($(esc(dims)))
-        ptr = emit_shmem($(esc(T)), Val(len))
-        CuDeviceArray($(esc(dims)), ptr)
+        CuStaticSharedArray($(esc(T)), $(esc(dims)))
     end
 end
 
 """
-    @cuDynamicSharedMem(T::Type, dims, offset::Integer=0) -> CuDeviceArray{T,AS.Shared}
+    CuDynamicSharedArray(T::Type, dims, offset::Integer=0) -> CuDeviceArray{T,AS.Shared}
 
 Get an array of type `T` and dimensions `dims` (either an integer length or tuple shape)
 pointing to a dynamically-allocated piece of shared memory. The type should be statically
@@ -31,12 +38,17 @@ Optionally, an offset parameter indicating how many bytes to add to the base sha
 pointer can be specified. This is useful when dealing with a heterogeneous buffer of dynamic
 shared memory; in the case of a homogeneous multi-part buffer it is preferred to use `view`.
 """
-macro cuDynamicSharedMem(T, dims, offset=0)
+@inline function CuDynamicSharedArray(::Type{T}, dims, offset=0) where {T}
+    len = prod(dims)
+    ptr = emit_shmem(T) + offset
     # TODO: boundscheck against %dynamic_smem_size (currently unsupported by LLVM)
+    CuDeviceArray(dims, ptr)
+end
+
+macro cuDynamicSharedMem(T, dims, offset=0)
+    Base.depwarn("@cuDynamicSharedMem is deprecated, please use the CuDynamicSharedArray function", :CuDynamicSharedArray)
     quote
-        len = prod($(esc(dims)))
-        ptr = emit_shmem($(esc(T))) + $(esc(offset))
-        CuDeviceArray($(esc(dims)), ptr)
+        CuDynamicSharedArray($(esc(T)), $(esc(dims)), $(esc(offset)))
     end
 end
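
As a usage sketch (not part of this diff): the new functions are called like any other device function inside a kernel, and dynamic shared memory is still sized at launch time through the `shmem` keyword of `@cuda`. The kernel below is hypothetical, written against the API defined above:

```julia
using CUDA

# reverse a block-sized vector through dynamic shared memory
function reverse_kernel(a)
    i = threadIdx().x
    n = blockDim().x
    buf = CuDynamicSharedArray(eltype(a), n)  # backed by the `shmem` launch argument
    buf[i] = a[i]
    sync_threads()
    a[i] = buf[n - i + 1]
    return
end

a = CUDA.rand(Float32, 256)
@cuda threads=length(a) shmem=sizeof(a) reverse_kernel(a)
```

Note the trade-off flagged in the diff's NOTE comment: the macro interpolated the dimensions syntactically, while the function relies on constant propagation to forward the literal length to the generator.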

src/mapreduce.jl
Lines changed: 2 additions & 2 deletions

@@ -20,7 +20,7 @@ end
 @inline function reduce_block(op, val::T, neutral, shuffle::Val{true}) where T
     # shared mem for partial sums
     assume(warpsize() == 32)
-    shared = @cuStaticSharedMem(T, 32)
+    shared = CuStaticSharedArray(T, 32)
 
     wid, lane = fldmod1(threadIdx().x, warpsize())
 
@@ -54,7 +54,7 @@ end
     thread = threadIdx().x
 
     # shared mem for a complete reduction
-    shared = @cuDynamicSharedMem(T, (threads,))
+    shared = CuDynamicSharedArray(T, (threads,))
     @inbounds shared[thread] = val
 
     # perform a reduction

src/sorting.jl
Lines changed: 4 additions & 4 deletions

@@ -102,8 +102,8 @@ from `lo` to `hi` of `values`.
 """
 function partition_batches_kernel(values::AbstractArray{T}, pivot, lo, hi, parity, lt::F1,
                                   by::F2) where {T,F1,F2}
-    sums = @cuDynamicSharedMem(Int, blockDim().x)
-    swap = @cuDynamicSharedMem(T, blockDim().x, sizeof(sums))
+    sums = CuDynamicSharedArray(Int, blockDim().x)
+    swap = CuDynamicSharedArray(T, blockDim().x, sizeof(sums))
     batch_partition(values, pivot, swap, sums, lo, hi, parity, lt, by)
     return
 end
@@ -375,8 +375,8 @@ early end to recursion if we started `stuck` at 0.
 """
 function qsort_kernel(vals::AbstractArray{T,N}, lo, hi, parity, sync::Val{S}, sync_depth,
                       prev_pivot, lt::F1, by::F2, ::Val{dims}, partial=nothing, stuck=-1) where {T, N, S, F1, F2, dims}
-    b_sums = @cuDynamicSharedMem(Int, blockDim().x)
-    swap = @cuDynamicSharedMem(T, blockDim().x, sizeof(b_sums))
+    b_sums = CuDynamicSharedArray(Int, blockDim().x)
+    swap = CuDynamicSharedArray(T, blockDim().x, sizeof(b_sums))
     shmem = sizeof(b_sums) + sizeof(swap)
     L = hi - lo
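
These two allocations exercise the `offset` parameter described in the new `CuDynamicSharedArray` docstring: both arrays live in the same dynamic shared-memory buffer, with the second placed `sizeof(sums)` bytes past the base pointer, so the launch must request the combined size. A hedged sketch of that pattern (the kernel and sizes here are hypothetical, not from this commit):

```julia
function kernel(n)
    sums = CuDynamicSharedArray(Int, n)                    # bytes 0 ... n*sizeof(Int)-1
    swap = CuDynamicSharedArray(Float32, n, sizeof(sums))  # placed right after `sums`
    # ... partition work using `sums` and `swap` ...
    return
end

n = 128
@cuda threads=n shmem=n*(sizeof(Int) + sizeof(Float32)) kernel(n)
```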

test/codegen.jl
Lines changed: 2 additions & 2 deletions

@@ -2,7 +2,7 @@
 
 @testset "JuliaLang/julia#21121" begin
     function foobar()
-        weight_matrix = @cuStaticSharedMem(Float32, (16, 16))
+        weight_matrix = CuStaticSharedArray(Float32, (16, 16))
         sync_threads()
         weight_matrix[1, 16] *= 2
         sync_threads()
@@ -75,7 +75,7 @@ end
     @inbounds function kernel(input, output, n)
         i = threadIdx().x
 
-        temp = @cuStaticSharedMem(Int, 1)
+        temp = CuStaticSharedArray(Int, 1)
         if i == 1
             1 <= n || throw_some()
             temp[1] = input

test/device/array.jl
Lines changed: 3 additions & 3 deletions

@@ -141,7 +141,7 @@ end
 
 
 function kernel_shmem_reinterpet_equal_size!(y)
-    a = @cuDynamicSharedMem(Float32, (blockDim().x,))
+    a = CuDynamicSharedArray(Float32, (blockDim().x,))
     b = reinterpret(UInt32, a)
     a[threadIdx().x] = threadIdx().x
     b[threadIdx().x] += 1
@@ -172,7 +172,7 @@ end
 end
 
 function kernel_shmem_reinterpet_smaller_size!(y)
-    a = @cuDynamicSharedMem(UInt128, (blockDim().x,))
+    a = CuDynamicSharedArray(UInt128, (blockDim().x,))
     i32 = Int32(threadIdx().x)
     p = i32 + i32 * im
     q = i32 - i32 * im
@@ -209,7 +209,7 @@ end
 end
 
 function kernel_shmem_reinterpet_larger_size!(y)
-    a = @cuDynamicSharedMem(Float32, (4 * blockDim().x,))
+    a = CuDynamicSharedArray(Float32, (4 * blockDim().x,))
     b = reinterpret(UInt128, a)
     a[1 + 4 * (threadIdx().x - 1)] = threadIdx().x
     a[2 + 4 * (threadIdx().x - 1)] = threadIdx().x * 2

test/device/intrinsics/atomics.jl
Lines changed: 3 additions & 3 deletions

@@ -194,7 +194,7 @@ end
 
 @testset "shared memory" begin
     function kernel()
-        shared = @cuStaticSharedMem(Float32, 1)
+        shared = CuStaticSharedArray(Float32, 1)
         @atomic shared[threadIdx().x] += 0f0
         return
     end
@@ -425,7 +425,7 @@
 # https://github.com/JuliaGPU/CUDA.jl/issues/311
 
 function kernel(a)
-    b = CUDA.@cuStaticSharedMem(Int, 1)
+    b = CUDA.CuStaticSharedArray(Int, 1)
 
     if threadIdx().x == 1
         b[] = a[]
@@ -452,7 +452,7 @@
 
 function kernel()
     tid = threadIdx().x
-    shared = @cuStaticSharedMem(Float32, 4)
+    shared = CuStaticSharedArray(Float32, 4)
     CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])
     sync_threads()
     CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])
