Merge pull request #1114 from JuliaGPU/tb/shmem

maleadt · web-flow · commit afe81794038d · 2021-08-26T13:58:25.000+02:00
Remove the hacky unique'ing of shmem GVs.
diff --git a/docs/src/api/kernel.md b/docs/src/api/kernel.md
@@ -35,8 +35,8 @@ CUDA.Const
 ### Shared memory
 
 ```@docs
-@cuStaticSharedMem
-@cuDynamicSharedMem
+CuStaticSharedArray
+CuDynamicSharedArray
 ```
 
 ### Texture memory
diff --git a/examples/pairwise.jl b/examples/pairwise.jl
@@ -52,7 +52,7 @@ function pairwise_dist_kernel(lat::CuDeviceVector{Float32}, lon::CuDeviceVector{
 
     if i <= n && j <= n
         # store to shared memory
-        shmem = @cuDynamicSharedMem(Float32, 2*blockDim().x + 2*blockDim().y)
+        shmem = CuDynamicSharedArray(Float32, 2*blockDim().x + 2*blockDim().y)
         if threadIdx().y == 1
             shmem[threadIdx().x] = lat[i]
             shmem[blockDim().x + threadIdx().x] = lon[i]
diff --git a/perf/volumerhs.jl b/perf/volumerhs.jl
@@ -91,9 +91,9 @@ function volumerhs!(rhs, Q, vgeo, gravity, D, nelem)
 
     Nq = N + 1
 
-    s_D = @cuStaticSharedMem eltype(D) (Nq, Nq)
-    s_F = @cuStaticSharedMem eltype(Q) (Nq, Nq, _nstate)
-    s_G = @cuStaticSharedMem eltype(Q) (Nq, Nq, _nstate)
+    s_D = CuStaticSharedArray(eltype(D), (Nq, Nq))
+    s_F = CuStaticSharedArray(eltype(Q), (Nq, Nq, _nstate))
+    s_G = CuStaticSharedArray(eltype(Q), (Nq, Nq, _nstate))
 
     r_rhsρ = MArray{Tuple{Nq}, eltype(rhs)}(undef)
     r_rhsU = MArray{Tuple{Nq}, eltype(rhs)}(undef)
diff --git a/src/accumulate.jl b/src/accumulate.jl
@@ -19,7 +19,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     thread = threadIdx().x
     block = blockIdx().x
 
-    temp = @cuDynamicSharedMem(T, (2*threads,))
+    temp = CuDynamicSharedArray(T, (2*threads,))
 
     # iterate the main dimension using threads and the first block dimension
     i = (blockIdx().x-1) * blockDim().x + threadIdx().x
diff --git a/src/device/intrinsics/memory_shared.jl b/src/device/intrinsics/memory_shared.jl
@@ -1,33 +1,32 @@
 # Shared Memory (part of B.2)
 
-export @cuStaticSharedMem, @cuDynamicSharedMem
-
-shmem_id = 0
+export @cuStaticSharedMem, @cuDynamicSharedMem, CuStaticSharedArray, CuDynamicSharedArray
 
 """
-    @cuStaticSharedMem(T::Type, dims) -> CuDeviceArray{T,AS.Shared}
+    CuStaticSharedArray(T::Type, dims) -> CuDeviceArray{T,AS.Shared}
 
 Get an array of type `T` and dimensions `dims` (either an integer length or tuple shape)
 pointing to a statically-allocated piece of shared memory. The type should be statically
 inferable and the dimensions should be constant, or an error will be thrown and the
 generator function will be called dynamically.
 """
-macro cuStaticSharedMem(T, dims)
-    # FIXME: generating a unique id in the macro is incorrect, as multiple parametrically typed
-    #        functions will alias the id (and the size might be a parameter). but incrementing in
-    #        the @generated function doesn't work, as it is supposed to be pure and identical
-    #        invocations will erroneously share (and even cause multiple shmem globals).
-    id = gensym("static_shmem")
+@inline function CuStaticSharedArray(::Type{T}, dims) where {T}
+    len = prod(dims)
+    # NOTE: `len` reaches the generator only as a `Val` parameter, so this relies on const-prop
+    #       of `dims`. maybe we should include the size in the type, like StaticArrays does?
+    ptr = emit_shmem(T, Val(len))
+    CuDeviceArray(dims, ptr)
+end
 
+macro cuStaticSharedMem(T, dims)
+    Base.depwarn("@cuStaticSharedMem is deprecated, please use the CuStaticSharedArray function", Symbol("@cuStaticSharedMem"))
     quote
-        len = prod($(esc(dims)))
-        ptr = emit_shmem(Val($(QuoteNode(id))), $(esc(T)), Val(len))
-        CuDeviceArray($(esc(dims)), ptr)
+        CuStaticSharedArray($(esc(T)), $(esc(dims)))
     end
 end
 
 """
-    @cuDynamicSharedMem(T::Type, dims, offset::Integer=0) -> CuDeviceArray{T,AS.Shared}
+    CuDynamicSharedArray(T::Type, dims, offset::Integer=0) -> CuDeviceArray{T,AS.Shared}
 
 Get an array of type `T` and dimensions `dims` (either an integer length or tuple shape)
 pointing to a dynamically-allocated piece of shared memory. The type should be statically
@@ -39,20 +38,26 @@ Optionally, an offset parameter indicating how many bytes to add to the base sha
 pointer can be specified. This is useful when dealing with a heterogeneous buffer of dynamic
 shared memory; in the case of a homogeneous multi-part buffer it is preferred to use `view`.
 """
-macro cuDynamicSharedMem(T, dims, offset=0)
-    id = gensym("dynamic_shmem")
-
-    # TODO: boundscheck against %dynamic_smem_size (currently unsupported by LLVM)
+@inline function CuDynamicSharedArray(::Type{T}, dims, offset=0) where {T}
+    len = prod(dims)
+    @boundscheck if offset + len*sizeof(T) > dynamic_smem_size()  # compare bytes: `offset` is a byte count
+        throw(BoundsError())
+    end
+    ptr = emit_shmem(T) + offset
+    CuDeviceArray(dims, ptr)
+end
 
+macro cuDynamicSharedMem(T, dims, offset=0)
+    Base.depwarn("@cuDynamicSharedMem is deprecated, please use the CuDynamicSharedArray function", Symbol("@cuDynamicSharedMem"))
     quote
-        len = prod($(esc(dims)))
-        ptr = emit_shmem(Val($(QuoteNode(id))), $(esc(T))) + $(esc(offset))
-        CuDeviceArray($(esc(dims)), ptr)
+        CuDynamicSharedArray($(esc(T)), $(esc(dims)), $(esc(offset)))
     end
 end
 
+dynamic_smem_size() = @asmcall("mov.u32 \$0, %dynamic_smem_size;", "=r", true, UInt32, Tuple{})  # read the PTX special register %dynamic_smem_size as a UInt32
+
 # get a pointer to shared memory, with known (static) or zero length (dynamic shared memory)
-@generated function emit_shmem(::Val{id}, ::Type{T}, ::Val{len}=Val(0)) where {id,T,len}
+@generated function emit_shmem(::Type{T}, ::Val{len}=Val(0)) where {T,len}
     Context() do ctx
         eltyp = convert(LLVMType, T; ctx)
         T_ptr = convert(LLVMType, LLVMPtr{T,AS.Shared}; ctx)
@@ -63,7 +68,7 @@ end
         # create the global variable
         mod = LLVM.parent(llvm_f)
         gv_typ = LLVM.ArrayType(eltyp, len)
-        gv = GlobalVariable(mod, gv_typ, GPUCompiler.safe_name(string(id)), AS.Shared)
+        gv = GlobalVariable(mod, gv_typ, "shmem", AS.Shared)
         if len > 0
             # static shared memory should be demoted to local variables, whenever possible.
             # this is done by the NVPTX ASM printer:
diff --git a/src/mapreduce.jl b/src/mapreduce.jl
@@ -20,7 +20,7 @@ end
 @inline function reduce_block(op, val::T, neutral, shuffle::Val{true}) where T
     # shared mem for partial sums
     assume(warpsize() == 32)
-    shared = @cuStaticSharedMem(T, 32)
+    shared = CuStaticSharedArray(T, 32)
 
     wid, lane = fldmod1(threadIdx().x, warpsize())
 
@@ -54,7 +54,7 @@ end
     thread = threadIdx().x
 
     # shared mem for a complete reduction
-    shared = @cuDynamicSharedMem(T, (threads,))
+    shared = CuDynamicSharedArray(T, (threads,))
     @inbounds shared[thread] = val
 
     # perform a reduction
diff --git a/src/sorting.jl b/src/sorting.jl
@@ -102,8 +102,8 @@ from `lo` to `hi` of `values`.
 """
 function partition_batches_kernel(values::AbstractArray{T}, pivot, lo, hi, parity, lt::F1,
                                   by::F2) where {T,F1,F2}
-    sums = @cuDynamicSharedMem(Int, blockDim().x)
-    swap = @cuDynamicSharedMem(T, blockDim().x, sizeof(sums))
+    sums = CuDynamicSharedArray(Int, blockDim().x)
+    swap = CuDynamicSharedArray(T, blockDim().x, sizeof(sums))
     batch_partition(values, pivot, swap, sums, lo, hi, parity, lt, by)
     return
 end
@@ -375,8 +375,8 @@ early end to recursion if we started `stuck` at 0.
 """
 function qsort_kernel(vals::AbstractArray{T,N}, lo, hi, parity, sync::Val{S}, sync_depth,
                       prev_pivot, lt::F1, by::F2, ::Val{dims}, partial=nothing, stuck=-1) where {T, N, S, F1, F2, dims}
-    b_sums = @cuDynamicSharedMem(Int, blockDim().x)
-    swap = @cuDynamicSharedMem(T, blockDim().x, sizeof(b_sums))
+    b_sums = CuDynamicSharedArray(Int, blockDim().x)
+    swap = CuDynamicSharedArray(T, blockDim().x, sizeof(b_sums))
     shmem = sizeof(b_sums) + sizeof(swap)
     L = hi - lo
 
diff --git a/test/codegen.jl b/test/codegen.jl
@@ -2,7 +2,7 @@
 
 @testset "JuliaLang/julia#21121" begin
     function foobar()
-        weight_matrix = @cuStaticSharedMem(Float32, (16, 16))
+        weight_matrix = CuStaticSharedArray(Float32, (16, 16))
         sync_threads()
         weight_matrix[1, 16] *= 2
         sync_threads()
@@ -75,7 +75,7 @@ end
     @inbounds function kernel(input, output, n)
         i = threadIdx().x
 
-        temp = @cuStaticSharedMem(Int, 1)
+        temp = CuStaticSharedArray(Int, 1)
         if i == 1
             1 <= n || throw_some()
             temp[1] = input
diff --git a/test/device/array.jl b/test/device/array.jl
@@ -141,7 +141,7 @@ end
 
 
 function kernel_shmem_reinterpet_equal_size!(y)
-  a = @cuDynamicSharedMem(Float32, (blockDim().x,))
+  a = CuDynamicSharedArray(Float32, (blockDim().x,))
   b = reinterpret(UInt32, a)
   a[threadIdx().x] = threadIdx().x
   b[threadIdx().x] += 1
@@ -172,7 +172,7 @@ end
 end
 
 function kernel_shmem_reinterpet_smaller_size!(y)
-  a = @cuDynamicSharedMem(UInt128, (blockDim().x,))
+  a = CuDynamicSharedArray(UInt128, (blockDim().x,))
   i32 = Int32(threadIdx().x)
   p = i32 + i32 * im
   q = i32 - i32 * im
@@ -209,7 +209,7 @@ end
 end
 
 function kernel_shmem_reinterpet_larger_size!(y)
-  a = @cuDynamicSharedMem(Float32, (4 * blockDim().x,))
+  a = CuDynamicSharedArray(Float32, (4 * blockDim().x,))
   b = reinterpret(UInt128, a)
   a[1 + 4 * (threadIdx().x - 1)] = threadIdx().x
   a[2 + 4 * (threadIdx().x - 1)] = threadIdx().x * 2
diff --git a/test/device/intrinsics/atomics.jl b/test/device/intrinsics/atomics.jl
@@ -194,7 +194,7 @@ end
 
 @testset "shared memory" begin
     function kernel()
-        shared = @cuStaticSharedMem(Float32, 1)
+        shared = CuStaticSharedArray(Float32, 1)
         @atomic shared[threadIdx().x] += 0f0
         return
     end
@@ -425,7 +425,7 @@ end
     # https://github.com/JuliaGPU/CUDA.jl/issues/311
 
     function kernel(a)
-        b = CUDA.@cuStaticSharedMem(Int, 1)
+        b = CUDA.CuStaticSharedArray(Int, 1)
 
         if threadIdx().x == 1
             b[] = a[]
@@ -452,7 +452,7 @@ end
 
     function kernel()
         tid = threadIdx().x
-        shared = @cuStaticSharedMem(Float32, 4)
+        shared = CuStaticSharedArray(Float32, 4)
         CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])
         sync_threads()
         CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])
diff --git a/test/device/intrinsics/memory.jl b/test/device/intrinsics/memory.jl
@@ -11,28 +11,28 @@ n = 256
 
 @testset "constructors" begin
     # static
-    @on_device @cuStaticSharedMem(Float32, 1)
-    @on_device @cuStaticSharedMem(Float32, (1,2))
-    @on_device @cuStaticSharedMem(Tuple{Float32, Float32}, 1)
-    @on_device @cuStaticSharedMem(Tuple{Float32, Float32}, (1,2))
-    @on_device @cuStaticSharedMem(Tuple{RGB{Float32}, UInt32}, 1)
-    @on_device @cuStaticSharedMem(Tuple{RGB{Float32}, UInt32}, (1,2))
+    @on_device CuStaticSharedArray(Float32, 1)
+    @on_device CuStaticSharedArray(Float32, (1,2))
+    @on_device CuStaticSharedArray(Tuple{Float32, Float32}, 1)
+    @on_device CuStaticSharedArray(Tuple{Float32, Float32}, (1,2))
+    @on_device CuStaticSharedArray(Tuple{RGB{Float32}, UInt32}, 1)
+    @on_device CuStaticSharedArray(Tuple{RGB{Float32}, UInt32}, (1,2))
 
     # dynamic
-    @on_device @cuDynamicSharedMem(Float32, 1)
-    @on_device @cuDynamicSharedMem(Float32, (1, 2))
-    @on_device @cuDynamicSharedMem(Tuple{Float32, Float32}, 1)
-    @on_device @cuDynamicSharedMem(Tuple{Float32, Float32}, (1,2))
-    @on_device @cuDynamicSharedMem(Tuple{RGB{Float32}, UInt32}, 1)
-    @on_device @cuDynamicSharedMem(Tuple{RGB{Float32}, UInt32}, (1,2))
+    @on_device shmem=sizeof(Float32) CuDynamicSharedArray(Float32, 1)
+    @on_device shmem=2*sizeof(Float32) CuDynamicSharedArray(Float32, (1, 2))
+    @on_device shmem=sizeof(Tuple{Float32, Float32}) CuDynamicSharedArray(Tuple{Float32, Float32}, 1)
+    @on_device shmem=2*sizeof(Tuple{Float32, Float32}) CuDynamicSharedArray(Tuple{Float32, Float32}, (1,2))
+    @on_device shmem=sizeof(Tuple{RGB{Float32}, UInt32}) CuDynamicSharedArray(Tuple{RGB{Float32}, UInt32}, 1)
+    @on_device shmem=2*sizeof(Tuple{RGB{Float32}, UInt32}) CuDynamicSharedArray(Tuple{RGB{Float32}, UInt32}, (1,2))
 
     # dynamic with offset
-    @on_device @cuDynamicSharedMem(Float32, 1, 8)
-    @on_device @cuDynamicSharedMem(Float32, (1,2), 8)
-    @on_device @cuDynamicSharedMem(Tuple{Float32, Float32}, 1, 8)
-    @on_device @cuDynamicSharedMem(Tuple{Float32, Float32}, (1,2), 8)
-    @on_device @cuDynamicSharedMem(Tuple{RGB{Float32}, UInt32}, 1, 8)
-    @on_device @cuDynamicSharedMem(Tuple{RGB{Float32}, UInt32}, (1,2), 8)
+    @on_device shmem=sizeof(Float32)+8 CuDynamicSharedArray(Float32, 1, 8)
+    @on_device shmem=2*sizeof(Float32)+8 CuDynamicSharedArray(Float32, (1,2), 8)
+    @on_device shmem=sizeof(Tuple{Float32, Float32})+8 CuDynamicSharedArray(Tuple{Float32, Float32}, 1, 8)
+    @on_device shmem=2*sizeof(Tuple{Float32, Float32})+8 CuDynamicSharedArray(Tuple{Float32, Float32}, (1,2), 8)
+    @on_device shmem=sizeof(Tuple{RGB{Float32}, UInt32})+8 CuDynamicSharedArray(Tuple{RGB{Float32}, UInt32}, 1, 8)
+    @on_device shmem=2*sizeof(Tuple{RGB{Float32}, UInt32})+8 CuDynamicSharedArray(Tuple{RGB{Float32}, UInt32}, (1,2), 8)
 end
 
 
@@ -43,7 +43,7 @@ end
         t = threadIdx().x
         tr = n-t+1
 
-        s = @cuDynamicSharedMem(Float32, n)
+        s = CuDynamicSharedArray(Float32, n)
         s[t] = d[t]
         sync_threads()
         d[t] = s[tr]
@@ -64,7 +64,7 @@ end
             t = threadIdx().x
             tr = n-t+1
 
-            s = @cuDynamicSharedMem(T, n)
+            s = CuDynamicSharedArray(T, n)
             s[t] = d[t]
             sync_threads()
             d[t] = s[tr]
@@ -83,7 +83,7 @@ end
 @testset "alignment" begin
     # bug: used to generate align=12, which is invalid (non pow2)
     function kernel(v0::T, n) where {T}
-        shared = @cuDynamicSharedMem(T, n)
+        shared = CuDynamicSharedArray(T, n)
         @inbounds shared[Cuint(1)] = v0
         return
     end
@@ -103,8 +103,8 @@ end
         t = threadIdx().x
         tr = n-t+1
 
-        s = @cuStaticSharedMem(Float32, 1024)
-        s2 = @cuStaticSharedMem(Float32, 1024)  # catch aliasing
+        s = CuStaticSharedArray(Float32, 1024)
+        s2 = CuStaticSharedArray(Float32, 1024)  # catch aliasing
 
         s[t] = d[t]
         s2[t] = 2*d[t]
@@ -127,8 +127,8 @@ end
             t = threadIdx().x
             tr = n-t+1
 
-            s = @cuStaticSharedMem(T, 1024)
-            s2 = @cuStaticSharedMem(T, 1024)  # catch aliasing
+            s = CuStaticSharedArray(T, 1024)
+            s2 = CuStaticSharedArray(T, 1024)  # catch aliasing
 
             s[t] = d[t]
             s2[t] = d[t]
@@ -149,7 +149,7 @@ end
 @testset "alignment" begin
     # bug: used to generate align=12, which is invalid (non pow2)
     function kernel(v0::T) where {T}
-        shared = CUDA.@cuStaticSharedMem(T, 32)
+        shared = CUDA.CuStaticSharedArray(T, 32)
         @inbounds shared[Cuint(1)] = v0
         return
     end
@@ -169,7 +169,7 @@ end
         t = threadIdx().x
         tr = n-t+1
 
-        s = @cuDynamicSharedMem(eltype(a), 2*n)
+        s = CuDynamicSharedArray(eltype(a), 2*n)
 
         sa = view(s, 1:n)
         sa[t] = a[t]
@@ -202,12 +202,12 @@ end
         t = threadIdx().x
         tr = n-t+1
 
-        sa = @cuDynamicSharedMem(eltype(a), n)
+        sa = CuDynamicSharedArray(eltype(a), n)
         sa[t] = a[t]
         sync_threads()
         a[t] = sa[tr]
 
-        sb = @cuDynamicSharedMem(eltype(b), n, n*sizeof(eltype(a)))
+        sb = CuDynamicSharedArray(eltype(b), n, n*sizeof(eltype(a)))
         sb[t] = b[t]
         sync_threads()
         b[t] = sb[tr]
diff --git a/test/device/intrinsics/output.jl b/test/device/intrinsics/output.jl
@@ -140,8 +140,8 @@ end
 
 @testset "@cushow array pointers" begin
     function kernel()
-        a = @cuStaticSharedMem(Float32, 1)
-        b = @cuStaticSharedMem(Float32, 2)
+        a = CuStaticSharedArray(Float32, 1)
+        b = CuStaticSharedArray(Float32, 2)
         @cushow pointer(a) pointer(b)
         return
     end
diff --git a/test/device/intrinsics/wmma.jl b/test/device/intrinsics/wmma.jl
diff --git a/test/execution.jl b/test/execution.jl
diff --git a/test/setup.jl b/test/setup.jl