Skip to content

Commit afe8179

Browse files
authored
Merge pull request #1114 from JuliaGPU/tb/shmem
Remove the hacky unique'ing of shmem GVs.
2 parents 9b2d508 + f7fd063 commit afe8179

File tree

15 files changed

+94
-86
lines changed

15 files changed

+94
-86
lines changed

docs/src/api/kernel.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ CUDA.Const
3535
### Shared memory
3636

3737
```@docs
38-
@cuStaticSharedMem
39-
@cuDynamicSharedMem
38+
CuStaticSharedArray
39+
CuDynamicSharedArray
4040
```
4141

4242
### Texture memory

examples/pairwise.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ function pairwise_dist_kernel(lat::CuDeviceVector{Float32}, lon::CuDeviceVector{
5252

5353
if i <= n && j <= n
5454
# store to shared memory
55-
shmem = @cuDynamicSharedMem(Float32, 2*blockDim().x + 2*blockDim().y)
55+
shmem = CuDynamicSharedArray(Float32, 2*blockDim().x + 2*blockDim().y)
5656
if threadIdx().y == 1
5757
shmem[threadIdx().x] = lat[i]
5858
shmem[blockDim().x + threadIdx().x] = lon[i]

perf/volumerhs.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,9 @@ function volumerhs!(rhs, Q, vgeo, gravity, D, nelem)
9191

9292
Nq = N + 1
9393

94-
s_D = @cuStaticSharedMem eltype(D) (Nq, Nq)
95-
s_F = @cuStaticSharedMem eltype(Q) (Nq, Nq, _nstate)
96-
s_G = @cuStaticSharedMem eltype(Q) (Nq, Nq, _nstate)
94+
s_D = CuStaticSharedArray(eltype(D), (Nq, Nq))
95+
s_F = CuStaticSharedArray(eltype(Q), (Nq, Nq, _nstate))
96+
s_G = CuStaticSharedArray(eltype(Q), (Nq, Nq, _nstate))
9797

9898
r_rhsρ = MArray{Tuple{Nq}, eltype(rhs)}(undef)
9999
r_rhsU = MArray{Tuple{Nq}, eltype(rhs)}(undef)

src/accumulate.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
1919
thread = threadIdx().x
2020
block = blockIdx().x
2121

22-
temp = @cuDynamicSharedMem(T, (2*threads,))
22+
temp = CuDynamicSharedArray(T, (2*threads,))
2323

2424
# iterate the main dimension using threads and the first block dimension
2525
i = (blockIdx().x-1) * blockDim().x + threadIdx().x

src/device/intrinsics/memory_shared.jl

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,32 @@
11
# Shared Memory (part of B.2)
22

3-
export @cuStaticSharedMem, @cuDynamicSharedMem
4-
5-
shmem_id = 0
3+
export @cuStaticSharedMem, @cuDynamicSharedMem, CuStaticSharedArray, CuDynamicSharedArray
64

75
"""
8-
@cuStaticSharedMem(T::Type, dims) -> CuDeviceArray{T,AS.Shared}
6+
CuStaticSharedArray(T::Type, dims) -> CuDeviceArray{T,AS.Shared}
97
108
Get an array of type `T` and dimensions `dims` (either an integer length or tuple shape)
119
pointing to a statically-allocated piece of shared memory. The type should be statically
1210
inferable and the dimensions should be constant, or an error will be thrown and the
1311
generator function will be called dynamically.
1412
"""
15-
macro cuStaticSharedMem(T, dims)
16-
# FIXME: generating a unique id in the macro is incorrect, as multiple parametrically typed
17-
# functions will alias the id (and the size might be a parameter). but incrementing in
18-
# the @generated function doesn't work, as it is supposed to be pure and identical
19-
# invocations will erroneously share (and even cause multiple shmem globals).
20-
id = gensym("static_shmem")
13+
@inline function CuStaticSharedArray(::Type{T}, dims) where {T}
    # total number of elements; `dims` is either an integer length or a tuple shape
    nelems = prod(dims)
    # NOTE: the element count is lifted into the type domain with `Val`, so this relies
    #       on const-prop to forward the literal length to the `emit_shmem` generator.
    #       maybe we should include the size in the type, like StaticArrays does?
    shmem_ptr = emit_shmem(T, Val(nelems))
    return CuDeviceArray(dims, shmem_ptr)
end
2120

21+
macro cuStaticSharedMem(T, dims)
    # Deprecated macro form, kept for backwards compatibility: it simply forwards to the
    # `CuStaticSharedArray` function. The second argument to `depwarn` must name the
    # deprecated caller (this macro) so that the warning is attributed to the call site
    # and de-duplicated per call site.
    Base.depwarn("@cuStaticSharedMem is deprecated, please use the CuStaticSharedArray function", Symbol("@cuStaticSharedMem"))
    quote
        CuStaticSharedArray($(esc(T)), $(esc(dims)))
    end
end
2827

2928
"""
30-
@cuDynamicSharedMem(T::Type, dims, offset::Integer=0) -> CuDeviceArray{T,AS.Shared}
29+
CuDynamicSharedArray(T::Type, dims, offset::Integer=0) -> CuDeviceArray{T,AS.Shared}
3130
3231
Get an array of type `T` and dimensions `dims` (either an integer length or tuple shape)
3332
pointing to a dynamically-allocated piece of shared memory. The type should be statically
@@ -39,20 +38,26 @@ Optionally, an offset parameter indicating how many bytes to add to the base sha
3938
pointer can be specified. This is useful when dealing with a heterogeneous buffer of dynamic
4039
shared memory; in the case of a homogeneous multi-part buffer it is preferred to use `view`.
4140
"""
42-
macro cuDynamicSharedMem(T, dims, offset=0)
43-
id = gensym("dynamic_shmem")
44-
45-
# TODO: boundscheck against %dynamic_smem_size (currently unsupported by LLVM)
41+
@inline function CuDynamicSharedArray(::Type{T}, dims, offset=0) where {T}
    # total number of elements; `dims` is either an integer length or a tuple shape
    len = prod(dims)
    # `offset` is expressed in bytes (it is added directly to the pointer below), so the
    # requested extent has to be converted to bytes too before comparing it against the
    # size of the dynamic shared memory allocation. Comparing `offset + len` would
    # under-report the extent for any element type wider than one byte.
    @boundscheck if offset + len * sizeof(T) > dynamic_smem_size()
        throw(BoundsError())
    end
    # zero-length shared memory pointer, advanced by `offset` bytes
    ptr = emit_shmem(T) + offset
    return CuDeviceArray(dims, ptr)
end
4649

50+
macro cuDynamicSharedMem(T, dims, offset=0)
    # Deprecated macro form, kept for backwards compatibility: it simply forwards to the
    # `CuDynamicSharedArray` function. The second argument to `depwarn` must name the
    # deprecated caller (this macro); passing the symbol used by `@cuStaticSharedMem`
    # would make both deprecations share one de-duplication key, so that one warning
    # could suppress the other.
    Base.depwarn("@cuDynamicSharedMem is deprecated, please use the CuDynamicSharedArray function", Symbol("@cuDynamicSharedMem"))
    quote
        CuDynamicSharedArray($(esc(T)), $(esc(dims)), $(esc(offset)))
    end
end
5356

57+
# Read the PTX `%dynamic_smem_size` special register through inline assembly; the value
# is returned as a `UInt32`.
function dynamic_smem_size()
    return @asmcall("mov.u32 \$0, %dynamic_smem_size;", "=r", true, UInt32, Tuple{})
end
58+
5459
# get a pointer to shared memory, with known (static) or zero length (dynamic shared memory)
55-
@generated function emit_shmem(::Val{id}, ::Type{T}, ::Val{len}=Val(0)) where {id,T,len}
60+
@generated function emit_shmem(::Type{T}, ::Val{len}=Val(0)) where {T,len}
5661
Context() do ctx
5762
eltyp = convert(LLVMType, T; ctx)
5863
T_ptr = convert(LLVMType, LLVMPtr{T,AS.Shared}; ctx)
@@ -63,7 +68,7 @@ end
6368
# create the global variable
6469
mod = LLVM.parent(llvm_f)
6570
gv_typ = LLVM.ArrayType(eltyp, len)
66-
gv = GlobalVariable(mod, gv_typ, GPUCompiler.safe_name(string(id)), AS.Shared)
71+
gv = GlobalVariable(mod, gv_typ, "shmem", AS.Shared)
6772
if len > 0
6873
# static shared memory should be demoted to local variables, whenever possible.
6974
# this is done by the NVPTX ASM printer:

src/mapreduce.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ end
2020
@inline function reduce_block(op, val::T, neutral, shuffle::Val{true}) where T
2121
# shared mem for partial sums
2222
assume(warpsize() == 32)
23-
shared = @cuStaticSharedMem(T, 32)
23+
shared = CuStaticSharedArray(T, 32)
2424

2525
wid, lane = fldmod1(threadIdx().x, warpsize())
2626

@@ -54,7 +54,7 @@ end
5454
thread = threadIdx().x
5555

5656
# shared mem for a complete reduction
57-
shared = @cuDynamicSharedMem(T, (threads,))
57+
shared = CuDynamicSharedArray(T, (threads,))
5858
@inbounds shared[thread] = val
5959

6060
# perform a reduction

src/sorting.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -102,8 +102,8 @@ from `lo` to `hi` of `values`.
102102
"""
103103
function partition_batches_kernel(values::AbstractArray{T}, pivot, lo, hi, parity, lt::F1,
104104
by::F2) where {T,F1,F2}
105-
sums = @cuDynamicSharedMem(Int, blockDim().x)
106-
swap = @cuDynamicSharedMem(T, blockDim().x, sizeof(sums))
105+
sums = CuDynamicSharedArray(Int, blockDim().x)
106+
swap = CuDynamicSharedArray(T, blockDim().x, sizeof(sums))
107107
batch_partition(values, pivot, swap, sums, lo, hi, parity, lt, by)
108108
return
109109
end
@@ -375,8 +375,8 @@ early end to recursion if we started `stuck` at 0.
375375
"""
376376
function qsort_kernel(vals::AbstractArray{T,N}, lo, hi, parity, sync::Val{S}, sync_depth,
377377
prev_pivot, lt::F1, by::F2, ::Val{dims}, partial=nothing, stuck=-1) where {T, N, S, F1, F2, dims}
378-
b_sums = @cuDynamicSharedMem(Int, blockDim().x)
379-
swap = @cuDynamicSharedMem(T, blockDim().x, sizeof(b_sums))
378+
b_sums = CuDynamicSharedArray(Int, blockDim().x)
379+
swap = CuDynamicSharedArray(T, blockDim().x, sizeof(b_sums))
380380
shmem = sizeof(b_sums) + sizeof(swap)
381381
L = hi - lo
382382

test/codegen.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
@testset "JuliaLang/julia#21121" begin
44
function foobar()
5-
weight_matrix = @cuStaticSharedMem(Float32, (16, 16))
5+
weight_matrix = CuStaticSharedArray(Float32, (16, 16))
66
sync_threads()
77
weight_matrix[1, 16] *= 2
88
sync_threads()
@@ -75,7 +75,7 @@ end
7575
@inbounds function kernel(input, output, n)
7676
i = threadIdx().x
7777

78-
temp = @cuStaticSharedMem(Int, 1)
78+
temp = CuStaticSharedArray(Int, 1)
7979
if i == 1
8080
1 <= n || throw_some()
8181
temp[1] = input

test/device/array.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ end
141141

142142

143143
function kernel_shmem_reinterpet_equal_size!(y)
144-
a = @cuDynamicSharedMem(Float32, (blockDim().x,))
144+
a = CuDynamicSharedArray(Float32, (blockDim().x,))
145145
b = reinterpret(UInt32, a)
146146
a[threadIdx().x] = threadIdx().x
147147
b[threadIdx().x] += 1
@@ -172,7 +172,7 @@ end
172172
end
173173

174174
function kernel_shmem_reinterpet_smaller_size!(y)
175-
a = @cuDynamicSharedMem(UInt128, (blockDim().x,))
175+
a = CuDynamicSharedArray(UInt128, (blockDim().x,))
176176
i32 = Int32(threadIdx().x)
177177
p = i32 + i32 * im
178178
q = i32 - i32 * im
@@ -209,7 +209,7 @@ end
209209
end
210210

211211
function kernel_shmem_reinterpet_larger_size!(y)
212-
a = @cuDynamicSharedMem(Float32, (4 * blockDim().x,))
212+
a = CuDynamicSharedArray(Float32, (4 * blockDim().x,))
213213
b = reinterpret(UInt128, a)
214214
a[1 + 4 * (threadIdx().x - 1)] = threadIdx().x
215215
a[2 + 4 * (threadIdx().x - 1)] = threadIdx().x * 2

test/device/intrinsics/atomics.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,7 @@ end
194194

195195
@testset "shared memory" begin
196196
function kernel()
197-
shared = @cuStaticSharedMem(Float32, 1)
197+
shared = CuStaticSharedArray(Float32, 1)
198198
@atomic shared[threadIdx().x] += 0f0
199199
return
200200
end
@@ -425,7 +425,7 @@ end
425425
# https://github.com/JuliaGPU/CUDA.jl/issues/311
426426

427427
function kernel(a)
428-
b = CUDA.@cuStaticSharedMem(Int, 1)
428+
b = CUDA.CuStaticSharedArray(Int, 1)
429429

430430
if threadIdx().x == 1
431431
b[] = a[]
@@ -452,7 +452,7 @@ end
452452

453453
function kernel()
454454
tid = threadIdx().x
455-
shared = @cuStaticSharedMem(Float32, 4)
455+
shared = CuStaticSharedArray(Float32, 4)
456456
CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])
457457
sync_threads()
458458
CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])

0 commit comments

Comments
 (0)