Commit efcfe39

Replace the shared memory macros with regular functions.
1 parent 2b2d030 commit efcfe39

14 files changed: 82 additions, 70 deletions
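
For downstream users the change is a mechanical rename: the deprecated macros now forward to the new functions, so existing code keeps working but emits a deprecation warning. A minimal before/after sketch inside a kernel (the argument values are illustrative, not taken from this commit):

```julia
# before (deprecated, still works via the forwarding macros):
shared = @cuStaticSharedMem(Float32, (16, 16))
buffer = @cuDynamicSharedMem(Float32, blockDim().x)

# after:
shared = CuStaticSharedArray(Float32, (16, 16))
buffer = CuDynamicSharedArray(Float32, blockDim().x)
```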

docs/src/api/kernel.md
Lines changed: 2 additions & 2 deletions

@@ -35,8 +35,8 @@ CUDA.Const
 ### Shared memory
 
 ```@docs
-@cuStaticSharedMem
-@cuDynamicSharedMem
+CuStaticSharedArray
+CuDynamicSharedArray
 ```
 
 ### Texture memory

examples/pairwise.jl
Lines changed: 1 addition & 1 deletion

@@ -52,7 +52,7 @@ function pairwise_dist_kernel(lat::CuDeviceVector{Float32}, lon::CuDeviceVector{
 
     if i <= n && j <= n
         # store to shared memory
-        shmem = @cuDynamicSharedMem(Float32, 2*blockDim().x + 2*blockDim().y)
+        shmem = CuDynamicSharedArray(Float32, 2*blockDim().x + 2*blockDim().y)
         if threadIdx().y == 1
             shmem[threadIdx().x] = lat[i]
             shmem[blockDim().x + threadIdx().x] = lon[i]

perf/volumerhs.jl
Lines changed: 3 additions & 3 deletions

@@ -91,9 +91,9 @@ function volumerhs!(rhs, Q, vgeo, gravity, D, nelem)
 
     Nq = N + 1
 
-    s_D = @cuStaticSharedMem eltype(D) (Nq, Nq)
-    s_F = @cuStaticSharedMem eltype(Q) (Nq, Nq, _nstate)
-    s_G = @cuStaticSharedMem eltype(Q) (Nq, Nq, _nstate)
+    s_D = CuStaticSharedArray(eltype(D), (Nq, Nq))
+    s_F = CuStaticSharedArray(eltype(Q), (Nq, Nq, _nstate))
+    s_G = CuStaticSharedArray(eltype(Q), (Nq, Nq, _nstate))
 
     r_rhsρ = MArray{Tuple{Nq}, eltype(rhs)}(undef)
     r_rhsU = MArray{Tuple{Nq}, eltype(rhs)}(undef)

src/accumulate.jl
Lines changed: 1 addition & 1 deletion

@@ -19,7 +19,7 @@ function partial_scan(op::Function, output::AbstractArray{T}, input::AbstractArr
     thread = threadIdx().x
     block = blockIdx().x
 
-    temp = @cuDynamicSharedMem(T, (2*threads,))
+    temp = CuDynamicSharedArray(T, (2*threads,))
 
     # iterate the main dimension using threads and the first block dimension
     i = (blockIdx().x-1) * blockDim().x + threadIdx().x

src/device/intrinsics/memory_shared.jl
Lines changed: 22 additions & 10 deletions

@@ -1,25 +1,32 @@
 # Shared Memory (part of B.2)
 
-export @cuStaticSharedMem, @cuDynamicSharedMem
+export @cuStaticSharedMem, @cuDynamicSharedMem, CuStaticSharedArray, CuDynamicSharedArray
 
 """
-    @cuStaticSharedMem(T::Type, dims) -> CuDeviceArray{T,AS.Shared}
+    CuStaticSharedArray(T::Type, dims) -> CuDeviceArray{T,AS.Shared}
 
 Get an array of type `T` and dimensions `dims` (either an integer length or tuple shape)
 pointing to a statically-allocated piece of shared memory. The type should be statically
 inferable and the dimensions should be constant, or an error will be thrown and the
 generator function will be called dynamically.
 """
+@inline function CuStaticSharedArray(::Type{T}, dims) where {T}
+    len = prod(dims)
+    # NOTE: this relies on const-prop to forward the literal length to the generator.
+    # maybe we should include the size in the type, like StaticArrays does?
+    ptr = emit_shmem(T, Val(len))
+    CuDeviceArray(dims, ptr)
+end
+
 macro cuStaticSharedMem(T, dims)
+    Base.depwarn("@cuStaticSharedMem is deprecated, please use the CuStaticSharedArray function", :CuStaticSharedArray)
     quote
-        len = prod($(esc(dims)))
-        ptr = emit_shmem($(esc(T)), Val(len))
-        CuDeviceArray($(esc(dims)), ptr)
+        CuStaticSharedArray($(esc(T)), $(esc(dims)))
     end
 end
 
 """
-    @cuDynamicSharedMem(T::Type, dims, offset::Integer=0) -> CuDeviceArray{T,AS.Shared}
+    CuDynamicSharedArray(T::Type, dims, offset::Integer=0) -> CuDeviceArray{T,AS.Shared}
 
 Get an array of type `T` and dimensions `dims` (either an integer length or tuple shape)
 pointing to a dynamically-allocated piece of shared memory. The type should be statically
@@ -31,12 +38,17 @@ Optionally, an offset parameter indicating how many bytes to add to the base sha
 pointer can be specified. This is useful when dealing with a heterogeneous buffer of dynamic
 shared memory; in the case of a homogeneous multi-part buffer it is preferred to use `view`.
 """
-macro cuDynamicSharedMem(T, dims, offset=0)
+@inline function CuDynamicSharedArray(::Type{T}, dims, offset=0) where {T}
+    len = prod(dims)
+    ptr = emit_shmem(T) + offset
     # TODO: boundscheck against %dynamic_smem_size (currently unsupported by LLVM)
+    CuDeviceArray(dims, ptr)
+end
+
+macro cuDynamicSharedMem(T, dims, offset=0)
+    Base.depwarn("@cuDynamicSharedMem is deprecated, please use the CuDynamicSharedArray function", :CuDynamicSharedArray)
     quote
-        len = prod($(esc(dims)))
-        ptr = emit_shmem($(esc(T))) + $(esc(offset))
-        CuDeviceArray($(esc(dims)), ptr)
+        CuDynamicSharedArray($(esc(T)), $(esc(dims)), $(esc(offset)))
     end
 end
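
As a usage sketch (not part of this diff): the new functions are called like any other device function inside a kernel, and dynamic shared memory is still sized at launch time through the `shmem` keyword of `@cuda`. The kernel below is hypothetical, written against the API defined above:

```julia
using CUDA

# reverse a block-sized vector through dynamic shared memory
function reverse_kernel(a)
    i = threadIdx().x
    n = blockDim().x
    buf = CuDynamicSharedArray(eltype(a), n)  # backed by the `shmem` launch argument
    buf[i] = a[i]
    sync_threads()
    a[i] = buf[n - i + 1]
    return
end

a = CUDA.rand(Float32, 256)
@cuda threads=length(a) shmem=sizeof(a) reverse_kernel(a)
```

Note the trade-off flagged in the diff's NOTE comment: the macro interpolated the dimensions syntactically, while the function relies on constant propagation to forward the literal length to the generator.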

src/mapreduce.jl
Lines changed: 2 additions & 2 deletions

@@ -20,7 +20,7 @@ end
 @inline function reduce_block(op, val::T, neutral, shuffle::Val{true}) where T
     # shared mem for partial sums
     assume(warpsize() == 32)
-    shared = @cuStaticSharedMem(T, 32)
+    shared = CuStaticSharedArray(T, 32)
 
     wid, lane = fldmod1(threadIdx().x, warpsize())
 
@@ -54,7 +54,7 @@ end
     thread = threadIdx().x
 
     # shared mem for a complete reduction
-    shared = @cuDynamicSharedMem(T, (threads,))
+    shared = CuDynamicSharedArray(T, (threads,))
     @inbounds shared[thread] = val
 
     # perform a reduction

src/sorting.jl
Lines changed: 4 additions & 4 deletions

@@ -102,8 +102,8 @@ from `lo` to `hi` of `values`.
 """
 function partition_batches_kernel(values::AbstractArray{T}, pivot, lo, hi, parity, lt::F1,
                                   by::F2) where {T,F1,F2}
-    sums = @cuDynamicSharedMem(Int, blockDim().x)
-    swap = @cuDynamicSharedMem(T, blockDim().x, sizeof(sums))
+    sums = CuDynamicSharedArray(Int, blockDim().x)
+    swap = CuDynamicSharedArray(T, blockDim().x, sizeof(sums))
     batch_partition(values, pivot, swap, sums, lo, hi, parity, lt, by)
     return
 end
@@ -375,8 +375,8 @@ early end to recursion if we started `stuck` at 0.
 """
 function qsort_kernel(vals::AbstractArray{T,N}, lo, hi, parity, sync::Val{S}, sync_depth,
                       prev_pivot, lt::F1, by::F2, ::Val{dims}, partial=nothing, stuck=-1) where {T, N, S, F1, F2, dims}
-    b_sums = @cuDynamicSharedMem(Int, blockDim().x)
-    swap = @cuDynamicSharedMem(T, blockDim().x, sizeof(b_sums))
+    b_sums = CuDynamicSharedArray(Int, blockDim().x)
+    swap = CuDynamicSharedArray(T, blockDim().x, sizeof(b_sums))
     shmem = sizeof(b_sums) + sizeof(swap)
     L = hi - lo
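
These two allocations exercise the `offset` parameter described in the new `CuDynamicSharedArray` docstring: both arrays live in the same dynamic shared-memory buffer, with the second placed `sizeof(sums)` bytes past the base pointer, so the launch must request the combined size. A hedged sketch of that pattern (the kernel and sizes here are hypothetical, not from this commit):

```julia
function kernel(n)
    sums = CuDynamicSharedArray(Int, n)                    # bytes 0 ... n*sizeof(Int)-1
    swap = CuDynamicSharedArray(Float32, n, sizeof(sums))  # placed right after `sums`
    # ... partition work using `sums` and `swap` ...
    return
end

n = 128
@cuda threads=n shmem=n*(sizeof(Int) + sizeof(Float32)) kernel(n)
```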

test/codegen.jl
Lines changed: 2 additions & 2 deletions

@@ -2,7 +2,7 @@
 
 @testset "JuliaLang/julia#21121" begin
     function foobar()
-        weight_matrix = @cuStaticSharedMem(Float32, (16, 16))
+        weight_matrix = CuStaticSharedArray(Float32, (16, 16))
         sync_threads()
         weight_matrix[1, 16] *= 2
         sync_threads()
@@ -75,7 +75,7 @@ end
     @inbounds function kernel(input, output, n)
         i = threadIdx().x
 
-        temp = @cuStaticSharedMem(Int, 1)
+        temp = CuStaticSharedArray(Int, 1)
         if i == 1
             1 <= n || throw_some()
             temp[1] = input

test/device/array.jl
Lines changed: 3 additions & 3 deletions

@@ -141,7 +141,7 @@ end
 
 
 function kernel_shmem_reinterpet_equal_size!(y)
-    a = @cuDynamicSharedMem(Float32, (blockDim().x,))
+    a = CuDynamicSharedArray(Float32, (blockDim().x,))
     b = reinterpret(UInt32, a)
     a[threadIdx().x] = threadIdx().x
     b[threadIdx().x] += 1
@@ -172,7 +172,7 @@ end
 end
 
 function kernel_shmem_reinterpet_smaller_size!(y)
-    a = @cuDynamicSharedMem(UInt128, (blockDim().x,))
+    a = CuDynamicSharedArray(UInt128, (blockDim().x,))
     i32 = Int32(threadIdx().x)
     p = i32 + i32 * im
     q = i32 - i32 * im
@@ -209,7 +209,7 @@ end
 end
 
 function kernel_shmem_reinterpet_larger_size!(y)
-    a = @cuDynamicSharedMem(Float32, (4 * blockDim().x,))
+    a = CuDynamicSharedArray(Float32, (4 * blockDim().x,))
     b = reinterpret(UInt128, a)
     a[1 + 4 * (threadIdx().x - 1)] = threadIdx().x
     a[2 + 4 * (threadIdx().x - 1)] = threadIdx().x * 2

test/device/intrinsics/atomics.jl
Lines changed: 3 additions & 3 deletions

@@ -194,7 +194,7 @@ end
 
 @testset "shared memory" begin
     function kernel()
-        shared = @cuStaticSharedMem(Float32, 1)
+        shared = CuStaticSharedArray(Float32, 1)
         @atomic shared[threadIdx().x] += 0f0
         return
     end
@@ -425,7 +425,7 @@
 # https://github.com/JuliaGPU/CUDA.jl/issues/311
 
 function kernel(a)
-    b = CUDA.@cuStaticSharedMem(Int, 1)
+    b = CUDA.CuStaticSharedArray(Int, 1)
 
     if threadIdx().x == 1
         b[] = a[]
@@ -452,7 +452,7 @@
 
 function kernel()
     tid = threadIdx().x
-    shared = @cuStaticSharedMem(Float32, 4)
+    shared = CuStaticSharedArray(Float32, 4)
     CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])
     sync_threads()
     CUDA.atomic_add!(pointer(shared, tid), shared[tid + 2])
