Skip to content

Commit 5355310

Browse files
authored
Port control barriers to SPIR-V, and wrap memory barriers. (#343)
1 parent 829fe2a commit 5355310

File tree

7 files changed

+236
-22
lines changed

7 files changed

+236
-22
lines changed

lib/intrinsics/src/synchronization.jl

Lines changed: 150 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,158 @@
11
# Synchronization Functions
22

3-
export barrier
3+
## SPIR-V wrappers
44

5-
const cl_mem_fence_flags = UInt32
6-
const CLK_LOCAL_MEM_FENCE = cl_mem_fence_flags(1)
7-
const CLK_GLOBAL_MEM_FENCE = cl_mem_fence_flags(2)
8-
9-
#barrier(flags=0) = @builtin_ccall("barrier", Cvoid, (UInt32,), flags)
10-
@device_function barrier(flags=0) = Base.llvmcall(("""
11-
declare void @_Z7barrierj(i32) #0
12-
define void @entry(i32 %0) #1 {
13-
call void @_Z7barrierj(i32 %0)
5+
# Numeric `Scope` operand values handed to the SPIR-V barrier wrappers
# (`memory_barrier` / `control_barrier`). `cl_scope_to_spirv` maps the OpenCL
# `memory_scope` enum onto the first five of these.
module Scope
    const CrossDevice = 0   # used for memory_scope_all_svm_devices / all_devices
    const Device      = 1   # used for memory_scope_device
    const Workgroup   = 2   # used for memory_scope_work_group
    const Subgroup    = 3   # used for memory_scope_sub_group
    const Invocation  = 4   # used for memory_scope_work_item
    const QueueFamily = 5   # not referenced by the OpenCL wrappers in this file
    const ShaderCall  = 6   # not referenced by the OpenCL wrappers in this file
end
14+
15+
# Bit values for the SPIR-V memory-semantics operand. Values are `UInt16`
# literals and are meant to be combined with `|`; the barrier wrappers convert
# the combined mask to `UInt32` before emitting the call.
module MemorySemantics
    # ordering bits (selected from `memory_order` by `atomic_work_item_fence`)
    const Relaxed                = 0x0000
    const None                   = Relaxed   # alias: "no bits set"
    const Acquire                = 0x0002
    const Release                = 0x0004
    const AcquireRelease         = 0x0008
    const SequentiallyConsistent = 0x0010

    # bits naming the kind of memory the ordering applies to
    const UniformMemory          = 0x0040
    const SubgroupMemory         = 0x0080
    const WorkgroupMemory        = 0x0100
    const CrossWorkgroupMemory   = 0x0200
    const AtomicCounterMemory    = 0x0400
    const ImageMemory            = 0x0800
    const OutputMemory           = 0x1000

    # remaining bits
    const MakeAvailable          = 0x2000
    const MakeVisible            = 0x4000
    const Signal                 = 0x8000
end
32+
33+
# `@builtin_ccall` does not support additional attributes like `convergent`
34+
# XXX: is this even needed? Doesn't LLVM reconstruct these?
35+
# using the `@builtin_ccall` version causes validation issues.
36+
37+
#@device_function @inline memory_barrier(scope, semantics) =
38+
# @builtin_ccall("__spirv_MemoryBarrier", Cvoid, (UInt32, UInt32), scope, semantics)
39+
# Low-level wrapper for the SPIR-V memory barrier built-in.
#
# Calls `_Z21__spirv_MemoryBarrierjj` through `Base.llvmcall` with two `i32`
# operands: the memory scope and the memory semantics. Both arguments are
# converted to `UInt32` first, so the `Scope.*` and `MemorySemantics.*`
# constants can be passed directly.
#
# The declaration is written by hand (instead of using `@builtin_ccall`) so that
# it can carry the `convergent` attribute (#0); the `entry` shim is marked
# `alwaysinline` (#1). `@inline` matches the sibling `control_barrier` wrapper.
@device_function @inline memory_barrier(scope, semantics) =
    Base.llvmcall(("""
        declare void @_Z21__spirv_MemoryBarrierjj(i32, i32) #0
        define void @entry(i32 %scope, i32 %semantics) #1 {
        call void @_Z21__spirv_MemoryBarrierjj(i32 %scope, i32 %semantics)
        ret void
        }
        attributes #0 = { convergent }
        attributes #1 = { alwaysinline }
        """, "entry"),
        Cvoid, Tuple{UInt32, UInt32},
        convert(UInt32, scope), convert(UInt32, semantics))
50+
51+
#@device_function @inline control_barrier(execution_scope, memory_scope, memory_semantics) =
52+
# @builtin_ccall("__spirv_ControlBarrier", Cvoid, (UInt32, UInt32, UInt32),
53+
# execution_scope, memory_scope, memory_semantics)
54+
# Low-level wrapper for the SPIR-V control barrier built-in.
#
# Calls `_Z22__spirv_ControlBarrierjjj` through `Base.llvmcall` with three `i32`
# operands, in this order: execution scope, memory scope, memory semantics.
# Every argument is converted to `UInt32` before the call, so the `Scope.*` and
# `MemorySemantics.*` constants can be passed directly.
#
# The declaration is written by hand (instead of using `@builtin_ccall`) so that
# it can carry the `convergent` attribute (#0); the `entry` shim is marked
# `alwaysinline` (#1).
@device_function @inline control_barrier(execution_scope, memory_scope, memory_semantics) =
55+
Base.llvmcall(("""
56+
declare void @_Z22__spirv_ControlBarrierjjj(i32, i32, i32) #0
57+
define void @entry(i32 %execution, i32 %memory, i32 %semantics) #1 {
58+
call void @_Z22__spirv_ControlBarrierjjj(i32 %execution, i32 %memory, i32 %semantics)
1459
ret void
1560
}
1661
attributes #0 = { convergent }
1762
attributes #1 = { alwaysinline }
1863
""", "entry"),
19-
Cvoid, Tuple{Int32}, convert(Int32, flags))
20-
push!(opencl_builtins, "_Z7barrierj")
21-
# TODO: add support for attributes to @builting_ccall/LLVM.@typed_ccall
64+
Cvoid,
65+
Tuple{UInt32, UInt32, UInt32},
66+
convert(UInt32, execution_scope),
67+
convert(UInt32, memory_scope),
68+
convert(UInt32, memory_semantics))
69+
70+
## OpenCL types
71+
72+
# OpenCL memory-fence flags: a `UInt32` bitmask with one bit per flag, so the
# flags can be combined with `|`.
const cl_mem_fence_flags = UInt32
const LOCAL_MEM_FENCE    = cl_mem_fence_flags(1) << 0   # == 1
const GLOBAL_MEM_FENCE   = cl_mem_fence_flags(1) << 1   # == 2
const IMAGE_MEM_FENCE    = cl_mem_fence_flags(1) << 2   # == 4
76+
77+
"""
    mem_fence_flags_to_semantics(flags)

Translate an OpenCL memory-fence bitmask (`LOCAL_MEM_FENCE`, `GLOBAL_MEM_FENCE`,
`IMAGE_MEM_FENCE`, or any `|`-combination of them) into the matching
`MemorySemantics` bits:

| OpenCL flag        | `MemorySemantics` bit  |
|:-------------------|:-----------------------|
| `LOCAL_MEM_FENCE`  | `WorkgroupMemory`      |
| `GLOBAL_MEM_FENCE` | `CrossWorkgroupMemory` |
| `IMAGE_MEM_FENCE`  | `ImageMemory`          |

Only these bits are produced; ordering bits (acquire/release/...) are added by
the callers. Flags that are not set contribute nothing, so `flags == 0` yields
`MemorySemantics.None`.
"""
@inline function mem_fence_flags_to_semantics(flags)
    semantics = MemorySemantics.None
    if (flags & LOCAL_MEM_FENCE) == LOCAL_MEM_FENCE
        semantics |= MemorySemantics.WorkgroupMemory
    end
    if (flags & GLOBAL_MEM_FENCE) == GLOBAL_MEM_FENCE
        semantics |= MemorySemantics.CrossWorkgroupMemory
    end
    # Previously IMAGE_MEM_FENCE was accepted but silently dropped, so an
    # image-only fence requested no memory at all.
    if (flags & IMAGE_MEM_FENCE) == IMAGE_MEM_FENCE
        semantics |= MemorySemantics.ImageMemory
    end
    return semantics
end
87+
88+
# OpenCL memory scopes. The numeric values are spelled out; they are the same
# ones `@enum` assigns by default (counting up from 0).
@enum memory_scope begin
    memory_scope_work_item       = 0
    memory_scope_sub_group       = 1
    memory_scope_work_group      = 2
    memory_scope_device          = 3
    memory_scope_all_svm_devices = 4
    memory_scope_all_devices     = 5
end
96+
97+
"""
    cl_scope_to_spirv(scope)

Return the `Scope` constant that corresponds to the OpenCL `memory_scope` value
`scope`. Both `memory_scope_all_svm_devices` and `memory_scope_all_devices` map
to `Scope.CrossDevice`. Any other value raises an error.
"""
@inline function cl_scope_to_spirv(scope)
    scope == memory_scope_work_item && return Scope.Invocation
    scope == memory_scope_sub_group && return Scope.Subgroup
    scope == memory_scope_work_group && return Scope.Workgroup
    scope == memory_scope_device && return Scope.Device

    spans_devices = scope == memory_scope_all_svm_devices ||
                    scope == memory_scope_all_devices
    spans_devices && return Scope.CrossDevice

    error("Invalid memory scope: $scope")
end
112+
113+
# OpenCL memory orderings. The numeric values are spelled out; they are the same
# ones `@enum` assigns by default (counting up from 0).
@enum memory_order begin
    memory_order_relaxed = 0
    memory_order_acquire = 1
    memory_order_release = 2
    memory_order_acq_rel = 3
    memory_order_seq_cst = 4
end
120+
121+
122+
## OpenCL memory barriers
123+
124+
export atomic_work_item_fence, mem_fence, read_mem_fence, write_mem_fence
125+
126+
"""
    atomic_work_item_fence(flags, order, scope)

Issue a memory barrier (no execution barrier) via `memory_barrier`.

- `flags`: memory-fence bitmask, translated by `mem_fence_flags_to_semantics`.
- `order`: a `memory_order` value, translated to the matching
  `MemorySemantics` ordering bit and OR-ed into the semantics.
- `scope`: a `memory_scope` value, translated by `cl_scope_to_spirv`.

Raises an error for an unrecognised `order` (or, through `cl_scope_to_spirv`,
an unrecognised `scope`).
"""
@inline function atomic_work_item_fence(flags, order, scope)
    storage_bits = mem_fence_flags_to_semantics(flags)

    ordering_bits =
        if order == memory_order_relaxed
            MemorySemantics.Relaxed
        elseif order == memory_order_acquire
            MemorySemantics.Acquire
        elseif order == memory_order_release
            MemorySemantics.Release
        elseif order == memory_order_acq_rel
            MemorySemantics.AcquireRelease
        elseif order == memory_order_seq_cst
            MemorySemantics.SequentiallyConsistent
        else
            error("Invalid memory order: $order")
        end

    return memory_barrier(cl_scope_to_spirv(scope), storage_bits | ordering_bits)
end
143+
144+
# legacy fence functions
145+
# Legacy full fence: acquire-release ordering at work-group scope.
function mem_fence(flags)
    return atomic_work_item_fence(flags, memory_order_acq_rel, memory_scope_work_group)
end
146+
# Legacy read fence: acquire ordering at work-group scope.
function read_mem_fence(flags)
    return atomic_work_item_fence(flags, memory_order_acquire, memory_scope_work_group)
end
147+
# Legacy write fence: release ordering at work-group scope.
function write_mem_fence(flags)
    return atomic_work_item_fence(flags, memory_order_release, memory_scope_work_group)
end
148+
149+
150+
## OpenCL execution barriers
151+
152+
export barrier, work_group_barrier
153+
154+
"""
    work_group_barrier(flags, scope = memory_scope_work_group)

Emit a control barrier via `control_barrier`. The execution scope is always
`Scope.Workgroup`; `scope` (a `memory_scope` value) only selects the memory
scope. The memory semantics are `MemorySemantics.SequentiallyConsistent`
combined with the bits that `mem_fence_flags_to_semantics(flags)` yields.
"""
@inline function work_group_barrier(flags, scope = memory_scope_work_group)
    memory = cl_scope_to_spirv(scope)
    semantics = MemorySemantics.SequentiallyConsistent |
                mem_fence_flags_to_semantics(flags)
    return control_barrier(Scope.Workgroup, memory, semantics)
end
157+
158+
# Forwards to `work_group_barrier` with its default memory scope.
function barrier(flags)
    return work_group_barrier(flags)
end

lib/intrinsics/src/utils.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
const opencl_builtins = String["printf"]
1+
const known_intrinsics = String["printf"]
22

33
# OpenCL functions need to be mangled according to the C++ Itanium spec. We implement a very
44
# limited version of that spec here, just enough to support OpenCL built-ins.
@@ -71,7 +71,7 @@ macro builtin_ccall(name, ret, argtypes, args...)
7171
mangled *= mangle(t)
7272
end
7373

74-
push!(opencl_builtins, mangled)
74+
push!(known_intrinsics, mangled)
7575
esc(quote
7676
@typed_ccall($mangled, llvmcall, $ret, ($(argtypes...),), $(args...))
7777
end)

src/OpenCLKernels.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
module OpenCLKernels
22

33
using ..OpenCL
4-
using ..OpenCL: @device_override, SPIRVIntrinsics, method_table
4+
using ..OpenCL: @device_override, method_table
55

66
import KernelAbstractions as KA
77

@@ -153,7 +153,7 @@ end
153153
## Shared and Scratch Memory
154154

155155
@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
156-
ptr = SPIRVIntrinsics.emit_localmemory(T, Val(prod(Dims)))
156+
ptr = OpenCL.emit_localmemory(T, Val(prod(Dims)))
157157
CLDeviceArray(Dims, ptr)
158158
end
159159

@@ -165,11 +165,11 @@ end
165165
## Synchronization and Printing
166166

167167
@device_override @inline function KA.__synchronize()
168-
SPIRVIntrinsics.barrier(SPIRVIntrinsics.CLK_LOCAL_MEM_FENCE | SPIRVIntrinsics.CLK_GLOBAL_MEM_FENCE)
168+
work_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
169169
end
170170

171171
@device_override @inline function KA.__print(args...)
172-
SPIRVIntrinsics._print(args...)
172+
OpenCL._print(args...)
173173
end
174174

175175

src/compiler/compilation.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ GPUCompiler.isintrinsic(job::OpenCLCompilerJob, fn::String) =
1515
invoke(GPUCompiler.isintrinsic,
1616
Tuple{CompilerJob{SPIRVCompilerTarget}, typeof(fn)},
1717
job, fn) ||
18-
in(fn, opencl_builtins)
18+
in(fn, known_intrinsics) ||
19+
contains(fn, "__spirv_")
1920

2021

2122
## compiler implementation (cache, configure, compile, and link)

src/mapreduce.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
# perform a reduction
1818
d = 1
1919
while d < items
20-
barrier()
20+
work_group_barrier(LOCAL_MEM_FENCE)
2121
index = 2 * d * (item-1) + 1
2222
@inbounds if index <= items
2323
other_val = if index + d <= items

test/intrinsics.jl

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# Smoke tests for the synchronization intrinsics. Each `@on_device` line runs a
# single intrinsic call on the device (via the helper macro in test/setup.jl).
# There are no `@test` assertions here: a case passes when the kernel compiles
# and runs without throwing.
@testset "intrinsics" begin
2+
3+
@testset "barrier" begin
4+
5+
# legacy `barrier` entry point, single flags and a combination
@on_device barrier(OpenCL.LOCAL_MEM_FENCE)
6+
@on_device barrier(OpenCL.GLOBAL_MEM_FENCE)
7+
@on_device barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
8+
9+
# `work_group_barrier` with each individual fence flag (default memory scope)
@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE)
10+
@on_device work_group_barrier(OpenCL.GLOBAL_MEM_FENCE)
11+
@on_device work_group_barrier(OpenCL.IMAGE_MEM_FENCE)
12+
13+
# combinations of fence flags
@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
14+
@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE | OpenCL.IMAGE_MEM_FENCE)
15+
@on_device work_group_barrier(OpenCL.GLOBAL_MEM_FENCE | OpenCL.LOCAL_MEM_FENCE | OpenCL.IMAGE_MEM_FENCE)
16+
17+
# explicit memory scopes
# NOTE(review): `memory_scope_all_devices` is not exercised here.
@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_work_item)
18+
@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_work_group)
19+
@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_device)
20+
@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_all_svm_devices)
21+
@on_device work_group_barrier(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_scope_sub_group)
22+
23+
end
24+
25+
@testset "mem_fence" begin
26+
27+
# legacy fences: full, read-only and write-only variants, each with local,
# global and combined flags
@on_device mem_fence(OpenCL.LOCAL_MEM_FENCE)
28+
@on_device mem_fence(OpenCL.GLOBAL_MEM_FENCE)
29+
@on_device mem_fence(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
30+
31+
@on_device read_mem_fence(OpenCL.LOCAL_MEM_FENCE)
32+
@on_device read_mem_fence(OpenCL.GLOBAL_MEM_FENCE)
33+
@on_device read_mem_fence(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
34+
35+
@on_device write_mem_fence(OpenCL.LOCAL_MEM_FENCE)
36+
@on_device write_mem_fence(OpenCL.GLOBAL_MEM_FENCE)
37+
@on_device write_mem_fence(OpenCL.LOCAL_MEM_FENCE | OpenCL.GLOBAL_MEM_FENCE)
38+
39+
end
40+
41+
@testset "atomic_work_item_fence" begin
42+
43+
# every `memory_order` value appears at least once, paired with varying flags
# and memory scopes
@on_device atomic_work_item_fence(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_order_relaxed, OpenCL.memory_scope_work_item)
44+
@on_device atomic_work_item_fence(OpenCL.GLOBAL_MEM_FENCE, OpenCL.memory_order_acquire, OpenCL.memory_scope_work_group)
45+
@on_device atomic_work_item_fence(OpenCL.IMAGE_MEM_FENCE, OpenCL.memory_order_release, OpenCL.memory_scope_device)
46+
@on_device atomic_work_item_fence(OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_order_acq_rel, OpenCL.memory_scope_all_svm_devices)
47+
@on_device atomic_work_item_fence(OpenCL.GLOBAL_MEM_FENCE, OpenCL.memory_order_seq_cst, OpenCL.memory_scope_sub_group)
48+
@on_device atomic_work_item_fence(OpenCL.IMAGE_MEM_FENCE | OpenCL.LOCAL_MEM_FENCE, OpenCL.memory_order_acquire, OpenCL.memory_scope_sub_group)
49+
50+
end
51+
52+
end
53+

test/setup.jl

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ function runtests(f, name, platform_filter)
9090
end
9191

9292
# some tests require native execution capabilities
93-
requires_il = name in ["execution", "kernelabstractions"] ||
93+
requires_il = name in ["execution", "intrinsics", "kernelabstractions"] ||
9494
startswith(name, "gpuarrays/")
9595

9696
ex = quote
@@ -140,4 +140,27 @@ function runtests(f, name, platform_filter)
140140
end
141141
end
142142

143+
144+
## auxiliary stuff
145+
146+
# Run some code on-device
147+
# `@on_device [launch options...] expr`
#
# Wrap `expr` in a freshly named zero-argument kernel, launch that kernel with
# `@opencl` (forwarding any leading macro arguments unchanged), and then wait
# with `cl.finish(cl.queue())`. The last macro argument is the code to run; all
# arguments before it are passed to `@opencl`.
macro on_device(ex...)
    body = ex[end]
    launch_args = ex[1:(length(ex) - 1)]

    # unique function name so repeated uses never collide
    kernel = gensym("kernel")

    expansion = quote
        let
            function $kernel()
                $body
                return
            end

            @opencl $(launch_args...) $kernel()
            cl.finish(cl.queue())
        end
    end
    return esc(expansion)
end
164+
165+
143166
nothing # File is loaded via a remotecall to "include". Ensure it returns "nothing".

0 commit comments

Comments
 (0)