Skip to content

Commit dbad788

Browse files
authored
Default to hardware floating-point atomics. (#604)
1 parent 9825359 commit dbad788

File tree

4 files changed

+75
-9
lines changed

4 files changed

+75
-9
lines changed

docs/src/kernel_programming.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,12 @@ This value can then be used to select the groupsize for the kernel:
3737
@roc groupsize=occupancy.groupsize mykernel(args...)
3838
```
3939

40+
```@docs
41+
AMDGPU.@roc
42+
AMDGPU.Runtime.HIPKernel
43+
AMDGPU.Compiler.hipfunction
44+
```
45+
4046
## Atomics
4147

4248
AMDGPU.jl relies on [Atomix.jl](https://github.com/JuliaConcurrent/Atomix.jl)

src/compiler/codegen.jl

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,12 @@
11
struct HIPCompilerParams <: AbstractCompilerParams
22
# Whether to compile kernel for the wavefront of size 64.
33
wavefrontsize64::Bool
4+
# AMD GPU devices support fast atomic read-modify-write (RMW)
5+
# operations on floating-point values.
6+
# On single- or double-precision floating-point values this may generate
7+
# a hardware RMW instruction that is faster than emulating
8+
# the atomic operation using an atomic compare-and-swap (CAS) loop.
9+
unsafe_fp_atomics::Bool
410
end
511

612
const HIPCompilerConfig = CompilerConfig{GCNCompilerTarget, HIPCompilerParams}
@@ -65,10 +71,18 @@ function GPUCompiler.finish_module!(
6571
target_fns = (
6672
"signal_exception", "report_exception", "malloc", "__throw_")
6773
inline_attr = EnumAttribute("alwaysinline")
74+
atomic_attr = StringAttribute("amdgpu-unsafe-fp-atomics", "true")
75+
6876
for fn in LLVM.functions(mod)
69-
any(occursin.(target_fns, LLVM.name(fn))) || continue
70-
attrs = LLVM.function_attributes(fn)
71-
inline_attr ∈ collect(attrs) || push!(attrs, inline_attr)
77+
do_inline = any(occursin.(target_fns, LLVM.name(fn)))
78+
if job.config.params.unsafe_fp_atomics || do_inline
79+
attrs = LLVM.function_attributes(fn)
80+
81+
do_inline && inline_attr ∉ collect(attrs) &&
82+
push!(attrs, inline_attr)
83+
job.config.params.unsafe_fp_atomics &&
84+
push!(attrs, atomic_attr)
85+
end
7286
end
7387

7488
return entry
@@ -85,18 +99,36 @@ function parse_llvm_features(arch::String)
8599
end
86100

87101

88-
function compiler_config(
89-
dev::HIP.HIPDevice; kernel::Bool = true,
90-
name::Union{String, Nothing} = nothing, always_inline::Bool = true,
102+
function compiler_config(dev::HIP.HIPDevice;
103+
name::Union{String, Nothing} = nothing, kernel::Bool = true,
104+
unsafe_fp_atomics::Bool = true,
91105
)
92106
dev_isa, features = parse_llvm_features(HIP.gcn_arch(dev))
93107
target = GCNCompilerTarget(; dev_isa, features)
94-
params = HIPCompilerParams(HIP.wavefrontsize(dev) == 64)
95-
CompilerConfig(target, params; kernel, name, always_inline)
108+
params = HIPCompilerParams(HIP.wavefrontsize(dev) == 64, unsafe_fp_atomics)
109+
CompilerConfig(target, params; kernel, name, always_inline=true)
96110
end
97111

98112
const hipfunction_lock = ReentrantLock()
99113

114+
"""
115+
hipfunction(f::F, tt::TT = Tuple{}; kwargs...)
116+
117+
Compile Julia function `f` to a HIP kernel given a tuple of
118+
argument types `tt` that it accepts.
119+
120+
The following kwargs are supported:
121+
122+
- `name::Union{String, Nothing} = nothing`:
123+
A unique name to give a compiled kernel.
124+
- `unsafe_fp_atomics::Bool = true`:
125+
Whether to use 'unsafe' floating-point atomics.
126+
AMD GPU devices support fast atomic read-modify-write (RMW)
127+
operations on floating-point values.
128+
On single- or double-precision floating-point values this may generate
129+
a hardware RMW instruction that is faster than emulating
130+
the atomic operation using an atomic compare-and-swap (CAS) loop.
131+
"""
100132
function hipfunction(f::F, tt::TT = Tuple{}; kwargs...) where {F <: Core.Function, TT}
101133
Base.@lock hipfunction_lock begin
102134
dev = AMDGPU.device()

src/highlevel.jl

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,24 @@ register methods for the `AMDGPU.Adaptor` type.
110110
rocconvert(arg) = adapt(Runtime.Adaptor(), arg)
111111

112112
const MACRO_KWARGS = [:launch]
113-
const COMPILER_KWARGS = [:name]
113+
const COMPILER_KWARGS = [:name, :unsafe_fp_atomics]
114114
const LAUNCH_KWARGS = [:gridsize, :groupsize, :shmem, :stream]
115115

116+
"""
117+
@roc [kwargs...] func(args...)
118+
119+
High-level interface for launching kernels on GPU.
120+
Upon the first call the kernel is compiled; subsequent calls re-use
121+
the compiled object.
122+
123+
Several keyword arguments are supported:
124+
- `launch::Bool = true`: whether to launch the kernel.
125+
If `false`, then returns a compiled kernel which can be launched by
126+
calling it and passing arguments.
127+
- Arguments that influence kernel compilation, see
128+
[`AMDGPU.Compiler.hipfunction`](@ref).
129+
- Arguments that influence kernel launch, see [`AMDGPU.Runtime.HIPKernel`](@ref).
130+
"""
116131
macro roc(ex...)
117132
# destructure the `@roc` expression
118133
call = ex[end]

src/runtime/hip-execution.jl

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,16 @@
1+
"""
2+
(ker::HIPKernel)(args::Vararg{Any, N}; kwargs...)
3+
4+
Launch compiled HIPKernel by passing arguments to it.
5+
6+
The following kwargs are supported:
7+
- `gridsize::ROCDim = 1`: Size of the grid.
8+
- `groupsize::ROCDim = 1`: Size of the workgroup.
9+
- `shmem::Integer = 0`:
10+
Amount of dynamically-allocated shared memory in bytes.
11+
- `stream::HIP.HIPStream = AMDGPU.stream()`:
12+
Stream on which to launch the kernel.
13+
"""
114
struct HIPKernel{F, TT} <: AbstractKernel{F, TT}
215
f::F
316
fun::HIP.HIPFunction

0 commit comments

Comments
 (0)