Default to hardware floating-point atomics. (#604)

pxl-th · web-flow · commit dbad788fae53 · 2024-02-27T17:15:23.000+02:00
diff --git a/docs/src/kernel_programming.md b/docs/src/kernel_programming.md
@@ -37,6 +37,12 @@ This value can then be used to select the groupsize for the kernel:
 @roc groupsize=occupancy.groupsize mykernel(args...)
 ```
 
+```@docs
+AMDGPU.@roc
+AMDGPU.Runtime.HIPKernel
+AMDGPU.Compiler.hipfunction
+```
+
 ## Atomics
 
 AMDGPU.jl relies on [Atomix.jl](https://github.com/JuliaConcurrent/Atomix.jl)
diff --git a/src/compiler/codegen.jl b/src/compiler/codegen.jl
@@ -1,6 +1,12 @@
 struct HIPCompilerParams <: AbstractCompilerParams
     # Whether to compile kernel for the wavefront of size 64.
     wavefrontsize64::Bool
+    # AMD GPU devices support fast atomic read-modify-write (RMW)
+    # operations on floating-point values.
+    # On single- or double-precision floating-point values this may generate
+    # a hardware RMW instruction that is faster than emulating
+    # the atomic operation using an atomic compare-and-swap (CAS) loop.
+    unsafe_fp_atomics::Bool
 end
 
 const HIPCompilerConfig = CompilerConfig{GCNCompilerTarget, HIPCompilerParams}
@@ -65,10 +71,18 @@ function GPUCompiler.finish_module!(
     target_fns = (
         "signal_exception", "report_exception", "malloc", "__throw_")
     inline_attr = EnumAttribute("alwaysinline")
+    atomic_attr = StringAttribute("amdgpu-unsafe-fp-atomics", "true")
+
     for fn in LLVM.functions(mod)
-        any(occursin.(target_fns, LLVM.name(fn))) || continue
-        attrs = LLVM.function_attributes(fn)
-        inline_attr ∈ collect(attrs) || push!(attrs, inline_attr)
+        do_inline = any(occursin.(target_fns, LLVM.name(fn)))
+        if job.config.params.unsafe_fp_atomics || do_inline
+            attrs = LLVM.function_attributes(fn)
+
+            do_inline && inline_attr ∉ collect(attrs) &&
+                push!(attrs, inline_attr)
+            job.config.params.unsafe_fp_atomics &&
+                push!(attrs, atomic_attr)
+        end
     end
 
     return entry
@@ -85,18 +99,36 @@ function parse_llvm_features(arch::String)
 end
 
 
-function compiler_config(
-    dev::HIP.HIPDevice; kernel::Bool = true,
-    name::Union{String, Nothing} = nothing, always_inline::Bool = true,
+function compiler_config(dev::HIP.HIPDevice;
+    name::Union{String, Nothing} = nothing, kernel::Bool = true,
+    unsafe_fp_atomics::Bool = true,
 )
     dev_isa, features = parse_llvm_features(HIP.gcn_arch(dev))
     target = GCNCompilerTarget(; dev_isa, features)
-    params = HIPCompilerParams(HIP.wavefrontsize(dev) == 64)
-    CompilerConfig(target, params; kernel, name, always_inline)
+    params = HIPCompilerParams(HIP.wavefrontsize(dev) == 64, unsafe_fp_atomics)
+    CompilerConfig(target, params; kernel, name, always_inline=true)
 end
 
 const hipfunction_lock = ReentrantLock()
 
+"""
+    hipfunction(f::F, tt::TT = Tuple{}; kwargs...)
+
+Compile Julia function `f` to a HIP kernel, given the tuple of
+argument types `tt` that it accepts.
+
+The following kwargs are supported:
+
+- `name::Union{String, Nothing} = nothing`:
+    A unique name to give a compiled kernel.
+- `unsafe_fp_atomics::Bool = true`:
+    Whether to use 'unsafe' floating-point atomics.
+    AMD GPU devices support fast atomic read-modify-write (RMW)
+    operations on floating-point values.
+    On single- or double-precision floating-point values this may generate
+    a hardware RMW instruction that is faster than emulating
+    the atomic operation using an atomic compare-and-swap (CAS) loop.
+"""
 function hipfunction(f::F, tt::TT = Tuple{}; kwargs...) where {F <: Core.Function, TT}
     Base.@lock hipfunction_lock begin
         dev = AMDGPU.device()
diff --git a/src/highlevel.jl b/src/highlevel.jl
@@ -110,9 +110,24 @@ register methods for the the `AMDGPU.Adaptor` type.
 rocconvert(arg) = adapt(Runtime.Adaptor(), arg)
 
 const MACRO_KWARGS = [:launch]
-const COMPILER_KWARGS = [:name]
+const COMPILER_KWARGS = [:name, :unsafe_fp_atomics]
 const LAUNCH_KWARGS = [:gridsize, :groupsize, :shmem, :stream]
 
+"""
+    @roc [kwargs...] func(args...)
+
+High-level interface for launching kernels on the GPU.
+On the first call the kernel is compiled; subsequent calls re-use
+the compiled object.
+
+Several keyword arguments are supported:
+- `launch::Bool = true`: whether to launch the kernel.
+    If `false`, the compiled kernel is returned instead; it can then be
+    launched by calling it with the kernel arguments.
+- Arguments that influence kernel compilation, see
+    [`AMDGPU.Compiler.hipfunction`](@ref).
+- Arguments that influence kernel launch, see [`AMDGPU.Runtime.HIPKernel`](@ref).
+"""
 macro roc(ex...)
     # destructure the `@roc` expression
     call = ex[end]
diff --git a/src/runtime/hip-execution.jl b/src/runtime/hip-execution.jl
@@ -1,3 +1,16 @@
+"""
+    (ker::HIPKernel)(args::Vararg{Any, N}; kwargs...)
+
+Launch a compiled `HIPKernel` by calling it with the kernel arguments.
+
+The following kwargs are supported:
+- `gridsize::ROCDim = 1`: Size of the grid.
+- `groupsize::ROCDim = 1`: Size of the workgroup.
+- `shmem::Integer = 0`:
+    Amount of dynamically-allocated shared memory in bytes.
+- `stream::HIP.HIPStream = AMDGPU.stream()`:
+    Stream on which to launch the kernel.
+"""
 struct HIPKernel{F, TT} <: AbstractKernel{F, TT}
     f::F
     fun::HIP.HIPFunction