From ac3911ee296ac7d3fc5b8b8537f7fbd50397c386 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 5 Mar 2024 09:15:20 +0100 Subject: [PATCH 1/7] Introduce StaticCartesianIndices to eliminate expensive integer divisions. --- src/device/indexing.jl | 31 +++++++++++++++++++++++++- src/host/broadcast.jl | 49 +++++++++++++++++++++++++++++------------- src/host/math.jl | 2 +- 3 files changed, 65 insertions(+), 17 deletions(-) diff --git a/src/device/indexing.jl b/src/device/indexing.jl index b0b9990d..a12ec16b 100644 --- a/src/device/indexing.jl +++ b/src/device/indexing.jl @@ -64,7 +64,9 @@ macro linearidx(A, grididx=1, ctxsym=:ctx) quote x = $(esc(A)) i = linear_index($(esc(ctxsym)), $(esc(grididx))) - i > length(x) && return + if !(1 <= i <= length(x)) + return + end i end end @@ -81,3 +83,30 @@ macro cartesianidx(A, grididx=1, ctxsym=:ctx) @inbounds CartesianIndices(x)[i] end end + + +## utilities + +# indexing a CartesianIndices at run time generates an integer division, which is slow. +# to work around this, we can use a static CartesianIndices type to avoid the division. +# this informs LLVM and the back-end about the static iteration bounds, allowing it to +# lower the integer divison to a series of bit shifts, dramatically improving performance. +# also see: maleadt/StaticCartesian.jl, JuliaGPU/GPUArrays.jl#454 + +struct StaticCartesianIndices{N, I} end + +StaticCartesianIndices(iter::CartesianIndices{N}) where {N} = + StaticCartesianIndices{N, iter.indices}() +StaticCartesianIndices(x) = StaticCartesianIndices(CartesianIndices(x)) + +Base.CartesianIndices(iter::StaticCartesianIndices{N, I}) where {N, I} = + CartesianIndices{N, typeof(I)}(I) + +Base.@propagate_inbounds Base.getindex(I::StaticCartesianIndices, i::Int) = + CartesianIndices(I)[i] +Base.length(I::StaticCartesianIndices) = length(CartesianIndices(I)) + +function Base.show(io::IO, I::StaticCartesianIndices) + print(io, "Static") + show(io, CartesianIndices(I)) +end diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl index d7cec877..247fda9b 100644 --- a/src/host/broadcast.jl +++ b/src/host/broadcast.jl @@ -32,32 +32,41 @@ end return _copyto!(dest, instantiate(Broadcasted{Style}(bc.f, bc.args, axes(dest)))) end -@inline Base.copyto!(dest::AnyGPUArray, bc::Broadcasted{Nothing}) = _copyto!(dest, bc) # Keep it for ArrayConflict +@inline Base.copyto!(dest::AnyGPUArray, bc::Broadcasted{Nothing}) = + _copyto!(dest, bc) # Keep it for ArrayConflict -@inline Base.copyto!(dest::AbstractArray, bc::Broadcasted{<:AbstractGPUArrayStyle}) = _copyto!(dest, bc) +@inline Base.copyto!(dest::AbstractArray, bc::Broadcasted{<:AbstractGPUArrayStyle}) = + _copyto!(dest, bc) @inline function _copyto!(dest::AbstractArray, bc::Broadcasted) axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc)) isempty(dest) && return dest bc′ = Broadcast.preprocess(dest, bc) + # if we can't efficiently index, use static indices to help the compiler eliminate idivs + Is = if isa(IndexStyle(dest), IndexCartesian) || isa(IndexStyle(bc′), IndexCartesian) + StaticCartesianIndices(dest) + else + CartesianIndices(dest) + end + # grid-stride kernel - function broadcast_kernel(ctx, dest, bc′, nelem) - i = 0 - while i < nelem - i += 1 - I = @cartesianidx(dest, i) + function broadcast_kernel(ctx, dest, Is, bc′, nelem) + i = 1 + while i <= nelem + I = @cartesianidx(Is, i) @inbounds dest[I] = bc′[I] + i += 1 end return end elements = length(dest) elements_per_thread = typemax(Int) - heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc′, 1; + heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, Is, bc′, 1; elements, elements_per_thread) config = launch_configuration(backend(dest), heuristic; elements, elements_per_thread) - gpu_call(broadcast_kernel, dest, bc′, config.elements_per_thread; + gpu_call(broadcast_kernel, dest, Is, bc′, config.elements_per_thread; threads=config.threads, blocks=config.blocks) return dest @@ -99,24 +108,34 @@ function Base.map!(f, dest::AnyGPUArray, xs::AbstractArray...) bc = Broadcast.preprocess(dest, bc) end + # if we can't efficiently index, use static indices to help the compiler eliminate idivs + Is = if isa(IndexStyle(bc), IndexCartesian) + StaticCartesianIndices(axes(bc)) + else + CartesianIndices(axes(bc)) + end + # grid-stride kernel - function map_kernel(ctx, dest, bc, nelem) - for i in 1:nelem + function map_kernel(ctx, dest, Is, bc, nelem) + i = 1 + while i <= nelem j = linear_index(ctx, i) j > common_length && return - J = CartesianIndices(axes(bc))[j] - @inbounds dest[j] = bc[J] + I = @cartesianidx(Is, i) + @inbounds dest[j] = bc[I] + + i += 1 end return end elements = common_length elements_per_thread = typemax(Int) - heuristic = launch_heuristic(backend(dest), map_kernel, dest, bc, 1; + heuristic = launch_heuristic(backend(dest), map_kernel, dest, Is, bc, 1; elements, elements_per_thread) config = launch_configuration(backend(dest), heuristic; elements, elements_per_thread) - gpu_call(map_kernel, dest, bc, config.elements_per_thread; + gpu_call(map_kernel, dest, Is, bc, config.elements_per_thread; threads=config.threads, blocks=config.blocks) return dest diff --git a/src/host/math.jl b/src/host/math.jl index 8d02c97f..cf455d31 100644 --- a/src/host/math.jl +++ b/src/host/math.jl @@ -2,7 +2,7 @@ function Base.clamp!(A::AnyGPUArray, low, high) gpu_call(A, low, high) do ctx, A, low, high - I = @cartesianidx A + I = @linearidx A A[I] = clamp(A[I], low, high) return end From 90fb573c02bb6a63d3f50c0d8cd52effddf6f5e7 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Tue, 5 Mar 2024 09:59:56 +0100 Subject: [PATCH 2/7] Bump Julia version used by CI. --- .buildkite/pipeline.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 8f7d4f80..b7f4aa82 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -2,7 +2,7 @@ steps: - label: "CUDA.jl" plugins: - JuliaCI/julia#v1: - version: 1.8 + version: "1.10" - JuliaCI/julia-coverage#v1: codecov: true command: | @@ -23,7 +23,7 @@ steps: - label: "oneAPI.jl" plugins: - JuliaCI/julia#v1: - version: 1.8 + version: "1.10" - JuliaCI/julia-coverage#v1: codecov: true command: | @@ -48,7 +48,7 @@ steps: - label: "Metal.jl" plugins: - JuliaCI/julia#v1: - version: 1.8 + version: "1.10" - JuliaCI/julia-coverage#v1: codecov: true command: | From c3783d791395f57a512b42187ca7cca3cfd9edcc Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 6 Mar 2024 13:04:40 +0100 Subject: [PATCH 3/7] Simplify. --- src/host/broadcast.jl | 46 +++++++++++++++++++++++++------------------ 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl index 247fda9b..6842c67a 100644 --- a/src/host/broadcast.jl +++ b/src/host/broadcast.jl @@ -2,7 +2,7 @@ using Base.Broadcast -import Base.Broadcast: BroadcastStyle, Broadcasted, AbstractArrayStyle, instantiate +using Base.Broadcast: BroadcastStyle, Broadcasted, AbstractArrayStyle, instantiate # but make sure we don't dispatch to the optimized copy method that directly indexes function Broadcast.copy(bc::Broadcasted{<:AbstractGPUArrayStyle{0}}) @@ -41,32 +41,40 @@ end @inline function _copyto!(dest::AbstractArray, bc::Broadcasted) axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc)) isempty(dest) && return dest - bc′ = Broadcast.preprocess(dest, bc) - - # if we can't efficiently index, use static indices to help the compiler eliminate idivs - Is = if isa(IndexStyle(dest), IndexCartesian) || isa(IndexStyle(bc′), IndexCartesian) - StaticCartesianIndices(dest) + bc = Broadcast.preprocess(dest, bc) + + broadcast_kernel = if ndims(bc) == 1 || + (isa(IndexStyle(dest), IndexLinear) && + isa(IndexStyle(bc), IndexLinear)) + function (ctx, dest, bc, nelem) + i = 1 + while i <= nelem + I = @linearidx(dest, i) + @inbounds dest[I] = bc[I] + i += 1 + end + return + end else - CartesianIndices(dest) - end - - # grid-stride kernel - function broadcast_kernel(ctx, dest, Is, bc′, nelem) - i = 1 - while i <= nelem - I = @cartesianidx(Is, i) - @inbounds dest[I] = bc′[I] - i += 1 + # XXX: we could use StaticCartesianIndices here, but that results in many compilations + function (ctx, dest, bc, nelem) + i = 0 + while i < nelem + i += 1 + I = @cartesianidx(dest, i) + @inbounds dest[I] = bc[I] + end + return end - return end + elements = length(dest) elements_per_thread = typemax(Int) - heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, Is, bc′, 1; + heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc, 1; elements, elements_per_thread) config = launch_configuration(backend(dest), heuristic; elements, elements_per_thread) - gpu_call(broadcast_kernel, dest, Is, bc′, config.elements_per_thread; + gpu_call(broadcast_kernel, dest, bc, config.elements_per_thread; threads=config.threads, blocks=config.blocks) return dest From 5b9d7c0e2924d3c82c1387af7fdca62a789db981 Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 6 Mar 2024 13:09:56 +0100 Subject: [PATCH 4/7] Remove StaticCartesian. --- src/device/indexing.jl | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/src/device/indexing.jl b/src/device/indexing.jl index a12ec16b..31084fce 100644 --- a/src/device/indexing.jl +++ b/src/device/indexing.jl @@ -83,30 +83,3 @@ macro cartesianidx(A, grididx=1, ctxsym=:ctx) @inbounds CartesianIndices(x)[i] end end - - -## utilities - -# indexing a CartesianIndices at run time generates an integer division, which is slow. -# to work around this, we can use a static CartesianIndices type to avoid the division. -# this informs LLVM and the back-end about the static iteration bounds, allowing it to -# lower the integer divison to a series of bit shifts, dramatically improving performance. -# also see: maleadt/StaticCartesian.jl, JuliaGPU/GPUArrays.jl#454 - -struct StaticCartesianIndices{N, I} end - -StaticCartesianIndices(iter::CartesianIndices{N}) where {N} = - StaticCartesianIndices{N, iter.indices}() -StaticCartesianIndices(x) = StaticCartesianIndices(CartesianIndices(x)) - -Base.CartesianIndices(iter::StaticCartesianIndices{N, I}) where {N, I} = - CartesianIndices{N, typeof(I)}(I) - -Base.@propagate_inbounds Base.getindex(I::StaticCartesianIndices, i::Int) = - CartesianIndices(I)[i] -Base.length(I::StaticCartesianIndices) = length(CartesianIndices(I)) - -function Base.show(io::IO, I::StaticCartesianIndices) - print(io, "Static") - show(io, CartesianIndices(I)) -end From 548a8d92276e4136ca130feb1561911fbac87c2d Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 6 Mar 2024 13:13:27 +0100 Subject: [PATCH 5/7] Revert map change. --- src/host/broadcast.jl | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl index 6842c67a..e4fa9172 100644 --- a/src/host/broadcast.jl +++ b/src/host/broadcast.jl @@ -116,13 +116,6 @@ function Base.map!(f, dest::AnyGPUArray, xs::AbstractArray...) bc = Broadcast.preprocess(dest, bc) end - # if we can't efficiently index, use static indices to help the compiler eliminate idivs - Is = if isa(IndexStyle(bc), IndexCartesian) - StaticCartesianIndices(axes(bc)) - else - CartesianIndices(axes(bc)) - end - # grid-stride kernel function map_kernel(ctx, dest, Is, bc, nelem) i = 1 @@ -130,8 +123,8 @@ function Base.map!(f, dest::AnyGPUArray, xs::AbstractArray...) j = linear_index(ctx, i) j > common_length && return - I = @cartesianidx(Is, i) - @inbounds dest[j] = bc[I] + J = CartesianIndices(axes(bc))[j] + @inbounds dest[j] = bc[J] i += 1 end @@ -139,11 +132,11 @@ function Base.map!(f, dest::AnyGPUArray, xs::AbstractArray...) end elements = common_length elements_per_thread = typemax(Int) - heuristic = launch_heuristic(backend(dest), map_kernel, dest, Is, bc, 1; + heuristic = launch_heuristic(backend(dest), map_kernel, dest, bc, 1; elements, elements_per_thread) config = launch_configuration(backend(dest), heuristic; elements, elements_per_thread) - gpu_call(map_kernel, dest, Is, bc, config.elements_per_thread; + gpu_call(map_kernel, dest, bc, config.elements_per_thread; threads=config.threads, blocks=config.blocks) return dest From fb8cf80582c6d5f54ccee3f1003260a691a4e49f Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 6 Mar 2024 13:13:42 +0100 Subject: [PATCH 6/7] Remove outdated comment. --- src/host/broadcast.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl index e4fa9172..bf99bc6f 100644 --- a/src/host/broadcast.jl +++ b/src/host/broadcast.jl @@ -56,7 +56,6 @@ end return end else - # XXX: we could use StaticCartesianIndices here, but that results in many compilations function (ctx, dest, bc, nelem) i = 0 while i < nelem From 0f6bf2a25cca10fc147a7674d0ce2932f24a1bed Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Wed, 6 Mar 2024 13:21:25 +0100 Subject: [PATCH 7/7] Fixes. --- src/host/broadcast.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl index bf99bc6f..93532e5f 100644 --- a/src/host/broadcast.jl +++ b/src/host/broadcast.jl @@ -43,7 +43,7 @@ end isempty(dest) && return dest bc = Broadcast.preprocess(dest, bc) - broadcast_kernel = if ndims(bc) == 1 || + broadcast_kernel = if ndims(dest) == 1 || (isa(IndexStyle(dest), IndexLinear) && isa(IndexStyle(bc), IndexLinear)) function (ctx, dest, bc, nelem) @@ -116,7 +116,7 @@ function Base.map!(f, dest::AnyGPUArray, xs::AbstractArray...) end # grid-stride kernel - function map_kernel(ctx, dest, Is, bc, nelem) + function map_kernel(ctx, dest, bc, nelem) i = 1 while i <= nelem j = linear_index(ctx, i)