From ac3911ee296ac7d3fc5b8b8537f7fbd50397c386 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 5 Mar 2024 09:15:20 +0100
Subject: [PATCH 1/7] Introduce StaticCartesianIndices to eliminate expensive
 integer divisions.

---
 src/device/indexing.jl | 31 +++++++++++++++++++++++++-
 src/host/broadcast.jl  | 49 +++++++++++++++++++++++++++++-------------
 src/host/math.jl       |  2 +-
 3 files changed, 65 insertions(+), 17 deletions(-)

diff --git a/src/device/indexing.jl b/src/device/indexing.jl
index b0b9990d..a12ec16b 100644
--- a/src/device/indexing.jl
+++ b/src/device/indexing.jl
@@ -64,7 +64,9 @@ macro linearidx(A, grididx=1, ctxsym=:ctx)
     quote
         x = $(esc(A))
         i = linear_index($(esc(ctxsym)), $(esc(grididx)))
-        i > length(x) && return
+        if !(1 <= i <= length(x))
+            return
+        end
         i
     end
 end
@@ -81,3 +83,30 @@ macro cartesianidx(A, grididx=1, ctxsym=:ctx)
         @inbounds CartesianIndices(x)[i]
     end
 end
+
+
+## utilities
+
+# indexing a CartesianIndices at run time generates an integer division, which is slow.
+# to work around this, we can use a static CartesianIndices type to avoid the division.
+# this informs LLVM and the back-end about the static iteration bounds, allowing it to
+# lower the integer divison to a series of bit shifts, dramatically improving performance.
+# also see: maleadt/StaticCartesian.jl, JuliaGPU/GPUArrays.jl#454
+
+struct StaticCartesianIndices{N, I} end
+
+StaticCartesianIndices(iter::CartesianIndices{N}) where {N} =
+    StaticCartesianIndices{N, iter.indices}()
+StaticCartesianIndices(x) = StaticCartesianIndices(CartesianIndices(x))
+
+Base.CartesianIndices(iter::StaticCartesianIndices{N, I}) where {N, I} =
+    CartesianIndices{N, typeof(I)}(I)
+
+Base.@propagate_inbounds Base.getindex(I::StaticCartesianIndices, i::Int) =
+    CartesianIndices(I)[i]
+Base.length(I::StaticCartesianIndices) = length(CartesianIndices(I))
+
+function Base.show(io::IO, I::StaticCartesianIndices)
+    print(io, "Static")
+    show(io, CartesianIndices(I))
+end
diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl
index d7cec877..247fda9b 100644
--- a/src/host/broadcast.jl
+++ b/src/host/broadcast.jl
@@ -32,32 +32,41 @@ end
     return _copyto!(dest, instantiate(Broadcasted{Style}(bc.f, bc.args, axes(dest))))
 end
 
-@inline Base.copyto!(dest::AnyGPUArray, bc::Broadcasted{Nothing}) = _copyto!(dest, bc) # Keep it for ArrayConflict
+@inline Base.copyto!(dest::AnyGPUArray, bc::Broadcasted{Nothing}) =
+    _copyto!(dest, bc) # Keep it for ArrayConflict
 
-@inline Base.copyto!(dest::AbstractArray, bc::Broadcasted{<:AbstractGPUArrayStyle}) = _copyto!(dest, bc)
+@inline Base.copyto!(dest::AbstractArray, bc::Broadcasted{<:AbstractGPUArrayStyle}) =
+    _copyto!(dest, bc)
 
 @inline function _copyto!(dest::AbstractArray, bc::Broadcasted)
     axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))
     isempty(dest) && return dest
     bc′ = Broadcast.preprocess(dest, bc)
 
+    # if we can't efficiently index, use static indices to help the compiler eliminate idivs
+    Is = if isa(IndexStyle(dest), IndexCartesian) || isa(IndexStyle(bc′), IndexCartesian)
+        StaticCartesianIndices(dest)
+    else
+        CartesianIndices(dest)
+    end
+
     # grid-stride kernel
-    function broadcast_kernel(ctx, dest, bc′, nelem)
-        i = 0
-        while i < nelem
-            i += 1
-            I = @cartesianidx(dest, i)
+    function broadcast_kernel(ctx, dest, Is, bc′, nelem)
+        i = 1
+        while i <= nelem
+            I = @cartesianidx(Is, i)
             @inbounds dest[I] = bc′[I]
+            i += 1
         end
         return
     end
     elements = length(dest)
     elements_per_thread = typemax(Int)
-    heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc′, 1;
+    heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, Is, bc′, 1;
                                  elements, elements_per_thread)
     config = launch_configuration(backend(dest), heuristic;
                                   elements, elements_per_thread)
-    gpu_call(broadcast_kernel, dest, bc′, config.elements_per_thread;
+    gpu_call(broadcast_kernel, dest, Is, bc′, config.elements_per_thread;
              threads=config.threads, blocks=config.blocks)
 
     return dest
@@ -99,24 +108,34 @@ function Base.map!(f, dest::AnyGPUArray, xs::AbstractArray...)
         bc = Broadcast.preprocess(dest, bc)
     end
 
+    # if we can't efficiently index, use static indices to help the compiler eliminate idivs
+    Is = if isa(IndexStyle(bc), IndexCartesian)
+        StaticCartesianIndices(axes(bc))
+    else
+        CartesianIndices(axes(bc))
+    end
+
     # grid-stride kernel
-    function map_kernel(ctx, dest, bc, nelem)
-        for i in 1:nelem
+    function map_kernel(ctx, dest, Is, bc, nelem)
+        i = 1
+        while i <= nelem
             j = linear_index(ctx, i)
             j > common_length && return
 
-            J = CartesianIndices(axes(bc))[j]
-            @inbounds dest[j] = bc[J]
+            I = @cartesianidx(Is, i)
+            @inbounds dest[j] = bc[I]
+
+            i += 1
         end
         return
     end
     elements = common_length
     elements_per_thread = typemax(Int)
-    heuristic = launch_heuristic(backend(dest), map_kernel, dest, bc, 1;
+    heuristic = launch_heuristic(backend(dest), map_kernel, dest, Is, bc, 1;
                                  elements, elements_per_thread)
     config = launch_configuration(backend(dest), heuristic;
                                   elements, elements_per_thread)
-    gpu_call(map_kernel, dest, bc, config.elements_per_thread;
+    gpu_call(map_kernel, dest, Is, bc, config.elements_per_thread;
              threads=config.threads, blocks=config.blocks)
 
     return dest
diff --git a/src/host/math.jl b/src/host/math.jl
index 8d02c97f..cf455d31 100644
--- a/src/host/math.jl
+++ b/src/host/math.jl
@@ -2,7 +2,7 @@
 
 function Base.clamp!(A::AnyGPUArray, low, high)
     gpu_call(A, low, high) do ctx, A, low, high
-        I = @cartesianidx A
+        I = @linearidx A
         A[I] = clamp(A[I], low, high)
         return
     end

From 90fb573c02bb6a63d3f50c0d8cd52effddf6f5e7 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Tue, 5 Mar 2024 09:59:56 +0100
Subject: [PATCH 2/7] Bump Julia version used by CI.

---
 .buildkite/pipeline.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 8f7d4f80..b7f4aa82 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -2,7 +2,7 @@ steps:
   - label: "CUDA.jl"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.8
+          version: "1.10"
       - JuliaCI/julia-coverage#v1:
           codecov: true
     command: |
@@ -23,7 +23,7 @@ steps:
   - label: "oneAPI.jl"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.8
+          version: "1.10"
       - JuliaCI/julia-coverage#v1:
           codecov: true
     command: |
@@ -48,7 +48,7 @@ steps:
   - label: "Metal.jl"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.8
+          version: "1.10"
       - JuliaCI/julia-coverage#v1:
           codecov: true
     command: |

From c3783d791395f57a512b42187ca7cca3cfd9edcc Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 6 Mar 2024 13:04:40 +0100
Subject: [PATCH 3/7] Simplify.

---
 src/host/broadcast.jl | 46 +++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 19 deletions(-)

diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl
index 247fda9b..6842c67a 100644
--- a/src/host/broadcast.jl
+++ b/src/host/broadcast.jl
@@ -2,7 +2,7 @@
 
 using Base.Broadcast
 
-import Base.Broadcast: BroadcastStyle, Broadcasted, AbstractArrayStyle, instantiate
+using Base.Broadcast: BroadcastStyle, Broadcasted, AbstractArrayStyle, instantiate
 
 # but make sure we don't dispatch to the optimized copy method that directly indexes
 function Broadcast.copy(bc::Broadcasted{<:AbstractGPUArrayStyle{0}})
@@ -41,32 +41,40 @@ end
 @inline function _copyto!(dest::AbstractArray, bc::Broadcasted)
     axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))
     isempty(dest) && return dest
-    bc′ = Broadcast.preprocess(dest, bc)
-
-    # if we can't efficiently index, use static indices to help the compiler eliminate idivs
-    Is = if isa(IndexStyle(dest), IndexCartesian) || isa(IndexStyle(bc′), IndexCartesian)
-        StaticCartesianIndices(dest)
+    bc = Broadcast.preprocess(dest, bc)
+
+    broadcast_kernel = if ndims(bc) == 1 ||
+                          (isa(IndexStyle(dest), IndexLinear) &&
+                           isa(IndexStyle(bc), IndexLinear))
+        function (ctx, dest, bc, nelem)
+            i = 1
+            while i <= nelem
+                I = @linearidx(dest, i)
+                @inbounds dest[I] = bc[I]
+                i += 1
+            end
+            return
+        end
     else
-        CartesianIndices(dest)
-    end
-
-    # grid-stride kernel
-    function broadcast_kernel(ctx, dest, Is, bc′, nelem)
-        i = 1
-        while i <= nelem
-            I = @cartesianidx(Is, i)
-            @inbounds dest[I] = bc′[I]
-            i += 1
+        # XXX: we could use StaticCartesianIndices here, but that results in many compilations
+        function (ctx, dest, bc, nelem)
+            i = 0
+            while i < nelem
+                i += 1
+                I = @cartesianidx(dest, i)
+                @inbounds dest[I] = bc[I]
+            end
+            return
         end
-        return
     end
+
     elements = length(dest)
     elements_per_thread = typemax(Int)
-    heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, Is, bc′, 1;
+    heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc, 1;
                                  elements, elements_per_thread)
     config = launch_configuration(backend(dest), heuristic;
                                   elements, elements_per_thread)
-    gpu_call(broadcast_kernel, dest, Is, bc′, config.elements_per_thread;
+    gpu_call(broadcast_kernel, dest, bc, config.elements_per_thread;
              threads=config.threads, blocks=config.blocks)
 
     return dest

From 5b9d7c0e2924d3c82c1387af7fdca62a789db981 Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 6 Mar 2024 13:09:56 +0100
Subject: [PATCH 4/7] Remove StaticCartesian.

---
 src/device/indexing.jl | 27 ---------------------------
 1 file changed, 27 deletions(-)

diff --git a/src/device/indexing.jl b/src/device/indexing.jl
index a12ec16b..31084fce 100644
--- a/src/device/indexing.jl
+++ b/src/device/indexing.jl
@@ -83,30 +83,3 @@ macro cartesianidx(A, grididx=1, ctxsym=:ctx)
         @inbounds CartesianIndices(x)[i]
     end
 end
-
-
-## utilities
-
-# indexing a CartesianIndices at run time generates an integer division, which is slow.
-# to work around this, we can use a static CartesianIndices type to avoid the division.
-# this informs LLVM and the back-end about the static iteration bounds, allowing it to
-# lower the integer divison to a series of bit shifts, dramatically improving performance.
-# also see: maleadt/StaticCartesian.jl, JuliaGPU/GPUArrays.jl#454
-
-struct StaticCartesianIndices{N, I} end
-
-StaticCartesianIndices(iter::CartesianIndices{N}) where {N} =
-    StaticCartesianIndices{N, iter.indices}()
-StaticCartesianIndices(x) = StaticCartesianIndices(CartesianIndices(x))
-
-Base.CartesianIndices(iter::StaticCartesianIndices{N, I}) where {N, I} =
-    CartesianIndices{N, typeof(I)}(I)
-
-Base.@propagate_inbounds Base.getindex(I::StaticCartesianIndices, i::Int) =
-    CartesianIndices(I)[i]
-Base.length(I::StaticCartesianIndices) = length(CartesianIndices(I))
-
-function Base.show(io::IO, I::StaticCartesianIndices)
-    print(io, "Static")
-    show(io, CartesianIndices(I))
-end

From 548a8d92276e4136ca130feb1561911fbac87c2d Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 6 Mar 2024 13:13:27 +0100
Subject: [PATCH 5/7] Revert map change.

---
 src/host/broadcast.jl | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl
index 6842c67a..e4fa9172 100644
--- a/src/host/broadcast.jl
+++ b/src/host/broadcast.jl
@@ -116,13 +116,6 @@ function Base.map!(f, dest::AnyGPUArray, xs::AbstractArray...)
         bc = Broadcast.preprocess(dest, bc)
     end
 
-    # if we can't efficiently index, use static indices to help the compiler eliminate idivs
-    Is = if isa(IndexStyle(bc), IndexCartesian)
-        StaticCartesianIndices(axes(bc))
-    else
-        CartesianIndices(axes(bc))
-    end
-
     # grid-stride kernel
     function map_kernel(ctx, dest, Is, bc, nelem)
         i = 1
@@ -130,8 +123,8 @@ function Base.map!(f, dest::AnyGPUArray, xs::AbstractArray...)
             j = linear_index(ctx, i)
             j > common_length && return
 
-            I = @cartesianidx(Is, i)
-            @inbounds dest[j] = bc[I]
+            J = CartesianIndices(axes(bc))[j]
+            @inbounds dest[j] = bc[J]
 
             i += 1
         end
@@ -139,11 +132,11 @@ function Base.map!(f, dest::AnyGPUArray, xs::AbstractArray...)
     end
     elements = common_length
     elements_per_thread = typemax(Int)
-    heuristic = launch_heuristic(backend(dest), map_kernel, dest, Is, bc, 1;
+    heuristic = launch_heuristic(backend(dest), map_kernel, dest, bc, 1;
                                  elements, elements_per_thread)
     config = launch_configuration(backend(dest), heuristic;
                                   elements, elements_per_thread)
-    gpu_call(map_kernel, dest, Is, bc, config.elements_per_thread;
+    gpu_call(map_kernel, dest, bc, config.elements_per_thread;
              threads=config.threads, blocks=config.blocks)
 
     return dest

From fb8cf80582c6d5f54ccee3f1003260a691a4e49f Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 6 Mar 2024 13:13:42 +0100
Subject: [PATCH 6/7] Remove outdated comment.

---
 src/host/broadcast.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl
index e4fa9172..bf99bc6f 100644
--- a/src/host/broadcast.jl
+++ b/src/host/broadcast.jl
@@ -56,7 +56,6 @@ end
             return
         end
     else
-        # XXX: we could use StaticCartesianIndices here, but that results in many compilations
         function (ctx, dest, bc, nelem)
             i = 0
             while i < nelem

From 0f6bf2a25cca10fc147a7674d0ce2932f24a1bed Mon Sep 17 00:00:00 2001
From: Tim Besard <tim.besard@gmail.com>
Date: Wed, 6 Mar 2024 13:21:25 +0100
Subject: [PATCH 7/7] Fixes.

---
 src/host/broadcast.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl
index bf99bc6f..93532e5f 100644
--- a/src/host/broadcast.jl
+++ b/src/host/broadcast.jl
@@ -43,7 +43,7 @@ end
     isempty(dest) && return dest
     bc = Broadcast.preprocess(dest, bc)
 
-    broadcast_kernel = if ndims(bc) == 1 ||
+    broadcast_kernel = if ndims(dest) == 1 ||
                           (isa(IndexStyle(dest), IndexLinear) &&
                            isa(IndexStyle(bc), IndexLinear))
         function (ctx, dest, bc, nelem)
@@ -116,7 +116,7 @@ function Base.map!(f, dest::AnyGPUArray, xs::AbstractArray...)
     end
 
     # grid-stride kernel
-    function map_kernel(ctx, dest, Is, bc, nelem)
+    function map_kernel(ctx, dest, bc, nelem)
         i = 1
         while i <= nelem
             j = linear_index(ctx, i)