Use linear indexing in broadcast kernel when possible (#520)

maleadt · web-flow · commit e4d40ea873c7 · 2024-03-08T11:56:40.000+01:00
diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
@@ -2,7 +2,7 @@ steps:
   - label: "CUDA.jl"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.8
+          version: "1.10"
       - JuliaCI/julia-coverage#v1:
           codecov: true
     command: |
@@ -23,7 +23,7 @@ steps:
   - label: "oneAPI.jl"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.8
+          version: "1.10"
       - JuliaCI/julia-coverage#v1:
           codecov: true
     command: |
@@ -48,7 +48,7 @@ steps:
   - label: "Metal.jl"
     plugins:
       - JuliaCI/julia#v1:
-          version: 1.8
+          version: "1.10"
       - JuliaCI/julia-coverage#v1:
           codecov: true
     command: |
diff --git a/src/device/indexing.jl b/src/device/indexing.jl
@@ -64,7 +64,9 @@ macro linearidx(A, grididx=1, ctxsym=:ctx)
     quote
         x = $(esc(A))
         i = linear_index($(esc(ctxsym)), $(esc(grididx)))
-        i > length(x) && return
+        if !(1 <= i <= length(x))
+            return
+        end
         i
     end
 end
diff --git a/src/host/broadcast.jl b/src/host/broadcast.jl
@@ -2,7 +2,7 @@
 
 using Base.Broadcast
 
-import Base.Broadcast: BroadcastStyle, Broadcasted, AbstractArrayStyle, instantiate
+using Base.Broadcast: BroadcastStyle, Broadcasted, AbstractArrayStyle, instantiate
 
 # but make sure we don't dispatch to the optimized copy method that directly indexes
 function Broadcast.copy(bc::Broadcasted{<:AbstractGPUArrayStyle{0}})
@@ -32,32 +32,48 @@ end
     return _copyto!(dest, instantiate(Broadcasted{Style}(bc.f, bc.args, axes(dest))))
 end
 
-@inline Base.copyto!(dest::AnyGPUArray, bc::Broadcasted{Nothing}) = _copyto!(dest, bc) # Keep it for ArrayConflict
+@inline Base.copyto!(dest::AnyGPUArray, bc::Broadcasted{Nothing}) =
+    _copyto!(dest, bc) # Keep it for ArrayConflict
 
-@inline Base.copyto!(dest::AbstractArray, bc::Broadcasted{<:AbstractGPUArrayStyle}) = _copyto!(dest, bc)
+@inline Base.copyto!(dest::AbstractArray, bc::Broadcasted{<:AbstractGPUArrayStyle}) =
+    _copyto!(dest, bc)
 
 @inline function _copyto!(dest::AbstractArray, bc::Broadcasted)
     axes(dest) == axes(bc) || Broadcast.throwdm(axes(dest), axes(bc))
     isempty(dest) && return dest
-    bc′ = Broadcast.preprocess(dest, bc)
-
-    # grid-stride kernel
-    function broadcast_kernel(ctx, dest, bc′, nelem)
-        i = 0
-        while i < nelem
-            i += 1
-            I = @cartesianidx(dest, i)
-            @inbounds dest[I] = bc′[I]
+    bc = Broadcast.preprocess(dest, bc)
+
+    broadcast_kernel = if ndims(dest) == 1 ||
+                          (isa(IndexStyle(dest), IndexLinear) &&
+                           isa(IndexStyle(bc), IndexLinear))
+        function (ctx, dest, bc, nelem)
+            i = 1
+            while i <= nelem
+                I = @linearidx(dest, i)
+                @inbounds dest[I] = bc[I]
+                i += 1
+            end
+            return
+        end
+    else
+        function (ctx, dest, bc, nelem)
+            i = 0
+            while i < nelem
+                i += 1
+                I = @cartesianidx(dest, i)
+                @inbounds dest[I] = bc[I]
+            end
+            return
         end
-        return
     end
+
     elements = length(dest)
     elements_per_thread = typemax(Int)
-    heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc′, 1;
+    heuristic = launch_heuristic(backend(dest), broadcast_kernel, dest, bc, 1;
                                  elements, elements_per_thread)
     config = launch_configuration(backend(dest), heuristic;
                                   elements, elements_per_thread)
-    gpu_call(broadcast_kernel, dest, bc′, config.elements_per_thread;
+    gpu_call(broadcast_kernel, dest, bc, config.elements_per_thread;
              threads=config.threads, blocks=config.blocks)
 
     return dest
@@ -101,12 +117,15 @@ function Base.map!(f, dest::AnyGPUArray, xs::AbstractArray...)
 
     # grid-stride kernel
     function map_kernel(ctx, dest, bc, nelem)
-        for i in 1:nelem
+        i = 1
+        while i <= nelem
             j = linear_index(ctx, i)
             j > common_length && return
 
             J = CartesianIndices(axes(bc))[j]
             @inbounds dest[j] = bc[J]
+
+            i += 1
         end
         return
     end
diff --git a/src/host/math.jl b/src/host/math.jl
@@ -2,7 +2,7 @@
 
 function Base.clamp!(A::AnyGPUArray, low, high)
     gpu_call(A, low, high) do ctx, A, low, high
-        I = @cartesianidx A
+        I = @linearidx A
         A[I] = clamp(A[I], low, high)
         return
     end