From 09d3d51820617dea2410ea86451a659051c334a5 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 6 Jun 2025 17:07:56 -0300 Subject: [PATCH 1/7] Use Float32 in examples with backends that don't support Float64 --- examples/memcopy.jl | 4 ++-- examples/memcopy_static.jl | 4 ++-- examples/utils.jl | 2 ++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/memcopy.jl b/examples/memcopy.jl index 3159f3670..0fe141041 100644 --- a/examples/memcopy.jl +++ b/examples/memcopy.jl @@ -16,8 +16,8 @@ function mycopy!(A, B) return end -A = KernelAbstractions.zeros(backend, Float64, 128, 128) -B = KernelAbstractions.ones(backend, Float64, 128, 128) +A = KernelAbstractions.zeros(backend, f_type, 128, 128) +B = KernelAbstractions.ones(backend, f_type, 128, 128) mycopy!(A, B) KernelAbstractions.synchronize(backend) @test A == B diff --git a/examples/memcopy_static.jl b/examples/memcopy_static.jl index 9f088294e..7c965c557 100644 --- a/examples/memcopy_static.jl +++ b/examples/memcopy_static.jl @@ -16,8 +16,8 @@ function mycopy_static!(A, B) return end -A = KernelAbstractions.zeros(backend, Float64, 128, 128) -B = KernelAbstractions.ones(backend, Float64, 128, 128) +A = KernelAbstractions.zeros(backend, f_type, 128, 128) +B = KernelAbstractions.ones(backend, f_type, 128, 128) mycopy_static!(A, B) KernelAbstractions.synchronize(backend) @test A == B diff --git a/examples/utils.jl b/examples/utils.jl index 5e93299b1..cd9f56459 100644 --- a/examples/utils.jl +++ b/examples/utils.jl @@ -7,3 +7,5 @@ if Base.find_package("CUDA") !== nothing else const backend = CPU() end + +const f_type = KernelAbstractions.supports_float64(backend) ? 
Float64 : Float32 From 34227175527d5583e069950bc1186265cfe53377 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 6 Jun 2025 17:10:03 -0300 Subject: [PATCH 2/7] Add `backend` argument for `examples_testset` --- examples/utils.jl | 16 +++++++++------- test/examples.jl | 3 ++- test/testsuite.jl | 2 +- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/examples/utils.jl b/examples/utils.jl index cd9f56459..b1ab6353b 100644 --- a/examples/utils.jl +++ b/examples/utils.jl @@ -1,11 +1,13 @@ # EXCLUDE FROM TESTING -if Base.find_package("CUDA") !== nothing - using CUDA - using CUDA.CUDAKernels - const backend = CUDABackend() - CUDA.allowscalar(false) -else - const backend = CPU() +if !(@isdefined backend) + if Base.find_package("CUDA") !== nothing + using CUDA + using CUDA.CUDAKernels + const backend = CUDABackend() + CUDA.allowscalar(false) + else + const backend = CPU() + end end const f_type = KernelAbstractions.supports_float64(backend) ? 
Float64 : Float32 diff --git a/test/examples.jl b/test/examples.jl index 02374db89..d10f48d65 100644 --- a/test/examples.jl +++ b/test/examples.jl @@ -9,7 +9,7 @@ function find_sources(path::String, sources = String[]) return sources end -function examples_testsuite(backend_str) +function examples_testsuite(backend, backend_str) @testset "examples" begin examples_dir = joinpath(@__DIR__, "..", "examples") examples = find_sources(examples_dir) @@ -21,6 +21,7 @@ function examples_testsuite(backend_str) @testset "$(basename(example))" for example in examples @eval module $(gensym()) backend_str = $backend_str + const backend = ($backend)() include($example) end @test true diff --git a/test/testsuite.jl b/test/testsuite.jl index f9f96fcf0..2418db998 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -84,7 +84,7 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{ end @conditional_testset "Examples" skip_tests begin - examples_testsuite(backend_str) + examples_testsuite(backend, backend_str) end return From 2963addd3c1932948709df6911512c7a3b52ec24 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 6 Jun 2025 17:11:28 -0300 Subject: [PATCH 3/7] Reduce `TILE_DIM` for compatibility Metal doesn't always support 1024 threads, which causes intermittent errors with 32x32 tiles --- examples/performant_matmul.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl index dbd5ad75a..3b274de5d 100644 --- a/examples/performant_matmul.jl +++ b/examples/performant_matmul.jl @@ -4,7 +4,7 @@ using Test using Random include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend -const TILE_DIM = 32 +const TILE_DIM = 16 @kernel function coalesced_matmul_kernel!( output, @Const(input1), @Const(input2), N, R, M, From 4cefdfb1a7c669048cf6a17df3e08d62bcc79362 Mon Sep 17 00:00:00 2001 From: 
Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 6 Jun 2025 17:14:25 -0300 Subject: [PATCH 4/7] Fix histogram implementation The final part of the loop expects every thread to exist, so we cannot not launch them. Avoid work on extra threads until then. Also use Int32 since some backends lack Int64 atomics, and make one of the tests have weird groupsize since that's when the errors used to pop up. --- examples/histogram.jl | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/examples/histogram.jl b/examples/histogram.jl index 3e91c29d4..9edc02930 100644 --- a/examples/histogram.jl +++ b/examples/histogram.jl @@ -5,7 +5,7 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # # Function to use as a baseline for CPU metrics function create_histogram(input) - histogram_output = zeros(Int, maximum(input)) + histogram_output = zeros(eltype(input), maximum(input)) for i in input histogram_output[i] += 1 end @@ -13,23 +13,21 @@ end # This a 1D histogram kernel where the histogramming happens on shmem -@kernel function histogram_kernel!(histogram_output, input) - tid = @index(Global, Linear) +@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input) + gid = @index(Group, Linear) lid = @index(Local, Linear) - @uniform warpsize = Int(32) - - @uniform gs = @groupsize()[1] + @uniform gs = prod(@groupsize()) + tid = (gid - 1) * gs + lid @uniform N = length(histogram_output) - shared_histogram = @localmem Int (gs) + shared_histogram = @localmem eltype(input) (gs) # This will go through all input elements and assign them to a location in # shmem. Note that if there is not enough shem, we create different shmem # blocks to write to. 
For example, if shmem is of size 256, but it's # possible to get a value of 312, then we will have 2 separate shmem blocks, # one from 1->256, and another from 256->512 - @uniform max_element = 1 for min_element in 1:gs:N # Setting shared_histogram to 0 @@ -42,7 +40,7 @@ end end # Defining bin on shared memory and writing to it if possible - bin = input[tid] + bin = tid <= length(input) ? input[tid] : 0 if bin >= min_element && bin < max_element bin -= min_element - 1 @atomic shared_histogram[bin] += 1 @@ -58,10 +56,10 @@ end end -function histogram!(histogram_output, input) +function histogram!(histogram_output, input, groupsize = 256) backend = get_backend(histogram_output) # Need static block size - kernel! = histogram_kernel!(backend, (256,)) + kernel! = histogram_kernel!(backend, (groupsize,)) kernel!(histogram_output, input, ndrange = size(input)) return end @@ -74,9 +72,10 @@ function move(backend, input) end @testset "histogram tests" begin - rand_input = [rand(1:128) for i in 1:1000] - linear_input = [i for i in 1:1024] - all_two = [2 for i in 1:512] + # Use Int32 as some backends don't support 64-bit atomics + rand_input = Int32.(rand(1:128, 1000)) + linear_input = Int32.(1:1024) + all_two = fill(Int32(2), 512) histogram_rand_baseline = create_histogram(rand_input) histogram_linear_baseline = create_histogram(linear_input) @@ -86,14 +85,14 @@ end linear_input = move(backend, linear_input) all_two = move(backend, all_two) - rand_histogram = KernelAbstractions.zeros(backend, Int, 128) - linear_histogram = KernelAbstractions.zeros(backend, Int, 1024) - two_histogram = KernelAbstractions.zeros(backend, Int, 2) + rand_histogram = KernelAbstractions.zeros(backend, eltype(rand_input), Int(maximum(rand_input))) + linear_histogram = KernelAbstractions.zeros(backend, eltype(linear_input), Int(maximum(linear_input))) + two_histogram = KernelAbstractions.zeros(backend, eltype(all_two), Int(maximum(all_two))) - histogram!(rand_histogram, rand_input) + 
histogram!(rand_histogram, rand_input, 6) histogram!(linear_histogram, linear_input) histogram!(two_histogram, all_two) - KernelAbstractions.synchronize(CPU()) + KernelAbstractions.synchronize(backend) @test isapprox(Array(rand_histogram), histogram_rand_baseline) @test isapprox(Array(linear_histogram), histogram_linear_baseline) From 432117c51d41479629f3cdc4888367242ba668ba Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 6 Jun 2025 17:05:53 -0300 Subject: [PATCH 5/7] REVERT BEFORE MERGING --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index acd4e89de..889f1c4f6 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "KernelAbstractions" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" authors = ["Valentin Churavy and contributors"] -version = "0.10.0-dev" +version = "0.9.35" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" From c3abf87397244e8c1e52e4abf1f536c2993cf2fc Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 10 Jun 2025 10:30:56 +0200 Subject: [PATCH 6/7] Update Project.toml Co-authored-by: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 889f1c4f6..acd4e89de 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "KernelAbstractions" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" authors = ["Valentin Churavy and contributors"] -version = "0.9.35" +version = "0.10.0-dev" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" From 8d285dbb39e6ba6a31345e7771f2978dc5ba5b72 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 10 Jun 2025 10:43:35 -0300 Subject: [PATCH 7/7] Justify `TILE_DIM = 16` --- examples/performant_matmul.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/performant_matmul.jl 
b/examples/performant_matmul.jl index 3b274de5d..afd247d13 100644 --- a/examples/performant_matmul.jl +++ b/examples/performant_matmul.jl @@ -4,6 +4,9 @@ using Test using Random include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend +# We use a TILE_DIM of 16 as a safe value since while +# most backends support up to 1024 threads per group, +# Metal sometimes supports fewer. const TILE_DIM = 16 @kernel function coalesced_matmul_kernel!(