Test correct backend in examples test (#597)

christiangnrd · web-flow · commit dab03b92f1f6 · 2025-06-10T15:52:03.000+02:00
* Use Float32 in examples with backends that don't support Float64

* Add `backend` argument for `examples_testsuite`

* Reduce `TILE_DIM` for compatibility

Metal doesn't always support 1024 threads per group, which causes intermittent errors with 32x32 tiles

* Fix histogram implementation

The final part of the loop expects every thread to exist, so we must still launch all of them. Extra threads simply avoid doing any work until then.

Also use Int32 since some backends lack Int64 atomics, and give one of the tests an unusual groupsize, since that is when the errors used to pop up.
diff --git a/examples/histogram.jl b/examples/histogram.jl
@@ -5,31 +5,29 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) #
 
 # Function to use as a baseline for CPU metrics
 function create_histogram(input)
-    histogram_output = zeros(Int, maximum(input))
+    histogram_output = zeros(eltype(input), maximum(input))
     for i in input
         histogram_output[i] += 1
     end
     return histogram_output
 end
 
 # This a 1D histogram kernel where the histogramming happens on shmem
-@kernel function histogram_kernel!(histogram_output, input)
-    tid = @index(Global, Linear)
+@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input)
+    gid = @index(Group, Linear)
     lid = @index(Local, Linear)
 
-    @uniform warpsize = Int(32)
-
-    @uniform gs = @groupsize()[1]
+    @uniform gs = prod(@groupsize())
+    tid = (gid - 1) * gs + lid
     @uniform N = length(histogram_output)
 
-    shared_histogram = @localmem Int (gs)
+    shared_histogram = @localmem eltype(input) (gs)
 
     # This will go through all input elements and assign them to a location in
     # shmem. Note that if there is not enough shem, we create different shmem
     # blocks to write to. For example, if shmem is of size 256, but it's
     # possible to get a value of 312, then we will have 2 separate shmem blocks,
     # one from 1->256, and another from 256->512
-    @uniform max_element = 1
     for min_element in 1:gs:N
 
         # Setting shared_histogram to 0
@@ -42,7 +40,7 @@ end
         end
 
         # Defining bin on shared memory and writing to it if possible
-        bin = input[tid]
+        bin = tid <= length(input) ? input[tid] : 0
         if bin >= min_element && bin < max_element
             bin -= min_element - 1
             @atomic shared_histogram[bin] += 1
@@ -58,10 +56,10 @@ end
 
 end
 
-function histogram!(histogram_output, input)
+function histogram!(histogram_output, input, groupsize = 256)
     backend = get_backend(histogram_output)
     # Need static block size
-    kernel! = histogram_kernel!(backend, (256,))
+    kernel! = histogram_kernel!(backend, (groupsize,))
     kernel!(histogram_output, input, ndrange = size(input))
     return
 end
@@ -74,9 +72,10 @@ function move(backend, input)
 end
 
 @testset "histogram tests" begin
-    rand_input = [rand(1:128) for i in 1:1000]
-    linear_input = [i for i in 1:1024]
-    all_two = [2 for i in 1:512]
+    # Use Int32 as some backends don't support 64-bit atomics
+    rand_input = Int32.(rand(1:128, 1000))
+    linear_input = Int32.(1:1024)
+    all_two = fill(Int32(2), 512)
 
     histogram_rand_baseline = create_histogram(rand_input)
     histogram_linear_baseline = create_histogram(linear_input)
@@ -86,14 +85,14 @@ end
     linear_input = move(backend, linear_input)
     all_two = move(backend, all_two)
 
-    rand_histogram = KernelAbstractions.zeros(backend, Int, 128)
-    linear_histogram = KernelAbstractions.zeros(backend, Int, 1024)
-    two_histogram = KernelAbstractions.zeros(backend, Int, 2)
+    rand_histogram = KernelAbstractions.zeros(backend, eltype(rand_input), Int(maximum(rand_input)))
+    linear_histogram = KernelAbstractions.zeros(backend, eltype(linear_input), Int(maximum(linear_input)))
+    two_histogram = KernelAbstractions.zeros(backend, eltype(all_two), Int(maximum(all_two)))
 
-    histogram!(rand_histogram, rand_input)
+    histogram!(rand_histogram, rand_input, 6)
     histogram!(linear_histogram, linear_input)
     histogram!(two_histogram, all_two)
-    KernelAbstractions.synchronize(CPU())
+    KernelAbstractions.synchronize(backend)
 
     @test isapprox(Array(rand_histogram), histogram_rand_baseline)
     @test isapprox(Array(linear_histogram), histogram_linear_baseline)
diff --git a/examples/memcopy.jl b/examples/memcopy.jl
@@ -16,8 +16,8 @@ function mycopy!(A, B)
     return
 end
 
-A = KernelAbstractions.zeros(backend, Float64, 128, 128)
-B = KernelAbstractions.ones(backend, Float64, 128, 128)
+A = KernelAbstractions.zeros(backend, f_type, 128, 128)
+B = KernelAbstractions.ones(backend, f_type, 128, 128)
 mycopy!(A, B)
 KernelAbstractions.synchronize(backend)
 @test A == B
diff --git a/examples/memcopy_static.jl b/examples/memcopy_static.jl
@@ -16,8 +16,8 @@ function mycopy_static!(A, B)
     return
 end
 
-A = KernelAbstractions.zeros(backend, Float64, 128, 128)
-B = KernelAbstractions.ones(backend, Float64, 128, 128)
+A = KernelAbstractions.zeros(backend, f_type, 128, 128)
+B = KernelAbstractions.ones(backend, f_type, 128, 128)
 mycopy_static!(A, B)
 KernelAbstractions.synchronize(backend)
 @test A == B
diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl
@@ -4,7 +4,10 @@ using Test
 using Random
 include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend
 
-const TILE_DIM = 32
+# We use a TILE_DIM of 16 as a safe value: while most
+#  backends support up to 1024 threads per group,
+#  Metal sometimes supports fewer.
+const TILE_DIM = 16
 
 @kernel function coalesced_matmul_kernel!(
         output, @Const(input1), @Const(input2), N, R, M,
diff --git a/examples/utils.jl b/examples/utils.jl
@@ -1,9 +1,13 @@
 # EXCLUDE FROM TESTING
-if Base.find_package("CUDA") !== nothing
-    using CUDA
-    using CUDA.CUDAKernels
-    const backend = CUDABackend()
-    CUDA.allowscalar(false)
-else
-    const backend = CPU()
+if !(@isdefined backend)
+    if Base.find_package("CUDA") !== nothing
+        using CUDA
+        using CUDA.CUDAKernels
+        const backend = CUDABackend()
+        CUDA.allowscalar(false)
+    else
+        const backend = CPU()
+    end
 end
+
+const f_type = KernelAbstractions.supports_float64(backend) ? Float64 : Float32
diff --git a/test/examples.jl b/test/examples.jl
@@ -9,7 +9,7 @@ function find_sources(path::String, sources = String[])
     return sources
 end
 
-function examples_testsuite(backend_str)
+function examples_testsuite(backend, backend_str)
     @testset "examples" begin
         examples_dir = joinpath(@__DIR__, "..", "examples")
         examples = find_sources(examples_dir)
@@ -21,6 +21,7 @@ function examples_testsuite(backend_str)
         @testset "$(basename(example))" for example in examples
             @eval module $(gensym())
             backend_str = $backend_str
+            const backend = ($backend)()
             include($example)
             end
             @test true
diff --git a/test/testsuite.jl b/test/testsuite.jl
@@ -84,7 +84,7 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{
     end
 
     @conditional_testset "Examples" skip_tests begin
-        examples_testsuite(backend_str)
+        examples_testsuite(backend, backend_str)
     end
 
     return