From 09d3d51820617dea2410ea86451a659051c334a5 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 6 Jun 2025 17:07:56 -0300 Subject: [PATCH 1/7] Use Float32 in examples with backends that don't support Float64 --- examples/memcopy.jl | 4 ++-- examples/memcopy_static.jl | 4 ++-- examples/utils.jl | 2 ++ 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/memcopy.jl b/examples/memcopy.jl index 3159f3670..0fe141041 100644 --- a/examples/memcopy.jl +++ b/examples/memcopy.jl @@ -16,8 +16,8 @@ function mycopy!(A, B) return end -A = KernelAbstractions.zeros(backend, Float64, 128, 128) -B = KernelAbstractions.ones(backend, Float64, 128, 128) +A = KernelAbstractions.zeros(backend, f_type, 128, 128) +B = KernelAbstractions.ones(backend, f_type, 128, 128) mycopy!(A, B) KernelAbstractions.synchronize(backend) @test A == B diff --git a/examples/memcopy_static.jl b/examples/memcopy_static.jl index 9f088294e..7c965c557 100644 --- a/examples/memcopy_static.jl +++ b/examples/memcopy_static.jl @@ -16,8 +16,8 @@ function mycopy_static!(A, B) return end -A = KernelAbstractions.zeros(backend, Float64, 128, 128) -B = KernelAbstractions.ones(backend, Float64, 128, 128) +A = KernelAbstractions.zeros(backend, f_type, 128, 128) +B = KernelAbstractions.ones(backend, f_type, 128, 128) mycopy_static!(A, B) KernelAbstractions.synchronize(backend) @test A == B diff --git a/examples/utils.jl b/examples/utils.jl index 5e93299b1..cd9f56459 100644 --- a/examples/utils.jl +++ b/examples/utils.jl @@ -7,3 +7,5 @@ if Base.find_package("CUDA") !== nothing else const backend = CPU() end + +const f_type = KernelAbstractions.supports_float64(backend) ? 
Float64 : Float32 From 34227175527d5583e069950bc1186265cfe53377 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 6 Jun 2025 17:10:03 -0300 Subject: [PATCH 2/7] Add `backend` argument for `examples_testset` --- examples/utils.jl | 16 +++++++++------- test/examples.jl | 3 ++- test/testsuite.jl | 2 +- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/examples/utils.jl b/examples/utils.jl index cd9f56459..b1ab6353b 100644 --- a/examples/utils.jl +++ b/examples/utils.jl @@ -1,11 +1,13 @@ # EXCLUDE FROM TESTING -if Base.find_package("CUDA") !== nothing - using CUDA - using CUDA.CUDAKernels - const backend = CUDABackend() - CUDA.allowscalar(false) -else - const backend = CPU() +if !(@isdefined backend) + if Base.find_package("CUDA") !== nothing + using CUDA + using CUDA.CUDAKernels + const backend = CUDABackend() + CUDA.allowscalar(false) + else + const backend = CPU() + end end const f_type = KernelAbstractions.supports_float64(backend) ? 
Float64 : Float32 diff --git a/test/examples.jl b/test/examples.jl index 02374db89..d10f48d65 100644 --- a/test/examples.jl +++ b/test/examples.jl @@ -9,7 +9,7 @@ function find_sources(path::String, sources = String[]) return sources end -function examples_testsuite(backend_str) +function examples_testsuite(backend, backend_str) @testset "examples" begin examples_dir = joinpath(@__DIR__, "..", "examples") examples = find_sources(examples_dir) @@ -21,6 +21,7 @@ function examples_testsuite(backend_str) @testset "$(basename(example))" for example in examples @eval module $(gensym()) backend_str = $backend_str + const backend = ($backend)() include($example) end @test true diff --git a/test/testsuite.jl b/test/testsuite.jl index f9f96fcf0..2418db998 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -84,7 +84,7 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{ end @conditional_testset "Examples" skip_tests begin - examples_testsuite(backend_str) + examples_testsuite(backend, backend_str) end return From 2963addd3c1932948709df6911512c7a3b52ec24 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 6 Jun 2025 17:11:28 -0300 Subject: [PATCH 3/7] Reduce `TILE_DIM` for compatibility Metal doesn't always support 1024 threads, which causes intermittent errors with 32x32 tiles --- examples/performant_matmul.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl index dbd5ad75a..3b274de5d 100644 --- a/examples/performant_matmul.jl +++ b/examples/performant_matmul.jl @@ -4,7 +4,7 @@ using Test using Random include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend -const TILE_DIM = 32 +const TILE_DIM = 16 @kernel function coalesced_matmul_kernel!( output, @Const(input1), @Const(input2), N, R, M, From 4cefdfb1a7c669048cf6a17df3e08d62bcc79362 Mon Sep 17 00:00:00 2001 From: 
Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 6 Jun 2025 17:14:25 -0300 Subject: [PATCH 4/7] Fix histogram implementation The final part of the loop expects every thread to exist, so we cannot not launch them. Avoid work on extra threads until then. Also use Int32 since some backends lack Int64 atomics, and make one of the tests have weird groupsize since that's when the errors used to pop up. --- examples/histogram.jl | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/examples/histogram.jl b/examples/histogram.jl index 3e91c29d4..9edc02930 100644 --- a/examples/histogram.jl +++ b/examples/histogram.jl @@ -5,7 +5,7 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # # Function to use as a baseline for CPU metrics function create_histogram(input) - histogram_output = zeros(Int, maximum(input)) + histogram_output = zeros(eltype(input), maximum(input)) for i in input histogram_output[i] += 1 end @@ -13,23 +13,21 @@ end # This a 1D histogram kernel where the histogramming happens on shmem -@kernel function histogram_kernel!(histogram_output, input) - tid = @index(Global, Linear) +@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input) + gid = @index(Group, Linear) lid = @index(Local, Linear) - @uniform warpsize = Int(32) - - @uniform gs = @groupsize()[1] + @uniform gs = prod(@groupsize()) + tid = (gid - 1) * gs + lid @uniform N = length(histogram_output) - shared_histogram = @localmem Int (gs) + shared_histogram = @localmem eltype(input) (gs) # This will go through all input elements and assign them to a location in # shmem. Note that if there is not enough shem, we create different shmem # blocks to write to. 
For example, if shmem is of size 256, but it's # possible to get a value of 312, then we will have 2 separate shmem blocks, # one from 1->256, and another from 256->512 - @uniform max_element = 1 for min_element in 1:gs:N # Setting shared_histogram to 0 @@ -42,7 +40,7 @@ end end # Defining bin on shared memory and writing to it if possible - bin = input[tid] + bin = tid <= length(input) ? input[tid] : 0 if bin >= min_element && bin < max_element bin -= min_element - 1 @atomic shared_histogram[bin] += 1 @@ -58,10 +56,10 @@ end end -function histogram!(histogram_output, input) +function histogram!(histogram_output, input, groupsize = 256) backend = get_backend(histogram_output) # Need static block size - kernel! = histogram_kernel!(backend, (256,)) + kernel! = histogram_kernel!(backend, (groupsize,)) kernel!(histogram_output, input, ndrange = size(input)) return end @@ -74,9 +72,10 @@ function move(backend, input) end @testset "histogram tests" begin - rand_input = [rand(1:128) for i in 1:1000] - linear_input = [i for i in 1:1024] - all_two = [2 for i in 1:512] + # Use Int32 as some backends don't support 64-bit atomics + rand_input = Int32.(rand(1:128, 1000)) + linear_input = Int32.(1:1024) + all_two = fill(Int32(2), 512) histogram_rand_baseline = create_histogram(rand_input) histogram_linear_baseline = create_histogram(linear_input) @@ -86,14 +85,14 @@ end linear_input = move(backend, linear_input) all_two = move(backend, all_two) - rand_histogram = KernelAbstractions.zeros(backend, Int, 128) - linear_histogram = KernelAbstractions.zeros(backend, Int, 1024) - two_histogram = KernelAbstractions.zeros(backend, Int, 2) + rand_histogram = KernelAbstractions.zeros(backend, eltype(rand_input), Int(maximum(rand_input))) + linear_histogram = KernelAbstractions.zeros(backend, eltype(linear_input), Int(maximum(linear_input))) + two_histogram = KernelAbstractions.zeros(backend, eltype(all_two), Int(maximum(all_two))) - histogram!(rand_histogram, rand_input) + 
histogram!(rand_histogram, rand_input, 6) histogram!(linear_histogram, linear_input) histogram!(two_histogram, all_two) - KernelAbstractions.synchronize(CPU()) + KernelAbstractions.synchronize(backend) @test isapprox(Array(rand_histogram), histogram_rand_baseline) @test isapprox(Array(linear_histogram), histogram_linear_baseline) From 432117c51d41479629f3cdc4888367242ba668ba Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 6 Jun 2025 17:05:53 -0300 Subject: [PATCH 5/7] REVERT BEFORE MERGING --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index acd4e89de..889f1c4f6 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "KernelAbstractions" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" authors = ["Valentin Churavy and contributors"] -version = "0.10.0-dev" +version = "0.9.35" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" From c3abf87397244e8c1e52e4abf1f536c2993cf2fc Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 10 Jun 2025 10:30:56 +0200 Subject: [PATCH 6/7] Update Project.toml Co-authored-by: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 889f1c4f6..acd4e89de 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "KernelAbstractions" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" authors = ["Valentin Churavy and contributors"] -version = "0.9.35" +version = "0.10.0-dev" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" From 8d285dbb39e6ba6a31345e7771f2978dc5ba5b72 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 10 Jun 2025 10:43:35 -0300 Subject: [PATCH 7/7] Justify `TILE_DIM = 16` --- examples/performant_matmul.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/performant_matmul.jl 
b/examples/performant_matmul.jl index 3b274de5d..afd247d13 100644 --- a/examples/performant_matmul.jl +++ b/examples/performant_matmul.jl @@ -4,6 +4,9 @@ using Test using Random include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend +# We use a TILE_DIM of 16 as a safe value since while +# most backends support up to 1024 threads per group, +# Metal sometimes supports fewer. const TILE_DIM = 16 @kernel function coalesced_matmul_kernel!(