From 28cf68c4cae216b460d141b9f39fed3d7b8b7f2e Mon Sep 17 00:00:00 2001 From: Neven Sajko <4944410+nsajko@users.noreply.github.com> Date: Tue, 10 Jun 2025 15:00:43 +0200 Subject: [PATCH 1/3] prevent `get_backend` from overflowing the stack (#602) Co-authored-by: Valentin Churavy (cherry picked from commit 474050e959515bce591fd2383130fe96395b05d7) --- src/KernelAbstractions.jl | 8 +++++++- test/test.jl | 4 ++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 5103882ef..73298ca53 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -516,7 +516,13 @@ Get a [`Backend`](@ref) instance suitable for array `A`. function get_backend end # Should cover SubArray, ReshapedArray, ReinterpretArray, Hermitian, AbstractTriangular, etc.: -get_backend(A::AbstractArray) = get_backend(parent(A)) +function get_backend(A::AbstractArray) + P = parent(A) + if P isa typeof(A) + throw(ArgumentError("Implement `KernelAbstractions.get_backend(::$(typeof(A)))`")) + end + return get_backend(P) +end get_backend(::Array) = CPU() diff --git a/test/test.jl b/test/test.jl index 4e017c8a9..3528a2cc9 100644 --- a/test/test.jl +++ b/test/test.jl @@ -7,6 +7,9 @@ using Adapt identity(x) = x +struct UnknownAbstractVector <: AbstractVector{Float32} # issue #588 +end + function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; skip_tests = Set{String}()) @conditional_testset "partition" skip_tests begin backend = Backend() @@ -80,6 +83,7 @@ function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; sk @test @inferred(KernelAbstractions.get_backend(view(A, 2:4, 1:3))) isa backendT @test @inferred(KernelAbstractions.get_backend(Diagonal(x))) isa backendT @test @inferred(KernelAbstractions.get_backend(Tridiagonal(A))) isa backendT + @test_throws ArgumentError KernelAbstractions.get_backend(UnknownAbstractVector()) # issue #588 end @conditional_testset "sparse" skip_tests begin 
From 6a68c20644a0c5a01b3807ee96eea20fa0bf49c1 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Tue, 10 Jun 2025 15:03:24 +0200 Subject: [PATCH 2/3] bump patch release version --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index cb50b4c33..6fa8a12f6 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "KernelAbstractions" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" authors = ["Valentin Churavy and contributors"] -version = "0.9.34" +version = "0.9.35" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" From 69198a1e821064da4b875b669bf737fbabfe5629 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 10 Jun 2025 10:52:03 -0300 Subject: [PATCH 3/3] Test correct backend in examples test (#597) * Use Float32 in examples with backends that don't support Float64 * Add `backend` argument for `examples_testsuite` * Reduce `TILE_DIM` for compatibility Metal doesn't always support 1024 threads, which causes intermittent errors with 32x32 tiles * Fix histogram implementation The final part of the loop expects every thread to exist, so we cannot skip launching them. Avoid work on extra threads until then. Also use Int32 since some backends lack Int64 atomics, and make one of the tests have weird groupsize since that's when the errors used to pop up. 
(cherry picked from commit dab03b92f1f6515f6f297adc49e1fb318b1c8a01) --- examples/histogram.jl | 39 +++++++++++++++++------------------ examples/memcopy.jl | 4 ++-- examples/memcopy_static.jl | 4 ++-- examples/performant_matmul.jl | 5 ++++- examples/utils.jl | 18 +++++++++------- test/examples.jl | 3 ++- test/testsuite.jl | 2 +- 7 files changed, 41 insertions(+), 34 deletions(-) diff --git a/examples/histogram.jl b/examples/histogram.jl index 958fa0e1d..4cf43abcf 100644 --- a/examples/histogram.jl +++ b/examples/histogram.jl @@ -5,7 +5,7 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # # Function to use as a baseline for CPU metrics function create_histogram(input) - histogram_output = zeros(Int, maximum(input)) + histogram_output = zeros(eltype(input), maximum(input)) for i in input histogram_output[i] += 1 end @@ -13,23 +13,21 @@ function create_histogram(input) end # This a 1D histogram kernel where the histogramming happens on shmem -@kernel function histogram_kernel!(histogram_output, input) - tid = @index(Global, Linear) +@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input) + gid = @index(Group, Linear) lid = @index(Local, Linear) - @uniform warpsize = Int(32) - - @uniform gs = @groupsize()[1] + @uniform gs = prod(@groupsize()) + tid = (gid - 1) * gs + lid @uniform N = length(histogram_output) - shared_histogram = @localmem Int (gs) + shared_histogram = @localmem eltype(input) (gs) # This will go through all input elements and assign them to a location in # shmem. Note that if there is not enough shem, we create different shmem # blocks to write to. 
For example, if shmem is of size 256, but it's # possible to get a value of 312, then we will have 2 separate shmem blocks, # one from 1->256, and another from 256->512 - @uniform max_element = 1 for min_element in 1:gs:N # Setting shared_histogram to 0 @@ -42,7 +40,7 @@ end end # Defining bin on shared memory and writing to it if possible - bin = input[tid] + bin = tid <= length(input) ? input[tid] : 0 if bin >= min_element && bin < max_element bin -= min_element - 1 @atomic shared_histogram[bin] += 1 @@ -58,10 +56,10 @@ end end -function histogram!(histogram_output, input) +function histogram!(histogram_output, input, groupsize = 256) backend = get_backend(histogram_output) # Need static block size - kernel! = histogram_kernel!(backend, (256,)) + kernel! = histogram_kernel!(backend, (groupsize,)) kernel!(histogram_output, input, ndrange = size(input)) return end @@ -74,12 +72,13 @@ function move(backend, input) end @testset "histogram tests" begin - if Base.VERSION < v"1.7.0" && !KernelAbstractions.isgpu(backend) + if !KernelAbstractions.isgpu(backend) @test_skip false else - rand_input = [rand(1:128) for i in 1:1000] - linear_input = [i for i in 1:1024] - all_two = [2 for i in 1:512] + # Use Int32 as some backends don't support 64-bit atomics + rand_input = Int32.(rand(1:128, 1000)) + linear_input = Int32.(1:1024) + all_two = fill(Int32(2), 512) histogram_rand_baseline = create_histogram(rand_input) histogram_linear_baseline = create_histogram(linear_input) @@ -89,14 +88,14 @@ end linear_input = move(backend, linear_input) all_two = move(backend, all_two) - rand_histogram = KernelAbstractions.zeros(backend, Int, 128) - linear_histogram = KernelAbstractions.zeros(backend, Int, 1024) - two_histogram = KernelAbstractions.zeros(backend, Int, 2) + rand_histogram = KernelAbstractions.zeros(backend, eltype(rand_input), Int(maximum(rand_input))) + linear_histogram = KernelAbstractions.zeros(backend, eltype(linear_input), Int(maximum(linear_input))) + two_histogram = 
KernelAbstractions.zeros(backend, eltype(all_two), Int(maximum(all_two))) - histogram!(rand_histogram, rand_input) + histogram!(rand_histogram, rand_input, 6) histogram!(linear_histogram, linear_input) histogram!(two_histogram, all_two) - KernelAbstractions.synchronize(CPU()) + KernelAbstractions.synchronize(backend) @test isapprox(Array(rand_histogram), histogram_rand_baseline) @test isapprox(Array(linear_histogram), histogram_linear_baseline) diff --git a/examples/memcopy.jl b/examples/memcopy.jl index 3159f3670..0fe141041 100644 --- a/examples/memcopy.jl +++ b/examples/memcopy.jl @@ -16,8 +16,8 @@ function mycopy!(A, B) return end -A = KernelAbstractions.zeros(backend, Float64, 128, 128) -B = KernelAbstractions.ones(backend, Float64, 128, 128) +A = KernelAbstractions.zeros(backend, f_type, 128, 128) +B = KernelAbstractions.ones(backend, f_type, 128, 128) mycopy!(A, B) KernelAbstractions.synchronize(backend) @test A == B diff --git a/examples/memcopy_static.jl b/examples/memcopy_static.jl index 9f088294e..7c965c557 100644 --- a/examples/memcopy_static.jl +++ b/examples/memcopy_static.jl @@ -16,8 +16,8 @@ function mycopy_static!(A, B) return end -A = KernelAbstractions.zeros(backend, Float64, 128, 128) -B = KernelAbstractions.ones(backend, Float64, 128, 128) +A = KernelAbstractions.zeros(backend, f_type, 128, 128) +B = KernelAbstractions.ones(backend, f_type, 128, 128) mycopy_static!(A, B) KernelAbstractions.synchronize(backend) @test A == B diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl index dbd5ad75a..afd247d13 100644 --- a/examples/performant_matmul.jl +++ b/examples/performant_matmul.jl @@ -4,7 +4,10 @@ using Test using Random include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend -const TILE_DIM = 32 +# We use a TILE_DIM of 16 as a safe value since while +# most backends support up to 1024 threads per group, +# Metal sometimes supports fewer. 
+const TILE_DIM = 16 @kernel function coalesced_matmul_kernel!( output, @Const(input1), @Const(input2), N, R, M, diff --git a/examples/utils.jl b/examples/utils.jl index 5e93299b1..b1ab6353b 100644 --- a/examples/utils.jl +++ b/examples/utils.jl @@ -1,9 +1,13 @@ # EXCLUDE FROM TESTING -if Base.find_package("CUDA") !== nothing - using CUDA - using CUDA.CUDAKernels - const backend = CUDABackend() - CUDA.allowscalar(false) -else - const backend = CPU() +if !(@isdefined backend) + if Base.find_package("CUDA") !== nothing + using CUDA + using CUDA.CUDAKernels + const backend = CUDABackend() + CUDA.allowscalar(false) + else + const backend = CPU() + end end + +const f_type = KernelAbstractions.supports_float64(backend) ? Float64 : Float32 diff --git a/test/examples.jl b/test/examples.jl index 02374db89..d10f48d65 100644 --- a/test/examples.jl +++ b/test/examples.jl @@ -9,7 +9,7 @@ function find_sources(path::String, sources = String[]) return sources end -function examples_testsuite(backend_str) +function examples_testsuite(backend, backend_str) @testset "examples" begin examples_dir = joinpath(@__DIR__, "..", "examples") examples = find_sources(examples_dir) @@ -21,6 +21,7 @@ function examples_testsuite(backend_str) @testset "$(basename(example))" for example in examples @eval module $(gensym()) backend_str = $backend_str + const backend = ($backend)() include($example) end @test true diff --git a/test/testsuite.jl b/test/testsuite.jl index 29f780272..dd1f4629e 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -89,7 +89,7 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{ end @conditional_testset "Examples" skip_tests begin - examples_testsuite(backend_str) + examples_testsuite(backend, backend_str) end return