diff --git a/Project.toml b/Project.toml index cb50b4c33..6fa8a12f6 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "KernelAbstractions" uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c" authors = ["Valentin Churavy and contributors"] -version = "0.9.34" +version = "0.9.35" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" diff --git a/examples/histogram.jl b/examples/histogram.jl index 958fa0e1d..4cf43abcf 100644 --- a/examples/histogram.jl +++ b/examples/histogram.jl @@ -5,7 +5,7 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # # Function to use as a baseline for CPU metrics function create_histogram(input) - histogram_output = zeros(Int, maximum(input)) + histogram_output = zeros(eltype(input), maximum(input)) for i in input histogram_output[i] += 1 end @@ -13,23 +13,21 @@ function create_histogram(input) end # This a 1D histogram kernel where the histogramming happens on shmem -@kernel function histogram_kernel!(histogram_output, input) - tid = @index(Global, Linear) +@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input) + gid = @index(Group, Linear) lid = @index(Local, Linear) - @uniform warpsize = Int(32) - - @uniform gs = @groupsize()[1] + @uniform gs = prod(@groupsize()) + tid = (gid - 1) * gs + lid @uniform N = length(histogram_output) - shared_histogram = @localmem Int (gs) + shared_histogram = @localmem eltype(input) (gs) # This will go through all input elements and assign them to a location in # shmem. Note that if there is not enough shem, we create different shmem # blocks to write to. For example, if shmem is of size 256, but it's # possible to get a value of 312, then we will have 2 separate shmem blocks, # one from 1->256, and another from 256->512 - @uniform max_element = 1 for min_element in 1:gs:N # Setting shared_histogram to 0 @@ -42,7 +40,7 @@ end end # Defining bin on shared memory and writing to it if possible - bin = input[tid] + bin = tid <= length(input) ? input[tid] : 0 if bin >= min_element && bin < max_element bin -= min_element - 1 @atomic shared_histogram[bin] += 1 @@ -58,10 +56,10 @@ end end -function histogram!(histogram_output, input) +function histogram!(histogram_output, input, groupsize = 256) backend = get_backend(histogram_output) # Need static block size - kernel! = histogram_kernel!(backend, (256,)) + kernel! = histogram_kernel!(backend, (groupsize,)) kernel!(histogram_output, input, ndrange = size(input)) return end @@ -74,12 +72,13 @@ function move(backend, input) end @testset "histogram tests" begin - if Base.VERSION < v"1.7.0" && !KernelAbstractions.isgpu(backend) + if !KernelAbstractions.isgpu(backend) @test_skip false else - rand_input = [rand(1:128) for i in 1:1000] - linear_input = [i for i in 1:1024] - all_two = [2 for i in 1:512] + # Use Int32 as some backends don't support 64-bit atomics + rand_input = Int32.(rand(1:128, 1000)) + linear_input = Int32.(1:1024) + all_two = fill(Int32(2), 512) histogram_rand_baseline = create_histogram(rand_input) histogram_linear_baseline = create_histogram(linear_input) @@ -89,14 +88,14 @@ end linear_input = move(backend, linear_input) all_two = move(backend, all_two) - rand_histogram = KernelAbstractions.zeros(backend, Int, 128) - linear_histogram = KernelAbstractions.zeros(backend, Int, 1024) - two_histogram = KernelAbstractions.zeros(backend, Int, 2) + rand_histogram = KernelAbstractions.zeros(backend, eltype(rand_input), Int(maximum(rand_input))) + linear_histogram = KernelAbstractions.zeros(backend, eltype(linear_input), Int(maximum(linear_input))) + two_histogram = KernelAbstractions.zeros(backend, eltype(all_two), Int(maximum(all_two))) - histogram!(rand_histogram, rand_input) + histogram!(rand_histogram, rand_input, 6) histogram!(linear_histogram, linear_input) histogram!(two_histogram, all_two) - KernelAbstractions.synchronize(CPU()) + KernelAbstractions.synchronize(backend) @test isapprox(Array(rand_histogram), histogram_rand_baseline) @test isapprox(Array(linear_histogram), histogram_linear_baseline) diff --git a/examples/memcopy.jl b/examples/memcopy.jl index 3159f3670..0fe141041 100644 --- a/examples/memcopy.jl +++ b/examples/memcopy.jl @@ -16,8 +16,8 @@ function mycopy!(A, B) return end -A = KernelAbstractions.zeros(backend, Float64, 128, 128) -B = KernelAbstractions.ones(backend, Float64, 128, 128) +A = KernelAbstractions.zeros(backend, f_type, 128, 128) +B = KernelAbstractions.ones(backend, f_type, 128, 128) mycopy!(A, B) KernelAbstractions.synchronize(backend) @test A == B diff --git a/examples/memcopy_static.jl b/examples/memcopy_static.jl index 9f088294e..7c965c557 100644 --- a/examples/memcopy_static.jl +++ b/examples/memcopy_static.jl @@ -16,8 +16,8 @@ function mycopy_static!(A, B) return end -A = KernelAbstractions.zeros(backend, Float64, 128, 128) -B = KernelAbstractions.ones(backend, Float64, 128, 128) +A = KernelAbstractions.zeros(backend, f_type, 128, 128) +B = KernelAbstractions.ones(backend, f_type, 128, 128) mycopy_static!(A, B) KernelAbstractions.synchronize(backend) @test A == B diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl index dbd5ad75a..afd247d13 100644 --- a/examples/performant_matmul.jl +++ b/examples/performant_matmul.jl @@ -4,7 +4,10 @@ using Test using Random include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend -const TILE_DIM = 32 +# We use a TILE_DIM of 16 as a safe value since while +# most backends support up to 1024 threads per group, +# Metal sometimes supports fewer. +const TILE_DIM = 16 @kernel function coalesced_matmul_kernel!( output, @Const(input1), @Const(input2), N, R, M, diff --git a/examples/utils.jl b/examples/utils.jl index 5e93299b1..b1ab6353b 100644 --- a/examples/utils.jl +++ b/examples/utils.jl @@ -1,9 +1,13 @@ # EXCLUDE FROM TESTING -if Base.find_package("CUDA") !== nothing - using CUDA - using CUDA.CUDAKernels - const backend = CUDABackend() - CUDA.allowscalar(false) -else - const backend = CPU() +if !(@isdefined backend) + if Base.find_package("CUDA") !== nothing + using CUDA + using CUDA.CUDAKernels + const backend = CUDABackend() + CUDA.allowscalar(false) + else + const backend = CPU() + end end + +const f_type = KernelAbstractions.supports_float64(backend) ? Float64 : Float32 diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl index 5103882ef..73298ca53 100644 --- a/src/KernelAbstractions.jl +++ b/src/KernelAbstractions.jl @@ -516,7 +516,13 @@ Get a [`Backend`](@ref) instance suitable for array `A`. function get_backend end # Should cover SubArray, ReshapedArray, ReinterpretArray, Hermitian, AbstractTriangular, etc.: -get_backend(A::AbstractArray) = get_backend(parent(A)) +function get_backend(A::AbstractArray) + P = parent(A) + if P isa typeof(A) + throw(ArgumentError("Implement `KernelAbstractions.get_backend(::$(typeof(A)))`")) + end + return get_backend(P) +end get_backend(::Array) = CPU() diff --git a/test/examples.jl b/test/examples.jl index 02374db89..d10f48d65 100644 --- a/test/examples.jl +++ b/test/examples.jl @@ -9,7 +9,7 @@ function find_sources(path::String, sources = String[]) return sources end -function examples_testsuite(backend_str) +function examples_testsuite(backend, backend_str) @testset "examples" begin examples_dir = joinpath(@__DIR__, "..", "examples") examples = find_sources(examples_dir) @@ -21,6 +21,7 @@ function examples_testsuite(backend_str) @testset "$(basename(example))" for example in examples @eval module $(gensym()) backend_str = $backend_str + const backend = ($backend)() include($example) end @test true diff --git a/test/test.jl b/test/test.jl index 4e017c8a9..3528a2cc9 100644 --- a/test/test.jl +++ b/test/test.jl @@ -7,6 +7,9 @@ using Adapt identity(x) = x +struct UnknownAbstractVector <: AbstractVector{Float32} # issue #588 +end + function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; skip_tests = Set{String}()) @conditional_testset "partition" skip_tests begin backend = Backend() @@ -80,6 +83,7 @@ function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; sk @test @inferred(KernelAbstractions.get_backend(view(A, 2:4, 1:3))) isa backendT @test @inferred(KernelAbstractions.get_backend(Diagonal(x))) isa backendT @test @inferred(KernelAbstractions.get_backend(Tridiagonal(A))) isa backendT + @test_throws ArgumentError KernelAbstractions.get_backend(UnknownAbstractVector()) # issue #588 end @conditional_testset "sparse" skip_tests begin diff --git a/test/testsuite.jl b/test/testsuite.jl index 29f780272..dd1f4629e 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -89,7 +89,7 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{ end @conditional_testset "Examples" skip_tests begin - examples_testsuite(backend_str) + examples_testsuite(backend, backend_str) end return