diff --git a/examples/histogram.jl b/examples/histogram.jl index 3e91c29d4..9edc02930 100644 --- a/examples/histogram.jl +++ b/examples/histogram.jl @@ -5,7 +5,7 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # # Function to use as a baseline for CPU metrics function create_histogram(input) - histogram_output = zeros(Int, maximum(input)) + histogram_output = zeros(eltype(input), maximum(input)) for i in input histogram_output[i] += 1 end @@ -13,23 +13,21 @@ function create_histogram(input) end # This a 1D histogram kernel where the histogramming happens on shmem -@kernel function histogram_kernel!(histogram_output, input) - tid = @index(Global, Linear) +@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input) + gid = @index(Group, Linear) lid = @index(Local, Linear) - @uniform warpsize = Int(32) - - @uniform gs = @groupsize()[1] + @uniform gs = prod(@groupsize()) + tid = (gid - 1) * gs + lid @uniform N = length(histogram_output) - shared_histogram = @localmem Int (gs) + shared_histogram = @localmem eltype(input) (gs) # This will go through all input elements and assign them to a location in # shmem. Note that if there is not enough shem, we create different shmem # blocks to write to. For example, if shmem is of size 256, but it's # possible to get a value of 312, then we will have 2 separate shmem blocks, # one from 1->256, and another from 256->512 - @uniform max_element = 1 for min_element in 1:gs:N # Setting shared_histogram to 0 @@ -42,7 +40,7 @@ end end # Defining bin on shared memory and writing to it if possible - bin = input[tid] + bin = tid <= length(input) ? input[tid] : 0 if bin >= min_element && bin < max_element bin -= min_element - 1 @atomic shared_histogram[bin] += 1 @@ -58,10 +56,10 @@ end end -function histogram!(histogram_output, input) +function histogram!(histogram_output, input, groupsize = 256) backend = get_backend(histogram_output) # Need static block size - kernel! = histogram_kernel!(backend, (256,)) + kernel! = histogram_kernel!(backend, (groupsize,)) kernel!(histogram_output, input, ndrange = size(input)) return end @@ -74,9 +72,10 @@ function move(backend, input) end @testset "histogram tests" begin - rand_input = [rand(1:128) for i in 1:1000] - linear_input = [i for i in 1:1024] - all_two = [2 for i in 1:512] + # Use Int32 as some backends don't support 64-bit atomics + rand_input = Int32.(rand(1:128, 1000)) + linear_input = Int32.(1:1024) + all_two = fill(Int32(2), 512) histogram_rand_baseline = create_histogram(rand_input) histogram_linear_baseline = create_histogram(linear_input) @@ -86,14 +85,14 @@ end linear_input = move(backend, linear_input) all_two = move(backend, all_two) - rand_histogram = KernelAbstractions.zeros(backend, Int, 128) - linear_histogram = KernelAbstractions.zeros(backend, Int, 1024) - two_histogram = KernelAbstractions.zeros(backend, Int, 2) + rand_histogram = KernelAbstractions.zeros(backend, eltype(rand_input), Int(maximum(rand_input))) + linear_histogram = KernelAbstractions.zeros(backend, eltype(linear_input), Int(maximum(linear_input))) + two_histogram = KernelAbstractions.zeros(backend, eltype(all_two), Int(maximum(all_two))) - histogram!(rand_histogram, rand_input) + histogram!(rand_histogram, rand_input, 6) histogram!(linear_histogram, linear_input) histogram!(two_histogram, all_two) - KernelAbstractions.synchronize(CPU()) + KernelAbstractions.synchronize(backend) @test isapprox(Array(rand_histogram), histogram_rand_baseline) @test isapprox(Array(linear_histogram), histogram_linear_baseline) diff --git a/examples/memcopy.jl b/examples/memcopy.jl index 3159f3670..0fe141041 100644 --- a/examples/memcopy.jl +++ b/examples/memcopy.jl @@ -16,8 +16,8 @@ function mycopy!(A, B) return end -A = KernelAbstractions.zeros(backend, Float64, 128, 128) -B = KernelAbstractions.ones(backend, Float64, 128, 128) +A = KernelAbstractions.zeros(backend, f_type, 128, 128) +B = KernelAbstractions.ones(backend, f_type, 128, 128) mycopy!(A, B) KernelAbstractions.synchronize(backend) @test A == B diff --git a/examples/memcopy_static.jl b/examples/memcopy_static.jl index 9f088294e..7c965c557 100644 --- a/examples/memcopy_static.jl +++ b/examples/memcopy_static.jl @@ -16,8 +16,8 @@ function mycopy_static!(A, B) return end -A = KernelAbstractions.zeros(backend, Float64, 128, 128) -B = KernelAbstractions.ones(backend, Float64, 128, 128) +A = KernelAbstractions.zeros(backend, f_type, 128, 128) +B = KernelAbstractions.ones(backend, f_type, 128, 128) mycopy_static!(A, B) KernelAbstractions.synchronize(backend) @test A == B diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl index dbd5ad75a..afd247d13 100644 --- a/examples/performant_matmul.jl +++ b/examples/performant_matmul.jl @@ -4,7 +4,10 @@ using Test using Random include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend -const TILE_DIM = 32 +# We use a TILE_DIM of 16 as a safe value since while +# most backends support up to 1024 threads per group, +# Metal sometimes supports fewer. +const TILE_DIM = 16 @kernel function coalesced_matmul_kernel!( output, @Const(input1), @Const(input2), N, R, M, diff --git a/examples/utils.jl b/examples/utils.jl index 5e93299b1..b1ab6353b 100644 --- a/examples/utils.jl +++ b/examples/utils.jl @@ -1,9 +1,13 @@ # EXCLUDE FROM TESTING -if Base.find_package("CUDA") !== nothing - using CUDA - using CUDA.CUDAKernels - const backend = CUDABackend() - CUDA.allowscalar(false) -else - const backend = CPU() +if !(@isdefined backend) + if Base.find_package("CUDA") !== nothing + using CUDA + using CUDA.CUDAKernels + const backend = CUDABackend() + CUDA.allowscalar(false) + else + const backend = CPU() + end end + +const f_type = KernelAbstractions.supports_float64(backend) ? Float64 : Float32 diff --git a/test/examples.jl b/test/examples.jl index 02374db89..d10f48d65 100644 --- a/test/examples.jl +++ b/test/examples.jl @@ -9,7 +9,7 @@ function find_sources(path::String, sources = String[]) return sources end -function examples_testsuite(backend_str) +function examples_testsuite(backend, backend_str) @testset "examples" begin examples_dir = joinpath(@__DIR__, "..", "examples") examples = find_sources(examples_dir) @@ -21,6 +21,7 @@ function examples_testsuite(backend_str) @testset "$(basename(example))" for example in examples @eval module $(gensym()) backend_str = $backend_str + const backend = ($backend)() include($example) end @test true diff --git a/test/testsuite.jl b/test/testsuite.jl index f9f96fcf0..2418db998 100644 --- a/test/testsuite.jl +++ b/test/testsuite.jl @@ -84,7 +84,7 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{ end @conditional_testset "Examples" skip_tests begin - examples_testsuite(backend_str) + examples_testsuite(backend, backend_str) end return