diff --git a/Project.toml b/Project.toml
index cb50b4c33..6fa8a12f6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "KernelAbstractions"
 uuid = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 authors = ["Valentin Churavy <v.churavy@gmail.com> and contributors"]
-version = "0.9.34"
+version = "0.9.35"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
diff --git a/examples/histogram.jl b/examples/histogram.jl
index 958fa0e1d..4cf43abcf 100644
--- a/examples/histogram.jl
+++ b/examples/histogram.jl
@@ -5,7 +5,7 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) #
 
 # Function to use as a baseline for CPU metrics
 function create_histogram(input)
-    histogram_output = zeros(Int, maximum(input))
+    histogram_output = zeros(eltype(input), maximum(input))
     for i in input
         histogram_output[i] += 1
     end
@@ -13,23 +13,21 @@ function create_histogram(input)
 end
 
 # This a 1D histogram kernel where the histogramming happens on shmem
-@kernel function histogram_kernel!(histogram_output, input)
-    tid = @index(Global, Linear)
+@kernel unsafe_indices = true function histogram_kernel!(histogram_output, input)
+    gid = @index(Group, Linear)
     lid = @index(Local, Linear)
 
-    @uniform warpsize = Int(32)
-
-    @uniform gs = @groupsize()[1]
+    @uniform gs = prod(@groupsize())
+    tid = (gid - 1) * gs + lid
     @uniform N = length(histogram_output)
 
-    shared_histogram = @localmem Int (gs)
+    shared_histogram = @localmem eltype(input) (gs)
 
     # This will go through all input elements and assign them to a location in
     # shmem. Note that if there is not enough shem, we create different shmem
     # blocks to write to. For example, if shmem is of size 256, but it's
     # possible to get a value of 312, then we will have 2 separate shmem blocks,
     # one from 1->256, and another from 256->512
-    @uniform max_element = 1
     for min_element in 1:gs:N
 
         # Setting shared_histogram to 0
@@ -42,7 +40,7 @@ end
         end
 
         # Defining bin on shared memory and writing to it if possible
-        bin = input[tid]
+        bin = tid <= length(input) ? input[tid] : 0
         if bin >= min_element && bin < max_element
             bin -= min_element - 1
             @atomic shared_histogram[bin] += 1
@@ -58,10 +56,10 @@ end
 
 end
 
-function histogram!(histogram_output, input)
+function histogram!(histogram_output, input, groupsize = 256)
     backend = get_backend(histogram_output)
     # Need static block size
-    kernel! = histogram_kernel!(backend, (256,))
+    kernel! = histogram_kernel!(backend, (groupsize,))
     kernel!(histogram_output, input, ndrange = size(input))
     return
 end
@@ -74,12 +72,13 @@ function move(backend, input)
 end
 
 @testset "histogram tests" begin
-    if Base.VERSION < v"1.7.0" && !KernelAbstractions.isgpu(backend)
+    if !KernelAbstractions.isgpu(backend)
         @test_skip false
     else
-        rand_input = [rand(1:128) for i in 1:1000]
-        linear_input = [i for i in 1:1024]
-        all_two = [2 for i in 1:512]
+        # Use Int32 as some backends don't support 64-bit atomics
+        rand_input = Int32.(rand(1:128, 1000))
+        linear_input = Int32.(1:1024)
+        all_two = fill(Int32(2), 512)
 
         histogram_rand_baseline = create_histogram(rand_input)
         histogram_linear_baseline = create_histogram(linear_input)
@@ -89,14 +88,14 @@ end
         linear_input = move(backend, linear_input)
         all_two = move(backend, all_two)
 
-        rand_histogram = KernelAbstractions.zeros(backend, Int, 128)
-        linear_histogram = KernelAbstractions.zeros(backend, Int, 1024)
-        two_histogram = KernelAbstractions.zeros(backend, Int, 2)
+        rand_histogram = KernelAbstractions.zeros(backend, eltype(rand_input), Int(maximum(rand_input)))
+        linear_histogram = KernelAbstractions.zeros(backend, eltype(linear_input), Int(maximum(linear_input)))
+        two_histogram = KernelAbstractions.zeros(backend, eltype(all_two), Int(maximum(all_two)))
 
-        histogram!(rand_histogram, rand_input)
+        histogram!(rand_histogram, rand_input, 6)
         histogram!(linear_histogram, linear_input)
         histogram!(two_histogram, all_two)
-        KernelAbstractions.synchronize(CPU())
+        KernelAbstractions.synchronize(backend)
 
         @test isapprox(Array(rand_histogram), histogram_rand_baseline)
         @test isapprox(Array(linear_histogram), histogram_linear_baseline)
diff --git a/examples/memcopy.jl b/examples/memcopy.jl
index 3159f3670..0fe141041 100644
--- a/examples/memcopy.jl
+++ b/examples/memcopy.jl
@@ -16,8 +16,8 @@ function mycopy!(A, B)
     return
 end
 
-A = KernelAbstractions.zeros(backend, Float64, 128, 128)
-B = KernelAbstractions.ones(backend, Float64, 128, 128)
+A = KernelAbstractions.zeros(backend, f_type, 128, 128)
+B = KernelAbstractions.ones(backend, f_type, 128, 128)
 mycopy!(A, B)
 KernelAbstractions.synchronize(backend)
 @test A == B
diff --git a/examples/memcopy_static.jl b/examples/memcopy_static.jl
index 9f088294e..7c965c557 100644
--- a/examples/memcopy_static.jl
+++ b/examples/memcopy_static.jl
@@ -16,8 +16,8 @@ function mycopy_static!(A, B)
     return
 end
 
-A = KernelAbstractions.zeros(backend, Float64, 128, 128)
-B = KernelAbstractions.ones(backend, Float64, 128, 128)
+A = KernelAbstractions.zeros(backend, f_type, 128, 128)
+B = KernelAbstractions.ones(backend, f_type, 128, 128)
 mycopy_static!(A, B)
 KernelAbstractions.synchronize(backend)
 @test A == B
diff --git a/examples/performant_matmul.jl b/examples/performant_matmul.jl
index dbd5ad75a..afd247d13 100644
--- a/examples/performant_matmul.jl
+++ b/examples/performant_matmul.jl
@@ -4,7 +4,10 @@ using Test
 using Random
 include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) # Load backend
 
-const TILE_DIM = 32
+# We use a TILE_DIM of 16 as a safe value: although
+# most backends support up to 1024 threads per group,
+# Metal sometimes supports fewer.
+const TILE_DIM = 16
 
 @kernel function coalesced_matmul_kernel!(
         output, @Const(input1), @Const(input2), N, R, M,
diff --git a/examples/utils.jl b/examples/utils.jl
index 5e93299b1..b1ab6353b 100644
--- a/examples/utils.jl
+++ b/examples/utils.jl
@@ -1,9 +1,13 @@
 # EXCLUDE FROM TESTING
-if Base.find_package("CUDA") !== nothing
-    using CUDA
-    using CUDA.CUDAKernels
-    const backend = CUDABackend()
-    CUDA.allowscalar(false)
-else
-    const backend = CPU()
+if !(@isdefined backend)
+    if Base.find_package("CUDA") !== nothing
+        using CUDA
+        using CUDA.CUDAKernels
+        const backend = CUDABackend()
+        CUDA.allowscalar(false)
+    else
+        const backend = CPU()
+    end
 end
+
+const f_type = KernelAbstractions.supports_float64(backend) ? Float64 : Float32
diff --git a/src/KernelAbstractions.jl b/src/KernelAbstractions.jl
index 5103882ef..73298ca53 100644
--- a/src/KernelAbstractions.jl
+++ b/src/KernelAbstractions.jl
@@ -516,7 +516,13 @@ Get a [`Backend`](@ref) instance suitable for array `A`.
 function get_backend end
 
 # Should cover SubArray, ReshapedArray, ReinterpretArray, Hermitian, AbstractTriangular, etc.:
-get_backend(A::AbstractArray) = get_backend(parent(A))
+function get_backend(A::AbstractArray)  # fallback: defer to the backend of `parent(A)`
+    P = parent(A)
+    if P isa typeof(A)  # unwrapping made no progress; recursing would dispatch here again
+        throw(ArgumentError("Implement `KernelAbstractions.get_backend(::$(typeof(A)))`"))
+    end
+    return get_backend(P)
+end
 
 get_backend(::Array) = CPU()
 
diff --git a/test/examples.jl b/test/examples.jl
index 02374db89..d10f48d65 100644
--- a/test/examples.jl
+++ b/test/examples.jl
@@ -9,7 +9,7 @@ function find_sources(path::String, sources = String[])
     return sources
 end
 
-function examples_testsuite(backend_str)
+function examples_testsuite(backend, backend_str)
     @testset "examples" begin
         examples_dir = joinpath(@__DIR__, "..", "examples")
         examples = find_sources(examples_dir)
@@ -21,6 +21,7 @@ function examples_testsuite(backend_str)
         @testset "$(basename(example))" for example in examples
             @eval module $(gensym())
             backend_str = $backend_str
+            const backend = ($backend)()
             include($example)
             end
             @test true
diff --git a/test/test.jl b/test/test.jl
index 4e017c8a9..3528a2cc9 100644
--- a/test/test.jl
+++ b/test/test.jl
@@ -7,6 +7,9 @@ using Adapt
 
 identity(x) = x
 
+struct UnknownAbstractVector <: AbstractVector{Float32}  # issue #588: field-less type; get_backend on it must throw
+end
+
 function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; skip_tests = Set{String}())
     @conditional_testset "partition" skip_tests begin
         backend = Backend()
@@ -80,6 +83,7 @@ function unittest_testsuite(Backend, backend_str, backend_mod, BackendArrayT; sk
         @test @inferred(KernelAbstractions.get_backend(view(A, 2:4, 1:3))) isa backendT
         @test @inferred(KernelAbstractions.get_backend(Diagonal(x))) isa backendT
         @test @inferred(KernelAbstractions.get_backend(Tridiagonal(A))) isa backendT
+        @test_throws ArgumentError KernelAbstractions.get_backend(UnknownAbstractVector())  # issue #588
     end
 
     @conditional_testset "sparse" skip_tests begin
diff --git a/test/testsuite.jl b/test/testsuite.jl
index 29f780272..dd1f4629e 100644
--- a/test/testsuite.jl
+++ b/test/testsuite.jl
@@ -89,7 +89,7 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{
     end
 
     @conditional_testset "Examples" skip_tests begin
-        examples_testsuite(backend_str)
+        examples_testsuite(backend, backend_str)
     end
 
     return