@@ -5,31 +5,28 @@ include(joinpath(dirname(pathof(KernelAbstractions)), "../examples/utils.jl")) #
5
5
6
6
# Function to use as a baseline for CPU metrics
7
7
# Serial reference histogram: for every value `i` in `input`, bin `i` of the
# result is incremented, so `input` must contain positive integers that are
# valid 1-based indices. The result has `maximum(input)` bins; `maximum` is
# evaluated unconditionally, so an empty `input` is not handled here.
function create_histogram (input)
8
- histogram_output = zeros (Int , maximum (input))
8
# Added line: bins use the element type of `input` (the removed line used Int).
+ histogram_output = zeros (eltype (input) , maximum (input))
9
9
for i in input
10
10
histogram_output[i] += 1
11
11
end
12
12
return histogram_output
13
13
end
14
14
15
15
# This is a 1D histogram kernel where the histogramming happens on shmem
16
- @kernel function histogram_kernel! (histogram_output, input)
16
+ @kernel unsafe_indices = true function histogram_kernel! (histogram_output, input)
17
17
tid = @index (Global, Linear)
18
18
lid = @index (Local, Linear)
19
19
20
- @uniform warpsize = Int (32 )
21
-
22
- @uniform gs = @groupsize ()[1 ]
20
+ @uniform gs = prod (@groupsize ())
23
21
@uniform N = length (histogram_output)
24
22
25
- shared_histogram = @localmem Int (gs)
23
+ shared_histogram = @localmem eltype (input) (gs)
26
24
27
25
# This will go through all input elements and assign them to a location in
28
26
# shmem. Note that if there is not enough shmem, we create different shmem
29
27
# blocks to write to. For example, if shmem is of size 256, but it's
30
28
# possible to get a value of 312, then we will have 2 separate shmem blocks,
31
29
# one from 1->256, and another from 257->512
32
- @uniform max_element = 1
33
30
for min_element in 1 : gs: N
34
31
35
32
# Setting shared_histogram to 0
42
39
end
43
40
44
41
# Defining bin on shared memory and writing to it if possible
45
- bin = input[tid]
42
+ bin = tid <= length ( input) ? input [tid] : 0
46
43
if bin >= min_element && bin < max_element
47
44
bin -= min_element - 1
48
45
@atomic shared_histogram[bin] += 1
58
55
59
56
end
60
57
61
# Host-side launcher for `histogram_kernel!`.
#   histogram_output : array whose backend (via `get_backend`) selects where the
#                      kernel runs; presumably receives the bin counts (the
#                      kernel's write-back is not visible here, so confirm).
#   input            : values to histogram; the launch uses `ndrange = size(input)`.
#   groupsize        : workgroup size handed to the kernel constructor
#                      (added parameter, default 256, replacing the hard-coded 256).
# Returns `nothing`. No synchronization is performed in this function.
- function histogram! (histogram_output, input)
58
+ function histogram! (histogram_output, input, groupsize = 256 )
62
59
backend = get_backend (histogram_output)
63
60
# Need static block size
64
- kernel! = histogram_kernel! (backend, (256 ,))
61
+ kernel! = histogram_kernel! (backend, (groupsize ,))
65
62
kernel! (histogram_output, input, ndrange = size (input))
66
63
return
67
64
end
@@ -74,9 +71,10 @@ function move(backend, input)
74
71
end
75
72
76
73
@testset " histogram tests" begin
77
- rand_input = [rand (1 : 128 ) for i in 1 : 1000 ]
78
- linear_input = [i for i in 1 : 1024 ]
79
- all_two = [2 for i in 1 : 512 ]
74
+ # Use Int32 as some backends don't support 64-bit atomics
75
+ rand_input = Int32 .(rand (1 : 128 , 1000 ))
76
+ linear_input = Int32 .(1 : 1024 )
77
+ all_two = fill (Int32 (2 ), 512 )
80
78
81
79
histogram_rand_baseline = create_histogram (rand_input)
82
80
histogram_linear_baseline = create_histogram (linear_input)
86
84
linear_input = move (backend, linear_input)
87
85
all_two = move (backend, all_two)
88
86
89
- rand_histogram = KernelAbstractions. zeros (backend, Int, 128 )
90
- linear_histogram = KernelAbstractions. zeros (backend, Int, 1024 )
91
- two_histogram = KernelAbstractions. zeros (backend, Int, 2 )
87
+ rand_histogram = KernelAbstractions. zeros (backend, eltype (rand_input), Int ( maximum (rand_input)) )
88
+ linear_histogram = KernelAbstractions. zeros (backend, eltype (linear_input), Int ( maximum (linear_input)) )
89
+ two_histogram = KernelAbstractions. zeros (backend, eltype (all_two), Int ( maximum (all_two)) )
92
90
93
- histogram! (rand_histogram, rand_input)
91
+ histogram! (rand_histogram, rand_input, 6 )
94
92
histogram! (linear_histogram, linear_input)
95
93
histogram! (two_histogram, all_two)
96
- KernelAbstractions. synchronize (CPU () )
94
+ KernelAbstractions. synchronize (backend )
97
95
98
96
@test isapprox (Array (rand_histogram), histogram_rand_baseline)
99
97
@test isapprox (Array (linear_histogram), histogram_linear_baseline)
0 commit comments