Skip to content

Commit f9d86d9

Browse files
committed
adding simple histogram example
1 parent e28f9b4 commit f9d86d9

File tree

1 file changed

+116
-0
lines changed

1 file changed

+116
-0
lines changed

examples/histogram.jl

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
using Test
2+
using CUDA
3+
using CUDAKernels
4+
using KernelAbstractions
5+
6+
# Function to use as a baseline for CPU metrics
"""
    create_histogram(input)

Serial CPU baseline: return a `Vector{Int}` of length `maximum(input)` whose
`k`-th entry counts how many times the value `k` occurs in `input`. All
elements of `input` are assumed to be positive integers (they are used as
1-based bin indices). Returns an empty `Int[]` for empty input.
"""
function create_histogram(input)
    # Guard the empty case: `maximum` would throw on an empty collection.
    isempty(input) && return Int[]
    histogram_output = zeros(Int, maximum(input))
    # Iterate elements directly instead of `1:length(input)` indexing.
    for value in input
        histogram_output[value] += 1
    end
    return histogram_output
end
14+
15+
# This is a 1D histogram kernel where the histogramming happens on shmem.
#
# Each workgroup accumulates a partial histogram in shared (local) memory and
# then atomically merges it into the global `histogram_output`. Shared memory
# only holds `gs = groupsize` bins at a time, so the output range is processed
# in tiles of `gs` bins. For example, if shmem is of size 256, but it's
# possible to get a value of 312, then we will have 2 separate shmem blocks,
# one from 1->256, and another from 257->512.
@kernel function histogram_kernel!(histogram_output, input)
    tid = @index(Global, Linear)   # this work-item's element of `input`
    lid = @index(Local, Linear)    # 1-based index within the workgroup

    @uniform gs = @groupsize()[1]          # bins per shared-memory tile
    @uniform N = length(histogram_output)  # total number of bins

    shared_histogram = @localmem Int (gs)

    # Walk the bin range in tiles of width `gs`. `max_element` is declared
    # @uniform so the per-tile bound computed below is uniform across lanes.
    @uniform max_element = 1
    for min_element = 1:gs:N

        # Setting shared_histogram to 0 before counting into this tile
        @inbounds shared_histogram[lid] = 0
        @synchronize()

        # Exclusive upper bound of the current tile, clamped to N+1.
        max_element = min_element + gs
        if max_element > N
            max_element = N+1
        end

        # If this work-item's value falls in the current tile, count it in
        # shared memory (atomically: several lanes may target the same bin).
        bin = input[tid]
        if bin >= min_element && bin < max_element
            bin -= min_element-1    # rebase to 1..gs for the shared tile
            atomic_add!(pointer(shared_histogram, bin), Int(1))
        end

        @synchronize()

        # Each lane flushes one shared bin of this tile to global memory.
        if ((lid+min_element-1) <= N)
            atomic_add!(pointer(histogram_output, lid+min_element-1),
                        shared_histogram[lid])
        end

    end

end
61+
62+
"""
    histogram!(histogram_output, input; numcores = 4, numthreads = 256)

Launch `histogram_kernel!` on the device matching `input` — `CPU` for a plain
`Array` (workgroup size `numcores`), `CUDADevice` otherwise (workgroup size
`numthreads`) — and return the launch's event so callers can `wait` on it.
"""
function histogram!(histogram_output, input;
                    numcores = 4, numthreads = 256)
    kernel! = isa(input, Array) ?
        histogram_kernel!(CPU(), numcores) :
        histogram_kernel!(CUDADevice(), numthreads)
    return kernel!(histogram_output, input, ndrange=size(input))
end
73+
74+
# End-to-end checks: the kernel-based histogram! must reproduce the serial
# create_histogram baseline on random, strictly-increasing, and degenerate
# (single-bin) inputs, on the CPU and — when available — on a CUDA GPU.
@testset "histogram tests" begin

    rand_input = [rand(1:128) for i = 1:1000]
    linear_input = [i for i = 1:1024]
    all_2 = [2 for i = 1:512]

    histogram_rand_baseline = create_histogram(rand_input)
    histogram_linear_baseline = create_histogram(linear_input)
    histogram_2_baseline = create_histogram(all_2)

    CPU_rand_histogram = zeros(Int, 128)
    CPU_linear_histogram = zeros(Int, 1024)
    CPU_2_histogram = zeros(Int, 2)

    # histogram! returns an event; wait for the async kernel to finish.
    wait(histogram!(CPU_rand_histogram, rand_input))
    wait(histogram!(CPU_linear_histogram, linear_input))
    wait(histogram!(CPU_2_histogram, all_2))

    # Histograms are integer counts: compare exactly with ==. isapprox is
    # only warranted for floating-point results.
    @test CPU_rand_histogram == histogram_rand_baseline
    @test CPU_linear_histogram == histogram_linear_baseline
    @test CPU_2_histogram == histogram_2_baseline

    if has_cuda_gpu()
        # Error out (rather than silently crawl) on scalar GPU indexing.
        CUDA.allowscalar(false)

        GPU_rand_input = CuArray(rand_input)
        GPU_linear_input = CuArray(linear_input)
        GPU_2_input = CuArray(all_2)

        GPU_rand_histogram = CuArray(zeros(Int, 128))
        GPU_linear_histogram = CuArray(zeros(Int, 1024))
        GPU_2_histogram = CuArray(zeros(Int, 2))

        wait(histogram!(GPU_rand_histogram, GPU_rand_input))
        wait(histogram!(GPU_linear_histogram, GPU_linear_input))
        wait(histogram!(GPU_2_histogram, GPU_2_input))

        # Copy device results to the host, then compare exactly.
        @test Array(GPU_rand_histogram) == histogram_rand_baseline
        @test Array(GPU_linear_histogram) == histogram_linear_baseline
        @test Array(GPU_2_histogram) == histogram_2_baseline
    end

end

0 commit comments

Comments
 (0)