
Commit ec12883

awadell1 and torfjelde authored
Add n-dimensional repeat (#400)
Co-authored-by: Tor Erlend Fjelde <tor.erlend95@gmail.com>
1 parent 2b7dbeb · commit ec12883

File tree

src/host/base.jl
test/testsuite/base.jl

2 files changed: +145, −32 lines

src/host/base.jl

Lines changed: 111 additions & 31 deletions
@@ -1,44 +1,124 @@
 # common Base functionality
+import Base: _RepeatInnerOuter
 
-function Base.repeat(a::AbstractGPUVecOrMat, m::Int, n::Int = 1)
-    o, p = size(a, 1), size(a, 2)
-    b = similar(a, o*m, p*n)
-    if length(b) == 0
-        return b
+# Handle `out = repeat(x; inner)` by parallelizing over the `out` array. This can
+# benchmark faster if repeating elements along the first axis (i.e. `inner=(n, ones...)`),
+# as data access can be contiguous on write.
+function repeat_inner_dst_kernel!(
+    ctx::AbstractKernelContext,
+    xs::AbstractArray{<:Any, N},
+    inner::NTuple{N, Int},
+    out::AbstractArray{<:Any, N}
+) where {N}
+    # Get the "stride" index in each dimension, where the size
+    # of the stride is given by `inner`. The stride-index (sdx) then
+    # corresponds to the index of the repeated value in `xs`.
+    odx = @cartesianidx out
+    dest_inds = odx.I
+    sdx = ntuple(N) do i
+        @inbounds (dest_inds[i] - 1) ÷ inner[i] + 1
     end
-    gpu_call(b, a, o, p, m, n; elements=n) do ctx, b, a, o, p, m, n
-        j = linear_index(ctx)
-        j > n && return
-        d = (j - 1) * p + 1
-        @inbounds for i in 1:m
-            c = (i - 1) * o + 1
-            for r in 1:p
-                for k in 1:o
-                    b[k - 1 + c, r - 1 + d] = a[k, r]
-                end
-            end
+    @inbounds out[odx] = xs[CartesianIndex(sdx)]
+    return nothing
+end
+
+# Handle `out = repeat(x; inner)` by parallelizing over the `xs` array. This tends to
+# benchmark faster by having fewer read operations and avoiding the costly division
+# operation. Additionally, when repeating over the trailing dimension, `inner=(ones..., n)`,
+# data access can be contiguous during both the read and write operations.
+function repeat_inner_src_kernel!(
+    ctx::AbstractKernelContext,
+    xs::AbstractArray{<:Any, N},
+    inner::NTuple{N, Int},
+    out::AbstractArray{<:Any, N}
+) where {N}
+    # Get single element from src
+    idx = @cartesianidx xs
+    @inbounds val = xs[idx]
+
+    # Loop over "repeat" indices of inner
+    for rdx in CartesianIndices(inner)
+        # Get destination CartesianIndex
+        odx = ntuple(N) do i
+            @inbounds (idx[i] - 1) * inner[i] + rdx[i]
         end
-        return
+        @inbounds out[CartesianIndex(odx)] = val
     end
-    return b
+    return nothing
 end
 
-function Base.repeat(a::AbstractGPUVector, m::Int)
-    o = length(a)
-    b = similar(a, o*m)
-    if length(b) == 0
-        return b
+function repeat_inner(xs::AnyGPUArray, inner)
+    out = similar(xs, eltype(xs), inner .* size(xs))
+    any(==(0), size(out)) && return out # consistent with `Base.repeat`
+
+    # Pick which kernel to launch based on `inner`, using the heuristic that if the largest
+    # entry in `inner` is `inner[1]`, then we should parallelize over `out`. Otherwise, we
+    # should parallelize over `xs`. This choice is purely for performance. Better heuristics
+    # may exist, but hopefully, this is good enough.
+    #
+    # Using `repeat_inner_src_kernel!` requires fewer read ops (`prod(size(xs))` vs.
+    # `prod(size(out))`) and generally benchmarks faster than `repeat_inner_dst_kernel!`.
+    # However, for `inner = (n, 1, 1)`, `repeat_inner_dst_kernel!` benchmarked faster as it
+    # avoids strided memory access over `out`.
+    # See https://github.com/JuliaGPU/GPUArrays.jl/pull/400#issuecomment-1172641982 for the
+    # relevant benchmarks.
+    if argmax(inner) == firstindex(inner)
+        # Parallelize over the destination array
+        gpu_call(repeat_inner_dst_kernel!, xs, inner, out; elements=prod(size(out)))
+    else
+        # Parallelize over the source array
+        gpu_call(repeat_inner_src_kernel!, xs, inner, out; elements=prod(size(xs)))
     end
-    gpu_call(b, a, o, m; elements=m) do ctx, b, a, o, m
-        i = linear_index(ctx)
-        i > m && return
-        c = (i - 1)*o + 1
-        @inbounds for i in 1:o
-            b[c + i - 1] = a[i]
+    return out
+end
+
+function repeat_outer_kernel!(
+    ctx::AbstractKernelContext,
+    xs::AbstractArray{<:Any, N},
+    xssize::NTuple{N},
+    outer::NTuple{N},
+    out::AbstractArray{<:Any, N}
+) where {N}
+    # Get index to input element
+    idx = @cartesianidx xs
+    @inbounds val = xs[idx]
+
+    # Loop over repeat indices, copying val to out
+    for rdx in CartesianIndices(outer)
+        # Get destination CartesianIndex
+        odx = ntuple(N) do i
+            @inbounds idx[i] + xssize[i] * (rdx[i] - 1)
         end
-        return
+        @inbounds out[CartesianIndex(odx)] = val
     end
-    return b
+
+    return nothing
+end
+
+function repeat_outer(xs::AnyGPUArray, outer)
+    out = similar(xs, eltype(xs), outer .* size(xs))
+    any(==(0), size(out)) && return out # consistent with `Base.repeat`
+    gpu_call(repeat_outer_kernel!, xs, size(xs), outer, out; elements=length(xs))
+    return out
+end
+
+# Overload methods used by `Base.repeat`.
+# No need to implement `repeat_inner_outer` since this is implemented in `Base` as
+# `repeat_outer(repeat_inner(arr, inner), outer)`.
+function _RepeatInnerOuter.repeat_inner(xs::AnyGPUArray{<:Any, N}, dims::NTuple{N}) where {N}
+    return repeat_inner(xs, dims)
+end
+
+function _RepeatInnerOuter.repeat_outer(xs::AnyGPUArray{<:Any, N}, dims::NTuple{N}) where {N}
+    return repeat_outer(xs, dims)
+end
+
+function _RepeatInnerOuter.repeat_outer(xs::AnyGPUArray{<:Any, 1}, dims::Tuple{Any})
+    return repeat_outer(xs, dims)
+end
+
+function _RepeatInnerOuter.repeat_outer(xs::AnyGPUArray{<:Any, 2}, dims::NTuple{2, Any})
+    return repeat_outer(xs, dims)
 end
 
 ## PermutedDimsArrays
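Note: the two inner kernels above are inverse index maps. `repeat_inner_src_kernel!` scatters each source element to `(idx - 1) .* inner .+ rdx` for every offset `rdx`, while `repeat_inner_dst_kernel!` gathers by inverting that map with an integer division. A minimal CPU sketch of the arithmetic, checked against `Base.repeat` (the helper names are illustrative, not part of the commit):

    using Test

    # src-kernel map: source index + repeat offset -> destination index
    inner_src_index(idx, rdx, inner) = CartesianIndex((idx.I .- 1) .* inner .+ rdx.I)
    # dst-kernel map: destination index -> source index (integer division)
    inner_dst_index(odx, inner) = CartesianIndex((odx.I .- 1) .÷ inner .+ 1)

    x = reshape(collect(1:6), 2, 3)
    inner = (2, 3)
    out = repeat(x; inner=inner)

    # Gather: every destination element comes from the expected source element.
    @test all(out[o] == x[inner_dst_index(o, inner)] for o in CartesianIndices(out))
    # Scatter: every (source, offset) pair writes the expected destination.
    @test all(out[inner_src_index(i, r, inner)] == x[i]
              for i in CartesianIndices(x), r in CartesianIndices(inner))

The launch heuristic in `repeat_inner` then just asks where the largest inner factor sits: `argmax((4, 1, 1)) == 1` picks the gathering dst kernel (contiguous writes along the first axis), while `argmax((1, 1, 4)) == 3` falls through to the scattering src kernel (contiguous reads and writes along the trailing axis).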

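The same style of check applies to `repeat_outer_kernel!`, which tiles whole copies of `xs` rather than stretching individual elements: each source element lands at `idx .+ size(xs) .* (rdx .- 1)` for every tile offset `rdx`. A CPU sketch under the same caveat (illustrative helper, verified against `Base.repeat`):

    using Test

    # outer-kernel map: source index + tile offset -> destination index
    outer_index(idx, rdx, xssize) = CartesianIndex(idx.I .+ xssize .* (rdx.I .- 1))

    x = reshape(collect(1:6), 2, 3)
    outer = (3, 2)
    out = repeat(x; outer=outer)

    @test all(out[outer_index(i, r, size(x))] == x[i]
              for i in CartesianIndices(x), r in CartesianIndices(outer))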
test/testsuite/base.jl

Lines changed: 34 additions & 1 deletion
@@ -33,7 +33,7 @@ end
     )
     dst = AT(dst)
     src = AT(src)
-
+
     copy!(dst, src)
     @test dst == src
 end
@@ -211,6 +211,39 @@ end
     @test compare(a-> repeat(a, 0), AT, rand(Float32, 10))
     @test compare(a-> repeat(a, 0), AT, rand(Float32, 5, 4))
     @test compare(a-> repeat(a, 4, 0), AT, rand(Float32, 10, 15))
+
+    # Test inputs.
+    x = rand(Float32, 10)
+    xmat = rand(Float32, 2, 10)
+    xarr = rand(Float32, 3, 2, 10)
+
+    # Note: when testing `repeat(x; inner)` we need to hit both `repeat_inner_src_kernel!`
+    # and `repeat_inner_dst_kernel!` to get full coverage.
+
+    # Inner.
+    @test compare(a -> repeat(a, inner=(2, )), AT, x)
+    @test compare(a -> repeat(a, inner=(2, 3)), AT, xmat)
+    @test compare(a -> repeat(a, inner=(3, 2)), AT, xmat)
+    @test compare(a -> repeat(a, inner=(2, 3, 4)), AT, xarr)
+    @test compare(a -> repeat(a, inner=(4, 3, 2)), AT, xarr)
+    # Outer.
+    @test compare(a -> repeat(a, outer=(2, )), AT, x)
+    @test compare(a -> repeat(a, outer=(2, 3)), AT, xmat)
+    @test compare(a -> repeat(a, outer=(2, 3, 4)), AT, xarr)
+    # Both.
+    @test compare(a -> repeat(a, inner=(2, ), outer=(2, )), AT, x)
+    @test compare(a -> repeat(a, inner=(2, 3), outer=(2, 3)), AT, xmat)
+    @test compare(a -> repeat(a, inner=(2, 3, 4), outer=(2, 1, 4)), AT, xarr)
+    # Repeat which expands dimensionality.
+    @test compare(a -> repeat(a, inner=(2, 1, 3)), AT, x)
+    @test compare(a -> repeat(a, inner=(3, 1, 2)), AT, x)
+    @test compare(a -> repeat(a, outer=(2, 1, 3)), AT, x)
+    @test compare(a -> repeat(a, inner=(2, 1, 3), outer=(2, 2, 3)), AT, x)
+
+    @test compare(a -> repeat(a, inner=(2, 1, 3)), AT, xmat)
+    @test compare(a -> repeat(a, inner=(3, 1, 2)), AT, xmat)
+    @test compare(a -> repeat(a, outer=(2, 1, 3)), AT, xmat)
+    @test compare(a -> repeat(a, inner=(2, 1, 3), outer=(2, 2, 3)), AT, xmat)
 end
 
 @testset "permutedims" begin
