|
1 | 1 | # common Base functionality
|
| 2 | +import Base: _RepeatInnerOuter |
2 | 3 |
|
3 |
| -function Base.repeat(a::AbstractGPUVecOrMat, m::Int, n::Int = 1) |
4 |
| - o, p = size(a, 1), size(a, 2) |
5 |
| - b = similar(a, o*m, p*n) |
6 |
| - if length(b) == 0 |
7 |
| - return b |
# Implements `out = repeat(x; inner)` with one thread per element of `out`. When the
# repetition happens along the leading axis (i.e. `inner = (n, ones...)`), writes to
# `out` are contiguous, which can make this variant benchmark faster.
function repeat_inner_dst_kernel!(
    ctx::AbstractKernelContext,
    xs::AbstractArray{<:Any, N},
    inner::NTuple{N, Int},
    out::AbstractArray{<:Any, N}
) where {N}
    # Each destination index maps back onto the source element it repeats:
    # dividing the 0-based destination coordinate by the repeat count `inner[dim]`
    # recovers the 0-based source coordinate in that dimension.
    dst = @cartesianidx out
    src = ntuple(N) do dim
        @inbounds div(dst.I[dim] - 1, inner[dim]) + 1
    end
    @inbounds out[dst] = xs[CartesianIndex(src)]
    return nothing
end
| 24 | + |
# Implements `out = repeat(x; inner)` with one thread per element of `xs`. Reading each
# source value once and fanning it out tends to benchmark faster: fewer read operations
# and no costly division. When repeating along the trailing dimension,
# `inner = (ones..., n)`, both the reads and the writes are contiguous.
function repeat_inner_src_kernel!(
    ctx::AbstractKernelContext,
    xs::AbstractArray{<:Any, N},
    inner::NTuple{N, Int},
    out::AbstractArray{<:Any, N}
) where {N}
    # Fetch the single source element this thread is responsible for.
    src = @cartesianidx xs
    @inbounds v = xs[src]

    # Scatter it to every destination slot inside its `inner`-sized tile.
    for tile in CartesianIndices(inner)
        dst = ntuple(N) do dim
            @inbounds (src[dim] - 1) * inner[dim] + tile[dim]
        end
        @inbounds out[CartesianIndex(dst)] = v
    end
    return nothing
end
|
25 | 49 |
|
26 |
| -function Base.repeat(a::AbstractGPUVector, m::Int) |
27 |
| - o = length(a) |
28 |
| - b = similar(a, o*m) |
29 |
| - if length(b) == 0 |
30 |
| - return b |
# Driver for `repeat(x; inner)`: allocate the result, then dispatch to whichever kernel
# is expected to run faster for this `inner`.
function repeat_inner(xs::AnyGPUArray, inner)
    out = similar(xs, eltype(xs), inner .* size(xs))
    # Consistent with `Base.repeat`: an empty result needs no kernel launch.
    isempty(out) && return out

    # Kernel choice is a pure performance heuristic — both kernels are correct.
    # Parallelizing over the source needs fewer reads (`length(xs)` vs. `length(out)`)
    # and usually wins, but when the largest entry of `inner` sits in the first
    # dimension, parallelizing over the destination avoids strided writes to `out`
    # and benchmarks faster. Better heuristics may exist; this one is good enough.
    # Benchmarks: https://github.com/JuliaGPU/GPUArrays.jl/pull/400#issuecomment-1172641982
    if argmax(inner) == firstindex(inner)
        # One thread per destination element.
        gpu_call(repeat_inner_dst_kernel!, xs, inner, out; elements=length(out))
    else
        # One thread per source element.
        gpu_call(repeat_inner_src_kernel!, xs, inner, out; elements=length(xs))
    end

    return out
end
| 74 | + |
# Implements `out = repeat(x; outer)` with one thread per element of `xs`: each thread
# reads its source value once and writes it into every tiled copy of `xs` in `out`.
function repeat_outer_kernel!(
    ctx::AbstractKernelContext,
    xs::AbstractArray{<:Any, N},
    xssize::NTuple{N},
    outer::NTuple{N},
    out::AbstractArray{<:Any, N}
) where {N}
    # The source element this thread copies.
    src = @cartesianidx xs
    @inbounds v = xs[src]

    # Visit each of the `prod(outer)` tiles; the tile origin in dimension `dim` is
    # offset by a whole multiple of the source extent `xssize[dim]`.
    for tile in CartesianIndices(outer)
        dst = ntuple(N) do dim
            @inbounds src[dim] + xssize[dim] * (tile[dim] - 1)
        end
        @inbounds out[CartesianIndex(dst)] = v
    end

    return nothing
end
| 97 | + |
# Driver for `repeat(x; outer)`: tile `xs` `outer[d]` times along each dimension `d`.
function repeat_outer(xs::AnyGPUArray, outer)
    out = similar(xs, eltype(xs), outer .* size(xs))
    # Consistent with `Base.repeat`: nothing to launch for an empty result.
    isempty(out) && return out
    gpu_call(repeat_outer_kernel!, xs, size(xs), outer, out; elements=length(xs))
    return out
end
| 104 | + |
# Hook into `Base.repeat` through its internal `_RepeatInnerOuter` machinery.
# The combined inner+outer case needs no method of its own: `Base` lowers it to
# `repeat_outer(repeat_inner(arr, inner), outer)`.
_RepeatInnerOuter.repeat_inner(xs::AnyGPUArray{<:Any, N}, dims::NTuple{N}) where {N} =
    repeat_inner(xs, dims)

_RepeatInnerOuter.repeat_outer(xs::AnyGPUArray{<:Any, N}, dims::NTuple{N}) where {N} =
    repeat_outer(xs, dims)

_RepeatInnerOuter.repeat_outer(xs::AnyGPUArray{<:Any, 1}, dims::Tuple{Any}) =
    repeat_outer(xs, dims)

_RepeatInnerOuter.repeat_outer(xs::AnyGPUArray{<:Any, 2}, dims::NTuple{2, Any}) =
    repeat_outer(xs, dims)
|
43 | 123 |
|
44 | 124 | ## PermutedDimsArrays
|
|
0 commit comments