@@ -69,7 +69,8 @@ affected by `parity`. See `flex_lt`. `swap` is an array for exchanging values
 and `sums` is an array of Ints used during the merge sort.
 Uses block y index to decide which values to operate on.
 """
-@inline function batch_partition(values, pivot, swap, sums, lo, hi, parity, lt::F1, by::F2) where {F1,F2}
+@inline function batch_partition(values, pivot, swap, sums, lo, hi, parity,
+                                 lt::F1, by::F2) where {F1,F2}
     sync_threads()
     blockIdx_yz = (blockIdx().z - 1i32) * gridDim().y + blockIdx().y
     idx0 = lo + (blockIdx_yz - 1i32) * blockDim().x + threadIdx().x
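Editorial note: the two context lines above linearize the (y, z) block coordinates into one slice index and then offset into the values from `lo`. A host-side sketch of that arithmetic, with the device intrinsics replaced by plain integer arguments and a hypothetical helper name:

```julia
# Host-side sketch; block_y/block_z/grid_y/block_x/thread_x stand in for the
# 1-based blockIdx().y, blockIdx().z, gridDim().y, blockDim().x and threadIdx().x.
function global_index(lo, block_y, block_z, grid_y, block_x, thread_x)
    blockIdx_yz = (block_z - 1) * grid_y + block_y   # linearize the (y, z) block coordinates
    return lo + (blockIdx_yz - 1) * block_x + thread_x
end

global_index(0, 2, 1, 4, 256, 1)   # first thread of block (y = 2, z = 1) handles element 257
```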
@@ -88,7 +89,11 @@ Uses block y index to decide which values to operate on.
     cumsum!(sums)

     @inbounds if idx0 <= hi
-        dest_idx = @inbounds comparison ? blockDim().x - sums[end] + sums[threadIdx().x] : threadIdx().x - sums[threadIdx().x]
+        dest_idx = @inbounds if comparison
+            blockDim().x - sums[end] + sums[threadIdx().x]
+        else
+            threadIdx().x - sums[threadIdx().x]
+        end
         if dest_idx <= length(swap)
             swap[dest_idx] = val
         end
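Editorial note: the `dest_idx` expression is a block-wide scatter driven by a prefix sum over the comparison flags: above-pivot elements are packed to the top of `swap`, the rest to the bottom, each group keeping its relative order. A CPU sketch of the same arithmetic, with a plain `Vector{Bool}` standing in for the per-thread `comparison` flag and its length playing the role of `blockDim().x` (illustrative function name, not part of the source):

```julia
# CPU analogue of the dest_idx computation in batch_partition.
function partition_destinations(comparison::Vector{Bool})
    sums = cumsum(comparison)          # sums[i] = number of above-pivot elements among 1:i
    n = length(comparison)             # plays the role of blockDim().x
    map(1:n) do i
        comparison[i] ? n - sums[end] + sums[i] : i - sums[i]
    end
end

partition_destinations([true, false, true, false])   # => [3, 1, 4, 2]
```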
@@ -211,7 +216,7 @@ Finds the median of `vals` starting after `lo` and going for `blockDim().x`
 elements spaced by `stride`. Performs bitonic sort in shmem, returns middle value.
 Faster than bubble sort, but not as flexible. Does not modify `vals`
 """
-function bitonic_median(vals :: AbstractArray{T}, swap, lo, L, stride, lt::F1, by::F2) where {T,F1,F2}
+function bitonic_median(vals::AbstractArray{T}, swap, lo, L, stride, lt::F1, by::F2) where {T,F1,F2}
     sync_threads()
     bitonic_lt(i1, i2) = @inbounds flex_lt(swap[i1 + 1], swap[i2 + 1], false, lt, by)

@@ -337,7 +342,7 @@ Quicksort recursion condition
 For a full sort, `partial` is nothing so it shouldn't affect whether recursion
 happens.
 """
-function partial_range_overlap(lo, hi, partial :: Nothing)
+function partial_range_overlap(lo, hi, partial::Nothing)
     true
 end

@@ -374,7 +379,8 @@ it's possible that the first pivot will be that value, which could lead to an in
 early end to recursion if we started `stuck` at 0.
 """
 function qsort_kernel(vals::AbstractArray{T,N}, lo, hi, parity, sync::Val{S}, sync_depth,
-                      prev_pivot, lt::F1, by::F2, ::Val{dims}, partial=nothing, stuck=-1) where {T, N, S, F1, F2, dims}
+                      prev_pivot, lt::F1, by::F2, ::Val{dims}, partial=nothing,
+                      stuck=-1) where {T, N, S, F1, F2, dims}
     b_sums = CuDynamicSharedArray(Int, blockDim().x)
     swap = CuDynamicSharedArray(T, blockDim().x, sizeof(b_sums))
     shmem = sizeof(b_sums) + sizeof(swap)
@@ -449,7 +455,7 @@ function qsort_kernel(vals::AbstractArray{T,N}, lo, hi, parity, sync::Val{S}, sy
     return
 end

-function sort_args(args, partial_k :: Nothing)
+function sort_args(args, partial_k::Nothing)
     return args
 end

@@ -578,10 +584,43 @@ end
     end
 end

+@inline function extraneous_block(vals::AbstractArray, dims)::Bool
+    other_linear_index = ((gridDim().z ÷ blockDim().z) * (blockIdx().y - 1)) + blockIdx().z
+    return other_linear_index > length(vals) ÷ size(vals)[dims]
+end
+
+@inline function extraneous_block(vals, dims)::Bool
+    return extraneous_block(vals[1], dims)
+end
+
+# methods are defined for Val{1} because using view has 2x speed penalty for 1D arrays
+@inline function view_along_dims(vals::AbstractArray{T, 1}, dimsval::Val{1}) where T
+    return vals
+end
+
+@inline function view_along_dims(vals::Tuple{AbstractArray{T,1},Any}, dimsval::Val{1}) where T
+    return vals[1], view_along_dims(vals[2], dimsval)
+end
+
+
+@inline function view_along_dims(vals::AbstractArray{T, N}, ::Val{dims}) where {T,N,dims}
+    otherdims = ntuple(i -> i == dims ? 1 : size(vals, i), N)
+    other_linear_index = ((gridDim().z ÷ blockDim().z) * (blockIdx().y - 1)) + blockIdx().z
+    other = CartesianIndices(otherdims)[other_linear_index]
+    # create a view that keeps the sorting dimension but indexes across the others
+    slicedims = map(Base.Slice, axes(vals))
+    idxs = ntuple(i -> i == dims ? slicedims[i] : other[i], N)
+    return view(vals, idxs...)
+end
+
+@inline function view_along_dims(vals, dimsval::Val{dims}) where dims
+    return vals[1], view_along_dims(vals[2], dimsval)
+end
+
+
 # Functions specifically for "large" bitonic steps (those that cannot use shmem)

-@inline function compare!(vals::AbstractArray{T}, i1::I, i2::I, dir::Bool, by, lt,
-                          rev) where {T,I}
+@inline function compare!(vals::AbstractArray{T, N}, i1::I, i2::I, dir::Bool, by, lt, rev) where {T,I,N}
     i1′, i2′ = i1 + one(I), i2 + one(I)
     @inbounds if dir != rev_lt(by(vals[i1′]), by(vals[i2′]), lt, rev)
         vals[i1′], vals[i2′] = vals[i2′], vals[i1′]
@@ -593,7 +632,8 @@
     i1′, i2′ = i1 + one(I), i2 + one(I)
     vals, inds = vals_inds
     # comparing tuples of (value, index) guarantees stability of sort
-    @inbounds if dir != rev_lt((by(vals[inds[i1′]]), inds[i1′]), (by(vals[inds[i2′]]), inds[i2′]), lt, rev)
+    @inbounds if dir != rev_lt((by(vals[inds[i1′]]), inds[i1′]),
+                               (by(vals[inds[i2′]]), inds[i2′]), lt, rev)
         inds[i1′], inds[i2′] = inds[i2′], inds[i1′]
     end
 end
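Editorial note: `view_along_dims` hands each block the 1-D slice it should sort, keeping every index of the sorting dimension and fixing the remaining dimensions from a linear slice index. A host-side sketch of that slicing, where `other_linear_index` is passed explicitly instead of being derived from block indices (the helper name is illustrative, not part of the source):

```julia
# CPU sketch of the slice selection performed by view_along_dims.
function slice_along_dims(vals::AbstractArray{T,N}, dims::Int, other_linear_index::Int) where {T,N}
    otherdims = ntuple(i -> i == dims ? 1 : size(vals, i), N)
    other = CartesianIndices(otherdims)[other_linear_index]
    # keep the sorting dimension, fix every other dimension to this slice's coordinates
    idxs = ntuple(i -> i == dims ? Base.Slice(axes(vals, i)) : other[i], N)
    return view(vals, idxs...)
end

A = reshape(collect(1:12), 3, 4)
slice_along_dims(A, 1, 2)   # the second column: a 3-element view along dims = 1
```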
@@ -645,16 +685,22 @@ Note that to avoid synchronization issues, only one thread from each pair of
 indices being swapped will actually move data.
 """
 function comparator_kernel(vals, length_vals::I, k::I, j::I, by::F1, lt::F2,
-                           rev) where {I,F1,F2}
+                           rev, dimsval::Val{dims}) where {I,F1,F2,dims}
+    if extraneous_block(vals, dims)
+        return nothing
+    end
+
     index = (blockDim().x * (blockIdx().x - one(I))) + threadIdx().x - one(I)

+    slice = view_along_dims(vals, dimsval)
+
     lo, n, dir = get_range(length_vals, index, k, j)

     if !(lo < zero(I) || n < zero(I)) && !(index >= length_vals)
         m = gp2lt(n)
         if lo <= index < lo + n - m
             i1, i2 = index, index + m
-            @inbounds compare!(vals, i1, i2, dir, by, lt, rev)
+            @inbounds compare!(slice, i1, i2, dir, by, lt, rev)
         end
     end
     return
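Editorial note: the early `extraneous_block` return exists because the y/z grid is sized with `ceil(sqrt(...))` (see the launch loop further down), so the last few blocks may not map to any slice. A host-side sketch of that check, mirroring the arithmetic in `extraneous_block` with the device intrinsics replaced by explicit arguments (illustrative name):

```julia
# CPU analogue of extraneous_block: a block whose linearized y/z index exceeds the
# number of non-sorting slices has no work to do and returns immediately.
function is_extraneous_block(nslices, grid_z, block_z, block_y_idx, block_z_idx)
    other_linear_index = (grid_z ÷ block_z) * (block_y_idx - 1) + block_z_idx
    return other_linear_index > nslices
end

is_extraneous_block(1000, 32, 1, 32, 9)   # => true: linear index 1001 on a 32×32 grid with only 1000 slices
```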
@@ -804,15 +850,19 @@ a unique range of indices corresponding to a comparator in the sorting network.
 Note that this moves the array values copied within shmem, but doesn't copy them
 back to global the way it does for indices.
 """
-function comparator_small_kernel(c, length_c::I, k::I, j_0::I, j_f::I,
-                                 by::F1, lt::F2, rev) where {I,F1,F2}
+function comparator_small_kernel(vals, length_vals::I, k::I, j_0::I, j_f::I,
+                                 by::F1, lt::F2, rev, dimsval::Val{dims}) where {I,F1,F2,dims}
+    if extraneous_block(vals, dims)
+        return nothing
+    end
+    slice = view_along_dims(vals, dimsval)
     pseudo_block_idx = (blockIdx().x - one(I)) * blockDim().y + threadIdx().y - one(I)
     # immutable info about the range used by this kernel
-    _lo, _n, dir = block_range(length_c, pseudo_block_idx, k, j_0)
+    _lo, _n, dir = block_range(length_vals, pseudo_block_idx, k, j_0)
     index = _lo + threadIdx().x - one(I)
     in_range = (threadIdx().x <= _n && _lo >= zero(I))

-    swap = initialize_shmem!(c, index, in_range)
+    swap = initialize_shmem!(slice, index, in_range)

     # mutable copies for pseudo-recursion
     lo, n = _lo, _n
@@ -829,7 +879,7 @@ function comparator_small_kernel(c, length_c::I, k::I, j_0::I, j_f::I,
         sync_threads()
     end

-    finalize_shmem!(c, swap, index, in_range)
+    finalize_shmem!(slice, swap, index, in_range)
     return
 end

@@ -849,20 +899,20 @@ of values and an index array for doing `sortperm!`. Cannot provide a stable
 `sort!` although `sortperm!` is properly stable. To reverse, set `rev=true`
 rather than `lt=!isless` (otherwise stability of sortperm breaks down).
 """
-function bitonic_sort!(c; by = identity, lt = isless, rev = false)
-    c_len = if typeof(c) <: Tuple
-        length(c[1])
+function bitonic_sort!(c; by = identity, lt = isless, rev = false, dims = 1)
+    c_len, otherdims_len = if typeof(c) <: Tuple
+        size(c[1])[dims], length(c[1]) ÷ size(c[1])[dims]
     else
-        length(c)
+        size(c)[dims], length(c) ÷ size(c)[dims]
     end

-    # compile kernels (using Int32 for indexing, if possible, yielding a 10% speedup)
+    # compile kernels (using Int32 for indexing, if possible, yielding a 70% speedup)
     I = c_len <= typemax(Int32) ? Int32 : Int
-    args1 = (c, I(c_len), one(I), one(I), one(I), by, lt, Val(rev))
+    args1 = (c, I(c_len), one(I), one(I), one(I), by, lt, Val(rev), Val(dims))
     kernel1 = @cuda launch=false comparator_small_kernel(args1...)
+
     config1 = launch_configuration(kernel1.fun, shmem = threads -> bitonic_shmem(c, threads))
-    threads1 = prevpow(2, config1.threads)
-    args2 = (c, I(c_len), one(I), one(I), by, lt, Val(rev))
+    args2 = (c, I(c_len), one(I), one(I), by, lt, Val(rev), Val(dims))
     kernel2 = @cuda launch=false comparator_kernel(args2...)
     config2 = launch_configuration(kernel2.fun, shmem = threads -> bitonic_shmem(c, threads))
     # blocksize for kernel2 MUST be a power of 2
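Editorial note: with the new `dims` keyword, `c_len` becomes the length of each slice along the sorting dimension and `otherdims_len` counts the independent slices. A hedged usage sketch of that split for a plain dense matrix:

```julia
using CUDA

c = CUDA.rand(Float32, 1024, 16)
dims = 1
c_len = size(c)[dims]                # 1024: length of each slice along the sorting dimension
otherdims_len = length(c) ÷ c_len    # 16: number of independent slices sorted in parallel
```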
@@ -877,26 +927,30 @@ function bitonic_sort!(c; by = identity, lt = isless, rev = false)
     k0 = ceil(Int, log2(c_len))
     for k = k0:-1:1
         j_final = 1 + k0 - k
+
+        # non-sorting dims are put into blocks along grid y/z. Using sqrt minimizes wasted blocks
+        other_block_dims = Int(ceil(sqrt(otherdims_len))), Int(ceil(sqrt(otherdims_len)))
+
         for j = 1:j_final
-            args1 = (c, I.((c_len, k, j, j_final))..., by, lt, Val(rev))
-            args2 = (c, I.((c_len, k, j))..., by, lt, Val(rev))
+            args1 = (c, I.((c_len, k, j, j_final))..., by, lt, Val(rev), Val(dims))
+            args2 = (c, I.((c_len, k, j))..., by, lt, Val(rev), Val(dims))
             if k0 - k - j + 2 <= log_threads
-                # pseudo_block_length = max(nextpow(2, length(comparator)) for all comparators in this layer of the network)
+                # pseudo_block_length = max(nextpow(2, length(comparator))
+                #     for all comparators in this layer of the network)
                 pseudo_block_length = 1 << abs(j_final + 1 - j)
                 # N_pseudo_blocks = how many pseudo-blocks are in this layer of the network
                 N_pseudo_blocks = nextpow(2, c_len) ÷ pseudo_block_length
                 pseudo_blocks_per_block = threads2 ÷ pseudo_block_length

                 # grid dimensions
-                N_blocks = max(1, N_pseudo_blocks ÷ pseudo_blocks_per_block)
+                N_blocks = max(1, N_pseudo_blocks ÷ pseudo_blocks_per_block), other_block_dims...
                 block_size = pseudo_block_length, threads2 ÷ pseudo_block_length
-
                 kernel1(args1...; blocks=N_blocks, threads=block_size,
                         shmem=bitonic_shmem(c, block_size))
                 break
             else
-                N_blocks = cld(c_len, threads1)
-                kernel2(args2...; blocks = N_blocks, threads=threads1)
+                N_blocks = cld(c_len, threads2), other_block_dims...
+                kernel2(args2...; blocks = N_blocks, threads=threads2)
             end
         end
     end
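Editorial note: the non-sorting slices are spread over the grid's y and z dimensions as a roughly square `ceil(sqrt(n)) × ceil(sqrt(n))` tile, which keeps the number of extraneous blocks small. A quick illustration of that sizing (numbers chosen only for illustration):

```julia
# How many y/z blocks the sqrt split produces for 1000 non-sorting slices.
otherdims_len = 1000
other_block_dims = ceil(Int, sqrt(otherdims_len)), ceil(Int, sqrt(otherdims_len))
# => (32, 32): 1024 blocks in the y/z grid; the 24 past slice 1000 return early as extraneous
```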
@@ -930,54 +984,58 @@ function Base.sort!(c::AnyCuVector, alg::QuickSortAlg; lt=isless, by=identity, r
     return c
 end

-function Base.sort!(c::AnyCuVector, alg::BitonicSortAlg; kwargs...)
+function Base.sort!(c::AnyCuArray, alg::BitonicSortAlg; kwargs...)
     return bitonic_sort!(c; kwargs...)
 end

-function Base.sort!(c::AnyCuVector; alg :: SortingAlgorithm = BitonicSort, kwargs...)
+function Base.sort!(c::AnyCuArray; alg::SortingAlgorithm = BitonicSort, kwargs...)
     return sort!(c, alg; kwargs...)
 end

-function Base.sort!(c::AnyCuArray; dims::Integer, lt=isless, by=identity, rev=false)
-    # for multi dim sorting, only quicksort is supported so no alg keyword
-    if rev
-        lt = !lt
-    end
-
-    quicksort!(c; lt, by, dims)
-    return c
-end
-
 function Base.sort(c::AnyCuArray; kwargs...)
     return sort!(copy(c); kwargs...)
 end

-function Base.partialsort!(c::AnyCuVector, k::Union{Integer, OrdinalRange};
-                           lt=isless, by=identity, rev=false)
+function Base.partialsort!(c::AnyCuVector, k::Union{Integer, OrdinalRange},
+                           alg::BitonicSortAlg; lt=isless, by=identity, rev=false)
+
+    sort!(c, alg; lt, by, rev)
+    return @allowscalar copy(c[k])
+end
+
+function Base.partialsort!(c::AnyCuVector, k::Union{Integer, OrdinalRange},
+                           alg::QuickSortAlg; lt=isless, by=identity, rev=false)
     # for reverse sorting, invert the less-than function
     if rev
         lt = !lt
     end

-    function out(k :: OrdinalRange)
+    function out(k::OrdinalRange)
         return copy(c[k])
     end

     # work around disallowed scalar index
-    function out(k :: Integer)
+    function out(k::Integer)
         return Array(c[k:k])[1]
     end

     quicksort!(c; lt, by, dims=1, partial_k=k)
     return out(k)
 end

+function Base.partialsort!(c::AnyCuArray, k::Union{Integer, OrdinalRange};
+                           alg::SortingAlgorithm = BitonicSort, kwargs...)
+    return partialsort!(c, k, alg; kwargs...)
+end
+
 function Base.partialsort(c::AnyCuArray, k::Union{Integer, OrdinalRange}; kwargs...)
     return partialsort!(copy(c), k; kwargs...)
 end

-function Base.sortperm!(ix::AnyCuArray{T}, A::AnyCuArray; initialized=false, kwargs...) where T
-    axes(ix) == axes(A) || throw(ArgumentError("index array must have the same size/axes as the source array, $(axes(ix)) != $(axes(A))"))
+function Base.sortperm!(ix::AnyCuArray, A::AnyCuArray; initialized=false, kwargs...)
+    if axes(ix) != axes(A)
+        throw(ArgumentError("index array must have the same size/axes as the source array, $(axes(ix)) != $(axes(A))"))
+    end

     if !initialized
         ix .= LinearIndices(A)
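Editorial note: taken together, these method changes route any `AnyCuArray` (not just vectors) through `bitonic_sort!` by default and let `partialsort!` select an algorithm via `alg`. A hedged usage sketch of the resulting API, assuming the methods added above:

```julia
using CUDA

v = CUDA.rand(Float32, 10_000)
sort!(v)                                   # BitonicSort is the default algorithm
partialsort!(v, 1:10; alg=BitonicSort)     # dispatches to the BitonicSortAlg method above

A = CUDA.rand(Float32, 512, 64)
sort!(A; dims=2)                           # multi-dimensional sort now routed through bitonic_sort!
```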
@@ -986,6 +1044,11 @@ function Base.sortperm!(ix::AnyCuArray{T}, A::AnyCuArray; initialized=false, kwa
     return ix
 end

-function Base.sortperm(c::AnyCuArray; kwargs...)
+function Base.sortperm(c::AnyCuVector; kwargs...)
     sortperm!(CuArray(1:length(c)), c; initialized=true, kwargs...)
 end
+
+function Base.sortperm(c::AnyCuArray; dims, kwargs...)
+    # Base errors for Matrices without dims arg, we should too
+    sortperm!(reshape(CuArray(1:length(c)), size(c)), c; initialized=true, dims, kwargs...)
+end
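Editorial note: the matching `sortperm` behavior keeps the 1-D fast path for vectors, while higher-dimensional arrays require `dims`, mirroring Base. A hedged usage sketch:

```julia
using CUDA

v = CUDA.rand(Float32, 1000)
sortperm(v)              # permutation of 1:1000

A = CUDA.rand(Float32, 8, 8)
sortperm(A; dims=1)      # array of linear indices ordering each column
# sortperm(A) without dims errors, matching Base's behavior for matrices
```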