From f26031561ab7af7bb3689fe662eeae6453ab4569 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Mon, 20 Dec 2021 22:19:29 -0500 Subject: [PATCH 1/4] Quicksort: copy twice instead of scan-scatter --- src/quicksort.jl | 265 +++++++++++------------------------------------ src/utils.jl | 6 ++ 2 files changed, 66 insertions(+), 205 deletions(-) diff --git a/src/quicksort.jl b/src/quicksort.jl index 88b361cd..20399c01 100644 --- a/src/quicksort.jl +++ b/src/quicksort.jl @@ -23,32 +23,13 @@ function Base.sort!( a = @set a.smallsize = a.basesize end ys = view(v, lo:hi) - _quicksort!( - similar(ys), - ys, - a, - o, - Vector{Int8}(undef, length(ys)), - false, # ys_is_result - true, # mutable_xs - ) + xs = similar(ys) + _quicksort!(ys, xs, a, o) return v end -function _quicksort!( - ys, - xs, - alg, - order, - cs = Vector{Int8}(undef, length(ys)), - ys_is_result = true, - mutable_xs = false, -) - @check length(ys) == length(xs) - if length(ys) <= max(8, alg.basesize) - return _quicksort_serial!(ys, xs, alg, order, cs, ys_is_result, mutable_xs) - end - pivot = _median( +function choose_pivot(xs, order) + return _median( order, ( xs[1], @@ -62,239 +43,113 @@ function _quicksort!( xs[end], ), ) +end + +function _quicksort!(ys, xs, alg, order) + @check length(ys) == length(xs) + if length(ys) <= max(8, alg.basesize) + return _quicksort_serial!(ys, xs, alg, order) + end + pivot = choose_pivot(ys, order) chunksize = alg.basesize # TODO: Calculate extrema during the first pass if it's possible # to use counting sort. - # TODO: When recursing, fuse copying _from_ `ys` to `xs` with the - # first pass. - # Compute sizes of each partition for each chunks. + # (1) `quicksort_partition!` -- partition each chunk in parallel xs_chunk_list = _partition(xs, chunksize) - cs_chunk_list = _partition(cs, chunksize) + ys_chunk_list = _partition(ys, chunksize) nchunks = cld(length(xs), chunksize) nbelows = Vector{Int}(undef, nchunks) - nequals = Vector{Int}(undef, nchunks) naboves = Vector{Int}(undef, nchunks) @DBG begin VERSION >= v"1.4" && - @check length(xs_chunk_list) == length(cs_chunk_list) == nchunks + @check length(xs_chunk_list) == length(ys_chunk_list) == nchunks fill!(nbelows, -1) - fill!(nequals, -1) fill!(naboves, -1) end - @sync for (nb, ne, na, xs_chunk, cs_chunk) in zip( + @sync for (nb, na, xs_chunk, ys_chunk) in zip( referenceable(nbelows), - referenceable(nequals), referenceable(naboves), xs_chunk_list, - cs_chunk_list, + ys_chunk_list, ) - @spawn partition_sizes!(nb, ne, na, xs_chunk, cs_chunk, pivot, order) + @spawn (nb[], na[]) = quicksort_partition!(xs_chunk, ys_chunk, pivot, order) end @DBG begin @check all(>=(0), nbelows) - @check all(>=(0), nequals) @check all(>=(0), naboves) + @check nbelows .+ nbelows == map(length, xs_chunk_list) end below_offsets = nbelows - equal_offsets = nequals above_offsets = naboves acc = exclusive_cumsum!(below_offsets) - acc = exclusive_cumsum!(equal_offsets, acc) acc = exclusive_cumsum!(above_offsets, acc) @check acc == length(xs) - @inline function singleton_chunkid(i) - nb = @inbounds get(below_offsets, i + 1, equal_offsets[1]) - below_offsets[i] - ne = @inbounds get(equal_offsets, i + 1, above_offsets[1]) - equal_offsets[i] - na = @inbounds get(above_offsets, i + 1, length(ys)) - above_offsets[i] - if (nb > 0) + (ne > 0) + (na > 0) == 1 - return 1 * (nb > 0) + 2 * (ne > 0) + 3 * (na > 0) - else - return 0 - end - end - - @sync begin - for (i, (xs_chunk, cs_chunk)) in enumerate(zip(xs_chunk_list, cs_chunk_list)) - singleton_chunkid(i) > 0 && continue - @spawn 
unsafe_quicksort_scatter!( - ys, - xs_chunk, - cs_chunk, - below_offsets[i], - equal_offsets[i], - above_offsets[i], - ) - end - for (i, xs_chunk) in enumerate(xs_chunk_list) - sid = singleton_chunkid(i) - sid > 0 || continue - idx = ( - below_offsets[i]+1:get(below_offsets, i + 1, equal_offsets[1]), - equal_offsets[i]+1:get(equal_offsets, i + 1, above_offsets[1]), - above_offsets[i]+1:get(above_offsets, i + 1, length(ys)), - )[sid] - # There is only one partition. Short-circuit scattering. - ys_chunk = view(ys, idx) - copyto!(ys_chunk, xs_chunk) - # Is it better to multi-thread this? - end + total_nbelows = above_offsets[1] + total_nbelows == 0 && return sort!(ys, alg.smallsort, order) + # TODO: Fallback to parallel mergesort? Scan the array to check degeneracy + # and also to estimate a good pivot? + + # (2) `quicksort_copyback!` -- Copy partitions back to the original + # (destination) array `ys` in the natural order + @sync for (i, (xs_chunk, below_offset, above_offset)) in + enumerate(zip(xs_chunk_list, below_offsets, above_offsets)) + local nb = get(below_offsets, i + 1, total_nbelows) - below_offsets[i] + @spawn quicksort_copyback!(ys, xs_chunk, nb, below_offset, above_offset) end - partitions = (1:equal_offsets[1], above_offsets[1]+1:length(xs)) + # (3) Recursively sort each partion + below = 1:total_nbelows + above = total_nbelows+1:length(xs) @sync begin - for idx in partitions - length(idx) <= alg.smallsize && continue - ys_new = view(ys, idx) - xs_new = view(xs, idx) - cs_new = view(cs, idx) - @spawn let zs - if mutable_xs - zs = xs_new - else - zs = similar(ys_new) - end - _quicksort!(zs, ys_new, alg, order, cs_new, !ys_is_result, true) - end - end - for idx in partitions - length(idx) <= alg.smallsize || continue - if ys_is_result - ys_new = view(ys, idx) - else - ys_new = copyto!(view(xs, idx), view(ys, idx)) - end - sort!(ys_new, alg.smallsort, order) - end - if !ys_is_result - let idx = equal_offsets[1]+1:above_offsets[1] - copyto!(view(xs, idx), view(ys, idx)) - end - end + @spawn _quicksort!(view(ys, above), view(xs, above), alg, order) + _quicksort!(view(ys, below), view(xs, below), alg, order) end - return ys_is_result ? 
ys : xs + return ys end -function _quicksort_serial!( - ys, - xs, - alg, - order, - cs = Vector{Int8}(undef, length(ys)), - ys_is_result = true, - mutable_xs = false, -) +function _quicksort_serial!(ys, xs, alg, order) # @check length(ys) == length(xs) if length(ys) <= max(8, alg.smallsize) - if ys_is_result - zs = copyto!(ys, xs) - else - zs = xs - end - return sort!(zs, alg.smallsort, order) + return sort!(ys, alg.smallsort, order) end - pivot = _median( - order, - ( - xs[1], - xs[end÷8], - xs[end÷4], - xs[3*(end÷8)], - xs[end÷2], - xs[5*(end÷8)], - xs[3*(end÷4)], - xs[7*(end÷8)], - xs[end], - ), - ) + pivot = choose_pivot(ys, order) + + nbelows, naboves = quicksort_partition!(xs, ys, pivot, order) + @DBG @check nbelows + naboves == length(xs) + nbelows == 0 && return sort!(ys, alg.smallsort, order) - (nbelows, nequals) = partition_sizes!(xs, cs, pivot, order) - if nequals == length(xs) - if ys_is_result - copyto!(ys, xs) - return ys - else - return xs - end - end - @assert nequals > 0 below_offset = 0 - equal_offset = nbelows - above_offset = nbelows + nequals - unsafe_quicksort_scatter!(ys, xs, cs, below_offset, equal_offset, above_offset) + above_offset = nbelows + quicksort_copyback!(ys, xs, nbelows, below_offset, above_offset) - below = 1:equal_offset + below = 1:above_offset above = above_offset+1:length(xs) - ya = view(ys, above) - yb = view(ys, below) - ca = view(cs, above) - cb = view(cs, below) - if mutable_xs - _quicksort_serial!(view(xs, above), ya, alg, order, ca, !ys_is_result, true) - _quicksort_serial!(view(xs, below), yb, alg, order, cb, !ys_is_result, true) - else - let zs = similar(ys) - _quicksort_serial!(view(zs, above), ya, alg, order, ca, !ys_is_result, true) - _quicksort_serial!(view(zs, below), yb, alg, order, cb, !ys_is_result, true) - end - end - if !ys_is_result - let idx = equal_offset+1:above_offset - copyto!(view(xs, idx), view(ys, idx)) - end - end - - return ys_is_result ? 
ys : xs -end + _quicksort_serial!(view(xs, below), view(ys, below), alg, order) + _quicksort_serial!(view(xs, above), view(ys, above), alg, order) -function partition_sizes!(nbelows, nequals, naboves, xs, cs, pivot, order) - (nb, ne) = partition_sizes!(xs, cs, pivot, order) - nbelows[] = nb - nequals[] = ne - naboves[] = length(xs) - (nb + ne) - return + return ys end -function partition_sizes!(xs, cs, pivot, order) - nbelows = 0 - nequals = 0 - @inbounds for i in eachindex(xs, cs) - x = xs[i] +function quicksort_partition!(xs, ys, pivot, order) + _foldl((0, 0), Unroll{4}(eachindex(xs, ys))) do (nbelows, naboves), i + @_inline_meta + x = @inbounds ys[i] b = Base.lt(order, x, pivot) - a = Base.lt(order, pivot, x) - cs[i] = ifelse(b, -Int8(1), ifelse(a, Int8(1), Int8(0))) nbelows += Int(b) - nequals += Int(!(a | b)) + naboves += Int(!b) + @inbounds xs[ifelse(b, nbelows, end - naboves + 1)] = x + return (nbelows, naboves) end - return (nbelows, nequals) end -function unsafe_quicksort_scatter!( - ys, - xs_chunk, - cs_chunk, - below_offset, - equal_offset, - above_offset, -) - b = below_offset - e = equal_offset - a = above_offset - _foldl((b, a, e), Unroll{4}(eachindex(xs_chunk, cs_chunk))) do (b, a, e), i - @inbounds x = xs_chunk[i] - @inbounds c = cs_chunk[i] - is_equal = c == 0 - is_above = c > 0 - is_below = c < 0 - e += Int(is_equal) - a += Int(is_above) - b += Int(is_below) - @inbounds ys[ifelse(is_equal, e, ifelse(is_above, a, b))] = x - (b, a, e) +function quicksort_copyback!(ys, xs_chunk, nbelows, below_offset, above_offset) + copyto!(ys, below_offset + 1, xs_chunk, firstindex(xs_chunk), nbelows) + @simd ivdep for i in 1:length(xs_chunk)-nbelows + @inbounds ys[above_offset+i] = xs_chunk[end-i+1] end - return end diff --git a/src/utils.jl b/src/utils.jl index a3ff7412..36863662 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -1,3 +1,9 @@ +@static if VERSION >= v"1.8" + @eval const $(Symbol("@_inline_meta")) = $(Symbol("@inline")) +else + using Base: @_inline_meta +end + function adhoc_partition(xs, n) @check firstindex(xs) == 1 m = cld(length(xs), n) From 4ff7ba5943b8b6ff48e4c3d49226122d2891b61e Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Tue, 21 Dec 2021 19:48:28 -0500 Subject: [PATCH 2/4] Try to refine pivot when partition fails --- src/quicksort.jl | 83 +++++++++++++++++++++++++++++++++++++++++++++--- src/utils.jl | 2 ++ 2 files changed, 80 insertions(+), 5 deletions(-) diff --git a/src/quicksort.jl b/src/quicksort.jl index 20399c01..7b7e6fae 100644 --- a/src/quicksort.jl +++ b/src/quicksort.jl @@ -45,12 +45,16 @@ function choose_pivot(xs, order) ) end -function _quicksort!(ys, xs, alg, order) +function _quicksort!(ys, xs, alg, order, givenpivot = nothing) @check length(ys) == length(xs) if length(ys) <= max(8, alg.basesize) return _quicksort_serial!(ys, xs, alg, order) end - pivot = choose_pivot(ys, order) + pivot = if givenpivot === nothing + choose_pivot(ys, order) + else + something(givenpivot) + end chunksize = alg.basesize # TODO: Calculate extrema during the first pass if it's possible @@ -89,9 +93,12 @@ function _quicksort!(ys, xs, alg, order) @check acc == length(xs) total_nbelows = above_offsets[1] - total_nbelows == 0 && return sort!(ys, alg.smallsort, order) - # TODO: Fallback to parallel mergesort? Scan the array to check degeneracy - # and also to estimate a good pivot? 
+ if total_nbelows == 0 + @assert givenpivot === nothing + betterpivot, ishomogenous = refine_pivot(ys, pivot, alg.basesize, order) + ishomogenous && return ys + return _quicksort!(ys, xs, alg, order, Some(betterpivot)) + end # (2) `quicksort_copyback!` -- Copy partitions back to the original # (destination) array `ys` in the natural order @@ -153,3 +160,69 @@ function quicksort_copyback!(ys, xs_chunk, nbelows, below_offset, above_offset) @inbounds ys[above_offset+i] = xs_chunk[end-i+1] end end + +""" + refine_pivot(ys, badpivot::T, basesize, order) -> (pivot::T, ishomogenous::Bool) + +Iterate over `ys` for refining `badpivot` and checking if all elements in `ys` +are `order`-equal to `badpivot` (i.e., it is impossible to refine `badpivot`). + +Return a value `pivot` in `ys` and a boolean `ishomogenous` indicating if `pivot` +is not `order`-greater than `badpivot`. + +Given the precondition: + + badpivot ∈ ys + all(!(y < badpivot) for y in ys) # i.e., total_nbelows == 0 + +`ishomogenous` implies all elements in `ys` are `order`-equal to `badpivot` and +`pivot` is better than `badpivot` if and only if `!ishomogenous`. +""" +function refine_pivot(ys, badpivot, basesize, order) + chunksize = max(basesize, cld(length(ys), Threads.nthreads())) + nchunks = cld(length(ys), chunksize) + nchunks == 1 && return refine_pivot_serial(ys, badpivot, order) + ishomogenous = Vector{Bool}(undef, nchunks) + pivots = Vector{eltype(ys)}(undef, nchunks) + @sync for (i, ys_chunk) in enumerate(_partition(ys, chunksize)) + @spawn (pivots[i], ishomogenous[i]) = refine_pivot_serial(ys_chunk, badpivot, order) + end + allishomogenous = all(ishomogenous) + allishomogenous && return (badpivot, true) + @DBG for (i, p) in pairs(pivots) + ishomogenous[i] && @check eq(order, p, badpivot) + end + # Find the smallest `pivot` that is not `badpivot`. Assuming that there are + # a lot of `badpivot` entries, this is perhaps better than using the median + # of `pivots`. + i0 = findfirst(!, ishomogenous) + goodpivot = pivots[i0] + for i in i0+1:nchunks + if @inbounds !ishomogenous[i] + p = @inbounds pivots[i] + if Base.lt(order, p, goodpivot) + goodpivot = p + end + end + end + return (goodpivot, false) +end + +function refine_pivot_serial(ys, badpivot, order) + for y in ys + if Base.lt(order, badpivot, y) + return (y, false) + else + # Since `refine_pivot` is called only if `total_nbelows == 0` and + # `y1` is the bad pivot, we have: + @DBG @check !Base.lt(order, y, badpivot) # i.e., y == y1 + end + end + return (badpivot, true) +end +# TODO: online median approximation +# TODO: Check if the homogeneity check can be done in `quicksort_partition!` +# without overall performance degradation? Use it to determine the pivot +# for the next recursion. +# TODO: Do this right after `choose_pivot` if it finds out that all samples are +# equivalent? 
diff --git a/src/utils.jl b/src/utils.jl index 36863662..4d951ba0 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -37,6 +37,8 @@ function elsizeof(::Type{T}) where {T} end end +eq(order, a, b) = !(Base.lt(order, a, b) || Base.lt(order, b, a)) + function _median(order, (a, b, c)::NTuple{3,Any}) # Sort `(a, b, c)`: if Base.lt(order, b, a) From 0caddec5e4a4cd74eddc4c99fcff2419d04a225e Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Tue, 21 Dec 2021 22:02:31 -0500 Subject: [PATCH 3/4] Test `refine_pivot` --- test/test_sort.jl | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/test_sort.jl b/test/test_sort.jl index 6cf1e855..365bb806 100644 --- a/test/test_sort.jl +++ b/test/test_sort.jl @@ -1,8 +1,10 @@ module TestSort +using Base.Order: Forward using Random: shuffle using Test using ThreadsX +using ThreadsX.Implementations: refine_pivot @testset for basesize in 1:8 @testset for alg in [ThreadsX.MergeSort, ThreadsX.QuickSort, ThreadsX.StableQuickSort] @@ -48,6 +50,11 @@ end end end +@testset "refine_pivot" begin + @test refine_pivot([0, 1, 2, 3], 0, 1, Forward) == (1, false) + @test refine_pivot([0, 0, 0, 0], 0, 1, Forward) == (0, true) +end + randnans(n) = reinterpret(Float64, [rand(UInt64) | 0x7ff8000000000000 for i in 1:n]) function randn_with_nans(n, p) From 5dd61432a7445a6e1671ccb93451c267dee234d5 Mon Sep 17 00:00:00 2001 From: Takafumi Arakaki Date: Tue, 21 Dec 2021 22:24:55 -0500 Subject: [PATCH 4/4] Tweak comment style --- src/quicksort.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/quicksort.jl b/src/quicksort.jl index 7b7e6fae..f30ef29d 100644 --- a/src/quicksort.jl +++ b/src/quicksort.jl @@ -60,7 +60,7 @@ function _quicksort!(ys, xs, alg, order, givenpivot = nothing) # TODO: Calculate extrema during the first pass if it's possible # to use counting sort. - # (1) `quicksort_partition!` -- partition each chunk in parallel + # (1) `quicksort_partition!` -- Partition each chunk in parallel. xs_chunk_list = _partition(xs, chunksize) ys_chunk_list = _partition(ys, chunksize) nchunks = cld(length(xs), chunksize) @@ -101,14 +101,14 @@ function _quicksort!(ys, xs, alg, order, givenpivot = nothing) end # (2) `quicksort_copyback!` -- Copy partitions back to the original - # (destination) array `ys` in the natural order + # (destination) array `ys` in the natural order. @sync for (i, (xs_chunk, below_offset, above_offset)) in enumerate(zip(xs_chunk_list, below_offsets, above_offsets)) local nb = get(below_offsets, i + 1, total_nbelows) - below_offsets[i] @spawn quicksort_copyback!(ys, xs_chunk, nb, below_offset, above_offset) end - # (3) Recursively sort each partion + # (3) Recursively sort each partion. below = 1:total_nbelows above = total_nbelows+1:length(xs) @sync begin