implement extrema with mapreduce machinery

N5N3 · N5N3 · commit 099fab8d4943 · 2022-01-04T12:49:54.000+08:00
Update multidimensional.jl

Extend the `minimum`/`maximum` optimization to much shorter inputs

Add an eager `NaN` early exit for `extrema`

Performance optimization and code cleanup
diff --git a/base/multidimensional.jl b/base/multidimensional.jl
@@ -1746,41 +1746,80 @@ of `A`.
     This method requires Julia 1.2 or later.
 """
 extrema(f, A::AbstractArray; dims=:) = _extrema_dims(f, A, dims)
-
-_extrema_dims(f, A::AbstractArray, ::Colon) = _extrema_itr(f, A)
-
-function _extrema_dims(f, A::AbstractArray, dims)
-    sz = size(A)
-    for d in dims
-        sz = setindex(sz, 1, d)
-    end
-    T = promote_op(f, eltype(A))
-    B = Array{Tuple{T,T}}(undef, sz...)
-    return extrema!(f, B, A)
-end
-
-@noinline function extrema!(f, B, A)
-    require_one_based_indexing(B, A)
-    sA = size(A)
-    sB = size(B)
-    for I in CartesianIndices(sB)
-        fAI = f(A[I])
-        B[I] = (fAI, fAI)
-    end
-    Bmax = CartesianIndex(sB)
-    @inbounds @simd for I in CartesianIndices(sA)
-        J = min(Bmax,I)
-        BJ = B[J]
-        fAI = f(A[I])
-        if fAI < BJ[1]
-            B[J] = (fAI, BJ[2])
-        elseif fAI > BJ[2]
-            B[J] = (BJ[1], fAI)
+_extrema_dims(f, A::AbstractArray, dims) = mapreduce(x -> (fx = f(x); (fx, fx)), _extrema_op, A; dims) # each x becomes the pair (f(x), f(x)); pairs are merged by _extrema_op
+extrema!(B, A) = extrema!(identity, B, A) # without f, the elements themselves are compared
+extrema!(f, B, A) = mapreduce!(x -> (fx = f(x); (fx, fx)), _extrema_op, B, A) # same pair mapping and merge op as _extrema_dims, handed to mapreduce! together with B
+_extrema_op((a, b), (c, d)) = min(a, c), max(b, d) # generic merge of two (min, max) pairs
+function _extrema_op(x::NTuple{2,T}, y::NTuple{2,T}) where {T<:IEEEFloat} # IEEEFloat method: selects with ifelse instead of calling min/max
+    (x1, x2), (y1, y2) = x, y
+    z1 = ifelse(isnan(x1)|isnan(y1), x1-y1, ifelse(signbit(x1-y1), x1, y1)) # x1-y1 is NaN when either input is NaN; otherwise its sign bit selects the smaller value (-0.0 before 0.0)
+    z2 = ifelse(isnan(x1)|isnan(y1), x1-y1, ifelse(signbit(x2-y2), y2, x2)) # NaN test reuses the first slots; assumes a NaN pair holds NaN in both slots (TODO confirm)
+    z1, z2
+end
+# Method for other AbstractFloat types: compares instead of subtracting, to avoid allocation for BigFloat
+function _extrema_op(x::NTuple{2,T}, y::NTuple{2,T}) where {T<:AbstractFloat}
+    (x1, x2), (y1, y2) = x, y
+    isnan(x1) && return x # a NaN pair is returned whole; only the first slot is tested
+    isnan(y1) && return y
+    z1 = x1 < y1 || signbit(x1) > signbit(y1) ? x1 : y1 # the signbit comparison orders -0.0 before 0.0
+    z2 = x2 < y2 || signbit(x2) > signbit(y2) ? y2 : x2 # mirrored condition: keeps the larger value
+    z1, z2
+end
+
+function reducedim_init(f, ::typeof(_extrema_op), A::AbstractArray, region) # builds the initial array for a dims-wise _extrema_op reduction
+    ri = reduced_indices(A, region)
+    any(i -> isempty(axes(A, i)), region) && _empty_reduce_error() # an empty axis among `region` is an error
+    A1 = view(A, ri...) # presumably the first slice of A along the reduced dimensions; verify against reduced_indices
+    IT = eltype(A)
+    if missing isa IT # eltype(A) admits `missing`
+        RT = promote_typejoin_union(_return_type(i -> f(i)[1], Tuple{nonmissingtype(IT)})) # inferred type of slot 1 of f's result for non-missing input
+        T = Union{Tuple{RT,RT},Tuple{Missing,Missing}}
+    else
+        RT = promote_typejoin_union(_return_type(i -> f(i)[1], Tuple{IT}))
+        T = Union{Tuple{RT,RT}}
+    end
+    map!(f, reducedim_initarray(A,region,undef,T), A1) # fill the new array by applying f to A1
+end
+
+function mapreduce_impl(f, op::typeof(_extrema_op), # extrema over A[fi:la], with an unrolled path for IEEEFloat
+                        A::AbstractArrayOrBroadcasted, fi::Int, la::Int)
+    @inline elf(i) = @inbounds f(A[i])[1] # first component of f(A[i])
+    Eltype = _return_type(elf, Tuple{Int})
+    Eltype <: IEEEFloat || # anything else is handed to the less specific mapreduce_impl method
+        return invoke(mapreduce_impl,Tuple{Any,Any,typeof(A),Int,Int},f,op,A,fi,la)
+    ini, i = elf(fi), fi
+    v = ini, ini
+    if la - i >= 8
+        @noinline firstnan(temp) = (x=temp[findfirst(isnan,temp)]; ((x, x), fi)) # first NaN as a pair; index fi marks "NaN found"
+        function simd_kernal(::Val{N}, ini, i) where {N} # consumes N elements per iteration, keeping N running minima and maxima
+            vmins = ntuple(Returns(ini), Val(N))
+            vmaxs = vmins
+            index = ntuple(identity, Val(N))
+            for _ in 1:(la-i)÷N
+                temp = map(elf, i .+ index)
+                mapreduce(isnan,|,temp) && return firstnan(temp) # stop at the first chunk that contains a NaN
+                vmins = map(_fast(min), vmins, temp)
+                vmaxs = map(_fast(max), vmaxs, temp)
+                i += N
+            end
+            (reduce(_fast(min), vmins), reduce(_fast(max), vmaxs)), i
         end
+        isnan(ini) && return v # a NaN first element is already the result
+        if la - i < 64
+            v, i = simd_kernal(Val(4), ini, i)
+        elseif la - i < 256
+            v, i = simd_kernal(Val(8), ini, i)
+        else
+            v, i = simd_kernal(Val(64÷sizeof(Eltype)), ini, i)
+        end
+        i == fi && return v # i is still fi only when firstnan returned
     end
-    return B
+    while i < la # fold the remaining elements one at a time
+        v′ = elf(i+=1)
+        v = _extrema_op(v, (v′,v′))
+    end
+    return v
 end
-extrema!(B, A) = extrema!(identity, B, A)
 
 # Show for pairs() with Cartesian indices. Needs to be here rather than show.jl for bootstrap order
 function Base.showarg(io::IO, r::Iterators.Pairs{<:Integer, <:Any, <:Any, T}, toplevel) where T <: Union{AbstractVector, Tuple}
diff --git a/base/reduce.jl b/base/reduce.jl
@@ -605,71 +605,55 @@ julia> prod(1:5; init = 1.0)
 prod(a; kw...) = mapreduce(identity, mul_prod, a; kw...)
 
 ## maximum & minimum
-_fast(::typeof(min),x,y) = min(x,y)
-_fast(::typeof(max),x,y) = max(x,y)
-function _fast(::typeof(max), x::AbstractFloat, y::AbstractFloat)
-    ifelse(isnan(x),
-        x,
-        ifelse(x > y, x, y))
-end
-
-function _fast(::typeof(min),x::AbstractFloat, y::AbstractFloat)
-    ifelse(isnan(x),
-        x,
-        ifelse(x < y, x, y))
-end
 
-isbadzero(::typeof(max), x::AbstractFloat) = (x == zero(x)) & signbit(x)
-isbadzero(::typeof(min), x::AbstractFloat) = (x == zero(x)) & !signbit(x)
-isbadzero(op, x) = false
-isgoodzero(::typeof(max), x) = isbadzero(min, x)
-isgoodzero(::typeof(min), x) = isbadzero(max, x)
-
-function mapreduce_impl(f, op::Union{typeof(max), typeof(min)},
-                        A::AbstractArrayOrBroadcasted, first::Int, last::Int)
-    # 1. This optimization gives different result from general fallback, if the inputs `f.(A)`
-    #    contains both 'missing' and 'Nan'.
-    # 2. For Integer cases, general fallback seems faster.
-    # Based the above reasons, only use this for AbstractFloat cases.
-    Eltype = _return_type(i -> f(A[i]), Tuple{Int})
-    Eltype <: AbstractFloat ||
-        return invoke(mapreduce_impl,Tuple{Any,Any,AbstractArrayOrBroadcasted,Int,Int},f,op,A,first,last)
-    a1 = @inbounds A[first]
-    v1 = mapreduce_first(f, op, a1)
-    v2 = v3 = v4 = v1
-    chunk_len = 256
-    start = first + 1
-    simdstop  = start + chunk_len - 4
-    while simdstop <= last - 3
-        # short circuit in case of NaN or missing
-        v1 == v1 || return v1
-        v2 == v2 || return v2
-        v3 == v3 || return v3
-        v4 == v4 || return v4
-        @inbounds for i in start:4:simdstop
-            v1 = _fast(op, v1, f(A[i+0]))
-            v2 = _fast(op, v2, f(A[i+1]))
-            v3 = _fast(op, v3, f(A[i+2]))
-            v4 = _fast(op, v4, f(A[i+3]))
+# Optimization for min/max reduction
+_fast(op) = (x, y) -> _fast(op, x, y) # one-argument form: returns a two-argument closure, as passed to map/reduce
+_fast(op, x, y) = op(x, y) # default: no special handling, just call op
+
+# Used in the optimized mapreduce_impl for IEEEFloat,
+# where NaN inputs have already been handled
+_fast(::typeof(min), x::T, y::T) where {T<:IEEEFloat} = ifelse(signbit(x-y), x, y) # sign bit of x-y set: x is the smaller (or x is -0.0 and y is 0.0)
+_fast(::typeof(max), x::T, y::T) where {T<:IEEEFloat} = ifelse(signbit(x-y), y, x) # mirrored selection for the larger value
+
+function mapreduce_impl(f, op::Union{typeof(max),typeof(min)}, # min/max over A[fi:la], with an unrolled path for IEEEFloat
+                        A::AbstractArrayOrBroadcasted, fi::Int, la::Int)
+    @inline elf(i) = @inbounds f(A[i])
+    Eltype = _return_type(elf, Tuple{Int})
+    # For Integer input, the general fallback is about 2x faster.
+    # Thus this optimization is limited to IEEEFloat.
+    Eltype <: IEEEFloat ||
+        return invoke(mapreduce_impl,Tuple{Any,Any,typeof(A),Int,Int},f,op,A,fi,la)
+    v, i = elf(fi), fi
+    if la - i >= 8
+        # the first NaN found is always the one returned; index `fi` marks "NaN found"
+        @noinline firstnan(temp) = temp[findfirst(isnan, temp)], fi
+        function simd_kernal(::Val{N}, ini, i) where {N}
+            vs = ntuple(Returns(ini), Val(N)) # N running results, all seeded with the (non-NaN) initial value
+            index = ntuple(identity, Val(N))
+            for _ in 1:(la-i)÷N
+                temp = map(elf, i .+ index)
+                # check the whole chunk for NaN at once; batching the check is faster
+                mapreduce(isnan,|,temp) && return firstnan(temp)
+                # since temp has no NaN, _fast(op) can be used safely
+                vs = map(_fast(op), vs, temp)
+                i += N
+            end
+            reduce(_fast(op), vs), i
         end
-        checkbounds(A, simdstop+3)
-        start += chunk_len
-        simdstop += chunk_len
-    end
-    v = op(op(v1,v2),op(v3,v4))
-    for i in start:last
-        @inbounds ai = A[i]
-        v = op(v, f(ai))
-    end
-
-    # enforce correct order of 0.0 and -0.0
-    # e.g. maximum([0.0, -0.0]) === 0.0
-    # should hold
-    if isbadzero(op, v)
-        for i in first:last
-            x = @inbounds A[i]
-            isgoodzero(op,x) && return x
+        isnan(v) && return v
+        # pick an unroll size suited to the remaining length
+        if la - i < 64
+            v, i = simd_kernal(Val(4), v, i)
+        elseif la - i < 256
+            v, i = simd_kernal(Val(8), v, i)
+        else
+            # 64 bytes of elements per iteration, to fill a cache line
+            v, i = simd_kernal(Val(64÷sizeof(Eltype)), v, i)
         end
+        i == fi && return v # `i` is still `fi` only when `firstnan` returned
+    end
+    while i < la
+        v = op(v, elf(i+=1))
     end
     return v
 end