Skip to content

Commit c5fa06a

Browse files
committed
Extend min/maximum optimization to much shorter length
1 parent 07d1f81 commit c5fa06a

File tree

1 file changed

+26
-34
lines changed

1 file changed

+26
-34
lines changed

base/reduce.jl

Lines changed: 26 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -625,49 +625,41 @@ isbadzero(op, x) = false
625625
# A zero is the "good" one for `max` exactly when `isbadzero` flags it for `min`.
function isgoodzero(::typeof(max), x)
    return isbadzero(min, x)
end
626626
# A zero is the "good" one for `min` exactly when `isbadzero` flags it for `max`.
function isgoodzero(::typeof(min), x)
    return isbadzero(max, x)
end
627627

628-
function mapreduce_impl(f, op::Union{typeof(max), typeof(min)},
629-
A::AbstractArrayOrBroadcasted, first::Int, last::Int)
630-
# 1. This optimization gives different result from general fallback, if the inputs `f.(A)`
631-
# contains both 'missing' and 'Nan'.
632-
# 2. For Integer cases, general fallback seems faster.
633-
# Based the above reasons, only use this for AbstractFloat cases.
634-
Eltype = _return_type(i -> f(A[i]), Tuple{Int})
628+
function mapreduce_impl(f, op::Union{typeof(max),typeof(min)},
629+
A::AbstractArrayOrBroadcasted, fi::Int, la::Int)
630+
@inline elf(i) = @inbounds f(A[i])
631+
# 1. If `f.(A)` contains both 'missing' and 'NaN', this might return `NaN`.
632+
# 2. For Integer input, the general fallback is about 2x faster.
633+
# Thus limit this optimization to AbstractFloat.
634+
Eltype = _return_type(elf, Tuple{Int})
635635
Eltype <: AbstractFloat ||
636-
return invoke(mapreduce_impl,Tuple{Any,Any,AbstractArrayOrBroadcasted,Int,Int},f,op,A,first,last)
637-
a1 = @inbounds A[first]
638-
v1 = mapreduce_first(f, op, a1)
639-
v2 = v3 = v4 = v1
640-
chunk_len = 256
641-
start = first + 1
642-
simdstop = start + chunk_len - 4
643-
while simdstop <= last - 3
644-
# short circuit in case of NaN or missing
645-
v1 == v1 || return v1
646-
v2 == v2 || return v2
647-
v3 == v3 || return v3
648-
v4 == v4 || return v4
649-
@inbounds for i in start:4:simdstop
650-
v1 = _fast(op, v1, f(A[i+0]))
651-
v2 = _fast(op, v2, f(A[i+1]))
652-
v3 = _fast(op, v3, f(A[i+2]))
653-
v4 = _fast(op, v4, f(A[i+3]))
636+
return invoke(mapreduce_impl,Tuple{Any,Any,AbstractArrayOrBroadcasted,Int,Int},f,op,A,fi,la)
637+
v1 = v2 = v3 = v4 = elf(fi)
638+
len = (la - fi) >> 2
639+
i = fi
640+
for I in Iterators.partition(1:len, 64)
641+
for _ in I
642+
v1 = _fast(op, v1, elf(i+=1))
643+
v2 = _fast(op, v2, elf(i+=1))
644+
v3 = _fast(op, v3, elf(i+=1))
645+
v4 = _fast(op, v4, elf(i+=1))
654646
end
655-
checkbounds(A, simdstop+3)
656-
start += chunk_len
657-
simdstop += chunk_len
647+
# short circuit in case of NaN
648+
isnan(v1) && return v1
649+
isnan(v2) && return v2
650+
isnan(v3) && return v3
651+
isnan(v4) && return v4
658652
end
659653
v = op(op(v1,v2),op(v3,v4))
660-
for i in start:last
661-
@inbounds ai = A[i]
662-
v = op(v, f(ai))
654+
while i < la
655+
v = op(v, elf(i+=1))
663656
end
664-
665657
# enforce correct order of 0.0 and -0.0
666658
# e.g. maximum([0.0, -0.0]) === 0.0
667659
# should hold
668660
if isbadzero(op, v)
669-
for i in first:last
670-
x = @inbounds A[i]
661+
for i in fi:la
662+
x = elf(i)
671663
isgoodzero(op,x) && return x
672664
end
673665
end

0 commit comments

Comments
 (0)