@@ -625,49 +625,41 @@ isbadzero(op, x) = false
625
625
# A zero is "good" for `max` exactly when it is "bad" for `min`:
# this method simply delegates to `isbadzero(min, x)`.
isgoodzero(::typeof(max), x) = isbadzero(min, x)
626
626
# A zero is "good" for `min` exactly when it is "bad" for `max`:
# this method simply delegates to `isbadzero(max, x)`.
isgoodzero(::typeof(min), x) = isbadzero(max, x)
627
627
628
- function mapreduce_impl (f, op:: Union{typeof(max), typeof(min)} ,
629
- A:: AbstractArrayOrBroadcasted , first :: Int , last :: Int )
630
- # 1. This optimization gives different result from general fallback, if the inputs `f.(A)`
631
- # contains both 'missing' and 'Nan'.
632
- # 2. For Integer cases , general fallback seems faster.
633
- # Based the above reasons, only use this for AbstractFloat cases .
634
- Eltype = _return_type (i -> f (A[i]) , Tuple{Int})
628
+ function mapreduce_impl (f, op:: Union{typeof(max),typeof(min)} ,
629
+ A:: AbstractArrayOrBroadcasted , fi :: Int , la :: Int )
630
+ @inline elf (i) = @inbounds f (A[i])
631
+ # 1. If `f.(A)` contains both `missing` and `NaN`, this might return `NaN`.
632
+ # 2. For Integer input, the general fallback is about 2x faster.
633
+ # Thus limit this optimization to AbstractFloat.
634
+ Eltype = _return_type (elf , Tuple{Int})
635
635
Eltype <: AbstractFloat ||
636
- return invoke (mapreduce_impl,Tuple{Any,Any,AbstractArrayOrBroadcasted,Int,Int},f,op,A,first,last)
637
- a1 = @inbounds A[first]
638
- v1 = mapreduce_first (f, op, a1)
639
- v2 = v3 = v4 = v1
640
- chunk_len = 256
641
- start = first + 1
642
- simdstop = start + chunk_len - 4
643
- while simdstop <= last - 3
644
- # short circuit in case of NaN or missing
645
- v1 == v1 || return v1
646
- v2 == v2 || return v2
647
- v3 == v3 || return v3
648
- v4 == v4 || return v4
649
- @inbounds for i in start: 4 : simdstop
650
- v1 = _fast (op, v1, f (A[i+ 0 ]))
651
- v2 = _fast (op, v2, f (A[i+ 1 ]))
652
- v3 = _fast (op, v3, f (A[i+ 2 ]))
653
- v4 = _fast (op, v4, f (A[i+ 3 ]))
636
+ return invoke (mapreduce_impl,Tuple{Any,Any,AbstractArrayOrBroadcasted,Int,Int},f,op,A,fi,la)
637
+ v1 = v2 = v3 = v4 = elf (fi)
638
+ len = (la - fi) >> 2
639
+ i = fi
640
+ for I in Iterators. partition (1 : len, 64 )
641
+ for _ in I
642
+ v1 = _fast (op, v1, elf (i+= 1 ))
643
+ v2 = _fast (op, v2, elf (i+= 1 ))
644
+ v3 = _fast (op, v3, elf (i+= 1 ))
645
+ v4 = _fast (op, v4, elf (i+= 1 ))
654
646
end
655
- checkbounds (A, simdstop+ 3 )
656
- start += chunk_len
657
- simdstop += chunk_len
647
+ # short circuit in case of NaN
648
+ isnan (v1) && return v1
649
+ isnan (v2) && return v2
650
+ isnan (v3) && return v3
651
+ isnan (v4) && return v4
658
652
end
659
653
v = op (op (v1,v2),op (v3,v4))
660
- for i in start: last
661
- @inbounds ai = A[i]
662
- v = op (v, f (ai))
654
+ while i < la
655
+ v = op (v, elf (i+= 1 ))
663
656
end
664
-
665
657
# enforce correct order of 0.0 and -0.0
666
658
# e.g. maximum([0.0, -0.0]) === 0.0
667
659
# should hold
668
660
if isbadzero (op, v)
669
- for i in first : last
670
- x = @inbounds A[i]
661
+ for i in fi : la
662
+ x = elf (i)
671
663
isgoodzero (op,x) && return x
672
664
end
673
665
end
0 commit comments