@@ -605,71 +605,61 @@ julia> prod(1:5; init = 1.0)
605
605
prod (a; kw... ) = mapreduce (identity, mul_prod, a; kw... )
606
606
607
607
# # maximum & minimum
608
- _fast (:: typeof (min),x,y) = min (x,y)
609
- _fast (:: typeof (max),x,y) = max (x,y)
610
- function _fast (:: typeof (max), x:: AbstractFloat , y:: AbstractFloat )
611
- ifelse (isnan (x),
612
- x,
613
- ifelse (x > y, x, y))
614
- end
615
608
616
- function _fast (:: typeof (min),x:: AbstractFloat , y:: AbstractFloat )
617
- ifelse (isnan (x),
618
- x,
619
- ifelse (x < y, x, y))
620
- end
609
+ # Optimizaiton for min/max reduction
610
+ _fast (op) = (x, y) -> _fast (op, x, y)
611
+ _fast (op, x, y) = op (x, y)
621
612
622
- isbadzero (:: typeof (max), x:: AbstractFloat ) = (x == zero (x)) & signbit (x)
623
- isbadzero (:: typeof (min), x:: AbstractFloat ) = (x == zero (x)) & ! signbit (x)
624
- isbadzero (op, x) = false
625
- isgoodzero (:: typeof (max), x) = isbadzero (min, x)
626
- isgoodzero (:: typeof (min), x) = isbadzero (max, x)
627
-
628
- function mapreduce_impl (f, op:: Union{typeof(max), typeof(min)} ,
629
- A:: AbstractArrayOrBroadcasted , first:: Int , last:: Int )
630
- # 1. This optimization gives different result from general fallback, if the inputs `f.(A)`
631
- # contains both 'missing' and 'Nan'.
632
- # 2. For Integer cases, general fallback seems faster.
633
- # Based the above reasons, only use this for AbstractFloat cases.
634
- Eltype = _return_type (i -> f (A[i]), Tuple{Int})
635
- Eltype <: AbstractFloat ||
636
- return invoke (mapreduce_impl,Tuple{Any,Any,AbstractArrayOrBroadcasted,Int,Int},f,op,A,first,last)
637
- a1 = @inbounds A[first]
638
- v1 = mapreduce_first (f, op, a1)
639
- v2 = v3 = v4 = v1
640
- chunk_len = 256
641
- start = first + 1
642
- simdstop = start + chunk_len - 4
643
- while simdstop <= last - 3
644
- # short circuit in case of NaN or missing
645
- v1 == v1 || return v1
646
- v2 == v2 || return v2
647
- v3 == v3 || return v3
648
- v4 == v4 || return v4
649
- @inbounds for i in start: 4 : simdstop
650
- v1 = _fast (op, v1, f (A[i+ 0 ]))
651
- v2 = _fast (op, v2, f (A[i+ 1 ]))
652
- v3 = _fast (op, v3, f (A[i+ 2 ]))
653
- v4 = _fast (op, v4, f (A[i+ 3 ]))
654
- end
655
- checkbounds (A, simdstop+ 3 )
656
- start += chunk_len
657
- simdstop += chunk_len
658
- end
659
- v = op (op (v1,v2),op (v3,v4))
660
- for i in start: last
661
- @inbounds ai = A[i]
662
- v = op (v, f (ai))
613
+ # used in optimized mapreduce_impl for IEEEFloat
614
+ # where nan inputs has been handled
615
+ _fast (:: typeof (min), x:: T , y:: T ) where {T<: IEEEFloat } = ifelse (signbit (x- y), x, y)
616
+ _fast (:: typeof (max), x:: T , y:: T ) where {T<: IEEEFloat } = ifelse (signbit (x- y), y, x)
617
+
618
+ @inline function _anynan (x:: NTuple{N} ) where {N}
619
+ r = false ;
620
+ # use loop for better vectorization
621
+ @inbounds @simd for i in 1 : N>> 1
622
+ r |= (isnan (x[i])| isnan (x[i+ N>> 1 ]))
663
623
end
624
+ iseven (N) ? r : r| isnan (last (x))
625
+ end
664
626
665
- # enforce correct order of 0.0 and -0.0
666
- # e.g. maximum([0.0, -0.0]) === 0.0
667
- # should hold
668
- if isbadzero (op, v)
669
- for i in first: last
670
- x = @inbounds A[i]
671
- isgoodzero (op,x) && return x
627
+ function mapreduce_impl (f, op:: Union{typeof(max),typeof(min)} ,
628
+ A:: AbstractArrayOrBroadcasted , fi:: Int , la:: Int )
629
+ @inline elf (i:: Int ) = @inbounds f (A[i])
630
+ Eltype = _return_type (elf, Tuple{Int})
631
+ # For Integer input, general fallback is about 2x faster.
632
+ # Thus limit this optimization to IEEEFloat.
633
+ Eltype <: IEEEFloat ||
634
+ return mapreduce_impl (f, op, A, fi, la, pairwise_blocksize (f, op))
635
+ v, i = elf (fi), fi
636
+ if la - i >= 8
637
+ # we always return the first nan
638
+ @noinline firstnan (temp:: Tuple ) = temp[findfirst (isnan, temp)], fi
639
+ @noinline function simd_kernal (:: Val{N} , ini:: IEEEFloat , i:: Int ) where {N}
640
+ vs = ntuple (Returns (ini), Val (N)) # initial values (non nan)
641
+ index = ntuple (identity, Val (N))
642
+ for _ in 1 : (la- i)÷ N
643
+ temp = map (elf, i .+ index)
644
+ _anynan (temp) && return firstnan (temp) # nan check
645
+ vs = map (_fast (op), vs, temp) # then _fast(op) is safe
646
+ i += N
647
+ end
648
+ reduce (_fast (op), vs), i
649
+ end
650
+ isnan (v) && return v
651
+ # pick a proper unroll-size
652
+ if la - i < 64
653
+ v, i = simd_kernal (Val (4 ), v, i)
654
+ elseif la - i < 128
655
+ v, i = simd_kernal (Val (8 ), v, i)
656
+ else
657
+ v, i = simd_kernal (Val (16 ), v, i)
672
658
end
659
+ i == fi && return v # return by `firstnan`
660
+ end
661
+ while i < la
662
+ v = op (v, elf (i+= 1 ))
673
663
end
674
664
return v
675
665
end
0 commit comments