From 80f031fd1ea04affe8808faf8036fe47f4e0f418 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Fri, 9 May 2025 22:46:27 -0400
Subject: [PATCH 01/55] counting and summing Bools with a small integer `init`
 promote to `[U]Int`.

(cherry picked from commit c923b1ddf06466ab094bf8af1ecff3c30fa187e8)
---
 base/reduce.jl    | 12 ++++++++----
 test/reduce.jl    | 10 +++++-----
 test/reducedim.jl | 10 +++++-----
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index 9041fd088ccf3..95a740d9e38a9 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -14,8 +14,9 @@ The reduction operator used in `sum`. The main difference from [`+`](@ref) is th
 integers are promoted to `Int`/`UInt`.
 """
 add_sum(x, y) = x + y
-add_sum(x::BitSignedSmall, y::BitSignedSmall) = Int(x) + Int(y)
-add_sum(x::BitUnsignedSmall, y::BitUnsignedSmall) = UInt(x) + UInt(y)
+add_sum(x::Bool, y::Bool) = Int(x) + Int(y)
+add_sum(x::Union{Bool,BitSignedSmall}, y::Union{Bool,BitSignedSmall}) = Int(x) + Int(y)
+add_sum(x::Union{Bool,BitUnsignedSmall}, y::Union{Bool,BitUnsignedSmall}) = UInt(x) + UInt(y)
 add_sum(x::Real, y::Real)::Real = x + y
 
 """
@@ -402,6 +403,7 @@ reduce_first(::typeof(+), x::Bool) = Int(x)
 reduce_first(::typeof(*), x::AbstractChar) = string(x)
 
 reduce_first(::typeof(add_sum), x) = reduce_first(+, x)
+reduce_first(::typeof(add_sum), x::Bool)   = Int(x)
 reduce_first(::typeof(add_sum), x::BitSignedSmall)   = Int(x)
 reduce_first(::typeof(add_sum), x::BitUnsignedSmall) = UInt(x)
 reduce_first(::typeof(mul_prod), x) = reduce_first(*, x)
@@ -1083,8 +1085,10 @@ count(f, itr; init=0) = _simple_count(f, itr, init)
 
 _simple_count(pred, itr, init) = sum(_bool(pred), itr; init)
 
-function _simple_count(::typeof(identity), x::Array{Bool}, init::T=0) where {T}
-    n::T = init
+function _simple_count(::typeof(identity), x::Array{Bool}, init=0)
+    v0 = Base.add_sum(init, false)
+    T = typeof(v0)
+    n::T = v0
     chunks = length(x) ÷ sizeof(UInt)
     mask = 0x0101010101010101 % UInt
     GC.@preserve x begin
diff --git a/test/reduce.jl b/test/reduce.jl
index c6274711cdef4..329f18149b361 100644
--- a/test/reduce.jl
+++ b/test/reduce.jl
@@ -586,11 +586,11 @@ struct NonFunctionIsZero end
 @test count(NonFunctionIsZero(), [0]) == 1
 @test count(NonFunctionIsZero(), [1]) == 0
 
-@test count(Iterators.repeated(true, 3), init=0x04) === 0x07
-@test count(!=(2), Iterators.take(1:7, 3), init=Int32(0)) === Int32(2)
-@test count(identity, [true, false], init=Int8(5)) === Int8(6)
-@test count(!, [true false; false true], dims=:, init=Int16(0)) === Int16(2)
-@test isequal(count(identity, [true false; false true], dims=2, init=UInt(4)), reshape(UInt[5, 5], 2, 1))
+@test count(Iterators.repeated(true, 3), init=0x00) === UInt(3)
+@test count(!=(2), Iterators.take(1:7, 3), init=Int32(0)) === 2
+@test count(identity, [true, false], init=Int8(0)) === 1
+@test count(!, [true false; false true], dims=:, init=Int16(0)) === 2
+@test isequal(count(identity, [true false; false true], dims=2, init=UInt(0)), reshape(UInt[1, 1], 2, 1))
 
 ## cumsum, cummin, cummax
 
diff --git a/test/reducedim.jl b/test/reducedim.jl
index 6a6f20214058c..76efe6c503e21 100644
--- a/test/reducedim.jl
+++ b/test/reducedim.jl
@@ -88,12 +88,12 @@ safe_minabs(A::Array{T}, region) where {T} = safe_mapslices(minimum, abs.(A), re
     @test @inferred(count(!, Breduc, dims=region)) ≈ safe_count(.!Breduc, region)
 
     @test isequal(
-        @inferred(count(Breduc, dims=region, init=0x02)),
-        safe_count(Breduc, region) .% UInt8 .+ 0x02,
+        @inferred(Array{UInt8,ndims(Breduc)}, count(Breduc, dims=region, init=0x00)),
+        safe_count(Breduc, region),
     )
     @test isequal(
-        @inferred(count(!, Breduc, dims=region, init=Int16(0))),
-        safe_count(.!Breduc, region) .% Int16,
+        @inferred(Array{Int16,ndims(Breduc)}, count(!, Breduc, dims=region, init=Int16(0))),
+        safe_count(.!Breduc, region),
     )
 end
 
@@ -693,7 +693,7 @@ end
     @test_throws TypeError count!([1], [1])
 end
 
-@test @inferred(count(false:true, dims=:, init=0x0004)) === 0x0005
+@test @inferred(UInt16, count(false:true, dims=:, init=0x0000)) === UInt(1)
 @test @inferred(count(isodd, reshape(1:9, 3, 3), dims=:, init=Int128(0))) === Int128(5)
 
 @testset "reduced_index for BigInt (issue #39995)" begin

From 3c3fdcc3a190aec424a9be7d1f2f7bfa542d34f2 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliahub.com>
Date: Sat, 10 May 2025 00:57:42 -0400
Subject: [PATCH 02/55] Fix doctests

(cherry picked from commit 4bee1914f17bd512e15b07c3cdab1e042471cc6d)
---
 base/reduce.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index 95a740d9e38a9..7eec4be071be7 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -1075,8 +1075,8 @@ julia> count(i->(4<=i<=6), [2,3,4,5,6])
 julia> count([true, false, true, true])
 3
 
-julia> count(>(3), 1:7, init=0x03)
-0x07
+julia> count(>(3), 1:7, init=0x00)
+0x0000000000000004
 ```
 """
 count(itr; init=0) = count(identity, itr; init)

From 4baecd35b2c9ccbad453ae06f62837d15a8f4735 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Sat, 10 May 2025 20:31:20 -0400
Subject: [PATCH 03/55] keep the add_sum method table small

for accumulate inference

(cherry picked from commit 4fadd8f2cbc3dee450a996936f8e037f17186980)
---
 base/reduce.jl | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index 7eec4be071be7..e280dda5a08b0 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -14,9 +14,11 @@ The reduction operator used in `sum`. The main difference from [`+`](@ref) is th
 integers are promoted to `Int`/`UInt`.
 """
 add_sum(x, y) = x + y
-add_sum(x::Bool, y::Bool) = Int(x) + Int(y)
-add_sum(x::Union{Bool,BitSignedSmall}, y::Union{Bool,BitSignedSmall}) = Int(x) + Int(y)
-add_sum(x::Union{Bool,BitUnsignedSmall}, y::Union{Bool,BitUnsignedSmall}) = UInt(x) + UInt(y)
+add_sum(x::Integer, y::Integer)::Integer = add_sum_integer(x, y)
+add_sum_integer(x::Bool, y::Bool) = Int(x) + Int(y)
+add_sum_integer(x::Union{Bool,BitSignedSmall}, y::Union{Bool,BitSignedSmall}) = Int(x) + Int(y)
+add_sum_integer(x::Union{Bool,BitUnsignedSmall}, y::Union{Bool,BitUnsignedSmall}) = UInt(x) + UInt(y)
+add_sum_integer(x, y) = x+y
 add_sum(x::Real, y::Real)::Real = x + y
 
 """
@@ -403,8 +405,7 @@ reduce_first(::typeof(+), x::Bool) = Int(x)
 reduce_first(::typeof(*), x::AbstractChar) = string(x)
 
 reduce_first(::typeof(add_sum), x) = reduce_first(+, x)
-reduce_first(::typeof(add_sum), x::Bool)   = Int(x)
-reduce_first(::typeof(add_sum), x::BitSignedSmall)   = Int(x)
+reduce_first(::typeof(add_sum), x::Union{Bool,BitSignedSmall})   = Int(x)
 reduce_first(::typeof(add_sum), x::BitUnsignedSmall) = UInt(x)
 reduce_first(::typeof(mul_prod), x) = reduce_first(*, x)
 reduce_first(::typeof(mul_prod), x::BitSignedSmall)   = Int(x)

From 93796abb32f429865a42c16c5bd9605e9d80d5f0 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Thu, 29 Aug 2024 10:32:55 -0400
Subject: [PATCH 04/55] unify reduce/reducedim empty case behaviors

Previously, Julia tried to guess at initial values for empty dimensional reductions in a slightly different way than whole-array reductions. This change unifies those behaviors, such that `mapreduce_empty` is called for empty dimensional reductions, just like it is called for empty whole-array reductions.

Beyond the unification (which is valuable in its own right), this change enables some downstream simplifications to dimensional reductions and is likely to be the "most breaking" public behavior in a refactoring targeting more correctness improvements.

(cherry picked from commit 94f24b8bddf07927eb03f84ba9ef50f966a26633)
---
 NEWS.md           |  4 +++
 base/reducedim.jl | 14 ++++-----
 test/reducedim.jl | 74 ++++++++++++++++++++++++++++++++++++++---------
 3 files changed, 69 insertions(+), 23 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index d2eb96214beec..962065d3d386d 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -51,6 +51,10 @@ New library features
 Standard library changes
 ------------------------
 
+* Empty dimensional reductions (e.g., `reduce` and `mapreduce` with the `dims` keyword
+  selecting one or more dimensions) now behave like their whole-array (`dims=:`) counterparts,
+  only returning values in unambiguous cases and erroring otherwise.
+
 #### JuliaSyntaxHighlighting
 
 #### LinearAlgebra
diff --git a/base/reducedim.jl b/base/reducedim.jl
index ba3434494bc0b..5717e99d193de 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -341,8 +341,10 @@ _mapreduce_dim(f, op, ::_InitialValue, A::AbstractArrayOrBroadcasted, ::Colon) =
 _mapreduce_dim(f, op, nt, A::AbstractArrayOrBroadcasted, dims) =
     mapreducedim!(f, op, reducedim_initarray(A, dims, nt), A)
 
-_mapreduce_dim(f, op, ::_InitialValue, A::AbstractArrayOrBroadcasted, dims) =
+function _mapreduce_dim(f, op, ::_InitialValue, A::AbstractArrayOrBroadcasted, dims)
+    isempty(A) && return fill(mapreduce_empty(f, op, eltype(A)), reduced_indices(A, dims))
     mapreducedim!(f, op, reducedim_init(f, op, A, dims), A)
+end
 
 """
     reduce(f, A::AbstractArray; dims=:, [init])
@@ -1128,10 +1130,7 @@ findmin(f, A::AbstractArray; dims=:) = _findmin(f, A, dims)
 function _findmin(f, A, region)
     ri = reduced_indices0(A, region)
     if isempty(A)
-        if prod(map(length, reduced_indices(A, region))) != 0
-            throw(ArgumentError("collection slices must be non-empty"))
-        end
-        similar(A, promote_op(f, eltype(A)), ri), zeros(eltype(keys(A)), ri)
+        _empty_reduce_error()
     else
         fA = f(first(A))
         findminmax!(f, isgreater, fill!(similar(A, _findminmax_inittype(f, A), ri), fA),
@@ -1201,10 +1200,7 @@ findmax(f, A::AbstractArray; dims=:) = _findmax(f, A, dims)
 function _findmax(f, A, region)
     ri = reduced_indices0(A, region)
     if isempty(A)
-        if prod(map(length, reduced_indices(A, region))) != 0
-            throw(ArgumentError("collection slices must be non-empty"))
-        end
-        similar(A, promote_op(f, eltype(A)), ri), zeros(eltype(keys(A)), ri)
+        _empty_reduce_error()
     else
         fA = f(first(A))
         findminmax!(f, isless, fill!(similar(A, _findminmax_inittype(f, A), ri), fA),
diff --git a/test/reducedim.jl b/test/reducedim.jl
index 76efe6c503e21..fd8d003f33b2c 100644
--- a/test/reducedim.jl
+++ b/test/reducedim.jl
@@ -207,23 +207,34 @@ end
     @test isequal(prod(A, dims=(1, 2)), fill(1, 1, 1))
     @test isequal(prod(A, dims=3), fill(1, 0, 1))
 
-    for f in (minimum, maximum)
+    for f in (minimum, maximum, findmin, findmax)
         @test_throws "reducing over an empty collection is not allowed" f(A, dims=1)
-        @test isequal(f(A, dims=2), zeros(Int, 0, 1))
+        @test_throws "reducing over an empty collection is not allowed" f(A, dims=2)
         @test_throws "reducing over an empty collection is not allowed" f(A, dims=(1, 2))
-        @test isequal(f(A, dims=3), zeros(Int, 0, 1))
-    end
-    for f in (findmin, findmax)
-        @test_throws ArgumentError f(A, dims=1)
-        @test isequal(f(A, dims=2), (zeros(Int, 0, 1), zeros(Int, 0, 1)))
-        @test_throws ArgumentError f(A, dims=(1, 2))
-        @test isequal(f(A, dims=3), (zeros(Int, 0, 1), zeros(Int, 0, 1)))
-        @test_throws ArgumentError f(abs2, A, dims=1)
-        @test isequal(f(abs2, A, dims=2), (zeros(Int, 0, 1), zeros(Int, 0, 1)))
-        @test_throws ArgumentError f(abs2, A, dims=(1, 2))
-        @test isequal(f(abs2, A, dims=3), (zeros(Int, 0, 1), zeros(Int, 0, 1)))
+        @test_throws "reducing over an empty collection is not allowed" f(A, dims=3)
+        if f === maximum
+            # maximum allows some empty reductions with abs/abs2
+            z = f(abs, A)
+            @test isequal(f(abs, A, dims=1), fill(z, (1,1)))
+            @test isequal(f(abs, A, dims=2), fill(z, (0,1)))
+            @test isequal(f(abs, A, dims=(1,2)), fill(z, (1,1)))
+            @test isequal(f(abs, A, dims=3), fill(z, (0,1)))
+            z = f(abs2, A)
+            @test isequal(f(abs2, A, dims=1), fill(z, (1,1)))
+            @test isequal(f(abs2, A, dims=2), fill(z, (0,1)))
+            @test isequal(f(abs2, A, dims=(1,2)), fill(z, (1,1)))
+            @test isequal(f(abs2, A, dims=3), fill(z, (0,1)))
+        else
+            @test_throws "reducing over an empty collection is not allowed" f(abs, A, dims=1)
+            @test_throws "reducing over an empty collection is not allowed" f(abs, A, dims=2)
+            @test_throws "reducing over an empty collection is not allowed" f(abs, A, dims=(1, 2))
+            @test_throws "reducing over an empty collection is not allowed" f(abs, A, dims=3)
+            @test_throws "reducing over an empty collection is not allowed" f(abs2, A, dims=1)
+            @test_throws "reducing over an empty collection is not allowed" f(abs2, A, dims=2)
+            @test_throws "reducing over an empty collection is not allowed" f(abs2, A, dims=(1, 2))
+            @test_throws "reducing over an empty collection is not allowed" f(abs2, A, dims=3)
+        end
     end
-
 end
 
 ## findmin/findmax/minimum/maximum
@@ -720,3 +731,38 @@ end
     @test_broken @inferred(maximum(exp, A; dims = 1))[1] === missing
     @test_broken @inferred(extrema(exp, A; dims = 1))[1] === (missing, missing)
 end
+
+some_exception(op) = try return (Some(op()), nothing); catch ex; return (nothing, ex); end
+reduced_shape(sz, dims) = ntuple(d -> d in dims ? 1 : sz[d], length(sz))
+
+@testset "Ensure that calling, e.g., sum(empty; dims) has the same behavior as sum(empty)" begin
+    @testset "$r(Array{$T}(undef, $sz); dims=$dims)" for
+        r in (minimum, maximum, findmin, findmax, extrema, sum, prod, all, any, count),
+        T in (Int, Union{Missing, Int}, Number, Union{Missing, Number}, Bool, Union{Missing, Bool}, Any),
+        sz in ((0,), (0,1), (1,0), (0,0), (0,0,1), (1,0,1)),
+        dims in (1, 2, 3, 4, (1,2), (1,3), (2,3,4), (1,2,3))
+
+        A = Array{T}(undef, sz)
+        rsz = reduced_shape(sz, dims)
+
+        v, ex = some_exception() do; r(A); end
+        if isnothing(v)
+            @test_throws typeof(ex) r(A; dims)
+        else
+            actual = fill(something(v), rsz)
+            @test isequal(r(A; dims), actual)
+            @test eltype(r(A; dims)) === eltype(actual)
+        end
+
+        for f in (identity, abs, abs2)
+            v, ex = some_exception() do; r(f, A); end
+            if isnothing(v)
+                @test_throws typeof(ex) r(f, A; dims)
+            else
+                actual = fill(something(v), rsz)
+                @test isequal(r(f, A; dims), actual)
+                @test eltype(r(f, A; dims)) === eltype(actual)
+            end
+        end
+    end
+end

From 0c3d4b2bf0948a9e0f6bf7eb8ca7f2b4bd116a43 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Fri, 18 Apr 2025 22:43:12 -0400
Subject: [PATCH 05/55] TEMPORARILY USE SPARSEARRAYS PR BRANCH

(cherry picked from commit 0e924cf003de1673af4ca775504049636ed7ffd1)
---
 .../md5                                                   | 1 +
 .../sha512                                                | 1 +
 stdlib/SparseArrays.version                               | 8 ++++----
 3 files changed, 6 insertions(+), 4 deletions(-)
 create mode 100644 deps/checksums/SparseArrays-ac0cc6e8278bb614dc2a16e9353fa092dae6606a.tar.gz/md5
 create mode 100644 deps/checksums/SparseArrays-ac0cc6e8278bb614dc2a16e9353fa092dae6606a.tar.gz/sha512

diff --git a/deps/checksums/SparseArrays-ac0cc6e8278bb614dc2a16e9353fa092dae6606a.tar.gz/md5 b/deps/checksums/SparseArrays-ac0cc6e8278bb614dc2a16e9353fa092dae6606a.tar.gz/md5
new file mode 100644
index 0000000000000..edb55470f2e63
--- /dev/null
+++ b/deps/checksums/SparseArrays-ac0cc6e8278bb614dc2a16e9353fa092dae6606a.tar.gz/md5
@@ -0,0 +1 @@
+edafbcbfc29cb7a86aef4a92bf7e7267
diff --git a/deps/checksums/SparseArrays-ac0cc6e8278bb614dc2a16e9353fa092dae6606a.tar.gz/sha512 b/deps/checksums/SparseArrays-ac0cc6e8278bb614dc2a16e9353fa092dae6606a.tar.gz/sha512
new file mode 100644
index 0000000000000..f57ee756ca6fc
--- /dev/null
+++ b/deps/checksums/SparseArrays-ac0cc6e8278bb614dc2a16e9353fa092dae6606a.tar.gz/sha512
@@ -0,0 +1 @@
+230151395d43b005e1285f5938a5800650314dc562e4cc6363775b54b45b33076a551276e45a977a0ea3b07ca0088a2ea27def63196f435ff1d6153b93999485
diff --git a/stdlib/SparseArrays.version b/stdlib/SparseArrays.version
index 9498fbe4943f9..7a1d9697c9769 100644
--- a/stdlib/SparseArrays.version
+++ b/stdlib/SparseArrays.version
@@ -1,4 +1,4 @@
-SPARSEARRAYS_BRANCH = main
-SPARSEARRAYS_SHA1 = f3610c07fe0403792743d9c9802a25642a5f2d18
-SPARSEARRAYS_GIT_URL := https://github.com/JuliaSparse/SparseArrays.jl.git
-SPARSEARRAYS_TAR_URL = https://api.github.com/repos/JuliaSparse/SparseArrays.jl/tarball/$1
+SPARSEARRAYS_BRANCH = mb/mapreducedim_empty
+SPARSEARRAYS_SHA1 = ac0cc6e8278bb614dc2a16e9353fa092dae6606a
+SPARSEARRAYS_GIT_URL := https://github.com/mbauman/SparseArrays.jl.git
+SPARSEARRAYS_TAR_URL = https://api.github.com/repos/mbauman/SparseArrays.jl/tarball/$1

From 0a3f8443ffbf24161780f962c749d1ab57b60f3a Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Mon, 12 May 2025 16:21:14 -0400
Subject: [PATCH 06/55] add two-arg axes and size for Broadcasted

---
 base/broadcast.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/base/broadcast.jl b/base/broadcast.jl
index b86baf08ddfe0..81896a886ff65 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -236,6 +236,7 @@ Base.similar(::Broadcasted{ArrayConflict}, ::Type{Bool}, dims) =
     similar(BitArray, dims)
 
 @inline Base.axes(bc::Broadcasted) = _axes(bc, bc.axes)
+@inline Base.axes(bc::Broadcasted, d) = get(axes(bc), d, OneTo(1))
 _axes(::Broadcasted, axes::Tuple) = axes
 @inline _axes(bc::Broadcasted, ::Nothing)  = combine_axes(bc.args...)
 _axes(bc::Broadcasted{<:AbstractArrayStyle{0}}, ::Nothing) = ()
@@ -266,6 +267,7 @@ Base.ndims(bc::Broadcasted) = ndims(typeof(bc))
 Base.ndims(::Type{<:Broadcasted{<:Any,<:NTuple{N,Any}}}) where {N} = N
 
 Base.size(bc::Broadcasted) = map(length, axes(bc))
+Base.size(bc::Broadcasted, d) = length(axes(bc, d))
 Base.length(bc::Broadcasted) = prod(size(bc))
 
 function Base.iterate(bc::Broadcasted)

From 19f0ebb713b83a8840c21a1cd257ce114a31bced Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Mon, 12 May 2025 16:21:58 -0400
Subject: [PATCH 07/55] fully pairwise mapreduce with consistent use of init

---
 base/deprecated.jl               |  15 +
 base/fastmath.jl                 |   9 +-
 base/missing.jl                  |  96 +-----
 base/multidimensional.jl         |  36 ++
 base/permuteddimsarray.jl        |  16 +-
 base/reduce.jl                   | 301 +++++++++++++----
 base/reducedim.jl                | 561 ++++++++++++-------------------
 test/reduce.jl                   |  57 ++++
 test/reducedim.jl                | 278 ++++++++++++++-
 test/testhelpers/OffsetArrays.jl |   1 -
 10 files changed, 849 insertions(+), 521 deletions(-)

diff --git a/base/deprecated.jl b/base/deprecated.jl
index c5701adf1a420..aba128b178b76 100644
--- a/base/deprecated.jl
+++ b/base/deprecated.jl
@@ -558,3 +558,18 @@ true
 isbindingresolved
 
 # END 1.12 deprecations
+
+# BEGIN 1.13 deprecations
+
+function reducedim_initarray end
+const _dep_message_reducedim_initarray = ", these internals have been removed. To customize the array returned by dimensional reductions, implement mapreduce_similar instead"
+deprecate(Base, :reducedim_initarray)
+function _mapreduce_dim end
+const _dep_message__mapreduce_dim = ", these internals have been removed. To customize the array returned by dimensional reductions, implement mapreduce_similar instead"
+deprecate(Base, :_mapreduce_dim)
+@deprecate_binding mapreducedim! Base.mapreduce! false
+@deprecate_binding _mapreducedim! Base.mapreduce! false
+@deprecate_binding reducedim! Base.reduce! false
+@deprecate mapreduce_impl(f, op, A::AbstractArrayOrBroadcasted, ifirst::Integer, ilast::Integer, blksize::Int=pairwise_blocksize(f, op)) Base.mapreduce_pairwise(f, op, A, _InitialValue(), ifirst:ilast) false
+
+# END 1.13 deprecations
diff --git a/base/fastmath.jl b/base/fastmath.jl
index f2f60519b99ac..ce3c3a1c40bf6 100644
--- a/base/fastmath.jl
+++ b/base/fastmath.jl
@@ -401,19 +401,14 @@ minimum_fast(a; kw...) = Base.reduce(min_fast, a; kw...)
 maximum_fast(f, a; kw...) = Base.mapreduce(f, max_fast, a; kw...)
 minimum_fast(f, a; kw...) = Base.mapreduce(f, min_fast, a; kw...)
 
-Base.reducedim_init(f, ::typeof(max_fast), A::AbstractArray, region) =
-    Base.reducedim_init(f, max, A::AbstractArray, region)
-Base.reducedim_init(f, ::typeof(min_fast), A::AbstractArray, region) =
-    Base.reducedim_init(f, min, A::AbstractArray, region)
-
 maximum!_fast(r::AbstractArray, A::AbstractArray; kw...) =
     maximum!_fast(identity, r, A; kw...)
 minimum!_fast(r::AbstractArray, A::AbstractArray; kw...) =
     minimum!_fast(identity, r, A; kw...)
 
 maximum!_fast(f::Function, r::AbstractArray, A::AbstractArray; init::Bool=true) =
-    Base.mapreducedim!(f, max_fast, Base.initarray!(r, f, max, init, A), A)
+    Base.mapreduce!(f, max_fast, r, A; update=!init)
 minimum!_fast(f::Function, r::AbstractArray, A::AbstractArray; init::Bool=true) =
-    Base.mapreducedim!(f, min_fast, Base.initarray!(r, f, min, init, A), A)
+    Base.mapreduce!(f, min_fast, r, A; update=!init)
 
 end
diff --git a/base/missing.jl b/base/missing.jl
index 5fc0e6ec9e620..4dce394627af4 100644
--- a/base/missing.jl
+++ b/base/missing.jl
@@ -269,99 +269,9 @@ function show(io::IO, s::SkipMissing)
     print(io, ')')
 end
 
-# Optimized mapreduce implementation
-# The generic method is faster when !(eltype(A) >: Missing) since it does not need
-# additional loops to identify the two first non-missing values of each block
-mapreduce(f, op, itr::SkipMissing{<:AbstractArray}) =
-    _mapreduce(f, op, IndexStyle(itr.x), eltype(itr.x) >: Missing ? itr : itr.x)
-
-function _mapreduce(f, op, ::IndexLinear, itr::SkipMissing{<:AbstractArray})
-    A = itr.x
-    ai = missing
-    inds = LinearIndices(A)
-    i = first(inds)
-    ilast = last(inds)
-    for outer i in i:ilast
-        @inbounds ai = A[i]
-        !ismissing(ai) && break
-    end
-    ismissing(ai) && return mapreduce_empty(f, op, eltype(itr))
-    a1::eltype(itr) = ai
-    i == typemax(typeof(i)) && return mapreduce_first(f, op, a1)
-    i += 1
-    ai = missing
-    for outer i in i:ilast
-        @inbounds ai = A[i]
-        !ismissing(ai) && break
-    end
-    ismissing(ai) && return mapreduce_first(f, op, a1)
-    # We know A contains at least two non-missing entries: the result cannot be nothing
-    something(mapreduce_impl(f, op, itr, first(inds), last(inds)))
-end
-
-_mapreduce(f, op, ::IndexCartesian, itr::SkipMissing) = mapfoldl(f, op, itr)
-
-mapreduce_impl(f, op, A::SkipMissing, ifirst::Integer, ilast::Integer) =
-    mapreduce_impl(f, op, A, ifirst, ilast, pairwise_blocksize(f, op))
-
-# Returns nothing when the input contains only missing values, and Some(x) otherwise
-@noinline function mapreduce_impl(f, op, itr::SkipMissing{<:AbstractArray},
-                                  ifirst::Integer, ilast::Integer, blksize::Int)
-    A = itr.x
-    if ifirst > ilast
-        return nothing
-    elseif ifirst == ilast
-        @inbounds a1 = A[ifirst]
-        if ismissing(a1)
-            return nothing
-        else
-            return Some(mapreduce_first(f, op, a1))
-        end
-    elseif ilast - ifirst < blksize
-        # sequential portion
-        ai = missing
-        i = ifirst
-        for outer i in i:ilast
-            @inbounds ai = A[i]
-            !ismissing(ai) && break
-        end
-        ismissing(ai) && return nothing
-        a1 = ai::eltype(itr)
-        i == typemax(typeof(i)) && return Some(mapreduce_first(f, op, a1))
-        i += 1
-        ai = missing
-        for outer i in i:ilast
-            @inbounds ai = A[i]
-            !ismissing(ai) && break
-        end
-        ismissing(ai) && return Some(mapreduce_first(f, op, a1))
-        a2 = ai::eltype(itr)
-        i == typemax(typeof(i)) && return Some(op(f(a1), f(a2)))
-        i += 1
-        v = op(f(a1), f(a2))
-        @simd for i = i:ilast
-            @inbounds ai = A[i]
-            if !ismissing(ai)
-                v = op(v, f(ai))
-            end
-        end
-        return Some(v)
-    else
-        # pairwise portion
-        imid = ifirst + (ilast - ifirst) >> 1
-        v1 = mapreduce_impl(f, op, itr, ifirst, imid, blksize)
-        v2 = mapreduce_impl(f, op, itr, imid+1, ilast, blksize)
-        if v1 === nothing && v2 === nothing
-            return nothing
-        elseif v1 === nothing
-            return v2
-        elseif v2 === nothing
-            return v1
-        else
-            return Some(op(something(v1), something(v2)))
-        end
-    end
-end
+# Simple optimization for mapreduce if the array cannot hold Missing
+mapreduce(f, op, itr::SkipMissing{<:AbstractArray}; init=Base._InitialValue(), dims=(:)) =
+    mapreducedim(f, op, eltype(itr.x) >: Missing ? itr : itr.x, init, dims)
 
 """
     filter(f, itr::SkipMissing{<:AbstractArray})
diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 7add7b9e74205..cc557d24e664f 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -1046,6 +1046,42 @@ end
     $(_generate_unsafe_setindex!_body(2))
 end
 
+# Reduction support for CartesianIndices
+halves(inds::CartesianIndices) = map(CartesianIndices∘tuple, _halves(inds.indices)...)
+mapreduce_kernel(f, op, A, init, ::CartesianIndices{0}) = init===_InitialValue() ? mapreduce_first(f, op, only(A)) : op(init, f(only(A)))
+mapreduce_kernel(f, op, A, init, inds::CartesianIndices{1}) = mapreduce_kernel(f, op, A, init, inds.indices[1])
+function mapreduce_kernel(f, op, A, init, inds::CartesianIndices)
+    i1, s = iterate(inds)
+    a1 = @inbounds A[i1]
+    v = _mapreduce_start(f, op, A, init, a1)
+    r = inds.indices[1]
+    if length(r) == 1
+        # SIMD over a one-element loop is less-than-helpful; just iterate
+        # over the rest of the indices without worrying about splitting out
+        # an inner SIMD loop
+        for i in Iterators.rest(inds, s)
+            ai = @inbounds A[i]
+            v = op(v, f(ai))
+        end
+    else
+        # Divide the loop into an outer and inner loop, and skip the first element
+        # in the first iteration of the outer loop
+        outer = CartesianIndices(tail(inds.indices))
+        o1, so = iterate(outer)
+        @simd for i in r[begin+1:end]
+            ai = @inbounds A[i, o1]
+            v = op(v, f(ai))
+        end
+        for o in Iterators.rest(outer, so)
+            @simd for i in r
+                ai = @inbounds A[i, o]
+                v = op(v, f(ai))
+            end
+        end
+    end
+    return v
+end
+
 diff(a::AbstractVector) = diff(a, dims=1)
 
 """
diff --git a/base/permuteddimsarray.jl b/base/permuteddimsarray.jl
index cf9748168aac2..2bb76e5bfd77d 100644
--- a/base/permuteddimsarray.jl
+++ b/base/permuteddimsarray.jl
@@ -345,21 +345,21 @@ end
 
 const CommutativeOps = Union{typeof(+),typeof(Base.add_sum),typeof(min),typeof(max),typeof(Base._extrema_rf),typeof(|),typeof(&)}
 
-function Base._mapreduce_dim(f, op::CommutativeOps, init::Base._InitialValue, A::PermutedDimsArray, dims::Colon)
-    Base._mapreduce_dim(f, op, init, parent(A), dims)
+function Base.mapreduce(f, op::CommutativeOps, A::PermutedDimsArray; init=Base._InitialValue(), dims=(:))
+    mapreduce(f, op, parent(A); init, dims)
 end
-function Base._mapreduce_dim(f::typeof(identity), op::Union{typeof(Base.mul_prod),typeof(*)}, init::Base._InitialValue, A::PermutedDimsArray{<:Union{Real,Complex}}, dims::Colon)
-    Base._mapreduce_dim(f, op, init, parent(A), dims)
+function Base.mapreduce(f::typeof(identity), op::Union{typeof(Base.mul_prod),typeof(*)}, A::PermutedDimsArray{<:Union{Real,Complex}}; init=_InitialValue(), dims=(:))
+    mapreduce(f, op, parent(A); init, dims)
 end
 
-function Base.mapreducedim!(f, op::CommutativeOps, B::AbstractArray{T,N}, A::PermutedDimsArray{S,N,perm,iperm}) where {T,S,N,perm,iperm}
+function Base.mapreduce!(f, op::CommutativeOps, B::AbstractArray{T,N}, A::PermutedDimsArray{S,N,perm,iperm}; init=Base._InitialValue()) where {T,S,N,perm,iperm}
     C = PermutedDimsArray{T,N,iperm,perm,typeof(B)}(B) # make the inverse permutation for the output
-    Base.mapreducedim!(f, op, C, parent(A))
+    Base.mapreduce!(f, op, C, parent(A); init)
     B
 end
-function Base.mapreducedim!(f::typeof(identity), op::Union{typeof(Base.mul_prod),typeof(*)}, B::AbstractArray{T,N}, A::PermutedDimsArray{<:Union{Real,Complex},N,perm,iperm}) where {T,N,perm,iperm}
+function Base.mapreduce!(f::typeof(identity), op::Union{typeof(Base.mul_prod),typeof(*)}, B::AbstractArray{T,N}, A::PermutedDimsArray{<:Union{Real,Complex},N,perm,iperm}; init=Base._InitialValue()) where {T,N,perm,iperm}
     C = PermutedDimsArray{T,N,iperm,perm,typeof(B)}(B) # make the inverse permutation for the output
-    Base.mapreducedim!(f, op, C, parent(A))
+    Base.mapreduce!(f, op, C, parent(A); init)
     B
 end
 
diff --git a/base/reduce.jl b/base/reduce.jl
index e280dda5a08b0..a5f49c8d2681f 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -242,42 +242,6 @@ julia> foldr(=>, 1:4; init=0)
 """
 foldr(op, itr; kw...) = mapfoldr(identity, op, itr; kw...)
 
-## reduce & mapreduce
-
-# `mapreduce_impl()` is called by `mapreduce()` (via `_mapreduce()`, when `A`
-# supports linear indexing) and does actual calculations (for `A[ifirst:ilast]` subset).
-# For efficiency, no parameter validity checks are done, it's the caller's responsibility.
-# `ifirst:ilast` range is assumed to be a valid non-empty subset of `A` indices.
-
-# This is a generic implementation of `mapreduce_impl()`,
-# certain `op` (e.g. `min` and `max`) may have their own specialized versions.
-@noinline function mapreduce_impl(f, op, A::AbstractArrayOrBroadcasted,
-                                  ifirst::Integer, ilast::Integer, blksize::Int)
-    if ifirst == ilast
-        @inbounds a1 = A[ifirst]
-        return mapreduce_first(f, op, a1)
-    elseif ilast - ifirst < blksize
-        # sequential portion
-        @inbounds a1 = A[ifirst]
-        @inbounds a2 = A[ifirst+1]
-        v = op(f(a1), f(a2))
-        @simd for i = ifirst + 2 : ilast
-            @inbounds ai = A[i]
-            v = op(v, f(ai))
-        end
-        return v
-    else
-        # pairwise portion
-        imid = ifirst + (ilast - ifirst) >> 1
-        v1 = mapreduce_impl(f, op, A, ifirst, imid, blksize)
-        v2 = mapreduce_impl(f, op, A, imid+1, ilast, blksize)
-        return op(v1, v2)
-    end
-end
-
-mapreduce_impl(f, op, A::AbstractArrayOrBroadcasted, ifirst::Integer, ilast::Integer) =
-    mapreduce_impl(f, op, A, ifirst, ilast, pairwise_blocksize(f, op))
-
 """
     mapreduce(f, op, itrs...; [init])
 
@@ -305,8 +269,40 @@ implementations may reuse the return value of `f` for elements that appear multi
 `itr`. Use [`mapfoldl`](@ref) or [`mapfoldr`](@ref) instead for
 guaranteed left or right associativity and invocation of `f` for every value.
 """
-mapreduce(f, op, itr; kw...) = mapfoldl(f, op, itr; kw...)
-mapreduce(f, op, itr, itrs...; kw...) = reduce(op, Generator(f, itr, itrs...); kw...)
+mapreduce(f, op, x; init=_InitialValue()) = mapreduce_pairwise(f, op, x, init)
+mapreduce(f, op, x, xs...; init=_InitialValue()) = mapreduce_pairwise(identity, op, Generator(f, x, xs...), init)
+
+"""
+    mapreduce(f, op, A::AbstractArray...; dims=:, [init])
+
+Evaluates to the same as `reduce(op, map(f, A...); dims=dims, init=init)`, but is generally
+faster because the intermediate array is avoided.
+
+!!! compat "Julia 1.2"
+    `mapreduce` with multiple iterators requires Julia 1.2 or later.
+
+# Examples
+```jldoctest
+julia> a = reshape(Vector(1:16), (4,4))
+4×4 Matrix{Int64}:
+ 1  5   9  13
+ 2  6  10  14
+ 3  7  11  15
+ 4  8  12  16
+
+julia> mapreduce(isodd, *, a, dims=1)
+1×4 Matrix{Bool}:
+ 0  0  0  0
+
+julia> mapreduce(isodd, |, a, dims=1)
+1×4 Matrix{Bool}:
+ 1  1  1  1
+```
+"""
+mapreduce(f, op, A::AbstractArrayOrBroadcasted; init=_InitialValue(), dims=(:)) = mapreducedim(f, op, A, init, dims)
+mapreduce(f, op, A::AbstractArrayOrBroadcasted, As::AbstractArrayOrBroadcasted...; init=_InitialValue(), dims=(:)) =
+    reduce(op, map(f, A, As...); init, dims)
+mapreducedim(f, op, A, init, ::Colon) = mapreduce_pairwise(f, op, A, init)
 
 # Note: sum_seq usually uses four or more accumulators after partial
 # unrolling, so each accumulator gets at most 256 numbers
@@ -401,11 +397,10 @@ additional methods should only be defined for cases where `op` gives a result wi
 different types than its inputs.
 """
 reduce_first(op, x) = x
-reduce_first(::typeof(+), x::Bool) = Int(x)
 reduce_first(::typeof(*), x::AbstractChar) = string(x)
 
 reduce_first(::typeof(add_sum), x) = reduce_first(+, x)
-reduce_first(::typeof(add_sum), x::Union{Bool,BitSignedSmall})   = Int(x)
+reduce_first(::typeof(add_sum), x::BitSignedSmall)   = Int(x)
 reduce_first(::typeof(add_sum), x::BitUnsignedSmall) = UInt(x)
 reduce_first(::typeof(mul_prod), x) = reduce_first(*, x)
 reduce_first(::typeof(mul_prod), x::BitSignedSmall)   = Int(x)
@@ -423,34 +418,176 @@ The default is `reduce_first(op, f(x))`.
 """
 mapreduce_first(f, op, x) = reduce_first(op, f(x))
 
-_mapreduce(f, op, A::AbstractArrayOrBroadcasted) = _mapreduce(f, op, IndexStyle(A), A)
-
-function _mapreduce(f, op, ::IndexLinear, A::AbstractArrayOrBroadcasted)
-    inds = LinearIndices(A)
-    n = length(inds)
-    if n == 0
-        return mapreduce_empty_iter(f, op, A, IteratorEltype(A))
-    elseif n == 1
-        @inbounds a1 = A[first(inds)]
-        return mapreduce_first(f, op, a1)
-    elseif n < 16 # process short array here, avoid mapreduce_impl() compilation
-        @inbounds i = first(inds)
-        @inbounds a1 = A[i]
-        @inbounds a2 = A[i+=1]
-        s = op(f(a1), f(a2))
-        while i < last(inds)
-            @inbounds Ai = A[i+=1]
-            s = op(s, f(Ai))
-        end
-        return s
-    else
-        return mapreduce_impl(f, op, A, first(inds), last(inds))
+_empty_eltype(x) = _empty_eltype(x, IteratorEltype(x))
+_empty_eltype(x, ::HasEltype) = eltype(x)
+_empty_eltype(_, _) = _empty_reduce_error()
+"""
+    _mapreduce_start(f, op, A, init, [a1])
+Perform the first step in a mapped reduction over `A` with 0 or one or more elements.
+The one-element method may be called multiple times within a single reduction at
+the start of each new chain of `op` calls.
+"""
+_mapreduce_start(f, op, A, ::_InitialValue) = mapreduce_empty(f, op, _empty_eltype(A))
+_mapreduce_start(f, op, A, ::_InitialValue, a1) = mapreduce_first(f, op, a1)
+_mapreduce_start(f, op, A, init) = init
+_mapreduce_start(f, op, A, init, a1) = op(init, mapreduce_first(f, op, a1))
+
+"""
+    mapreduce_pairwise(f, op, A, init)
+    mapreduce_pairwise(f, op, A, init, indices)
+    mapreduce_pairwise(f, op, itr, init, ::Union{HasLength, HasShape}, n, [state])
+    mapreduce_pairwise(f, op, itr, init, ::IteratorSize, n, valstate)
+
+Perform a pairwise reassociation of the mapped reduction with `f` and `op`
+Returns the result `v` or, for the 6+ argument iterator methods, a tuple `(v, state)` or `(v, iterate(itr, state))`
+
+The four-argument entry-point handles the empty case and dispatches to a polyalgorithm
+with three distinct mechanisms for accessing elements and structuring the subsequent
+recursion; all such recursive calls **must** process at least one element.
+
+    * An indexing method recursively splits `indices` until there are fewer than
+      `pairwise_blocksize` indices to process. CartesianIndices split into halves
+      across columns first, then within a single column (if necessary).
+    * An iterable method (with length) recursively takes half as many elements from
+      the iterable until `n` is less than or equal to the `pairwise_blocksize`. This
+      is slightly more complicated than the indexing case as we must additionally pass
+      (and return) the iterable's state along with the result of each part of the
+      reduction.
+    * An iterable method (without length) peels one element (and state) in advance of
+      every recursive call to handle the empty case and ensure that all recursive calls
+      process at least one element. This is slightly more complicated than the length
+      case as we must pass the iterable's next `(val, state)` to each recursive call
+      and similarly return it as the second part of the internal return tuple.
+      Additionally this algorithm effectively works backwards from the other two;
+      instead of starting at the *final* `op` reduction of the (possibly many) branches
+      of reduction chains, this starts at the initial chain of length **up to** the
+      `pairwise_blocksize` and then iteratively widens to incorporate larger chunks
+      (that are then recursively split) as necessary.
+
+All implementations dispatch to a similarly structured `mapreduce_kernel` to handle the
+base case that's essentially a `mapfoldl` that allows for further SIMD-like optimizations
+and reassociations.
+"""
+function mapreduce_pairwise(f, op, A::AbstractArrayOrBroadcasted, init)
+    isempty(A) && return _mapreduce_start(f, op, A, init)
+    length(A) <= pairwise_blocksize(f, op) && return mapreduce_kernel(f, op, A, init, eachindex(A))
+    return mapreduce_pairwise(f, op, A, init, eachindex(A))
+end
+mapreduce_pairwise(f, op, itr, init) = mapreduce_pairwise(f, op, itr, init, IteratorSize(itr))
+function mapreduce_pairwise(f, op, itr, init, S::Union{HasLength, HasShape})
+    n = length(itr)
+    n < 1 && return _mapreduce_start(f, op, itr, init)
+    n <= 16 && return mapfoldl(f, op, itr; init)
+    n <= pairwise_blocksize(f, op) && return mapreduce_kernel(f, op, itr, init, S, n)[1]
+    return mapreduce_pairwise(f, op, itr, init, S, n)[1]
+end
+function mapreduce_pairwise(f::F, op, itr, init, S::IteratorSize) where {F}
+    it = iterate(itr)
+    it === nothing && return _mapreduce_start(f, op, itr, init)
+    n = pairwise_blocksize(f, op)
+    v, it = mapreduce_kernel(f, op, itr, init, S, n, it)
+    while it !== nothing
+        n <<= 1
+        v1, it = mapreduce_pairwise(f, op, itr, init, S, n, it)
+        v = op(v, v1)
     end
+    return v
 end
 
-mapreduce(f, op, a::Number) = mapreduce_first(f, op, a)
+function mapreduce_pairwise(f::F, op::G, A, init, inds) where {F,G}
+    if length(inds) <= max(10, pairwise_blocksize(f, op))
+        return mapreduce_kernel(f, op, A, init, inds)
+    else
+        p1, p2 = halves(inds)
+        v1 = mapreduce_pairwise(f, op, A, init, p1)
+        v2 = mapreduce_pairwise(f, op, A, init, p2)
+        return op(v1, v2)
+    end
+end
+function halves(inds)
+    n = length(inds) >> 1
+    return (inds[begin:begin+n-1], inds[begin+n:end])
+end
+# Recursively find the last non-singleton range in the tuple to halve, keeping the rest whole
+_halves(tup) = length(tup[end]) > 1 ?
+                    (map(_wholes, front(tup))..., halves(tup[end])) :
+                    (_halves(front(tup))..., _wholes(tup[end]))
+_halves(::Tuple{}) = ()
+# This unconditionally re-indexes by begin:end to ensure type stability with the halving re-index
+_wholes(r) = r[begin:end], r[begin:end]
+
+function mapreduce_pairwise(f::F, op::G, itr, init, S::Union{HasLength,HasShape}, n, state...) where {F,G}
+    if n < max(10, pairwise_blocksize(f, op))
+        return mapreduce_kernel(f, op, itr, init, S, n, state...)
+    else
+        ndiv2 = n >> 1
+        v1, s = mapreduce_pairwise(f, op, itr, init, S, ndiv2, state...)
+        v2, s = mapreduce_pairwise(f, op, itr, init, S, n-ndiv2, s)
+        return (op(v1, v2), s)
+    end
+end
+function mapreduce_pairwise(f::F, op, itr, init, S::IteratorSize, n, it) where {F}
+    if n <= max(10, pairwise_blocksize(f, op))
+        v, it = mapreduce_kernel(f, op, itr, init, S, n, it)
+        return it === nothing ? (v, nothing) : (v, it)
+    else
+        ndiv2 = n >> 1
+        v1, it = mapreduce_pairwise(f, op, itr, init, S, ndiv2, it)
+        it === nothing && return (v1, nothing)
+        v2, it = mapreduce_pairwise(f, op, itr, init, S, n-ndiv2, it)
+        v = op(v1, v2)
+        return it === nothing ? (v, nothing) : (v, it)
+    end
+end
 
-_mapreduce(f, op, ::IndexCartesian, A::AbstractArrayOrBroadcasted) = mapfoldl(f, op, A)
+"""
+    mapreduce_kernel(f, op, A, init, inds) -> v
+    mapreduce_kernel(f, op, itr, init, ::Union{HasLength, HasShape}, n, [state]) -> (v, state)
+    mapreduce_kernel(f, op, itr, init, ::IteratorSize, n, valstate) -> (v, iterate(itr, state))
+
+Perform the mapped reduction with `f` and `op` akin to a `mapfoldl` but permitting SIMD-like
+optimizations and reassociations. Corresponding to `mapreduce_pairwise`, this function uses
+three (and a half) different mechanisms for accessing elements and structuring its loop(s):
+
+* Indexing with `inds` into `A`
+    * (with a dedicated optimization for CartesianIndices)
+* Iterating exactly `n` elements from `itr` with a known length and at least `n` elements remaining
+    This method additionally takes (and returns as the second element in a tuple) the current iterator state
+* Iterating *up to* `n` elements from `itr`, but possibly stopping early
+    This method additionally takes (and returns as the second element in a tuple) the *next* `(val, state)`
+    tuple from the iterator or `nothing`
+"""
+function mapreduce_kernel(f, op, A, init, inds)
+    a1 = @inbounds A[inds[begin]]
+    v = _mapreduce_start(f, op, A, init, a1)
+    length(inds) == 1 && return v
+    @simd for i in inds[begin+1:end]
+        a = @inbounds A[i]
+        v = op(v, f(a))
+    end
+    return v
+end
+function mapreduce_kernel(f, op, itr, init, ::Union{HasLength, HasShape}, n, state...)
+    a1, s = iterate(itr, state...)
+    v = _mapreduce_start(f, op, itr, init, a1)
+    @simd for _ in 2:n
+        a, s = iterate(itr, s)
+        v = op(v, f(a))
+    end
+    return v, s
+end
+function mapreduce_kernel(f, op, itr, init,  ::IteratorSize, n, it)
+    a1, s = it
+    v = _mapreduce_start(f, op, itr, init, a1)
+    @simd for _ in 2:n
+        it = iterate(itr, s)
+        it === nothing && return v, nothing
+        a, s = it
+        v = op(v, f(a))
+    end
+    it = iterate(itr, s)
+    return it === nothing ? (v, nothing) : (v, it)
+end
 
 """
     reduce(op, itr; [init])
@@ -487,9 +624,41 @@ julia> reduce(*, Int[]; init=1)
 1
 ```
 """
-reduce(op, itr; kw...) = mapreduce(identity, op, itr; kw...)
+reduce(op, itr; init=_InitialValue()) = mapreduce(identity, op, itr; init)
+
+"""
+    reduce(f, A::AbstractArray; dims=:, [init])
 
-reduce(op, a::Number) = a  # Do we want this?
+Reduce 2-argument function `f` along dimensions of `A`. `dims` is a vector specifying the
+dimensions to reduce, and the keyword argument `init` is the initial value to use in the
+reductions. For `+`, `*`, `max` and `min` the `init` argument is optional.
+
+The associativity of the reduction is implementation-dependent; if you need a particular
+associativity, e.g. left-to-right, you should write your own loop or consider using
+[`foldl`](@ref) or [`foldr`](@ref). See documentation for [`reduce`](@ref).
+
+# Examples
+```jldoctest
+julia> a = reshape(Vector(1:16), (4,4))
+4×4 Matrix{Int64}:
+ 1  5   9  13
+ 2  6  10  14
+ 3  7  11  15
+ 4  8  12  16
+
+julia> reduce(max, a, dims=2)
+4×1 Matrix{Int64}:
+ 13
+ 14
+ 15
+ 16
+
+julia> reduce(max, a, dims=1)
+1×4 Matrix{Int64}:
+ 4  8  12  16
+```
+"""
+reduce(op, A::AbstractArrayOrBroadcasted; init=_InitialValue(), dims=(:)) = mapreduce(identity, op, A; init, dims)
 
 ###### Specific reduction functions ######
 
diff --git a/base/reducedim.jl b/base/reducedim.jl
index 5717e99d193de..fcb9edb3b130f 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -36,349 +36,185 @@ end
 
 ###### Generic reduction functions #####
 
-## initialization
-# initarray! is only called by sum!, prod!, etc.
-for (Op, initfun) in ((:(typeof(add_sum)), :zero), (:(typeof(mul_prod)), :one))
-    @eval initarray!(a::AbstractArray{T}, ::Any, ::$(Op), init::Bool, src::AbstractArray) where {T} = (init && fill!(a, $(initfun)(T)); a)
+# Given two indices or ranges, merge them by dimension akin to a broadcasted `ifelse` over the dims
+_sliceall(x) = x[begin]:x[end] # avoid instabilities with OneTos and offset axes
+_ifelseslice(b,x,y) = ifelse(b, _sliceall(x), _sliceall(y))
+mergeindices(b::NTuple{N,Bool}, x::CartesianIndices{N}, y::CartesianIndices{N}) where {N} =
+    CartesianIndices(map(_ifelseslice, b, x.indices, y.indices))
+mergeindices(b::NTuple{N,Bool}, x::CartesianIndex{N}, y::CartesianIndices{N}) where {N} =
+    CartesianIndices(map(_ifelseslice, b, x.I, y.indices))
+mergeindices(b::NTuple{N,Bool}, x::CartesianIndices{N}, y::CartesianIndex{N}) where {N} =
+    CartesianIndices(map(_ifelseslice, b, x.indices, y.I))
+mergeindices(b::NTuple{N,Bool}, x::CartesianIndex{N}, y::CartesianIndex{N}) where {N} =
+    CartesianIndex(map((b,x,y)->ifelse(b, x, y), b, x.I, y.I))
+
+keep_first_trues(::Tuple{}) = ()
+keep_first_trues(t) = t[1] ? (true, keep_first_trues(tail(t))...) : ntuple(Returns(false), length(t))
+
+mapreduce_similar(A, ::Type{T}, dims) where {T} = similar(A, T, dims)
+
+# These special internal types allow exposing both in- and out-of-place array initialization
+# as a comprehension-like function call with a generator
+struct _MapReduceAllocator{T}
+    itr::T
 end
-
-initarray!(a::AbstractArray{T}, f, ::Union{typeof(min),typeof(max),typeof(_extrema_rf)},
-    init::Bool, src::AbstractArray) where {T} = (init && mapfirst!(f, a, src); a)
-
-for (Op, initval) in ((:(typeof(and_all)), true), (:(typeof(or_any)), false))
-    @eval initarray!(a::AbstractArray, ::Any, ::$(Op), init::Bool, src::AbstractArray) = (init && fill!(a, $initval); a)
+_similar_for(c::_MapReduceAllocator, ::Type{T}, itr, ::SizeUnknown, ::Nothing) where {T} =
+    mapreduce_similar(c.itr, T, (0,))
+_similar_for(c::_MapReduceAllocator, ::Type{T}, itr, ::HasLength, len::Integer) where {T} =
+    mapreduce_similar(c.itr, T, (len,))
+_similar_for(c::_MapReduceAllocator, ::Type{T}, itr, ::HasShape, axs) where {T} =
+    mapreduce_similar(c.itr, T, axs)
+(mra::_MapReduceAllocator)(itr) = collect_similar(mra, itr)
+
+struct _MapReduceInPlace{T,F}
+    A::T
+    op::F
+    update::Bool
 end
-
-# reducedim_initarray is called by
-reducedim_initarray(A::AbstractArrayOrBroadcasted, region, init, ::Type{R}) where {R} = fill!(similar(A,R,reduced_indices(A,region)), init)
-reducedim_initarray(A::AbstractArrayOrBroadcasted, region, init::T) where {T} = reducedim_initarray(A, region, init, T)
-
-# TODO: better way to handle reducedim initialization
-#
-# The current scheme is basically following Steven G. Johnson's original implementation
-#
-promote_union(T::Union) = promote_type(promote_union(T.a), promote_union(T.b))
-promote_union(T) = T
-
-_realtype(::Type{<:Complex}) = Real
-_realtype(::Type{Complex{T}}) where T<:Real = T
-_realtype(T::Type) = T
-_realtype(::Union{typeof(abs),typeof(abs2)}, T) = _realtype(T)
-_realtype(::Any, T) = T
-
-function reducedim_init(f, op::Union{typeof(+),typeof(add_sum)}, A::AbstractArray, region)
-    _reducedim_init(f, op, zero, sum, A, region)
-end
-function reducedim_init(f, op::Union{typeof(*),typeof(mul_prod)}, A::AbstractArray, region)
-    _reducedim_init(f, op, one, prod, A, region)
-end
-function _reducedim_init(f, op, fv, fop, A, region)
-    T = _realtype(f, promote_union(eltype(A)))
-    if T !== Any && applicable(zero, T)
-        x = f(zero(T))
-        z = op(fv(x), fv(x))
-        Tr = z isa T ? T : typeof(z)
+function (mri::_MapReduceInPlace)(g::Generator{<:AbstractArray})
+    A = mri.A
+    if mri.update
+        for i in g.iter
+            A[i] = mri.op(A[i], g.f(i))
+        end
     else
-        z = fv(fop(f, A))
-        Tr = typeof(z)
+        for i in g.iter
+            A[i] = g.f(i)
+        end
     end
-    return reducedim_initarray(A, region, z, Tr)
+    return A
 end
-
-# initialization when computing minima and maxima requires a little care
-for (f1, f2, initval, typeextreme) in ((:min, :max, :Inf, :typemax), (:max, :min, :(-Inf), :typemin))
-    @eval function reducedim_init(f, op::typeof($f1), A::AbstractArray, region)
-        # First compute the reduce indices. This will throw an ArgumentError
-        # if any region is invalid
-        ri = reduced_indices(A, region)
-
-        # Next, throw if reduction is over a region with length zero
-        any(i -> isempty(axes(A, i)), region) && _empty_reduce_error()
-
-        # Make a view of the first slice of the region
-        A1 = view(A, ri...)
-
-        if isempty(A1)
-            # If the slice is empty just return non-view version as the initial array
-            return map(f, A1)
-        else
-            # otherwise use the min/max of the first slice as initial value
-            v0 = mapreduce(f, $f2, A1)
-
-            T = _realtype(f, promote_union(eltype(A)))
-            Tr = v0 isa T ? T : typeof(v0)
-
-            # but NaNs, missing and unordered values need to be avoided as initial values
-            if v0 isa Number && isnan(v0)
-                # v0 is NaN
-                v0 = oftype(v0, $initval)
-            elseif isunordered(v0)
-                # v0 is missing or a third-party unordered value
-                Tnm = nonmissingtype(Tr)
-                if Tnm <: Union{BitInteger, IEEEFloat, BigFloat}
-                    v0 = $typeextreme(Tnm)
-                elseif !all(isunordered, A1)
-                    v0 = mapreduce(f, $f2, Iterators.filter(!isunordered, A1))
-                end
-            end
-            # v0 may have changed type.
-            Tr = v0 isa T ? T : typeof(v0)
-
-            return reducedim_initarray(A, region, v0, Tr)
+function (mri::_MapReduceInPlace)(g::Generator{<:Iterators.Enumerate{<:AbstractArray}})
+    A = mri.A
+    if mri.update
+        for ki in g.iter
+            A[ki[2]] = mri.op(A[ki[2]], g.f(ki))
+        end
+    else
+        for ki in g.iter
+            A[ki[2]] = g.f(ki)
         end
     end
+    return A
 end
 
-function reducedim_init(f::ExtremaMap, op::typeof(_extrema_rf), A::AbstractArray, region)
-    # First compute the reduce indices. This will throw an ArgumentError
-    # if any region is invalid
-    ri = reduced_indices(A, region)
-
-    # Next, throw if reduction is over a region with length zero
-    any(i -> isempty(axes(A, i)), region) && _empty_reduce_error()
-
-    # Make a view of the first slice of the region
-    A1 = view(A, ri...)
-
-    isempty(A1) && return map(f, A1)
-    # use the max/min of the first slice as initial value for non-empty cases
-    v0 = reverse(mapreduce(f, op, A1)) # turn minmax to maxmin
-
-    T = _realtype(f.f, promote_union(eltype(A)))
-    Tmin = v0[1] isa T ? T : typeof(v0[1])
-    Tmax = v0[2] isa T ? T : typeof(v0[2])
-
-    # but NaNs and missing need to be avoided as initial values
-    if v0[1] isa Number && isnan(v0[1])
-        # v0 is NaN
-        v0 = oftype(v0[1], Inf), oftype(v0[2], -Inf)
-    elseif isunordered(v0[1])
-        # v0 is missing or a third-party unordered value
-        Tminnm = nonmissingtype(Tmin)
-        Tmaxnm = nonmissingtype(Tmax)
-        if Tminnm <: Union{BitInteger, IEEEFloat, BigFloat} &&
-            Tmaxnm <: Union{BitInteger, IEEEFloat, BigFloat}
-            v0 = (typemax(Tminnm), typemin(Tmaxnm))
-        elseif !all(isunordered, A1)
-            v0 = reverse(mapreduce(f, op, Iterators.filter(!isunordered, A1)))
-        end
+if isdefined(Core, :Compiler)
+    _mapreduce_might_widen(_, _, _, _, ::_MapReduceInPlace) = false
+    function _mapreduce_might_widen(f::F, op::G, A::T, init, _) where {F,G,T}
+        return !isconcretetype(Core.Compiler.return_type(x->op(_mapreduce_start(f, op, first(x), init), f(first(x))), Tuple{T}))
     end
-    # v0 may have changed type.
-    Tmin = v0[1] isa T ? T : typeof(v0[1])
-    Tmax = v0[2] isa T ? T : typeof(v0[2])
-
-    return reducedim_initarray(A, region, v0, Tuple{Tmin,Tmax})
+else
+    _mapreduce_might_widen(_, _, _, _, ::_MapReduceInPlace) = false
+    _mapreduce_might_widen(_, _, _, _, _) = true
 end
 
-reducedim_init(f::Union{typeof(abs),typeof(abs2)}, op::typeof(max), A::AbstractArray{T}, region) where {T} =
-    reducedim_initarray(A, region, zero(f(zero(T))), _realtype(f, T))
-
-reducedim_init(f, op::typeof(and_all), A::AbstractArrayOrBroadcasted, region) = reducedim_initarray(A, region, true)
-reducedim_init(f, op::typeof(or_any), A::AbstractArrayOrBroadcasted, region) = reducedim_initarray(A, region, false)
-
-# These definitions are wrong in general; Cf. JuliaLang/julia#45562
-reducedim_init(f, op::typeof(&), A::AbstractArrayOrBroadcasted, region) = reducedim_initarray(A, region, true)
-reducedim_init(f, op::typeof(|), A::AbstractArrayOrBroadcasted, region) = reducedim_initarray(A, region, false)
-
-# specialize to make initialization more efficient for common cases
-
-let
-    BitIntFloat = Union{BitInteger, IEEEFloat}
-    T = Union{
-        Any[AbstractArray{t} for t in uniontypes(BitIntFloat)]...,
-        Any[AbstractArray{Complex{t}} for t in uniontypes(BitIntFloat)]...}
-
-    global function reducedim_init(f, op::Union{typeof(+),typeof(add_sum)}, A::T, region)
-        z = zero(f(zero(eltype(A))))
-        reducedim_initarray(A, region, op(z, z))
+function mapreducedim(f::F, op::OP, A, init, dims, alloc=_MapReduceAllocator(A)) where {F, OP}
+    # It's advantageous to additionally consider singleton dimensions as part of the
+    # reduction as that allows more cases to be considered contiguous
+    is_inner_dim = ntuple(d->d in dims || size(A, d) == 1, ndims(A))
+    inner = CartesianIndices(map((b,ax)->b ? ax : reduced_index(ax), is_inner_dim, axes(A)))
+    outer = CartesianIndices(reduced_indices(A, dims))
+    n = length(inner)
+    # Handle the empty and trivial 1-element cases:
+    if (n == 0 || isempty(A))
+        # This is broken out of the comprehension to ensure it's called, avoiding an empty Vector{Union{}}
+        v = _mapreduce_start(f, op, A, init)
+        return alloc(v for _ in outer)
     end
-    global function reducedim_init(f, op::Union{typeof(*),typeof(mul_prod)}, A::T, region)
-        u = one(f(one(eltype(A))))
-        reducedim_initarray(A, region, op(u, u))
+    n == 1 && return alloc(_mapreduce_start(f, op, A, init, A[i]) for i in outer)
+    # Now there are multiple loop ordering strategies depending upon the `dims`:
+    if is_inner_dim == keep_first_trues(is_inner_dim) || _mapreduce_might_widen(f, op, A, init, alloc)
+        # Column major contiguous reduction! This is the easy case
+        return mapreducedim_naive(f, op, A, init, is_inner_dim, inner, outer, alloc)
+    elseif 1 in dims
+        return mapreducedim_colmajor(f, op, A, init, is_inner_dim, inner, outer, alloc)
+    else
+        return mapreducedim_rowmajor(f, op, A, init, is_inner_dim, inner, outer, alloc)
     end
 end
 
-## generic (map)reduction
-
-has_fast_linear_indexing(a::AbstractArrayOrBroadcasted) = IndexStyle(a) === IndexLinear()
-has_fast_linear_indexing(a::AbstractVector) = true
-
-function check_reducedims(R, A)
-    # Check whether R has compatible dimensions w.r.t. A for reduction
-    #
-    # It returns an integer value (useful for choosing implementation)
-    # - If it reduces only along leading dimensions, e.g. sum(A, dims=1) or sum(A, dims=(1,2)),
-    #   it returns the length of the leading slice. For the two examples above,
-    #   it will be size(A, 1) or size(A, 1) * size(A, 2).
-    # - Otherwise, e.g. sum(A, dims=2) or sum(A, dims=(1,3)), it returns 0.
-    #
-    ndims(R) <= ndims(A) || throw(DimensionMismatch("cannot reduce $(ndims(A))-dimensional array to $(ndims(R)) dimensions"))
-    lsiz = 1
-    had_nonreduc = false
-    for i = 1:ndims(A)
-        Ri, Ai = axes(R, i), axes(A, i)
-        sRi, sAi = length(Ri), length(Ai)
-        if sRi == 1
-            if sAi > 1
-                if had_nonreduc
-                    lsiz = 0  # to reduce along i, but some previous dimensions were non-reducing
-                else
-                    lsiz *= sAi  # if lsiz was set to zero, it will stay to be zero
-                end
-            end
-        else
-            Ri == Ai || throw(DimensionMismatch("reduction on array with indices $(axes(A)) with output with indices $(axes(R))"))
-            had_nonreduc = true
-        end
-    end
-    return lsiz
+linear_size(A, is_inner_dim) = linear_size(IndexStyle(A), A, is_inner_dim)
+linear_size(::IndexLinear, A, is_inner_dim) = if is_inner_dim == keep_first_trues(is_inner_dim)
+    prod(map((b,sz)->ifelse(b, sz, 1), is_inner_dim, size(A)))
+else
+    0
 end
+linear_size(::IndexStyle, _, _) = 0
 
-"""
-Extract first entry of slices of array A into existing array R.
-"""
-copyfirst!(R::AbstractArray, A::AbstractArray) = mapfirst!(identity, R, A)
-
-function mapfirst!(f::F, R::AbstractArray, A::AbstractArray{<:Any,N}) where {N, F}
-    lsiz = check_reducedims(R, A)
-    t = _firstreducedslice(axes(R), axes(A))
-    map!(f, R, view(A, t...))
+function mapreducedim_naive(f::F, op::OP, A, init, is_inner_dim, inner, outer, alloc) where {F, OP}
+    lsiz = linear_size(A, is_inner_dim)
+    if lsiz > 0
+        i0 = first(LinearIndices(A))
+        alloc(mapreduce_pairwise(f, op, A, init, (i0:lsiz) .+ (lsiz*(i-1))) for (i,_) in enumerate(outer))
+    else
+        alloc(mapreduce_pairwise(f, op, A, init, mergeindices(is_inner_dim, inner, i)) for i in outer)
+    end
 end
-# We know that the axes of R and A are compatible, but R might have a different number of
-# dimensions than A, which is trickier than it seems due to offset arrays and type stability
-_firstreducedslice(::Tuple{}, a::Tuple{}) = ()
-_firstreducedslice(::Tuple, ::Tuple{}) = ()
-@inline _firstreducedslice(::Tuple{}, a::Tuple) = (_firstslice(a[1]), _firstreducedslice((), tail(a))...)
-@inline _firstreducedslice(r::Tuple, a::Tuple) = (length(r[1])==1 ? _firstslice(a[1]) : r[1], _firstreducedslice(tail(r), tail(a))...)
-_firstslice(i::OneTo) = OneTo(1)
-_firstslice(i::Slice) = Slice(_firstslice(i.indices))
-_firstslice(i) = i[firstindex(i):firstindex(i)]
-
-function _mapreducedim!(f, op, R::AbstractArray, A::AbstractArrayOrBroadcasted)
-    lsiz = check_reducedims(R,A)
-    isempty(A) && return R
-
-    if has_fast_linear_indexing(A) && lsiz > 16
-        # use mapreduce_impl, which is probably better tuned to achieve higher performance
-        nslices = div(length(A), lsiz)
-        ibase = first(LinearIndices(A))-1
-        for i in eachindex(R)
-            r = op(@inbounds(R[i]), mapreduce_impl(f, op, A, ibase+1, ibase+lsiz))
-            @inbounds R[i] = r
-            ibase += lsiz
-        end
-        return R
+function mapreducedim_colmajor(f::F, op::OP, A, init, is_inner_dim, inner, outer, alloc, enforce_pairwise=true) where {F, OP}
+    is_contiguous_inner = keep_first_trues(is_inner_dim)
+    contiguous_inner = mergeindices(is_contiguous_inner, inner, first(outer))
+    discontiguous_inner = mergeindices(is_contiguous_inner, first(outer), inner)
+    if enforce_pairwise && length(discontiguous_inner) > pairwise_blocksize(f, op)
+        return mapreducedim_naive(f, op, A, init, is_inner_dim, inner, outer, alloc)
     end
-    indsAt, indsRt = safe_tail(axes(A)), safe_tail(axes(R)) # handle d=1 manually
-    keep, Idefault = Broadcast.shapeindexer(indsRt)
-    if reducedim1(R, A)
-        # keep the accumulator as a local variable when reducing along the first dimension
-        i1 = first(axes1(R))
-        for IA in CartesianIndices(indsAt)
-            IR = Broadcast.newindex(IA, keep, Idefault)
-            @inbounds r = R[i1,IR]
-            @simd for i in axes(A, 1)
-                r = op(r, f(@inbounds(A[i, IA])))
-            end
-            @inbounds R[i1,IR] = r
-        end
-    else
-        for IA in CartesianIndices(indsAt)
-            IR = Broadcast.newindex(IA, keep, Idefault)
-            @simd for i in axes(A, 1)
-                v = op(@inbounds(R[i,IR]), f(@inbounds(A[i,IA])))
-                @inbounds R[i,IR] = v
-            end
+    # Initialize our reduction with the result of doing the first contiguous reduction
+    R = alloc(mapreduce_pairwise(f, op, A, init, mergeindices(is_inner_dim, contiguous_inner, o)) for o in outer)
+    for dci in Iterators.drop(discontiguous_inner, 1)
+        for o in outer
+            i = mergeindices(is_inner_dim, dci, o)
+            R[o] = op(R[o], mapreduce_pairwise(f, op, A, init, mergeindices(is_contiguous_inner, contiguous_inner, i)))
         end
     end
     return R
 end
 
-mapreducedim!(f, op, R::AbstractArray, A::AbstractArrayOrBroadcasted) =
-    (_mapreducedim!(f, op, R, A); R)
-
-reducedim!(op, R::AbstractArray{RT}, A::AbstractArrayOrBroadcasted) where {RT} =
-    mapreducedim!(identity, op, R, A)
+function mapreducedim_rowmajor(f::F, op::OP, A, init, is_inner_dim, inner, outer, alloc, enforce_pairwise=true) where {F, OP}
+    # This will only be cache-optimal in the fully "backwards" case — that is, dims are `twoplus:ndims(A)`
+    # And even then, it's simply not possible to do this cache-optimally with a pairwise reassociation;
+    # this is effectively a foldl.
+    if enforce_pairwise && length(inner) > pairwise_blocksize(f, op)
+        return mapreducedim_naive(f, op, A, init, is_inner_dim, inner, outer, alloc)
+    end
+    # Take one element from each outer iteration to initialize R
+    R = alloc(_mapreduce_start(f, op, A, init, A[o]) for o in outer)
+    for i in Iterators.drop(inner, 1)
+        @simd for o in outer
+            # SIMD still helps here! It doesn't reassociate _within_ each reduction, but it can allow
+            # evaluations _across_ the multiple reductions to coalesce into a single instruction at times
+            iA = mergeindices(is_inner_dim, i, o)
+            v = op(@inbounds(R[o]), f(@inbounds(A[iA])))
+            @inbounds R[o] = v
+        end
+    end
+    return R
+end
 
 """
-    mapreduce(f, op, A::AbstractArray...; dims=:, [init])
+    mapreduce!(f, op, R::AbstractArray, A::AbstractArrayOrBroadcasted; [init])
 
-Evaluates to the same as `reduce(op, map(f, A...); dims=dims, init=init)`, but is generally
-faster because the intermediate array is avoided.
+Compute `mapreduce(f, op, A; init, dims)` where `dims` are the singleton dimensions of `R`, storing the result into `R`.
 
-!!! compat "Julia 1.2"
-    `mapreduce` with multiple iterators requires Julia 1.2 or later.
-
-# Examples
-```jldoctest
-julia> a = reshape(Vector(1:16), (4,4))
-4×4 Matrix{Int64}:
- 1  5   9  13
- 2  6  10  14
- 3  7  11  15
- 4  8  12  16
-
-julia> mapreduce(isodd, *, a, dims=1)
-1×4 Matrix{Bool}:
- 0  0  0  0
-
-julia> mapreduce(isodd, |, a, dims=1)
-1×4 Matrix{Bool}:
- 1  1  1  1
-```
+!!! note
+    The previous values in `R` are _not_ used as initial values; they are completely ignored
 """
-mapreduce(f, op, A::AbstractArrayOrBroadcasted; dims=:, init=_InitialValue()) =
-    _mapreduce_dim(f, op, init, A, dims)
-mapreduce(f, op, A::AbstractArrayOrBroadcasted, B::AbstractArrayOrBroadcasted...; kw...) =
-    reduce(op, map(f, A, B...); kw...)
-
-_mapreduce_dim(f, op, nt, A::AbstractArrayOrBroadcasted, ::Colon) =
-    mapfoldl_impl(f, op, nt, A)
-
-_mapreduce_dim(f, op, ::_InitialValue, A::AbstractArrayOrBroadcasted, ::Colon) =
-    _mapreduce(f, op, IndexStyle(A), A)
-
-_mapreduce_dim(f, op, nt, A::AbstractArrayOrBroadcasted, dims) =
-    mapreducedim!(f, op, reducedim_initarray(A, dims, nt), A)
-
-function _mapreduce_dim(f, op, ::_InitialValue, A::AbstractArrayOrBroadcasted, dims)
-    isempty(A) && return fill(mapreduce_empty(f, op, eltype(A)), reduced_indices(A, dims))
-    mapreducedim!(f, op, reducedim_init(f, op, A, dims), A)
+function mapreduce!(f, op, R::AbstractArray, A::AbstractArrayOrBroadcasted; init=_InitialValue(), update=false)
+    if ndims(R) > ndims(A) || !all(d->size(R, d) in (1, size(A, d)), 1:max(ndims(R), ndims(A)))
+        throw(DimensionMismatch())
+    end
+    is_inner_dim = ntuple(d->size(R, d)==1, ndims(A))
+    dims = findall(is_inner_dim) #TODO: do better
+    return mapreducedim(f, op, A, init, dims, _MapReduceInPlace(R, op, update))
 end
 
 """
-    reduce(f, A::AbstractArray; dims=:, [init])
+    reduce!(op, R::AbstractArray, A::AbstractArrayOrBroadcasted; [init])
 
-Reduce 2-argument function `f` along dimensions of `A`. `dims` is a vector specifying the
-dimensions to reduce, and the keyword argument `init` is the initial value to use in the
-reductions. For `+`, `*`, `max` and `min` the `init` argument is optional.
+Compute `reduce(op, A; init, dims)` where `dims` are the singleton dimensions of `R`, storing the result into `R`.
 
-The associativity of the reduction is implementation-dependent; if you need a particular
-associativity, e.g. left-to-right, you should write your own loop or consider using
-[`foldl`](@ref) or [`foldr`](@ref). See documentation for [`reduce`](@ref).
-
-# Examples
-```jldoctest
-julia> a = reshape(Vector(1:16), (4,4))
-4×4 Matrix{Int64}:
- 1  5   9  13
- 2  6  10  14
- 3  7  11  15
- 4  8  12  16
-
-julia> reduce(max, a, dims=2)
-4×1 Matrix{Int64}:
- 13
- 14
- 15
- 16
-
-julia> reduce(max, a, dims=1)
-1×4 Matrix{Int64}:
- 4  8  12  16
-```
+!!! note
+    The previous values in `R` are _not_ used as initial values; they are completely ignored
 """
-reduce(op, A::AbstractArray; kw...) = mapreduce(identity, op, A; kw...)
+reduce!(op, R::AbstractArray, A::AbstractArrayOrBroadcasted; init=_InitialValue(), update=false) = mapreduce!(identity, op, R, A; init, update)
 
 ##### Specific reduction functions #####
 
@@ -447,7 +283,7 @@ julia> count!(<=(2), [1; 1], A)
 """
 count!(r::AbstractArray, A::AbstractArrayOrBroadcasted; init::Bool=true) = count!(identity, r, A; init=init)
 count!(f, r::AbstractArray, A::AbstractArrayOrBroadcasted; init::Bool=true) =
-    mapreducedim!(_bool(f), add_sum, initarray!(r, f, add_sum, init, A), A)
+    mapreduce!(_bool(f), add_sum, r, A; update=!init)
 
 """
     sum(A::AbstractArray; dims)
@@ -1007,7 +843,7 @@ for (fname, op) in [(:sum, :add_sum), (:prod, :mul_prod),
     mapf = fname === :extrema ? :(ExtremaMap(f)) : :f
     @eval begin
         $(fname!)(f::Function, r::AbstractArray, A::AbstractArray; init::Bool=true) =
-            mapreducedim!($mapf, $(op), initarray!(r, $mapf, $(op), init, A), A)
+            mapreduce!($mapf, $op, r, A; update=!init)
         $(fname!)(r::AbstractArray, A::AbstractArray; init::Bool=true) = $(fname!)(identity, r, A; init=init)
 
         $(_fname)(A, dims; kw...)    = $(_fname)(identity, A, dims; kw...)
@@ -1068,6 +904,62 @@ function findminmax!(f, op, Rval, Rind, A::AbstractArray{T,N}) where {T,N}
     Rval, Rind
 end
 
+struct PairsArray{T,N,A} <: AbstractArray{T,N}
+    array::A
+end
+PairsArray(array::AbstractArray{T,N}) where {T, N} = PairsArray{Tuple{keytype(array),T}, N, typeof(array)}(array)
+const PairsVector{T,A} = PairsArray{T, 1, A}
+IndexStyle(::PairsVector) = IndexLinear()
+IndexStyle(::PairsArray) = IndexCartesian()
+size(P::PairsArray) = size(P.array)
+axes(P::PairsArray) = axes(P.array)
+@inline function getindex(P::PairsVector, i::Int)
+    @boundscheck checkbounds(P, i)
+    @inbounds (i, P.array[i])
+end
+@inline function getindex(P::PairsArray{<:Any,N}, I::CartesianIndex{N}) where {N}
+    @boundscheck checkbounds(P, I)
+    @inbounds (I, P.array[I])
+end
+@propagate_inbounds getindex(P::PairsVector, i::CartesianIndex{1}) = P[i.I[1]]
+@propagate_inbounds getindex(P::PairsArray{<:Any,N}, I::Vararg{Int, N}) where {N} = P[CartesianIndex(I)]
+# Defer a few functions to the parent array
+similar(P::PairsArray, ::Type{T}, dims::Tuple{Union{Int, AbstractUnitRange}, Vararg{Union{Int, AbstractUnitRange}}}) where {T} = similar(P.array, T, dims)
+similar(P::PairsArray, ::Type{T}, dims::Tuple{Union{Int, Base.OneTo}, Vararg{Union{Int, Base.OneTo}}}) where {T} = similar(P.array, T, dims)
+similar(P::PairsArray, ::Type{T}, dims::Tuple{Int, Vararg{Int}}) where {T} = similar(P.array, T, dims)
+similar(P::PairsArray, ::Type{T}, dims::Tuple{}) where {T} = similar(P.array, T, dims)
+mapreduce_similar(P::PairsArray, ::Type{T}, dims) where {T} = mapreduce_similar(P.array, T, dims)
+
+# Use an ad-hoc specialized StructArray to allow in-place AoS->SoA transform
+struct ZippedArray{T,N,Style,A,B} <: AbstractArray{T,N}
+    first::A
+    second::B
+end
+function ZippedArray(A::AbstractArray{T,N},B::AbstractArray{S,N}) where {T,S,N}
+    axes(A) == axes(B) || throw(DimensionMismatch("both arrays must have the same shape"))
+    # TODO: It'd be better if we could transform a Tuple{Int, Union{Int, Missing}} to Union{Tuple{Int,Int}, Tuple{Int, Missing}}
+    ZippedArray{Tuple{T,S},N,IndexStyle(A,B),typeof(A),typeof(B)}(A,B)
+end
+size(Z::ZippedArray) = size(Z.first)
+axes(Z::ZippedArray) = axes(Z.first)
+IndexStyle(::ZippedArray{<:Any,<:Any,Style}) where {Style} = Style
+@inline function getindex(Z::ZippedArray, I::Int...)
+    @boundscheck checkbounds(Z, I...)
+    @inbounds (Z.first[I...], Z.second[I...])
+end
+@propagate_inbounds setindex!(Z::ZippedArray{T}, v, I::Int...) where {T} = setindex!(Z, convert(T, v), I...)
+@inline function setindex!(Z::ZippedArray{T}, v::T, I::Int...) where {T}
+    @boundscheck checkbounds(Z, I...)
+    @inbounds Z.first[I...] = v[1]
+    @inbounds Z.second[I...] = v[2]
+    return Z
+end
+_unzip(Z::ZippedArray) = (Z.first, Z.second)
+_unzip(A::AbstractArray) = ([a[1] for a in A], [a[2] for a in A])
+
+_transform_pair(f) = x-> (x[1], f(x[2]))
+_transform_pair(f::typeof(identity)) = f
+
 """
     findmin!(rval, rind, A) -> (minval, index)
 
@@ -1077,9 +969,11 @@ dimensions of `rval` and `rind`, and store the results in `rval` and `rind`.
 
 $(_DOCS_ALIASING_WARNING)
 """
-function findmin!(rval::AbstractArray, rind::AbstractArray, A::AbstractArray;
+findmin!(rval::AbstractArray, rind::AbstractArray, A::AbstractArray; init::Bool=true) = findmin!(identity, rval, rind, A; init)
+function findmin!(f, rval::AbstractArray, rind::AbstractArray, A::AbstractArray;
                   init::Bool=true)
-    findminmax!(identity, isgreater, init && !isempty(A) ? fill!(rval, first(A)) : rval, fill!(rind,zero(eltype(keys(A)))), A)
+    mapreduce!(_transform_pair(f), (x,y)->ifelse(isgreater(x[2], y[2]), y, x), ZippedArray(rind, rval), PairsArray(A); update=!init)
+    return (rval, rind)
 end
 
 """
@@ -1128,16 +1022,16 @@ julia> findmin(abs2, A, dims=2)
 findmin(f, A::AbstractArray; dims=:) = _findmin(f, A, dims)
 
 function _findmin(f, A, region)
-    ri = reduced_indices0(A, region)
-    if isempty(A)
-        _empty_reduce_error()
+    if f === identity
+        # Fast path with pre-allocated arrays
+        axs = reduced_indices(A, region)
+        return findmin!(identity, mapreduce_similar(A, eltype(A), axs), mapreduce_similar(A, keytype(A), axs), A)
     else
-        fA = f(first(A))
-        findminmax!(f, isgreater, fill!(similar(A, _findminmax_inittype(f, A), ri), fA),
-                    zeros(eltype(keys(A)), ri), A)
+        P = mapreduce(_transform_pair(f), (x,y)->ifelse(isgreater(x[2], y[2]), y, x), PairsArray(A); dims=region)
+        (inds, vals) = _unzip(P)
+        return (vals, inds)
     end
 end
-
 """
     findmax!(rval, rind, A) -> (maxval, index)
 
@@ -1147,9 +1041,11 @@ dimensions of `rval` and `rind`, and store the results in `rval` and `rind`.
 
 $(_DOCS_ALIASING_WARNING)
 """
-function findmax!(rval::AbstractArray, rind::AbstractArray, A::AbstractArray;
+findmax!(rval::AbstractArray, rind::AbstractArray, A::AbstractArray; init::Bool=true) = findmax!(identity, rval, rind, A; init)
+function findmax!(f, rval::AbstractArray, rind::AbstractArray, A::AbstractArray;
                   init::Bool=true)
-    findminmax!(identity, isless, init && !isempty(A) ? fill!(rval, first(A)) : rval, fill!(rind,zero(eltype(keys(A)))), A)
+    mapreduce!(_transform_pair(f), (x,y)->ifelse(isless(x[2], y[2]), y, x), ZippedArray(rind, rval), PairsArray(A); update=!init)
+    return (rval, rind)
 end
 
 """
@@ -1196,30 +1092,17 @@ julia> findmax(abs2, A, dims=2)
 ```
 """
 findmax(f, A::AbstractArray; dims=:) = _findmax(f, A, dims)
-
 function _findmax(f, A, region)
-    ri = reduced_indices0(A, region)
-    if isempty(A)
-        _empty_reduce_error()
+    if f === identity
+        axs = reduced_indices(A, region)
+        return findmax!(identity, mapreduce_similar(A, eltype(A), axs), mapreduce_similar(A, keytype(A), axs), A)
     else
-        fA = f(first(A))
-        findminmax!(f, isless, fill!(similar(A, _findminmax_inittype(f, A), ri), fA),
-                    zeros(eltype(keys(A)), ri), A)
+        P = mapreduce(_transform_pair(f), (x,y)->ifelse(isless(x[2], y[2]), y, x), PairsArray(A); dims=region)
+        (inds, vals) = _unzip(P)
+        return (vals, inds)
     end
 end
 
-function _findminmax_inittype(f, A::AbstractArray)
-    T = _realtype(f, promote_union(eltype(A)))
-    v0 = f(first(A))
-    # First conditional: T is >: typeof(v0), so return it
-    # Second conditional: handle missing specifically, as most often, f(missing) = missing;
-    # certainly, some predicate functions return Bool, but not all.
-    # Else, return the type of the transformation.
-    Tr = v0 isa T ? T : Missing <: eltype(A) ? Union{Missing, typeof(v0)} : typeof(v0)
-end
-
-reducedim1(R, A) = length(axes1(R)) == 1
-
 """
     argmin(A; dims) -> indices
 
diff --git a/test/reduce.jl b/test/reduce.jl
index 329f18149b361..13be28e57c31b 100644
--- a/test/reduce.jl
+++ b/test/reduce.jl
@@ -771,3 +771,60 @@ end
         Val(any(in((:one,:two,:three)),(:four,:three)))
     end |> only == Val{true}
 end
+
+@testset "type stability of internals for mean (etc); Statistics#160" begin
+    @test (@inferred Missing mapreduce(identity, (x,y)->Base.add_sum(x, y)/1, view([1.0,2.0,missing],1:2), dims=:)) == 3.0
+    @test (@inferred Missing mapreduce(identity, (x,y)->Base.add_sum(x, y)/1, view([1,2,missing],1:2), dims=:, init=0.0)) == 3.0
+    @test (@inferred Union{Missing,Int} mapreduce(identity, (x,y)->Base.add_sum(x, y)/1, view([1,2,missing],1:2), dims=:, init=0)) == 3.0
+    @test_broken (@inferred Union{Missing,Int} mapreduce(identity, (x,y)->Base.add_sum(x, y)/1, view([1,2,missing],1:2), dims=:)) == 3.0
+
+    @test (@inferred Missing reduce((x,y)->Base.add_sum(x, y)/1, view([1.0,2.0,missing],1:2), dims=:)) == 3.0
+    @test (@inferred Missing reduce((x,y)->Base.add_sum(x, y)/1, view([1,2,missing],1:2), dims=:, init=0.0)) == 3.0
+    @test (@inferred Union{Missing,Int} reduce((x,y)->Base.add_sum(x, y)/1, view([1,2,missing],1:2), dims=:, init=0)) == 3.0
+    @test_broken (@inferred Union{Missing,Int} reduce((x,y)->Base.add_sum(x, y)/1, view([1,2,missing],1:2), dims=:)) == 3.0
+
+    @test (@inferred Missing mapreduce(identity, (x,y)->Base.add_sum(x, y)/1, skipmissing(view([1.0,2.0,missing],1:2)))) == 3.0
+    @test (@inferred Missing mapreduce(identity, (x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)), init=0.0)) == 3.0
+    @test (@inferred Union{Missing,Int} mapreduce(identity, (x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)), init=0)) == 3.0
+    @test (@inferred Union{Missing,Int} mapreduce(identity, (x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)))) == 3.0
+
+    @test (@inferred Missing reduce((x,y)->Base.add_sum(x, y)/1, skipmissing(view([1.0,2.0,missing],1:2)))) == 3.0
+    @test (@inferred Missing reduce((x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)), init=0.0)) == 3.0
+    @test (@inferred Union{Missing,Int} reduce((x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)), init=0)) == 3.0
+    @test (@inferred Union{Missing,Int} reduce((x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)))) == 3.0
+end
+
+linear(x) = (@assert(IndexStyle(x) == IndexLinear()); x)
+cartesian(x) = (y = view(x, axes(x)...); @assert(IndexStyle(y) isa IndexCartesian); y)
+iterlen(x) = (y = Iterators.take(x, length(x)); @assert(Base.IteratorSize(y) isa Union{Base.HasLength, Base.HasShape}); y)
+iterunknown(x) = (y = skipmissing(convert(Array{Union{Missing, eltype(x)}}, x)); @assert(Base.IteratorSize(y) isa Base.SizeUnknown); y)
+# Issues #29883, #52365, #30421, #52457
+@testset "all floating point reductions are pairwise" begin
+    ulp = nextfloat(1f0)-1f0
+    sz = (100000,10)
+    len = prod(sz)
+    arr = fill(0.1f0, sz)
+    @test foldl(+, arr) === 100958.34f0 # Should not SIMD, should be exact
+    @test !(foldl(+, arr) ≈ len/10)
+    @test !isapprox(foldl(+, arr), len/10; rtol=100ulp)
+    @test sum(arr) ≈ len/10
+    @test sum(arr) ≈ len/10 rtol=100ulp
+    for f in (linear, cartesian, iterlen, iterunknown)
+        A = f(arr)
+        @test sum(A) ≈ (len/10) rtol=100ulp
+        @test sum(A; init=0.0) ≈ (len/10) rtol=100ulp
+        if A isa AbstractArray
+            @test only(unique(sum(A; dims=1))) ≈ (sz[1]/10) rtol=100ulp
+            @test only(unique(sum(permutedims(A); dims=2))) ≈ (sz[1]/10) rtol=100ulp
+            @test only(unique(sum(A; dims=1, init=0.0))) ≈ (sz[1]/10) rtol=100ulp
+            @test only(unique(sum(permutedims(A); dims=2, init=0.0))) ≈ (sz[1]/10) rtol=100ulp
+        end
+    end
+
+    @test sum((rand(Float32) for _ in 1:100000000))/100000000 ≈ 0.5
+
+    rng = MersenneTwister(630);
+    v = randn(rng, Float16, 1000)
+    sa = @view v[collect(1:end)]
+    @test sum(sa) ≈ sum(v)
+end
diff --git a/test/reducedim.jl b/test/reducedim.jl
index fd8d003f33b2c..f4c53a2a47e49 100644
--- a/test/reducedim.jl
+++ b/test/reducedim.jl
@@ -1,6 +1,8 @@
 # This file is a part of Julia. License is MIT: https://julialang.org/license
 
 using Random
+isdefined(Main, :OffsetArrays) || @eval Main include("testhelpers/OffsetArrays.jl")
+using .Main.OffsetArrays: OffsetVector, OffsetArray
 
 # main tests
 
@@ -173,7 +175,7 @@ end
 
     x = sum(Real[1.0], dims=1)
     @test x == [1.0]
-    @test x isa Vector{Real}
+    @test x isa Vector{<:Real}
 
     x = mapreduce(cos, +, Union{Int,Missing}[1, 2], dims=1)
     @test x == mapreduce(cos, +, [1, 2], dims=1)
@@ -552,6 +554,14 @@ end
     end
 end
 
+f44906(x) = maximum(f44906, x); f44906(x::Number) = x
+g44906(x) = mapreduce(g44906, max, x); g44906(x::Number) = x
+@testset "inference with minimum/maximum, issue #44906" begin
+    x = [[1, 2], [3, 4]];
+    @test @inferred(f44906(x)) == 4
+    @test @inferred(g44906(x)) == 4
+end
+
 # issue #6672
 @test sum(Real[1 2 3; 4 5.3 7.1], dims=2) == reshape([6, 16.4], 2, 1)
 @test sum(Any[1 2;3 4], dims=1) == [4 6]
@@ -606,11 +616,7 @@ end
         @test_throws BoundsError mapreduce(f, +, arr; dims)
         @test_throws BoundsError mapreduce(f, +, arr; dims, init=0)
         @test_throws BoundsError mapreduce(identity, op, arr)
-        try
-            #=@test_throws BoundsError=# mapreduce(identity, op, arr; dims)
-        catch ex
-            @test_broken ex isa BoundsError
-        end
+        @test_throws BoundsError mapreduce(identity, op, arr; dims)
         @test_throws BoundsError mapreduce(identity, op, arr; dims, init=0)
 
         @test_throws BoundsError findmin(f, arr)
@@ -645,7 +651,7 @@ function unordered_test_for_extrema(a; dims_test = ((), 1, 2, (1,2), 3))
     for dims in dims_test
         vext = extrema(a; dims)
         vmin, vmax = minimum(a; dims), maximum(a; dims)
-        @test isequal(extrema!(copy(vext), a), vext)
+        @test isequal(extrema!(similar(vext, NTuple{2, eltype(a)}), a), vext)
         @test all(x -> isequal(x[1], x[2:3]), zip(vext,vmin,vmax))
     end
 end
@@ -766,3 +772,261 @@ reduced_shape(sz, dims) = ntuple(d -> d in dims ? 1 : sz[d], length(sz))
         end
     end
 end
+
+some_exception(op) = try return (Some(op()), nothing); catch ex; return (nothing, ex); end
+reduced_shape(sz, dims) = ntuple(d -> d in dims ? 1 : sz[d], length(sz))
+
+@testset "Ensure that calling, e.g., sum(empty; dims) has the same behavior as sum(empty)" begin
+    @testset "$r(Array{$T}(undef, $sz); dims=$dims)" for
+        r in (minimum, maximum, findmin, findmax, extrema, sum, prod, all, any, count),
+        T in (Int, Union{Missing, Int}, Number, Union{Missing, Number}, Bool, Union{Missing, Bool}, Any),
+        sz in ((0,), (0,1), (1,0), (0,0), (0,0,1), (1,0,1)),
+        dims in (1, 2, 3, 4, (1,2), (1,3), (2,3,4), (1,2,3))
+
+        A = Array{T}(undef, sz)
+        rsz = reduced_shape(sz, dims)
+
+        v, ex = some_exception() do; r(A); end
+        if isnothing(v)
+            @test_throws typeof(ex) r(A; dims)
+        else
+            actual = fill(something(v), rsz)
+            @test isequal(r(A; dims), actual)
+            @test eltype(r(A; dims)) === eltype(actual)
+        end
+
+        for f in (identity, abs, abs2)
+            v, ex = some_exception() do; r(f, A); end
+            if isnothing(v)
+                @test_throws typeof(ex) r(f, A; dims)
+            else
+                actual = fill(something(v), rsz)
+                @test isequal(r(f, A; dims), actual)
+                @test eltype(r(f, A; dims)) === eltype(actual)
+            end
+        end
+    end
+end
+
+@testset "generic sum reductions; issue #31427" begin
+    add31427(a, b) = a+b
+    A = rand(1:10, 5, 5)
+    @test reduce(add31427, A, dims = (1, 2)) == reduce(+, A, dims = (1, 2)) == [sum(A);;]
+
+    As = [rand(5, 4) for _ in 1:2, _ in 1:3]
+    @test reduce(hcat, reduce(vcat, As, dims=1)) == [As[1,1] As[1,2] As[1,3]; As[2,1] As[2,2] As[2,3]]
+end
+
+@testset "sum with `missing`s; issue #55213 and #32366" begin
+    @test isequal(sum([0.0 1; 0.0 missing], dims=2), [1.0; missing;;])
+    @test sum([0.0 1; 0.0 missing], dims=2)[1] === 1.0
+    @test isequal(sum(Any[0.0 1; 0.0 missing], dims=2), [1.0; missing;;])
+    @test sum(Any[0.0 1; 0.0 missing], dims=2)[1] === 1.0
+
+    @test isequal(sum([1 0.0; 0.0 missing], dims=1), [1.0 missing])
+    @test sum([1 0.0; 0.0 missing], dims=1)[1] === 1.0
+    @test isequal(sum(Any[1 0.0; 0.0 missing], dims=1), [1.0 missing])
+    @test sum(Any[1 0.0; 0.0 missing], dims=1)[1] === 1.0
+
+    @test isequal(sum([true false missing], dims=2), sum([1 0 missing], dims=2))
+    @test isequal(sum([true false missing], dims=2), [sum([true false missing]);;])
+    @test isequal(sum([true false missing], dims=2), [missing;;])
+end
+
+@testset "issues #45566 and #47231; initializers" begin
+    @test reduce(gcd, [1]; dims=1) == [reduce(gcd, [1])] == [1]
+
+    x = reshape(1:6, 2, 3)
+    @test reduce(+, x, dims=2) == [9;12;;]
+    @test reduce(-, x, dims=1) == [1-2 3-4 5-6]
+    @test reduce(/, x, dims=1) == [1/2 3/4 5/6]
+    @test reduce(^, x, dims=1) == [1^2 3^4 5^6]
+    # These have arbitrary associativity, but they shouldn't error
+    @test reduce(-, x, dims=2) in ([(1-3)-5; (2-4)-6;;], [1-(3-5); 2-(4-6);;])
+    @test reduce(/, x, dims=2) in ([(1/3)/5; (2/4)/6;;], [1/(3/5); 2/(4/6);;])
+    @test reduce(^, x, dims=2) in ([(1^3)^5; (2^4)^6;;], [1^(3^5); 2^(4^6);;])
+end
+
+@testset "better initializations" begin
+    @test mapreduce(_ -> pi, +, [1,2]; dims=1) == [mapreduce(_ -> pi, +, [1,2])] == [2pi]
+    @test mapreduce(_ -> pi, +, [1 2]; dims=2) == [mapreduce(_ -> pi, +, [1 2]);;] == [2pi;;]
+    @test mapreduce(_ -> pi, +, [1 2]; dims=(1,2)) == [2pi;;]
+    @test mapreduce(_ -> pi, +, [1]; dims=1) == [mapreduce(_ -> pi, +, [1])] == [pi]
+    @test mapreduce(_ -> pi, +, [1;;]; dims=2) == [mapreduce(_ -> pi, +, [1;;]);;] == [pi;;]
+    @test_throws ArgumentError mapreduce(_ -> pi, +, []; dims=1)
+    @test_throws ArgumentError mapreduce(_ -> pi, +, [;;]; dims=2)
+    @test_throws ArgumentError mapreduce(_ -> pi, +, [;;]; dims=(1,2))
+    @test_throws ArgumentError mapreduce(_ -> pi, +, [])
+    @test_throws ArgumentError mapreduce(_ -> pi, +, [;;])
+
+    @test mapreduce(x -> log(x-1), +, [2,3,4]; dims=1) == [mapreduce(x -> log(x-1), +, [2,3,4])] == [log(1) + log(2) + log(3)]
+    @test mapreduce(x -> log(x-1), +, [2,3]; dims=1) == [mapreduce(x -> log(x-1), +, [2,3])] == [log(1) + log(2)]
+    @test mapreduce(x -> log(x-1), +, [2]; dims=1) == [mapreduce(x -> log(x-1), +, [2])] == [log(1)]
+    @test mapreduce(x -> log(x-1), +, [2 3 4]; dims=2) == [mapreduce(x -> log(x-1), +, [2,3,4]);;] == [log(1) + log(2) + log(3);;]
+    @test mapreduce(x -> log(x-1), +, [2 3]; dims=2) == [mapreduce(x -> log(x-1), +, [2,3]);;] == [log(1) + log(2);;]
+    @test mapreduce(x -> log(x-1), +, [2;;]; dims=2) == [mapreduce(x -> log(x-1), +, [2]);;] == [log(1);;]
+    @test_throws ArgumentError mapreduce(x -> log(x-1), +, []; dims=1)
+    @test_throws ArgumentError mapreduce(x -> log(x-1), +, [;;]; dims=2)
+    @test_throws ArgumentError mapreduce(x -> log(x-1), +, [;;]; dims=(1,2))
+    @test_throws ArgumentError mapreduce(x -> log(x-1), +, [])
+    @test_throws ArgumentError mapreduce(x -> log(x-1), +, [;;])
+
+    @test sum(x->sqrt(x-1), ones(5); dims=1) == [sum(x->sqrt(x-1), ones(5))] == [0.0]
+    @test sum(x->sqrt(x-1), ones(1); dims=1) == [sum(x->sqrt(x-1), ones(1))] == [0.0]
+    @test_throws ArgumentError sum(x->sqrt(x-1), ones(0); dims=1)
+    @test_throws ArgumentError sum(x->sqrt(x-1), ones(0))
+end
+
+@testset "reductions on broadcasted; issue #41054" begin
+    A = clamp.(randn(3,4), -1, 1)
+    bc = Base.broadcasted(+, A, 2)
+    @test sum(bc, dims=1) ≈ sum(A .+ 2, dims=1)
+    @test mapreduce(sqrt, +, bc, dims=1) ≈ mapreduce(sqrt, +, A .+ 2, dims=1)
+
+    @test sum(bc, dims=2) ≈ sum(A .+ 2, dims=2)
+    @test mapreduce(sqrt, +, bc, dims=2) ≈ mapreduce(sqrt, +, A .+ 2, dims=2)
+
+    @test sum(bc, dims=(1,2)) ≈ [sum(A .+ 2)]
+    @test mapreduce(sqrt, +, bc, dims=(1,2)) ≈ [mapreduce(sqrt, +, A .+ 2)]
+end
+
+@testset "reductions over complex values; issue #54920" begin
+    A = Complex{Int}.(rand(Complex{Int8}, 2, 2, 2));
+    @test maximum(abs, A; dims=(1,)) == mapreduce(abs, max, A, dims=(1,)) == [maximum(abs, A[:,1,1]) maximum(abs, A[:,2,1]);;; maximum(abs, A[:,1,2]) maximum(abs, A[:,2,2])]
+    @test maximum(abs, A; dims=(2,)) == mapreduce(abs, max, A, dims=(2,)) == [maximum(abs, A[1,:,1]); maximum(abs, A[2,:,1]);;; maximum(abs, A[1,:,2]); maximum(abs, A[2,:,2])]
+    @test maximum(abs, A; dims=(1, 2)) == mapreduce(abs, max, A, dims=(1, 2)) == [maximum(abs, A[:,:,1]);;; maximum(abs, A[:,:,2])]
+    @test maximum(abs, A; dims=(1, 2, 3)) == mapreduce(abs, max, A, dims=(1, 2, 3)) == [maximum(abs, A);;;]
+end
+
+@testset "bitwise operators on integers; part of issue #45562" begin
+    @test mapreduce(identity, &, [3,3,3]; dims=1) == [mapreduce(identity, &, [3,3,3])] == [3 & 3 & 3] == [3]
+    @test mapreduce(identity, |, [3,3,3]; dims=1) == [mapreduce(identity, |, [3,3,3])] == [3 | 3 | 3] == [3]
+    @test mapreduce(identity, xor, [3,3,3]; dims=1) == [mapreduce(identity, xor, [3,3,3])] == [xor(xor(3, 3), 3)] == [3]
+
+    @test mapreduce(identity, &, [3,7,6]; dims=1) == [mapreduce(identity, &, [3,7,6])] == [3 & 7 & 6] == [2]
+    @test mapreduce(identity, |, [3,7,6]; dims=1) == [mapreduce(identity, |, [3,7,6])] == [3 | 7 | 6] == [7]
+    @test mapreduce(identity, xor, [3,7,6]; dims=1) == [mapreduce(identity, xor, [3,7,6])] == [xor(xor(3, 7), 6)] == [2]
+end
+
+@testset "indexing" begin
+    A = [1 2; 3 4]
+    B = [5 6; 7 8]
+
+    @test mapreduce(/, +, A, B) ≈ sum(A./B)
+    @test mapreduce(i -> A[i]/B[i], +, eachindex(A,B)) ≈ sum(A./B)
+    @test mapreduce(i -> A[i]/B'[i], +, eachindex(A,B')) ≈ sum(A./B')
+
+    @test mapreduce(/, +, A, B; dims=1) ≈ sum(A./B; dims=1)
+    @test mapreduce(i -> A[i]/B[i], +, reshape(eachindex(A,B), size(A)); dims=1) ≈ sum(A./B; dims=1)  # BoundsError
+    @test mapreduce(i -> A[i]/B'[i], +, eachindex(A,B'); dims=1) ≈ sum(A./B'; dims=1)  # BoundsError
+end
+
+@testset "more related to issue #26488" begin
+    @test mapreduce(x -> log10(x-1), +, [11, 101]) ≈ 3.0
+    @test mapreduce(x -> log10(x-1), *, [11, 101]) ≈ 2.0
+    @test mapreduce(x -> log10(x-1), +, [11, 101]; dims=1) ≈ [3.0]
+    @test mapreduce(x -> log10(x-1), +, [11, 101]; init=-10) ≈ -7.0
+    @test mapreduce(x -> log10(x-1), +, [11, 101]; init=-10, dims=1) ≈ [-7.0]
+
+    @test mapreduce(_ -> pi, +, [1,2]) ≈ 2pi
+    @test mapreduce(_ -> pi, *, [1,2]) ≈ pi^2
+    @test mapreduce(_ -> pi, +, [1,2]; dims=1) ≈ [2pi]
+end
+
+_add(x,y) = x+y  # this avoids typeof(+) dispatch
+
+@testset "mapreduce fast paths, dims=:, op=$op, kw=$kw" for op in (+,_add),
+                                                            kw in ((;), (; init=0.0))
+    @test_broken 0 == @allocated mapreduce(/, op, 1:3, 4:7; kw...)
+    @test_broken 0 == @allocated mapreduce(/, op, 1:3, 4:9; kw...)  # stops early
+    @test mapreduce(/, op, 1:3, 4:7; kw...) ≈ reduce(op, map(/, 1:3, 4:7); kw...)
+    @test mapreduce(/, op, 1:3, 4:9; kw...) ≈ reduce(op, map(/, 1:3, 4:9); kw...)
+
+    A = [1 2; 3 4]
+    B = [5 6; 7 8]
+
+    @test_broken 0 == @allocated mapreduce(*, op, A, B; kw...)  # LinearIndices
+    @test_broken 0 == @allocated mapreduce(*, op, A, B'; kw...)  # CartesianIndices
+
+    @test mapreduce(*, op, A, B; kw...) == reduce(op, map(*, A, B); kw...)
+    @test mapreduce(*, op, A, B'; kw...) == reduce(op, map(*, A, B'); kw...)
+
+    @test_broken 0 == @allocated mapreduce(*, op, A, 5:7; kw...)  # stops early
+    @test_broken 0 == @allocated mapreduce(*, op, 1:3, B'; kw...)  # stops early
+    @test mapreduce(*, op, A, 5:7; kw...) == reduce(op, map(*, A, 5:7); kw...)
+    @test mapreduce(*, op, 1:3, B'; kw...) == reduce(op, map(*, 1:3, B'); kw...)
+    @test mapreduce(*, op, 1:3, B', 10:20; kw...) == reduce(op, map(*, 1:3, B', 10:20); kw...)
+
+    @test_throws DimensionMismatch map(*, A, hcat(B, 9:10))  # same ndims, does not stop early
+    @test_throws DimensionMismatch mapreduce(*, op, A, hcat(B, 9:10); kw...)
+end
+
+@testset "mapreduce fast paths, dims=$dims, op=$op, kw=$kw" for dims in (1,2,[2],(1,2),3),
+                                                                op in (+,*,_add),
+                                                                kw in ((;), (; init=0.0))
+    (kw == (;) && op == _add) && continue
+
+    @test mapreduce(/, op, 1:3, 4:7; dims, kw...) ≈ reduce(op, map(/, 1:3, 4:7); dims, kw...)
+    @test mapreduce(/, op, 1:3, 4:9; dims, kw...) ≈ reduce(op, map(/, 1:3, 4:9); dims, kw...)
+
+    A = [1 2; 3 4]
+    B = [5 6; 7 8]
+    @test mapreduce(*, op, A, B; dims, kw...) == reduce(op, map(*, A, B); dims, kw...)  # LinearIndices
+    @test mapreduce(*, op, A, B'; dims, kw...) == reduce(op, map(*, A, B'); dims, kw...)  # CartesianIndices
+
+    @test_broken @allocated(mapreduce(*, op, A, B; dims, kw...)) < @allocated(reduce(op, map(*, A, B); dims, kw...))
+    @test_broken @allocated(mapreduce(*, op, A, B'; dims, kw...)) < @allocated(reduce(op, map(*, A, B'); dims, kw...))
+
+    @test mapreduce(*, op, A, 5:7; dims, kw...) == reduce(op, map(*, A, 5:7); dims, kw...)  # stops early
+    @test mapreduce(*, op, 1:3, B'; dims, kw...) == reduce(op, map(*, 1:3, B'); dims, kw...)
+
+    @test_throws DimensionMismatch mapreduce(*, +, A, hcat(B, 9:10); dims, kw...)
+end
+
+struct Infinity21097 <: Number
+end
+Base.:+(::Infinity21097,::Infinity21097) = Infinity21097()
+@testset "don't call zero on unknown numbers, issue #21097" begin
+    @test reduce(+, [Infinity21097()], init=Infinity21097()) == Infinity21097()
+    @test reduce(+, [Infinity21097()], init=Infinity21097(), dims=1) == [Infinity21097()]
+    @test reduce(+, [Infinity21097();;], init=Infinity21097(), dims=2) == [Infinity21097();;]
+    for len in [2,15,16,17,31,32,33,63,64,65,127,128,129]
+        for init in ((), (; init=Infinity21097()))
+            @test reduce(+,fill(Infinity21097(), len); init...) == Infinity21097()
+            @test reduce(+,fill(Infinity21097(), len, 2); init...) == Infinity21097()
+            @test reduce(+,fill(Infinity21097(), len, 16); init...) == Infinity21097()
+            @test reduce(+,fill(Infinity21097(), len); dims = 1, init...) == [Infinity21097()]
+            @test reduce(+,fill(Infinity21097(), len, 2); dims = 1, init...) == fill(Infinity21097(), 1, 2)
+            @test reduce(+,fill(Infinity21097(), len, 16); dims = 1, init...) == fill(Infinity21097(), 1, 16)
+            @test reduce(+,fill(Infinity21097(), len, 2); dims = 2, init...) == fill(Infinity21097(), len, 1)
+            @test reduce(+,fill(Infinity21097(), len, 16); dims = 2, init...) == fill(Infinity21097(), len, 1)
+        end
+    end
+end
+
+@testset "don't assume zero is the neutral element; issue #54875" begin
+    A = rand(3,4)
+    @test sum(I -> A[I], CartesianIndices(A); dims=1) == sum(I -> A[I], CartesianIndices(A); dims=1, init=0.)
+end
+
+@testset "cannot construct a value of type Union{} for return result; issue #43731" begin
+    a = collect(1.0:4)
+    f(x) = x < 3 ? missing : x
+    @test ismissing(minimum(f, a))
+    @test ismissing(only(minimum(f, a; dims = 1)))
+end
+
+@testset "zero indices are not special, issue #38660" begin
+    v = OffsetVector([-1, 1], 0:1)
+    @test minimum(v) == -1
+    @test findmin(v, dims=1) == OffsetVector.(([-1], [0]), (0:0,))
+
+    A = rand(3,4,5)
+    OA = OffsetArray(A, (-1,-2,-3))
+    for dims in ((), 1, 2, 3, (1,2), (1,3), (2,3))
+        av, ai = findmin(A; dims)
+        ov, oi = findmin(OA; dims)
+        @test av == ov.parent
+        @test ai == oi.parent .+ CartesianIndex(1,2,3)
+    end
+end
diff --git a/test/testhelpers/OffsetArrays.jl b/test/testhelpers/OffsetArrays.jl
index 5acaa88064245..cce3ed0170c1f 100644
--- a/test/testhelpers/OffsetArrays.jl
+++ b/test/testhelpers/OffsetArrays.jl
@@ -211,7 +211,6 @@ Base.show(io::IO, r::IdOffsetRange) = print(io, IdOffsetRange, "(values=",first(
 
 # Optimizations
 @inline Base.checkindex(::Type{Bool}, inds::IdOffsetRange, i::Real) = Base.checkindex(Bool, inds.parent, i - inds.offset)
-Base._firstslice(i::IdOffsetRange) = IdOffsetRange(Base._firstslice(i.parent), i.offset)
 
 ########################################################################################################
 # origin.jl

From 0708c2bf127f9db69f1e68b68b693a4ef2fb5f9e Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 13 May 2025 17:44:55 -0400
Subject: [PATCH 08/55] fixup Broadcast size/axes

---
 base/broadcast.jl | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/base/broadcast.jl b/base/broadcast.jl
index 81896a886ff65..74421c1ca5384 100644
--- a/base/broadcast.jl
+++ b/base/broadcast.jl
@@ -236,14 +236,11 @@ Base.similar(::Broadcasted{ArrayConflict}, ::Type{Bool}, dims) =
     similar(BitArray, dims)
 
 @inline Base.axes(bc::Broadcasted) = _axes(bc, bc.axes)
-@inline Base.axes(bc::Broadcasted, d) = get(axes(bc), d, OneTo(1))
+@inline Base.axes(bc::Broadcasted, d::Integer) = get(axes(bc), d, OneTo(1))
 _axes(::Broadcasted, axes::Tuple) = axes
 @inline _axes(bc::Broadcasted, ::Nothing)  = combine_axes(bc.args...)
 _axes(bc::Broadcasted{<:AbstractArrayStyle{0}}, ::Nothing) = ()
 
-@inline Base.axes(bc::Broadcasted{<:Any, <:NTuple{N}}, d::Integer) where N =
-    d <= N ? axes(bc)[d] : OneTo(1)
-
 BroadcastStyle(::Type{<:Broadcasted{Style}}) where {Style} = Style()
 BroadcastStyle(::Type{<:Broadcasted{S}}) where {S<:Union{Nothing,Unknown}} =
     throw(ArgumentError("Broadcasted{Unknown} wrappers do not have a style assigned"))
@@ -267,7 +264,7 @@ Base.ndims(bc::Broadcasted) = ndims(typeof(bc))
 Base.ndims(::Type{<:Broadcasted{<:Any,<:NTuple{N,Any}}}) where {N} = N
 
 Base.size(bc::Broadcasted) = map(length, axes(bc))
-Base.size(bc::Broadcasted, d) = length(axes(bc, d))
+Base.size(bc::Broadcasted, d::Integer) = length(axes(bc, d))
 Base.length(bc::Broadcasted) = prod(size(bc))
 
 function Base.iterate(bc::Broadcasted)

From e719715d081492b69581ea026e26871e1ba34620 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 13 May 2025 17:45:13 -0400
Subject: [PATCH 09/55] more deprecations

---
 base/deprecated.jl | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/base/deprecated.jl b/base/deprecated.jl
index aba128b178b76..dd22fd9c05592 100644
--- a/base/deprecated.jl
+++ b/base/deprecated.jl
@@ -561,6 +561,9 @@ isbindingresolved
 
 # BEGIN 1.13 deprecations
 
+function reducedim_init end
+const _dep_message_reducedim_init = ", these internals have been removed. To customize the array returned by dimensional reductions, implement mapreduce_similar instead"
+deprecate(Base, :reducedim_init)
 function reducedim_initarray end
 const _dep_message_reducedim_initarray = ", these internals have been removed. To customize the array returned by dimensional reductions, implement mapreduce_similar instead"
 deprecate(Base, :reducedim_initarray)
@@ -571,5 +574,7 @@ deprecate(Base, :_mapreduce_dim)
 @deprecate_binding _mapreducedim! Base.mapreduce! false
 @deprecate_binding reducedim! Base.reduce! false
 @deprecate mapreduce_impl(f, op, A::AbstractArrayOrBroadcasted, ifirst::Integer, ilast::Integer, blksize::Int=pairwise_blocksize(f, op)) Base.mapreduce_pairwise(f, op, A, _InitialValue(), ifirst:ilast) false
+@deprecate _mapreduce(f, op, A) Base.mapreduce_pairwise(f, op, A, Base._InitialValue()) false
+@deprecate _mapreduce(f, op, _, A) Base.mapreduce_pairwise(f, op, A, Base._InitialValue()) false
 
 # END 1.13 deprecations

From da3bdb19ba8f1fc4e44278929913cf78396db3d5 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 13 May 2025 17:47:32 -0400
Subject: [PATCH 10/55] NEWS update for empty dimensional reductions

---
 NEWS.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 962065d3d386d..8d7e186ed2ff1 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -51,9 +51,11 @@ New library features
 Standard library changes
 ------------------------
 
-* Empty dimensional reductions (e.g., `reduce` and `mapreduce` with the `dims` keyword
+* Empty dimensional reductions (`reduce`, `mapreduce`, `sum`, `minimum`, etc., with the `dims` keyword
   selecting one or more dimensions) now behave like their whole-array (`dims=:`) counterparts,
-  only returning values in unambiguous cases and erroring otherwise.
+  only returning values in unambiguous cases and erroring otherwise.  For example,
+  `minimum([], dims=2)` is now an error; it would previously return an empty array.
+  This can be addressed by specifying an `init` keyword argument as the error message directs.
 
 #### JuliaSyntaxHighlighting
 

From 8cef637679008c7a5dab7db519064c9b12136028 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 13 May 2025 17:48:11 -0400
Subject: [PATCH 11/55] avoid calling length in functions that define length

---
 base/dict.jl              | 10 +++++++++-
 base/runtime_internals.jl |  8 +++++++-
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/base/dict.jl b/base/dict.jl
index e059fde102bab..849e4e0f57d49 100644
--- a/base/dict.jl
+++ b/base/dict.jl
@@ -845,7 +845,15 @@ function iterate(d::ImmutableDict{K,V}, t=d) where {K, V}
     !isdefined(t, :parent) && return nothing
     (Pair{K,V}(t.key, t.value), t.parent)
 end
-length(t::ImmutableDict) = count(Returns(true), t)
+# length is defined in terms of iteration; using a higher-order function to do the iteration
+# is likely to call `length` again, so this is a manual for loop:
+function length(t::ImmutableDict)
+    r = 0
+    for _ in t
+        r+=1
+    end
+    return r
+end
 isempty(t::ImmutableDict) = !isdefined(t, :parent)
 empty(::ImmutableDict, ::Type{K}, ::Type{V}) where {K, V} = ImmutableDict{K,V}()
 
diff --git a/base/runtime_internals.jl b/base/runtime_internals.jl
index db8fd114a3493..fb68842efe5be 100644
--- a/base/runtime_internals.jl
+++ b/base/runtime_internals.jl
@@ -1728,7 +1728,13 @@ function iterate(specs::MethodSpecializations, i::Int)
     item === nothing && return nothing
     return (item, i)
 end
-length(specs::MethodSpecializations) = count(Returns(true), specs)
+function length(specs::MethodSpecializations)
+    r = 0
+    for _ in specs
+        r += 1
+    end
+    return r
+end
 
 function length(mt::Core.MethodTable)
     n = 0

From 89c3975f86185f01848c15a1529af336c6e3f941 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 13 May 2025 17:49:12 -0400
Subject: [PATCH 12/55] Add (offset) axes support for Enumerate

they already supported axes through the default fallback that uses size to construct OneTos
---
 base/iterators.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/base/iterators.jl b/base/iterators.jl
index 4b956073f0c04..7570416ae3c60 100644
--- a/base/iterators.jl
+++ b/base/iterators.jl
@@ -192,6 +192,7 @@ enumerate(iter) = Enumerate(iter)
 
 length(e::Enumerate) = length(e.itr)
 size(e::Enumerate) = size(e.itr)
+axes(e::Enumerate) = axes(e.itr)
 @propagate_inbounds function iterate(e::Enumerate, state=(1,))
     i, rest = state[1], tail(state)
     n = iterate(e.itr, rest...)

From f88cfeb46a807c08dfab591311f30dae28e19e3c Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 13 May 2025 17:49:46 -0400
Subject: [PATCH 13/55] make it a bit easier to extend `mapreduce_kernel`

---
 base/multidimensional.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index cc557d24e664f..d37719567fd50 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -1048,9 +1048,9 @@ end
 
 # Reduction support for CartesianIndices
 halves(inds::CartesianIndices) = map(CartesianIndices∘tuple, _halves(inds.indices)...)
-mapreduce_kernel(f, op, A, init, ::CartesianIndices{0}) = init===_InitialValue() ? mapreduce_first(f, op, only(A)) : op(init, f(only(A)))
-mapreduce_kernel(f, op, A, init, inds::CartesianIndices{1}) = mapreduce_kernel(f, op, A, init, inds.indices[1])
-function mapreduce_kernel(f, op, A, init, inds::CartesianIndices)
+function mapreduce_kernel(f, op, A, init, inds::CartesianIndices{N}) where {N}
+    N == 0 && return init===_InitialValue() ? mapreduce_first(f, op, only(A)) : op(init, f(only(A)))
+    N == 1 && return mapreduce_kernel(f, op, A, init, inds.indices[1])
     i1, s = iterate(inds)
     a1 = @inbounds A[i1]
     v = _mapreduce_start(f, op, A, init, a1)

From bddc126d2af305ba9493bdcda701eb7f4cdcf26b Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 13 May 2025 17:50:07 -0400
Subject: [PATCH 14/55] fixup PermutedDimsArray changes

---
 base/permuteddimsarray.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/base/permuteddimsarray.jl b/base/permuteddimsarray.jl
index 2bb76e5bfd77d..b95a5821e91d3 100644
--- a/base/permuteddimsarray.jl
+++ b/base/permuteddimsarray.jl
@@ -345,11 +345,11 @@ end
 
 const CommutativeOps = Union{typeof(+),typeof(Base.add_sum),typeof(min),typeof(max),typeof(Base._extrema_rf),typeof(|),typeof(&)}
 
-function Base.mapreduce(f, op::CommutativeOps, A::PermutedDimsArray; init=Base._InitialValue(), dims=(:))
-    mapreduce(f, op, parent(A); init, dims)
+function Base.mapreducedim(f, op::CommutativeOps, A::PermutedDimsArray, init, dims::Colon)
+    Base.mapreducedim(f, op, parent(A), init, dims)
 end
-function Base.mapreduce(f::typeof(identity), op::Union{typeof(Base.mul_prod),typeof(*)}, A::PermutedDimsArray{<:Union{Real,Complex}}; init=_InitialValue(), dims=(:))
-    mapreduce(f, op, parent(A); init, dims)
+function Base.mapreducedim(f::typeof(identity), op::Union{typeof(Base.mul_prod),typeof(*)}, A::PermutedDimsArray{<:Union{Real,Complex}}, init, dims::Colon)
+    Base.mapreducedim(f, op, parent(A), init, dims)
 end
 
 function Base.mapreduce!(f, op::CommutativeOps, B::AbstractArray{T,N}, A::PermutedDimsArray{S,N,perm,iperm}; init=Base._InitialValue()) where {T,S,N,perm,iperm}

From 7747f1e111e03aa87f18c1cffb1ce9bc7e76c53e Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 13 May 2025 17:50:36 -0400
Subject: [PATCH 15/55] avoid similar shenanigans; they are not needed

and this was causing ambiguities
---
 base/reducedim.jl | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index fcb9edb3b130f..dda08132eb95b 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -923,11 +923,6 @@ end
 end
 @propagate_inbounds getindex(P::PairsVector, i::CartesianIndex{1}) = P[i.I[1]]
 @propagate_inbounds getindex(P::PairsArray{<:Any,N}, I::Vararg{Int, N}) where {N} = P[CartesianIndex(I)]
-# Defer a few functions to the parent array
-similar(P::PairsArray, ::Type{T}, dims::Tuple{Union{Int, AbstractUnitRange}, Vararg{Union{Int, AbstractUnitRange}}}) where {T} = similar(P.array, T, dims)
-similar(P::PairsArray, ::Type{T}, dims::Tuple{Union{Int, Base.OneTo}, Vararg{Union{Int, Base.OneTo}}}) where {T} = similar(P.array, T, dims)
-similar(P::PairsArray, ::Type{T}, dims::Tuple{Int, Vararg{Int}}) where {T} = similar(P.array, T, dims)
-similar(P::PairsArray, ::Type{T}, dims::Tuple{}) where {T} = similar(P.array, T, dims)
 mapreduce_similar(P::PairsArray, ::Type{T}, dims) where {T} = mapreduce_similar(P.array, T, dims)
 
 # Use an ad-hoc specialized StructArray to allow in-place AoS->SoA transform

From 31de6281c7da51d3c2eaeaac19a32fcbbbf37e05 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 13 May 2025 17:51:07 -0400
Subject: [PATCH 16/55] cartesian broadcasts are now pairwise!

---
 test/broadcast.jl | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/test/broadcast.jl b/test/broadcast.jl
index a751d9d381ce6..abb1c300051db 100644
--- a/test/broadcast.jl
+++ b/test/broadcast.jl
@@ -998,14 +998,10 @@ end
     # Now let's try it with `Broadcasted`:
     bcraw = Broadcast.broadcasted(identity, xs)
     bc = Broadcast.instantiate(bcraw)
-    # If `Broadcasted` has `IndexLinear` style, it should hit the
-    # `reduce` branch:
     @test IndexStyle(bc) == IndexLinear()
     @test reduce(paren, bc) == reduce(paren, xs)
-    # If `Broadcasted` does not have `IndexLinear` style, it should
-    # hit the `foldl` branch:
     @test IndexStyle(bcraw) == IndexCartesian()
-    @test reduce(paren, bcraw) == foldl(paren, xs)
+    @test reduce(paren, bcraw) == reduce(paren, xs)
 
     # issue #41055
     bc = Broadcast.instantiate(Broadcast.broadcasted(Base.literal_pow, Ref(^), [1,2], Ref(Val(2))))

From 2f5a886ee402cfcf25cf4b51462c76283f397d20 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 13 May 2025 17:59:24 -0400
Subject: [PATCH 17/55] TEMPORARILY USE LINEAR ALGEBRA BRANCH

---
 .../md5                                                   | 1 +
 .../sha512                                                | 1 +
 stdlib/LinearAlgebra.version                              | 8 ++++----
 3 files changed, 6 insertions(+), 4 deletions(-)
 create mode 100644 deps/checksums/LinearAlgebra-e70953e599be1d43ead342c0f56b44f008900be8.tar.gz/md5
 create mode 100644 deps/checksums/LinearAlgebra-e70953e599be1d43ead342c0f56b44f008900be8.tar.gz/sha512

diff --git a/deps/checksums/LinearAlgebra-e70953e599be1d43ead342c0f56b44f008900be8.tar.gz/md5 b/deps/checksums/LinearAlgebra-e70953e599be1d43ead342c0f56b44f008900be8.tar.gz/md5
new file mode 100644
index 0000000000000..73207af2d33f4
--- /dev/null
+++ b/deps/checksums/LinearAlgebra-e70953e599be1d43ead342c0f56b44f008900be8.tar.gz/md5
@@ -0,0 +1 @@
+443d2498a80ad374894811c0b2d5e6bf
diff --git a/deps/checksums/LinearAlgebra-e70953e599be1d43ead342c0f56b44f008900be8.tar.gz/sha512 b/deps/checksums/LinearAlgebra-e70953e599be1d43ead342c0f56b44f008900be8.tar.gz/sha512
new file mode 100644
index 0000000000000..c253cdae5d70a
--- /dev/null
+++ b/deps/checksums/LinearAlgebra-e70953e599be1d43ead342c0f56b44f008900be8.tar.gz/sha512
@@ -0,0 +1 @@
+09077d9524aba81f3b2639843ce52153fa575faf40c2e05faca28a36350653c980617959f38f329bd4763f3def6821e619ddaf0050642be868b7c83883da53b1
diff --git a/stdlib/LinearAlgebra.version b/stdlib/LinearAlgebra.version
index 243522ef799ae..766d524787f1f 100644
--- a/stdlib/LinearAlgebra.version
+++ b/stdlib/LinearAlgebra.version
@@ -1,4 +1,4 @@
-LINEARALGEBRA_BRANCH = master
-LINEARALGEBRA_SHA1 = 07725da80be389e170cff75cf7157399c2449643
-LINEARALGEBRA_GIT_URL := https://github.com/JuliaLang/LinearAlgebra.jl.git
-LINEARALGEBRA_TAR_URL = https://api.github.com/repos/JuliaLang/LinearAlgebra.jl/tarball/$1
+LINEARALGEBRA_BRANCH = mb/ReduceReuse♻
+LINEARALGEBRA_SHA1 = e70953e599be1d43ead342c0f56b44f008900be8
+LINEARALGEBRA_GIT_URL := https://github.com/mbauman/LinearAlgebra.jl.git
+LINEARALGEBRA_TAR_URL = https://api.github.com/repos/mbauman/LinearAlgebra.jl/tarball/$1

From aeb9a89a566eeb8ee90c725079c6af578ad54b09 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 13 May 2025 21:15:37 -0400
Subject: [PATCH 18/55] promote to Int

(cherry picked from commit 32fc16babf14e0ac26ca36adaedee4d1a3d4f0c4)
---
 base/reduce.jl    | 9 +++------
 test/reduce.jl    | 2 +-
 test/reducedim.jl | 2 +-
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index a5f49c8d2681f..07ecfdb852ebf 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -14,11 +14,8 @@ The reduction operator used in `sum`. The main difference from [`+`](@ref) is th
 integers are promoted to `Int`/`UInt`.
 """
 add_sum(x, y) = x + y
-add_sum(x::Integer, y::Integer)::Integer = add_sum_integer(x, y)
-add_sum_integer(x::Bool, y::Bool) = Int(x) + Int(y)
-add_sum_integer(x::Union{Bool,BitSignedSmall}, y::Union{Bool,BitSignedSmall}) = Int(x) + Int(y)
-add_sum_integer(x::Union{Bool,BitUnsignedSmall}, y::Union{Bool,BitUnsignedSmall}) = UInt(x) + UInt(y)
-add_sum_integer(x, y) = x+y
+add_sum(x::Union{Bool,BitIntegerSmall}, y::Union{Bool,BitIntegerSmall}) = Int(x) + Int(y)
+add_sum(x::BitUnsignedSmall, y::BitUnsignedSmall) = UInt(x) + UInt(y)
 add_sum(x::Real, y::Real)::Real = x + y
 
 """
@@ -1245,7 +1242,7 @@ julia> count(i->(4<=i<=6), [2,3,4,5,6])
 julia> count([true, false, true, true])
 3
 
-julia> count(>(3), 1:7, init=0x00)
+julia> count(>(3), 1:7, init=UInt(0))
 0x0000000000000004
 ```
 """
diff --git a/test/reduce.jl b/test/reduce.jl
index 13be28e57c31b..9cd213b8b387b 100644
--- a/test/reduce.jl
+++ b/test/reduce.jl
@@ -586,7 +586,7 @@ struct NonFunctionIsZero end
 @test count(NonFunctionIsZero(), [0]) == 1
 @test count(NonFunctionIsZero(), [1]) == 0
 
-@test count(Iterators.repeated(true, 3), init=0x00) === UInt(3)
+@test count(Iterators.repeated(true, 3), init=UInt(0)) === UInt(3)
 @test count(!=(2), Iterators.take(1:7, 3), init=Int32(0)) === 2
 @test count(identity, [true, false], init=Int8(0)) === 1
 @test count(!, [true false; false true], dims=:, init=Int16(0)) === 2
diff --git a/test/reducedim.jl b/test/reducedim.jl
index f4c53a2a47e49..05ff47db78dad 100644
--- a/test/reducedim.jl
+++ b/test/reducedim.jl
@@ -710,7 +710,7 @@ end
     @test_throws TypeError count!([1], [1])
 end
 
-@test @inferred(UInt16, count(false:true, dims=:, init=0x0000)) === UInt(1)
+@test @inferred(UInt16, count(false:true, dims=:, init=0x0000)) === 1
 @test @inferred(count(isodd, reshape(1:9, 3, 3), dims=:, init=Int128(0))) === Int128(5)
 
 @testset "reduced_index for BigInt (issue #39995)" begin

From 2a0b7b7bd013c25e63c3204442ac5cc836c3be73 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 13 May 2025 22:29:49 -0400
Subject: [PATCH 19/55] restore `reduce_first(+, true)` method

---
 base/reduce.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/base/reduce.jl b/base/reduce.jl
index 07ecfdb852ebf..a9ed85c3ba7aa 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -394,6 +394,7 @@ additional methods should only be defined for cases where `op` gives a result wi
 different types than its inputs.
 """
 reduce_first(op, x) = x
+reduce_first(::typeof(+), x::Bool) = Int(x)
 reduce_first(::typeof(*), x::AbstractChar) = string(x)
 
 reduce_first(::typeof(add_sum), x) = reduce_first(+, x)

From 6cc2496a81f0279cb4a154e250a91b24b52c8c6c Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Wed, 14 May 2025 17:53:19 -0400
Subject: [PATCH 20/55] TEMPORARILY USE SPARSEARRAY AND STATISTICS BRANCHES

---
 .../md5                                                   | 1 +
 .../sha512                                                | 1 +
 .../md5                                                   | 1 +
 .../sha512                                                | 1 +
 stdlib/SparseArrays.version                               | 4 ++--
 stdlib/Statistics.version                                 | 8 ++++----
 6 files changed, 10 insertions(+), 6 deletions(-)
 create mode 100644 deps/checksums/SparseArrays-975f04a739ddbe9e8dc04ed46f3227da0aa912e8.tar.gz/md5
 create mode 100644 deps/checksums/SparseArrays-975f04a739ddbe9e8dc04ed46f3227da0aa912e8.tar.gz/sha512
 create mode 100644 deps/checksums/Statistics-c674ab823526d05e7a85b468065089bf563823aa.tar.gz/md5
 create mode 100644 deps/checksums/Statistics-c674ab823526d05e7a85b468065089bf563823aa.tar.gz/sha512

diff --git a/deps/checksums/SparseArrays-975f04a739ddbe9e8dc04ed46f3227da0aa912e8.tar.gz/md5 b/deps/checksums/SparseArrays-975f04a739ddbe9e8dc04ed46f3227da0aa912e8.tar.gz/md5
new file mode 100644
index 0000000000000..cd5efe15436be
--- /dev/null
+++ b/deps/checksums/SparseArrays-975f04a739ddbe9e8dc04ed46f3227da0aa912e8.tar.gz/md5
@@ -0,0 +1 @@
+1fe453902519d84138fef290da911c64
diff --git a/deps/checksums/SparseArrays-975f04a739ddbe9e8dc04ed46f3227da0aa912e8.tar.gz/sha512 b/deps/checksums/SparseArrays-975f04a739ddbe9e8dc04ed46f3227da0aa912e8.tar.gz/sha512
new file mode 100644
index 0000000000000..9ac8d534f5af4
--- /dev/null
+++ b/deps/checksums/SparseArrays-975f04a739ddbe9e8dc04ed46f3227da0aa912e8.tar.gz/sha512
@@ -0,0 +1 @@
+e592a2a9e967b765b51a282d1a57b88e1891ec4c892cae8a5bdc4a576a228f56fd93c5da9aedd0ba002aa33c4772b25f6bfc786c241a5c552b7290d86e1720a5
diff --git a/deps/checksums/Statistics-c674ab823526d05e7a85b468065089bf563823aa.tar.gz/md5 b/deps/checksums/Statistics-c674ab823526d05e7a85b468065089bf563823aa.tar.gz/md5
new file mode 100644
index 0000000000000..145e567ddf5d4
--- /dev/null
+++ b/deps/checksums/Statistics-c674ab823526d05e7a85b468065089bf563823aa.tar.gz/md5
@@ -0,0 +1 @@
+d0a92cb724c9861390066b2262e771fc
diff --git a/deps/checksums/Statistics-c674ab823526d05e7a85b468065089bf563823aa.tar.gz/sha512 b/deps/checksums/Statistics-c674ab823526d05e7a85b468065089bf563823aa.tar.gz/sha512
new file mode 100644
index 0000000000000..58ac4fd864ed9
--- /dev/null
+++ b/deps/checksums/Statistics-c674ab823526d05e7a85b468065089bf563823aa.tar.gz/sha512
@@ -0,0 +1 @@
+b723403585a6a6fff94b4f1ebff0172937723507dc5a76756bd5788d6f13be465847079b28eea09a8b5896d49b0e7584e1a122049c87c4e237402a16eba5c943
diff --git a/stdlib/SparseArrays.version b/stdlib/SparseArrays.version
index 7a1d9697c9769..767682f68eeb3 100644
--- a/stdlib/SparseArrays.version
+++ b/stdlib/SparseArrays.version
@@ -1,4 +1,4 @@
-SPARSEARRAYS_BRANCH = mb/mapreducedim_empty
-SPARSEARRAYS_SHA1 = ac0cc6e8278bb614dc2a16e9353fa092dae6606a
+SPARSEARRAYS_BRANCH = mb/ReduceReuse♻
+SPARSEARRAYS_SHA1 = 975f04a739ddbe9e8dc04ed46f3227da0aa912e8
 SPARSEARRAYS_GIT_URL := https://github.com/mbauman/SparseArrays.jl.git
 SPARSEARRAYS_TAR_URL = https://api.github.com/repos/mbauman/SparseArrays.jl/tarball/$1
diff --git a/stdlib/Statistics.version b/stdlib/Statistics.version
index e6f0a62b3cec5..a435760ee57aa 100644
--- a/stdlib/Statistics.version
+++ b/stdlib/Statistics.version
@@ -1,4 +1,4 @@
-STATISTICS_BRANCH = master
-STATISTICS_SHA1 = 77bd5707f143eb624721a7df28ddef470e70ecef
-STATISTICS_GIT_URL := https://github.com/JuliaStats/Statistics.jl.git
-STATISTICS_TAR_URL = https://api.github.com/repos/JuliaStats/Statistics.jl/tarball/$1
+STATISTICS_BRANCH = mb/reduce-reuse-recycle
+STATISTICS_SHA1 = c674ab823526d05e7a85b468065089bf563823aa
+STATISTICS_GIT_URL := https://github.com/mbauman/Statistics.jl.git
+STATISTICS_TAR_URL = https://api.github.com/repos/mbauman/Statistics.jl/tarball/$1

From ec3def48a1c8458da19045350be265101e20f397 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Mon, 19 May 2025 14:42:36 -0400
Subject: [PATCH 21/55] TEMPORARILY USE SPARSEARRAYS: mapreduce_kernel
 implementation!

---
 .../md5                                                         | 1 +
 .../sha512                                                      | 1 +
 stdlib/SparseArrays.version                                     | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)
 create mode 100644 deps/checksums/SparseArrays-1b37c433f659f6187fa6c41037e082887b6f7387.tar.gz/md5
 create mode 100644 deps/checksums/SparseArrays-1b37c433f659f6187fa6c41037e082887b6f7387.tar.gz/sha512

diff --git a/deps/checksums/SparseArrays-1b37c433f659f6187fa6c41037e082887b6f7387.tar.gz/md5 b/deps/checksums/SparseArrays-1b37c433f659f6187fa6c41037e082887b6f7387.tar.gz/md5
new file mode 100644
index 0000000000000..b2b324144751a
--- /dev/null
+++ b/deps/checksums/SparseArrays-1b37c433f659f6187fa6c41037e082887b6f7387.tar.gz/md5
@@ -0,0 +1 @@
+04f0d741c51ca4322cba9d8acbd099e3
diff --git a/deps/checksums/SparseArrays-1b37c433f659f6187fa6c41037e082887b6f7387.tar.gz/sha512 b/deps/checksums/SparseArrays-1b37c433f659f6187fa6c41037e082887b6f7387.tar.gz/sha512
new file mode 100644
index 0000000000000..e3d5002842415
--- /dev/null
+++ b/deps/checksums/SparseArrays-1b37c433f659f6187fa6c41037e082887b6f7387.tar.gz/sha512
@@ -0,0 +1 @@
+8eadeed9dbe51a9f5cc7c81ff2bb19240118c57644a4dd5c448b35f84d214cb851162d2cb75bad3687becc8324e4fa6dbc3d0ff0ae3300df218ce11ccb4ae09d
diff --git a/stdlib/SparseArrays.version b/stdlib/SparseArrays.version
index 767682f68eeb3..a5493b8c48b3e 100644
--- a/stdlib/SparseArrays.version
+++ b/stdlib/SparseArrays.version
@@ -1,4 +1,4 @@
 SPARSEARRAYS_BRANCH = mb/ReduceReuse♻
-SPARSEARRAYS_SHA1 = 975f04a739ddbe9e8dc04ed46f3227da0aa912e8
+SPARSEARRAYS_SHA1 = 1b37c433f659f6187fa6c41037e082887b6f7387
 SPARSEARRAYS_GIT_URL := https://github.com/mbauman/SparseArrays.jl.git
 SPARSEARRAYS_TAR_URL = https://api.github.com/repos/mbauman/SparseArrays.jl/tarball/$1

From 7445ccd4156f2334480efbabb3a897581dce35f8 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Mon, 19 May 2025 14:44:09 -0400
Subject: [PATCH 22/55] fix offset IndexLinears

---
 base/reducedim.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index dda08132eb95b..7ff6a9b4980c7 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -145,7 +145,7 @@ function mapreducedim_naive(f::F, op::OP, A, init, is_inner_dim, inner, outer, a
     lsiz = linear_size(A, is_inner_dim)
     if lsiz > 0
         i0 = first(LinearIndices(A))
-        alloc(mapreduce_pairwise(f, op, A, init, (i0:lsiz) .+ (lsiz*(i-1))) for (i,_) in enumerate(outer))
+        alloc(mapreduce_pairwise(f, op, A, init, (i0:i0+lsiz-1) .+ (lsiz*(i-1))) for (i,_) in enumerate(outer))
     else
         alloc(mapreduce_pairwise(f, op, A, init, mergeindices(is_inner_dim, inner, i)) for i in outer)
     end

From eed76db62647782e27477943c7801076d65fc5f7 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Mon, 19 May 2025 14:44:50 -0400
Subject: [PATCH 23/55] support differently-offset output arrays

grumble grumble
---
 base/reducedim.jl | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index 7ff6a9b4980c7..70da74715b126 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -109,11 +109,22 @@ else
 end
 
 function mapreducedim(f::F, op::OP, A, init, dims, alloc=_MapReduceAllocator(A)) where {F, OP}
-    # It's advantageous to additionally consider singleton dimensions as part of the
-    # reduction as that allows more cases to be considered contiguous
-    is_inner_dim = ntuple(d->d in dims || size(A, d) == 1, ndims(A))
+    if alloc isa _MapReduceInPlace
+        # We can ignore dims and just trust the output array's axes. Note that the
+        # other branch here optimizes for the case where the input array _also_ has
+        # a singleton dimension, but we cannot do that here because OffsetArrays
+        # supports reductions into differently-offset singleton dimensions. This means
+        # we cannot index directly into A with an `outer` index. The output may also have
+        # fewer dimensions than A, so we may need to add trailing dims here:
+        outer = CartesianIndices(ntuple(d->axes(alloc.A, d), ndims(A)))
+        is_inner_dim = map(==(1), size(outer))
+    else
+        # It's advantageous to additionally consider singleton dimensions as part of the
+        # reduction as that allows more cases to be considered contiguous
+        is_inner_dim = ntuple(d->d in dims || size(A, d) == 1, ndims(A))
+        outer = CartesianIndices(reduced_indices(A, dims))
+    end
     inner = CartesianIndices(map((b,ax)->b ? ax : reduced_index(ax), is_inner_dim, axes(A)))
-    outer = CartesianIndices(reduced_indices(A, dims))
     n = length(inner)
     # Handle the empty and trivial 1-element cases:
     if (n == 0 || isempty(A))
@@ -126,7 +137,7 @@ function mapreducedim(f::F, op::OP, A, init, dims, alloc=_MapReduceAllocator(A))
     if is_inner_dim == keep_first_trues(is_inner_dim) || _mapreduce_might_widen(f, op, A, init, alloc)
         # Column major contiguous reduction! This is the easy case
         return mapreducedim_naive(f, op, A, init, is_inner_dim, inner, outer, alloc)
-    elseif 1 in dims
+    elseif is_inner_dim[1] # `dims` includes the first dimension
         return mapreducedim_colmajor(f, op, A, init, is_inner_dim, inner, outer, alloc)
     else
         return mapreducedim_rowmajor(f, op, A, init, is_inner_dim, inner, outer, alloc)
@@ -152,8 +163,8 @@ function mapreducedim_naive(f::F, op::OP, A, init, is_inner_dim, inner, outer, a
 end
 function mapreducedim_colmajor(f::F, op::OP, A, init, is_inner_dim, inner, outer, alloc, enforce_pairwise=true) where {F, OP}
     is_contiguous_inner = keep_first_trues(is_inner_dim)
-    contiguous_inner = mergeindices(is_contiguous_inner, inner, first(outer))
-    discontiguous_inner = mergeindices(is_contiguous_inner, first(outer), inner)
+    contiguous_inner = mergeindices(is_contiguous_inner, inner, first(inner))
+    discontiguous_inner = mergeindices(is_contiguous_inner, first(inner), inner)
     if enforce_pairwise && length(discontiguous_inner) > pairwise_blocksize(f, op)
         return mapreducedim_naive(f, op, A, init, is_inner_dim, inner, outer, alloc)
     end
@@ -176,7 +187,7 @@ function mapreducedim_rowmajor(f::F, op::OP, A, init, is_inner_dim, inner, outer
         return mapreducedim_naive(f, op, A, init, is_inner_dim, inner, outer, alloc)
     end
     # Take one element from each outer iteration to initialize R
-    R = alloc(_mapreduce_start(f, op, A, init, A[o]) for o in outer)
+    R = alloc(_mapreduce_start(f, op, A, init, A[mergeindices(is_inner_dim, first(inner), o)]) for o in outer)
     for i in Iterators.drop(inner, 1)
         @simd for o in outer
             # SIMD still helps here! It doesn't reassociate _within_ each reduction, but it can allow
@@ -198,12 +209,10 @@ Compute `mapreduce(f, op, A; init, dims)` where `dims` are the singleton dimensi
     The previous values in `R` are _not_ used as initial values; they are completely ignored
 """
 function mapreduce!(f, op, R::AbstractArray, A::AbstractArrayOrBroadcasted; init=_InitialValue(), update=false)
-    if ndims(R) > ndims(A) || !all(d->size(R, d) in (1, size(A, d)), 1:max(ndims(R), ndims(A)))
+    if ndims(R) > ndims(A) || !all(d->size(R, d) == 1 || axes(R, d) == axes(A, d), 1:max(ndims(R), ndims(A)))
         throw(DimensionMismatch())
     end
-    is_inner_dim = ntuple(d->size(R, d)==1, ndims(A))
-    dims = findall(is_inner_dim) #TODO: do better
-    return mapreducedim(f, op, A, init, dims, _MapReduceInPlace(R, op, update))
+    return mapreducedim(f, op, A, init, nothing, _MapReduceInPlace(R, op, update))
 end
 
 """

From ed4cd1bb200a92ba82b341634600455930545743 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Mon, 19 May 2025 15:28:48 -0400
Subject: [PATCH 24/55] remove not-fully-fixed MWE from 52457

---
 test/reduce.jl | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/test/reduce.jl b/test/reduce.jl
index 9cd213b8b387b..8002211d414c3 100644
--- a/test/reduce.jl
+++ b/test/reduce.jl
@@ -822,9 +822,4 @@ iterunknown(x) = (y = skipmissing(convert(Array{Union{Missing, eltype(x)}}, x));
     end
 
     @test sum((rand(Float32) for _ in 1:100000000))/100000000 ≈ 0.5
-
-    rng = MersenneTwister(630);
-    v = randn(rng, Float16, 1000)
-    sa = @view v[collect(1:end)]
-    @test sum(sa) ≈ sum(v)
 end

From cfe80cd88ccbc4bd6aa701a814de385a94ad2918 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Mon, 19 May 2025 16:43:34 -0400
Subject: [PATCH 25/55] implement the kernel for SCartesianIndices

---
 base/reinterpretarray.jl | 58 +++++++++++++++-------------------------
 1 file changed, 22 insertions(+), 36 deletions(-)

diff --git a/base/reinterpretarray.jl b/base/reinterpretarray.jl
index d7f6a15afa0a7..e5e6953b5b301 100644
--- a/base/reinterpretarray.jl
+++ b/base/reinterpretarray.jl
@@ -905,43 +905,29 @@ end
 
 # Reductions with IndexSCartesian2
 
-function _mapreduce(f::F, op::OP, style::IndexSCartesian2{K}, A::AbstractArrayOrBroadcasted) where {F,OP,K}
-    inds = eachindex(style, A)
-    n = size(inds)[2]
-    if n == 0
-        return mapreduce_empty_iter(f, op, A, IteratorEltype(A))
-    else
-        return mapreduce_impl(f, op, A, first(inds), last(inds))
-    end
-end
-
-@noinline function mapreduce_impl(f::F, op::OP, A::AbstractArrayOrBroadcasted,
-                                  ifirst::SCI, ilast::SCI, blksize::Int) where {F,OP,SCI<:SCartesianIndex2{K}} where K
-    if ilast.j - ifirst.j < blksize
-        # sequential portion
-        @inbounds a1 = A[ifirst]
-        @inbounds a2 = A[SCI(2,ifirst.j)]
-        v = op(f(a1), f(a2))
-        @simd for i = ifirst.i + 2 : K
-            @inbounds ai = A[SCI(i,ifirst.j)]
+@noinline _halve_error() = throw(AssertionError("cannot split SCartesianIndices2{$K} into halves smaller than $K"))
+
+function halves(inds::SCartesianIndices2{K}) where {K}
+    inds2 = inds.indices2
+    length(inds2) == 1 && _halve_error()
+    map(SCartesianIndices2{K}, halves(inds2))
+end
+
+function mapreduce_kernel(f, op, A, init, inds::SCartesianIndices2{K}) where K
+    js = inds.indices2
+    j1 = first(js)
+    v = _mapreduce_start(f, op, A, init, @inbounds A[SCartesianIndex2{K}(1, j1)])
+    # finish the first column
+    @simd for i in 2:K
+        @inbounds ai = A[SCartesianIndex2{K}(i, j1)]
+        v = op(v, f(ai))
+    end
+    # Remaining columns
+    for j in js[begin+1:end]
+        @simd for i in 1:K
+            @inbounds ai = A[SCartesianIndex2{K}(i,j)]
             v = op(v, f(ai))
         end
-        # Remaining columns
-        for j = ifirst.j+1 : ilast.j
-            @simd for i = 1:K
-                @inbounds ai = A[SCI(i,j)]
-                v = op(v, f(ai))
-            end
-        end
-        return v
-    else
-        # pairwise portion
-        jmid = ifirst.j + (ilast.j - ifirst.j) >> 1
-        v1 = mapreduce_impl(f, op, A, ifirst, SCI(K,jmid), blksize)
-        v2 = mapreduce_impl(f, op, A, SCI(1,jmid+1), ilast, blksize)
-        return op(v1, v2)
     end
+    return v
 end
-
-mapreduce_impl(f::F, op::OP, A::AbstractArrayOrBroadcasted, ifirst::SCartesianIndex2, ilast::SCartesianIndex2) where {F,OP} =
-    mapreduce_impl(f, op, A, ifirst, ilast, pairwise_blocksize(f, op))

From aab7aa3b89da98f898f50a53f9c2a98cf5cb710b Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Mon, 19 May 2025 16:52:46 -0400
Subject: [PATCH 26/55] remove the mapreduce patches from juliac buildscript

we probably need some of these function specializations, but I do not know how to find them
---
 contrib/juliac-buildscript.jl | 38 +++++++++++++++++------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/contrib/juliac-buildscript.jl b/contrib/juliac-buildscript.jl
index 0549afc0e1508..5b49341ea54d7 100644
--- a/contrib/juliac-buildscript.jl
+++ b/contrib/juliac-buildscript.jl
@@ -100,31 +100,31 @@ end
     end
     show_type_name(io::IO, tn::Core.TypeName) = print(io, tn.name)
 
-    mapreduce(f::F, op::F2, A::AbstractArrayOrBroadcasted; dims=:, init=_InitialValue()) where {F, F2} =
-    _mapreduce_dim(f, op, init, A, dims)
-    mapreduce(f::F, op::F2, A::AbstractArrayOrBroadcasted...; kw...) where {F, F2} =
-        reduce(op, map(f, A...); kw...)
+    # mapreduce(f::F, op::F2, A::AbstractArrayOrBroadcasted; dims=:, init=_InitialValue()) where {F, F2} =
+    # _mapreduce_dim(f, op, init, A, dims)
+    # mapreduce(f::F, op::F2, A::AbstractArrayOrBroadcasted...; kw...) where {F, F2} =
+    #     reduce(op, map(f, A...); kw...)
 
-    _mapreduce_dim(f::F, op::F2, nt, A::AbstractArrayOrBroadcasted, ::Colon) where {F, F2} =
-        mapfoldl_impl(f, op, nt, A)
+    # _mapreduce_dim(f::F, op::F2, nt, A::AbstractArrayOrBroadcasted, ::Colon) where {F, F2} =
+    #     mapfoldl_impl(f, op, nt, A)
 
-    _mapreduce_dim(f::F, op::F2, ::_InitialValue, A::AbstractArrayOrBroadcasted, ::Colon) where {F, F2} =
-        _mapreduce(f, op, IndexStyle(A), A)
+    # _mapreduce_dim(f::F, op::F2, ::_InitialValue, A::AbstractArrayOrBroadcasted, ::Colon) where {F, F2} =
+    #     _mapreduce(f, op, IndexStyle(A), A)
 
-    _mapreduce_dim(f::F, op::F2, nt, A::AbstractArrayOrBroadcasted, dims) where {F, F2} =
-        mapreducedim!(f, op, reducedim_initarray(A, dims, nt), A)
+    # _mapreduce_dim(f::F, op::F2, nt, A::AbstractArrayOrBroadcasted, dims) where {F, F2} =
+    #     mapreducedim!(f, op, reducedim_initarray(A, dims, nt), A)
 
-    _mapreduce_dim(f::F, op::F2, ::_InitialValue, A::AbstractArrayOrBroadcasted, dims) where {F,F2} =
-        mapreducedim!(f, op, reducedim_init(f, op, A, dims), A)
+    # _mapreduce_dim(f::F, op::F2, ::_InitialValue, A::AbstractArrayOrBroadcasted, dims) where {F,F2} =
+    #     mapreducedim!(f, op, reducedim_init(f, op, A, dims), A)
 
-    mapreduce_empty_iter(f::F, op::F2, itr, ItrEltype) where {F, F2} =
-        reduce_empty_iter(MappingRF(f, op), itr, ItrEltype)
-        mapreduce_first(f::F, op::F2, x) where {F,F2} = reduce_first(op, f(x))
+    # mapreduce_empty_iter(f::F, op::F2, itr, ItrEltype) where {F, F2} =
+    #     reduce_empty_iter(MappingRF(f, op), itr, ItrEltype)
+    #     mapreduce_first(f::F, op::F2, x) where {F,F2} = reduce_first(op, f(x))
 
-    _mapreduce(f::F, op::F2, A::AbstractArrayOrBroadcasted) where {F,F2} = _mapreduce(f, op, IndexStyle(A), A)
-    mapreduce_empty(::typeof(identity), op::F, T) where {F} = reduce_empty(op, T)
-    mapreduce_empty(::typeof(abs), op::F, T) where {F}     = abs(reduce_empty(op, T))
-    mapreduce_empty(::typeof(abs2), op::F, T) where {F}    = abs2(reduce_empty(op, T))
+    # _mapreduce(f::F, op::F2, A::AbstractArrayOrBroadcasted) where {F,F2} = _mapreduce(f, op, IndexStyle(A), A)
+    # mapreduce_empty(::typeof(identity), op::F, T) where {F} = reduce_empty(op, T)
+    # mapreduce_empty(::typeof(abs), op::F, T) where {F}     = abs(reduce_empty(op, T))
+    # mapreduce_empty(::typeof(abs2), op::F, T) where {F}    = abs2(reduce_empty(op, T))
 end
 @eval Base.Sys begin
     __init_build() = nothing

From cf551b1ac867af19adc1ac5205ed831d526fa209 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Mon, 19 May 2025 21:24:41 -0400
Subject: [PATCH 27/55] fix error test that was accidentally tripping over
 depwarns

---
 base/deprecated.jl | 3 +++
 test/error.jl      | 1 +
 2 files changed, 4 insertions(+)

diff --git a/base/deprecated.jl b/base/deprecated.jl
index dd22fd9c05592..bb781cc0f4911 100644
--- a/base/deprecated.jl
+++ b/base/deprecated.jl
@@ -564,12 +564,15 @@ isbindingresolved
 function reducedim_init end
 const _dep_message_reducedim_init = ", these internals have been removed. To customize the array returned by dimensional reductions, implement mapreduce_similar instead"
 deprecate(Base, :reducedim_init)
+deprecate(Base, Symbol("#reducedim_init"))
 function reducedim_initarray end
 const _dep_message_reducedim_initarray = ", these internals have been removed. To customize the array returned by dimensional reductions, implement mapreduce_similar instead"
 deprecate(Base, :reducedim_initarray)
+deprecate(Base, Symbol("#reducedim_initarray"))
 function _mapreduce_dim end
 const _dep_message__mapreduce_dim = ", these internals have been removed. To customize the array returned by dimensional reductions, implement mapreduce_similar instead"
 deprecate(Base, :_mapreduce_dim)
+deprecate(Base, Symbol("#_mapreduce_dim"))
 @deprecate_binding mapreducedim! Base.mapreduce! false
 @deprecate_binding _mapreducedim! Base.mapreduce! false
 @deprecate_binding reducedim! Base.reduce! false
diff --git a/test/error.jl b/test/error.jl
index f76a7809b08a9..291139583bb6b 100644
--- a/test/error.jl
+++ b/test/error.jl
@@ -110,6 +110,7 @@ end
             push!(visited, mod)
             for name in names(mod, all=true)
                 isdefined(mod, name) || continue
+                Base.isdeprecated(mod, name) && continue
                 value = getfield(mod, name)
                 if value isa Module
                     value === Main && continue

From 1748ff49b326c43836d18be00550c469c9245b04 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Mon, 19 May 2025 21:25:28 -0400
Subject: [PATCH 28/55] TEMPORARILY USE LINEARALGEBRA -- bump to merge master

---
 .../md5                                                         | 1 +
 .../sha512                                                      | 1 +
 stdlib/LinearAlgebra.version                                    | 2 +-
 3 files changed, 3 insertions(+), 1 deletion(-)
 create mode 100644 deps/checksums/LinearAlgebra-5c6d351459459d2032443973bc28c4f346cc094c.tar.gz/md5
 create mode 100644 deps/checksums/LinearAlgebra-5c6d351459459d2032443973bc28c4f346cc094c.tar.gz/sha512

diff --git a/deps/checksums/LinearAlgebra-5c6d351459459d2032443973bc28c4f346cc094c.tar.gz/md5 b/deps/checksums/LinearAlgebra-5c6d351459459d2032443973bc28c4f346cc094c.tar.gz/md5
new file mode 100644
index 0000000000000..8eb75fdb8c94b
--- /dev/null
+++ b/deps/checksums/LinearAlgebra-5c6d351459459d2032443973bc28c4f346cc094c.tar.gz/md5
@@ -0,0 +1 @@
+c02013c82a3c13c8ccc603b6f4419b85
diff --git a/deps/checksums/LinearAlgebra-5c6d351459459d2032443973bc28c4f346cc094c.tar.gz/sha512 b/deps/checksums/LinearAlgebra-5c6d351459459d2032443973bc28c4f346cc094c.tar.gz/sha512
new file mode 100644
index 0000000000000..ed3452cbcb6e1
--- /dev/null
+++ b/deps/checksums/LinearAlgebra-5c6d351459459d2032443973bc28c4f346cc094c.tar.gz/sha512
@@ -0,0 +1 @@
+10d131c4f42fc69c556f54d2ef7c1d954d61a3d940141cbb48e3fbfbcde7628a3a0e0ee1bb069586048913d2b8e141dbe87347e46922b09a400d898beed4b35e
diff --git a/stdlib/LinearAlgebra.version b/stdlib/LinearAlgebra.version
index 766d524787f1f..ff2ced8da6765 100644
--- a/stdlib/LinearAlgebra.version
+++ b/stdlib/LinearAlgebra.version
@@ -1,4 +1,4 @@
 LINEARALGEBRA_BRANCH = mb/ReduceReuse♻
-LINEARALGEBRA_SHA1 = e70953e599be1d43ead342c0f56b44f008900be8
+LINEARALGEBRA_SHA1 = 5c6d351459459d2032443973bc28c4f346cc094c
 LINEARALGEBRA_GIT_URL := https://github.com/mbauman/LinearAlgebra.jl.git
 LINEARALGEBRA_TAR_URL = https://api.github.com/repos/mbauman/LinearAlgebra.jl/tarball/$1

From febc5eee6d7c27b3c9b21814d722e905c4ff2023 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 20 May 2025 09:40:24 -0400
Subject: [PATCH 29/55] fix doctests

* The exact numeric results all got closer to their true values
* The Squares iterable needs length defined to sum it (as documented in the paragraph above!)
---
 base/mathconstants.jl        |  4 ++--
 doc/src/manual/arrays.md     |  2 +-
 doc/src/manual/interfaces.md | 13 ++++++-------
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/base/mathconstants.jl b/base/mathconstants.jl
index d26f5115b5ccb..d2895199f6024 100644
--- a/base/mathconstants.jl
+++ b/base/mathconstants.jl
@@ -96,7 +96,7 @@ julia> Base.MathConstants.eulergamma
 julia> dx = 10^-6;
 
 julia> sum(-exp(-x) * log(x) for x in dx:dx:100) * dx
-0.5772078382499133
+0.5772078382090373
 ```
 """
 γ, const eulergamma = γ
@@ -129,7 +129,7 @@ julia> Base.MathConstants.catalan
 catalan = 0.9159655941772...
 
 julia> sum(log(x)/(1+x^2) for x in 1:0.01:10^6) * 0.01
-0.9159466120554123
+0.9159466120556183
 ```
 """
 catalan
diff --git a/doc/src/manual/arrays.md b/doc/src/manual/arrays.md
index ee9421ba1730b..eceda06dfd136 100644
--- a/doc/src/manual/arrays.md
+++ b/doc/src/manual/arrays.md
@@ -407,7 +407,7 @@ sums a series without allocating memory:
 
 ```jldoctest
 julia> sum(1/n^2 for n=1:1000)
-1.6439345666815615
+1.6439345666815603
 ```
 
 When writing a generator expression with multiple dimensions inside an argument list, parentheses
diff --git a/doc/src/manual/interfaces.md b/doc/src/manual/interfaces.md
index e752448f14a25..4e6d35c388913 100644
--- a/doc/src/manual/interfaces.md
+++ b/doc/src/manual/interfaces.md
@@ -61,10 +61,12 @@ julia> struct Squares
        end
 
 julia> Base.iterate(S::Squares, state=1) = state > S.count ? nothing : (state*state, state+1)
+
+julia> Base.length(S::Squares) = S.count
 ```
 
-With only [`iterate`](@ref) definition, the `Squares` type is already pretty powerful.
-We can iterate over all the elements:
+With just these [`iterate`](@ref) and [`length](@ref) definitions, the `Squares` type is already
+pretty powerful. We can iterate over all the elements:
 
 ```jldoctest squaretype
 julia> for item in Squares(7)
@@ -90,16 +92,13 @@ julia> sum(Squares(100))
 338350
 ```
 
-There are a few more methods we can extend to give Julia more information about this iterable
+There are other methods we can extend to give Julia more information about this iterable
 collection. We know that the elements in a `Squares` sequence will always be `Int`. By extending
 the [`eltype`](@ref) method, we can give that information to Julia and help it make more specialized
-code in the more complicated methods. We also know the number of elements in our sequence, so
-we can extend [`length`](@ref), too:
+code in the more complicated methods:
 
 ```jldoctest squaretype
 julia> Base.eltype(::Type{Squares}) = Int # Note that this is defined for the type
-
-julia> Base.length(S::Squares) = S.count
 ```
 
 Now, when we ask Julia to [`collect`](@ref) all the elements into an array it can preallocate a `Vector{Int}`

From 84e37117aabe743b0928520e195e7164ecf8c86f Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 20 May 2025 09:45:38 -0400
Subject: [PATCH 30/55] skip deprecations in doc checks

---
 base/docs/Docs.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/base/docs/Docs.jl b/base/docs/Docs.jl
index ae3891e218824..180ed7c386b1a 100644
--- a/base/docs/Docs.jl
+++ b/base/docs/Docs.jl
@@ -831,7 +831,8 @@ See also: [`names`](@ref), [`Docs.hasdoc`](@ref), [`Base.ispublic`](@ref).
 """
 function undocumented_names(mod::Module; private::Bool=false)
     filter!(names(mod; all=true)) do sym
-        !hasdoc(mod, sym) && !startswith(string(sym), '#') &&
+        !Base.isdeprecated(mod, sym) &&
+            !hasdoc(mod, sym) && !startswith(string(sym), '#') &&
             (private || Base.ispublic(mod, sym))
     end
 end

From 8b8fd7c62f3b9d527bc8d8cfb1974e6ae992bcb6 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 20 May 2025 11:59:48 -0400
Subject: [PATCH 31/55] make deprecations a bit gentler

---
 base/deprecated.jl | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/base/deprecated.jl b/base/deprecated.jl
index bb781cc0f4911..ae5c7600c6048 100644
--- a/base/deprecated.jl
+++ b/base/deprecated.jl
@@ -27,6 +27,7 @@ const __internal_changes_list = (
     :codeinfonargs,  # #54341
     :ocnopartial,
     :printcodeinfocalls,
+    :reducerefactor,
     # Add new change names above this line
 )
 
@@ -561,21 +562,24 @@ isbindingresolved
 
 # BEGIN 1.13 deprecations
 
-function reducedim_init end
+## These are all mapreduce internals that were not exported or public, but have deprecations to minimize disruptions
+@deprecate reducedim_init(f, op, A::AbstractArray, region) (v = mapreduce_empty(f, op, eltype(A)); fill!(mapreduce_similar(A, typeof(v), reduced_indices(A,region)), v)) false
 const _dep_message_reducedim_init = ", these internals have been removed. To customize the array returned by dimensional reductions, implement mapreduce_similar instead"
 deprecate(Base, :reducedim_init)
 deprecate(Base, Symbol("#reducedim_init"))
-function reducedim_initarray end
-const _dep_message_reducedim_initarray = ", these internals have been removed. To customize the array returned by dimensional reductions, implement mapreduce_similar instead"
+
+@deprecate (reducedim_initarray(A::Union{Base.AbstractBroadcasted, AbstractArray}, region, init, ::Type{T}) where {T}) fill!(mapreduce_similar(A,T,reduced_indices(A,region)), init) false
+@deprecate reducedim_initarray(A::Union{Base.AbstractBroadcasted, AbstractArray}, region, init) fill!(mapreduce_similar(A,typeof(init),reduced_indices(A,region)), init) false
+const _dep_message_reducedim_init = ", these internals have been removed. To customize the array returned by dimensional reductions, implement mapreduce_similar instead"
 deprecate(Base, :reducedim_initarray)
 deprecate(Base, Symbol("#reducedim_initarray"))
-function _mapreduce_dim end
-const _dep_message__mapreduce_dim = ", these internals have been removed. To customize the array returned by dimensional reductions, implement mapreduce_similar instead"
-deprecate(Base, :_mapreduce_dim)
-deprecate(Base, Symbol("#_mapreduce_dim"))
+
+@deprecate _mapreduce_dim(f, op, nt, A::Union{Base.AbstractBroadcasted, AbstractArray}, dims) mapreducedim(f, op, A, nt, dims) false
+
 @deprecate_binding mapreducedim! Base.mapreduce! false
 @deprecate_binding _mapreducedim! Base.mapreduce! false
 @deprecate_binding reducedim! Base.reduce! false
+
 @deprecate mapreduce_impl(f, op, A::AbstractArrayOrBroadcasted, ifirst::Integer, ilast::Integer, blksize::Int=pairwise_blocksize(f, op)) Base.mapreduce_pairwise(f, op, A, _InitialValue(), ifirst:ilast) false
 @deprecate _mapreduce(f, op, A) Base.mapreduce_pairwise(f, op, A, Base._InitialValue()) false
 @deprecate _mapreduce(f, op, _, A) Base.mapreduce_pairwise(f, op, A, Base._InitialValue()) false

From 133e960642b86215875184a9e908718a5fb04241 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 20 May 2025 15:53:31 -0400
Subject: [PATCH 32/55] specialize constructors in transformer for find*

---
 base/reducedim.jl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index 70da74715b126..6cfa4fe9df1e1 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -962,6 +962,7 @@ _unzip(Z::ZippedArray) = (Z.first, Z.second)
 _unzip(A::AbstractArray) = ([a[1] for a in A], [a[2] for a in A])
 
 _transform_pair(f) = x-> (x[1], f(x[2]))
+_transform_pair(::Type{F}) where {F} = x-> (x[1], F(x[2]))
 _transform_pair(f::typeof(identity)) = f
 
 """

From 5fdb54c5f420114c3db1228d5e338c93742d20ad Mon Sep 17 00:00:00 2001
From: N5N3 <2642243996@qq.com>
Date: Tue, 20 May 2025 15:56:34 -0400
Subject: [PATCH 33/55] add tests from 45822

---
 test/offsetarray.jl |  8 ++++++
 test/reducedim.jl   | 65 ++++++++++++++++++++++++++++-----------------
 2 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/test/offsetarray.jl b/test/offsetarray.jl
index 8e2ee33c49ed6..003d51447f547 100644
--- a/test/offsetarray.jl
+++ b/test/offsetarray.jl
@@ -925,3 +925,11 @@ end
     @test parent(B) == A0
     @test axes(B) == Base.IdentityUnitRange.((-10:-9, 9:10))
 end
+
+# issue #38660
+@testset "`findmin/max` for OffsetArray" begin
+    ov = OffsetVector([-1, 1], 0:1)
+    @test @inferred(findmin(ov; dims = 1)) .|> first == (-1, 0)
+    ov = OffsetVector([-1, 1], -1:0)
+    @test @inferred(findmax(ov; dims = 1)) .|> first == (1, 0)
+end
diff --git a/test/reducedim.jl b/test/reducedim.jl
index 05ff47db78dad..782f3419ca863 100644
--- a/test/reducedim.jl
+++ b/test/reducedim.jl
@@ -307,33 +307,48 @@ end
     end
 end
 
+struct WithIteratorsKeys{T,N} <: AbstractArray{T,N}
+    data::Array{T,N}
+end
+Base.size(a::WithIteratorsKeys) = size(a.data)
+Base.getindex(a::WithIteratorsKeys, inds...) = a.data[inds...]
+struct IteratorsKeys{T,A<:AbstractArray{T}}
+    iter::A
+    IteratorsKeys(iter) = new{eltype(iter),typeof(iter)}(iter)
+end
+Base.iterate(a::IteratorsKeys, state...) = Base.iterate(a.iter, state...)
+Base.keys(a::WithIteratorsKeys) = IteratorsKeys(keys(a.data))
+Base.eltype(::IteratorsKeys{T}) where {T} = T
+
 # findmin/findmax function arguments: output type inference
 @testset "findmin/findmax output type inference" begin
-    A = ["1" "22"; "333" "4444"]
-    for (tup, rval, rind) in [((1,), [1 2], [CartesianIndex(1, 1) CartesianIndex(1, 2)]),
-                              ((2,), reshape([1, 3], 2, 1), reshape([CartesianIndex(1, 1), CartesianIndex(2, 1)], 2, 1)),
-                              ((1,2), fill(1,1,1), fill(CartesianIndex(1,1),1,1))]
-        rval′, rind′ = findmin(length, A, dims=tup)
-        @test (rval, rind) == (rval′, rind′)
-        @test typeof(rval′) == Matrix{Int}
-    end
-    for (tup, rval, rind) in [((1,), [3 4], [CartesianIndex(2, 1) CartesianIndex(2, 2)]),
-                              ((2,), reshape([2, 4], 2, 1), reshape([CartesianIndex(1, 2), CartesianIndex(2, 2)], 2, 1)),
-                              ((1,2), fill(4,1,1), fill(CartesianIndex(2,2),1,1))]
-        rval′, rind′ = findmax(length, A, dims=tup)
-        @test (rval, rind) == (rval′, rind′)
-        @test typeof(rval) == Matrix{Int}
-    end
-    B = [1.5 1.0; 5.5 6.0]
-    for (tup, rval, rind) in [((1,), [3//2 1//1], [CartesianIndex(1, 1) CartesianIndex(1, 2)]),
-                              ((2,), reshape([1//1, 11//2], 2, 1), reshape([CartesianIndex(1, 2), CartesianIndex(2, 1)], 2, 1)),
-                              ((1,2), fill(1//1,1,1), fill(CartesianIndex(1,2),1,1))]
-        rval′, rind′ = findmin(Rational, B, dims=tup)
-        @test (rval, rind) == (rval′, rind′)
-        @test typeof(rval) == Matrix{Rational{Int}}
-        rval′, rind′ = findmin(Rational ∘ abs ∘ complex, B, dims=tup)
-        @test (rval, rind) == (rval′, rind′)
-        @test typeof(rval) == Matrix{Rational{Int}}
+    for wrapper in (identity, WithIteratorsKeys)
+        A = wrapper(["1" "22"; "333" "4444"])
+        for (tup, rval, rind) in [((1,), [1 2], [CartesianIndex(1, 1) CartesianIndex(1, 2)]),
+                                ((2,), reshape([1, 3], 2, 1), reshape([CartesianIndex(1, 1), CartesianIndex(2, 1)], 2, 1)),
+                                ((1,2), fill(1,1,1), fill(CartesianIndex(1,1),1,1))]
+            rval′, rind′ = @inferred findmin(length, A, dims=tup)
+            @test (rval, rind) == (rval′, rind′)
+            @test typeof(rval′) == Matrix{Int}
+        end
+        for (tup, rval, rind) in [((1,), [3 4], [CartesianIndex(2, 1) CartesianIndex(2, 2)]),
+                                ((2,), reshape([2, 4], 2, 1), reshape([CartesianIndex(1, 2), CartesianIndex(2, 2)], 2, 1)),
+                                ((1,2), fill(4,1,1), fill(CartesianIndex(2,2),1,1))]
+            rval′, rind′ = @inferred findmax(length, A, dims=tup)
+            @test (rval, rind) == (rval′, rind′)
+            @test typeof(rval) == Matrix{Int}
+        end
+        B = wrapper([1.5 1.0; 5.5 6.0])
+        for (tup, rval, rind) in [((1,), [3//2 1//1], [CartesianIndex(1, 1) CartesianIndex(1, 2)]),
+                                ((2,), reshape([1//1, 11//2], 2, 1), reshape([CartesianIndex(1, 2), CartesianIndex(2, 1)], 2, 1)),
+                                ((1,2), fill(1//1,1,1), fill(CartesianIndex(1,2),1,1))]
+            rval′, rind′ = @inferred findmin(Rational, B, dims=tup)
+            @test (rval, rind) == (rval′, rind′)
+            @test typeof(rval) == Matrix{Rational{Int}}
+            rval′, rind′ = @inferred findmin(Rational ∘ abs ∘ complex, B, dims=tup)
+            @test (rval, rind) == (rval′, rind′)
+            @test typeof(rval) == Matrix{Rational{Int}}
+        end
     end
 end
 

From e013f9f26fe9ec2c079932f0aa84e35d6a8683ce Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 20 May 2025 16:25:44 -0400
Subject: [PATCH 34/55] add tests for 39385

---
 test/reducedim.jl | 49 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/test/reducedim.jl b/test/reducedim.jl
index 782f3419ca863..8193522669442 100644
--- a/test/reducedim.jl
+++ b/test/reducedim.jl
@@ -1045,3 +1045,52 @@ end
         @test ai == oi.parent .+ CartesianIndex(1,2,3)
     end
 end
+
+@testset "in-place reduction aliasing, issue #39385" begin
+    a = [1 2 3];
+    b = copy(a); @test sum!(b, b) == sum(a; dims=()) == a
+    b = copy(a); @test prod!(b, b) == prod(a; dims=()) == a
+    a = [true, false]
+    b = copy(a); @test any!(a, a) == any(a; dims=()) == a
+    b = copy(a); @test all!(a, a) == all(a; dims=()) == a
+
+    A = collect(reshape(1:105, 7, 5, 3))
+    B = copy(A); @test sum!(B,B) == A
+    B = copy(A); @test prod!(B,B) == A
+
+    B = copy(A); @test sum!(view(B, 1:1, :, :),B) == sum(A; dims=1)
+    B = copy(A); @test sum!(view(B, :, 1:1, :),B) == sum(A; dims=2)
+    B = copy(A); @test sum!(view(B, :, :, 1:1),B) == sum(A; dims=3)
+    B = copy(A); @test sum!(view(B, 1:1, 1:1, :),B) == sum(A; dims=(1,2))
+    B = copy(A); @test sum!(view(B, 1:1, :, 1:1),B) == sum(A; dims=(1,3))
+    B = copy(A); @test sum!(view(B, :, 1:1, 1:1),B) == sum(A; dims=(2,3))
+    B = copy(A); @test sum!(view(B, 1:1, 1:1, 1:1),B) == sum(A; dims=(1,2,3))
+
+    B = copy(A); @test prod!(view(B, 1:1, :, :),B) == prod(A; dims=1)
+    B = copy(A); @test prod!(view(B, :, 1:1, :),B) == prod(A; dims=2)
+    B = copy(A); @test prod!(view(B, :, :, 1:1),B) == prod(A; dims=3)
+    B = copy(A); @test prod!(view(B, 1:1, 1:1, :),B) == prod(A; dims=(1,2))
+    B = copy(A); @test prod!(view(B, 1:1, :, 1:1),B) == prod(A; dims=(1,3))
+    B = copy(A); @test prod!(view(B, :, 1:1, 1:1),B) == prod(A; dims=(2,3))
+    B = copy(A); @test prod!(view(B, 1:1, 1:1, 1:1),B) == prod(A; dims=(1,2,3))
+
+    A = rand(7, 5, 3) .> 0.9
+    B = copy(A); @test any!(B, B) == any(A, dims=())
+    B = copy(A); @test all!(!, B, B) == all(!, A, dims=())
+
+    B = copy(A); @test any!(view(B, 1:1, :, :),B) == any(A; dims=1)
+    B = copy(A); @test any!(view(B, :, 1:1, :),B) == any(A; dims=2)
+    B = copy(A); @test any!(view(B, :, :, 1:1),B) == any(A; dims=3)
+    B = copy(A); @test any!(view(B, 1:1, 1:1, :),B) == any(A; dims=(1,2))
+    B = copy(A); @test any!(view(B, 1:1, :, 1:1),B) == any(A; dims=(1,3))
+    B = copy(A); @test any!(view(B, :, 1:1, 1:1),B) == any(A; dims=(2,3))
+    B = copy(A); @test any!(view(B, 1:1, 1:1, 1:1),B) == any(A; dims=(1,2,3))
+
+    B = copy(A); @test all!(!, view(B, 1:1, :, :),B) == all(!, A; dims=1)
+    B = copy(A); @test all!(!, view(B, :, 1:1, :),B) == all(!, A; dims=2)
+    B = copy(A); @test all!(!, view(B, :, :, 1:1),B) == all(!, A; dims=3)
+    B = copy(A); @test all!(!, view(B, 1:1, 1:1, :),B) == all(!, A; dims=(1,2))
+    B = copy(A); @test all!(!, view(B, 1:1, :, 1:1),B) == all(!, A; dims=(1,3))
+    B = copy(A); @test all!(!, view(B, :, 1:1, 1:1),B) == all(!, A; dims=(2,3))
+    B = copy(A); @test all!(!, view(B, 1:1, 1:1, 1:1),B) == all(!, A; dims=(1,2,3))
+end

From 6fb8ef1d51800522808323703fc38e974f5a23cd Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Thu, 22 May 2025 09:31:01 -0400
Subject: [PATCH 35/55] restore promote_union and _realtype

---
 base/reducedim.jl | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index 6cfa4fe9df1e1..b4888fdb43e8d 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -51,6 +51,15 @@ mergeindices(b::NTuple{N,Bool}, x::CartesianIndex{N}, y::CartesianIndex{N}) wher
 keep_first_trues(::Tuple{}) = ()
 keep_first_trues(t) = t[1] ? (true, keep_first_trues(tail(t))...) : ntuple(Returns(false), length(t))
 
+# These functions aren't used in the implementation here, but are used widely in the ecosystem
+promote_union(T::Union) = promote_type(promote_union(T.a), promote_union(T.b))
+promote_union(T) = T
+_realtype(::Type{<:Complex}) = Real
+_realtype(::Type{Complex{T}}) where T<:Real = T
+_realtype(T::Type) = T
+-_realtype(::Union{typeof(abs),typeof(abs2)}, T) = _realtype(T)
+_realtype(::Any, T) = T
+
 mapreduce_similar(A, ::Type{T}, dims) where {T} = similar(A, T, dims)
 
 # These special internal types allow exposing both in- and out-of-place array initialization

From 167888de369aba9e42c98b06b83494054ab698a9 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Thu, 22 May 2025 09:33:22 -0400
Subject: [PATCH 36/55] fixup! restore promote_union and _realtype

---
 base/reducedim.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index b4888fdb43e8d..e35b209506f1f 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -57,7 +57,7 @@ promote_union(T) = T
 _realtype(::Type{<:Complex}) = Real
 _realtype(::Type{Complex{T}}) where T<:Real = T
 _realtype(T::Type) = T
--_realtype(::Union{typeof(abs),typeof(abs2)}, T) = _realtype(T)
+_realtype(::Union{typeof(abs),typeof(abs2)}, T) = _realtype(T)
 _realtype(::Any, T) = T
 
 mapreduce_similar(A, ::Type{T}, dims) where {T} = similar(A, T, dims)

From a093f9d0347fbd1c6aabb39bc8b0f2c6fa538905 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 27 May 2025 10:01:14 -0400
Subject: [PATCH 37/55] restore has_fast_linear_indexing

---
 base/reduce.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/base/reduce.jl b/base/reduce.jl
index a9ed85c3ba7aa..e54b17bf2fa97 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -7,6 +7,9 @@
 abstract type AbstractBroadcasted end
 const AbstractArrayOrBroadcasted = Union{AbstractArray, AbstractBroadcasted}
 
+has_fast_linear_indexing(a::AbstractArrayOrBroadcasted) = IndexStyle(a) === IndexLinear()
+has_fast_linear_indexing(a::AbstractVector) = true
+
 """
     Base.add_sum(x, y)
 

From be75442ac8e92a001e6ba6e09065aaa73ef339cb Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 27 May 2025 10:48:44 -0400
Subject: [PATCH 38/55] simplify inner indices computation

---
 base/reducedim.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index e35b209506f1f..f6a699aa6d616 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -133,7 +133,7 @@ function mapreducedim(f::F, op::OP, A, init, dims, alloc=_MapReduceAllocator(A))
         is_inner_dim = ntuple(d->d in dims || size(A, d) == 1, ndims(A))
         outer = CartesianIndices(reduced_indices(A, dims))
     end
-    inner = CartesianIndices(map((b,ax)->b ? ax : reduced_index(ax), is_inner_dim, axes(A)))
+    inner = mergeindices(is_inner_dim, CartesianIndices(A), outer)
     n = length(inner)
     # Handle the empty and trivial 1-element cases:
     if (n == 0 || isempty(A))

From a6e4e73d428a8fdacfc5615768fbfcfbbdc8f32f Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Sun, 25 May 2025 16:39:16 -0400
Subject: [PATCH 39/55] Specialize function args to avoid allocations

---
 base/missing.jl |  2 +-
 base/reduce.jl  | 18 +++++++++---------
 test/reduce.jl  |  2 ++
 3 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/base/missing.jl b/base/missing.jl
index 4dce394627af4..27d2eb03d2e9a 100644
--- a/base/missing.jl
+++ b/base/missing.jl
@@ -270,7 +270,7 @@ function show(io::IO, s::SkipMissing)
 end
 
 # Simple optimization for mapreduce if the array cannot hold Missing
-mapreduce(f, op, itr::SkipMissing{<:AbstractArray}; init=Base._InitialValue(), dims=(:)) =
+mapreduce(f::F, op::G, itr::SkipMissing{<:AbstractArray}; init=Base._InitialValue(), dims=(:)) where {F,G} =
     mapreducedim(f, op, eltype(itr.x) >: Missing ? itr : itr.x, init, dims)
 
 """
diff --git a/base/reduce.jl b/base/reduce.jl
index e54b17bf2fa97..b64a8a8a53d1c 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -269,8 +269,8 @@ implementations may reuse the return value of `f` for elements that appear multi
 `itr`. Use [`mapfoldl`](@ref) or [`mapfoldr`](@ref) instead for
 guaranteed left or right associativity and invocation of `f` for every value.
 """
-mapreduce(f, op, x; init=_InitialValue()) = mapreduce_pairwise(f, op, x, init)
-mapreduce(f, op, x, xs...; init=_InitialValue()) = mapreduce_pairwise(identity, op, Generator(f, x, xs...), init)
+mapreduce(f::F, op::G, x; init=_InitialValue()) where {F,G} = mapreduce_pairwise(f, op, x, init)
+mapreduce(f::F, op::G, x, xs...; init=_InitialValue()) where {F,G} = mapreduce_pairwise(identity, op, Generator(f, x, xs...), init)
 
 """
     mapreduce(f, op, A::AbstractArray...; dims=:, [init])
@@ -299,10 +299,10 @@ julia> mapreduce(isodd, |, a, dims=1)
  1  1  1  1
 ```
 """
-mapreduce(f, op, A::AbstractArrayOrBroadcasted; init=_InitialValue(), dims=(:)) = mapreducedim(f, op, A, init, dims)
-mapreduce(f, op, A::AbstractArrayOrBroadcasted, As::AbstractArrayOrBroadcasted...; init=_InitialValue(), dims=(:)) =
+mapreduce(f::F, op::G, A::AbstractArrayOrBroadcasted; init=_InitialValue(), dims=(:)) where {F,G} = mapreducedim(f, op, A, init, dims)
+mapreduce(f::F, op::G, A::AbstractArrayOrBroadcasted, As::AbstractArrayOrBroadcasted...; init=_InitialValue(), dims=(:)) where {F,G} =
     reduce(op, map(f, A, As...); init, dims)
-mapreducedim(f, op, A, init, ::Colon) = mapreduce_pairwise(f, op, A, init)
+mapreducedim(f::F, op::G, A, init, ::Colon) where {F,G} = mapreduce_pairwise(f, op, A, init)
 
 # Note: sum_seq usually uses four or more accumulators after partial
 # unrolling, so each accumulator gets at most 256 numbers
@@ -469,12 +469,12 @@ All implementations dispatch to a similarly structured `mapreduce_kernel` to han
 base case that's essentially a `mapfoldl` that allows for further SIMD-like optimizations
 and reassociations.
 """
-function mapreduce_pairwise(f, op, A::AbstractArrayOrBroadcasted, init)
+function mapreduce_pairwise(f::F, op::G, A::AbstractArrayOrBroadcasted, init) where {F, G}
     isempty(A) && return _mapreduce_start(f, op, A, init)
     length(A) <= pairwise_blocksize(f, op) && return mapreduce_kernel(f, op, A, init, eachindex(A))
     return mapreduce_pairwise(f, op, A, init, eachindex(A))
 end
-mapreduce_pairwise(f, op, itr, init) = mapreduce_pairwise(f, op, itr, init, IteratorSize(itr))
+mapreduce_pairwise(f::F, op::G, itr, init) where {F, G} = mapreduce_pairwise(f, op, itr, init, IteratorSize(itr)) 
 function mapreduce_pairwise(f, op, itr, init, S::Union{HasLength, HasShape})
     n = length(itr)
     n < 1 && return _mapreduce_start(f, op, itr, init)
@@ -482,7 +482,7 @@ function mapreduce_pairwise(f, op, itr, init, S::Union{HasLength, HasShape})
     n <= pairwise_blocksize(f, op) && return mapreduce_kernel(f, op, itr, init, S, n)[1]
     return mapreduce_pairwise(f, op, itr, init, S, n)[1]
 end
-function mapreduce_pairwise(f::F, op, itr, init, S::IteratorSize) where {F}
+function mapreduce_pairwise(f::F, op::G, itr, init, S::IteratorSize) where {F, G}
     it = iterate(itr)
     it === nothing && return _mapreduce_start(f, op, itr, init)
     n = pairwise_blocksize(f, op)
@@ -527,7 +527,7 @@ function mapreduce_pairwise(f::F, op::G, itr, init, S::Union{HasLength,HasShape}
         return (op(v1, v2), s)
     end
 end
-function mapreduce_pairwise(f::F, op, itr, init, S::IteratorSize, n, it) where {F}
+function mapreduce_pairwise(f::F, op::G, itr, init, S::IteratorSize, n, it) where {F,G}
     if n <= max(10, pairwise_blocksize(f, op))
         v, it = mapreduce_kernel(f, op, itr, init, S, n, it)
         return it === nothing ? (v, nothing) : (v, it)
diff --git a/test/reduce.jl b/test/reduce.jl
index 8002211d414c3..e4b2194b074a8 100644
--- a/test/reduce.jl
+++ b/test/reduce.jl
@@ -44,6 +44,8 @@ end
 let x = rand(10)
     @test 0 == @allocated(sum(Iterators.reverse(x)))
     @test 0 == @allocated(foldr(-, x))
+    @test 0 == @allocated(sum(A))
+    @test 0 == @allocated(sum(log, A))
 end
 
 # reduce

From 744e3fd976bb8a274cdca3075992d77c8712d04c Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 27 May 2025 14:30:40 -0400
Subject: [PATCH 40/55] fixup! simplify inner indices computation

---
 base/reducedim.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index f6a699aa6d616..b4e7f3581e69b 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -37,7 +37,7 @@ end
 ###### Generic reduction functions #####
 
 # Given two indices or ranges, merge them by dimension akin to a broadcasted `ifelse` over the dims
-_sliceall(x) = x[begin]:x[end] # avoid instabilities with OneTos and offset axes
+_sliceall(x) = first(x):last(x) # avoid instabilities with OneTos and offset axes
 _ifelseslice(b,x,y) = ifelse(b, _sliceall(x), _sliceall(y))
 mergeindices(b::NTuple{N,Bool}, x::CartesianIndices{N}, y::CartesianIndices{N}) where {N} =
     CartesianIndices(map(_ifelseslice, b, x.indices, y.indices))

From 647d5ed5186352b4344c12c6602c1e66c6cf9398 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 27 May 2025 17:03:16 -0400
Subject: [PATCH 41/55] Revert "fixup! simplify inner indices computation"

This reverts commit 744e3fd976bb8a274cdca3075992d77c8712d04c.
---
 base/reducedim.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index b4e7f3581e69b..f6a699aa6d616 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -37,7 +37,7 @@ end
 ###### Generic reduction functions #####
 
 # Given two indices or ranges, merge them by dimension akin to a broadcasted `ifelse` over the dims
-_sliceall(x) = first(x):last(x) # avoid instabilities with OneTos and offset axes
+_sliceall(x) = x[begin]:x[end] # avoid instabilities with OneTos and offset axes
 _ifelseslice(b,x,y) = ifelse(b, _sliceall(x), _sliceall(y))
 mergeindices(b::NTuple{N,Bool}, x::CartesianIndices{N}, y::CartesianIndices{N}) where {N} =
     CartesianIndices(map(_ifelseslice, b, x.indices, y.indices))

From 6fb57cc9f56926d78f47b8769c72799b43ac95bb Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 27 May 2025 17:03:23 -0400
Subject: [PATCH 42/55] Revert "simplify inner indices computation"

This reverts commit be75442ac8e92a001e6ba6e09065aaa73ef339cb.
---
 base/reducedim.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index f6a699aa6d616..e35b209506f1f 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -133,7 +133,7 @@ function mapreducedim(f::F, op::OP, A, init, dims, alloc=_MapReduceAllocator(A))
         is_inner_dim = ntuple(d->d in dims || size(A, d) == 1, ndims(A))
         outer = CartesianIndices(reduced_indices(A, dims))
     end
-    inner = mergeindices(is_inner_dim, CartesianIndices(A), outer)
+    inner = CartesianIndices(map((b,ax)->b ? ax : reduced_index(ax), is_inner_dim, axes(A)))
     n = length(inner)
     # Handle the empty and trivial 1-element cases:
     if (n == 0 || isempty(A))

From 07e11a2b32afad697341e5a377a069d27fe84299 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Wed, 28 May 2025 00:50:48 -0400
Subject: [PATCH 43/55] transducers for SkipMissing

---
 base/missing.jl | 90 +++++++++++++++++++++++++++++++++++++++++++++++--
 base/reduce.jl  |  1 -
 2 files changed, 87 insertions(+), 4 deletions(-)

diff --git a/base/missing.jl b/base/missing.jl
index 27d2eb03d2e9a..bd6dc2889e3cb 100644
--- a/base/missing.jl
+++ b/base/missing.jl
@@ -269,9 +269,93 @@ function show(io::IO, s::SkipMissing)
     print(io, ')')
 end
 
-# Simple optimization for mapreduce if the array cannot hold Missing
-mapreduce(f::F, op::G, itr::SkipMissing{<:AbstractArray}; init=Base._InitialValue(), dims=(:)) where {F,G} =
-    mapreducedim(f, op, eltype(itr.x) >: Missing ? itr : itr.x, init, dims)
+# Transducers for mapreduce; the hardest part is ensuring that the
+# every reduction chain starts correctly when `init` is not provided because
+# we need to call reduce_first on the first *non-missing* value.
+struct _SkipMissingReducer{F} <: Function
+    op::F
+end
+(smr::_SkipMissingReducer)(::Missing, ::Missing) = missing
+(smr::_SkipMissingReducer)(::Missing, y)         = y
+(smr::_SkipMissingReducer)(x,         ::Missing) = x
+(smr::_SkipMissingReducer)(x,         y)         = smr.op(x, y)
+
+struct _SkipMissingMapper{T,F} <: Function
+    f::F
+end
+_SkipMissingMapper{T}(f) where {T} = _SkipMissingMapper{T, typeof(f)}(f)
+(smm::_SkipMissingMapper)(::Missing) = missing
+(smm::_SkipMissingMapper{T})(x) where {T} = smm.f(x::T)
+
+mapreduce(f::F, op::G, itr::SkipMissing; init=Base._InitialValue()) where {F,G} = _mapreduce_skipmissing(f, op, itr, init)
+mapreduce(f::F, op::G, itr::SkipMissing{<:AbstractArray}; init=Base._InitialValue()) where {F,G} = _mapreduce_skipmissing_array(f, op, itr, init)
+
+# The case with an init is easy; just use transducers!
+function _mapreduce_skipmissing(f::F, op::G, itr, init) where {F,G}
+    (IteratorEltype(itr.x) === HasEltype() && !(eltype(itr.x) >: Missing)) && return mapreduce(f, op, itr.x; init)
+    v = mapreduce_pairwise(_SkipMissingMapper{eltype(itr)}(f), _SkipMissingReducer(op), itr.x, init)
+    return ismissing(v) ? mapreduce_empty(f, op, eltype(itr)) : v
+end
+# Cases without an init require an initial foldl step at the beginning of every chain. This is hard in general
+function _mapreduce_skipmissing(f::F, op::G, itr, init::_InitialValue) where {F,G}
+    (IteratorEltype(itr.x) === HasEltype() && !(eltype(itr.x) >: Missing)) && return mapreduce(f, op, itr.x; init)
+    return mapreduce_pairwise(f, op, itr, init)
+end
+
+# And for Arrays we can do better (either way!) by pretending all indices are available,
+# but ensuring that the kernels carefully init each chain
+function _mapreduce_skipmissing_array(f::F, op::G, itr, init) where {F,G}
+    (IteratorEltype(itr.x) === HasEltype() && !(eltype(itr.x) >: Missing)) && return mapreduce(f, op, itr.x; init)
+    # Ensure we either get an AbstractUnitRange from eachindex or fallback to a CartesianIndices
+    ei = eachindex(itr.x)
+    inds = ei isa AbstractUnitRange ? ei : CartesianIndices(itr.x)
+    # The transducers are still useful here; they ensure combined empty chains appropriately skip missing
+    v = mapreduce_pairwise(_SkipMissingMapper{eltype(itr)}(f), _SkipMissingReducer(op), itr, init, inds)
+    return ismissing(v) ? mapreduce_empty(f, op, eltype(itr)) : v
+end
+
+function mapreduce_kernel(f, op, itr::SkipMissing{<:AbstractArray}, init, inds::AbstractUnitRange)
+    i1, iN = first(inds), last(inds)
+    A = itr.x
+    i = i1; ai = missing
+    for outer i in i1:iN
+        ai = @inbounds A[i]
+        !ismissing(ai) && break
+    end
+    ismissing(ai) && return missing
+    v = _mapreduce_start(f, op, A, init, ai::eltype(itr))
+    i == typemax(typeof(i)) && return v
+    for i in i+1:iN
+        @inbounds ai = A[i]
+        if !ismissing(ai)
+            v = op(v, f(ai::eltype(itr)))
+        end
+    end
+    return v
+end
+
+function mapreduce_kernel(f, op, itr::SkipMissing{<:AbstractArray}, init, inds::CartesianIndices)
+    A = itr.x
+    it = iterate(inds)
+    local s
+    ai = missing
+    while it !== nothing
+        i, s = it
+        ai = @inbounds A[i]
+        !ismissing(ai) && break
+        it = iterate(inds, s)
+    end
+    ismissing(ai) && return missing
+    v = _mapreduce_start(f, op, A, init, ai::eltype(itr))
+    for i in Iterators.rest(inds, s)
+        @inbounds ai = A[i]
+        if !ismissing(ai)
+            v = op(v, f(ai::eltype(itr)))
+        end
+    end
+    return v
+end
+
 
 """
     filter(f, itr::SkipMissing{<:AbstractArray})
diff --git a/base/reduce.jl b/base/reduce.jl
index b64a8a8a53d1c..d3b98d82d02e5 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -478,7 +478,6 @@ mapreduce_pairwise(f::F, op::G, itr, init) where {F, G} = mapreduce_pairwise(f,
 function mapreduce_pairwise(f, op, itr, init, S::Union{HasLength, HasShape})
     n = length(itr)
     n < 1 && return _mapreduce_start(f, op, itr, init)
-    n <= 16 && return mapfoldl(f, op, itr; init)
     n <= pairwise_blocksize(f, op) && return mapreduce_kernel(f, op, itr, init, S, n)[1]
     return mapreduce_pairwise(f, op, itr, init, S, n)[1]
 end

From 144c2533588fc66c07d61635ef21fe23f6b4c398 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Wed, 28 May 2025 11:18:03 -0400
Subject: [PATCH 44/55] fewer transducers

---
 base/missing.jl | 20 ++++++--------------
 1 file changed, 6 insertions(+), 14 deletions(-)

diff --git a/base/missing.jl b/base/missing.jl
index bd6dc2889e3cb..6b648e98377c0 100644
--- a/base/missing.jl
+++ b/base/missing.jl
@@ -287,25 +287,17 @@ _SkipMissingMapper{T}(f) where {T} = _SkipMissingMapper{T, typeof(f)}(f)
 (smm::_SkipMissingMapper)(::Missing) = missing
 (smm::_SkipMissingMapper{T})(x) where {T} = smm.f(x::T)
 
-mapreduce(f::F, op::G, itr::SkipMissing; init=Base._InitialValue()) where {F,G} = _mapreduce_skipmissing(f, op, itr, init)
-mapreduce(f::F, op::G, itr::SkipMissing{<:AbstractArray}; init=Base._InitialValue()) where {F,G} = _mapreduce_skipmissing_array(f, op, itr, init)
-
-# The case with an init is easy; just use transducers!
-function _mapreduce_skipmissing(f::F, op::G, itr, init) where {F,G}
-    (IteratorEltype(itr.x) === HasEltype() && !(eltype(itr.x) >: Missing)) && return mapreduce(f, op, itr.x; init)
-    v = mapreduce_pairwise(_SkipMissingMapper{eltype(itr)}(f), _SkipMissingReducer(op), itr.x, init)
-    return ismissing(v) ? mapreduce_empty(f, op, eltype(itr)) : v
-end
-# Cases without an init require an initial foldl step at the beginning of every chain. This is hard in general
-function _mapreduce_skipmissing(f::F, op::G, itr, init::_InitialValue) where {F,G}
+function mapreduce(f::F, op::G, itr::SkipMissing; init=Base._InitialValue()) where {F,G}
     (IteratorEltype(itr.x) === HasEltype() && !(eltype(itr.x) >: Missing)) && return mapreduce(f, op, itr.x; init)
+    # Note that we _could_ use the transducers here in cases where `init` is provided, but the cost of
+    # iterating over the unstable `Union{Nothing, Tuple{Union{Missing, T}, IterState}}` is very high
     return mapreduce_pairwise(f, op, itr, init)
 end
 
-# And for Arrays we can do better (either way!) by pretending all indices are available,
+# For Arrays we can do better (either way!) by pretending all indices are available,
 # but ensuring that the kernels carefully init each chain
-function _mapreduce_skipmissing_array(f::F, op::G, itr, init) where {F,G}
-    (IteratorEltype(itr.x) === HasEltype() && !(eltype(itr.x) >: Missing)) && return mapreduce(f, op, itr.x; init)
+function mapreduce(f::F, op::G, itr::SkipMissing{<:AbstractArray}; init=Base._InitialValue()) where {F,G}
+    !(eltype(itr.x) >: Missing) && return mapreduce(f, op, itr.x; init)
     # Ensure we either get an AbstractUnitRange from eachindex or fallback to a CartesianIndices
     ei = eachindex(itr.x)
     inds = ei isa AbstractUnitRange ? ei : CartesianIndices(itr.x)

From ddc4502b0bc23fca982f2fa0758aaadcdaf1cd50 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Wed, 28 May 2025 11:43:35 -0400
Subject: [PATCH 45/55] fixup! Specialize function args to avoid allocations

---
 test/reduce.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/reduce.jl b/test/reduce.jl
index e4b2194b074a8..6631482a240d1 100644
--- a/test/reduce.jl
+++ b/test/reduce.jl
@@ -44,8 +44,8 @@ end
 let x = rand(10)
     @test 0 == @allocated(sum(Iterators.reverse(x)))
     @test 0 == @allocated(foldr(-, x))
-    @test 0 == @allocated(sum(A))
-    @test 0 == @allocated(sum(log, A))
+    @test 0 == @allocated(sum(x))
+    @test 0 == @allocated(sum(log, x))
 end
 
 # reduce

From 30c8afbc5e5085820c3ffad17fb5f17a9796b980 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Wed, 28 May 2025 13:16:28 -0400
Subject: [PATCH 46/55] be more careful about which dimensions are considered
 "inner" reductions

---
 base/reducedim.jl | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/base/reducedim.jl b/base/reducedim.jl
index e35b209506f1f..81aec43b5df0d 100644
--- a/base/reducedim.jl
+++ b/base/reducedim.jl
@@ -117,6 +117,17 @@ else
     _mapreduce_might_widen(_, _, _, _, _) = true
 end
 
+# When performing dimensional reductions over arrays with singleton dimensions, we have
+# a choice as to whether that singleton dimenion should be a part of the reduction or not;
+# it does not affect the output. It's advantageous to additionally consider _leading_ singleton
+# dimensions as part of the reduction as that allows more cases to be considered contiguous;
+# but once we've broken contiguity it's more helpful to _ignore_ those cases.
+compute_inner_dims(flagged_dims, source_size) =
+    flagged_dims[1] || source_size[1] == 1 ?
+        (true, compute_inner_dims(tail(flagged_dims), tail(source_size))...) :
+        (false, map((flag,sz)->flag && sz != 1, tail(flagged_dims), tail(source_size))...)
+compute_inner_dims(::Tuple{}, ::Tuple{}) = ()
+
 function mapreducedim(f::F, op::OP, A, init, dims, alloc=_MapReduceAllocator(A)) where {F, OP}
     if alloc isa _MapReduceInPlace
         # We can ignore dims and just trust the output array's axes. Note that the
@@ -128,9 +139,7 @@ function mapreducedim(f::F, op::OP, A, init, dims, alloc=_MapReduceAllocator(A))
         outer = CartesianIndices(ntuple(d->axes(alloc.A, d), ndims(A)))
         is_inner_dim = map(==(1), size(outer))
     else
-        # It's advantageous to additionally consider singleton dimensions as part of the
-        # reduction as that allows more cases to be considered contiguous
-        is_inner_dim = ntuple(d->d in dims || size(A, d) == 1, ndims(A))
+        is_inner_dim = compute_inner_dims(ntuple(d->d in dims, ndims(A)), size(A))
         outer = CartesianIndices(reduced_indices(A, dims))
     end
     inner = CartesianIndices(map((b,ax)->b ? ax : reduced_index(ax), is_inner_dim, axes(A)))

From b66af5539f043c52e9a9a47b065fb1aa1829f832 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 3 Jun 2025 01:06:25 -0400
Subject: [PATCH 47/55] only use mapreduce_first without initial values

and slightly improve perf for abstract iterables
---
 base/reduce.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index d3b98d82d02e5..7beae45042c0b 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -428,10 +428,10 @@ Perform the first step in a mapped reduction over `A` with 0 or one or more elem
 The one-element method may be called multiple times within a single reduction at
 the start of each new chain of `op` calls.
 """
-_mapreduce_start(f, op, A, ::_InitialValue) = mapreduce_empty(f, op, _empty_eltype(A))
-_mapreduce_start(f, op, A, ::_InitialValue, a1) = mapreduce_first(f, op, a1)
-_mapreduce_start(f, op, A, init) = init
-_mapreduce_start(f, op, A, init, a1) = op(init, mapreduce_first(f, op, a1))
+@inline _mapreduce_start(f, op, A, ::_InitialValue) = mapreduce_empty(f, op, _empty_eltype(A))
+@inline _mapreduce_start(f, op, A, ::_InitialValue, a1) = mapreduce_first(f, op, a1)
+@inline _mapreduce_start(f, op, A, init) = init
+@inline _mapreduce_start(f, op, A, init, a1) = op(init, f(a1))
 
 """
     mapreduce_pairwise(f, op, A, init)

From a729ea7f64df0348ef2e6e52328b831d6af25a64 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Thu, 29 May 2025 13:07:39 -0400
Subject: [PATCH 48/55] skip skipmissing transducers entirely

they ended up losing type stability and it isn't hard to just define our own pairwise impl
---
 base/missing.jl | 39 +++++++++++++--------------------------
 test/reduce.jl  |  8 ++++----
 2 files changed, 17 insertions(+), 30 deletions(-)

diff --git a/base/missing.jl b/base/missing.jl
index 6b648e98377c0..2998f979196f0 100644
--- a/base/missing.jl
+++ b/base/missing.jl
@@ -269,42 +269,29 @@ function show(io::IO, s::SkipMissing)
     print(io, ')')
 end
 
-# Transducers for mapreduce; the hardest part is ensuring that the
-# every reduction chain starts correctly when `init` is not provided because
-# we need to call reduce_first on the first *non-missing* value.
-struct _SkipMissingReducer{F} <: Function
-    op::F
-end
-(smr::_SkipMissingReducer)(::Missing, ::Missing) = missing
-(smr::_SkipMissingReducer)(::Missing, y)         = y
-(smr::_SkipMissingReducer)(x,         ::Missing) = x
-(smr::_SkipMissingReducer)(x,         y)         = smr.op(x, y)
-
-struct _SkipMissingMapper{T,F} <: Function
-    f::F
-end
-_SkipMissingMapper{T}(f) where {T} = _SkipMissingMapper{T, typeof(f)}(f)
-(smm::_SkipMissingMapper)(::Missing) = missing
-(smm::_SkipMissingMapper{T})(x) where {T} = smm.f(x::T)
-
+# Simple optimization: catch skipmissings that wrap iterators that don't include Missing
 function mapreduce(f::F, op::G, itr::SkipMissing; init=Base._InitialValue()) where {F,G}
     (IteratorEltype(itr.x) === HasEltype() && !(eltype(itr.x) >: Missing)) && return mapreduce(f, op, itr.x; init)
-    # Note that we _could_ use the transducers here in cases where `init` is provided, but the cost of
-    # iterating over the unstable `Union{Nothing, Tuple{Union{Missing, T}, IterState}}` is very high
     return mapreduce_pairwise(f, op, itr, init)
 end
 
-# For Arrays we can do better (either way!) by pretending all indices are available,
-# but ensuring that the kernels carefully init each chain
-function mapreduce(f::F, op::G, itr::SkipMissing{<:AbstractArray}; init=Base._InitialValue()) where {F,G}
-    !(eltype(itr.x) >: Missing) && return mapreduce(f, op, itr.x; init)
+function mapreduce_pairwise(f::F, op::G, itr::SkipMissing{<:AbstractArray}, init) where {F,G}
     # Ensure we either get an AbstractUnitRange from eachindex or fallback to a CartesianIndices
     ei = eachindex(itr.x)
     inds = ei isa AbstractUnitRange ? ei : CartesianIndices(itr.x)
-    # The transducers are still useful here; they ensure combined empty chains appropriately skip missing
-    v = mapreduce_pairwise(_SkipMissingMapper{eltype(itr)}(f), _SkipMissingReducer(op), itr, init, inds)
+    v = mapreduce_pairwise(f, op, itr, init, inds)
     return ismissing(v) ? mapreduce_empty(f, op, eltype(itr)) : v
 end
+function mapreduce_pairwise(f::F, op::G, A::SkipMissing{<:AbstractArray}, init, inds) where {F,G}
+    if length(inds) <= max(10, pairwise_blocksize(f, op))
+        return mapreduce_kernel(f, op, A, init, inds)
+    else
+        p1, p2 = halves(inds)
+        v1 = mapreduce_pairwise(f, op, A, init, p1)
+        v2 = mapreduce_pairwise(f, op, A, init, p2)
+        return ismissing(v1) ? v2 : ismissing(v2) ? v1 : op(v1, v2)
+    end
+end
 
 function mapreduce_kernel(f, op, itr::SkipMissing{<:AbstractArray}, init, inds::AbstractUnitRange)
     i1, iN = first(inds), last(inds)
diff --git a/test/reduce.jl b/test/reduce.jl
index 6631482a240d1..82e3844b71bcd 100644
--- a/test/reduce.jl
+++ b/test/reduce.jl
@@ -787,13 +787,13 @@ end
 
     @test (@inferred Missing mapreduce(identity, (x,y)->Base.add_sum(x, y)/1, skipmissing(view([1.0,2.0,missing],1:2)))) == 3.0
     @test (@inferred Missing mapreduce(identity, (x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)), init=0.0)) == 3.0
-    @test (@inferred Union{Missing,Int} mapreduce(identity, (x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)), init=0)) == 3.0
-    @test (@inferred Union{Missing,Int} mapreduce(identity, (x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)))) == 3.0
+    @test (@inferred Int mapreduce(identity, (x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)), init=0)) == 3.0
+    @test (@inferred Int mapreduce(identity, (x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)))) == 3.0
 
     @test (@inferred Missing reduce((x,y)->Base.add_sum(x, y)/1, skipmissing(view([1.0,2.0,missing],1:2)))) == 3.0
     @test (@inferred Missing reduce((x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)), init=0.0)) == 3.0
-    @test (@inferred Union{Missing,Int} reduce((x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)), init=0)) == 3.0
-    @test (@inferred Union{Missing,Int} reduce((x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)))) == 3.0
+    @test (@inferred Int reduce((x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)), init=0)) == 3.0
+    @test (@inferred Int reduce((x,y)->Base.add_sum(x, y)/1, skipmissing(view([1,2,missing],1:2)))) == 3.0
 end
 
 linear(x) = (@assert(IndexStyle(x) == IndexLinear()); x)

From 13ee1b5858ce0fc4deafd215c10c1d2b9e8ccacd Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Thu, 29 May 2025 17:46:33 -0400
Subject: [PATCH 49/55] use distinct skipmissing functions

they are just different enough to break some assumptions about this function
---
 base/missing.jl | 32 +++++++++++++++-----------------
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/base/missing.jl b/base/missing.jl
index 2998f979196f0..376118133d572 100644
--- a/base/missing.jl
+++ b/base/missing.jl
@@ -276,45 +276,43 @@ function mapreduce(f::F, op::G, itr::SkipMissing; init=Base._InitialValue()) whe
 end
 
 function mapreduce_pairwise(f::F, op::G, itr::SkipMissing{<:AbstractArray}, init) where {F,G}
-    # Ensure we either get an AbstractUnitRange from eachindex or fallback to a CartesianIndices
-    ei = eachindex(itr.x)
-    inds = ei isa AbstractUnitRange ? ei : CartesianIndices(itr.x)
-    v = mapreduce_pairwise(f, op, itr, init, inds)
-    return ismissing(v) ? mapreduce_empty(f, op, eltype(itr)) : v
+    v = mapreduce_skipmissing_pairwise(f, op, itr.x, init, eachindex(itr.x))
+    return ismissing(v) ? _mapreduce_start(f, op, itr, init) : v
 end
-function mapreduce_pairwise(f::F, op::G, A::SkipMissing{<:AbstractArray}, init, inds) where {F,G}
+
+# This is based on mapreduce_pairwise, but with special sauce that allows one or both of the pairwise splits
+# to not process any elements; returning `missing` as the sentinel in such a situation
+function mapreduce_skipmissing_pairwise(f::F, op::G, A, init, inds) where {F,G}
     if length(inds) <= max(10, pairwise_blocksize(f, op))
-        return mapreduce_kernel(f, op, A, init, inds)
+        return mapreduce_skipmissing_kernel(f, op, A, init, inds)
     else
         p1, p2 = halves(inds)
-        v1 = mapreduce_pairwise(f, op, A, init, p1)
-        v2 = mapreduce_pairwise(f, op, A, init, p2)
+        v1 = mapreduce_skipmissing_pairwise(f, op, A, init, p1)
+        v2 = mapreduce_skipmissing_pairwise(f, op, A, init, p2)
         return ismissing(v1) ? v2 : ismissing(v2) ? v1 : op(v1, v2)
     end
 end
 
-function mapreduce_kernel(f, op, itr::SkipMissing{<:AbstractArray}, init, inds::AbstractUnitRange)
+function mapreduce_skipmissing_kernel(f, op, A, init, inds::AbstractUnitRange)
     i1, iN = first(inds), last(inds)
-    A = itr.x
     i = i1; ai = missing
     for outer i in i1:iN
         ai = @inbounds A[i]
         !ismissing(ai) && break
     end
     ismissing(ai) && return missing
-    v = _mapreduce_start(f, op, A, init, ai::eltype(itr))
+    v = _mapreduce_start(f, op, A, init, ai)
     i == typemax(typeof(i)) && return v
     for i in i+1:iN
         @inbounds ai = A[i]
         if !ismissing(ai)
-            v = op(v, f(ai::eltype(itr)))
+            v = op(v, f(ai))
         end
     end
     return v
 end
 
-function mapreduce_kernel(f, op, itr::SkipMissing{<:AbstractArray}, init, inds::CartesianIndices)
-    A = itr.x
+function mapreduce_skipmissing_kernel(f, op, A, init, inds)
     it = iterate(inds)
     local s
     ai = missing
@@ -325,11 +323,11 @@ function mapreduce_kernel(f, op, itr::SkipMissing{<:AbstractArray}, init, inds::
         it = iterate(inds, s)
     end
     ismissing(ai) && return missing
-    v = _mapreduce_start(f, op, A, init, ai::eltype(itr))
+    v = _mapreduce_start(f, op, A, init, ai)
     for i in Iterators.rest(inds, s)
         @inbounds ai = A[i]
         if !ismissing(ai)
-            v = op(v, f(ai::eltype(itr)))
+            v = op(v, f(ai))
         end
     end
     return v

From e6948ec4f53d257ab7bee91a28071504f8f6b1fe Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Fri, 30 May 2025 14:48:58 -0400
Subject: [PATCH 50/55] improve inference for unstable HasLength iters

---
 base/reduce.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index 7beae45042c0b..2e0722ee4ad02 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -571,7 +571,9 @@ function mapreduce_kernel(f, op, itr, init, ::Union{HasLength, HasShape}, n, sta
     a1, s = iterate(itr, state...)
     v = _mapreduce_start(f, op, itr, init, a1)
     @simd for _ in 2:n
-        a, s = iterate(itr, s)
+        it = iterate(itr, s)
+        it === nothing && return v, s # This will only happen if an iterator lied about its length
+        a, s = it
         v = op(v, f(a))
     end
     return v, s

From 8ec7b417bf1ae5b0a2b3c440a2aa6958b734b837 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Fri, 30 May 2025 15:16:43 -0400
Subject: [PATCH 51/55] add tuned quick outs

---
 base/reduce.jl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index 2e0722ee4ad02..92c6ed4f9abce 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -470,15 +470,15 @@ base case that's essentially a `mapfoldl` that allows for further SIMD-like opti
 and reassociations.
 """
 function mapreduce_pairwise(f::F, op::G, A::AbstractArrayOrBroadcasted, init) where {F, G}
-    isempty(A) && return _mapreduce_start(f, op, A, init)
-    length(A) <= pairwise_blocksize(f, op) && return mapreduce_kernel(f, op, A, init, eachindex(A))
+    n = length(A)
+    n <= 16 && return mapfoldl(f, op, A; init) # The overhead of SIMD is more expensive than a straight loop
+    n <= pairwise_blocksize(f, op) && return mapreduce_kernel(f, op, A, init, eachindex(A))
     return mapreduce_pairwise(f, op, A, init, eachindex(A))
 end
 mapreduce_pairwise(f::F, op::G, itr, init) where {F, G} = mapreduce_pairwise(f, op, itr, init, IteratorSize(itr)) 
 function mapreduce_pairwise(f, op, itr, init, S::Union{HasLength, HasShape})
     n = length(itr)
-    n < 1 && return _mapreduce_start(f, op, itr, init)
-    n <= pairwise_blocksize(f, op) && return mapreduce_kernel(f, op, itr, init, S, n)[1]
+    n <= pairwise_blocksize(f, op) && return mapfoldl(f, op, itr; init) # Iterators typically won't SIMD
     return mapreduce_pairwise(f, op, itr, init, S, n)[1]
 end
 function mapreduce_pairwise(f::F, op::G, itr, init, S::IteratorSize) where {F, G}

From 427bbb7cfa73555774fed1f1cfbdd58ccc10bcdd Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 3 Jun 2025 01:33:10 -0400
Subject: [PATCH 52/55] simplify bootstrap; avoid vcat early in bootstrap order

---
 base/intfuncs.jl | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/base/intfuncs.jl b/base/intfuncs.jl
index 4116f1631b724..f361320cf6c27 100644
--- a/base/intfuncs.jl
+++ b/base/intfuncs.jl
@@ -827,7 +827,7 @@ function oct(x::Unsigned, pad::Int, neg::Bool)
 end
 
 # 2-digit decimal characters ("00":"99")
-const _dec_d100 = UInt16[
+const _dec_d100 = [
 # generating expression: UInt16[(0x30 + i % 10) << 0x8 + (0x30 + i ÷ 10) for i = 0:99]
 #    0 0,    0 1,    0 2,    0 3, and so on in little-endian
   0x3030, 0x3130, 0x3230, 0x3330, 0x3430, 0x3530, 0x3630, 0x3730, 0x3830, 0x3930,
@@ -917,8 +917,16 @@ function hex(x::Unsigned, pad::Int, neg::Bool)
     unsafe_takestring(a)
 end
 
-const base36digits = UInt8['0':'9';'a':'z']
-const base62digits = UInt8['0':'9';'A':'Z';'a':'z']
+# UInt8['0':'9';'a':'z']
+const base36digits = [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+    0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a]
+# UInt8['0':'9';'A':'Z';'a':'z']
+const base62digits = [0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+    0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,
+    0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a,
+    0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,
+    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a]
 
 function _base(base::Integer, x::Integer, pad::Int, neg::Bool)
     (x >= 0) | (base < 0) || throw(DomainError(x, "For negative `x`, `base` must be negative."))

From 5f8c6a37b9ca9f22824cc60ef3af56dec8ee126a Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Tue, 3 Jun 2025 00:39:27 -0400
Subject: [PATCH 53/55] skip at-simd in favor of explicit re-commutating

---
 base/cartesian.jl         |   2 +-
 base/multidimensional.jl  | 169 ++++++++++++++++++++++++++++++++++++--
 base/permuteddimsarray.jl |   3 +-
 base/reduce.jl            | 105 ++++++++++++++---------
 4 files changed, 230 insertions(+), 49 deletions(-)

diff --git a/base/cartesian.jl b/base/cartesian.jl
index ca0fc0aac0cfc..faf8b82835776 100644
--- a/base/cartesian.jl
+++ b/base/cartesian.jl
@@ -402,7 +402,7 @@ function exprresolve_conditional(ex::Expr)
                 return true, exprresolve_cond_dict[callee](ex.args[2], ex.args[3])
             end
         end
-    elseif Meta.isexpr(ex, :block, 2) && ex.args[1] isa LineNumberNode
+    elseif ex.head === :block && length(ex.args) == 2 && ex.args[1] isa LineNumberNode
         return exprresolve_conditional(ex.args[2])
     end
     false, false
diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index d37719567fd50..37433f265a18f 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -1049,16 +1049,15 @@ end
 # Reduction support for CartesianIndices
 halves(inds::CartesianIndices) = map(CartesianIndices∘tuple, _halves(inds.indices)...)
 function mapreduce_kernel(f, op, A, init, inds::CartesianIndices{N}) where {N}
-    N == 0 && return init===_InitialValue() ? mapreduce_first(f, op, only(A)) : op(init, f(only(A)))
+    N == 0 && return _mapreduce_start(f, op, A, init, A[inds[]])
     N == 1 && return mapreduce_kernel(f, op, A, init, inds.indices[1])
-    i1, s = iterate(inds)
-    a1 = @inbounds A[i1]
-    v = _mapreduce_start(f, op, A, init, a1)
+    is_commutative_op(op) && return mapreduce_kernel_commutative(f, op, A, init, inds)
     r = inds.indices[1]
     if length(r) == 1
-        # SIMD over a one-element loop is less-than-helpful; just iterate
-        # over the rest of the indices without worrying about splitting out
-        # an inner SIMD loop
+        # A one-element inner loop is less-than-helpful
+        i1, s = iterate(inds)
+        a1 = @inbounds A[i1]
+        v = _mapreduce_start(f, op, A, init, a1)
         for i in Iterators.rest(inds, s)
             ai = @inbounds A[i]
             v = op(v, f(ai))
@@ -1068,12 +1067,13 @@ function mapreduce_kernel(f, op, A, init, inds::CartesianIndices{N}) where {N}
         # in the first iteration of the outer loop
         outer = CartesianIndices(tail(inds.indices))
         o1, so = iterate(outer)
-        @simd for i in r[begin+1:end]
+        v = _mapreduce_start(f, op, A, init, A[r[begin], o1])
+        for i in r[begin+1:end]
             ai = @inbounds A[i, o1]
             v = op(v, f(ai))
         end
         for o in Iterators.rest(outer, so)
-            @simd for i in r
+            for i in r
                 ai = @inbounds A[i, o]
                 v = op(v, f(ai))
             end
@@ -1082,6 +1082,157 @@ function mapreduce_kernel(f, op, A, init, inds::CartesianIndices{N}) where {N}
     return v
 end
 
+function mapreduce_kernel_commutative(f, op, A, init, inds::AbstractArray)
+    if length(inds) < 16
+        i1, iN = firstindex(inds), lastindex(inds)
+        v_1 = _mapreduce_start(f, op, A, init, @inbounds A[inds[i1]])
+        for i in i1+1:iN
+            a = @inbounds A[inds[i]]
+            v_1 = op(v_1, f(a))
+        end
+        return v_1
+    end
+    return _mapreduce_kernel_commutative(f, op, A, init, inds)
+end
+
+# This special internal method must have at least 4 indices and allows passing
+# optional scalar leading and trailing dimensions
+function _mapreduce_kernel_commutative(f, op, A, init, inds, leading=(), trailing=())
+    i1, iN = firstindex(inds), lastindex(inds)
+    n = length(inds)
+    @nexprs 4 N->a_N = @inbounds A[leading..., inds[i1+(N-1)], trailing...]
+    @nexprs 4 N->v_N = _mapreduce_start(f, op, A, init, a_N)
+    for batch in 1:(n>>2)-1
+        i = i1 + batch*4
+        @nexprs 4 N->a_N = @inbounds A[leading..., inds[i+(N-1)], trailing...]
+        @nexprs 4 N->fa_N = f(a_N)
+        @nexprs 4 N->v_N = op(v_N, fa_N)
+    end
+    v = op(op(v_1, v_2), op(v_3, v_4))
+    i = i1 + (n>>2)*4 - 1
+    i == iN && return v
+    for i in i+1:iN
+        ai = @inbounds A[leading..., inds[i], trailing...]
+        v = op(v, f(ai))
+    end
+    return v
+end
+
+function mapreduce_kernel_commutative(f::F, op::G, A, init, inds::CartesianIndices{N}) where {N,F,G}
+    N == 0 && return _mapreduce_start(f, op, A, init, A[inds[]])
+    N == 1 && return mapreduce_kernel_commutative(f, op, A, init, inds.indices[1])
+    is = inds.indices[1]
+    js = inds.indices[2]
+    if length(is) == 1 && length(js) >= 4
+        # It's quite useful to optimize this case for dimensional reductions
+        i = only(is)
+        outer = CartesianIndices(tail(tail(inds.indices)))
+        o1, s = iterate(outer)
+        v = _mapreduce_kernel_commutative(f, op, A, init, js, (i,), o1.I)
+        for o in Iterators.rest(outer, s)
+            v = op(v, _mapreduce_kernel_commutative(f, op, A, init, js, (i,), o.I))
+        end
+        return v
+    elseif length(is) < 4 # TODO: tune this number
+        # These small cases could be further optimized
+        return mapreduce_kernel_commutative(i->f(A[i]), op, inds, init, HasShape{N}(), length(inds))[1]
+    else
+        outer = CartesianIndices(tail(inds.indices))
+        o1, s = iterate(outer)
+        v = _mapreduce_kernel_commutative(f, op, A, init, is, (), o1.I)
+        for o in Iterators.rest(outer, s)
+            v = op(v, _mapreduce_kernel_commutative(f, op, A, init, is, (), o.I))
+        end
+        return v
+    end
+end
+
+@noinline _throw_iterator_assertion_error() = throw(AssertionError("iterator promised a length longer than it iterates"))
+function mapreduce_kernel_commutative(f, op, itr, init, ::Union{HasLength, HasShape}, n, state...)
+    it = iterate(itr, state...)
+    it === nothing && _throw_iterator_assertion_error()
+    a, s = it
+    v_1 = _mapreduce_start(f, op, itr, init, a)
+    if n < 16
+        for _ in 2:n
+            it = iterate(itr, s)
+            it === nothing && _throw_iterator_assertion_error()
+            a, s = it
+            v_1 = op(v_1, f(a))
+        end
+        return v_1, s
+    end
+    @nexprs 3 n->begin
+        it = iterate(itr, s)
+        it === nothing && _throw_iterator_assertion_error()
+        a, s = it
+        v_{n+1} = _mapreduce_start(f, op, itr, init, a)
+    end
+    i = 4
+    for outer i in 8:4:n
+        @nexprs 4 n->begin
+            it = iterate(itr, s)
+            it === nothing && _throw_iterator_assertion_error()
+            a_n, s = it
+        end
+        @nexprs 4 n-> fa_n = f(a_n)
+        @nexprs 4 n-> v_n = op(v_n, fa_n)
+    end
+    v = op(op(v_1, v_2), op(v_3, v_4))
+    for _ in i+1:n
+        it = iterate(itr, s)
+        it === nothing && _throw_iterator_assertion_error()
+        a, s = it
+        v = op(v, f(a))
+    end
+    return v, s
+end
+function mapreduce_kernel_commutative(f, op, itr, init, ::IteratorSize, n, state...)
+    it = iterate(itr, state...)
+    it === nothing && return nothing
+    a, s = it
+    v_1 = _mapreduce_start(f, op, itr, init, a)
+    it = iterate(itr, s)
+    it === nothing && return Some(v_1)
+    a, s = it
+    v_2 = _mapreduce_start(f, op, itr, init, a)
+    it = iterate(itr, s)
+    it === nothing && return Some(op(v_1, v_2))
+    a, s = it
+    v_3 = _mapreduce_start(f, op, itr, init, a)
+    it = iterate(itr, s)
+    it === nothing && return Some(op(op(v_1, v_2), v_3))
+    a, s = it
+    v_4 = _mapreduce_start(f, op, itr, init, a)
+    for _ in 2:n>>2
+        @nexprs 4 N->begin
+            it = iterate(itr, s)
+            if it === nothing
+                N > 3 && (v_3 = op(v_3, f(a_3)))
+                N > 2 && (v_2 = op(v_2, f(a_2)))
+                N > 1 && (v_1 = op(v_1, f(a_1)))
+                return Some(op(op(v_1, v_2), op(v_3, v_4)))
+            end
+            a_N, s = it
+        end
+        @nexprs 4 N->fa_N = f(a_N)
+        @nexprs 4 N->v_N = op(v_N, fa_N)
+    end
+    v = op(op(v_1, v_2), op(v_3, v_4))
+    i = (n>>2)*4
+    @nexprs 4 N->begin
+        it = iterate(itr, s)
+        if it === nothing
+            return Some(v)
+        elseif n < i+N
+            return (v, s)
+        end
+        a, s = it
+        v = op(v, f(a))
+    end
+    return (v, s)
+end
+
 diff(a::AbstractVector) = diff(a, dims=1)
 
 """
diff --git a/base/permuteddimsarray.jl b/base/permuteddimsarray.jl
index b95a5821e91d3..542e9561a6ec0 100644
--- a/base/permuteddimsarray.jl
+++ b/base/permuteddimsarray.jl
@@ -343,8 +343,7 @@ end
     P
 end
 
-const CommutativeOps = Union{typeof(+),typeof(Base.add_sum),typeof(min),typeof(max),typeof(Base._extrema_rf),typeof(|),typeof(&)}
-
+using Base: CommutativeOps
 function Base.mapreducedim(f, op::CommutativeOps, A::PermutedDimsArray, init, dims::Colon)
     Base.mapreducedim(f, op, parent(A), init, dims)
 end
diff --git a/base/reduce.jl b/base/reduce.jl
index 92c6ed4f9abce..fc1e2d9359861 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -435,30 +435,39 @@ the start of each new chain of `op` calls.
 
 """
     mapreduce_pairwise(f, op, A, init)
-    mapreduce_pairwise(f, op, A, init, indices)
-    mapreduce_pairwise(f, op, itr, init, ::Union{HasLength, HasShape}, n, [state])
-    mapreduce_pairwise(f, op, itr, init, ::IteratorSize, n, valstate)
+    mapreduce_pairwise(f, op, A, init, indices::AbstractArray)
+    mapreduce_pairwise(f, op, itr, init, ::Union{HasLength, HasShape})
+    mapreduce_pairwise(f, op, itr, init, ::IteratorSize)
 
 Perform a pairwise reassociation of the mapped reduction with `f` and `op`
 Returns the result `v` or, for the 6+ argument iterator methods, a tuple `(v, state)` or `(v, iterate(itr, state))`
 
-The four-argument entry-point handles the empty case and dispatches to a polyalgorithm
+These entry-points handle the empty case and dispatches to a polyalgorithm
 with three distinct mechanisms for accessing elements and structuring the subsequent
-recursion; all such recursive calls **must** process at least one element.
+recursion; the methods with length must process at least one element.
+
+    mapreduce_pairwise(f, op, A, init, indices::AbstractArray)
 
     * An indexing method recursively splits `indices` until there are fewer than
       `pairwise_blocksize` indices to process. CartesianIndices split into halves
       across columns first, then within a single column (if necessary).
+
+    mapreduce_pairwise(f, op, itr, init, ::Union{HasLength, HasShape}, n, [state]) -> (value, state)
+
     * An iterable method (with length) recursively takes half as many elements from
       the iterable until `n` is less than or equal to the `pairwise_blocksize`. This
       is slightly more complicated than the indexing case as we must additionally pass
       (and return) the iterable's state along with the result of each part of the
       reduction.
-    * An iterable method (without length) peels one element (and state) in advance of
-      every recursive call to handle the empty case and ensure that all recursive calls
-      process at least one element. This is slightly more complicated than the length
-      case as we must pass the iterable's next `(val, state)` to each recursive call
-      and similarly return it as the second part of the internal return tuple.
+
+    mapreduce_pairwise(f, op, itr, init, ::IteratorSize, n, [state]) -> (value, state) | nothing | Some(value)
+
+    * An iterable method (without length) that — unlike the above two methods — might
+      not process any values (because we don't know if there are values in the iterable).
+      This is slightly more complicated than the length case as there are three possibilities:
+        - there may not be any values left in the iterator at all (returning `nothing`)
+        - there may have been _some_ values but the iterator was exhausted (returning `Some` value)
+        - there may be more values left, returning a `(value, state)` tuple
       Additionally this algorithm effectively works backwards from the other two;
       instead of starting at the *final* `op` reduction of the (possibly many) branches
       of reduction chains, this starts at the initial chain of length **up to** the
@@ -471,24 +480,34 @@ and reassociations.
 """
 function mapreduce_pairwise(f::F, op::G, A::AbstractArrayOrBroadcasted, init) where {F, G}
     n = length(A)
-    n <= 16 && return mapfoldl(f, op, A; init) # The overhead of SIMD is more expensive than a straight loop
+    n == 0 && return _mapreduce_start(f, op, A, init)
     n <= pairwise_blocksize(f, op) && return mapreduce_kernel(f, op, A, init, eachindex(A))
     return mapreduce_pairwise(f, op, A, init, eachindex(A))
 end
 mapreduce_pairwise(f::F, op::G, itr, init) where {F, G} = mapreduce_pairwise(f, op, itr, init, IteratorSize(itr)) 
 function mapreduce_pairwise(f, op, itr, init, S::Union{HasLength, HasShape})
     n = length(itr)
-    n <= pairwise_blocksize(f, op) && return mapfoldl(f, op, itr; init) # Iterators typically won't SIMD
+    n == 0 && return _mapreduce_start(f, op, itr, init)
+    # Note that there can be a significant advantage to the for-loop lowering of mapfoldl
+    n <= pairwise_blocksize(f, op) && return mapreduce_kernel(f, op, itr, init, S, n)[1]
     return mapreduce_pairwise(f, op, itr, init, S, n)[1]
 end
+# There are three possible return values for partial reductions of SizeUnknown
+# No values -> return nothing
+# Some values, but ended -> return Some(value)
+# Some values, more to come -> return value, state
 function mapreduce_pairwise(f::F, op::G, itr, init, S::IteratorSize) where {F, G}
-    it = iterate(itr)
-    it === nothing && return _mapreduce_start(f, op, itr, init)
     n = pairwise_blocksize(f, op)
-    v, it = mapreduce_kernel(f, op, itr, init, S, n, it)
-    while it !== nothing
+    ret = mapreduce_kernel(f, op, itr, init, S, n)
+    ret === nothing && return _mapreduce_start(f, op, itr, init)
+    ret isa Some && return something(ret)
+    v, s = ret
+    while n < prevpow(2, typemax(n))
         n <<= 1
-        v1, it = mapreduce_pairwise(f, op, itr, init, S, n, it)
+        ret = mapreduce_pairwise(f, op, itr, init, S, n, s)
+        ret === nothing && return v
+        ret isa Some && return op(v, something(ret))
+        v1, s = ret
         v = op(v, v1)
     end
     return v
@@ -517,7 +536,7 @@ _halves(::Tuple{}) = ()
 _wholes(r) = r[begin:end], r[begin:end]
 
 function mapreduce_pairwise(f::F, op::G, itr, init, S::Union{HasLength,HasShape}, n, state...) where {F,G}
-    if n < max(10, pairwise_blocksize(f, op))
+    if n <= max(10, pairwise_blocksize(f, op))
         return mapreduce_kernel(f, op, itr, init, S, n, state...)
     else
         ndiv2 = n >> 1
@@ -526,24 +545,27 @@ function mapreduce_pairwise(f::F, op::G, itr, init, S::Union{HasLength,HasShape}
         return (op(v1, v2), s)
     end
 end
-function mapreduce_pairwise(f::F, op::G, itr, init, S::IteratorSize, n, it) where {F,G}
+function mapreduce_pairwise(f::F, op::G, itr, init, S::IteratorSize, n, state...) where {F,G}
     if n <= max(10, pairwise_blocksize(f, op))
-        v, it = mapreduce_kernel(f, op, itr, init, S, n, it)
-        return it === nothing ? (v, nothing) : (v, it)
+        return mapreduce_kernel(f, op, itr, init, S, n, state...)
     else
         ndiv2 = n >> 1
-        v1, it = mapreduce_pairwise(f, op, itr, init, S, ndiv2, it)
-        it === nothing && return (v1, nothing)
-        v2, it = mapreduce_pairwise(f, op, itr, init, S, n-ndiv2, it)
-        v = op(v1, v2)
-        return it === nothing ? (v, nothing) : (v, it)
+        ret = mapreduce_pairwise(f, op, itr, init, S, ndiv2, state...)
+        ret === nothing && return nothing
+        ret isa Some && return ret
+        v1, s = ret
+        ret = mapreduce_pairwise(f, op, itr, init, S, n-ndiv2, s)
+        ret === nothing && return Some(v1)
+        ret isa Some && return Some(op(v1, something(ret)))
+        v2, s = ret
+        return (op(v1, v2), s)
     end
 end
 
 """
-    mapreduce_kernel(f, op, A, init, inds) -> v
+    mapreduce_kernel(f, op, A, init, inds::AbstractArray) -> v
     mapreduce_kernel(f, op, itr, init, ::Union{HasLength, HasShape}, n, [state]) -> (v, state)
-    mapreduce_kernel(f, op, itr, init, ::IteratorSize, n, valstate) -> (v, iterate(itr, state))
+    mapreduce_kernel(f, op, itr, init, ::IteratorSize, n, [state]) -> (v, state) | nothing | Some(v)
 
 Perform the mapped reduction with `f` and `op` akin to a `mapfoldl` but permitting SIMD-like
 optimizations and reassociations. Corresponding to `mapreduce_pairwise`, this function uses
@@ -557,20 +579,22 @@ three (and a half) different mechanisms for accessing elements and structuring i
     This method additionally takes (and returns as the second element in a tuple) the *next* `(val, state)`
     tuple from the iterator or `nothing`
 """
-function mapreduce_kernel(f, op, A, init, inds)
+function mapreduce_kernel(f::F, op::G, A, init, inds::AbstractArray) where {F, G}
+    is_commutative_op(op) && return mapreduce_kernel_commutative(f, op, A, init, inds)
     a1 = @inbounds A[inds[begin]]
     v = _mapreduce_start(f, op, A, init, a1)
     length(inds) == 1 && return v
-    @simd for i in inds[begin+1:end]
+    for i in inds[begin+1:end]
         a = @inbounds A[i]
         v = op(v, f(a))
     end
     return v
 end
-function mapreduce_kernel(f, op, itr, init, ::Union{HasLength, HasShape}, n, state...)
+function mapreduce_kernel(f::F, op::G, itr, init, S::Union{HasLength, HasShape}, n, state...) where {F, G}
+    is_commutative_op(op) && return mapreduce_kernel_commutative(f, op, itr, init, S, n, state...)
     a1, s = iterate(itr, state...)
     v = _mapreduce_start(f, op, itr, init, a1)
-    @simd for _ in 2:n
+    for _ in 2:n
         it = iterate(itr, s)
         it === nothing && return v, s # This will only happen if an iterator lied about its length
         a, s = it
@@ -578,17 +602,19 @@ function mapreduce_kernel(f, op, itr, init, ::Union{HasLength, HasShape}, n, sta
     end
     return v, s
 end
-function mapreduce_kernel(f, op, itr, init,  ::IteratorSize, n, it)
+function mapreduce_kernel(f::F, op::G, itr, init, S::IteratorSize, n, state...) where {F, G}
+    is_commutative_op(op) && return mapreduce_kernel_commutative(f, op, itr, init, S, n, state...)
+    it = iterate(itr, state...)
+    it === nothing && return nothing
     a1, s = it
     v = _mapreduce_start(f, op, itr, init, a1)
-    @simd for _ in 2:n
+    for _ in 2:n
         it = iterate(itr, s)
-        it === nothing && return v, nothing
+        it === nothing && return Some(v)
         a, s = it
         v = op(v, f(a))
     end
-    it = iterate(itr, s)
-    return it === nothing ? (v, nothing) : (v, it)
+    return (v, s)
 end
 
 """
@@ -1281,3 +1307,8 @@ for (fred, f) in ((maximum, max), (minimum, min), (sum, add_sum))
         isempty(r) ? init : op(init, $fred(r))
     end
 end
+
+# Reduction kernels that explicitly encode simplistic SIMD-ish reorderings
+const CommutativeOps = Union{typeof(+),typeof(Base.add_sum),typeof(min),typeof(max),typeof(Base._extrema_rf),typeof(|),typeof(&)}
+is_commutative_op(::CommutativeOps) = true
+is_commutative_op(::Any) = false

From 264b8dbea15bcd150919121d1c078e24c213ef03 Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Thu, 26 Jun 2025 16:29:31 -0400
Subject: [PATCH 54/55] fixup! Specialize function args to avoid allocations

---
 base/reduce.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/base/reduce.jl b/base/reduce.jl
index 63e44f9c92826..d5d111f7a900f 100644
--- a/base/reduce.jl
+++ b/base/reduce.jl
@@ -486,7 +486,7 @@ function mapreduce_pairwise(f::F, op::G, A::AbstractArrayOrBroadcasted, init) wh
     n <= pairwise_blocksize(f, op) && return mapreduce_kernel(f, op, A, init, eachindex(A))
     return mapreduce_pairwise(f, op, A, init, eachindex(A))
 end
-mapreduce_pairwise(f::F, op::G, itr, init) where {F, G} = mapreduce_pairwise(f, op, itr, init, IteratorSize(itr)) 
+mapreduce_pairwise(f::F, op::G, itr, init) where {F, G} = mapreduce_pairwise(f, op, itr, init, IteratorSize(itr))
 function mapreduce_pairwise(f, op, itr, init, S::Union{HasLength, HasShape})
     n = length(itr)
     n == 0 && return _mapreduce_start(f, op, itr, init)

From 3d5cfec7d3c4257cae2d23ae150da2faa49d78da Mon Sep 17 00:00:00 2001
From: Matt Bauman <mbauman@juliacomputing.com>
Date: Thu, 3 Jul 2025 12:37:40 -0400
Subject: [PATCH 55/55] move to 8x bins for commutative ops

---
 base/multidimensional.jl | 76 +++++++++++++++++++++++++---------------
 1 file changed, 48 insertions(+), 28 deletions(-)

diff --git a/base/multidimensional.jl b/base/multidimensional.jl
index 3b37d68c3793a..50e3d17a4ca68 100644
--- a/base/multidimensional.jl
+++ b/base/multidimensional.jl
@@ -1113,21 +1113,21 @@ function mapreduce_kernel_commutative(f, op, A, init, inds::AbstractArray)
     return _mapreduce_kernel_commutative(f, op, A, init, inds)
 end
 
-# This special internal method must have at least 4 indices and allows passing
+# This special internal method must have at least 8 indices and allows passing
 # optional scalar leading and trailing dimensions
 function _mapreduce_kernel_commutative(f, op, A, init, inds, leading=(), trailing=())
     i1, iN = firstindex(inds), lastindex(inds)
     n = length(inds)
-    @nexprs 4 N->a_N = @inbounds A[leading..., inds[i1+(N-1)], trailing...]
-    @nexprs 4 N->v_N = _mapreduce_start(f, op, A, init, a_N)
-    for batch in 1:(n>>2)-1
-        i = i1 + batch*4
-        @nexprs 4 N->a_N = @inbounds A[leading..., inds[i+(N-1)], trailing...]
-        @nexprs 4 N->fa_N = f(a_N)
-        @nexprs 4 N->v_N = op(v_N, fa_N)
-    end
-    v = op(op(v_1, v_2), op(v_3, v_4))
-    i = i1 + (n>>2)*4 - 1
+    @nexprs 8 N->a_N = @inbounds A[leading..., inds[i1+(N-1)], trailing...]
+    @nexprs 8 N->v_N = _mapreduce_start(f, op, A, init, a_N)
+    for batch in 1:(n>>3)-1
+        i = i1 + batch*8
+        @nexprs 8 N->a_N = @inbounds A[leading..., inds[i+(N-1)], trailing...]
+        @nexprs 8 N->fa_N = f(a_N)
+        @nexprs 8 N->v_N = op(v_N, fa_N)
+    end
+    v = op(op(op(v_1, v_2), op(v_3, v_4)), op(op(v_5, v_6), op(v_7, v_8)))
+    i = i1 + (n>>3)*8 - 1
     i == iN && return v
     for i in i+1:iN
         ai = @inbounds A[leading..., inds[i], trailing...]
@@ -1141,7 +1141,7 @@ function mapreduce_kernel_commutative(f::F, op::G, A, init, inds::CartesianIndic
     N == 1 && return mapreduce_kernel_commutative(f, op, A, init, inds.indices[1])
     is = inds.indices[1]
     js = inds.indices[2]
-    if length(is) == 1 && length(js) >= 4
+    if length(is) == 1 && length(js) >= 8
         # It's quite useful to optimize this case for dimensional reductions
         i = only(is)
         outer = CartesianIndices(tail(tail(inds.indices)))
@@ -1151,7 +1151,7 @@ function mapreduce_kernel_commutative(f::F, op::G, A, init, inds::CartesianIndic
             v = op(v, _mapreduce_kernel_commutative(f, op, A, init, js, (i,), o.I))
         end
         return v
-    elseif length(is) < 4 # TODO: tune this number
+    elseif length(is) < 8 # TODO: tune this number
         # These small cases could be further optimized
         return mapreduce_kernel_commutative(i->f(A[i]), op, inds, init, HasShape{N}(), length(inds))[1]
     else
@@ -1180,23 +1180,23 @@ function mapreduce_kernel_commutative(f, op, itr, init, ::Union{HasLength, HasSh
         end
         return v_1, s
     end
-    @nexprs 3 n->begin
+    @nexprs 7 n->begin
         it = iterate(itr, s)
         it === nothing && _throw_iterator_assertion_error()
         a, s = it
         v_{n+1} = _mapreduce_start(f, op, itr, init, a)
     end
-    i = 4
-    for outer i in 8:4:n
-        @nexprs 4 n->begin
+    i = 8
+    for outer i in 16:8:n
+        @nexprs 8 n->begin
             it = iterate(itr, s)
             it === nothing && _throw_iterator_assertion_error()
             a_n, s = it
         end
-        @nexprs 4 n-> fa_n = f(a_n)
-        @nexprs 4 n-> v_n = op(v_n, fa_n)
+        @nexprs 8 n-> fa_n = f(a_n)
+        @nexprs 8 n-> v_n = op(v_n, fa_n)
     end
-    v = op(op(v_1, v_2), op(v_3, v_4))
+    v = op(op(op(v_1, v_2), op(v_3, v_4)), op(op(v_5, v_6), op(v_7, v_8)))
     for _ in i+1:n
         it = iterate(itr, s)
         it === nothing && _throw_iterator_assertion_error()
@@ -1222,23 +1222,43 @@ function mapreduce_kernel_commutative(f, op, itr, init, ::IteratorSize, n, state
     it === nothing && return Some(op(op(v_1, v_2), v_3))
     a, s = it
     v_4 = _mapreduce_start(f, op, itr, init, a)
-    for _ in 2:n>>2
-        @nexprs 4 N->begin
+    it = iterate(itr, s)
+    it === nothing && return Some(op(op(v_1, v_2), op(v_3, v_4)))
+    a, s = it
+    v_5 = _mapreduce_start(f, op, itr, init, a)
+    it = iterate(itr, s)
+    it === nothing && return Some(op(op(op(v_1, v_2), op(v_3, v_4)), v_5))
+    a, s = it
+    v_6 = _mapreduce_start(f, op, itr, init, a)
+    it = iterate(itr, s)
+    it === nothing && return Some(op(op(op(v_1, v_2), op(v_3, v_4)), op(v_5, v_6)))
+    a, s = it
+    v_7 = _mapreduce_start(f, op, itr, init, a)
+    it = iterate(itr, s)
+    it === nothing && return Some(op(op(op(v_1, v_2), op(v_3, v_4)), op(op(v_5, v_6), v_7)))
+    a, s = it
+    v_8 = _mapreduce_start(f, op, itr, init, a)
+    for _ in 3:n>>3
+        @nexprs 8 N->begin
             it = iterate(itr, s)
             if it === nothing
+                N > 7 && (v_7 = op(v_7, f(a_7)))
+                N > 6 && (v_6 = op(v_6, f(a_6)))
+                N > 5 && (v_5 = op(v_5, f(a_5)))
+                N > 4 && (v_4 = op(v_4, f(a_4)))
                 N > 3 && (v_3 = op(v_3, f(a_3)))
                 N > 2 && (v_2 = op(v_2, f(a_2)))
                 N > 1 && (v_1 = op(v_1, f(a_1)))
-                return Some(op(op(v_1, v_2), op(v_3, v_4)))
+                return Some(op(op(op(v_1, v_2), op(v_3, v_4)), op(op(v_5, v_6), op(v_7, v_8))))
             end
             a_N, s = it
         end
-        @nexprs 4 N->fa_N = f(a_N)
-        @nexprs 4 N->v_N = op(v_N, fa_N)
+        @nexprs 8 N->fa_N = f(a_N)
+        @nexprs 8 N->v_N = op(v_N, fa_N)
     end
-    v = op(op(v_1, v_2), op(v_3, v_4))
-    i = (n>>2)*4
-    @nexprs 4 N->begin
+    v = op(op(op(v_1, v_2), op(v_3, v_4)), op(op(v_5, v_6), op(v_7, v_8)))
+    i = (n>>3)*8
+    @nexprs 8 N->begin
         it = iterate(itr, s)
         if it === nothing
             return Some(v)