Finish removing the BigInts from * for FD{Int128}!

NHDaly · NHDaly · commit 428d6f7212e2 · 2024-06-12T20:14:10.000-06:00
Finally implements the fast-multiplication optimization from #45, but this time for 128-bit FixedDecimals! :) This is a follow-up to #93, which introduced an Int256 type for widemul. However, after #93 the fldmod step of the multiplication still required 2 BigInt allocations. This PR replaces that step with a custom implementation of the LLVM div-by-const optimization for (U)Int256, which briefly widens to Int512 (😅) to perform the fldmod by the constant coefficient 10^f. This brings 128-bit FD multiply to the same performance as 64-bit. :)
diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl
@@ -36,9 +36,11 @@ export checked_abs, checked_add, checked_cld, checked_div, checked_fld,
 
 using Base: decompose, BitInteger
 
-import BitIntegers  # For 128-bit _widemul / _widen
+using BitIntegers: BitIntegers, UInt256, Int256
 import Parsers
 
+include("fldmod-by-const.jl")
+
 # floats that support fma and are roughly IEEE-like
 const FMAFloat = Union{Float16, Float32, Float64, BigFloat}
 
@@ -129,8 +131,10 @@ _widemul(x::Unsigned,y::Signed) = signed(_widen(x)) * _widen(y)
 
 # Custom widen implementation to avoid the cost of widening to BigInt.
 # FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt.
-_widen(::Type{Int128}) = BitIntegers.Int256
-_widen(::Type{UInt128}) = BitIntegers.UInt256
+_widen(::Type{Int128}) = Int256
+_widen(::Type{UInt128}) = UInt256
+_widen(::Type{Int256}) = BitIntegers.Int512
+_widen(::Type{UInt256}) = BitIntegers.UInt512
 _widen(t::Type) = widen(t)
 _widen(x::T) where {T} = (_widen(T))(x)
 
@@ -196,41 +200,12 @@ function _round_to_nearest(quotient::T,
 end
 _round_to_nearest(q, r, d, m=RoundNearest) = _round_to_nearest(promote(q, r, d)..., m)
 
-# In many of our calls to fldmod, `y` is a constant (the coefficient, 10^f). However, since
-# `fldmod` is sometimes not being inlined, that constant information is not available to the
-# optimizer. We need an inlined version of fldmod so that the compiler can replace expensive
-# divide-by-power-of-ten instructions with the cheaper multiply-by-inverse-coefficient.
-@inline fldmodinline(x,::Val{y}) where {y} = (fld(x,y), mod(x,y))
-
-# Note that LLVM *can* handle *div-by-const* for 256-bit integers, so we override the
-# implementation from BitInteger, which calls out to big().
-const BitInteger256 = Union{BitIntegers.Int256, BitIntegers.UInt256}
-@inline function fldmodinline(x::T, ::Val{y}) where {T<:BitInteger256, y}
-    (_fld(x,T(y)), _mod(x,T(y)))
-end
-@inline _fld(x, y) = _div(promote(x, y)..., RoundDown)
-@inline function _mod(x::T, y) where T<:Integer
-    y == -1 && return T(0)   # avoid potential overflow in fld
-    return x - _fld(x, y) * y
-end
-@inline _mod(x::T, y::T) where {T<:Unsigned} = _rem(x, y)
-@inline _rem(x::T, y::T) where {T<:Signed} = checked_srem_int(x, y)
-@inline _rem(x::T, y::T) where {T<:Unsigned} = checked_urem_int(x, y)
-
-# fld(x, y) == div(x, y) - ((x >= 0) != (y >= 0) && rem(x, y) != 0 ? 1 : 0)
-@inline _div(x::T, y::T, ::typeof(RoundDown)) where {T<:Unsigned} = Base.checked_udiv_int(x, y)
-@inline function _div(x::T, y::T, ::typeof(RoundDown)) where T<:Integer
-    d = Base.checked_sdiv_int(x, y)  # Explicitly call out
-    return d - (signbit(x ⊻ y) & (d * y != x))
-end
-
-
 # multiplication rounds to nearest even representation
 # TODO: can we use floating point to speed this up? after we build a
 # correctness test suite.
 function Base.:*(x::FD{T, f}, y::FD{T, f}) where {T, f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmodinline(_widemul(x.i, y.i), Val(powt))
+    quotient, remainder = fldmod_by_const(_widemul(x.i, y.i), Val(powt))
     reinterpret(FD{T, f}, _round_to_nearest(quotient, remainder, powt))
 end
 
@@ -257,12 +232,12 @@ function Base.round(x::FD{T, f},
                              RoundingMode{:NearestTiesUp},
                              RoundingMode{:NearestTiesAway}}=RoundNearest) where {T, f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmodinline(x.i, Val(powt))
+    quotient, remainder = fldmod_by_const(x.i, Val(powt))
     FD{T, f}(_round_to_nearest(quotient, remainder, powt, m))
 end
 function Base.ceil(x::FD{T, f}) where {T, f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmodinline(x.i, Val(powt))
+    quotient, remainder = fldmod_by_const(x.i, Val(powt))
     if remainder > 0
         FD{T, f}(quotient + one(quotient))
     else
@@ -458,7 +433,7 @@ function Base.checked_sub(x::T, y::T) where {T<:FD}
 end
 function Base.checked_mul(x::FD{T,f}, y::FD{T,f}) where {T<:Integer,f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmodinline(_widemul(x.i, y.i), Val(powt))
+    quotient, remainder = fldmod_by_const(_widemul(x.i, y.i), Val(powt))
     v = _round_to_nearest(quotient, remainder, powt)
     typemin(T) <= v <= typemax(T) || Base.Checked.throw_overflowerr_binaryop(:*, x, y)
     return reinterpret(FD{T, f}, T(v))
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -10,4 +10,9 @@ include(joinpath(pkg_path, "test", "utils.jl"))
 
 @testset "FixedPointDecimals" begin
     include("FixedDecimal.jl")
-end  # global testset
+end
+
+@testset "FixedPointDecimals" begin
+    include("fldmod-by-const_tests.jl")
+end
+