From a1c17112416949c0bd3af78f2edb45bfc7c6323d Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 12 Jun 2024 13:11:28 -0600
Subject: [PATCH 01/32] Use Int256 to avoid BigInt in FD operations.

We do not here explicitly introduce support for FD{BitIntegers.Int256},
though that should work out of the box both before and after this PR.

Rather, this PR _uses_ a (U)Int256 under the hood to prevent allocations
from Int128 widening to BigInt in FD operations.
---
 Project.toml              |  1 +
 src/FixedPointDecimals.jl | 32 +++++++++++++++++++++++++-------
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/Project.toml b/Project.toml
index 3f313f9..2301391 100644
--- a/Project.toml
+++ b/Project.toml
@@ -4,6 +4,7 @@ authors = ["Fengyang Wang <fengyang.wang.0@gmail.com>", "Curtis Vogt <curtis.vog
 version = "0.5.2"
 
 [deps]
+BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"
 Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
 
 [compat]
diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl
index 9b1a747..317699b 100644
--- a/src/FixedPointDecimals.jl
+++ b/src/FixedPointDecimals.jl
@@ -35,6 +35,8 @@ export checked_abs, checked_add, checked_cld, checked_div, checked_fld,
     checked_mod, checked_mul, checked_neg, checked_rem, checked_sub
 
 using Base: decompose, BitInteger
+
+import BitIntegers  # For 128-bit _widemul / _widen
 import Parsers
 
 # floats that support fma and are roughly IEEE-like
@@ -118,6 +120,22 @@ function __init__()
     return
 end
 
+# Custom widemul implementation to avoid the cost of widening to BigInt.
+# FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt.
+const BitInteger128 = Union{Int128, UInt128}
+_widemul(x, y) = widemul(x, y)
+_widemul(x::BitInteger128, y) = _widemul(promote(x, y)...)
+_widemul(x, y::BitInteger128) = _widemul(promote(x, y)...)
+_widemul(x::Int128, y::Int128) = BitIntegers.Int256(x) * BitIntegers.Int256(y)
+_widemul(x::UInt128, y::UInt128) = BitIntegers.UInt256(x) * BitIntegers.UInt256(y)
+
+# Custom widen implementation to avoid the cost of widening to BigInt.
+# FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt.
+_widen(::Type{Int128}) = BitIntegers.Int256
+_widen(::Type{UInt128}) = BitIntegers.UInt256
+_widen(t) = widen(t)
+
+
 (::Type{T})(x::Real) where {T <: FD} = convert(T, x)
 
 floattype(::Type{<:FD{T}}) where {T<:Union{Int8, UInt8, Int16, UInt16}} = Float32
@@ -188,7 +206,7 @@ _round_to_nearest(q, r, d, m=RoundNearest) = _round_to_nearest(promote(q, r, d).
 # correctness test suite.
 function Base.:*(x::FD{T, f}, y::FD{T, f}) where {T, f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmodinline(widemul(x.i, y.i), powt)
+    quotient, remainder = fldmodinline(_widemul(x.i, y.i), powt)
     reinterpret(FD{T, f}, _round_to_nearest(quotient, remainder, powt))
 end
 
@@ -416,7 +434,7 @@ function Base.checked_sub(x::T, y::T) where {T<:FD}
 end
 function Base.checked_mul(x::FD{T,f}, y::FD{T,f}) where {T<:Integer,f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmodinline(widemul(x.i, y.i), powt)
+    quotient, remainder = fldmodinline(_widemul(x.i, y.i), powt)
     v = _round_to_nearest(quotient, remainder, powt)
     typemin(T) <= v <= typemax(T) || Base.Checked.throw_overflowerr_binaryop(:*, x, y)
     return reinterpret(FD{T, f}, T(v))
@@ -474,7 +492,7 @@ checked_rdiv(x::FD, y::FD) = checked_rdiv(promote(x, y)...)
 
 function checked_rdiv(x::FD{T,f}, y::FD{T,f}) where {T<:Integer,f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmod(widemul(x.i, powt), y.i)
+    quotient, remainder = fldmod(_widemul(x.i, powt), y.i)
     v = _round_to_nearest(quotient, remainder, y.i)
     typemin(T) <= v <= typemax(T) || Base.Checked.throw_overflowerr_binaryop(:/, x, y)
     return reinterpret(FD{T, f}, v)
@@ -484,8 +502,8 @@ end
 # FixedDecimal.
 function checked_rdiv(x::Integer, y::FD{T, f}) where {T<:Integer, f}
     powt = coefficient(FD{T, f})
-    powtsq = widemul(powt, powt)
-    quotient, remainder = fldmod(widemul(x, powtsq), y.i)
+    powtsq = _widemul(powt, powt)
+    quotient, remainder = fldmod(_widemul(x, powtsq), y.i)
     v = _round_to_nearest(quotient, remainder, y.i)
     typemin(T) <= v <= typemax(T) || Base.Checked.throw_overflowerr_binaryop(:/, x, y)
     reinterpret(FD{T, f}, v)
@@ -722,7 +740,7 @@ NOTE: This function is expensive, since it contains a while-loop, but it is actu
       This function does not have or depend on any side-effects.
 """
 function max_exp10(::Type{T}) where {T <: Integer}
-    W = widen(T)
+    W = _widen(T)
     type_max = W(typemax(T))
 
     powt = one(W)
@@ -759,4 +777,4 @@ value(fd::FD) = fd.i
 # for generic hashing
 Base.decompose(fd::FD) = decompose(Rational(fd))
 
-end
+end  # module

From 77562387c986cc3b00f21cdc5bf5058abfb0f74a Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 12 Jun 2024 13:27:48 -0600
Subject: [PATCH 02/32] Further reduce BigInts by skipping a `rem()` in iseven

---
 src/FixedPointDecimals.jl | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl
index 317699b..ac9e5b5 100644
--- a/src/FixedPointDecimals.jl
+++ b/src/FixedPointDecimals.jl
@@ -175,7 +175,9 @@ function _round_to_nearest(quotient::T,
                            divisor::T,
                            ::RoundingMode{M}=RoundNearest) where {T <: Integer, M}
     halfdivisor = divisor >> 1
-    if iseven(divisor) && remainder == halfdivisor
+    # PERF Note: Only need the last bit to check iseven, and default iseven(Int256)
+    # allocates, so we truncate first.
+    if iseven((divisor % Int8)) && remainder == halfdivisor
         # `:NearestTiesAway` will tie away from zero, e.g. -8.5 ->
         # -9. `:NearestTiesUp` will always ties towards positive
         # infinity. `:Nearest` will tie towards the nearest even

From 78e45dc31960b960728dd6c1825132ee771e783f Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 12 Jun 2024 13:39:22 -0600
Subject: [PATCH 03/32] Fix ambiguity in _widemul(Int256, UInt256)

---
 src/FixedPointDecimals.jl | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl
index ac9e5b5..28b8611 100644
--- a/src/FixedPointDecimals.jl
+++ b/src/FixedPointDecimals.jl
@@ -123,17 +123,16 @@ end
 # Custom widemul implementation to avoid the cost of widening to BigInt.
 # FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt.
 const BitInteger128 = Union{Int128, UInt128}
-_widemul(x, y) = widemul(x, y)
-_widemul(x::BitInteger128, y) = _widemul(promote(x, y)...)
-_widemul(x, y::BitInteger128) = _widemul(promote(x, y)...)
-_widemul(x::Int128, y::Int128) = BitIntegers.Int256(x) * BitIntegers.Int256(y)
-_widemul(x::UInt128, y::UInt128) = BitIntegers.UInt256(x) * BitIntegers.UInt256(y)
+_widemul(x, y) = _widen(x) * _widen(y)
+_widemul(x::Signed,y::Unsigned) = _widen(x) * signed(_widen(y))
+_widemul(x::Unsigned,y::Signed) = signed(_widen(x)) * _widen(y)
 
 # Custom widen implementation to avoid the cost of widening to BigInt.
 # FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt.
 _widen(::Type{Int128}) = BitIntegers.Int256
 _widen(::Type{UInt128}) = BitIntegers.UInt256
-_widen(t) = widen(t)
+_widen(t::Type) = widen(t)
+_widen(x::T) where {T} = (_widen(T))(x)
 
 
 (::Type{T})(x::Real) where {T <: FD} = convert(T, x)

From 879c60229e436dbf5d942a522571600b067d5708 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 12 Jun 2024 13:59:47 -0600
Subject: [PATCH 04/32] Bump patch version number

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index 2301391..f3fcdaf 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "FixedPointDecimals"
 uuid = "fb4d412d-6eee-574d-9565-ede6634db7b0"
 authors = ["Fengyang Wang <fengyang.wang.0@gmail.com>", "Curtis Vogt <curtis.vogt@gmail.com>"]
-version = "0.5.2"
+version = "0.5.3"
 
 [deps]
 BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"

From a24565110dc16462cb2e7b2bd81adb161bd9ef20 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 12 Jun 2024 14:01:10 -0600
Subject: [PATCH 05/32] Add compat for BitIntegers

---
 Project.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Project.toml b/Project.toml
index f3fcdaf..f120b6c 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,6 +9,7 @@ Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
 
 [compat]
 Parsers = "2.7"
+BitIntegers = "0.3.1"
 julia = "1.6"
 
 [extras]

From 4bd8c45b4f57b8691c12dde3f6ddc2b4678cda66 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 12 Jun 2024 14:47:58 -0600
Subject: [PATCH 06/32] Finish removing the BigInts from * for FD{Int128}!
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Finally implements the fast-multiplication optimization from
https://github.com/JuliaMath/FixedPointDecimals.jl/pull/45, but this
time for 128-bit FixedDecimals! :)

This is a follow-up to
https://github.com/JuliaMath/FixedPointDecimals.jl/pull/93, which
introduces an Int256 type for widemul. However, the fldmod still
required 2 BigInt allocations.

Now, this PR uses a custom implementation of the LLVM div-by-const
optimization for (U)Int256, which briefly widens to Int512 (😅) to
perform the fldmod by the constant 10^f coefficient.

This brings 128-bit FD multiply to the same performance as 64-bit. :)
---
 src/FixedPointDecimals.jl     | 24 ++++-----
 src/fldmod-by-const.jl        | 96 +++++++++++++++++++++++++++++++++++
 test/fldmod-by-const_tests.jl | 58 +++++++++++++++++++++
 test/runtests.jl              |  7 ++-
 4 files changed, 171 insertions(+), 14 deletions(-)
 create mode 100644 src/fldmod-by-const.jl
 create mode 100644 test/fldmod-by-const_tests.jl

diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl
index 28b8611..df8f75a 100644
--- a/src/FixedPointDecimals.jl
+++ b/src/FixedPointDecimals.jl
@@ -36,9 +36,11 @@ export checked_abs, checked_add, checked_cld, checked_div, checked_fld,
 
 using Base: decompose, BitInteger
 
-import BitIntegers  # For 128-bit _widemul / _widen
+using BitIntegers: BitIntegers, UInt256, Int256
 import Parsers
 
+include("fldmod-by-const.jl")
+
 # floats that support fma and are roughly IEEE-like
 const FMAFloat = Union{Float16, Float32, Float64, BigFloat}
 
@@ -129,8 +131,10 @@ _widemul(x::Unsigned,y::Signed) = signed(_widen(x)) * _widen(y)
 
 # Custom widen implementation to avoid the cost of widening to BigInt.
 # FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt.
-_widen(::Type{Int128}) = BitIntegers.Int256
-_widen(::Type{UInt128}) = BitIntegers.UInt256
+_widen(::Type{Int128}) = Int256
+_widen(::Type{UInt128}) = UInt256
+_widen(::Type{Int256}) = BitIntegers.Int512
+_widen(::Type{UInt256}) = BitIntegers.UInt512
 _widen(t::Type) = widen(t)
 _widen(x::T) where {T} = (_widen(T))(x)
 
@@ -196,18 +200,12 @@ function _round_to_nearest(quotient::T,
 end
 _round_to_nearest(q, r, d, m=RoundNearest) = _round_to_nearest(promote(q, r, d)..., m)
 
-# In many of our calls to fldmod, `y` is a constant (the coefficient, 10^f). However, since
-# `fldmod` is sometimes not being inlined, that constant information is not available to the
-# optimizer. We need an inlined version of fldmod so that the compiler can replace expensive
-# divide-by-power-of-ten instructions with the cheaper multiply-by-inverse-coefficient.
-@inline fldmodinline(x,y) = (fld(x,y), mod(x,y))
-
 # multiplication rounds to nearest even representation
 # TODO: can we use floating point to speed this up? after we build a
 # correctness test suite.
 function Base.:*(x::FD{T, f}, y::FD{T, f}) where {T, f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmodinline(_widemul(x.i, y.i), powt)
+    quotient, remainder = fldmod_by_const(_widemul(x.i, y.i), powt)
     reinterpret(FD{T, f}, _round_to_nearest(quotient, remainder, powt))
 end
 
@@ -234,12 +232,12 @@ function Base.round(x::FD{T, f},
                              RoundingMode{:NearestTiesUp},
                              RoundingMode{:NearestTiesAway}}=RoundNearest) where {T, f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmodinline(x.i, powt)
+    quotient, remainder = fldmod_by_const(x.i, powt)
     FD{T, f}(_round_to_nearest(quotient, remainder, powt, m))
 end
 function Base.ceil(x::FD{T, f}) where {T, f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmodinline(x.i, powt)
+    quotient, remainder = fldmod_by_const(x.i, powt)
     if remainder > 0
         FD{T, f}(quotient + one(quotient))
     else
@@ -435,7 +433,7 @@ function Base.checked_sub(x::T, y::T) where {T<:FD}
 end
 function Base.checked_mul(x::FD{T,f}, y::FD{T,f}) where {T<:Integer,f}
     powt = coefficient(FD{T, f})
-    quotient, remainder = fldmodinline(_widemul(x.i, y.i), powt)
+    quotient, remainder = fldmod_by_const(_widemul(x.i, y.i), powt)
     v = _round_to_nearest(quotient, remainder, powt)
     typemin(T) <= v <= typemax(T) || Base.Checked.throw_overflowerr_binaryop(:*, x, y)
     return reinterpret(FD{T, f}, T(v))
diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
new file mode 100644
index 0000000..48ffcfb
--- /dev/null
+++ b/src/fldmod-by-const.jl
@@ -0,0 +1,96 @@
+
+const BitInteger256 = Union{UInt256, Int256}
+
+@inline function fldmod_by_const(x, y)
+    # For small-to-normal integers, LLVM can correctly optimize away the division, if it
+    # knows it's dividing by a const. We cannot call `Base.fldmod` since it's not
+    # inlined, so here we have explictly inlined it instead.
+    return (fld(x,y), mod(x,y))
+end
+@inline function fldmod_by_const(x::BitInteger256, y)
+    # For large or non-standard Int types, LLVM doesn't optimize
+    # well, so we use a custom implementation of fldmod.
+    d = fld_by_const(x, Val(y))
+    return d, manual_mod(promote(x, y, d)...)
+end
+
+# Calculate fld(x,y) when y is a Val constant.
+# The implementation for fld_by_const was lifted directly from Base.fld(x,y), except that
+# it uses `div_by_const` instead of `div`.
+fld_by_const(x::T, y::Val{C}) where {T<:UInt256, C} = div_by_const(x, y)
+function fld_by_const(x::T, y::Val{C}) where {T<:Int256, C}
+    d = div_by_const(x, y)
+    return d - (signbit(x ⊻ C) & (d * C != x))
+end
+
+# Calculate `mod(x,y)` after you've already acquired quotient, the result of `fld(x,y)`.
+# REQUIRES:
+#   - `y != -1`
+@inline function manual_mod(x::T, y::T, quotient::T) where T<:BitInteger256
+    return x - quotient * y
+end
+
+Base.@assume_effects :total function div_by_const(x::T, ::Val{C}) where {T, C}
+    # These checks will be compiled away during specialization.
+    # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these
+    # checks allow this function to work for any `C > 0`, in case that's useful in the
+    # future.
+    if C == 1
+        return x
+    elseif ispow2(C)
+        return div(x, C)  # Will already do the right thing
+    elseif C <= 0
+        throw(DomainError("C must be > 0"))
+    end
+    # Calculate the magic number 2^N/C. Note that this is computed statically, not at
+    # runtime.
+    inverse_coeff, toshift = calculate_inverse_coeff(T, C)
+    # Compute the upper-half of widemul(x, 2^nbits(T)/C).
+    # By keeping only the upper half, we're essentially dividing by 2^nbits(T), undoing the
+    # numerator of the multiplication, so that the result is equal to x/C.
+    out = mul_hi(x, inverse_coeff)
+    # This condition will be compiled away during specialization.
+    if T <: Signed
+        # Because our magic number has a leading one (since we shift all-the-way left), the
+        # result is negative if it's Signed. We add x to give us the positive equivalent.
+        out += x
+        signshift = (nbits(x) - 1)
+        isnegative = T(out >>> signshift)  # 1 if < 0 else 0 (Unsigned bitshift to read top bit)
+    end
+    # Undo the bitshifts used to calculate the invoeff magic number with maximum precision.
+    out = out >> toshift
+    if T <: Signed
+        out =  out + isnegative
+    end
+    return T(out)
+end
+
+Base.@assume_effects :foldable function calculate_inverse_coeff(::Type{T}, C) where {T}
+    # First, calculate 2^nbits(T)/C
+    # We shift away leading zeros to preserve the most precision when we use it to multiply
+    # in the next step. At the end, we will shift the final answer back to undo this
+    # operation (which is why we need to return `toshift`).
+    # Note, also, that we calculate invcoeff at double-precision so that the left-shift
+    # doesn't leave trailing zeros. We truncate to only the upper-half before returning.
+    UT = _unsigned(T)
+    invcoeff = typemax(_widen(UT)) ÷ C
+    toshift = leading_zeros(invcoeff)
+    invcoeff = invcoeff << toshift
+    # Now, truncate to only the upper half of invcoeff, after we've shifted. Instead of
+    # bitshifting, we round to maintain precision. (This is needed to prevent off-by-ones.)
+    # -- This is equivalent to `invcoeff = T(invcoeff >> sizeof(T))`, except rounded. --
+    invcoeff = _round_to_nearest(fldmod(invcoeff, typemax(UT))..., typemax(UT)) % T
+    return invcoeff, toshift
+end
+
+function mul_hi(x::T, y::T) where T
+    xy = _widemul(x, y)  # support Int256 -> Int512 (!!)
+    (xy >> nbits(T)) % T
+end
+
+# Annoyingly, Unsigned(T) isn't defined for BitIntegers types:
+# https://github.com/rfourquet/BitIntegers.jl/pull/2
+_unsigned(x) = unsigned(x)
+_unsigned(::Type{Int256}) = UInt256
+
+nbits(x) = sizeof(x) * 8
diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl
new file mode 100644
index 0000000..6b6a8d1
--- /dev/null
+++ b/test/fldmod-by-const_tests.jl
@@ -0,0 +1,58 @@
+using Test
+using FixedPointDecimals
+
+@testset "calculate_inverse_coeff signed" begin
+    using FixedPointDecimals: calculate_inverse_coeff
+
+    # The correct magic number here comes from investigating the following native code
+    # produced on an m2 aarch64 macbook pro:
+    # @code_native ((x)->fld(x,100))(2)
+    #   ...
+    #         mov     x8, #55051
+    #         movk    x8, #28835, lsl #16
+    #         movk    x8, #2621, lsl #32
+    #         movk    x8, #41943, lsl #48
+    # Where:
+    # julia> 55051 | 28835 << 16 | 2621 << 32 | 41943 << 48
+    # -6640827866535438581
+    @test calculate_inverse_coeff(Int64, 100) == (-6640827866535438581, 6)
+
+    # Same for the tests below:
+
+    # (LLVM's magic number is shifted one bit less, then they shift by 2, instead of 3,
+    #  but the result is the same.)
+    @test calculate_inverse_coeff(Int64, 10) == (7378697629483820647 << 1, 3)
+
+    @test calculate_inverse_coeff(Int64, 1) == (1, 0)
+end
+
+@testset "calculate_inverse_coeff signed 4" begin
+    using FixedPointDecimals: calculate_inverse_coeff
+
+    # Same here, our magic number is shifted 2 bits more than LLVM's
+    @test calculate_inverse_coeff(UInt64, 100) == (0xa3d70a3d70a3d70b, 6)
+
+    @test calculate_inverse_coeff(UInt64, 1) == (UInt64(0x1), 0)
+end
+
+@testset "div_by_const" begin
+    vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32]
+    for a_base in vals
+        # Only test negative numbers on `a`, since div_by_const requires b > 0.
+        @testset for (a, b, f) in Iterators.product((a_base, -a_base), vals, (unsigned, signed))
+            a, b = promote(f(a), f(b))
+            @test FixedPointDecimals.div_by_const(a, Val(b)) == a ÷ b
+        end
+    end
+end
+
+@testset "fldmod_by_const" begin
+    vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32]
+    for a_base in vals
+        # Only test negative numbers on `a`, since fldmod_by_const requires b > 0.
+        @testset for (a, b, f) in Iterators.product((a_base, -a_base), vals, (unsigned, signed))
+            a, b = promote(f(a), f(b))
+            @test FixedPointDecimals.fldmod_by_const(a, b) == fldmod(a, b)
+        end
+    end
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 81f9cea..62b2987 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -10,4 +10,9 @@ include(joinpath(pkg_path, "test", "utils.jl"))
 
 @testset "FixedPointDecimals" begin
     include("FixedDecimal.jl")
-end  # global testset
+end
+
+@testset "FixedPointDecimals" begin
+    include("fldmod-by-const_tests.jl")
+end
+

From 4e53f3d1d990f1b78abd79f599335bfdfe7a2bec Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 12 Jun 2024 20:37:13 -0600
Subject: [PATCH 07/32] Support older versions of julia

---
 src/FixedPointDecimals.jl | 13 ++++++++++++-
 src/fldmod-by-const.jl    |  2 +-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl
index df8f75a..c42f4ce 100644
--- a/src/FixedPointDecimals.jl
+++ b/src/FixedPointDecimals.jl
@@ -39,7 +39,18 @@ using Base: decompose, BitInteger
 using BitIntegers: BitIntegers, UInt256, Int256
 import Parsers
 
-include("fldmod-by-const.jl")
+# The effects system in newer versions of Julia is necessary for the fldmod_by_const
+# optimization to take affect without adverse performance impacts.
+# For older versions of julia, we will fall back to the flmod implementation, which works
+# very fast for all FixedDecimals of size <= 64 bits.
+if isdefined(Base, Symbol("@assume_effects"))
+    include("fldmod-by-const.jl")
+else
+    # We force-inline this, to allow the fldmod operation to be specialized for a constant
+    # second argument (converting divide-by-power-of-ten instructions with the cheaper
+    # multiply-by-inverse-coefficient.) Sadly this doesn't work for Int128.
+    @inline fldmod_by_const(x,y) = (fld(x,y), mod(x,y))
+end
 
 # floats that support fma and are roughly IEEE-like
 const FMAFloat = Union{Float16, Float32, Float64, BigFloat}
diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index 48ffcfb..e71dc20 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -30,7 +30,7 @@ end
     return x - quotient * y
 end
 
-Base.@assume_effects :total function div_by_const(x::T, ::Val{C}) where {T, C}
+function div_by_const(x::T, ::Val{C}) where {T, C}
     # These checks will be compiled away during specialization.
     # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these
     # checks allow this function to work for any `C > 0`, in case that's useful in the

From dfd41b1be3484356d67d21e3804ae033216af1d8 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 12 Jun 2024 20:42:43 -0600
Subject: [PATCH 08/32] Comments

---
 src/fldmod-by-const.jl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index e71dc20..26cc3ff 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -30,6 +30,8 @@ end
     return x - quotient * y
 end
 
+# This function is based on the native code produced by the following:
+# @code_native ((x)->div(x, 100))(Int64(2))
 function div_by_const(x::T, ::Val{C}) where {T, C}
     # These checks will be compiled away during specialization.
     # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these

From efee91bd2d53cf2ffbd71a28b7416525bc5c5da7 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 12 Jun 2024 21:50:06 -0600
Subject: [PATCH 09/32] Disable fldmod-by-const tests on older julia

---
 test/runtests.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/runtests.jl b/test/runtests.jl
index 62b2987..583bed7 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -12,7 +12,10 @@ include(joinpath(pkg_path, "test", "utils.jl"))
     include("FixedDecimal.jl")
 end
 
+
+if isdefined(Base, Symbol("@assume_effects"))
 @testset "FixedPointDecimals" begin
     include("fldmod-by-const_tests.jl")
 end
+end
 

From a03d754dc995f39df37734b6a229eb1e24e86edc Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 12 Jun 2024 22:00:37 -0600
Subject: [PATCH 10/32] Fix one other case of iseven allocating a BigInt

---
 src/FixedPointDecimals.jl | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl
index c42f4ce..0bb3133 100644
--- a/src/FixedPointDecimals.jl
+++ b/src/FixedPointDecimals.jl
@@ -191,7 +191,8 @@ function _round_to_nearest(quotient::T,
     halfdivisor = divisor >> 1
     # PERF Note: Only need the last bit to check iseven, and default iseven(Int256)
     # allocates, so we truncate first.
-    if iseven((divisor % Int8)) && remainder == halfdivisor
+    _iseven(x) = (x & 0x1) == 0
+    if _iseven(divisor) && remainder == halfdivisor
         # `:NearestTiesAway` will tie away from zero, e.g. -8.5 ->
         # -9. `:NearestTiesUp` will always ties towards positive
         # infinity. `:Nearest` will tie towards the nearest even
@@ -199,7 +200,7 @@ function _round_to_nearest(quotient::T,
         if M == :NearestTiesAway
             ifelse(quotient < zero(quotient), quotient, quotient + one(quotient))
         elseif M == :Nearest
-            ifelse(iseven(quotient), quotient, quotient + one(quotient))
+            ifelse(_iseven(quotient), quotient, quotient + one(quotient))
         else
             quotient + one(quotient)
         end

From 4ed8ebf6750bd3a5a91678ea06af8192c72139e8 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Thu, 13 Jun 2024 11:54:57 -0600
Subject: [PATCH 11/32] Apply this optimization to FD{Int64} as well.

---
 src/fldmod-by-const.jl | 26 +++++++++++++++++++++-----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index 26cc3ff..7c7ff81 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -1,5 +1,18 @@
+# NOTE: We apply this optimization to values of type (U)Int128 and (U)Int256, which covers
+# FixedDecimal{(U)Int64} and FixedDecimal{(U)Int128}.
+# Julia+LLVM have built-in optimizations that apply this already for FD{(U)Int64}, however
+# this customized implementation appears to produce still even faster code than LLVM can
+# produce on its own. So we apply this for both sizes.
+# Before:
+# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234))
+#     247.709 μs (0 allocations: 0 bytes)
+#   FixedDecimal{Int64,3}(4230510070790917.029)
+# After:
+# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234))
+#     106.125 μs (0 allocations: 0 bytes)
+#   FixedDecimal{Int64,3}(4230510070790917.029)
 
-const BitInteger256 = Union{UInt256, Int256}
+const BigBitIntegers = Union{UInt128, Int128, UInt256, Int256}
 
 @inline function fldmod_by_const(x, y)
     # For small-to-normal integers, LLVM can correctly optimize away the division, if it
@@ -7,7 +20,7 @@ const BitInteger256 = Union{UInt256, Int256}
     # inlined, so here we have explictly inlined it instead.
     return (fld(x,y), mod(x,y))
 end
-@inline function fldmod_by_const(x::BitInteger256, y)
+@inline function fldmod_by_const(x::BigBitIntegers, y)
     # For large or non-standard Int types, LLVM doesn't optimize
     # well, so we use a custom implementation of fldmod.
     d = fld_by_const(x, Val(y))
@@ -17,8 +30,8 @@ end
 # Calculate fld(x,y) when y is a Val constant.
 # The implementation for fld_by_const was lifted directly from Base.fld(x,y), except that
 # it uses `div_by_const` instead of `div`.
-fld_by_const(x::T, y::Val{C}) where {T<:UInt256, C} = div_by_const(x, y)
-function fld_by_const(x::T, y::Val{C}) where {T<:Int256, C}
+fld_by_const(x::T, y::Val{C}) where {T<:Unsigned, C} = div_by_const(x, y)
+function fld_by_const(x::T, y::Val{C}) where {T<:Signed, C}
     d = div_by_const(x, y)
     return d - (signbit(x ⊻ C) & (d * C != x))
 end
@@ -26,7 +39,7 @@ end
 # Calculate `mod(x,y)` after you've already acquired quotient, the result of `fld(x,y)`.
 # REQUIRES:
 #   - `y != -1`
-@inline function manual_mod(x::T, y::T, quotient::T) where T<:BitInteger256
+@inline function manual_mod(x::T, y::T, quotient::T) where T<:BigBitIntegers
     return x - quotient * y
 end
 
@@ -94,5 +107,8 @@ end
 # https://github.com/rfourquet/BitIntegers.jl/pull/2
 _unsigned(x) = unsigned(x)
 _unsigned(::Type{Int256}) = UInt256
+_unsigned(::Type{UInt256}) = UInt256
+_unsigned(::Type{Int128}) = UInt128
+_unsigned(::Type{UInt128}) = UInt128
 
 nbits(x) = sizeof(x) * 8

From 3f39b8aded7b150526eb58aabbda8862e2f5cf50 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Thu, 13 Jun 2024 13:46:27 -0600
Subject: [PATCH 12/32] Adjust to run for all integer types!

---
 src/fldmod-by-const.jl | 59 ++++++++++++++++++++++++++++++------------
 1 file changed, 42 insertions(+), 17 deletions(-)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index 7c7ff81..3366ccb 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -1,30 +1,55 @@
-# NOTE: We apply this optimization to values of type (U)Int128 and (U)Int256, which covers
-# FixedDecimal{(U)Int64} and FixedDecimal{(U)Int128}.
-# Julia+LLVM have built-in optimizations that apply this already for FD{(U)Int64}, however
-# this customized implementation appears to produce still even faster code than LLVM can
-# produce on its own. So we apply this for both sizes.
+# NOTE: Surprisingly, even though LLVM implements a version of this optimization on its own
+# for smaller integer sizes (<=64-bits), using the code in this file produces faster
+# multiplications for *all* types of integers. So we use our custom fldmod_by_const for all
+# bit integer types.
 # Before:
+# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234))
+#     84.959 μs (0 allocations: 0 bytes)
+#   FixedDecimal{Int32,3}(1700943.280)
+#
 # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234))
 #     247.709 μs (0 allocations: 0 bytes)
 #   FixedDecimal{Int64,3}(4230510070790917.029)
+#
+#julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234))
+#    4.077 ms (160798 allocations: 3.22 MiB)
+#  FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324)
+#
 # After:
+# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234))
+#     68.416 μs (0 allocations: 0 bytes)
+#   FixedDecimal{Int32,3}(1700943.280)
+#
 # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234))
 #     106.125 μs (0 allocations: 0 bytes)
 #   FixedDecimal{Int64,3}(4230510070790917.029)
+#
+# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234))
+#     204.125 μs (0 allocations: 0 bytes)
+#   FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324)
 
-const BigBitIntegers = Union{UInt128, Int128, UInt256, Int256}
+"""
+    ShouldUseCustomFldmodByConst(::Type{<:MyCustomIntType})) = true
+A trait to control opt-in for the custom `fldmod_by_const` implementation. To use this for a
+given integer type, you can define this overload for your integer type.
+You will also need to implement some parts of the interface below, including _widen().
+"""
+ShouldUseCustomFldmodByConst(::Type{<:Base.BitInteger}) = true
+ShouldUseCustomFldmodByConst(::Type{<:Union{Int256,Int128}}) = true
+ShouldUseCustomFldmodByConst(::Type) = false
 
 @inline function fldmod_by_const(x, y)
-    # For small-to-normal integers, LLVM can correctly optimize away the division, if it
-    # knows it's dividing by a const. We cannot call `Base.fldmod` since it's not
-    # inlined, so here we have explictly inlined it instead.
-    return (fld(x,y), mod(x,y))
-end
-@inline function fldmod_by_const(x::BigBitIntegers, y)
-    # For large or non-standard Int types, LLVM doesn't optimize
-    # well, so we use a custom implementation of fldmod.
-    d = fld_by_const(x, Val(y))
-    return d, manual_mod(promote(x, y, d)...)
+    if ShouldUseCustomFldmodByConst(typeof(x))
+        # For large Int types, LLVM doesn't optimize well, so we use a custom implementation
+        # of fldmod, which extends that optimization to those larger integer types.
+        d = fld_by_const(x, Val(y))
+        return d, manual_mod(promote(x, y, d)...)
+    else
+        # For other integers, LLVM might be able to correctly optimize away the division, if
+        # it knows it's dividing by a const. We cannot call `Base.fldmod` since it's not
+        # inlined, so here we have explictly inlined it instead.
+        return (fld(x,y), mod(x,y))
+    end
 end
 
 # Calculate fld(x,y) when y is a Val constant.
@@ -39,7 +64,7 @@ end
 # Calculate `mod(x,y)` after you've already acquired quotient, the result of `fld(x,y)`.
 # REQUIRES:
 #   - `y != -1`
-@inline function manual_mod(x::T, y::T, quotient::T) where T<:BigBitIntegers
+@inline function manual_mod(x::T, y::T, quotient::T) where T<:Integer
     return x - quotient * y
 end
 

From 20c66f2b4946f2062f3a14aa225cabbdced53759 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Thu, 13 Jun 2024 13:55:47 -0600
Subject: [PATCH 13/32] Clarify the `_unsigned(x)` methods with comments

---
 src/fldmod-by-const.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index 3366ccb..cc15554 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -35,7 +35,7 @@ given integer type, you can define this overload for your integer type.
 You will also need to implement some parts of the interface below, including _widen().
 """
 ShouldUseCustomFldmodByConst(::Type{<:Base.BitInteger}) = true
-ShouldUseCustomFldmodByConst(::Type{<:Union{Int256,Int128}}) = true
+ShouldUseCustomFldmodByConst(::Type{<:Union{Int256,UInt256}}) = true
 ShouldUseCustomFldmodByConst(::Type) = false
 
 @inline function fldmod_by_const(x, y)
@@ -130,10 +130,12 @@ end
 
 # Annoyingly, Unsigned(T) isn't defined for BitIntegers types:
 # https://github.com/rfourquet/BitIntegers.jl/pull/2
+# Note: We do not need this for Int512, since we only widen to 512 _after_ calling
+# _unsigned, above. This code is only for supporting the built-in integer types, which only
+# go up to 128-bits (widened twice to 512). If a user wants to extend FixedDecimals for
+# other integer types, they will need to add methods to either _unsigned or unsigned.
 _unsigned(x) = unsigned(x)
 _unsigned(::Type{Int256}) = UInt256
 _unsigned(::Type{UInt256}) = UInt256
-_unsigned(::Type{Int128}) = UInt128
-_unsigned(::Type{UInt128}) = UInt128
 
 nbits(x) = sizeof(x) * 8

From f2958baf17afc877b22f2e5d038dcfaa39c4d784 Mon Sep 17 00:00:00 2001
From: Nathan Daly <NHDaly@gmail.com>
Date: Thu, 13 Jun 2024 13:57:27 -0600
Subject: [PATCH 14/32] Apply suggestions from code review

Co-authored-by: Curtis Vogt <curtis.vogt@gmail.com>
---
 src/FixedPointDecimals.jl | 2 +-
 test/runtests.jl          | 8 +-------
 2 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl
index 0bb3133..d36f9fa 100644
--- a/src/FixedPointDecimals.jl
+++ b/src/FixedPointDecimals.jl
@@ -36,7 +36,7 @@ export checked_abs, checked_add, checked_cld, checked_div, checked_fld,
 
 using Base: decompose, BitInteger
 
-using BitIntegers: BitIntegers, UInt256, Int256
+using BitIntegers: BitIntegers, Int256, Int512, UInt256, UInt512
 import Parsers
 
 # The effects system in newer versions of Julia is necessary for the fldmod_by_const
diff --git a/test/runtests.jl b/test/runtests.jl
index 583bed7..b5fadb6 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -10,12 +10,6 @@ include(joinpath(pkg_path, "test", "utils.jl"))
 
 @testset "FixedPointDecimals" begin
     include("FixedDecimal.jl")
-end
-
-
-if isdefined(Base, Symbol("@assume_effects"))
-@testset "FixedPointDecimals" begin
-    include("fldmod-by-const_tests.jl")
-end
+    isdefined(Base, Symbol("@assume_effects")) && include("fldmod-by-const_tests.jl")
 end
 

From 0019bb0af65fd626fc27904d6c4ab162456612d0 Mon Sep 17 00:00:00 2001
From: Nathan Daly <NHDaly@gmail.com>
Date: Sun, 16 Jun 2024 21:49:28 -0600
Subject: [PATCH 15/32] Update src/FixedPointDecimals.jl

Co-authored-by: Curtis Vogt <curtis.vogt@gmail.com>
---
 src/FixedPointDecimals.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl
index d36f9fa..ebe0702 100644
--- a/src/FixedPointDecimals.jl
+++ b/src/FixedPointDecimals.jl
@@ -144,8 +144,8 @@ _widemul(x::Unsigned,y::Signed) = signed(_widen(x)) * _widen(y)
 # FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt.
 _widen(::Type{Int128}) = Int256
 _widen(::Type{UInt128}) = UInt256
-_widen(::Type{Int256}) = BitIntegers.Int512
-_widen(::Type{UInt256}) = BitIntegers.UInt512
+_widen(::Type{Int256}) = Int512
+_widen(::Type{UInt256}) = UInt512
 _widen(t::Type) = widen(t)
 _widen(x::T) where {T} = (_widen(T))(x)
 

From 4a53703706b5d4012284af63e3ff75cf7ad8de60 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 19 Jun 2024 11:50:49 -0600
Subject: [PATCH 16/32] Add extensive tests for multiplication correctness, to
 cover the new change

Tests all FD{(U)Int16} values.
Tests most corner cases for FD{(U)Int128} values.
---
 test/fldmod-by-const_tests.jl | 65 +++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl
index 6b6a8d1..c4f2bf5 100644
--- a/test/fldmod-by-const_tests.jl
+++ b/test/fldmod-by-const_tests.jl
@@ -56,3 +56,68 @@ end
         end
     end
 end
+
+@testset "fixed decimal multiplication - exhaustive 16-bit" begin
+    @testset for P in (0,1,2,3,4)
+        @testset for T in (Int16, UInt16)
+            FD = FixedDecimal{T,P}
+
+            function test_multiplies_correctly(fd, x)
+                big = FixedDecimal{BigInt, P}(fd)
+                big_mul = big * x
+                # This might overflow: ...
+                mul = fd * x
+                # ... so we truncate big to the same size
+                @test big_mul.i % T == mul.i % T
+            end
+            @testset for v in typemin(FD) : eps(FD) : typemax(FD)
+                test_multiplies_correctly(v, typemin(T))
+                test_multiplies_correctly(v, -1)
+                test_multiplies_correctly(v, -eps(FD))
+                test_multiplies_correctly(v, 0)
+                test_multiplies_correctly(v, eps(FD))
+                test_multiplies_correctly(v, 1)
+                test_multiplies_correctly(v, 2)
+                test_multiplies_correctly(v, 3)
+                test_multiplies_correctly(v, typemax(T))
+            end
+        end
+    end
+end
+
+@testset "fixed decimal multiplication - 128-bit" begin
+    @testset for P in 0:37
+        @testset for T in (Int128, UInt128)
+            FD = FixedDecimal{T,P}
+
+            function test_multiplies_correctly(fd, x)
+                big = FixedDecimal{BigInt, P}(fd)
+                big_mul = big * x
+                # This might overflow: ...
+                mul = fd * x
+                # ... so we truncate big to the same size
+                @test big_mul.i % T == mul.i % T
+            end
+            vals = FD[
+                typemin(FD), typemax(FD),
+                typemin(FD) + eps(FD), typemax(FD) - eps(FD),
+                0.0, eps(FD), 0.1, 1.0, 2.0,
+                typemax(FD) ÷ 2,
+            ]
+            if T <: Signed
+                append!(vals, vals.*-1)
+            end
+            @testset for v in vals
+                test_multiplies_correctly(v, typemin(T))
+                test_multiplies_correctly(v, -1)
+                test_multiplies_correctly(v, -eps(FD))
+                test_multiplies_correctly(v, 0)
+                test_multiplies_correctly(v, eps(FD))
+                test_multiplies_correctly(v, 1)
+                test_multiplies_correctly(v, 2)
+                test_multiplies_correctly(v, 3)
+                test_multiplies_correctly(v, typemax(T))
+            end
+        end
+    end
+end

From 27a3e0f320eb1ae55b55c2ae2570db5ba8d5bead Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 19 Jun 2024 11:59:18 -0600
Subject: [PATCH 17/32] Named testsets to make failures easier to identify

---
 test/fldmod-by-const_tests.jl | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl
index c4f2bf5..dfcee19 100644
--- a/test/fldmod-by-const_tests.jl
+++ b/test/fldmod-by-const_tests.jl
@@ -67,8 +67,10 @@ end
                 big_mul = big * x
                 # This might overflow: ...
                 mul = fd * x
-                # ... so we truncate big to the same size
-                @test big_mul.i % T == mul.i % T
+                @testset "$fd * $x" begin
+                    # ... so we truncate big to the same size
+                    @test big_mul.i % T == mul.i % T
+                end
             end
             @testset for v in typemin(FD) : eps(FD) : typemax(FD)
                 test_multiplies_correctly(v, typemin(T))
@@ -95,8 +97,10 @@ end
                 big_mul = big * x
                 # This might overflow: ...
                 mul = fd * x
-                # ... so we truncate big to the same size
-                @test big_mul.i % T == mul.i % T
+                @testset "$fd * $x" begin
+                    # ... so we truncate big to the same size
+                    @test big_mul.i % T == mul.i % T
+                end
             end
             vals = FD[
                 typemin(FD), typemax(FD),

From e4cb73b3dca4b6254b0c49cb44aaf1c4c863a4e3 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 19 Jun 2024 13:09:57 -0600
Subject: [PATCH 18/32] Fix off-by-one error in rounding truncation in
 calculate_inverse_coeff()

We must use the precise value of 2^nbits(T) in order to get the correct
division in all cases.

....... UGH except now the Int64 tests aren't passing.
---
 src/fldmod-by-const.jl        |  3 ++-
 test/fldmod-by-const_tests.jl | 24 ++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index cc15554..36f0885 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -119,7 +119,8 @@ Base.@assume_effects :foldable function calculate_inverse_coeff(::Type{T}, C) wh
     # Now, truncate to only the upper half of invcoeff, after we've shifted. Instead of
     # bitshifting, we round to maintain precision. (This is needed to prevent off-by-ones.)
     # -- This is equivalent to `invcoeff = T(invcoeff >> sizeof(T))`, except rounded. --
-    invcoeff = _round_to_nearest(fldmod(invcoeff, typemax(UT))..., typemax(UT)) % T
+    two_to_N = _widen(typemax(UT)) + UT(1) # Precise value for 2^nbits(T) (doesn't fit in T)
+    invcoeff = _round_to_nearest(fldmod(invcoeff, two_to_N)..., two_to_N ) % T
     return invcoeff, toshift
 end
 
diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl
index dfcee19..9f1db82 100644
--- a/test/fldmod-by-const_tests.jl
+++ b/test/fldmod-by-const_tests.jl
@@ -57,6 +57,30 @@ end
     end
 end
 
+@testset "fixed decimal multiplication - exhaustive 8-bit" begin
+    @testset for P in (0,1)
+        @testset for T in (Int8, UInt8)
+            FD = FixedDecimal{T,P}
+
+            function test_multiplies_correctly(fd, x)
+                big = FixedDecimal{BigInt, P}(fd)
+                big_mul = big * x
+                # This might overflow: ...
+                mul = fd * x
+                @testset "$fd * $x" begin
+                    # ... so we truncate big to the same size
+                    @test big_mul.i % T == mul.i % T
+                end
+            end
+            @testset for v in typemin(FD) : eps(FD) : typemax(FD)
+                @testset for v2 in typemin(FD) : eps(FD) : typemax(FD)
+                    test_multiplies_correctly(v, v2)
+                end
+            end
+        end
+    end
+end
+
 @testset "fixed decimal multiplication - exhaustive 16-bit" begin
     @testset for P in (0,1,2,3,4)
         @testset for T in (Int16, UInt16)

From 73f6547af5191737008ee8ba2eda591f6eab365d Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 19 Jun 2024 14:46:23 -0600
Subject: [PATCH 19/32] Add some comments and requires

---
 src/fldmod-by-const.jl        | 7 +++++++
 test/fldmod-by-const_tests.jl | 4 ----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index 36f0885..d96b96c 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -105,6 +105,13 @@ function div_by_const(x::T, ::Val{C}) where {T, C}
     return T(out)
 end
 
+
+
+
+# REQUIRES: C must not be 0, 1, -1
+# See this implementation in LLVM:
+# https://llvm.org/doxygen/DivisionByConstantInfo_8cpp_source.html
+# which was taken from "Hacker's Delight".
 Base.@assume_effects :foldable function calculate_inverse_coeff(::Type{T}, C) where {T}
     # First, calculate 2^nbits(T)/C
     # We shift away leading zeros to preserve the most precision when we use it to multiply
diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl
index 9f1db82..5e248d2 100644
--- a/test/fldmod-by-const_tests.jl
+++ b/test/fldmod-by-const_tests.jl
@@ -22,8 +22,6 @@ using FixedPointDecimals
     # (LLVM's magic number is shifted one bit less, then they shift by 2, instead of 3,
     #  but the result is the same.)
     @test calculate_inverse_coeff(Int64, 10) == (7378697629483820647 << 1, 3)
-
-    @test calculate_inverse_coeff(Int64, 1) == (1, 0)
 end
 
 @testset "calculate_inverse_coeff signed 4" begin
@@ -31,8 +29,6 @@ end
 
     # Same here, our magic number is shifted 2 bits more than LLVM's
     @test calculate_inverse_coeff(UInt64, 100) == (0xa3d70a3d70a3d70b, 6)
-
-    @test calculate_inverse_coeff(UInt64, 1) == (UInt64(0x1), 0)
 end
 
 @testset "div_by_const" begin

From 4e9fdd6f9193b7c214f8db01072d773c5c2c30a6 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 19 Jun 2024 15:43:32 -0600
Subject: [PATCH 20/32] Copy/pasted definition for unsigned numbers straight
 from the book

---
 src/fldmod-by-const.jl        | 41 +++++++++++++++++++++++++++++++++++
 test/fldmod-by-const_tests.jl |  2 +-
 2 files changed, 42 insertions(+), 1 deletion(-)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index d96b96c..b69054e 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -136,6 +136,47 @@ function mul_hi(x::T, y::T) where T
     (xy >> nbits(T)) % T
 end
 
+
+
+# Unsigned magic number computation + divide by constant
+# Hacker's delight figure 10–4
+Base.@assume_effects :foldable function magicgu(nmax, d)
+    T = typeof(nmax)
+    nc = (nmax ÷ d) * d - 1
+    N = T(floor(log2(nmax) + 1))
+    for p in 0:2N
+        if 2^p > nc * (d - 1 - (2^p - 1) % d)
+            m = (2^p + d - 1 - (2^p - 1) % d) ÷ d
+            return (m%T, p)
+        end
+    end
+    error("Can't find p, something is wrong.")
+end
+function div_by_const(x::T, ::Val{C}) where {T<:Unsigned, C}
+    # These checks will be compiled away during specialization.
+    # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these
+    # checks allow this function to work for any `C > 0`, in case that's useful in the
+    # future.
+    if C == 1
+        return x
+    elseif ispow2(C)
+        return div(x, C)  # Will already do the right thing
+    elseif C <= 0
+        throw(DomainError("C must be > 0"))
+    end
+    # Calculate the magic number 2^N/C. Note that this is computed statically, not at
+    # runtime.
+    inverse_coeff, toshift = magicgu(typemax(T), C)
+    out = _widemul(x, inverse_coeff)  # support Int256 -> Int512 (!!)
+    out = out >> toshift
+    return out % T
+end
+
+
+
+
+
+
 # Annoyingly, Unsigned(T) isn't defined for BitIntegers types:
 # https://github.com/rfourquet/BitIntegers.jl/pull/2
 # Note: We do not need this for Int512, since we only widen to 512 _after_ calling
diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl
index 5e248d2..0adaa35 100644
--- a/test/fldmod-by-const_tests.jl
+++ b/test/fldmod-by-const_tests.jl
@@ -53,7 +53,7 @@ end
     end
 end
 
-@testset "fixed decimal multiplication - exhaustive 8-bit" begin
+@testitem "fixed decimal multiplication - exhaustive 8-bit" begin
     @testset for P in (0,1)
         @testset for T in (Int8, UInt8)
             FD = FixedDecimal{T,P}

From 188933d1c8c5ab07ffb118304b4beb6f2741ff57 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Wed, 19 Jun 2024 18:55:31 -0600
Subject: [PATCH 21/32] Have `magicgu` support arbitrary integer sizes

---
 src/fldmod-by-const.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index b69054e..1562a4a 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -144,13 +144,15 @@ Base.@assume_effects :foldable function magicgu(nmax, d)
     T = typeof(nmax)
     nc = (nmax ÷ d) * d - 1
     N = T(floor(log2(nmax) + 1))
+    MathType = _widen(T)
+    two = MathType(2)
     for p in 0:2N
-        if 2^p > nc * (d - 1 - (2^p - 1) % d)
-            m = (2^p + d - 1 - (2^p - 1) % d) ÷ d
+        if two^p > nc * (d - 1 - (2^p - 1) % d)
+            m = (two^p + d - 1 - (2^p - 1) % d) ÷ d
             return (m%T, p)
         end
     end
-    error("Can't find p, something is wrong.")
+    error("Can't find p, something is wrong. nmax: $(repr(nmax)), d: $(repr(d)), N: $N")
 end
 function div_by_const(x::T, ::Val{C}) where {T<:Unsigned, C}
     # These checks will be compiled away during specialization.

From f6d375c0231747ff9b044b7494d43904c2f786ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?=
 <tomas.drvostep@gmail.com>
Date: Fri, 21 Jun 2024 17:21:52 +0200
Subject: [PATCH 22/32] Use the formulas from Hacker's delight for both signed
 and unsigned Ints

---
 src/fldmod-by-const.jl        | 142 +++++++++-------------------------
 test/fldmod-by-const_tests.jl |  32 +-------
 2 files changed, 39 insertions(+), 135 deletions(-)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index 1562a4a..6c62ce3 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -68,93 +68,44 @@ end
     return x - quotient * y
 end
 
-# This function is based on the native code produced by the following:
-# @code_native ((x)->div(x, 100))(Int64(2))
-function div_by_const(x::T, ::Val{C}) where {T, C}
-    # These checks will be compiled away during specialization.
-    # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these
-    # checks allow this function to work for any `C > 0`, in case that's useful in the
-    # future.
-    if C == 1
-        return x
-    elseif ispow2(C)
-        return div(x, C)  # Will already do the right thing
-    elseif C <= 0
-        throw(DomainError("C must be > 0"))
-    end
-    # Calculate the magic number 2^N/C. Note that this is computed statically, not at
-    # runtime.
-    inverse_coeff, toshift = calculate_inverse_coeff(T, C)
-    # Compute the upper-half of widemul(x, 2^nbits(T)/C).
-    # By keeping only the upper half, we're essentially dividing by 2^nbits(T), undoing the
-    # numerator of the multiplication, so that the result is equal to x/C.
-    out = mul_hi(x, inverse_coeff)
-    # This condition will be compiled away during specialization.
-    if T <: Signed
-        # Because our magic number has a leading one (since we shift all-the-way left), the
-        # result is negative if it's Signed. We add x to give us the positive equivalent.
-        out += x
-        signshift = (nbits(x) - 1)
-        isnegative = T(out >>> signshift)  # 1 if < 0 else 0 (Unsigned bitshift to read top bit)
-    end
-    # Undo the bitshifts used to calculate the invoeff magic number with maximum precision.
-    out = out >> toshift
-    if T <: Signed
-        out =  out + isnegative
+# Unsigned magic number computation + shift by constant
+# See Hacker's delight, equations (26) and (27) from Chapter 10-9.
+# nmax and divisor must be > 0.
+Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor)
+    T = typeof(nmax)
+    W = _widen(T)
+    d = W(divisor)
+
+    nc = div(W(nmax) + W(1), d) * d - W(1)
+    nbits = sizeof(nmax) * 8 - leading_zeros(nmax) # most significant bit
+    for p in 0:2nbits
+        if W(2)^p > nc * (d - W(1) - rem(W(2)^p - W(1), d))       # (27)
+            m = div(W(2)^p + d - W(1) - rem(W(2)^p - W(1), d), d) # (26)
+            return (m, p)
+        end
     end
-    return T(out)
+    return W(0), 0 # Should never reach here
 end
 
-
-
-
-# REQUIRES: C must not be 0, 1, -1
-# See this implementation in LLVM:
-# https://llvm.org/doxygen/DivisionByConstantInfo_8cpp_source.html
-# which was taken from "Hacker's Delight".
-Base.@assume_effects :foldable function calculate_inverse_coeff(::Type{T}, C) where {T}
-    # First, calculate 2^nbits(T)/C
-    # We shift away leading zeros to preserve the most precision when we use it to multiply
-    # in the next step. At the end, we will shift the final answer back to undo this
-    # operation (which is why we need to return `toshift`).
-    # Note, also, that we calculate invcoeff at double-precision so that the left-shift
-    # doesn't leave trailing zeros. We truncate to only the upper-half before returning.
-    UT = _unsigned(T)
-    invcoeff = typemax(_widen(UT)) ÷ C
-    toshift = leading_zeros(invcoeff)
-    invcoeff = invcoeff << toshift
-    # Now, truncate to only the upper half of invcoeff, after we've shifted. Instead of
-    # bitshifting, we round to maintain precision. (This is needed to prevent off-by-ones.)
-    # -- This is equivalent to `invcoeff = T(invcoeff >> sizeof(T))`, except rounded. --
-    two_to_N = _widen(typemax(UT)) + UT(1) # Precise value for 2^nbits(T) (doesn't fit in T)
-    invcoeff = _round_to_nearest(fldmod(invcoeff, two_to_N)..., two_to_N ) % T
-    return invcoeff, toshift
-end
-
-function mul_hi(x::T, y::T) where T
-    xy = _widemul(x, y)  # support Int256 -> Int512 (!!)
-    (xy >> nbits(T)) % T
-end
-
-
-
-# Unsigned magic number computation + divide by constant
-# Hacker's delight figure 10–4
-Base.@assume_effects :foldable function magicgu(nmax, d)
+# See Hacker's delight, equations (5) and (6) from Chapter 10-4.
+# nmax and divisor must be > 0.
+Base.@assume_effects :foldable function magicg(nmax::Signed, divisor)
     T = typeof(nmax)
-    nc = (nmax ÷ d) * d - 1
-    N = T(floor(log2(nmax) + 1))
-    MathType = _widen(T)
-    two = MathType(2)
-    for p in 0:2N
-        if two^p > nc * (d - 1 - (2^p - 1) % d)
-            m = (two^p + d - 1 - (2^p - 1) % d) ÷ d
-            return (m%T, p)
+    W = _widen(T)
+    d = W(divisor)
+
+    nc = ((W(nmax) + W(1)) ÷ d) * d - W(1)
+    nbits = sizeof(nmax) * 8 - leading_zeros(nmax) # most significant bit
+    for p in 0:2nbits
+        if W(2)^p > nc * (d - rem(W(2)^p, d))       # (6)
+            m = div(W(2)^p + d - rem(W(2)^p, d), d) # (5)
+            return (m, p)
         end
     end
-    error("Can't find p, something is wrong. nmax: $(repr(nmax)), d: $(repr(d)), N: $N")
+    return W(0), 0 # Should never reach here
 end
-function div_by_const(x::T, ::Val{C}) where {T<:Unsigned, C}
+
+function div_by_const(x::T, ::Val{C}) where {T, C}
     # These checks will be compiled away during specialization.
     # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these
     # checks allow this function to work for any `C > 0`, in case that's useful in the
@@ -162,31 +113,14 @@ function div_by_const(x::T, ::Val{C}) where {T<:Unsigned, C}
     if C == 1
         return x
     elseif ispow2(C)
-        return div(x, C)  # Will already do the right thing
+        return div(x, C) # Will already do the right thing
     elseif C <= 0
         throw(DomainError("C must be > 0"))
     end
-    # Calculate the magic number 2^N/C. Note that this is computed statically, not at
-    # runtime.
-    inverse_coeff, toshift = magicgu(typemax(T), C)
-    out = _widemul(x, inverse_coeff)  # support Int256 -> Int512 (!!)
-    out = out >> toshift
-    return out % T
-end
-
-
-
+    # Calculate the magic number and shift amount, based on Hacker's Delight, Chapter 10.
+    magic_number, shift = magicg(typemax(T), C)
 
-
-
-# Annoyingly, Unsigned(T) isn't defined for BitIntegers types:
-# https://github.com/rfourquet/BitIntegers.jl/pull/2
-# Note: We do not need this for Int512, since we only widen to 512 _after_ calling
-# _unsigned, above. This code is only for supporting the built-in integer types, which only
-# go up to 128-bits (widened twice to 512). If a user wants to extend FixedDecimals for
-# other integer types, they will need to add methods to either _unsigned or unsigned.
-_unsigned(x) = unsigned(x)
-_unsigned(::Type{Int256}) = UInt256
-_unsigned(::Type{UInt256}) = UInt256
-
-nbits(x) = sizeof(x) * 8
+    out = _widemul(promote(x, magic_number)...)
+    out >>= shift
+    return (out % T) + (x < zero(T)) # TODO: Figure out why and if the + 1 if non-positive is necessary
+end
diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl
index 0adaa35..95dfce9 100644
--- a/test/fldmod-by-const_tests.jl
+++ b/test/fldmod-by-const_tests.jl
@@ -1,36 +1,6 @@
 using Test
 using FixedPointDecimals
 
-@testset "calculate_inverse_coeff signed" begin
-    using FixedPointDecimals: calculate_inverse_coeff
-
-    # The correct magic number here comes from investigating the following native code
-    # produced on an m2 aarch64 macbook pro:
-    # @code_native ((x)->fld(x,100))(2)
-    #   ...
-    #         mov     x8, #55051
-    #         movk    x8, #28835, lsl #16
-    #         movk    x8, #2621, lsl #32
-    #         movk    x8, #41943, lsl #48
-    # Where:
-    # julia> 55051 | 28835 << 16 | 2621 << 32 | 41943 << 48
-    # -6640827866535438581
-    @test calculate_inverse_coeff(Int64, 100) == (-6640827866535438581, 6)
-
-    # Same for the tests below:
-
-    # (LLVM's magic number is shifted one bit less, then they shift by 2, instead of 3,
-    #  but the result is the same.)
-    @test calculate_inverse_coeff(Int64, 10) == (7378697629483820647 << 1, 3)
-end
-
-@testset "calculate_inverse_coeff signed 4" begin
-    using FixedPointDecimals: calculate_inverse_coeff
-
-    # Same here, our magic number is shifted 2 bits more than LLVM's
-    @test calculate_inverse_coeff(UInt64, 100) == (0xa3d70a3d70a3d70b, 6)
-end
-
 @testset "div_by_const" begin
     vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32]
     for a_base in vals
@@ -53,7 +23,7 @@ end
     end
 end
 
-@testitem "fixed decimal multiplication - exhaustive 8-bit" begin
+@testset "fixed decimal multiplication - exhaustive 8-bit" begin
     @testset for P in (0,1)
         @testset for T in (Int8, UInt8)
             FD = FixedDecimal{T,P}

From b79c8737710df1a021a8749343d3e3af172e68a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?=
 <tomas.drvostep@gmail.com>
Date: Tue, 25 Jun 2024 19:41:00 +0200
Subject: [PATCH 23/32] .

---
 src/fldmod-by-const.jl | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index 6c62ce3..e141adc 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -70,15 +70,16 @@ end
 
 # Unsigned magic number computation + shift by constant
 # See Hacker's delight, equations (26) and (27) from Chapter 10-9.
-# nmax and divisor must be > 0.
+# requires nmax > divisor > 1.
 Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor)
     T = typeof(nmax)
     W = _widen(T)
     d = W(divisor)
 
-    nc = div(W(nmax) + W(1), d) * d - W(1)
-    nbits = sizeof(nmax) * 8 - leading_zeros(nmax) # most significant bit
-    for p in 0:2nbits
+    nc = div(W(nmax) + W(1), d) * d - W(1)      # largest multiple of d <= nmax, minus 1
+    nbits = 8sizeof(nmax) - leading_zeros(nmax) # most significant bit
+    # shift must be larger than int size because we want the high bits of the wide multiplication
+    for p in 8sizeof(nmax):2nbits
         if W(2)^p > nc * (d - W(1) - rem(W(2)^p - W(1), d))       # (27)
             m = div(W(2)^p + d - W(1) - rem(W(2)^p - W(1), d), d) # (26)
             return (m, p)
@@ -88,15 +89,16 @@ Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor)
 end
 
 # See Hacker's delight, equations (5) and (6) from Chapter 10-4.
-# nmax and divisor must be > 0.
+# requires nmax > divisor > 1.
 Base.@assume_effects :foldable function magicg(nmax::Signed, divisor)
     T = typeof(nmax)
     W = _widen(T)
     d = W(divisor)
 
-    nc = ((W(nmax) + W(1)) ÷ d) * d - W(1)
-    nbits = sizeof(nmax) * 8 - leading_zeros(nmax) # most significant bit
-    for p in 0:2nbits
+    nc = div(W(nmax) + W(1), d) * d - W(1)      # largest multiple of d <= nmax, minus 1
+    nbits = 8sizeof(nmax) - leading_zeros(nmax) # most significant bit
+    # shift must be larger than int size because we want the high bits of the wide multiplication
+    for p in 8sizeof(nmax):2nbits
         if W(2)^p > nc * (d - rem(W(2)^p, d))       # (6)
             m = div(W(2)^p + d - rem(W(2)^p, d), d) # (5)
             return (m, p)
@@ -122,5 +124,6 @@ function div_by_const(x::T, ::Val{C}) where {T, C}
 
     out = _widemul(promote(x, magic_number)...)
     out >>= shift
-    return (out % T) + (x < zero(T)) # TODO: Figure out why and if the + 1 if non-positive is necessary
+    # TODO: prove that we need to add 1 for negative numbers!
+    return (out % T) + (x < zero(T))
 end

From 4f4d17a75b859bcd5b89f932e1d8faaf2fb91b63 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?=
 <tomas.drvostep@gmail.com>
Date: Wed, 26 Jun 2024 17:13:13 +0200
Subject: [PATCH 24/32] .

---
 src/fldmod-by-const.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index e141adc..3dc44cf 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -70,7 +70,7 @@ end
 
 # Unsigned magic number computation + shift by constant
 # See Hacker's delight, equations (26) and (27) from Chapter 10-9.
-# requires nmax > divisor > 1.
+# requires nmax >= divisor > 2. divisor must not be a power of 2.
 Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor)
     T = typeof(nmax)
     W = _widen(T)
@@ -79,7 +79,7 @@ Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor)
     nc = div(W(nmax) + W(1), d) * d - W(1)      # largest multiple of d <= nmax, minus 1
     nbits = 8sizeof(nmax) - leading_zeros(nmax) # most significant bit
     # shift must be larger than int size because we want the high bits of the wide multiplication
-    for p in 8sizeof(nmax):2nbits
+    for p in nbits-1:2nbits-1
         if W(2)^p > nc * (d - W(1) - rem(W(2)^p - W(1), d))       # (27)
             m = div(W(2)^p + d - W(1) - rem(W(2)^p - W(1), d), d) # (26)
             return (m, p)
@@ -89,7 +89,7 @@ Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor)
 end
 
 # See Hacker's delight, equations (5) and (6) from Chapter 10-4.
-# requires nmax > divisor > 1.
+# requires nmax >= divisor > 2. divisor must not be a power of 2.
 Base.@assume_effects :foldable function magicg(nmax::Signed, divisor)
     T = typeof(nmax)
     W = _widen(T)
@@ -98,7 +98,7 @@ Base.@assume_effects :foldable function magicg(nmax::Signed, divisor)
     nc = div(W(nmax) + W(1), d) * d - W(1)      # largest multiple of d <= nmax, minus 1
     nbits = 8sizeof(nmax) - leading_zeros(nmax) # most significant bit
     # shift must be larger than int size because we want the high bits of the wide multiplication
-    for p in 8sizeof(nmax):2nbits
+    for p in nbits-1:2nbits-1
         if W(2)^p > nc * (d - rem(W(2)^p, d))       # (6)
             m = div(W(2)^p + d - rem(W(2)^p, d), d) # (5)
             return (m, p)
@@ -124,6 +124,6 @@ function div_by_const(x::T, ::Val{C}) where {T, C}
 
     out = _widemul(promote(x, magic_number)...)
     out >>= shift
-    # TODO: prove that we need to add 1 for negative numbers!
+    # Adding one as implied by formula (1b) in Hacker's delight, Chapter 10-4.
     return (out % T) + (x < zero(T))
 end

From eaeaddfa6f8b025c712e82b7c4173bc3e91a4c38 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Tue, 9 Jul 2024 18:51:12 -0600
Subject: [PATCH 25/32] Restrict back to just Int128 & Int256 for custom
 div_by_const

LLVM _does_ do this automatically if you directly call `fldmod`.

Improve the comments as well.
---
 src/fldmod-by-const.jl | 49 +++++++++++++++++++++---------------------
 1 file changed, 24 insertions(+), 25 deletions(-)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index 3dc44cf..7e3bff8 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -1,31 +1,35 @@
-# NOTE: Surprisingly, even though LLVM implements a version of this optimization on its own
-# for smaller integer sizes (<=64-bits), using the code in this file produces faster
-# multiplications for *all* types of integers. So we use our custom fldmod_by_const for all
-# bit integer types.
+# This file includes a manual implementation of the divide-by-constant optimization for
+# non-power-of-2 divisors. LLVM automatically includes this optimization, but only for
+# smaller integer sizes (<=64-bits on my 64-bit machine).
+#
+# NOTE: We use LLVM's built-in implementation for Int64 and smaller, to keep the code
+# simpler (though the code we produce is identical, verified in tests.) We apply this
+# optimization to (U)Int128 and (U)Int256, which result from multiplying FD{Int64}s and
+# FD{Int128}s.
 # Before:
 # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234))
-#     84.959 μs (0 allocations: 0 bytes)
+#     54.125 μs (0 allocations: 0 bytes)   # (Unchanged)
 #   FixedDecimal{Int32,3}(1700943.280)
 #
 # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234))
-#     247.709 μs (0 allocations: 0 bytes)
+#     174.625 μs (0 allocations: 0 bytes)
 #   FixedDecimal{Int64,3}(4230510070790917.029)
 #
-#julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234))
-#    4.077 ms (160798 allocations: 3.22 MiB)
-#  FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324)
+# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234))
+#     2.119 ms (79986 allocations: 1.60 MiB)
+#   FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324)
 #
 # After:
 # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234))
-#     68.416 μs (0 allocations: 0 bytes)
+#     56.958 μs (0 allocations: 0 bytes)   # (Unchanged)
 #   FixedDecimal{Int32,3}(1700943.280)
 #
 # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234))
-#     106.125 μs (0 allocations: 0 bytes)
+#     90.708 μs (0 allocations: 0 bytes)
 #   FixedDecimal{Int64,3}(4230510070790917.029)
 #
 # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234))
-#     204.125 μs (0 allocations: 0 bytes)
+#     180.167 μs (0 allocations: 0 bytes)
 #   FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324)
 
 """
@@ -34,8 +38,8 @@ A trait to control opt-in for the custom `fldmod_by_const` implementation. To us
 given integer type, you can define this overload for your integer type.
 You will also need to implement some parts of the interface below, including _widen().
 """
-ShouldUseCustomFldmodByConst(::Type{<:Base.BitInteger}) = true
-ShouldUseCustomFldmodByConst(::Type{<:Union{Int256,UInt256}}) = true
+ShouldUseCustomFldmodByConst(::Type{<:Union{Int128,UInt128}}) = true  # For FD{Int64}
+ShouldUseCustomFldmodByConst(::Type{<:Union{Int256,UInt256}}) = true  # For FD{Int128}
 ShouldUseCustomFldmodByConst(::Type) = false
 
 @inline function fldmod_by_const(x, y)
@@ -43,12 +47,12 @@ ShouldUseCustomFldmodByConst(::Type) = false
         # For large Int types, LLVM doesn't optimize well, so we use a custom implementation
         # of fldmod, which extends that optimization to those larger integer types.
         d = fld_by_const(x, Val(y))
-        return d, manual_mod(promote(x, y, d)...)
+        return d, (x - d * y)
     else
         # For other integers, LLVM might be able to correctly optimize away the division, if
-        # it knows it's dividing by a const. We cannot call `Base.fldmod` since it's not
-        # inlined, so here we have explictly inlined it instead.
-        return (fld(x,y), mod(x,y))
+        # it knows it's dividing by a const.
+        # Since julia 1.8+, fldmod(x,y) automatically optimizes for constant divisors.
+        return fldmod(x, y)
     end
 end
 
@@ -61,15 +65,9 @@ function fld_by_const(x::T, y::Val{C}) where {T<:Signed, C}
     return d - (signbit(x ⊻ C) & (d * C != x))
 end
 
-# Calculate `mod(x,y)` after you've already acquired quotient, the result of `fld(x,y)`.
-# REQUIRES:
-#   - `y != -1`
-@inline function manual_mod(x::T, y::T, quotient::T) where T<:Integer
-    return x - quotient * y
-end
-
 # Unsigned magic number computation + shift by constant
 # See Hacker's delight, equations (26) and (27) from Chapter 10-9.
+# (See also the errata on https://web.archive.org/web/20190915025154/http://www.hackersdelight.org/)
 # requires nmax >= divisor > 2. divisor must not be a power of 2.
 Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor)
     T = typeof(nmax)
@@ -89,6 +87,7 @@ Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor)
 end
 
 # See Hacker's delight, equations (5) and (6) from Chapter 10-4.
+# (See also the errata on https://web.archive.org/web/20190915025154/http://www.hackersdelight.org/)
 # requires nmax >= divisor > 2. divisor must not be a power of 2.
 Base.@assume_effects :foldable function magicg(nmax::Signed, divisor)
     T = typeof(nmax)

From a2dcf56e6d471dbfa02f4883b7829c8430bfde5c Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Tue, 9 Jul 2024 18:52:15 -0600
Subject: [PATCH 26/32] It turns out that in newer versions of julia, you
 should call fldmod, not fld(x,y),mod(x,y), even if y is a constant! :)

Improved that for julia versions 1.8 - 1.9
---
 src/FixedPointDecimals.jl | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl
index ebe0702..38e45bb 100644
--- a/src/FixedPointDecimals.jl
+++ b/src/FixedPointDecimals.jl
@@ -49,7 +49,13 @@ else
     # We force-inline this, to allow the fldmod operation to be specialized for a constant
     # second argument (converting divide-by-power-of-ten instructions with the cheaper
     # multiply-by-inverse-coefficient.) Sadly this doesn't work for Int128.
-    @inline fldmod_by_const(x,y) = (fld(x,y), mod(x,y))
+    if VERSION >= v"1.8.0-"
+        # Since julia 1.8+, the built-in fldmod optimizes for constant divisors, which is
+        # even better than computing the division twice.
+        @inline fldmod_by_const(x,y) = fldmod(x,y)
+    else
+        @inline fldmod_by_const(x,y) = (fld(x,y), mod(x,y))
+    end
 end
 
 # floats that support fma and are roughly IEEE-like

From ef578e91f47ac3319b152350bf69a289a4b22823 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Tue, 9 Jul 2024 18:56:46 -0600
Subject: [PATCH 27/32] Reorganize the functions to be top-down

---
 src/fldmod-by-const.jl | 47 +++++++++++++++++++++---------------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index 7e3bff8..cf200e8 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -3,9 +3,8 @@
 # smaller integer sizes (<=64-bits on my 64-bit machine).
 #
 # NOTE: We use LLVM's built-in implementation for Int64 and smaller, to keep the code
-# simpler (though the code we produce is identical, verified in tests.) We apply this
-# optimization to (U)Int128 and (U)Int256, which result from multiplying FD{Int64}s and
-# FD{Int128}s.
+# simpler (though the code we produce is identical). We apply this optimization to (U)Int128
+# and (U)Int256, which result from multiplying FD{Int64}s and FD{Int128}s.
 # Before:
 # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234))
 #     54.125 μs (0 allocations: 0 bytes)   # (Unchanged)
@@ -65,6 +64,27 @@ function fld_by_const(x::T, y::Val{C}) where {T<:Signed, C}
     return d - (signbit(x ⊻ C) & (d * C != x))
 end
 
+function div_by_const(x::T, ::Val{C}) where {T, C}
+    # These checks will be compiled away during specialization.
+    # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these
+    # checks allow this function to work for any `C > 0`, in case that's useful in the
+    # future.
+    if C == 1
+        return x
+    elseif ispow2(C)
+        return div(x, C) # Will already do the right thing
+    elseif C <= 0
+        throw(DomainError("C must be > 0"))
+    end
+    # Calculate the magic number and shift amount, based on Hacker's Delight, Chapter 10.
+    magic_number, shift = magicg(typemax(T), C)
+
+    out = _widemul(promote(x, magic_number)...)
+    out >>= shift
+    # Adding one as implied by formula (1b) in Hacker's delight, Chapter 10-4.
+    return (out % T) + (x < zero(T))
+end
+
 # Unsigned magic number computation + shift by constant
 # See Hacker's delight, equations (26) and (27) from Chapter 10-9.
 # (See also the errata on https://web.archive.org/web/20190915025154/http://www.hackersdelight.org/)
@@ -105,24 +125,3 @@ Base.@assume_effects :foldable function magicg(nmax::Signed, divisor)
     end
     return W(0), 0 # Should never reach here
 end
-
-function div_by_const(x::T, ::Val{C}) where {T, C}
-    # These checks will be compiled away during specialization.
-    # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these
-    # checks allow this function to work for any `C > 0`, in case that's useful in the
-    # future.
-    if C == 1
-        return x
-    elseif ispow2(C)
-        return div(x, C) # Will already do the right thing
-    elseif C <= 0
-        throw(DomainError("C must be > 0"))
-    end
-    # Calculate the magic number and shift amount, based on Hacker's Delight, Chapter 10.
-    magic_number, shift = magicg(typemax(T), C)
-
-    out = _widemul(promote(x, magic_number)...)
-    out >>= shift
-    # Adding one as implied by formula (1b) in Hacker's delight, Chapter 10-4.
-    return (out % T) + (x < zero(T))
-end

From 8df68b5fe1fced4f8aff35c52d8f56cc216fcfe7 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Tue, 9 Jul 2024 19:23:22 -0600
Subject: [PATCH 28/32] More thorough tests for flmdod_by_const

---
 test/fldmod-by-const_tests.jl | 37 +++++++++++++++--------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl
index 95dfce9..2a63031 100644
--- a/test/fldmod-by-const_tests.jl
+++ b/test/fldmod-by-const_tests.jl
@@ -23,33 +23,25 @@ end
     end
 end
 
-@testset "fixed decimal multiplication - exhaustive 8-bit" begin
-    @testset for P in (0,1)
-        @testset for T in (Int8, UInt8)
-            FD = FixedDecimal{T,P}
-
-            function test_multiplies_correctly(fd, x)
-                big = FixedDecimal{BigInt, P}(fd)
-                big_mul = big * x
-                # This might overflow: ...
-                mul = fd * x
-                @testset "$fd * $x" begin
-                    # ... so we truncate big to the same size
-                    @test big_mul.i % T == mul.i % T
-                end
-            end
-            @testset for v in typemin(FD) : eps(FD) : typemax(FD)
-                @testset for v2 in typemin(FD) : eps(FD) : typemax(FD)
-                    test_multiplies_correctly(v, v2)
+@testset "flmdod_by_const - exhaustive 8-bit" begin
+    @testset for T in (Int8, UInt8)
+        @testset for x in typemin(T) : typemax(T)
+            @testset for y in typemin(T) : typemax(T)
+                y == 0 && continue
+                y == -1 && continue
+                @testset "fldmod($x, $y)" begin
+                    @test fldmod(x, y) == FixedPointDecimals.fldmod_by_const(x, y)
                 end
             end
         end
     end
 end
 
-@testset "fixed decimal multiplication - exhaustive 16-bit" begin
+# 64-bit FD multiplication uses the custom fldmod_by_const implementation.
+# Test a few various different cases to try to ensure it works correctly.
+@testset "fixed decimal multiplication - 64-bit" begin
     @testset for P in (0,1,2,3,4)
-        @testset for T in (Int16, UInt16)
+        @testset for T in (Int64, UInt64)
             FD = FixedDecimal{T,P}
 
             function test_multiplies_correctly(fd, x)
@@ -62,7 +54,10 @@ end
                     @test big_mul.i % T == mul.i % T
                 end
             end
-            @testset for v in typemin(FD) : eps(FD) : typemax(FD)
+            num_tests = 2<<11
+            # Add one to avoid powers-of-2 to get hopefully better tests
+            epsilon = (typemax(FD) ÷ num_tests) + eps(FD)
+            @testset for v in typemin(FD) : epsilon : typemax(FD)
                 test_multiplies_correctly(v, typemin(T))
                 test_multiplies_correctly(v, -1)
                 test_multiplies_correctly(v, -eps(FD))

From fd096ace8f4c28eede455c91ca799b2ff88e9439 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Mon, 12 Aug 2024 11:52:54 -0600
Subject: [PATCH 29/32] Bump patch version number

---
 Project.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Project.toml b/Project.toml
index f120b6c..9bef680 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "FixedPointDecimals"
 uuid = "fb4d412d-6eee-574d-9565-ede6634db7b0"
 authors = ["Fengyang Wang <fengyang.wang.0@gmail.com>", "Curtis Vogt <curtis.vogt@gmail.com>"]
-version = "0.5.3"
+version = "0.5.4"
 
 [deps]
 BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"

From e147f5cd3d890e0125e235591a796a94366c4669 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Tue, 13 Aug 2024 10:37:06 -0600
Subject: [PATCH 30/32] Add _widemul unit test

---
 test/FixedDecimal.jl | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/test/FixedDecimal.jl b/test/FixedDecimal.jl
index 18d87d7..41fcda8 100644
--- a/test/FixedDecimal.jl
+++ b/test/FixedDecimal.jl
@@ -1408,3 +1408,25 @@ end
     @test hash(FD2(1//10)) ≠ hash(0.1)
 end
 
+@testset "_widemul" begin
+    _widemul = FixedPointDecimals._widemul
+    Int256, UInt256 = FixedPointDecimals.Int256, FixedPointDecimals.UInt256
+    Int512, UInt512 = FixedPointDecimals.Int512, FixedPointDecimals.UInt512
+
+    @test _widemul(UInt8(3), UInt8(2)) === widemul(UInt8(3), UInt8(2)) === UInt16(6)
+    @test _widemul(Int8(3), UInt8(2)) === widemul(Int8(3), UInt8(2)) === Int16(6)
+    @test _widemul(UInt8(3), Int8(2)) === widemul(UInt8(3), Int8(2)) === Int16(6)
+
+    @test _widemul(UInt16(3), UInt8(2)) === widemul(UInt16(3), UInt8(2)) === UInt32(6)
+    @test _widemul(UInt16(3), Int8(2)) === widemul(UInt16(3), Int8(2)) === Int32(6)
+    @test _widemul(Int16(3), UInt8(2)) === widemul(Int16(3), UInt8(2)) === Int32(6)
+
+    # Custom widenings
+    @test _widemul(UInt128(3), UInt128(2)) === UInt256(6)
+    @test _widemul(Int128(3), UInt128(2)) === Int256(6)
+    @test _widemul(UInt128(3), Int128(2)) === Int256(6)
+
+    @test _widemul(UInt256(3), UInt256(2)) === UInt512(6)
+    @test _widemul(Int256(3), UInt256(2)) === Int512(6)
+    @test _widemul(UInt256(3), Int256(2)) === Int512(6)
+end

From f830c3c98fca678c9d21c91ccca5aa96fea9c616 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Tue, 13 Aug 2024 10:40:49 -0600
Subject: [PATCH 31/32] Change "unreachable" comment to an `@assert false`

---
 src/fldmod-by-const.jl | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl
index cf200e8..d36b4da 100644
--- a/src/fldmod-by-const.jl
+++ b/src/fldmod-by-const.jl
@@ -103,7 +103,9 @@ Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor)
             return (m, p)
         end
     end
-    return W(0), 0 # Should never reach here
+    @assert false """magicg bug: Unreachable reached. divisor=$divisor, nmax=$nmax.
+        Please report an issue to https://github.com/JuliaMath/FixedPointDecimals.jl
+        """
 end
 
 # See Hacker's delight, equations (5) and (6) from Chapter 10-4.
@@ -123,5 +125,7 @@ Base.@assume_effects :foldable function magicg(nmax::Signed, divisor)
             return (m, p)
         end
     end
-    return W(0), 0 # Should never reach here
+    @assert false """magicg bug: Unreachable reached. divisor=$divisor, nmax=$nmax.
+        Please report an issue to https://github.com/JuliaMath/FixedPointDecimals.jl
+        """
 end

From 71ba82e3b80f3ac53833627424abac27d2375359 Mon Sep 17 00:00:00 2001
From: Nathan Daly <nathan.daly@relational.ai>
Date: Mon, 9 Sep 2024 18:08:11 -0600
Subject: [PATCH 32/32] add test comment

---
 test/fldmod-by-const_tests.jl | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl
index 2a63031..42b04d0 100644
--- a/test/fldmod-by-const_tests.jl
+++ b/test/fldmod-by-const_tests.jl
@@ -23,6 +23,9 @@ end
     end
 end
 
+# We don't actually use fldmod_by_const with 8-bit ints, but they're useful because
+# we can exhaustively test every possible combination, to increase our confidence in
+# the implementation.
 @testset "flmdod_by_const - exhaustive 8-bit" begin
     @testset for T in (Int8, UInt8)
         @testset for x in typemin(T) : typemax(T)