From a1c17112416949c0bd3af78f2edb45bfc7c6323d Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 12 Jun 2024 13:11:28 -0600 Subject: [PATCH 01/32] Use Int256 to avoid BigInt in FD operations. We do not here explicitly introduce support for FD{BitIntegers.Int256}, though that should work out of the box both before and after this PR. Rather, this PR _uses_ a (U)Int256 under the hood to prevent allocations from Int128 widening to BigInt in FD operations. --- Project.toml | 1 + src/FixedPointDecimals.jl | 32 +++++++++++++++++++++++++------- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/Project.toml b/Project.toml index 3f313f9..2301391 100644 --- a/Project.toml +++ b/Project.toml @@ -4,6 +4,7 @@ authors = ["Fengyang Wang ", "Curtis Vogt Date: Wed, 12 Jun 2024 13:27:48 -0600 Subject: [PATCH 02/32] Further reduce BigInts by skipping a `rem()` in iseven --- src/FixedPointDecimals.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl index 317699b..ac9e5b5 100644 --- a/src/FixedPointDecimals.jl +++ b/src/FixedPointDecimals.jl @@ -175,7 +175,9 @@ function _round_to_nearest(quotient::T, divisor::T, ::RoundingMode{M}=RoundNearest) where {T <: Integer, M} halfdivisor = divisor >> 1 - if iseven(divisor) && remainder == halfdivisor + # PERF Note: Only need the last bit to check iseven, and default iseven(Int256) + # allocates, so we truncate first. + if iseven((divisor % Int8)) && remainder == halfdivisor # `:NearestTiesAway` will tie away from zero, e.g. -8.5 -> # -9. `:NearestTiesUp` will always ties towards positive # infinity. `:Nearest` will tie towards the nearest even From 78e45dc31960b960728dd6c1825132ee771e783f Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 12 Jun 2024 13:39:22 -0600 Subject: [PATCH 03/32] Fix ambiguity in _widemul(Int256, UInt256) --- src/FixedPointDecimals.jl | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl index ac9e5b5..28b8611 100644 --- a/src/FixedPointDecimals.jl +++ b/src/FixedPointDecimals.jl @@ -123,17 +123,16 @@ end # Custom widemul implementation to avoid the cost of widening to BigInt. # FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt. const BitInteger128 = Union{Int128, UInt128} -_widemul(x, y) = widemul(x, y) -_widemul(x::BitInteger128, y) = _widemul(promote(x, y)...) -_widemul(x, y::BitInteger128) = _widemul(promote(x, y)...) -_widemul(x::Int128, y::Int128) = BitIntegers.Int256(x) * BitIntegers.Int256(y) -_widemul(x::UInt128, y::UInt128) = BitIntegers.UInt256(x) * BitIntegers.UInt256(y) +_widemul(x, y) = _widen(x) * _widen(y) +_widemul(x::Signed,y::Unsigned) = _widen(x) * signed(_widen(y)) +_widemul(x::Unsigned,y::Signed) = signed(_widen(x)) * _widen(y) # Custom widen implementation to avoid the cost of widening to BigInt. # FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt. _widen(::Type{Int128}) = BitIntegers.Int256 _widen(::Type{UInt128}) = BitIntegers.UInt256 -_widen(t) = widen(t) +_widen(t::Type) = widen(t) +_widen(x::T) where {T} = (_widen(T))(x) (::Type{T})(x::Real) where {T <: FD} = convert(T, x) From 879c60229e436dbf5d942a522571600b067d5708 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 12 Jun 2024 13:59:47 -0600 Subject: [PATCH 04/32] Bump patch version number --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 2301391..f3fcdaf 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "FixedPointDecimals" uuid = "fb4d412d-6eee-574d-9565-ede6634db7b0" authors = ["Fengyang Wang ", "Curtis Vogt "] -version = "0.5.2" +version = "0.5.3" [deps] BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1" From a24565110dc16462cb2e7b2bd81adb161bd9ef20 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 12 Jun 2024 14:01:10 -0600 Subject: [PATCH 05/32] Add compat for BitIntegers --- Project.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/Project.toml b/Project.toml index f3fcdaf..f120b6c 100644 --- a/Project.toml +++ b/Project.toml @@ -9,6 +9,7 @@ Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" [compat] Parsers = "2.7" +BitIntegers = "0.3.1" julia = "1.6" [extras] From 4bd8c45b4f57b8691c12dde3f6ddc2b4678cda66 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 12 Jun 2024 14:47:58 -0600 Subject: [PATCH 06/32] Finish removing the BigInts from * for FD{Int128}! MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Finally implements the fast-multiplication optimization from https://github.com/JuliaMath/FixedPointDecimals.jl/pull/45, but this time for 128-bit FixedDecimals! :) This is a follow-up to https://github.com/JuliaMath/FixedPointDecimals.jl/pull/93, which introduces an Int256 type for widemul. However, the fldmod still required 2 BigInt allocations. Now, this PR uses a custom implementation of the LLVM div-by-const optimization for (U)Int256, which briefly widens to Int512 (😅) to perform the fldmod by the constant 10^f coefficient. This brings 128-bit FD multiply to the same performance as 64-bit. :) --- src/FixedPointDecimals.jl | 24 ++++----- src/fldmod-by-const.jl | 96 +++++++++++++++++++++++++++++++++++ test/fldmod-by-const_tests.jl | 58 +++++++++++++++++++++ test/runtests.jl | 7 ++- 4 files changed, 171 insertions(+), 14 deletions(-) create mode 100644 src/fldmod-by-const.jl create mode 100644 test/fldmod-by-const_tests.jl diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl index 28b8611..df8f75a 100644 --- a/src/FixedPointDecimals.jl +++ b/src/FixedPointDecimals.jl @@ -36,9 +36,11 @@ export checked_abs, checked_add, checked_cld, checked_div, checked_fld, using Base: decompose, BitInteger -import BitIntegers # For 128-bit _widemul / _widen +using BitIntegers: BitIntegers, UInt256, Int256 import Parsers +include("fldmod-by-const.jl") + # floats that support fma and are roughly IEEE-like const FMAFloat = Union{Float16, Float32, Float64, BigFloat} @@ -129,8 +131,10 @@ _widemul(x::Unsigned,y::Signed) = signed(_widen(x)) * _widen(y) # Custom widen implementation to avoid the cost of widening to BigInt. # FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt. -_widen(::Type{Int128}) = BitIntegers.Int256 -_widen(::Type{UInt128}) = BitIntegers.UInt256 +_widen(::Type{Int128}) = Int256 +_widen(::Type{UInt128}) = UInt256 +_widen(::Type{Int256}) = BitIntegers.Int512 +_widen(::Type{UInt256}) = BitIntegers.UInt512 _widen(t::Type) = widen(t) _widen(x::T) where {T} = (_widen(T))(x) @@ -196,18 +200,12 @@ function _round_to_nearest(quotient::T, end _round_to_nearest(q, r, d, m=RoundNearest) = _round_to_nearest(promote(q, r, d)..., m) -# In many of our calls to fldmod, `y` is a constant (the coefficient, 10^f). However, since -# `fldmod` is sometimes not being inlined, that constant information is not available to the -# optimizer. We need an inlined version of fldmod so that the compiler can replace expensive -# divide-by-power-of-ten instructions with the cheaper multiply-by-inverse-coefficient. -@inline fldmodinline(x,y) = (fld(x,y), mod(x,y)) - # multiplication rounds to nearest even representation # TODO: can we use floating point to speed this up? after we build a # correctness test suite. function Base.:*(x::FD{T, f}, y::FD{T, f}) where {T, f} powt = coefficient(FD{T, f}) - quotient, remainder = fldmodinline(_widemul(x.i, y.i), powt) + quotient, remainder = fldmod_by_const(_widemul(x.i, y.i), powt) reinterpret(FD{T, f}, _round_to_nearest(quotient, remainder, powt)) end @@ -234,12 +232,12 @@ function Base.round(x::FD{T, f}, RoundingMode{:NearestTiesUp}, RoundingMode{:NearestTiesAway}}=RoundNearest) where {T, f} powt = coefficient(FD{T, f}) - quotient, remainder = fldmodinline(x.i, powt) + quotient, remainder = fldmod_by_const(x.i, powt) FD{T, f}(_round_to_nearest(quotient, remainder, powt, m)) end function Base.ceil(x::FD{T, f}) where {T, f} powt = coefficient(FD{T, f}) - quotient, remainder = fldmodinline(x.i, powt) + quotient, remainder = fldmod_by_const(x.i, powt) if remainder > 0 FD{T, f}(quotient + one(quotient)) else @@ -435,7 +433,7 @@ function Base.checked_sub(x::T, y::T) where {T<:FD} end function Base.checked_mul(x::FD{T,f}, y::FD{T,f}) where {T<:Integer,f} powt = coefficient(FD{T, f}) - quotient, remainder = fldmodinline(_widemul(x.i, y.i), powt) + quotient, remainder = fldmod_by_const(_widemul(x.i, y.i), powt) v = _round_to_nearest(quotient, remainder, powt) typemin(T) <= v <= typemax(T) || Base.Checked.throw_overflowerr_binaryop(:*, x, y) return reinterpret(FD{T, f}, T(v)) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl new file mode 100644 index 0000000..48ffcfb --- /dev/null +++ b/src/fldmod-by-const.jl @@ -0,0 +1,96 @@ + +const BitInteger256 = Union{UInt256, Int256} + +@inline function fldmod_by_const(x, y) + # For small-to-normal integers, LLVM can correctly optimize away the division, if it + # knows it's dividing by a const. We cannot call `Base.fldmod` since it's not + # inlined, so here we have explictly inlined it instead. + return (fld(x,y), mod(x,y)) +end +@inline function fldmod_by_const(x::BitInteger256, y) + # For large or non-standard Int types, LLVM doesn't optimize + # well, so we use a custom implementation of fldmod. + d = fld_by_const(x, Val(y)) + return d, manual_mod(promote(x, y, d)...) +end + +# Calculate fld(x,y) when y is a Val constant. +# The implementation for fld_by_const was lifted directly from Base.fld(x,y), except that +# it uses `div_by_const` instead of `div`. +fld_by_const(x::T, y::Val{C}) where {T<:UInt256, C} = div_by_const(x, y) +function fld_by_const(x::T, y::Val{C}) where {T<:Int256, C} + d = div_by_const(x, y) + return d - (signbit(x ⊻ C) & (d * C != x)) +end + +# Calculate `mod(x,y)` after you've already acquired quotient, the result of `fld(x,y)`. +# REQUIRES: +# - `y != -1` +@inline function manual_mod(x::T, y::T, quotient::T) where T<:BitInteger256 + return x - quotient * y +end + +Base.@assume_effects :total function div_by_const(x::T, ::Val{C}) where {T, C} + # These checks will be compiled away during specialization. + # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these + # checks allow this function to work for any `C > 0`, in case that's useful in the + # future. + if C == 1 + return x + elseif ispow2(C) + return div(x, C) # Will already do the right thing + elseif C <= 0 + throw(DomainError("C must be > 0")) + end + # Calculate the magic number 2^N/C. Note that this is computed statically, not at + # runtime. + inverse_coeff, toshift = calculate_inverse_coeff(T, C) + # Compute the upper-half of widemul(x, 2^nbits(T)/C). + # By keeping only the upper half, we're essentially dividing by 2^nbits(T), undoing the + # numerator of the multiplication, so that the result is equal to x/C. + out = mul_hi(x, inverse_coeff) + # This condition will be compiled away during specialization. + if T <: Signed + # Because our magic number has a leading one (since we shift all-the-way left), the + # result is negative if it's Signed. We add x to give us the positive equivalent. + out += x + signshift = (nbits(x) - 1) + isnegative = T(out >>> signshift) # 1 if < 0 else 0 (Unsigned bitshift to read top bit) + end + # Undo the bitshifts used to calculate the invoeff magic number with maximum precision. + out = out >> toshift + if T <: Signed + out = out + isnegative + end + return T(out) +end + +Base.@assume_effects :foldable function calculate_inverse_coeff(::Type{T}, C) where {T} + # First, calculate 2^nbits(T)/C + # We shift away leading zeros to preserve the most precision when we use it to multiply + # in the next step. At the end, we will shift the final answer back to undo this + # operation (which is why we need to return `toshift`). + # Note, also, that we calculate invcoeff at double-precision so that the left-shift + # doesn't leave trailing zeros. We truncate to only the upper-half before returning. + UT = _unsigned(T) + invcoeff = typemax(_widen(UT)) ÷ C + toshift = leading_zeros(invcoeff) + invcoeff = invcoeff << toshift + # Now, truncate to only the upper half of invcoeff, after we've shifted. Instead of + # bitshifting, we round to maintain precision. (This is needed to prevent off-by-ones.) + # -- This is equivalent to `invcoeff = T(invcoeff >> sizeof(T))`, except rounded. -- + invcoeff = _round_to_nearest(fldmod(invcoeff, typemax(UT))..., typemax(UT)) % T + return invcoeff, toshift +end + +function mul_hi(x::T, y::T) where T + xy = _widemul(x, y) # support Int256 -> Int512 (!!) + (xy >> nbits(T)) % T +end + +# Annoyingly, Unsigned(T) isn't defined for BitIntegers types: +# https://github.com/rfourquet/BitIntegers.jl/pull/2 +_unsigned(x) = unsigned(x) +_unsigned(::Type{Int256}) = UInt256 + +nbits(x) = sizeof(x) * 8 diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl new file mode 100644 index 0000000..6b6a8d1 --- /dev/null +++ b/test/fldmod-by-const_tests.jl @@ -0,0 +1,58 @@ +using Test +using FixedPointDecimals + +@testset "calculate_inverse_coeff signed" begin + using FixedPointDecimals: calculate_inverse_coeff + + # The correct magic number here comes from investigating the following native code + # produced on an m2 aarch64 macbook pro: + # @code_native ((x)->fld(x,100))(2) + # ... + # mov x8, #55051 + # movk x8, #28835, lsl #16 + # movk x8, #2621, lsl #32 + # movk x8, #41943, lsl #48 + # Where: + # julia> 55051 | 28835 << 16 | 2621 << 32 | 41943 << 48 + # -6640827866535438581 + @test calculate_inverse_coeff(Int64, 100) == (-6640827866535438581, 6) + + # Same for the tests below: + + # (LLVM's magic number is shifted one bit less, then they shift by 2, instead of 3, + # but the result is the same.) + @test calculate_inverse_coeff(Int64, 10) == (7378697629483820647 << 1, 3) + + @test calculate_inverse_coeff(Int64, 1) == (1, 0) +end + +@testset "calculate_inverse_coeff signed 4" begin + using FixedPointDecimals: calculate_inverse_coeff + + # Same here, our magic number is shifted 2 bits more than LLVM's + @test calculate_inverse_coeff(UInt64, 100) == (0xa3d70a3d70a3d70b, 6) + + @test calculate_inverse_coeff(UInt64, 1) == (UInt64(0x1), 0) +end + +@testset "div_by_const" begin + vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32] + for a_base in vals + # Only test negative numbers on `a`, since div_by_const requires b > 0. + @testset for (a, b, f) in Iterators.product((a_base, -a_base), vals, (unsigned, signed)) + a, b = promote(f(a), f(b)) + @test FixedPointDecimals.div_by_const(a, Val(b)) == a ÷ b + end + end +end + +@testset "fldmod_by_const" begin + vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32] + for a_base in vals + # Only test negative numbers on `a`, since fldmod_by_const requires b > 0. + @testset for (a, b, f) in Iterators.product((a_base, -a_base), vals, (unsigned, signed)) + a, b = promote(f(a), f(b)) + @test FixedPointDecimals.fldmod_by_const(a, b) == fldmod(a, b) + end + end +end diff --git a/test/runtests.jl b/test/runtests.jl index 81f9cea..62b2987 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,4 +10,9 @@ include(joinpath(pkg_path, "test", "utils.jl")) @testset "FixedPointDecimals" begin include("FixedDecimal.jl") -end # global testset +end + +@testset "FixedPointDecimals" begin + include("fldmod-by-const_tests.jl") +end + From 4e53f3d1d990f1b78abd79f599335bfdfe7a2bec Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 12 Jun 2024 20:37:13 -0600 Subject: [PATCH 07/32] Support older versions of julia --- src/FixedPointDecimals.jl | 13 ++++++++++++- src/fldmod-by-const.jl | 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl index df8f75a..c42f4ce 100644 --- a/src/FixedPointDecimals.jl +++ b/src/FixedPointDecimals.jl @@ -39,7 +39,18 @@ using Base: decompose, BitInteger using BitIntegers: BitIntegers, UInt256, Int256 import Parsers -include("fldmod-by-const.jl") +# The effects system in newer versions of Julia is necessary for the fldmod_by_const +# optimization to take affect without adverse performance impacts. +# For older versions of julia, we will fall back to the flmod implementation, which works +# very fast for all FixedDecimals of size <= 64 bits. +if isdefined(Base, Symbol("@assume_effects")) + include("fldmod-by-const.jl") +else + # We force-inline this, to allow the fldmod operation to be specialized for a constant + # second argument (converting divide-by-power-of-ten instructions with the cheaper + # multiply-by-inverse-coefficient.) Sadly this doesn't work for Int128. + @inline fldmod_by_const(x,y) = (fld(x,y), mod(x,y)) +end # floats that support fma and are roughly IEEE-like const FMAFloat = Union{Float16, Float32, Float64, BigFloat} diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index 48ffcfb..e71dc20 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -30,7 +30,7 @@ end return x - quotient * y end -Base.@assume_effects :total function div_by_const(x::T, ::Val{C}) where {T, C} +function div_by_const(x::T, ::Val{C}) where {T, C} # These checks will be compiled away during specialization. # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these # checks allow this function to work for any `C > 0`, in case that's useful in the From dfd41b1be3484356d67d21e3804ae033216af1d8 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 12 Jun 2024 20:42:43 -0600 Subject: [PATCH 08/32] Comments --- src/fldmod-by-const.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index e71dc20..26cc3ff 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -30,6 +30,8 @@ end return x - quotient * y end +# This function is based on the native code produced by the following: +# @code_native ((x)->div(x, 100))(Int64(2)) function div_by_const(x::T, ::Val{C}) where {T, C} # These checks will be compiled away during specialization. # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these From efee91bd2d53cf2ffbd71a28b7416525bc5c5da7 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 12 Jun 2024 21:50:06 -0600 Subject: [PATCH 09/32] Disable fldmod-by-const tests on older julia --- test/runtests.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index 62b2987..583bed7 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -12,7 +12,10 @@ include(joinpath(pkg_path, "test", "utils.jl")) include("FixedDecimal.jl") end + +if isdefined(Base, Symbol("@assume_effects")) @testset "FixedPointDecimals" begin include("fldmod-by-const_tests.jl") end +end From a03d754dc995f39df37734b6a229eb1e24e86edc Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 12 Jun 2024 22:00:37 -0600 Subject: [PATCH 10/32] Fix one other case of iseven allocating a BigInt --- src/FixedPointDecimals.jl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl index c42f4ce..0bb3133 100644 --- a/src/FixedPointDecimals.jl +++ b/src/FixedPointDecimals.jl @@ -191,7 +191,8 @@ function _round_to_nearest(quotient::T, halfdivisor = divisor >> 1 # PERF Note: Only need the last bit to check iseven, and default iseven(Int256) # allocates, so we truncate first. - if iseven((divisor % Int8)) && remainder == halfdivisor + _iseven(x) = (x & 0x1) == 0 + if _iseven(divisor) && remainder == halfdivisor # `:NearestTiesAway` will tie away from zero, e.g. -8.5 -> # -9. `:NearestTiesUp` will always ties towards positive # infinity. `:Nearest` will tie towards the nearest even @@ -199,7 +200,7 @@ function _round_to_nearest(quotient::T, if M == :NearestTiesAway ifelse(quotient < zero(quotient), quotient, quotient + one(quotient)) elseif M == :Nearest - ifelse(iseven(quotient), quotient, quotient + one(quotient)) + ifelse(_iseven(quotient), quotient, quotient + one(quotient)) else quotient + one(quotient) end From 4ed8ebf6750bd3a5a91678ea06af8192c72139e8 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Thu, 13 Jun 2024 11:54:57 -0600 Subject: [PATCH 11/32] Apply this optimization to FD{Int64} as well. --- src/fldmod-by-const.jl | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index 26cc3ff..7c7ff81 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -1,5 +1,18 @@ +# NOTE: We apply this optimization to values of type (U)Int128 and (U)Int256, which covers +# FixedDecimal{(U)Int64} and FixedDecimal{(U)Int128}. +# Julia+LLVM have built-in optimizations that apply this already for FD{(U)Int64}, however +# this customized implementation appears to produce still even faster code than LLVM can +# produce on its own. So we apply this for both sizes. +# Before: +# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234)) +# 247.709 μs (0 allocations: 0 bytes) +# FixedDecimal{Int64,3}(4230510070790917.029) +# After: +# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234)) +# 106.125 μs (0 allocations: 0 bytes) +# FixedDecimal{Int64,3}(4230510070790917.029) -const BitInteger256 = Union{UInt256, Int256} +const BigBitIntegers = Union{UInt128, Int128, UInt256, Int256} @inline function fldmod_by_const(x, y) # For small-to-normal integers, LLVM can correctly optimize away the division, if it @@ -7,7 +20,7 @@ const BitInteger256 = Union{UInt256, Int256} # inlined, so here we have explictly inlined it instead. return (fld(x,y), mod(x,y)) end -@inline function fldmod_by_const(x::BitInteger256, y) +@inline function fldmod_by_const(x::BigBitIntegers, y) # For large or non-standard Int types, LLVM doesn't optimize # well, so we use a custom implementation of fldmod. d = fld_by_const(x, Val(y)) @@ -17,8 +30,8 @@ end # Calculate fld(x,y) when y is a Val constant. # The implementation for fld_by_const was lifted directly from Base.fld(x,y), except that # it uses `div_by_const` instead of `div`. -fld_by_const(x::T, y::Val{C}) where {T<:UInt256, C} = div_by_const(x, y) -function fld_by_const(x::T, y::Val{C}) where {T<:Int256, C} +fld_by_const(x::T, y::Val{C}) where {T<:Unsigned, C} = div_by_const(x, y) +function fld_by_const(x::T, y::Val{C}) where {T<:Signed, C} d = div_by_const(x, y) return d - (signbit(x ⊻ C) & (d * C != x)) end @@ -26,7 +39,7 @@ end # Calculate `mod(x,y)` after you've already acquired quotient, the result of `fld(x,y)`. # REQUIRES: # - `y != -1` -@inline function manual_mod(x::T, y::T, quotient::T) where T<:BitInteger256 +@inline function manual_mod(x::T, y::T, quotient::T) where T<:BigBitIntegers return x - quotient * y end @@ -94,5 +107,8 @@ end # https://github.com/rfourquet/BitIntegers.jl/pull/2 _unsigned(x) = unsigned(x) _unsigned(::Type{Int256}) = UInt256 +_unsigned(::Type{UInt256}) = UInt256 +_unsigned(::Type{Int128}) = UInt128 +_unsigned(::Type{UInt128}) = UInt128 nbits(x) = sizeof(x) * 8 From 3f39b8aded7b150526eb58aabbda8862e2f5cf50 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Thu, 13 Jun 2024 13:46:27 -0600 Subject: [PATCH 12/32] Adjust to run for all integer types! --- src/fldmod-by-const.jl | 59 ++++++++++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index 7c7ff81..3366ccb 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -1,30 +1,55 @@ -# NOTE: We apply this optimization to values of type (U)Int128 and (U)Int256, which covers -# FixedDecimal{(U)Int64} and FixedDecimal{(U)Int128}. -# Julia+LLVM have built-in optimizations that apply this already for FD{(U)Int64}, however -# this customized implementation appears to produce still even faster code than LLVM can -# produce on its own. So we apply this for both sizes. +# NOTE: Surprisingly, even though LLVM implements a version of this optimization on its own +# for smaller integer sizes (<=64-bits), using the code in this file produces faster +# multiplications for *all* types of integers. So we use our custom fldmod_by_const for all +# bit integer types. # Before: +# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234)) +# 84.959 μs (0 allocations: 0 bytes) +# FixedDecimal{Int32,3}(1700943.280) +# # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234)) # 247.709 μs (0 allocations: 0 bytes) # FixedDecimal{Int64,3}(4230510070790917.029) +# +#julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234)) +# 4.077 ms (160798 allocations: 3.22 MiB) +# FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324) +# # After: +# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234)) +# 68.416 μs (0 allocations: 0 bytes) +# FixedDecimal{Int32,3}(1700943.280) +# # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234)) # 106.125 μs (0 allocations: 0 bytes) # FixedDecimal{Int64,3}(4230510070790917.029) +# +# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234)) +# 204.125 μs (0 allocations: 0 bytes) +# FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324) -const BigBitIntegers = Union{UInt128, Int128, UInt256, Int256} +""" + ShouldUseCustomFldmodByConst(::Type{<:MyCustomIntType})) = true +A trait to control opt-in for the custom `fldmod_by_const` implementation. To use this for a +given integer type, you can define this overload for your integer type. +You will also need to implement some parts of the interface below, including _widen(). +""" +ShouldUseCustomFldmodByConst(::Type{<:Base.BitInteger}) = true +ShouldUseCustomFldmodByConst(::Type{<:Union{Int256,Int128}}) = true +ShouldUseCustomFldmodByConst(::Type) = false @inline function fldmod_by_const(x, y) - # For small-to-normal integers, LLVM can correctly optimize away the division, if it - # knows it's dividing by a const. We cannot call `Base.fldmod` since it's not - # inlined, so here we have explictly inlined it instead. - return (fld(x,y), mod(x,y)) -end -@inline function fldmod_by_const(x::BigBitIntegers, y) - # For large or non-standard Int types, LLVM doesn't optimize - # well, so we use a custom implementation of fldmod. - d = fld_by_const(x, Val(y)) - return d, manual_mod(promote(x, y, d)...) + if ShouldUseCustomFldmodByConst(typeof(x)) + # For large Int types, LLVM doesn't optimize well, so we use a custom implementation + # of fldmod, which extends that optimization to those larger integer types. + d = fld_by_const(x, Val(y)) + return d, manual_mod(promote(x, y, d)...) + else + # For other integers, LLVM might be able to correctly optimize away the division, if + # it knows it's dividing by a const. We cannot call `Base.fldmod` since it's not + # inlined, so here we have explictly inlined it instead. + return (fld(x,y), mod(x,y)) + end end # Calculate fld(x,y) when y is a Val constant. @@ -39,7 +64,7 @@ end # Calculate `mod(x,y)` after you've already acquired quotient, the result of `fld(x,y)`. # REQUIRES: # - `y != -1` -@inline function manual_mod(x::T, y::T, quotient::T) where T<:BigBitIntegers +@inline function manual_mod(x::T, y::T, quotient::T) where T<:Integer return x - quotient * y end From 20c66f2b4946f2062f3a14aa225cabbdced53759 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Thu, 13 Jun 2024 13:55:47 -0600 Subject: [PATCH 13/32] Clarify the `_unsigned(x)` methods with comments --- src/fldmod-by-const.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index 3366ccb..cc15554 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -35,7 +35,7 @@ given integer type, you can define this overload for your integer type. You will also need to implement some parts of the interface below, including _widen(). """ ShouldUseCustomFldmodByConst(::Type{<:Base.BitInteger}) = true -ShouldUseCustomFldmodByConst(::Type{<:Union{Int256,Int128}}) = true +ShouldUseCustomFldmodByConst(::Type{<:Union{Int256,UInt256}}) = true ShouldUseCustomFldmodByConst(::Type) = false @inline function fldmod_by_const(x, y) @@ -130,10 +130,12 @@ end # Annoyingly, Unsigned(T) isn't defined for BitIntegers types: # https://github.com/rfourquet/BitIntegers.jl/pull/2 +# Note: We do not need this for Int512, since we only widen to 512 _after_ calling +# _unsigned, above. This code is only for supporting the built-in integer types, which only +# go up to 128-bits (widened twice to 512). If a user wants to extend FixedDecimals for +# other integer types, they will need to add methods to either _unsigned or unsigned. _unsigned(x) = unsigned(x) _unsigned(::Type{Int256}) = UInt256 _unsigned(::Type{UInt256}) = UInt256 -_unsigned(::Type{Int128}) = UInt128 -_unsigned(::Type{UInt128}) = UInt128 nbits(x) = sizeof(x) * 8 From f2958baf17afc877b22f2e5d038dcfaa39c4d784 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Thu, 13 Jun 2024 13:57:27 -0600 Subject: [PATCH 14/32] Apply suggestions from code review Co-authored-by: Curtis Vogt --- src/FixedPointDecimals.jl | 2 +- test/runtests.jl | 8 +------- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl index 0bb3133..d36f9fa 100644 --- a/src/FixedPointDecimals.jl +++ b/src/FixedPointDecimals.jl @@ -36,7 +36,7 @@ export checked_abs, checked_add, checked_cld, checked_div, checked_fld, using Base: decompose, BitInteger -using BitIntegers: BitIntegers, UInt256, Int256 +using BitIntegers: BitIntegers, Int256, Int512, UInt256, UInt512 import Parsers # The effects system in newer versions of Julia is necessary for the fldmod_by_const diff --git a/test/runtests.jl b/test/runtests.jl index 583bed7..b5fadb6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -10,12 +10,6 @@ include(joinpath(pkg_path, "test", "utils.jl")) @testset "FixedPointDecimals" begin include("FixedDecimal.jl") -end - - -if isdefined(Base, Symbol("@assume_effects")) -@testset "FixedPointDecimals" begin - include("fldmod-by-const_tests.jl") -end + isdefined(Base, Symbol("@assume_effects")) && include("fldmod-by-const_tests.jl") end From 0019bb0af65fd626fc27904d6c4ab162456612d0 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Sun, 16 Jun 2024 21:49:28 -0600 Subject: [PATCH 15/32] Update src/FixedPointDecimals.jl Co-authored-by: Curtis Vogt --- src/FixedPointDecimals.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl index d36f9fa..ebe0702 100644 --- a/src/FixedPointDecimals.jl +++ b/src/FixedPointDecimals.jl @@ -144,8 +144,8 @@ _widemul(x::Unsigned,y::Signed) = signed(_widen(x)) * _widen(y) # FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt. _widen(::Type{Int128}) = Int256 _widen(::Type{UInt128}) = UInt256 -_widen(::Type{Int256}) = BitIntegers.Int512 -_widen(::Type{UInt256}) = BitIntegers.UInt512 +_widen(::Type{Int256}) = Int512 +_widen(::Type{UInt256}) = UInt512 _widen(t::Type) = widen(t) _widen(x::T) where {T} = (_widen(T))(x) From 4a53703706b5d4012284af63e3ff75cf7ad8de60 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 19 Jun 2024 11:50:49 -0600 Subject: [PATCH 16/32] Add extensive tests for multiplication correctness, to cover the new change Tests all FD{(U)Int16} values. Tests most corner cases for FD{(U)Int128} values. --- test/fldmod-by-const_tests.jl | 65 +++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl index 6b6a8d1..c4f2bf5 100644 --- a/test/fldmod-by-const_tests.jl +++ b/test/fldmod-by-const_tests.jl @@ -56,3 +56,68 @@ end end end end + +@testset "fixed decimal multiplication - exhaustive 16-bit" begin + @testset for P in (0,1,2,3,4) + @testset for T in (Int16, UInt16) + FD = FixedDecimal{T,P} + + function test_multiplies_correctly(fd, x) + big = FixedDecimal{BigInt, P}(fd) + big_mul = big * x + # This might overflow: ... + mul = fd * x + # ... so we truncate big to the same size + @test big_mul.i % T == mul.i % T + end + @testset for v in typemin(FD) : eps(FD) : typemax(FD) + test_multiplies_correctly(v, typemin(T)) + test_multiplies_correctly(v, -1) + test_multiplies_correctly(v, -eps(FD)) + test_multiplies_correctly(v, 0) + test_multiplies_correctly(v, eps(FD)) + test_multiplies_correctly(v, 1) + test_multiplies_correctly(v, 2) + test_multiplies_correctly(v, 3) + test_multiplies_correctly(v, typemax(T)) + end + end + end +end + +@testset "fixed decimal multiplication - 128-bit" begin + @testset for P in 0:37 + @testset for T in (Int128, UInt128) + FD = FixedDecimal{T,P} + + function test_multiplies_correctly(fd, x) + big = FixedDecimal{BigInt, P}(fd) + big_mul = big * x + # This might overflow: ... + mul = fd * x + # ... so we truncate big to the same size + @test big_mul.i % T == mul.i % T + end + vals = FD[ + typemin(FD), typemax(FD), + typemin(FD) + eps(FD), typemax(FD) - eps(FD), + 0.0, eps(FD), 0.1, 1.0, 2.0, + typemax(FD) ÷ 2, + ] + if T <: Signed + append!(vals, vals.*-1) + end + @testset for v in vals + test_multiplies_correctly(v, typemin(T)) + test_multiplies_correctly(v, -1) + test_multiplies_correctly(v, -eps(FD)) + test_multiplies_correctly(v, 0) + test_multiplies_correctly(v, eps(FD)) + test_multiplies_correctly(v, 1) + test_multiplies_correctly(v, 2) + test_multiplies_correctly(v, 3) + test_multiplies_correctly(v, typemax(T)) + end + end + end +end From 27a3e0f320eb1ae55b55c2ae2570db5ba8d5bead Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 19 Jun 2024 11:59:18 -0600 Subject: [PATCH 17/32] Named testsets to make failures easier to identify --- test/fldmod-by-const_tests.jl | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl index c4f2bf5..dfcee19 100644 --- a/test/fldmod-by-const_tests.jl +++ b/test/fldmod-by-const_tests.jl @@ -67,8 +67,10 @@ end big_mul = big * x # This might overflow: ... mul = fd * x - # ... so we truncate big to the same size - @test big_mul.i % T == mul.i % T + @testset "$fd * $x" begin + # ... so we truncate big to the same size + @test big_mul.i % T == mul.i % T + end end @testset for v in typemin(FD) : eps(FD) : typemax(FD) test_multiplies_correctly(v, typemin(T)) @@ -95,8 +97,10 @@ end big_mul = big * x # This might overflow: ... mul = fd * x - # ... so we truncate big to the same size - @test big_mul.i % T == mul.i % T + @testset "$fd * $x" begin + # ... so we truncate big to the same size + @test big_mul.i % T == mul.i % T + end end vals = FD[ typemin(FD), typemax(FD), From e4cb73b3dca4b6254b0c49cb44aaf1c4c863a4e3 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 19 Jun 2024 13:09:57 -0600 Subject: [PATCH 18/32] Fix off-by-one error in rounding truncation in calculate_inverse_coeff() We must use the precise value of 2^nbits(T) in order to get the correct division in all cases. ....... UGH except now the Int64 tests aren't passing. --- src/fldmod-by-const.jl | 3 ++- test/fldmod-by-const_tests.jl | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index cc15554..36f0885 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -119,7 +119,8 @@ Base.@assume_effects :foldable function calculate_inverse_coeff(::Type{T}, C) wh # Now, truncate to only the upper half of invcoeff, after we've shifted. Instead of # bitshifting, we round to maintain precision. (This is needed to prevent off-by-ones.) # -- This is equivalent to `invcoeff = T(invcoeff >> sizeof(T))`, except rounded. -- - invcoeff = _round_to_nearest(fldmod(invcoeff, typemax(UT))..., typemax(UT)) % T + two_to_N = _widen(typemax(UT)) + UT(1) # Precise value for 2^nbits(T) (doesn't fit in T) + invcoeff = _round_to_nearest(fldmod(invcoeff, two_to_N)..., two_to_N ) % T return invcoeff, toshift end diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl index dfcee19..9f1db82 100644 --- a/test/fldmod-by-const_tests.jl +++ b/test/fldmod-by-const_tests.jl @@ -57,6 +57,30 @@ end end end +@testset "fixed decimal multiplication - exhaustive 8-bit" begin + @testset for P in (0,1) + @testset for T in (Int8, UInt8) + FD = FixedDecimal{T,P} + + function test_multiplies_correctly(fd, x) + big = FixedDecimal{BigInt, P}(fd) + big_mul = big * x + # This might overflow: ... + mul = fd * x + @testset "$fd * $x" begin + # ... so we truncate big to the same size + @test big_mul.i % T == mul.i % T + end + end + @testset for v in typemin(FD) : eps(FD) : typemax(FD) + @testset for v2 in typemin(FD) : eps(FD) : typemax(FD) + test_multiplies_correctly(v, v2) + end + end + end + end +end + @testset "fixed decimal multiplication - exhaustive 16-bit" begin @testset for P in (0,1,2,3,4) @testset for T in (Int16, UInt16) From 73f6547af5191737008ee8ba2eda591f6eab365d Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 19 Jun 2024 14:46:23 -0600 Subject: [PATCH 19/32] Add some comments and requires --- src/fldmod-by-const.jl | 7 +++++++ test/fldmod-by-const_tests.jl | 4 ---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index 36f0885..d96b96c 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -105,6 +105,13 @@ function div_by_const(x::T, ::Val{C}) where {T, C} return T(out) end + + + +# REQUIRES: C must not be 0, 1, -1 +# See this implementation in LLVM: +# https://llvm.org/doxygen/DivisionByConstantInfo_8cpp_source.html +# which was taken from "Hacker's Delight". Base.@assume_effects :foldable function calculate_inverse_coeff(::Type{T}, C) where {T} # First, calculate 2^nbits(T)/C # We shift away leading zeros to preserve the most precision when we use it to multiply diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl index 9f1db82..5e248d2 100644 --- a/test/fldmod-by-const_tests.jl +++ b/test/fldmod-by-const_tests.jl @@ -22,8 +22,6 @@ using FixedPointDecimals # (LLVM's magic number is shifted one bit less, then they shift by 2, instead of 3, # but the result is the same.) @test calculate_inverse_coeff(Int64, 10) == (7378697629483820647 << 1, 3) - - @test calculate_inverse_coeff(Int64, 1) == (1, 0) end @testset "calculate_inverse_coeff signed 4" begin @@ -31,8 +29,6 @@ end # Same here, our magic number is shifted 2 bits more than LLVM's @test calculate_inverse_coeff(UInt64, 100) == (0xa3d70a3d70a3d70b, 6) - - @test calculate_inverse_coeff(UInt64, 1) == (UInt64(0x1), 0) end @testset "div_by_const" begin From 4e9fdd6f9193b7c214f8db01072d773c5c2c30a6 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 19 Jun 2024 15:43:32 -0600 Subject: [PATCH 20/32] Copy/pasted definition for unsigned numbers straight from the book --- src/fldmod-by-const.jl | 41 +++++++++++++++++++++++++++++++++++ test/fldmod-by-const_tests.jl | 2 +- 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index d96b96c..b69054e 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -136,6 +136,47 @@ function mul_hi(x::T, y::T) where T (xy >> nbits(T)) % T end + + +# Unsigned magic number computation + divide by constant +# Hacker's delight figure 10–4 +Base.@assume_effects :foldable function magicgu(nmax, d) + T = typeof(nmax) + nc = (nmax ÷ d) * d - 1 + N = T(floor(log2(nmax) + 1)) + for p in 0:2N + if 2^p > nc * (d - 1 - (2^p - 1) % d) + m = (2^p + d - 1 - (2^p - 1) % d) ÷ d + return (m%T, p) + end + end + error("Can't find p, something is wrong.") +end +function div_by_const(x::T, ::Val{C}) where {T<:Unsigned, C} + # These checks will be compiled away during specialization. + # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these + # checks allow this function to work for any `C > 0`, in case that's useful in the + # future. + if C == 1 + return x + elseif ispow2(C) + return div(x, C) # Will already do the right thing + elseif C <= 0 + throw(DomainError("C must be > 0")) + end + # Calculate the magic number 2^N/C. Note that this is computed statically, not at + # runtime. + inverse_coeff, toshift = magicgu(typemax(T), C) + out = _widemul(x, inverse_coeff) # support Int256 -> Int512 (!!) + out = out >> toshift + return out % T +end + + + + + + # Annoyingly, Unsigned(T) isn't defined for BitIntegers types: # https://github.com/rfourquet/BitIntegers.jl/pull/2 # Note: We do not need this for Int512, since we only widen to 512 _after_ calling diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl index 5e248d2..0adaa35 100644 --- a/test/fldmod-by-const_tests.jl +++ b/test/fldmod-by-const_tests.jl @@ -53,7 +53,7 @@ end end end -@testset "fixed decimal multiplication - exhaustive 8-bit" begin +@testitem "fixed decimal multiplication - exhaustive 8-bit" begin @testset for P in (0,1) @testset for T in (Int8, UInt8) FD = FixedDecimal{T,P} From 188933d1c8c5ab07ffb118304b4beb6f2741ff57 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Wed, 19 Jun 2024 18:55:31 -0600 Subject: [PATCH 21/32] Have `magicgu` support arbitrary integer sizes --- src/fldmod-by-const.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index b69054e..1562a4a 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -144,13 +144,15 @@ Base.@assume_effects :foldable function magicgu(nmax, d) T = typeof(nmax) nc = (nmax ÷ d) * d - 1 N = T(floor(log2(nmax) + 1)) + MathType = _widen(T) + two = MathType(2) for p in 0:2N - if 2^p > nc * (d - 1 - (2^p - 1) % d) - m = (2^p + d - 1 - (2^p - 1) % d) ÷ d + if two^p > nc * (d - 1 - (2^p - 1) % d) + m = (two^p + d - 1 - (2^p - 1) % d) ÷ d return (m%T, p) end end - error("Can't find p, something is wrong.") + error("Can't find p, something is wrong. nmax: $(repr(nmax)), d: $(repr(d)), N: $N") end function div_by_const(x::T, ::Val{C}) where {T<:Unsigned, C} # These checks will be compiled away during specialization. From f6d375c0231747ff9b044b7494d43904c2f786ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Fri, 21 Jun 2024 17:21:52 +0200 Subject: [PATCH 22/32] Use the formulas from Hacker's delight for both signed and unsigned Ints --- src/fldmod-by-const.jl | 142 +++++++++------------------------- test/fldmod-by-const_tests.jl | 32 +------- 2 files changed, 39 insertions(+), 135 deletions(-) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index 1562a4a..6c62ce3 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -68,93 +68,44 @@ end return x - quotient * y end -# This function is based on the native code produced by the following: -# @code_native ((x)->div(x, 100))(Int64(2)) -function div_by_const(x::T, ::Val{C}) where {T, C} - # These checks will be compiled away during specialization. - # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these - # checks allow this function to work for any `C > 0`, in case that's useful in the - # future. - if C == 1 - return x - elseif ispow2(C) - return div(x, C) # Will already do the right thing - elseif C <= 0 - throw(DomainError("C must be > 0")) - end - # Calculate the magic number 2^N/C. Note that this is computed statically, not at - # runtime. - inverse_coeff, toshift = calculate_inverse_coeff(T, C) - # Compute the upper-half of widemul(x, 2^nbits(T)/C). - # By keeping only the upper half, we're essentially dividing by 2^nbits(T), undoing the - # numerator of the multiplication, so that the result is equal to x/C. - out = mul_hi(x, inverse_coeff) - # This condition will be compiled away during specialization. - if T <: Signed - # Because our magic number has a leading one (since we shift all-the-way left), the - # result is negative if it's Signed. We add x to give us the positive equivalent. - out += x - signshift = (nbits(x) - 1) - isnegative = T(out >>> signshift) # 1 if < 0 else 0 (Unsigned bitshift to read top bit) - end - # Undo the bitshifts used to calculate the invoeff magic number with maximum precision. - out = out >> toshift - if T <: Signed - out = out + isnegative +# Unsigned magic number computation + shift by constant +# See Hacker's delight, equations (26) and (27) from Chapter 10-9. +# nmax and divisor must be > 0. +Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor) + T = typeof(nmax) + W = _widen(T) + d = W(divisor) + + nc = div(W(nmax) + W(1), d) * d - W(1) + nbits = sizeof(nmax) * 8 - leading_zeros(nmax) # most significant bit + for p in 0:2nbits + if W(2)^p > nc * (d - W(1) - rem(W(2)^p - W(1), d)) # (27) + m = div(W(2)^p + d - W(1) - rem(W(2)^p - W(1), d), d) # (26) + return (m, p) + end end - return T(out) + return W(0), 0 # Should never reach here end - - - -# REQUIRES: C must not be 0, 1, -1 -# See this implementation in LLVM: -# https://llvm.org/doxygen/DivisionByConstantInfo_8cpp_source.html -# which was taken from "Hacker's Delight". -Base.@assume_effects :foldable function calculate_inverse_coeff(::Type{T}, C) where {T} - # First, calculate 2^nbits(T)/C - # We shift away leading zeros to preserve the most precision when we use it to multiply - # in the next step. At the end, we will shift the final answer back to undo this - # operation (which is why we need to return `toshift`). - # Note, also, that we calculate invcoeff at double-precision so that the left-shift - # doesn't leave trailing zeros. We truncate to only the upper-half before returning. - UT = _unsigned(T) - invcoeff = typemax(_widen(UT)) ÷ C - toshift = leading_zeros(invcoeff) - invcoeff = invcoeff << toshift - # Now, truncate to only the upper half of invcoeff, after we've shifted. Instead of - # bitshifting, we round to maintain precision. (This is needed to prevent off-by-ones.) - # -- This is equivalent to `invcoeff = T(invcoeff >> sizeof(T))`, except rounded. -- - two_to_N = _widen(typemax(UT)) + UT(1) # Precise value for 2^nbits(T) (doesn't fit in T) - invcoeff = _round_to_nearest(fldmod(invcoeff, two_to_N)..., two_to_N ) % T - return invcoeff, toshift -end - -function mul_hi(x::T, y::T) where T - xy = _widemul(x, y) # support Int256 -> Int512 (!!) - (xy >> nbits(T)) % T -end - - - -# Unsigned magic number computation + divide by constant -# Hacker's delight figure 10–4 -Base.@assume_effects :foldable function magicgu(nmax, d) +# See Hacker's delight, equations (5) and (6) from Chapter 10-4. +# nmax and divisor must be > 0. +Base.@assume_effects :foldable function magicg(nmax::Signed, divisor) T = typeof(nmax) - nc = (nmax ÷ d) * d - 1 - N = T(floor(log2(nmax) + 1)) - MathType = _widen(T) - two = MathType(2) - for p in 0:2N - if two^p > nc * (d - 1 - (2^p - 1) % d) - m = (two^p + d - 1 - (2^p - 1) % d) ÷ d - return (m%T, p) + W = _widen(T) + d = W(divisor) + + nc = ((W(nmax) + W(1)) ÷ d) * d - W(1) + nbits = sizeof(nmax) * 8 - leading_zeros(nmax) # most significant bit + for p in 0:2nbits + if W(2)^p > nc * (d - rem(W(2)^p, d)) # (6) + m = div(W(2)^p + d - rem(W(2)^p, d), d) # (5) + return (m, p) end end - error("Can't find p, something is wrong. nmax: $(repr(nmax)), d: $(repr(d)), N: $N") + return W(0), 0 # Should never reach here end -function div_by_const(x::T, ::Val{C}) where {T<:Unsigned, C} + +function div_by_const(x::T, ::Val{C}) where {T, C} # These checks will be compiled away during specialization. # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these # checks allow this function to work for any `C > 0`, in case that's useful in the @@ -162,31 +113,14 @@ function div_by_const(x::T, ::Val{C}) where {T<:Unsigned, C} if C == 1 return x elseif ispow2(C) - return div(x, C) # Will already do the right thing + return div(x, C) # Will already do the right thing elseif C <= 0 throw(DomainError("C must be > 0")) end - # Calculate the magic number 2^N/C. Note that this is computed statically, not at - # runtime. - inverse_coeff, toshift = magicgu(typemax(T), C) - out = _widemul(x, inverse_coeff) # support Int256 -> Int512 (!!) - out = out >> toshift - return out % T -end - - - + # Calculate the magic number and shift amount, based on Hacker's Delight, Chapter 10. + magic_number, shift = magicg(typemax(T), C) - - -# Annoyingly, Unsigned(T) isn't defined for BitIntegers types: -# https://github.com/rfourquet/BitIntegers.jl/pull/2 -# Note: We do not need this for Int512, since we only widen to 512 _after_ calling -# _unsigned, above. This code is only for supporting the built-in integer types, which only -# go up to 128-bits (widened twice to 512). If a user wants to extend FixedDecimals for -# other integer types, they will need to add methods to either _unsigned or unsigned. -_unsigned(x) = unsigned(x) -_unsigned(::Type{Int256}) = UInt256 -_unsigned(::Type{UInt256}) = UInt256 - -nbits(x) = sizeof(x) * 8 + out = _widemul(promote(x, magic_number)...) + out >>= shift + return (out % T) + (x < zero(T)) # TODO: Figure out why and if the + 1 if non-positive is necessary +end diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl index 0adaa35..95dfce9 100644 --- a/test/fldmod-by-const_tests.jl +++ b/test/fldmod-by-const_tests.jl @@ -1,36 +1,6 @@ using Test using FixedPointDecimals -@testset "calculate_inverse_coeff signed" begin - using FixedPointDecimals: calculate_inverse_coeff - - # The correct magic number here comes from investigating the following native code - # produced on an m2 aarch64 macbook pro: - # @code_native ((x)->fld(x,100))(2) - # ... - # mov x8, #55051 - # movk x8, #28835, lsl #16 - # movk x8, #2621, lsl #32 - # movk x8, #41943, lsl #48 - # Where: - # julia> 55051 | 28835 << 16 | 2621 << 32 | 41943 << 48 - # -6640827866535438581 - @test calculate_inverse_coeff(Int64, 100) == (-6640827866535438581, 6) - - # Same for the tests below: - - # (LLVM's magic number is shifted one bit less, then they shift by 2, instead of 3, - # but the result is the same.) - @test calculate_inverse_coeff(Int64, 10) == (7378697629483820647 << 1, 3) -end - -@testset "calculate_inverse_coeff signed 4" begin - using FixedPointDecimals: calculate_inverse_coeff - - # Same here, our magic number is shifted 2 bits more than LLVM's - @test calculate_inverse_coeff(UInt64, 100) == (0xa3d70a3d70a3d70b, 6) -end - @testset "div_by_const" begin vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32] for a_base in vals @@ -53,7 +23,7 @@ end end end -@testitem "fixed decimal multiplication - exhaustive 8-bit" begin +@testset "fixed decimal multiplication - exhaustive 8-bit" begin @testset for P in (0,1) @testset for T in (Int8, UInt8) FD = FixedDecimal{T,P} From b79c8737710df1a021a8749343d3e3af172e68a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Tue, 25 Jun 2024 19:41:00 +0200 Subject: [PATCH 23/32] . --- src/fldmod-by-const.jl | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index 6c62ce3..e141adc 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -70,15 +70,16 @@ end # Unsigned magic number computation + shift by constant # See Hacker's delight, equations (26) and (27) from Chapter 10-9. -# nmax and divisor must be > 0. +# requires nmax > divisor > 1. Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor) T = typeof(nmax) W = _widen(T) d = W(divisor) - nc = div(W(nmax) + W(1), d) * d - W(1) - nbits = sizeof(nmax) * 8 - leading_zeros(nmax) # most significant bit - for p in 0:2nbits + nc = div(W(nmax) + W(1), d) * d - W(1) # largest multiple of d <= nmax, minus 1 + nbits = 8sizeof(nmax) - leading_zeros(nmax) # most significant bit + # shift must be larger than int size because we want the high bits of the wide multiplication + for p in 8sizeof(nmax):2nbits if W(2)^p > nc * (d - W(1) - rem(W(2)^p - W(1), d)) # (27) m = div(W(2)^p + d - W(1) - rem(W(2)^p - W(1), d), d) # (26) return (m, p) @@ -88,15 +89,16 @@ Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor) end # See Hacker's delight, equations (5) and (6) from Chapter 10-4. -# nmax and divisor must be > 0. +# requires nmax > divisor > 1. Base.@assume_effects :foldable function magicg(nmax::Signed, divisor) T = typeof(nmax) W = _widen(T) d = W(divisor) - nc = ((W(nmax) + W(1)) ÷ d) * d - W(1) - nbits = sizeof(nmax) * 8 - leading_zeros(nmax) # most significant bit - for p in 0:2nbits + nc = div(W(nmax) + W(1), d) * d - W(1) # largest multiple of d <= nmax, minus 1 + nbits = 8sizeof(nmax) - leading_zeros(nmax) # most significant bit + # shift must be larger than int size because we want the high bits of the wide multiplication + for p in 8sizeof(nmax):2nbits if W(2)^p > nc * (d - rem(W(2)^p, d)) # (6) m = div(W(2)^p + d - rem(W(2)^p, d), d) # (5) return (m, p) @@ -122,5 +124,6 @@ function div_by_const(x::T, ::Val{C}) where {T, C} out = _widemul(promote(x, magic_number)...) out >>= shift - return (out % T) + (x < zero(T)) # TODO: Figure out why and if the + 1 if non-positive is necessary + # TODO: prove that we need to add 1 for negative numbers! + return (out % T) + (x < zero(T)) end From 4f4d17a75b859bcd5b89f932e1d8faaf2fb91b63 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tom=C3=A1=C5=A1=20Drvo=C5=A1t=C4=9Bp?= Date: Wed, 26 Jun 2024 17:13:13 +0200 Subject: [PATCH 24/32] . --- src/fldmod-by-const.jl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index e141adc..3dc44cf 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -70,7 +70,7 @@ end # Unsigned magic number computation + shift by constant # See Hacker's delight, equations (26) and (27) from Chapter 10-9. -# requires nmax > divisor > 1. +# requires nmax >= divisor > 2. divisor must not be a power of 2. Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor) T = typeof(nmax) W = _widen(T) @@ -79,7 +79,7 @@ Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor) nc = div(W(nmax) + W(1), d) * d - W(1) # largest multiple of d <= nmax, minus 1 nbits = 8sizeof(nmax) - leading_zeros(nmax) # most significant bit # shift must be larger than int size because we want the high bits of the wide multiplication - for p in 8sizeof(nmax):2nbits + for p in nbits-1:2nbits-1 if W(2)^p > nc * (d - W(1) - rem(W(2)^p - W(1), d)) # (27) m = div(W(2)^p + d - W(1) - rem(W(2)^p - W(1), d), d) # (26) return (m, p) @@ -89,7 +89,7 @@ Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor) end # See Hacker's delight, equations (5) and (6) from Chapter 10-4. -# requires nmax > divisor > 1. +# requires nmax >= divisor > 2. divisor must not be a power of 2. Base.@assume_effects :foldable function magicg(nmax::Signed, divisor) T = typeof(nmax) W = _widen(T) @@ -98,7 +98,7 @@ Base.@assume_effects :foldable function magicg(nmax::Signed, divisor) nc = div(W(nmax) + W(1), d) * d - W(1) # largest multiple of d <= nmax, minus 1 nbits = 8sizeof(nmax) - leading_zeros(nmax) # most significant bit # shift must be larger than int size because we want the high bits of the wide multiplication - for p in 8sizeof(nmax):2nbits + for p in nbits-1:2nbits-1 if W(2)^p > nc * (d - rem(W(2)^p, d)) # (6) m = div(W(2)^p + d - rem(W(2)^p, d), d) # (5) return (m, p) @@ -124,6 +124,6 @@ function div_by_const(x::T, ::Val{C}) where {T, C} out = _widemul(promote(x, magic_number)...) out >>= shift - # TODO: prove that we need to add 1 for negative numbers! + # Adding one as implied by formula (1b) in Hacker's delight, Chapter 10-4. return (out % T) + (x < zero(T)) end From eaeaddfa6f8b025c712e82b7c4173bc3e91a4c38 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Tue, 9 Jul 2024 18:51:12 -0600 Subject: [PATCH 25/32] Restrict back to just Int128 & Int256 for custom div_by_const LLVM _does_ do this automatically if you directly call `fldmod`. Improve the comments as well. --- src/fldmod-by-const.jl | 49 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index 3dc44cf..7e3bff8 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -1,31 +1,35 @@ -# NOTE: Surprisingly, even though LLVM implements a version of this optimization on its own -# for smaller integer sizes (<=64-bits), using the code in this file produces faster -# multiplications for *all* types of integers. So we use our custom fldmod_by_const for all -# bit integer types. +# This file includes a manual implementation of the divide-by-constant optimization for +# non-power-of-2 divisors. LLVM automatically includes this optimization, but only for +# smaller integer sizes (<=64-bits on my 64-bit machine). +# +# NOTE: We use LLVM's built-in implementation for Int64 and smaller, to keep the code +# simpler (though the code we produce is identical, verified in tests.) We apply this +# optimization to (U)Int128 and (U)Int256, which result from multiplying FD{Int64}s and +# FD{Int128}s. # Before: # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234)) -# 84.959 μs (0 allocations: 0 bytes) +# 54.125 μs (0 allocations: 0 bytes) # (Unchanged) # FixedDecimal{Int32,3}(1700943.280) # # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234)) -# 247.709 μs (0 allocations: 0 bytes) +# 174.625 μs (0 allocations: 0 bytes) # FixedDecimal{Int64,3}(4230510070790917.029) # -#julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234)) -# 4.077 ms (160798 allocations: 3.22 MiB) -# FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324) +# julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234)) +# 2.119 ms (79986 allocations: 1.60 MiB) +# FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324) # # After: # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234)) -# 68.416 μs (0 allocations: 0 bytes) +# 56.958 μs (0 allocations: 0 bytes) # (Unchanged) # FixedDecimal{Int32,3}(1700943.280) # # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int64,3}(1.234)) -# 106.125 μs (0 allocations: 0 bytes) +# 90.708 μs (0 allocations: 0 bytes) # FixedDecimal{Int64,3}(4230510070790917.029) # # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int128,3}(1.234)) -# 204.125 μs (0 allocations: 0 bytes) +# 180.167 μs (0 allocations: 0 bytes) # FixedDecimal{Int128,3}(-66726338547984585007169386718143307.324) """ @@ -34,8 +38,8 @@ A trait to control opt-in for the custom `fldmod_by_const` implementation. To us given integer type, you can define this overload for your integer type. You will also need to implement some parts of the interface below, including _widen(). """ -ShouldUseCustomFldmodByConst(::Type{<:Base.BitInteger}) = true -ShouldUseCustomFldmodByConst(::Type{<:Union{Int256,UInt256}}) = true +ShouldUseCustomFldmodByConst(::Type{<:Union{Int128,UInt128}}) = true # For FD{Int64} +ShouldUseCustomFldmodByConst(::Type{<:Union{Int256,UInt256}}) = true # For FD{Int128} ShouldUseCustomFldmodByConst(::Type) = false @inline function fldmod_by_const(x, y) @@ -43,12 +47,12 @@ ShouldUseCustomFldmodByConst(::Type) = false # For large Int types, LLVM doesn't optimize well, so we use a custom implementation # of fldmod, which extends that optimization to those larger integer types. d = fld_by_const(x, Val(y)) - return d, manual_mod(promote(x, y, d)...) + return d, (x - d * y) else # For other integers, LLVM might be able to correctly optimize away the division, if - # it knows it's dividing by a const. We cannot call `Base.fldmod` since it's not - # inlined, so here we have explictly inlined it instead. - return (fld(x,y), mod(x,y)) + # it knows it's dividing by a const. + # Since julia 1.8+, fldmod(x,y) automatically optimizes for constant divisors. + return fldmod(x, y) end end @@ -61,15 +65,9 @@ function fld_by_const(x::T, y::Val{C}) where {T<:Signed, C} return d - (signbit(x ⊻ C) & (d * C != x)) end -# Calculate `mod(x,y)` after you've already acquired quotient, the result of `fld(x,y)`. -# REQUIRES: -# - `y != -1` -@inline function manual_mod(x::T, y::T, quotient::T) where T<:Integer - return x - quotient * y -end - # Unsigned magic number computation + shift by constant # See Hacker's delight, equations (26) and (27) from Chapter 10-9. +# (See also the errata on https://web.archive.org/web/20190915025154/http://www.hackersdelight.org/) # requires nmax >= divisor > 2. divisor must not be a power of 2. Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor) T = typeof(nmax) @@ -89,6 +87,7 @@ Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor) end # See Hacker's delight, equations (5) and (6) from Chapter 10-4. +# (See also the errata on https://web.archive.org/web/20190915025154/http://www.hackersdelight.org/) # requires nmax >= divisor > 2. divisor must not be a power of 2. Base.@assume_effects :foldable function magicg(nmax::Signed, divisor) T = typeof(nmax) From a2dcf56e6d471dbfa02f4883b7829c8430bfde5c Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Tue, 9 Jul 2024 18:52:15 -0600 Subject: [PATCH 26/32] It turns out that in newer versions of julia, you should call fldmod, not fld(x,y),mod(x,y), even if y is a constant! :) Improved that for julia versions 1.8 - 1.9 --- src/FixedPointDecimals.jl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl index ebe0702..38e45bb 100644 --- a/src/FixedPointDecimals.jl +++ b/src/FixedPointDecimals.jl @@ -49,7 +49,13 @@ else # We force-inline this, to allow the fldmod operation to be specialized for a constant # second argument (converting divide-by-power-of-ten instructions with the cheaper # multiply-by-inverse-coefficient.) Sadly this doesn't work for Int128. - @inline fldmod_by_const(x,y) = (fld(x,y), mod(x,y)) + if VERSION >= v"1.8.0-" + # Since julia 1.8+, the built-in fldmod optimizes for constant divisors, which is + # even better than computing the division twice. + @inline fldmod_by_const(x,y) = fldmod(x,y) + else + @inline fldmod_by_const(x,y) = (fld(x,y), mod(x,y)) + end end # floats that support fma and are roughly IEEE-like From ef578e91f47ac3319b152350bf69a289a4b22823 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Tue, 9 Jul 2024 18:56:46 -0600 Subject: [PATCH 27/32] Reorganize the functions to be top-down --- src/fldmod-by-const.jl | 47 +++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index 7e3bff8..cf200e8 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -3,9 +3,8 @@ # smaller integer sizes (<=64-bits on my 64-bit machine). # # NOTE: We use LLVM's built-in implementation for Int64 and smaller, to keep the code -# simpler (though the code we produce is identical, verified in tests.) We apply this -# optimization to (U)Int128 and (U)Int256, which result from multiplying FD{Int64}s and -# FD{Int128}s. +# simpler (though the code we produce is identical). We apply this optimization to (U)Int128 +# and (U)Int256, which result from multiplying FD{Int64}s and FD{Int128}s. # Before: # julia> @btime for _ in 1:10000 fd = fd * fd end setup = (fd = FixedDecimal{Int32,3}(1.234)) # 54.125 μs (0 allocations: 0 bytes) # (Unchanged) @@ -65,6 +64,27 @@ function fld_by_const(x::T, y::Val{C}) where {T<:Signed, C} return d - (signbit(x ⊻ C) & (d * C != x)) end +function div_by_const(x::T, ::Val{C}) where {T, C} + # These checks will be compiled away during specialization. + # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these + # checks allow this function to work for any `C > 0`, in case that's useful in the + # future. + if C == 1 + return x + elseif ispow2(C) + return div(x, C) # Will already do the right thing + elseif C <= 0 + throw(DomainError("C must be > 0")) + end + # Calculate the magic number and shift amount, based on Hacker's Delight, Chapter 10. + magic_number, shift = magicg(typemax(T), C) + + out = _widemul(promote(x, magic_number)...) + out >>= shift + # Adding one as implied by formula (1b) in Hacker's delight, Chapter 10-4. + return (out % T) + (x < zero(T)) +end + # Unsigned magic number computation + shift by constant # See Hacker's delight, equations (26) and (27) from Chapter 10-9. # (See also the errata on https://web.archive.org/web/20190915025154/http://www.hackersdelight.org/) @@ -105,24 +125,3 @@ Base.@assume_effects :foldable function magicg(nmax::Signed, divisor) end return W(0), 0 # Should never reach here end - -function div_by_const(x::T, ::Val{C}) where {T, C} - # These checks will be compiled away during specialization. - # While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these - # checks allow this function to work for any `C > 0`, in case that's useful in the - # future. - if C == 1 - return x - elseif ispow2(C) - return div(x, C) # Will already do the right thing - elseif C <= 0 - throw(DomainError("C must be > 0")) - end - # Calculate the magic number and shift amount, based on Hacker's Delight, Chapter 10. - magic_number, shift = magicg(typemax(T), C) - - out = _widemul(promote(x, magic_number)...) - out >>= shift - # Adding one as implied by formula (1b) in Hacker's delight, Chapter 10-4. - return (out % T) + (x < zero(T)) -end From 8df68b5fe1fced4f8aff35c52d8f56cc216fcfe7 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Tue, 9 Jul 2024 19:23:22 -0600 Subject: [PATCH 28/32] More thorough tests for flmdod_by_const --- test/fldmod-by-const_tests.jl | 37 +++++++++++++++-------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl index 95dfce9..2a63031 100644 --- a/test/fldmod-by-const_tests.jl +++ b/test/fldmod-by-const_tests.jl @@ -23,33 +23,25 @@ end end end -@testset "fixed decimal multiplication - exhaustive 8-bit" begin - @testset for P in (0,1) - @testset for T in (Int8, UInt8) - FD = FixedDecimal{T,P} - - function test_multiplies_correctly(fd, x) - big = FixedDecimal{BigInt, P}(fd) - big_mul = big * x - # This might overflow: ... - mul = fd * x - @testset "$fd * $x" begin - # ... so we truncate big to the same size - @test big_mul.i % T == mul.i % T - end - end - @testset for v in typemin(FD) : eps(FD) : typemax(FD) - @testset for v2 in typemin(FD) : eps(FD) : typemax(FD) - test_multiplies_correctly(v, v2) +@testset "flmdod_by_const - exhaustive 8-bit" begin + @testset for T in (Int8, UInt8) + @testset for x in typemin(T) : typemax(T) + @testset for y in typemin(T) : typemax(T) + y == 0 && continue + y == -1 && continue + @testset "fldmod($x, $y)" begin + @test fldmod(x, y) == FixedPointDecimals.fldmod_by_const(x, y) end end end end end -@testset "fixed decimal multiplication - exhaustive 16-bit" begin +# 64-bit FD multiplication uses the custom fldmod_by_const implementation. +# Test a few various different cases to try to ensure it works correctly. +@testset "fixed decimal multiplication - 64-bit" begin @testset for P in (0,1,2,3,4) - @testset for T in (Int16, UInt16) + @testset for T in (Int64, UInt64) FD = FixedDecimal{T,P} function test_multiplies_correctly(fd, x) @@ -62,7 +54,10 @@ end @test big_mul.i % T == mul.i % T end end - @testset for v in typemin(FD) : eps(FD) : typemax(FD) + num_tests = 2<<11 + # Add one to avoid powers-of-2 to get hopefully better tests + epsilon = (typemax(FD) ÷ num_tests) + eps(FD) + @testset for v in typemin(FD) : epsilon : typemax(FD) test_multiplies_correctly(v, typemin(T)) test_multiplies_correctly(v, -1) test_multiplies_correctly(v, -eps(FD)) From fd096ace8f4c28eede455c91ca799b2ff88e9439 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Mon, 12 Aug 2024 11:52:54 -0600 Subject: [PATCH 29/32] Bump patch version number --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index f120b6c..9bef680 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "FixedPointDecimals" uuid = "fb4d412d-6eee-574d-9565-ede6634db7b0" authors = ["Fengyang Wang ", "Curtis Vogt "] -version = "0.5.3" +version = "0.5.4" [deps] BitIntegers = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1" From e147f5cd3d890e0125e235591a796a94366c4669 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Tue, 13 Aug 2024 10:37:06 -0600 Subject: [PATCH 30/32] Add _widemul unit test --- test/FixedDecimal.jl | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/test/FixedDecimal.jl b/test/FixedDecimal.jl index 18d87d7..41fcda8 100644 --- a/test/FixedDecimal.jl +++ b/test/FixedDecimal.jl @@ -1408,3 +1408,25 @@ end @test hash(FD2(1//10)) ≠ hash(0.1) end +@testset "_widemul" begin + _widemul = FixedPointDecimals._widemul + Int256, UInt256 = FixedPointDecimals.Int256, FixedPointDecimals.UInt256 + Int512, UInt512 = FixedPointDecimals.Int512, FixedPointDecimals.UInt512 + + @test _widemul(UInt8(3), UInt8(2)) === widemul(UInt8(3), UInt8(2)) === UInt16(6) + @test _widemul(Int8(3), UInt8(2)) === widemul(Int8(3), UInt8(2)) === Int16(6) + @test _widemul(UInt8(3), Int8(2)) === widemul(UInt8(3), Int8(2)) === Int16(6) + + @test _widemul(UInt16(3), UInt8(2)) === widemul(UInt16(3), UInt8(2)) === UInt32(6) + @test _widemul(UInt16(3), Int8(2)) === widemul(UInt16(3), Int8(2)) === Int32(6) + @test _widemul(Int16(3), UInt8(2)) === widemul(Int16(3), UInt8(2)) === Int32(6) + + # Custom widenings + @test _widemul(UInt128(3), UInt128(2)) === UInt256(6) + @test _widemul(Int128(3), UInt128(2)) === Int256(6) + @test _widemul(UInt128(3), Int128(2)) === Int256(6) + + @test _widemul(UInt256(3), UInt256(2)) === UInt512(6) + @test _widemul(Int256(3), UInt256(2)) === Int512(6) + @test _widemul(UInt256(3), Int256(2)) === Int512(6) +end From f830c3c98fca678c9d21c91ccca5aa96fea9c616 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Tue, 13 Aug 2024 10:40:49 -0600 Subject: [PATCH 31/32] Change "unreachable" comment to an `@assert false` --- src/fldmod-by-const.jl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/fldmod-by-const.jl b/src/fldmod-by-const.jl index cf200e8..d36b4da 100644 --- a/src/fldmod-by-const.jl +++ b/src/fldmod-by-const.jl @@ -103,7 +103,9 @@ Base.@assume_effects :foldable function magicg(nmax::Unsigned, divisor) return (m, p) end end - return W(0), 0 # Should never reach here + @assert false """magicg bug: Unreachable reached. divisor=$divisor, nmax=$nmax. + Please report an issue to https://github.com/JuliaMath/FixedPointDecimals.jl + """ end # See Hacker's delight, equations (5) and (6) from Chapter 10-4. @@ -123,5 +125,7 @@ Base.@assume_effects :foldable function magicg(nmax::Signed, divisor) return (m, p) end end - return W(0), 0 # Should never reach here + @assert false """magicg bug: Unreachable reached. divisor=$divisor, nmax=$nmax. + Please report an issue to https://github.com/JuliaMath/FixedPointDecimals.jl + """ end From 71ba82e3b80f3ac53833627424abac27d2375359 Mon Sep 17 00:00:00 2001 From: Nathan Daly Date: Mon, 9 Sep 2024 18:08:11 -0600 Subject: [PATCH 32/32] add test comment --- test/fldmod-by-const_tests.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test/fldmod-by-const_tests.jl b/test/fldmod-by-const_tests.jl index 2a63031..42b04d0 100644 --- a/test/fldmod-by-const_tests.jl +++ b/test/fldmod-by-const_tests.jl @@ -23,6 +23,9 @@ end end end +# We don't actually use fldmod_by_const with 8-bit ints, but they're useful because +# we can exhaustively test every possible combination, to increase our confidence in +# the implementation. @testset "flmdod_by_const - exhaustive 8-bit" begin @testset for T in (Int8, UInt8) @testset for x in typemin(T) : typemax(T)