Skip to content

Commit 0b06662

Browse files
committed
Finish removing the BigInts from * for FD{Int128}!
Finally implements the fast-multiplication optimization from #45, but this time for 128-bit FixedDecimals! :) This is a follow-up to #93, which introduces an Int256 type for widemul. However, the fldmod still required 2 BigInt allocations. Now, this PR uses a custom implementation of the LLVM div-by-const optimization for (U)Int256, which briefly widens to Int512 (😅) to perform the fldmod by the constant 10^f coefficient. This brings 128-bit FD multiply to the same performance as 64-bit. :)
1 parent a245651 commit 0b06662

File tree

4 files changed

+171
-14
lines changed

4 files changed

+171
-14
lines changed

src/FixedPointDecimals.jl

Lines changed: 11 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,11 @@ export checked_abs, checked_add, checked_cld, checked_div, checked_fld,
3636

3737
using Base: decompose, BitInteger
3838

39-
import BitIntegers # For 128-bit _widemul / _widen
39+
using BitIntegers: BitIntegers, UInt256, Int256
4040
import Parsers
4141

42+
include("fldmod-by-const.jl")
43+
4244
# floats that support fma and are roughly IEEE-like
4345
const FMAFloat = Union{Float16, Float32, Float64, BigFloat}
4446

@@ -129,8 +131,10 @@ _widemul(x::Unsigned,y::Signed) = signed(_widen(x)) * _widen(y)
129131

130132
# Custom widen implementation to avoid the cost of widening to BigInt.
131133
# FD{Int128} operations should widen to 256 bits internally, rather than to a BigInt.
132-
_widen(::Type{Int128}) = BitIntegers.Int256
133-
_widen(::Type{UInt128}) = BitIntegers.UInt256
134+
_widen(::Type{Int128}) = Int256
135+
_widen(::Type{UInt128}) = UInt256
136+
_widen(::Type{Int256}) = BitIntegers.Int512
137+
_widen(::Type{UInt256}) = BitIntegers.UInt512
134138
_widen(t::Type) = widen(t)
135139
_widen(x::T) where {T} = (_widen(T))(x)
136140

@@ -196,18 +200,12 @@ function _round_to_nearest(quotient::T,
196200
end
197201
_round_to_nearest(q, r, d, m=RoundNearest) = _round_to_nearest(promote(q, r, d)..., m)
198202

199-
# In many of our calls to fldmod, `y` is a constant (the coefficient, 10^f). However, since
200-
# `fldmod` is sometimes not being inlined, that constant information is not available to the
201-
# optimizer. We need an inlined version of fldmod so that the compiler can replace expensive
202-
# divide-by-power-of-ten instructions with the cheaper multiply-by-inverse-coefficient.
203-
@inline fldmodinline(x,y) = (fld(x,y), mod(x,y))
204-
205203
# multiplication rounds to nearest even representation
206204
# TODO: can we use floating point to speed this up? after we build a
207205
# correctness test suite.
208206
function Base.:*(x::FD{T, f}, y::FD{T, f}) where {T, f}
209207
powt = coefficient(FD{T, f})
210-
quotient, remainder = fldmodinline(_widemul(x.i, y.i), powt)
208+
quotient, remainder = fldmod_by_const(_widemul(x.i, y.i), Val(powt))
211209
reinterpret(FD{T, f}, _round_to_nearest(quotient, remainder, powt))
212210
end
213211

@@ -234,12 +232,12 @@ function Base.round(x::FD{T, f},
234232
RoundingMode{:NearestTiesUp},
235233
RoundingMode{:NearestTiesAway}}=RoundNearest) where {T, f}
236234
powt = coefficient(FD{T, f})
237-
quotient, remainder = fldmodinline(x.i, powt)
235+
quotient, remainder = fldmod_by_const(x.i, Val(powt))
238236
FD{T, f}(_round_to_nearest(quotient, remainder, powt, m))
239237
end
240238
function Base.ceil(x::FD{T, f}) where {T, f}
241239
powt = coefficient(FD{T, f})
242-
quotient, remainder = fldmodinline(x.i, powt)
240+
quotient, remainder = fldmod_by_const(x.i, Val(powt))
243241
if remainder > 0
244242
FD{T, f}(quotient + one(quotient))
245243
else
@@ -435,7 +433,7 @@ function Base.checked_sub(x::T, y::T) where {T<:FD}
435433
end
436434
function Base.checked_mul(x::FD{T,f}, y::FD{T,f}) where {T<:Integer,f}
437435
powt = coefficient(FD{T, f})
438-
quotient, remainder = fldmodinline(_widemul(x.i, y.i), powt)
436+
quotient, remainder = fldmod_by_const(_widemul(x.i, y.i), Val(powt))
439437
v = _round_to_nearest(quotient, remainder, powt)
440438
typemin(T) <= v <= typemax(T) || Base.Checked.throw_overflowerr_binaryop(:*, x, y)
441439
return reinterpret(FD{T, f}, T(v))

src/fldmod-by-const.jl

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
2+
const BitInteger256 = Union{UInt256, Int256}
3+
4+
@inline function fldmod_by_const(x, ::Val{y}) where y
5+
# For small-to-normal integers, LLVM can correctly optimize away the division, if it
6+
# knows it's dividing by a const. We cannot call `Base.fldmod` since it's not
7+
# inlined, so here we have explictly inlined it instead.
8+
return (fld(x,y), mod(x,y))
9+
end
10+
@inline function fldmod_by_const(x::BitInteger256, y::Val{C}) where C
11+
# For large or non-standard Int types, LLVM doesn't optimize
12+
# well, so we use a custom implementation of fldmod.
13+
d = fld_by_const(x, y)
14+
return d, manual_mod(promote(x, C, d)...)
15+
end
16+
17+
# Calculate fld(x,y) when y is a Val constant.
18+
# The implementation for fld_by_const was lifted directly from Base.fld(x,y), except that
19+
# it uses `div_by_const` instead of `div`.
20+
fld_by_const(x::T, y::Val{C}) where {T<:UInt256, C} = div_by_const(x, y)
21+
function fld_by_const(x::T, y::Val{C}) where {T<:Int256, C}
22+
d = div_by_const(x, y)
23+
return d - (signbit(x C) & (d * C != x))
24+
end
25+
26+
# Calculate `mod(x,y)` after you've already acquired quotient, the result of `fld(x,y)`.
27+
# REQUIRES:
28+
# - `y != -1`
29+
@inline function manual_mod(x::T, y::T, quotient::T) where T<:BitInteger256
30+
return x - quotient * y
31+
end
32+
33+
Base.@assume_effects :total function div_by_const(x::T, ::Val{C}) where {T, C}
34+
# These checks will be compiled away during specialization.
35+
# While for `*(FixedDecimal, FixedDecimal)`, C will always be a power of 10, these
36+
# checks allow this function to work for any `C > 0`, in case that's useful in the
37+
# future.
38+
if C == 1
39+
return x
40+
elseif ispow2(C)
41+
return div(x, C) # Will already do the right thing
42+
elseif C <= 0
43+
throw(DomainError("C must be > 0"))
44+
end
45+
# Calculate the magic number 2^N/C. Note that this is computed statically, not at
46+
# runtime.
47+
inverse_coeff, toshift = calculate_inverse_coeff(T, C)
48+
# Compute the upper-half of widemul(x, 2^nbits(T)/C).
49+
# By keeping only the upper half, we're essentially dividing by 2^nbits(T), undoing the
50+
# numerator of the multiplication, so that the result is equal to x/C.
51+
out = mul_hi(x, inverse_coeff)
52+
# This condition will be compiled away during specialization.
53+
if T <: Signed
54+
# Because our magic number has a leading one (since we shift all-the-way left), the
55+
# result is negative if it's Signed. We add x to give us the positive equivalent.
56+
out += x
57+
signshift = (nbits(x) - 1)
58+
isnegative = T(out >>> signshift) # 1 if < 0 else 0 (Unsigned bitshift to read top bit)
59+
end
60+
# Undo the bitshifts used to calculate the invoeff magic number with maximum precision.
61+
out = out >> toshift
62+
if T <: Signed
63+
out = out + isnegative
64+
end
65+
return T(out)
66+
end
67+
68+
Base.@assume_effects :foldable function calculate_inverse_coeff(::Type{T}, C) where {T}
69+
# First, calculate 2^nbits(T)/C
70+
# We shift away leading zeros to preserve the most precision when we use it to multiply
71+
# in the next step. At the end, we will shift the final answer back to undo this
72+
# operation (which is why we need to return `toshift`).
73+
# Note, also, that we calculate invcoeff at double-precision so that the left-shift
74+
# doesn't leave trailing zeros. We truncate to only the upper-half before returning.
75+
UT = _unsigned(T)
76+
invcoeff = typemax(_widen(UT)) ÷ C
77+
toshift = leading_zeros(invcoeff)
78+
invcoeff = invcoeff << toshift
79+
# Now, truncate to only the upper half of invcoeff, after we've shifted. Instead of
80+
# bitshifting, we round to maintain precision. (This is needed to prevent off-by-ones.)
81+
# -- This is equivalent to `invcoeff = T(invcoeff >> sizeof(T))`, except rounded. --
82+
invcoeff = _round_to_nearest(fldmod(invcoeff, typemax(UT))..., typemax(UT)) % T
83+
return invcoeff, toshift
84+
end
85+
86+
function mul_hi(x::T, y::T) where T
87+
xy = _widemul(x, y) # support Int256 -> Int512 (!!)
88+
(xy >> nbits(T)) % T
89+
end
90+
91+
# Annoyingly, Unsigned(T) isn't defined for BitIntegers types:
92+
# https://github.com/rfourquet/BitIntegers.jl/pull/2
93+
_unsigned(x) = unsigned(x)
94+
_unsigned(::Type{Int256}) = UInt256
95+
96+
nbits(x) = sizeof(x) * 8

test/fldmod-by-const_tests.jl

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
using Test
2+
using FixedPointDecimals
3+
4+
@testset "calculate_inverse_coeff signed" begin
5+
using FixedPointDecimals: calculate_inverse_coeff
6+
7+
# The correct magic number here comes from investigating the following native code
8+
# produced on an m2 aarch64 macbook pro:
9+
# @code_native ((x)->fld(x,100))(2)
10+
# ...
11+
# mov x8, #55051
12+
# movk x8, #28835, lsl #16
13+
# movk x8, #2621, lsl #32
14+
# movk x8, #41943, lsl #48
15+
# Where:
16+
# julia> 55051 | 28835 << 16 | 2621 << 32 | 41943 << 48
17+
# -6640827866535438581
18+
@test calculate_inverse_coeff(Int64, 100) == (-6640827866535438581, 6)
19+
20+
# Same for the tests below:
21+
22+
# (LLVM's magic number is shifted one bit less, then they shift by 2, instead of 3,
23+
# but the result is the same.)
24+
@test calculate_inverse_coeff(Int64, 10) == (7378697629483820647 << 1, 3)
25+
26+
@test calculate_inverse_coeff(Int64, 1) == (1, 0)
27+
end
28+
29+
@testset "calculate_inverse_coeff signed 4" begin
30+
using FixedPointDecimals: calculate_inverse_coeff
31+
32+
# Same here, our magic number is shifted 2 bits more than LLVM's
33+
@test calculate_inverse_coeff(UInt64, 100) == (0xa3d70a3d70a3d70b, 6)
34+
35+
@test calculate_inverse_coeff(UInt64, 1) == (UInt64(0x1), 0)
36+
end
37+
38+
@testset "div_by_const" begin
39+
vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32]
40+
for a_base in vals
41+
# Only test negative numbers on `a`, since div_by_const requires b > 0.
42+
@testset for (a, b, f) in Iterators.product((a_base, -a_base), vals, (unsigned, signed))
43+
a, b = promote(f(a), f(b))
44+
@test FixedPointDecimals.div_by_const(a, Val(b)) == a ÷ b
45+
end
46+
end
47+
end
48+
49+
@testset "fldmod_by_const" begin
50+
vals = [2432, 100, 0x1, Int32(10000), typemax(Int64), typemax(Int16), 8, Int64(2)^32]
51+
for a_base in vals
52+
# Only test negative numbers on `a`, since fldmod_by_const requires b > 0.
53+
@testset for (a, b, f) in Iterators.product((a_base, -a_base), vals, (unsigned, signed))
54+
a, b = promote(f(a), f(b))
55+
@test FixedPointDecimals.fldmod_by_const(a, Val(b)) == fldmod(a, b)
56+
end
57+
end
58+
end

test/runtests.jl

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,4 +10,9 @@ include(joinpath(pkg_path, "test", "utils.jl"))
1010

1111
@testset "FixedPointDecimals" begin
1212
include("FixedDecimal.jl")
13-
end # global testset
13+
end
14+
15+
@testset "FixedPointDecimals" begin
16+
include("fldmod-by-const_tests.jl")
17+
end
18+

0 commit comments

Comments
 (0)