Merge pull request #80 from Drvi/td-new-parser

NHDaly · web-flow · commit 75c722674bce · 2023-06-14T16:21:51.000-06:00
Use a `Parsers.jl`-based parser implementation
diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
@@ -13,7 +13,7 @@ jobs:
       fail-fast: false
       matrix:
         version:
-          - '1.0'
+          - '1.6'
           - '1'
 #          - 'nightly'
         os:
diff --git a/Project.toml b/Project.toml
@@ -1,9 +1,13 @@
 name = "FixedPointDecimals"
 uuid = "fb4d412d-6eee-574d-9565-ede6634db7b0"
 authors = ["Fengyang Wang <fengyang.wang.0@gmail.com>", "Curtis Vogt <curtis.vogt@gmail.com>"]
-version = "0.4.2"
+version = "0.4.3"
+
+[deps]
+Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
 
 [compat]
+Parsers = "2.7"
 julia = "1.6"
 
 [extras]
diff --git a/src/FixedPointDecimals.jl b/src/FixedPointDecimals.jl
@@ -28,6 +28,7 @@ module FixedPointDecimals
 export FixedDecimal, RoundThrows
 
 using Base: decompose, BitInteger
+import Parsers
 
 # floats that support fma and are roughly IEEE-like
 const FMAFloat = Union{Float16, Float32, Float64, BigFloat}
@@ -100,6 +101,16 @@ end
 
 const FD = FixedDecimal
 
+include("parse.jl")
+
+function __init__()
+    # One slot per thread id; prefer `Threads.maxthreadid` when this Julia version defines it.
+    nslots = if isdefined(Base.Threads, :maxthreadid)
+        Threads.maxthreadid()
+    else
+        Threads.nthreads()
+    end
+    # Buffers used in parsing when dealing with BigInts, see _divpow10! in parse.jl
+    for buffer in (_BIGINT_10s, _BIGINT_Rs)
+        resize!(empty!(buffer), nslots)
+    end
+    return
+end
+
 (::Type{T})(x::Real) where {T <: FD} = convert(T, x)
 
 floattype(::Type{<:FD{T}}) where {T<:Union{Int8, UInt8, Int16, UInt16}} = Float32
@@ -413,78 +424,6 @@ function Base.show(io::IO, x::FD{T, f}) where {T, f}
     end
 end
 
-# parsing
-
-"""
-    RoundThrows
-
-Raises an `InexactError` if any rounding is necessary.
-"""
-const RoundThrows = RoundingMode{:Throw}()
-
-function Base.parse(::Type{FD{T, f}}, str::AbstractString, mode::RoundingMode=RoundNearest) where {T, f}
-    if !(mode in (RoundThrows, RoundNearest, RoundToZero))
-        throw(ArgumentError("Unhandled rounding mode $mode"))
-    end
-
-    # Parse exponent information
-    exp_index = something(findfirst(==('e'), str), 0)
-    if exp_index > 0
-        exp = parse(Int, str[(exp_index + 1):end])
-        sig_end = exp_index - 1
-    else
-        exp = 0
-        sig_end = lastindex(str)
-    end
-
-    # Remove the decimal place from the string
-    sign = T(first(str) == '-' ? -1 : 1)
-    dec_index = something(findfirst(==('.'), str), 0)
-    sig_start = sign < 0 ? 2 : 1
-    if dec_index > 0
-        int_str = str[sig_start:(dec_index - 1)] * str[(dec_index + 1):sig_end]
-        exp -= sig_end - dec_index
-    else
-        int_str = str[sig_start:sig_end]
-    end
-
-    # Split the integer string into the value we can represent inside the FixedDecimal and
-    # the remaining digits we'll use during rounding
-    int_end = lastindex(int_str)
-    pivot = int_end + exp - (-f)
-
-    a = rpad(int_str[1:min(pivot, int_end)], pivot, '0')
-    b = lpad(int_str[max(pivot, 1):int_end], int_end - pivot + 1, '0')
-
-    # Parse the strings
-    val = isempty(a) ? T(0) : sign * parse(T, a)
-    if !isempty(b) && any(!isequal('0'), b[2:end])
-        if mode == RoundThrows
-            throw(InexactError(:parse, FD{T, f}, str))
-        elseif mode == RoundNearest
-            val += sign * parse_round(T, b, mode)
-        end
-    end
-
-    reinterpret(FD{T, f}, val)
-end
-
-function parse_round(::Type{T}, fractional::AbstractString, ::RoundingMode{:Nearest}) where T
-    # Note: parsing each digit individually ensures we don't run into an OverflowError
-    digits = Int8[parse(Int8, d) for d in fractional]
-    for i in length(digits):-1:2
-        if digits[i] > 5 || digits[i] == 5 && isodd(digits[i - 1])
-            if i - 1 == 1
-                return T(1)
-            else
-                digits[i - 1] += 1
-            end
-        end
-    end
-    return T(0)
-end
-
-
 """
     max_exp10(T)
 
diff --git a/src/parse.jl b/src/parse.jl
@@ -0,0 +1,239 @@
+using Parsers
+using Parsers: AbstractConf, Result
+
+"""
+    RoundThrows
+
+Raises an `InexactError` if any rounding is necessary.
+"""
+const RoundThrows = RoundingMode{:Throw}()
+
+# make our own conf struct to avoid specializing Parsers.typeparser on each unique precision value
+struct FixedDecimalConf{T<:Integer} <: AbstractConf{T}
+    f::Int # the `f` type parameter of the FixedDecimal being parsed, stored as a runtime value (see `Parsers.conf`)
+end
+# This overload says that when parsing a FixedDecimal type, use our new custom FixedDecimalConf type
+function Parsers.conf(::Type{FixedDecimal{T,f}}, opts::Parsers.Options, kw...) where {T<:Integer,f}
+    # Move the precision `f` from the type domain into a field; the conf type depends only on `T`
+    return FixedDecimalConf{T}(f)
+end
+# Because the value returned from our `typeparser` isn't a FixedDecimal, we overload here to show we're returning an integer type
+function Parsers.returntype(::Type{FixedDecimal{T,f}}) where {T,f}
+    return T
+end
+# This overload allows us to take the Result{IntegerType} returned from typeparser and turn it into a FixedDecimal Result
+function Parsers.result(FD::Type{FixedDecimal{T,f}}, res::Parsers.Result{T}) where {T,f}
+    # For invalid codes, build the Result from the code and length only (no value)
+    if Parsers.invalid(res.code)
+        return Result{FD}(res.code, res.tlen)
+    end
+    # Otherwise reinterpret the parsed backing integer as the requested FixedDecimal type
+    return Result{FD}(res.code, res.tlen, reinterpret(FD, res.val))
+end
+# Tell Parsers that we can use our custom typeparser and not rely on Base.tryparse
+function Parsers.supportedtype(::Type{<:FixedDecimal})
+    return true
+end
+
+const OPTIONS_ROUND_NEAREST = Parsers.Options(rounding=RoundNearest)
+const OPTIONS_ROUND_TO_ZERO = Parsers.Options(rounding=RoundToZero)
+const OPTIONS_ROUND_THROWS = Parsers.Options(rounding=nothing)
+
+# TODO: a lookup table per type would be faster
+# Multiply `n` by `10^decpos`, with the power of ten computed in `n`'s own type `T`
+@inline function _shift(n::T, decpos) where {T}
+    return T(10)^decpos * n
+end
+
+const _BIGINT1 = BigInt(1)
+const _BIGINT2 = BigInt(2)
+const _BIGINT10 = BigInt(10)
+const _BIGINT_10s = BigInt[] # buffer for the power-of-ten divisors in _divpow10!, accessed via `Parsers.access_threaded`
+const _BIGINT_Rs = BigInt[]  # buffer for "remainders" in _divpow10!, accessed via `Parsers.access_threaded`
+
+for T in (Base.BitSigned_types..., Base.BitUnsigned_types...)
+    # The number of digits an integer of type T can hold, i.e. the decimal digit count of `typemax(T)`
+    @eval _maxintdigits(::Type{$T}) = $(ndigits(typemax(T)))
+end
+
+# Convert `v` to the integer type `T` without range checks. All `v`s are non-negative.
+function _unsafe_convert_int(::Type{T}, v::V) where {T<:Integer,V<:Integer}
+    if sizeof(T) > sizeof(V)
+        # Widening: a regular conversion is enough
+        return T(v)
+    elseif sizeof(T) < sizeof(V)
+        # Narrowing: truncate without checking the range
+        return unsafe_trunc(T, v)
+    else
+        # Same size: reuse the bits as-is
+        return Base.bitcast(T, v)
+    end
+end
+_unsafe_convert_int(::Type{T}, v::BigInt) where {T<:Integer} = unsafe_trunc(T, v)
+_unsafe_convert_int(::Type{T}, v::T) where {T<:Integer} = v
+
+# Return `true` when the non-negative magnitude `v` (negated if `neg`) does not fit in `T`
+function _check_overflows(::Type{T}, v::BigInt, neg::Bool) where {T<:Integer}
+    neg && return -v < typemin(T)
+    return v > typemax(T)
+end
+function _check_overflows(::Type{T}, v::V, neg::Bool) where {T<:Integer,V<:Union{UInt64,UInt128}}
+    # A `T` wider than `V` is reported as never overflowing
+    sizeof(T) <= sizeof(V) || return false
+    if neg
+        # Largest allowed magnitude for a negative value is `typemax(T) + 1`, computed in `V`
+        return v > _unsafe_convert_int(V, typemax(T)) + one(V)
+    else
+        return v > typemax(T)
+    end
+end
+_check_overflows(::Type{T}, v::T, neg::Bool) where {T <: Integer} = false
+
+# Computes `div(x, 10^pow, mode)` and returns it with `code`; under RoundThrows a non-zero
+# remainder sets `Parsers.INEXACT` in the returned code instead of rounding.
+# x is non-negative, pow is >= 1
+# The `!` signals that the BigInt methods mutate `x` in-place
+function _divpow10!(x::T, code, pow, mode::RoundingMode) where {T}
+    divisor = _shift(one(T), pow)
+    return div(x, divisor, mode), code
+end
+function _divpow10!(x::T, code, pow, ::RoundingMode{:Throw}) where {T}
+    quotient, remainder = divrem(x, _shift(one(T), pow))
+    if remainder != 0
+        code |= Parsers.INEXACT
+    end
+    return quotient, code
+end
+# BigInt method for `x = div(x, 10^pow, RoundNearest)`, done in place with GMP calls.
+# `x` is overwritten with the rounded quotient and returned together with the unchanged `code`.
+# `r` and `y` are scratch BigInts obtained from `_BIGINT_Rs` / `_BIGINT_10s` via `Parsers.access_threaded`.
+function _divpow10!(x::BigInt, code, pow, ::RoundingMode{:Nearest})
+    # adapted from https://github.com/JuliaLang/julia/blob/112554e1a533cebad4cb0daa27df59636405c075/base/div.jl#L217
+    @inbounds r = Parsers.access_threaded(() -> (@static VERSION > v"1.5" ? BigInt(; nbits=256) : BigInt()), _BIGINT_Rs)  # we must not yield here!
+    @inbounds y = Parsers.access_threaded(() -> (@static VERSION > v"1.5" ? BigInt(; nbits=256) : BigInt()), _BIGINT_10s) # we must not yield here!
+    Base.GMP.MPZ.set!(y, _BIGINT10)             # y = 10
+    Base.GMP.MPZ.pow_ui!(y, pow)                # y = y^pow
+    Base.GMP.MPZ.tdiv_qr!(x, r, x, y)           # x, r = divrem(x, y)
+    Base.GMP.MPZ.tdiv_q!(y, _BIGINT2)           # y = div(y, 2)
+    # The round-up threshold is half the divisor, plus one when the quotient is even, so an
+    # exact tie only rounds up an odd quotient (ties go to the even neighbour).
+    iseven(x) && Base.GMP.MPZ.add!(y, _BIGINT1) # y = y + iseven(x)
+    if r >= y
+        Base.GMP.MPZ.add!(x, _BIGINT1)          # x = x + (r >= y)
+    end
+    return x, code
+end
+# BigInt method for `x = div(x, 10^pow, RoundToZero)`: truncating division done in place.
+# `x` is overwritten with the quotient and returned together with the unchanged `code`.
+function _divpow10!(x::BigInt, code, pow, ::RoundingMode{:ToZero})
+    @inbounds y = Parsers.access_threaded(() -> (@static VERSION > v"1.5" ? BigInt(; nbits=256) : BigInt()), _BIGINT_10s) # we must not yield here!
+    Base.GMP.MPZ.set!(y, _BIGINT10) # y = 10
+    Base.GMP.MPZ.pow_ui!(y, pow)    # y = y^pow
+    Base.GMP.MPZ.tdiv_q!(x, y)      # x = div(x, y)
+    return x, code
+end
+
+# BigInt method for RoundThrows: divides `x` by `10^pow` in place and, instead of rounding,
+# sets `Parsers.INEXACT` in the returned code when the remainder is non-zero.
+function _divpow10!(x::BigInt, code, pow, ::RoundingMode{:Throw})
+    @inbounds y = Parsers.access_threaded(() -> (@static VERSION > v"1.5" ? BigInt(; nbits=256) : BigInt()), _BIGINT_10s) # we must not yield here!
+    Base.GMP.MPZ.set!(y, _BIGINT10)   # y = 10
+    Base.GMP.MPZ.pow_ui!(y, pow)      # y = y^pow
+    # `y` holds the divisor going in and is reused to receive the remainder
+    Base.GMP.MPZ.tdiv_qr!(x, y, x, y) # x, y = divrem(x, y)
+    y == 0 || (code |= Parsers.INEXACT)
+    return x, code
+end
+
+# Rescale the digits we accumulated so far into an integer representing the decimal
+# Note the 2nd argument `FloatType` is used by Parsers.jl for _float_ parsing, but we can ignore in the fixed decimal case
+# Turns the accumulated `digits` and decimal exponent `exp` into the integer of type `T` that
+# backs a FixedDecimal with `conf.f` decimal places, and returns `(value, code)`.
+# Bits added to `code`:
+#   * `Parsers.INVALID`  -- negative input for an unsigned `T`
+#   * `Parsers.OVERFLOW` -- the scaled value does not fit in `T`
+#   * `Parsers.INEXACT`  -- rounding would be needed under `RoundThrows`
+#                           (also used when `options.rounding` is `nothing`)
+# The value is negated at the end when `neg` is true.
+@inline function Parsers.scale(
+    conf::FixedDecimalConf{T}, ::Parsers.FloatType, digits::V, exp, neg, code, ndigits, f::F, options::Parsers.Options
+) where {T,V,F}
+    rounding = something(options.rounding, RoundThrows)
+    # Positive: how many trailing zeroes we need to add to our integer
+    # Negative: how many digits are past our precision (we need to handle them in rounding)
+    decimal_shift = conf.f + exp
+    # Number of digits we need to accumulate including any trailing zeros or digits past our precision
+    backing_integer_digits = ndigits + decimal_shift
+    may_overflow = backing_integer_digits == _maxintdigits(T)
+    if iszero(ndigits)
+        # all digits are zero
+        i = zero(T)
+    # The backing_integer_digits == 0 case is handled in the `else` (it means
+    # that all the digits are past the precision but we might get `1` from rounding)
+    elseif backing_integer_digits < 0
+        # All digits are past our precision, no overflow possible, but we might get an inexact
+        i = zero(T)
+        (rounding === RoundThrows) && (code |= Parsers.INEXACT)
+    elseif neg && (T <: Unsigned)
+        # Unsigned types can't represent negative numbers
+        i = _unsafe_convert_int(T, digits)
+        code |= Parsers.INVALID
+    elseif backing_integer_digits > _maxintdigits(T)
+        i = _unsafe_convert_int(T, digits)
+        # The number of digits to accumulate is larger than the capacity of T, we overflow
+        # We don't check for inexact here because we already have an error
+        code |= Parsers.OVERFLOW
+    else
+        if decimal_shift > 0
+            r = _unsafe_convert_int(T, digits)
+            # 10^decimal_shift fits in T here: ndigits is non-zero and backing_integer_digits
+            # is at most _maxintdigits(T), so decimal_shift is smaller than _maxintdigits(T).
+            # The product can still wrap around. Comparing the wrapped product against `r`
+            # misses wraps that land on a larger positive value (e.g. for Int8, 3 * 100 wraps
+            # to 44), so detect overflow with a checked multiplication instead.
+            i, overflowed = Base.Checked.mul_with_overflow(_shift(one(T), decimal_shift), r)
+            overflowed && (code |= Parsers.OVERFLOW)
+        elseif decimal_shift < 0
+            if rounding === RoundNearest
+                r, code = _divpow10!(digits, code, -decimal_shift, RoundNearest)
+            elseif rounding === RoundToZero
+                r, code = _divpow10!(digits, code, -decimal_shift, RoundToZero)
+            else
+                r, code = _divpow10!(digits, code, -decimal_shift, RoundThrows)
+            end
+            # Now that the digits were rescaled we can check for overflow
+            # can happen e.g. if digits were unsigned ints and our type is signed
+            may_overflow && _check_overflows(T, r, neg) && (code |= Parsers.OVERFLOW)
+            i = _unsafe_convert_int(T, r)
+        else
+            may_overflow && _check_overflows(T, digits, neg) && (code |= Parsers.OVERFLOW)
+            i = _unsafe_convert_int(T, digits)
+        end
+    end
+    out = ifelse(neg, -i, i)
+    return (out, code)
+end
+
+# Integer-only input (no fractional or exponent digits) is treated as a decimal with an
+# exponent of 0, so that `scale` performs the type conversions and overflow checks.
+@inline function Parsers.noscale(conf::FixedDecimalConf{T}, digits::Integer, neg::Bool, code, ndigits, f::F, options::Parsers.Options) where {T,F}
+    # The float type argument is only a placeholder; the FixedDecimal parser does not use it
+    return Parsers.scale(conf, Parsers.FLOAT64, digits, 0, neg, code, ndigits, f, options)
+end
+
+# This hooks into the floating point parsing machinery from Parsers.jl, where we also accumulate
+# all the digits and note the effective exponent before we do "scaling" -- for FixedDecimals,
+# the scaling means padding the backing integer with zeros or rounding them as necessary.
+# We overloaded the "scale" and "noscale" methods to produce backing integers for FixedDecimals.
+# We return a value of T -- i.e. the _integer_ backing the FixedDecimal; the reinterpret needs to happen later
+@inline function Parsers.typeparser(conf::FixedDecimalConf{T}, source, pos, len, b, code, pl, options) where {T<:Integer}
+    # Reject rounding modes that `Parsers.scale` has no branch for
+    if !(options.rounding in (nothing, RoundNearest, RoundToZero, RoundThrows))
+        throw(ArgumentError("Unhandled rounding mode $(options.rounding)"))
+    end
+
+    startpos = pos
+    # begin parsing
+    # `b` is the current byte; consume an optional leading sign and remember whether it was '-'
+    neg = b == UInt8('-')
+    if neg || b == UInt8('+')
+        pos += 1
+        Parsers.incr!(source)
+        # A sign with nothing after it is invalid
+        if Parsers.eof(source, pos, len)
+            code |= Parsers.INVALID | Parsers.EOF
+            x = zero(T)
+            @goto done
+        end
+        b = Parsers.peekbyte(source, pos)
+    else
+        # Check if the input is empty
+        if Parsers.eof(source, pos, len)
+            code |= Parsers.INVALID | Parsers.EOF
+            x = zero(T)
+            @goto done
+        end
+    end
+
+    # Only a digit or the decimal separator may start the number; with `b` a `UInt8` the
+    # subtraction wraps around, so the comparison holds only for '0'..'9'
+    if (b - UInt8('0')) <= 0x09 || b == options.decimal
+        x, code, pos = Parsers.parsedigits(conf, source, pos, len, b, code, options, UInt64(0), neg, startpos, true, 0, nothing)
+    else
+        x = zero(T)
+        code |= Parsers.INVALID
+    end
+    @label done
+    # `x` is the backing integer (or zero on failure); the PosLen spans from `pl.pos` to the final `pos`
+    return pos, code, Parsers.PosLen(pl.pos, pos - pl.pos), x
+end
+
+# Shared implementation behind `Base.parse` and `Base.tryparse`: validates the rounding mode
+# and runs `Parsers.xparse2` over the code units of `source`, returning its result object.
+function _base_parse(::Type{FD{T, f}}, source::AbstractString, mode::RoundingMode=RoundNearest) where {T, f}
+    mode in (RoundThrows, RoundNearest, RoundToZero) ||
+        throw(ArgumentError("Unhandled rounding mode $mode"))
+
+    # Pick the preallocated Parsers options matching the requested rounding mode
+    options = if mode === RoundNearest
+        OPTIONS_ROUND_NEAREST
+    elseif mode === RoundToZero
+        OPTIONS_ROUND_TO_ZERO
+    else
+        OPTIONS_ROUND_THROWS
+    end
+    bytes = codeunits(source)
+    return Parsers.xparse2(FD{T, f}, bytes, 1, length(bytes), options)
+end
+
+# Parse `source` as an `FD{T, f}`, returning `nothing` instead of throwing on bad input.
+function Base.tryparse(::Type{FD{T, f}}, source::AbstractString, mode::RoundingMode=RoundNearest) where {T, f}
+    if isempty(source)
+        return nothing
+    end
+    res = _base_parse(FD{T, f}, source, mode)
+    # Accept only when the whole string was consumed (eof) and the code is ok; not reaching
+    # eof means there was some garbage after something that looked like a number
+    if Parsers.eof(res.code) && Parsers.ok(res.code)
+        return res.val
+    end
+    return nothing
+end
+
+# Parse `source` as an `FD{T, f}`. Throws `ArgumentError` for empty or malformed input,
+# `InexactError` when the result code is inexact, and `OverflowError` when it signals overflow.
+function Base.parse(::Type{FD{T, f}}, source::AbstractString, mode::RoundingMode=RoundNearest) where {T, f}
+    if isempty(source)
+        throw(ArgumentError("Empty input is not allowed"))
+    end
+    res = _base_parse(FD{T, f}, source, mode)
+    code = res.code
+    # Checked in this order: inexact, then overflow, then any other parse failure
+    if Parsers.inexact(code)
+        throw(InexactError(:parse, FD{T, f}, source))
+    elseif Parsers.overflow(code)
+        throw(OverflowError("overflow parsing $(repr(source)) as $(FD{T, f})"))
+    elseif !Parsers.eof(code) || Parsers.invalid(code)
+        # If we didn't reach eof, there was some garbage at the end of the string after something that looked like a number
+        throw(ArgumentError("cannot parse $(repr(source)) as $(FD{T, f})"))
+    end
+    return res.val
+end
diff --git a/test/parse_tests.jl b/test/parse_tests.jl
diff --git a/test/runtests.jl b/test/runtests.jl