@@ -36,15 +36,13 @@ const LOG2_E = 1.442695040888963407359924681001892137426646
 
 # log(2) into upper and lower bits
 LN2U(::Type{Float32}) = 6.9313812256f-1
-
 LN2L(::Type{Float32}) = 9.0580006145f-6
 
-
-# -log(2)/1024 into upper and lower bits
-LN2o1024U(::Type{Float64}) = -6.769015435155716e-4
-LN2o1024L(::Type{Float64}) = -2.264694154146777e-20
-# 1024/log(2)
-LN2o1024INV(::Type{Float64}) = 1477.3197218702985
+# -log(2)/256 into upper and lower bits
+LN2o256U(::Type{Float64}) = -2.7076061740622863e-3
+LN2o256L(::Type{Float64}) = -9.058776616587108e-20
+# 256/log(2)
+LN2o256INV(::Type{Float64}) = 369.3299304675746
 
 # magic rounding constant: 1.5*2^52. Adding, then subtracting it from a float rounds it to the nearest integer.
 MAGIC_ROUND_CONST(::Type{Float64}) = 6755399441055744.0
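
As an aside (not part of the patch), the two pieces above can be checked directly: the hi/lo pair is the usual split of -log(2)/256 into a leading Float64 plus the remainder that Float64 cannot hold, and the magic constant trick amounts to reading the rounded integer out of the low bits. A minimal sketch, assuming the constants were derived via BigFloat:

# Hypothetical sketch (not part of the patch): deriving the hi/lo split of -log(2)/256.
c  = -log(big(2)) / 256
hi = Float64(c)        # ≈ -2.7076061740622863e-3, the LN2o256U value above
lo = Float64(c - hi)   # ≈ -9.058776616587108e-20, the LN2o256L value above

# The magic rounding constant: adding 1.5*2^52 forces rounding to the nearest integer,
# which can then be read straight out of the low bits of the result.
MAGIC = 6755399441055744.0
shifted = 123.7 + MAGIC
@assert shifted - MAGIC == 124.0                     # subtracting recovers the rounded value
@assert reinterpret(Int64, shifted) % Int32 == 124   # the integer sits in the low mantissa bits
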
@@ -58,11 +56,16 @@ MIN_EXP(::Type{Float64}) = -745.1332191019412076235 # log 2^-1075
 MIN_EXP(::Type{Float32}) = -103.97207708f0 # log 2^-150
 
 
-@inline exp_kernel(x) = evalpoly(x, (0.9999999999999999, 1.0, 0.5000000047728719, 0.16666666809852823))
+@inline exp_kernel(x) = evalpoly(x, (1.0,
+                                     0.9999999999999912,
+                                     0.4999999999999962,
+                                     0.16666668575815502,
+                                     0.041666671121347275))
+
 @inline exp_kernel(x::Float32) = @horner(x, 1.6666625440f-1, -2.7667332906f-3)
 
 # Note that this would need to be included literally to actually run.
-const J_TABLE = Float64[2.0^(-53 + big(j-1)/1024) for j in 1:1024]
+const J_TABLE = Float64[2.0^(-53 + big(j-1)/256) for j in 1:256]
 
 # for values smaller than this threshold just use a Taylor expansion
 @eval exp_small_thres(::Type{Float32}) = $(2.0f0^-13)
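
A quick sanity check (not part of the patch) on the new degree-4 kernel: over the reduced range |r| <= log(2)/512 that the rewritten reduction produces, its relative error against exp should land around Float64 machine epsilon. A minimal sketch, assuming Julia 1.4+ for evalpoly:

# Hypothetical accuracy check for the new Float64 kernel (not part of the patch).
exp_kernel(x) = evalpoly(x, (1.0, 0.9999999999999912, 0.4999999999999962,
                             0.16666668575815502, 0.041666671121347275))

rs = range(-log(2)/512, stop=log(2)/512, length=10_001)
maxrelerr = maximum(abs(exp_kernel(r) - exp(r)) / exp(r) for r in rs)
# Expect maxrelerr on the order of 1e-16, i.e. roughly one ulp.
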
@@ -146,34 +149,33 @@ function exp(x::T) where T<:Float32
 end
 
 # Method
-# 1. Argument reduction: Reduce x to an r so that |r| <= ln(2)/1024. Given x,
+# 1. Argument reduction: Reduce x to an r so that |r| <= ln(2)/512. Given x,
 #    find r and integers k, j such that
-#       x = k*ln(2) + j/1024 + r,  0 <= j < 1024,  |r| <= ln(2)/1024.
+#       x = (k + j/256)*ln(2) + r,  0 <= j < 256,  |r| <= ln(2)/512.
 #
-# 2. Approximate exp(r) by it's degree 3 taylor series around 0.
+# 2. Approximate exp(r) by its degree 4 Taylor series around 0.
 #    Since the bounds on r are very tight, this is sufficient to be accurate to floating point epsilon.
 #
-# 3. Scale back: exp(x) = 2^k * 2^(j/1024) * exp(r)
-#    Since the range of possible j is small, 2^(j/1024) is simply stored for all possible values.
+# 3. Scale back: exp(x) = 2^k * 2^(j/256) * exp(r)
+#    Since the range of possible j is small, 2^(j/256) is simply stored for all possible values.
 #    Technically the J_TABLE stores a scaled version which makes subnormal numbers work better.
 
 
-
 function myexp(x::T) where T<:Float64
     if !(abs(x) < abs(MAX_EXP(T)))
         x <= MIN_EXP(T) && return 0.0
         x >= MAX_EXP(T) && return Inf
         isnan(x) && return x
     end
 
-    N_float = muladd(x, LN2o1024INV(T), MAGIC_ROUND_CONST(T))
+    N_float = muladd(x, LN2o256INV(T), MAGIC_ROUND_CONST(T))
     N = reinterpret(Int64, N_float) % Int32
 
     N_float -= MAGIC_ROUND_CONST(T)
-    r = muladd(N_float, LN2o1024L(T), muladd(N_float, LN2o1024U(T), x))
-    k = N >> 10
+    r = muladd(N_float, LN2o256L(T), muladd(N_float, LN2o256U(T), x))
+    k = N >> 8
 
-    small_part = @inbounds J_TABLE[N&1023 + 1] * exp_kernel(r)
+    small_part = @inbounds J_TABLE[N&255 + 1] * exp_kernel(r)
 
     twopk = rem(53 + k, UInt64) << 52
     return reinterpret(T, twopk + reinterpret(Int, small_part))
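
A hypothetical smoke test for the rewritten path (not part of the patch), assuming the constants, J_TABLE, exp_kernel, and the MAX_EXP/MIN_EXP definitions above have been included literally, as the note above the table warns. The values 710.0 and -746.0 are chosen just beyond the overflow and underflow thresholds:

# Hypothetical smoke test: compare myexp against Base's exp over a spread of inputs.
for x in (-700.0, -2.5, -1.0, 0.0, 1e-3, 0.5, 1.0, 88.0, 700.0)
    @assert isapprox(myexp(x), exp(x); rtol=1e-14)
end
@assert myexp(710.0) == Inf    # overflow saturates to Inf
@assert myexp(-746.0) == 0.0   # underflow flushes to zero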