|
15 | 15 | */
|
16 | 16 | =#
|
17 | 17 |
|
18 |
| -const a0f = 7.7215664089f-02 #= 0x3d9e233f =# |
19 |
| -const a1f = 3.2246702909f-01 #= 0x3ea51a66 =# |
20 |
| -const a2f = 6.7352302372f-02 #= 0x3d89f001 =# |
21 |
| -const a3f = 2.0580807701f-02 #= 0x3ca89915 =# |
22 |
| -const a4f = 7.3855509982f-03 #= 0x3bf2027e =# |
23 |
| -const a5f = 2.8905137442f-03 #= 0x3b3d6ec6 =# |
24 |
| -const a6f = 1.1927076848f-03 #= 0x3a9c54a1 =# |
25 |
| -const a7f = 5.1006977446f-04 #= 0x3a05b634 =# |
26 |
| -const a8f = 2.2086278477f-04 #= 0x39679767 =# |
27 |
| -const a9f = 1.0801156895f-04 #= 0x38e28445 =# |
28 |
| -const a10f = 2.5214456400f-05 #= 0x37d383a2 =# |
29 |
| -const a11f = 4.4864096708f-05 #= 0x383c2c75 =# |
30 |
| -const tcf = 1.4616321325f+00 #= 0x3fbb16c3 =# |
31 |
| -const tff = -1.2148628384f-01 #= 0xbdf8cdcd =# |
32 |
| -# #= tt = -(tail of tf) =# |
33 |
| -const ttf = 6.6971006518f-09 #= 0x31e61c52 =# |
34 |
| -const t0f = 4.8383611441f-01 #= 0x3ef7b95e =# |
35 |
| -const t1f = -1.4758771658f-01 #= 0xbe17213c =# |
36 |
| -const t2f = 6.4624942839f-02 #= 0x3d845a15 =# |
37 |
| -const t3f = -3.2788541168f-02 #= 0xbd064d47 =# |
38 |
| -const t4f = 1.7970675603f-02 #= 0x3c93373d =# |
39 |
| -const t5f = -1.0314224288f-02 #= 0xbc28fcfe =# |
40 |
| -const t6f = 6.1005386524f-03 #= 0x3bc7e707 =# |
41 |
| -const t7f = -3.6845202558f-03 #= 0xbb7177fe =# |
42 |
| -const t8f = 2.2596477065f-03 #= 0x3b141699 =# |
43 |
| -const t9f = -1.4034647029f-03 #= 0xbab7f476 =# |
44 |
| -const t10f = 8.8108185446f-04 #= 0x3a66f867 =# |
45 |
| -const t11f = -5.3859531181f-04 #= 0xba0d3085 =# |
46 |
| -const t12f = 3.1563205994f-04 #= 0x39a57b6b =# |
47 |
| -const t13f = -3.1275415677f-04 #= 0xb9a3f927 =# |
48 |
| -const t14f = 3.3552918467f-04 #= 0x39afe9f7 =# |
49 |
| -const u0f = -7.7215664089f-02 #= 0xbd9e233f =# |
50 |
| -const u1f = 6.3282704353f-01 #= 0x3f2200f4 =# |
51 |
| -const u2f = 1.4549225569f+00 #= 0x3fba3ae7 =# |
52 |
| -const u3f = 9.7771751881f-01 #= 0x3f7a4bb2 =# |
53 |
| -const u4f = 2.2896373272f-01 #= 0x3e6a7578 =# |
54 |
| -const u5f = 1.3381091878f-02 #= 0x3c5b3c5e =# |
55 |
| -const v1f = 2.4559779167f+00 #= 0x401d2ebe =# |
56 |
| -const v2f = 2.1284897327f+00 #= 0x4008392d =# |
57 |
| -const v3f = 7.6928514242f-01 #= 0x3f44efdf =# |
58 |
| -const v4f = 1.0422264785f-01 #= 0x3dd572af =# |
59 |
| -const v5f = 3.2170924824f-03 #= 0x3b52d5db =# |
60 |
| -const s0f = -7.7215664089f-02 #= 0xbd9e233f =# |
61 |
| -const s1f = 2.1498242021f-01 #= 0x3e5c245a =# |
62 |
| -const s2f = 3.2577878237f-01 #= 0x3ea6cc7a =# |
63 |
| -const s3f = 1.4635047317f-01 #= 0x3e15dce6 =# |
64 |
| -const s4f = 2.6642270386f-02 #= 0x3cda40e4 =# |
65 |
| -const s5f = 1.8402845599f-03 #= 0x3af135b4 =# |
66 |
| -const s6f = 3.1947532989f-05 #= 0x3805ff67 =# |
67 |
| -const r1f = 1.3920053244f+00 #= 0x3fb22d3b =# |
68 |
| -const r2f = 7.2193557024f-01 #= 0x3f38d0c5 =# |
69 |
| -const r3f = 1.7193385959f-01 #= 0x3e300f6e =# |
70 |
| -const r4f = 1.8645919859f-02 #= 0x3c98bf54 =# |
71 |
| -const r5f = 7.7794247773f-04 #= 0x3a4beed6 =# |
72 |
| -const r6f = 7.3266842264f-06 #= 0x36f5d7bd =# |
73 |
| -const w0f = 4.1893854737f-01 #= 0x3ed67f1d =# |
74 |
| -const w1f = 8.3333335817f-02 #= 0x3daaaaab =# |
75 |
| -const w2f = -2.7777778450f-03 #= 0xbb360b61 =# |
76 |
| -const w3f = 7.9365057172f-04 #= 0x3a500cfd =# |
77 |
| -const w4f = -5.9518753551f-04 #= 0xba1c065c =# |
78 |
| -const w5f = 8.3633989561f-04 #= 0x3a5b3dd2 =# |
79 |
| -const w6f = -1.6309292987f-03 #= 0xbad5c4e8 =# |
80 |
| - |
81 | 18 | # Matches OpenLibm behavior exactly, including return of sign
|
82 | 19 | function _lgammaf_r(x::Float32)
|
83 | 20 | hx = reinterpret(Int32, x)
|
84 | 21 |
|
85 | 22 | #= purge off +-inf, NaN, +-0, tiny and negative arguments =#
|
86 | 23 | signgamp = Int32(1)
|
87 |
| - ix = signed(hx & 0x7fffffff) |
88 |
| - ix ≥ 0x7f800000 && return x * x, signgamp |
89 |
| - ix == 0 && return 1.0f0 / 0.0f0, signgamp |
| 24 | + ix = hx & 0x7fffffff |
| 25 | + ix ≥ 0x7f800000 && return Inf32, signgamp |
| 26 | + ix == 0x00000000 && return Inf32, signgamp |
90 | 27 | if ix < 0x35000000 #= |x|<2**-21, return -log(|x|) =#
|
91 |
| - if hx < 0 |
| 28 | + if hx < Int32(0) |
92 | 29 | signgamp = Int32(-1)
|
93 | 30 | return -log(-x), signgamp
|
94 | 31 | else
|
95 | 32 | return -log(x), signgamp
|
96 | 33 | end
|
97 | 34 | end
|
98 |
| - if hx < 0 |
99 |
| - ix ≥ 0x4b000000 && return 1.0f0 / 0.0f0, signgamp #= |x|>=2**23, must be -integer =# |
| 35 | + if hx < Int32(0) |
| 36 | + # ix ≥ 0x4b000000 && return Inf32, signgamp #= |x|>=2**23, must be -integer =# |
100 | 37 | t = sinpi(x)
|
101 |
| - t == 0.0f0 && return 1.0f0 / 0.0f0, signgamp #= -integer =# |
| 38 | + t == 0.0f0 && return Inf32, signgamp #= -integer =# |
102 | 39 | nadj = log(π / abs(t * x))
|
103 | 40 | if t < 0.0f0; signgamp = Int32(-1); end
|
104 | 41 | x = -x
|
105 | 42 | end
|
106 | 43 |
|
107 |
| - #= purge off 1 and 2 =# |
108 |
| - if ix == 0x3f800000 || ix == 0x40000000 |
109 |
| - r = 0.0f0 |
110 |
| - #= for x < 2.0 =# |
111 |
| - elseif ix < 0x40000000 |
112 |
| - if ix ≤ 0x3f666666 #= lgamma(x) = lgamma(x+1)-log(x) =# |
113 |
| - r = -log(x) |
114 |
| - if ix ≥ 0x3f3b4a20 |
115 |
| - y = 1.0f0 - x |
116 |
| - i = Int8(0) |
117 |
| - elseif ix ≥ 0x3e6d3308 |
118 |
| - y = x - (tcf - 1.0f0) |
119 |
| - i = Int8(1) |
120 |
| - else |
121 |
| - y = x |
122 |
| - i = Int8(2) |
123 |
| - end |
124 |
| - else |
| 44 | + if ix ≤ 0x40000000 #= for 1.0 ≤ x ≤ 2.0 =# |
| 45 | + i = round(x, RoundToZero) |
| 46 | + f = x - i |
| 47 | + if f == 0.0f0 #= purge off 1 and 2 =# |
| 48 | + return 0.0f0, signgamp |
| 49 | + elseif i == 1.0f0 |
125 | 50 | r = 0.0f0
|
126 |
| - if ix ≥ 0x3fdda618 #= [1.7316,2] =# |
127 |
| - y = 2.0f0 - x |
128 |
| - i = Int8(0) |
129 |
| - elseif ix ≥ 0x3f9da620 #= [1.23,1.73] =# |
130 |
| - y = x - tcf |
131 |
| - i = Int8(1) |
132 |
| - else |
133 |
| - y = x - 1.0f0 |
134 |
| - i = Int8(2) |
135 |
| - end |
136 |
| - end |
137 |
| - if i == Int8(0) |
138 |
| - z = y*y; |
139 |
| - p1 = a0f+z*(a2f+z*(a4f+z*(a6f+z*(a8f+z*a10f)))); |
140 |
| - p2 = z*(a1f+z*(a3f+z*(a5f+z*(a7f+z*(a9f+z*a11f))))); |
141 |
| - p = y*p1+p2; |
142 |
| - r += (p-0.5f0*y); |
143 |
| - elseif i == Int8(1) |
144 |
| - z = y*y; |
145 |
| - w = z*y; |
146 |
| - p1 = t0f+w*(t3f+w*(t6f+w*(t9f +w*t12f))); #= parallel comp =# |
147 |
| - p2 = t1f+w*(t4f+w*(t7f+w*(t10f+w*t13f))); |
148 |
| - p3 = t2f+w*(t5f+w*(t8f+w*(t11f+w*t14f))); |
149 |
| - p = z*p1-(ttf-w*(p2+y*p3)); |
150 |
| - r += (tff + p) |
151 |
| - elseif i == Int8(2) |
152 |
| - p1 = y*(u0f+y*(u1f+y*(u2f+y*(u3f+y*(u4f+y*u5f))))); |
153 |
| - p2 = 1.0f0+y*(v1f+y*(v2f+y*(v3f+y*(v4f+y*v5f)))); |
154 |
| - r += (-0.5f0*y + p1/p2); |
155 |
| - end |
156 |
| - elseif ix < 0x41000000 #= x < 8.0 =# |
157 |
| - i = Base.unsafe_trunc(Int8, x) |
158 |
| - y = x - Float32(i) |
159 |
| - # If performed here, performance is 2x worse; hence, move it below. |
160 |
| - # p = y*(s0f+y*(s1f+y*(s2f+y*(s3f+y*(s4f+y*(s5f+y*s6f)))))); |
161 |
| - # q = 1.0f0 + y*(r1f+y*(r2f+y*(r3f+y*(r4f+y*(r5f+y*r6f))))); |
162 |
| - # r = 0.5f0*y+p/q; |
163 |
| - z = 1.0f0; #= lgamma(1+s) = log(s) + lgamma(s) =# |
164 |
| - if i == Int8(7) |
165 |
| - z *= (y + 6.0f0) |
166 |
| - @goto case6 |
167 |
| - elseif i == Int8(6) |
168 |
| - @label case6 |
169 |
| - z *= (y + 5.0f0) |
170 |
| - @goto case5 |
171 |
| - elseif i == Int8(5) |
172 |
| - @label case5 |
173 |
| - z *= (y + 4.0f0) |
174 |
| - @goto case4 |
175 |
| - elseif i == Int8(4) |
176 |
| - @label case4 |
177 |
| - z *= (y + 3.0f0) |
178 |
| - @goto case3 |
179 |
| - elseif i == Int8(3) |
180 |
| - @label case3 |
181 |
| - z *= (y + 2.0f0) |
182 |
| - end |
183 |
| - # r += log(z) |
184 |
| - p = y*(s0f+y*(s1f+y*(s2f+y*(s3f+y*(s4f+y*(s5f+y*s6f)))))); |
185 |
| - q = 1.0f0 + y*(r1f+y*(r2f+y*(r3f+y*(r4f+y*(r5f+y*r6f))))); |
186 |
| - r = log(z) + 0.5f0*y+p/q; |
187 |
| - #= 8.0 ≤ x < 2^58 =# |
188 |
| - elseif ix < 0x5c800000 |
189 |
| - t = log(x) |
190 |
| - z = 1.0f0 / x |
191 |
| - y = z * z |
192 |
| - w = w0f+z*(w1f+y*(w2f+y*(w3f+y*(w4f+y*(w5f+y*w6f))))); |
193 |
| - r = (x-0.5f0)*(t-1.0f0)+w; |
194 |
| - else |
195 |
| - #= 2^58 ≤ x ≤ Inf =# |
196 |
| - r = x * (log(x) - 1.0f0) |
197 |
| - end |
198 |
| - if hx < 0 |
199 |
| - r = nadj - r |
200 |
| - end |
201 |
| - return r, signgamp |
202 |
| -end |
203 |
| - |
204 |
| -# Deviates from OpenLibm: throws instead of returning negative sign; approximately 25% faster |
205 |
| -# when sign is not needed in subsequent computations. |
206 |
| -function _loggammaf_r(x::Float32) |
207 |
| - hx = reinterpret(Int32, x) |
208 |
| - |
209 |
| - #= purge off +-inf, NaN, +-0, tiny and negative arguments =# |
210 |
| - ix = signed(hx & 0x7fffffff) |
211 |
| - ix ≥ 0x7f800000 && return x * x |
212 |
| - ix == 0 && return 1.0f0 / 0.0f0 |
213 |
| - if ix < 0x35000000 #= |x|<2**-21, return -log(|x|) =# |
214 |
| - if hx < 0 |
215 |
| - # return -log(-x) |
216 |
| - throw(DomainError(x, "`gamma(x)` must be non-negative")) |
| 51 | + c = 1.0f0 |
217 | 52 | else
|
218 |
| - return -log(x) |
219 |
| - end |
220 |
| - end |
221 |
| - if hx < 0 |
222 |
| - ix ≥ 0x4b000000 && return 1.0f0 / 0.0f0 #= |x|>=2**23, must be -integer =# |
223 |
| - t = sinpi(x) |
224 |
| - t == 0.0f0 && return 1.0f0 / 0.0f0 #= -integer =# |
225 |
| - nadj = log(π / abs(t * x)) |
226 |
| - t < 0.0f0 && throw(DomainError(x, "`gamma(x)` must be non-negative")) |
227 |
| - x = -x |
228 |
| - end |
229 |
| - |
230 |
| - #= purge off 1 and 2 =# |
231 |
| - if ix == 0x3f800000 || ix == 0x40000000 |
232 |
| - r = 0.0f0 |
233 |
| - #= for x < 2.0 =# |
234 |
| - elseif ix < 0x40000000 |
235 |
| - if ix ≤ 0x3f666666 #= lgamma(x) = lgamma(x+1)-log(x) =# |
236 | 53 | r = -log(x)
|
237 |
| - if ix ≥ 0x3f3b4a20 |
238 |
| - y = 1.0f0 - x |
239 |
| - i = Int8(0) |
240 |
| - elseif ix ≥ 0x3e6d3308 |
241 |
| - y = x - (tcf - 1.0f0) |
242 |
| - i = Int8(1) |
243 |
| - else |
244 |
| - y = x |
245 |
| - i = Int8(2) |
246 |
| - end |
247 |
| - else |
248 |
| - r = 0.0f0 |
249 |
| - if ix ≥ 0x3fdda618 #= [1.7316,2] =# |
250 |
| - y = 2.0f0 - x |
251 |
| - i = Int8(0) |
252 |
| - elseif ix ≥ 0x3f9da620 #= [1.23,1.73] =# |
253 |
| - y = x - tcf |
254 |
| - i = Int8(1) |
255 |
| - else |
256 |
| - y = x - 1.0f0 |
257 |
| - i = Int8(2) |
258 |
| - end |
| 54 | + c = 0.0f0 |
259 | 55 | end
|
260 |
| - if i == Int8(0) |
261 |
| - z = y*y; |
262 |
| - p1 = a0f+z*(a2f+z*(a4f+z*(a6f+z*(a8f+z*a10f)))); |
263 |
| - p2 = z*(a1f+z*(a3f+z*(a5f+z*(a7f+z*(a9f+z*a11f))))); |
264 |
| - p = y*p1+p2; |
265 |
| - r += (p-0.5f0*y); |
266 |
| - elseif i == Int8(1) |
267 |
| - z = y*y; |
268 |
| - w = z*y; |
269 |
| - p1 = t0f+w*(t3f+w*(t6f+w*(t9f +w*t12f))); #= parallel comp =# |
270 |
| - p2 = t1f+w*(t4f+w*(t7f+w*(t10f+w*t13f))); |
271 |
| - p3 = t2f+w*(t5f+w*(t8f+w*(t11f+w*t14f))); |
272 |
| - p = z*p1-(ttf-w*(p2+y*p3)); |
273 |
| - r += (tff + p) |
274 |
| - elseif i == Int8(2) |
275 |
| - p1 = y*(u0f+y*(u1f+y*(u2f+y*(u3f+y*(u4f+y*u5f))))); |
276 |
| - p2 = 1.0f0+y*(v1f+y*(v2f+y*(v3f+y*(v4f+y*v5f)))); |
277 |
| - r += (-0.5f0*y + p1/p2); |
| 56 | + if f ≥ 0.7315998f0 |
| 57 | + y = 1.0f0 + c - x |
| 58 | + z = y * y |
| 59 | + p1 = evalpoly(z, (7.7215664089f-2, 6.7352302372f-2, 7.3855509982f-3, 1.1927076848f-3, 2.2086278477f-4, 2.5214456400f-5)) |
| 60 | + p2 = evalpoly(z, (3.2246702909f-1, 2.0580807701f-2, 2.8905137442f-3, 5.1006977446f-4, 1.0801156895f-4, 4.4864096708f-5)) |
| 61 | + p = muladd(p1, y, p2) |
| 62 | + r += muladd(y, -0.5f0, p) |
| 63 | + elseif f ≥ 0.23163998f0 # or, the lb? 0.2316322f0 |
| 64 | + y = x - 0.46163213f0 - c |
| 65 | + z = y * y |
| 66 | + w = z * y |
| 67 | + p1 = evalpoly(w, (4.8383611441f-1, -3.2788541168f-2, 6.1005386524f-3, -1.4034647029f-3, 3.1563205994f-4)) |
| 68 | + p2 = evalpoly(w, (-1.4758771658f-1, 1.7970675603f-2, -3.6845202558f-3, 8.8108185446f-4, -3.1275415677f-4)) |
| 69 | + p3 = evalpoly(w, (6.4624942839f-2, -1.0314224288f-2, 2.2596477065f-3, -5.3859531181f-4, 3.3552918467f-4)) |
| 70 | + p = muladd(z, p1, -muladd(w, -muladd(p3, y, p2), 6.6971006518f-9)) |
| 71 | + r += p - 1.2148628384f-1 |
| 72 | + else |
| 73 | + y = x - c |
| 74 | + p1 = y * evalpoly(y, (-7.7215664089f-2, 6.3282704353f-1, 1.4549225569f0, 9.7771751881f-1, 2.2896373272f-1, 1.3381091878f-2)) |
| 75 | + p2 = evalpoly(y, (1.0f0, 2.4559779167f0, 2.1284897327f0, 7.6928514242f-1, 1.0422264785f-1, 3.2170924824f-3)) |
| 76 | + r += muladd(y, -0.5f0, p1 / p2) |
278 | 77 | end
|
279 | 78 | elseif ix < 0x41000000 #= x < 8.0 =#
|
280 |
| - i = Base.unsafe_trunc(Int8, x) |
281 |
| - y = x - Float32(i) |
282 |
| - # If performed here, performance is 2x worse; hence, move it below. |
283 |
| - # p = y*(s0f+y*(s1f+y*(s2f+y*(s3f+y*(s4f+y*(s5f+y*s6f)))))); |
284 |
| - # q = 1.0f0 + y*(r1f+y*(r2f+y*(r3f+y*(r4f+y*(r5f+y*r6f))))); |
285 |
| - # r = 0.5f0*y+p/q; |
286 |
| - z = 1.0f0; #= lgamma(1+s) = log(s) + lgamma(s) =# |
287 |
| - if i == 7 |
288 |
| - z *= (y + 6.0f0) |
289 |
| - @goto case6 |
290 |
| - elseif i == 6 |
291 |
| - @label case6 |
292 |
| - z *= (y + 5.0f0) |
293 |
| - @goto case5 |
294 |
| - elseif i == 5 |
295 |
| - @label case5 |
296 |
| - z *= (y + 4.0f0) |
297 |
| - @goto case4 |
298 |
| - elseif i == 4 |
299 |
| - @label case4 |
300 |
| - z *= (y + 3.0f0) |
301 |
| - @goto case3 |
302 |
| - elseif i == 3 |
303 |
| - @label case3 |
304 |
| - z *= (y + 2.0f0) |
| 79 | + i = round(x, RoundToZero) |
| 80 | + y = x - i |
| 81 | + z = 1.0f0 |
| 82 | + p = 0.0f0 |
| 83 | + u = x |
| 84 | + while u ≥ 3.0f0 |
| 85 | + p -= 1.0f0 |
| 86 | + u = x + p |
| 87 | + z *= u |
305 | 88 | end
|
306 |
| - # r += log(z) |
307 |
| - p = y*(s0f+y*(s1f+y*(s2f+y*(s3f+y*(s4f+y*(s5f+y*s6f)))))); |
308 |
| - q = 1.0f0 + y*(r1f+y*(r2f+y*(r3f+y*(r4f+y*(r5f+y*r6f))))); |
309 |
| - r = log(z) + 0.5f0*y+p/q; |
310 |
| - #= 8.0 ≤ x < 2^58 =# |
311 |
| - elseif ix < 0x5c800000 |
312 |
| - t = log(x) |
| 89 | + p = y * evalpoly(y, (-7.7215664089f-2, 2.1498242021f-1, 3.2577878237f-1, 1.4635047317f-1, 2.6642270386f-2, 1.8402845599f-3, 3.1947532989f-5)) |
| 90 | + q = evalpoly(y, (1.0f0, 1.3920053244f0, 7.2193557024f-1, 1.7193385959f-1, 1.8645919859f-2, 7.7794247773f-4, 7.3266842264f-6)) |
| 91 | + r = log(z) + muladd(0.5f0, y, p / q) |
| 92 | + elseif ix < 0x5c800000 #= 8.0 ≤ x < 2^58 =# |
313 | 93 | z = 1.0f0 / x
|
314 | 94 | y = z * z
|
315 |
| - w = w0f+z*(w1f+y*(w2f+y*(w3f+y*(w4f+y*(w5f+y*w6f))))); |
316 |
| - r = (x-0.5f0)*(t-1.0f0)+w; |
| 95 | + w = muladd(z, evalpoly(y, (8.3333335817f-2, -2.7777778450f-3, 7.9365057172f-4, -5.9518753551f-4, 8.3633989561f-4, -1.6309292987f-3)), 4.1893854737f-1) |
| 96 | + r = muladd(x - 0.5f0, log(x) - 1.0f0, w) |
317 | 97 | else
|
318 | 98 | #= 2^58 ≤ x ≤ Inf =#
|
319 |
| - r = x * (log(x) - 1.0f0) |
| 99 | + r = muladd(x, log(x), -x) |
320 | 100 | end
|
321 |
| - if hx < 0 |
| 101 | + if hx < Int32(0) |
322 | 102 | r = nadj - r
|
323 | 103 | end
|
324 |
| - return r |
| 104 | + return r, signgamp |
325 | 105 | end
|
0 commit comments