Skip to content

Commit 417deed

Browse files
Update Float32 implementation
1 parent f27ecab commit 417deed

File tree

1 file changed

+54
-274
lines changed

1 file changed

+54
-274
lines changed

src/e_lgammaf_r.jl

Lines changed: 54 additions & 274 deletions
Original file line numberDiff line numberDiff line change
@@ -15,311 +15,91 @@
1515
*/
1616
=#
1717

18-
const a0f = 7.7215664089f-02 #= 0x3d9e233f =#
19-
const a1f = 3.2246702909f-01 #= 0x3ea51a66 =#
20-
const a2f = 6.7352302372f-02 #= 0x3d89f001 =#
21-
const a3f = 2.0580807701f-02 #= 0x3ca89915 =#
22-
const a4f = 7.3855509982f-03 #= 0x3bf2027e =#
23-
const a5f = 2.8905137442f-03 #= 0x3b3d6ec6 =#
24-
const a6f = 1.1927076848f-03 #= 0x3a9c54a1 =#
25-
const a7f = 5.1006977446f-04 #= 0x3a05b634 =#
26-
const a8f = 2.2086278477f-04 #= 0x39679767 =#
27-
const a9f = 1.0801156895f-04 #= 0x38e28445 =#
28-
const a10f = 2.5214456400f-05 #= 0x37d383a2 =#
29-
const a11f = 4.4864096708f-05 #= 0x383c2c75 =#
30-
const tcf = 1.4616321325f+00 #= 0x3fbb16c3 =#
31-
const tff = -1.2148628384f-01 #= 0xbdf8cdcd =#
32-
# #= tt = -(tail of tf) =#
33-
const ttf = 6.6971006518f-09 #= 0x31e61c52 =#
34-
const t0f = 4.8383611441f-01 #= 0x3ef7b95e =#
35-
const t1f = -1.4758771658f-01 #= 0xbe17213c =#
36-
const t2f = 6.4624942839f-02 #= 0x3d845a15 =#
37-
const t3f = -3.2788541168f-02 #= 0xbd064d47 =#
38-
const t4f = 1.7970675603f-02 #= 0x3c93373d =#
39-
const t5f = -1.0314224288f-02 #= 0xbc28fcfe =#
40-
const t6f = 6.1005386524f-03 #= 0x3bc7e707 =#
41-
const t7f = -3.6845202558f-03 #= 0xbb7177fe =#
42-
const t8f = 2.2596477065f-03 #= 0x3b141699 =#
43-
const t9f = -1.4034647029f-03 #= 0xbab7f476 =#
44-
const t10f = 8.8108185446f-04 #= 0x3a66f867 =#
45-
const t11f = -5.3859531181f-04 #= 0xba0d3085 =#
46-
const t12f = 3.1563205994f-04 #= 0x39a57b6b =#
47-
const t13f = -3.1275415677f-04 #= 0xb9a3f927 =#
48-
const t14f = 3.3552918467f-04 #= 0x39afe9f7 =#
49-
const u0f = -7.7215664089f-02 #= 0xbd9e233f =#
50-
const u1f = 6.3282704353f-01 #= 0x3f2200f4 =#
51-
const u2f = 1.4549225569f+00 #= 0x3fba3ae7 =#
52-
const u3f = 9.7771751881f-01 #= 0x3f7a4bb2 =#
53-
const u4f = 2.2896373272f-01 #= 0x3e6a7578 =#
54-
const u5f = 1.3381091878f-02 #= 0x3c5b3c5e =#
55-
const v1f = 2.4559779167f+00 #= 0x401d2ebe =#
56-
const v2f = 2.1284897327f+00 #= 0x4008392d =#
57-
const v3f = 7.6928514242f-01 #= 0x3f44efdf =#
58-
const v4f = 1.0422264785f-01 #= 0x3dd572af =#
59-
const v5f = 3.2170924824f-03 #= 0x3b52d5db =#
60-
const s0f = -7.7215664089f-02 #= 0xbd9e233f =#
61-
const s1f = 2.1498242021f-01 #= 0x3e5c245a =#
62-
const s2f = 3.2577878237f-01 #= 0x3ea6cc7a =#
63-
const s3f = 1.4635047317f-01 #= 0x3e15dce6 =#
64-
const s4f = 2.6642270386f-02 #= 0x3cda40e4 =#
65-
const s5f = 1.8402845599f-03 #= 0x3af135b4 =#
66-
const s6f = 3.1947532989f-05 #= 0x3805ff67 =#
67-
const r1f = 1.3920053244f+00 #= 0x3fb22d3b =#
68-
const r2f = 7.2193557024f-01 #= 0x3f38d0c5 =#
69-
const r3f = 1.7193385959f-01 #= 0x3e300f6e =#
70-
const r4f = 1.8645919859f-02 #= 0x3c98bf54 =#
71-
const r5f = 7.7794247773f-04 #= 0x3a4beed6 =#
72-
const r6f = 7.3266842264f-06 #= 0x36f5d7bd =#
73-
const w0f = 4.1893854737f-01 #= 0x3ed67f1d =#
74-
const w1f = 8.3333335817f-02 #= 0x3daaaaab =#
75-
const w2f = -2.7777778450f-03 #= 0xbb360b61 =#
76-
const w3f = 7.9365057172f-04 #= 0x3a500cfd =#
77-
const w4f = -5.9518753551f-04 #= 0xba1c065c =#
78-
const w5f = 8.3633989561f-04 #= 0x3a5b3dd2 =#
79-
const w6f = -1.6309292987f-03 #= 0xbad5c4e8 =#
80-
8118
# Matches OpenLibm behavior exactly, including return of sign
8219
function _lgammaf_r(x::Float32)
8320
hx = reinterpret(Int32, x)
8421

8522
#= purge off +-inf, NaN, +-0, tiny and negative arguments =#
8623
signgamp = Int32(1)
87-
ix = signed(hx & 0x7fffffff)
88-
ix 0x7f800000 && return x * x, signgamp
89-
ix == 0 && return 1.0f0 / 0.0f0, signgamp
24+
ix = hx & 0x7fffffff
25+
ix 0x7f800000 && return Inf32, signgamp
26+
ix == 0x00000000 && return Inf32, signgamp
9027
if ix < 0x35000000 #= |x|<2**-21, return -log(|x|) =#
91-
if hx < 0
28+
if hx < Int32(0)
9229
signgamp = Int32(-1)
9330
return -log(-x), signgamp
9431
else
9532
return -log(x), signgamp
9633
end
9734
end
98-
if hx < 0
99-
ix 0x4b000000 && return 1.0f0 / 0.0f0, signgamp #= |x|>=2**23, must be -integer =#
35+
if hx < Int32(0)
36+
# ix ≥ 0x4b000000 && return Inf32, signgamp #= |x|>=2**23, must be -integer =#
10037
t = sinpi(x)
101-
t == 0.0f0 && return 1.0f0 / 0.0f0, signgamp #= -integer =#
38+
t == 0.0f0 && return Inf32, signgamp #= -integer =#
10239
nadj = log/ abs(t * x))
10340
if t < 0.0f0; signgamp = Int32(-1); end
10441
x = -x
10542
end
10643

107-
#= purge off 1 and 2 =#
108-
if ix == 0x3f800000 || ix == 0x40000000
109-
r = 0.0f0
110-
#= for x < 2.0 =#
111-
elseif ix < 0x40000000
112-
if ix 0x3f666666 #= lgamma(x) = lgamma(x+1)-log(x) =#
113-
r = -log(x)
114-
if ix 0x3f3b4a20
115-
y = 1.0f0 - x
116-
i = Int8(0)
117-
elseif ix 0x3e6d3308
118-
y = x - (tcf - 1.0f0)
119-
i = Int8(1)
120-
else
121-
y = x
122-
i = Int8(2)
123-
end
124-
else
44+
if ix 0x40000000 #= for 1.0 ≤ x ≤ 2.0 =#
45+
i = round(x, RoundToZero)
46+
f = x - i
47+
if f == 0.0f0 #= purge off 1 and 2 =#
48+
return 0.0f0, signgamp
49+
elseif i == 1.0f0
12550
r = 0.0f0
126-
if ix 0x3fdda618 #= [1.7316,2] =#
127-
y = 2.0f0 - x
128-
i = Int8(0)
129-
elseif ix 0x3f9da620 #= [1.23,1.73] =#
130-
y = x - tcf
131-
i = Int8(1)
132-
else
133-
y = x - 1.0f0
134-
i = Int8(2)
135-
end
136-
end
137-
if i == Int8(0)
138-
z = y*y;
139-
p1 = a0f+z*(a2f+z*(a4f+z*(a6f+z*(a8f+z*a10f))));
140-
p2 = z*(a1f+z*(a3f+z*(a5f+z*(a7f+z*(a9f+z*a11f)))));
141-
p = y*p1+p2;
142-
r += (p-0.5f0*y);
143-
elseif i == Int8(1)
144-
z = y*y;
145-
w = z*y;
146-
p1 = t0f+w*(t3f+w*(t6f+w*(t9f +w*t12f))); #= parallel comp =#
147-
p2 = t1f+w*(t4f+w*(t7f+w*(t10f+w*t13f)));
148-
p3 = t2f+w*(t5f+w*(t8f+w*(t11f+w*t14f)));
149-
p = z*p1-(ttf-w*(p2+y*p3));
150-
r += (tff + p)
151-
elseif i == Int8(2)
152-
p1 = y*(u0f+y*(u1f+y*(u2f+y*(u3f+y*(u4f+y*u5f)))));
153-
p2 = 1.0f0+y*(v1f+y*(v2f+y*(v3f+y*(v4f+y*v5f))));
154-
r += (-0.5f0*y + p1/p2);
155-
end
156-
elseif ix < 0x41000000 #= x < 8.0 =#
157-
i = Base.unsafe_trunc(Int8, x)
158-
y = x - Float32(i)
159-
# If performed here, performance is 2x worse; hence, move it below.
160-
# p = y*(s0f+y*(s1f+y*(s2f+y*(s3f+y*(s4f+y*(s5f+y*s6f))))));
161-
# q = 1.0f0 + y*(r1f+y*(r2f+y*(r3f+y*(r4f+y*(r5f+y*r6f)))));
162-
# r = 0.5f0*y+p/q;
163-
z = 1.0f0; #= lgamma(1+s) = log(s) + lgamma(s) =#
164-
if i == Int8(7)
165-
z *= (y + 6.0f0)
166-
@goto case6
167-
elseif i == Int8(6)
168-
@label case6
169-
z *= (y + 5.0f0)
170-
@goto case5
171-
elseif i == Int8(5)
172-
@label case5
173-
z *= (y + 4.0f0)
174-
@goto case4
175-
elseif i == Int8(4)
176-
@label case4
177-
z *= (y + 3.0f0)
178-
@goto case3
179-
elseif i == Int8(3)
180-
@label case3
181-
z *= (y + 2.0f0)
182-
end
183-
# r += log(z)
184-
p = y*(s0f+y*(s1f+y*(s2f+y*(s3f+y*(s4f+y*(s5f+y*s6f))))));
185-
q = 1.0f0 + y*(r1f+y*(r2f+y*(r3f+y*(r4f+y*(r5f+y*r6f)))));
186-
r = log(z) + 0.5f0*y+p/q;
187-
#= 8.0 ≤ x < 2^58 =#
188-
elseif ix < 0x5c800000
189-
t = log(x)
190-
z = 1.0f0 / x
191-
y = z * z
192-
w = w0f+z*(w1f+y*(w2f+y*(w3f+y*(w4f+y*(w5f+y*w6f)))));
193-
r = (x-0.5f0)*(t-1.0f0)+w;
194-
else
195-
#= 2^58 ≤ x ≤ Inf =#
196-
r = x * (log(x) - 1.0f0)
197-
end
198-
if hx < 0
199-
r = nadj - r
200-
end
201-
return r, signgamp
202-
end
203-
204-
# Deviates from OpenLibm: throws instead of returning negative sign; approximately 25% faster
205-
# when sign is not needed in subsequent computations.
206-
function _loggammaf_r(x::Float32)
207-
hx = reinterpret(Int32, x)
208-
209-
#= purge off +-inf, NaN, +-0, tiny and negative arguments =#
210-
ix = signed(hx & 0x7fffffff)
211-
ix 0x7f800000 && return x * x
212-
ix == 0 && return 1.0f0 / 0.0f0
213-
if ix < 0x35000000 #= |x|<2**-21, return -log(|x|) =#
214-
if hx < 0
215-
# return -log(-x)
216-
throw(DomainError(x, "`gamma(x)` must be non-negative"))
51+
c = 1.0f0
21752
else
218-
return -log(x)
219-
end
220-
end
221-
if hx < 0
222-
ix 0x4b000000 && return 1.0f0 / 0.0f0 #= |x|>=2**23, must be -integer =#
223-
t = sinpi(x)
224-
t == 0.0f0 && return 1.0f0 / 0.0f0 #= -integer =#
225-
nadj = log/ abs(t * x))
226-
t < 0.0f0 && throw(DomainError(x, "`gamma(x)` must be non-negative"))
227-
x = -x
228-
end
229-
230-
#= purge off 1 and 2 =#
231-
if ix == 0x3f800000 || ix == 0x40000000
232-
r = 0.0f0
233-
#= for x < 2.0 =#
234-
elseif ix < 0x40000000
235-
if ix 0x3f666666 #= lgamma(x) = lgamma(x+1)-log(x) =#
23653
r = -log(x)
237-
if ix 0x3f3b4a20
238-
y = 1.0f0 - x
239-
i = Int8(0)
240-
elseif ix 0x3e6d3308
241-
y = x - (tcf - 1.0f0)
242-
i = Int8(1)
243-
else
244-
y = x
245-
i = Int8(2)
246-
end
247-
else
248-
r = 0.0f0
249-
if ix 0x3fdda618 #= [1.7316,2] =#
250-
y = 2.0f0 - x
251-
i = Int8(0)
252-
elseif ix 0x3f9da620 #= [1.23,1.73] =#
253-
y = x - tcf
254-
i = Int8(1)
255-
else
256-
y = x - 1.0f0
257-
i = Int8(2)
258-
end
54+
c = 0.0f0
25955
end
260-
if i == Int8(0)
261-
z = y*y;
262-
p1 = a0f+z*(a2f+z*(a4f+z*(a6f+z*(a8f+z*a10f))));
263-
p2 = z*(a1f+z*(a3f+z*(a5f+z*(a7f+z*(a9f+z*a11f)))));
264-
p = y*p1+p2;
265-
r += (p-0.5f0*y);
266-
elseif i == Int8(1)
267-
z = y*y;
268-
w = z*y;
269-
p1 = t0f+w*(t3f+w*(t6f+w*(t9f +w*t12f))); #= parallel comp =#
270-
p2 = t1f+w*(t4f+w*(t7f+w*(t10f+w*t13f)));
271-
p3 = t2f+w*(t5f+w*(t8f+w*(t11f+w*t14f)));
272-
p = z*p1-(ttf-w*(p2+y*p3));
273-
r += (tff + p)
274-
elseif i == Int8(2)
275-
p1 = y*(u0f+y*(u1f+y*(u2f+y*(u3f+y*(u4f+y*u5f)))));
276-
p2 = 1.0f0+y*(v1f+y*(v2f+y*(v3f+y*(v4f+y*v5f))));
277-
r += (-0.5f0*y + p1/p2);
56+
if f 0.7315998f0
57+
y = 1.0f0 + c - x
58+
z = y * y
59+
p1 = evalpoly(z, (7.7215664089f-2, 6.7352302372f-2, 7.3855509982f-3, 1.1927076848f-3, 2.2086278477f-4, 2.5214456400f-5))
60+
p2 = evalpoly(z, (3.2246702909f-1, 2.0580807701f-2, 2.8905137442f-3, 5.1006977446f-4, 1.0801156895f-4, 4.4864096708f-5))
61+
p = muladd(p1, y, p2)
62+
r += muladd(y, -0.5f0, p)
63+
elseif f 0.23163998f0 # or, the lb? 0.2316322f0
64+
y = x - 0.46163213f0 - c
65+
z = y * y
66+
w = z * y
67+
p1 = evalpoly(w, (4.8383611441f-1, -3.2788541168f-2, 6.1005386524f-3, -1.4034647029f-3, 3.1563205994f-4))
68+
p2 = evalpoly(w, (-1.4758771658f-1, 1.7970675603f-2, -3.6845202558f-3, 8.8108185446f-4, -3.1275415677f-4))
69+
p3 = evalpoly(w, (6.4624942839f-2, -1.0314224288f-2, 2.2596477065f-3, -5.3859531181f-4, 3.3552918467f-4))
70+
p = muladd(z, p1, -muladd(w, -muladd(p3, y, p2), 6.6971006518f-9))
71+
r += p - 1.2148628384f-1
72+
else
73+
y = x - c
74+
p1 = y * evalpoly(y, (-7.7215664089f-2, 6.3282704353f-1, 1.4549225569f0, 9.7771751881f-1, 2.2896373272f-1, 1.3381091878f-2))
75+
p2 = evalpoly(y, (1.0f0, 2.4559779167f0, 2.1284897327f0, 7.6928514242f-1, 1.0422264785f-1, 3.2170924824f-3))
76+
r += muladd(y, -0.5f0, p1 / p2)
27877
end
27978
elseif ix < 0x41000000 #= x < 8.0 =#
280-
i = Base.unsafe_trunc(Int8, x)
281-
y = x - Float32(i)
282-
# If performed here, performance is 2x worse; hence, move it below.
283-
# p = y*(s0f+y*(s1f+y*(s2f+y*(s3f+y*(s4f+y*(s5f+y*s6f))))));
284-
# q = 1.0f0 + y*(r1f+y*(r2f+y*(r3f+y*(r4f+y*(r5f+y*r6f)))));
285-
# r = 0.5f0*y+p/q;
286-
z = 1.0f0; #= lgamma(1+s) = log(s) + lgamma(s) =#
287-
if i == 7
288-
z *= (y + 6.0f0)
289-
@goto case6
290-
elseif i == 6
291-
@label case6
292-
z *= (y + 5.0f0)
293-
@goto case5
294-
elseif i == 5
295-
@label case5
296-
z *= (y + 4.0f0)
297-
@goto case4
298-
elseif i == 4
299-
@label case4
300-
z *= (y + 3.0f0)
301-
@goto case3
302-
elseif i == 3
303-
@label case3
304-
z *= (y + 2.0f0)
79+
i = round(x, RoundToZero)
80+
y = x - i
81+
z = 1.0f0
82+
p = 0.0f0
83+
u = x
84+
while u 3.0f0
85+
p -= 1.0f0
86+
u = x + p
87+
z *= u
30588
end
306-
# r += log(z)
307-
p = y*(s0f+y*(s1f+y*(s2f+y*(s3f+y*(s4f+y*(s5f+y*s6f))))));
308-
q = 1.0f0 + y*(r1f+y*(r2f+y*(r3f+y*(r4f+y*(r5f+y*r6f)))));
309-
r = log(z) + 0.5f0*y+p/q;
310-
#= 8.0 ≤ x < 2^58 =#
311-
elseif ix < 0x5c800000
312-
t = log(x)
89+
p = y * evalpoly(y, (-7.7215664089f-2, 2.1498242021f-1, 3.2577878237f-1, 1.4635047317f-1, 2.6642270386f-2, 1.8402845599f-3, 3.1947532989f-5))
90+
q = evalpoly(y, (1.0f0, 1.3920053244f0, 7.2193557024f-1, 1.7193385959f-1, 1.8645919859f-2, 7.7794247773f-4, 7.3266842264f-6))
91+
r = log(z) + muladd(0.5f0, y, p / q)
92+
elseif ix < 0x5c800000 #= 8.0 ≤ x < 2^58 =#
31393
z = 1.0f0 / x
31494
y = z * z
315-
w = w0f+z*(w1f+y*(w2f+y*(w3f+y*(w4f+y*(w5f+y*w6f)))));
316-
r = (x-0.5f0)*(t-1.0f0)+w;
95+
w = muladd(z, evalpoly(y, (8.3333335817f-2, -2.7777778450f-3, 7.9365057172f-4, -5.9518753551f-4, 8.3633989561f-4, -1.6309292987f-3)), 4.1893854737f-1)
96+
r = muladd(x - 0.5f0, log(x) - 1.0f0, w)
31797
else
31898
#= 2^58 ≤ x ≤ Inf =#
319-
r = x * (log(x) - 1.0f0)
99+
r = muladd(x, log(x), -x)
320100
end
321-
if hx < 0
101+
if hx < Int32(0)
322102
r = nadj - r
323103
end
324-
return r
104+
return r, signgamp
325105
end

0 commit comments

Comments
 (0)