|
5 | 5 | */
|
6 | 6 |
|
7 | 7 | use super::Float;
|
8 |
| -use super::support::{FpResult, Round, cold_path}; |
| 8 | +use super::support::{CastFrom, FpResult, Int, MinInt, Round, cold_path}; |
9 | 9 |
|
10 | 10 | /// Compute the cube root of the argument.
|
11 | 11 | #[cfg_attr(all(test, assert_no_panic), no_panic::no_panic)]
|
12 | 12 | pub fn cbrt(x: f64) -> f64 {
|
13 | 13 | cbrt_round(x, Round::Nearest).val
|
14 | 14 | }
|
15 | 15 |
|
16 |
| -pub fn cbrt_round(x: f64, round: Round) -> FpResult<f64> { |
17 |
| - const ESCALE: [f64; 3] = [ |
18 |
| - 1.0, |
19 |
| - hf64!("0x1.428a2f98d728bp+0"), /* 2^(1/3) */ |
20 |
| - hf64!("0x1.965fea53d6e3dp+0"), /* 2^(2/3) */ |
21 |
| - ]; |
22 |
| - |
23 |
| - /* the polynomial c0+c1*x+c2*x^2+c3*x^3 approximates x^(1/3) on [1,2] |
24 |
| - with maximal error < 9.2e-5 (attained at x=2) */ |
25 |
| - const C: [f64; 4] = [ |
26 |
| - hf64!("0x1.1b0babccfef9cp-1"), |
27 |
| - hf64!("0x1.2c9a3e94d1da5p-1"), |
28 |
| - hf64!("-0x1.4dc30b1a1ddbap-3"), |
29 |
| - hf64!("0x1.7a8d3e4ec9b07p-6"), |
30 |
| - ]; |
31 |
| - |
32 |
| - let u0: f64 = hf64!("0x1.5555555555555p-2"); |
33 |
| - let u1: f64 = hf64!("0x1.c71c71c71c71cp-3"); |
34 |
| - |
35 |
| - let rsc = [1.0, -1.0, 0.5, -0.5, 0.25, -0.25]; |
36 |
| - |
37 |
| - let off = [hf64!("0x1p-53"), 0.0, 0.0, 0.0]; |
| 16 | +pub fn cbrt_round<F: Float + CbrtHelper>(x: F, round: Round) -> FpResult<F> |
| 17 | +where |
| 18 | + F::Int: CastFrom<u64>, |
| 19 | + F::Int: CastFrom<u32>, |
| 20 | + F::Int: From<u8>, |
| 21 | +{ |
| 22 | + let zero = F::Int::ZERO; |
| 23 | + let one = F::Int::ONE; |
| 24 | + let u0: F = F::U0; |
| 25 | + let u1: F = F::U1; |
| 26 | + let rsc = F::RSC; |
| 27 | + let off = F::OFF; |
38 | 28 |
|
39 | 29 | /* rm=0 for rounding to nearest, and other values for directed roundings */
|
40 |
| - let hx: u64 = x.to_bits(); |
41 |
| - let mut mant: u64 = hx & f64::SIG_MASK; |
42 |
| - let sign: u64 = hx >> 63; |
| 30 | + let hx = x.to_bits(); |
| 31 | + let mut mant: F::Int = hx & F::SIG_MASK; |
| 32 | + let sign: F::Int = hx >> (F::BITS - 1); |
| 33 | + let neg = x.is_sign_negative(); |
43 | 34 |
|
44 |
| - let mut e: u32 = (hx >> f64::SIG_BITS) as u32 & f64::EXP_SAT; |
| 35 | + let mut e: u32 = x.exp(); |
45 | 36 |
|
46 |
| - if ((e + 1) & f64::EXP_SAT) < 2 { |
| 37 | + if ((e + 1) & F::EXP_SAT) < 2 { |
47 | 38 | cold_path();
|
48 | 39 |
|
49 |
| - let ix: u64 = hx & !f64::SIGN_MASK; |
| 40 | + let ix = hx & !F::SIGN_MASK; |
50 | 41 |
|
51 | 42 | /* 0, inf, nan: we return x + x instead of simply x,
|
52 | 43 | to that for x a signaling NaN, it correctly triggers
|
53 | 44 | the invalid exception. */
|
54 |
| - if e == f64::EXP_SAT || ix == 0 { |
| 45 | + if e == F::EXP_SAT || ix == zero { |
55 | 46 | return FpResult::ok(x + x);
|
56 | 47 | }
|
57 | 48 |
|
58 | 49 | let nz = ix.leading_zeros() - 11; /* subnormal */
|
59 | 50 | mant <<= nz;
|
60 |
| - mant &= f64::SIG_MASK; |
| 51 | + mant &= F::SIG_MASK; |
61 | 52 | e = e.wrapping_sub(nz - 1);
|
62 | 53 | }
|
63 | 54 |
|
64 | 55 | e = e.wrapping_add(3072);
|
65 |
| - let cvt1: u64 = mant | (0x3ffu64 << 52); |
66 |
| - let mut cvt5: u64 = cvt1; |
| 56 | + let cvt1 = mant | (F::Int::cast_from(F::EXP_BIAS) << F::SIG_BITS); |
| 57 | + let mut cvt5 = cvt1; |
67 | 58 |
|
68 | 59 | let et: u32 = e / 3;
|
69 | 60 | let it: u32 = e % 3;
|
70 | 61 |
|
71 | 62 | /* 2^(3k+it) <= x < 2^(3k+it+1), with 0 <= it <= 3 */
|
72 |
| - cvt5 += u64::from(it) << f64::SIG_BITS; |
73 |
| - cvt5 |= sign << 63; |
74 |
| - let zz: f64 = f64::from_bits(cvt5); |
| 63 | + cvt5 += F::Int::cast_from(it) << F::SIG_BITS; |
| 64 | + cvt5 |= sign << (F::BITS - 1); |
| 65 | + let zz: F = F::from_bits(cvt5); |
75 | 66 |
|
76 | 67 | /* cbrt(x) = cbrt(zz)*2^(et-1365) where 1 <= zz < 8 */
|
77 |
| - let mut isc: u64 = ESCALE[it as usize].to_bits(); // todo: index |
78 |
| - isc |= sign << 63; |
79 |
| - let cvt2: u64 = isc; |
80 |
| - let z: f64 = f64::from_bits(cvt1); |
| 68 | + let mut isc = F::ESCALE[it as usize].to_bits(); // todo: index |
| 69 | + isc |= sign << (F::BITS - 1); |
| 70 | + let cvt2 = isc; |
| 71 | + let z: F = F::from_bits(cvt1); |
81 | 72 |
|
82 | 73 | /* cbrt(zz) = cbrt(z)*isc, where isc encodes 1, 2^(1/3) or 2^(2/3),
|
83 | 74 | and 1 <= z < 2 */
|
84 |
| - let r: f64 = 1.0 / z; |
85 |
| - let rr: f64 = r * rsc[((it as usize) << 1) | sign as usize]; |
86 |
| - let z2: f64 = z * z; |
87 |
| - let c0: f64 = C[0] + z * C[1]; |
88 |
| - let c2: f64 = C[2] + z * C[3]; |
89 |
| - let mut y: f64 = c0 + z2 * c2; |
90 |
| - let mut y2: f64 = y * y; |
| 75 | + let r: F = F::ONE / z; |
| 76 | + let rr: F = r * rsc[((it as usize) << 1) | neg as usize]; |
| 77 | + let z2: F = z * z; |
| 78 | + let c0: F = F::C[0] + z * F::C[1]; |
| 79 | + let c2: F = F::C[2] + z * F::C[3]; |
| 80 | + let mut y: F = c0 + z2 * c2; |
| 81 | + let mut y2: F = y * y; |
91 | 82 |
|
92 | 83 | /* y is an approximation of z^(1/3) */
|
93 |
| - let mut h: f64 = y2 * (y * r) - 1.0; |
| 84 | + let mut h: F = y2 * (y * r) - F::ONE; |
94 | 85 |
|
95 | 86 | /* h determines the error between y and z^(1/3) */
|
96 | 87 | y -= (h * y) * (u0 - u1 * h);
|
97 | 88 |
|
98 | 89 | /* The correction y -= (h*y)*(u0 - u1*h) corresponds to a cubic variant
|
99 | 90 | of Newton's method, with the function f(y) = 1-z/y^3. */
|
100 |
| - y *= f64::from_bits(cvt2); |
| 91 | + y *= F::from_bits(cvt2); |
101 | 92 |
|
102 | 93 | /* Now y is an approximation of zz^(1/3),
|
103 | 94 | * and rr an approximation of 1/zz. We now perform another iteration of
|
104 | 95 | * Newton-Raphson, this time with a linear approximation only. */
|
105 | 96 | y2 = y * y;
|
106 |
| - let mut y2l: f64 = fmaf64(y, y, -y2); |
| 97 | + let mut y2l: F = y.fma(y, -y2); |
107 | 98 |
|
108 | 99 | /* y2 + y2l = y^2 exactly */
|
109 |
| - let mut y3: f64 = y2 * y; |
110 |
| - let mut y3l: f64 = fmaf64(y, y2, -y3) + y * y2l; |
| 100 | + let mut y3: F = y2 * y; |
| 101 | + let mut y3l: F = y.fma(y2, -y3) + y * y2l; |
111 | 102 |
|
112 | 103 | /* y3 + y3l approximates y^3 with about 106 bits of accuracy */
|
113 | 104 | h = ((y3 - zz) + y3l) * rr;
|
114 |
| - let mut dy: f64 = h * (y * u0); |
| 105 | + let mut dy: F = h * (y * u0); |
115 | 106 |
|
116 | 107 | /* the approximation of zz^(1/3) is y - dy */
|
117 |
| - let mut y1: f64 = y - dy; |
| 108 | + let mut y1: F = y - dy; |
118 | 109 | dy = (y - y1) - dy;
|
119 | 110 |
|
120 | 111 | /* the approximation of zz^(1/3) is now y1 + dy, where |dy| < 1/2 ulp(y)
|
121 | 112 | * (for rounding to nearest) */
|
122 |
| - let mut ady: f64 = dy.abs(); |
| 113 | + let mut ady: F = dy.abs(); |
123 | 114 |
|
124 | 115 | /* For directed roundings, ady0 is tiny when dy is tiny, or ady0 is near
|
125 | 116 | * from ulp(1);
|
126 | 117 | * for rounding to nearest, ady0 is tiny when dy is near from 1/2 ulp(1),
|
127 | 118 | * or from 3/2 ulp(1). */
|
128 |
| - let mut ady0: f64 = (ady - off[round as usize]).abs(); |
129 |
| - let mut ady1: f64 = (ady - (hf64!("0x1p-52") + off[round as usize])).abs(); |
| 119 | + let mut ady0: F = (ady - off[round as usize]).abs(); |
| 120 | + let mut ady1: F = (ady - (F::TWO_POW_NEG_SIG_BITS + off[round as usize])).abs(); |
130 | 121 |
|
131 |
| - if ady0 < hf64!("0x1p-75") || ady1 < hf64!("0x1p-75") { |
| 122 | + let magic = F::from_parts(false, (-75 + F::EXP_BIAS as i32) as u32, zero); |
| 123 | + |
| 124 | + if ady0 < magic || ady1 < magic { |
132 | 125 | cold_path();
|
133 | 126 |
|
134 | 127 | y2 = y1 * y1;
|
135 |
| - y2l = fmaf64(y1, y1, -y2); |
| 128 | + y2l = y1.fma(y1, -y2); |
136 | 129 | y3 = y2 * y1;
|
137 |
| - y3l = fmaf64(y1, y2, -y3) + y1 * y2l; |
| 130 | + y3l = y1.fma(y2, -y3) + y1 * y2l; |
138 | 131 | h = ((y3 - zz) + y3l) * rr;
|
139 | 132 | dy = h * (y1 * u0);
|
140 | 133 | y = y1 - dy;
|
141 | 134 | dy = (y1 - y) - dy;
|
142 | 135 | y1 = y;
|
143 | 136 | ady = dy.abs();
|
144 | 137 | ady0 = (ady - off[round as usize]).abs();
|
145 |
| - ady1 = (ady - (hf64!("0x1p-52") + off[round as usize])).abs(); |
| 138 | + ady1 = (ady - (F::TWO_POW_NEG_SIG_BITS + off[round as usize])).abs(); |
146 | 139 |
|
147 |
| - if ady0 < hf64!("0x1p-98") || ady1 < hf64!("0x1p-98") { |
| 140 | + let magic2 = F::from_parts(false, (-98 + F::EXP_BIAS as i32) as u32, zero); |
| 141 | + if ady0 < magic2 || ady1 < magic2 { |
148 | 142 | cold_path();
|
149 |
| - let azz: f64 = zz.abs(); |
| 143 | + let azz: F = zz.abs(); |
150 | 144 |
|
151 | 145 | // ~ 0x1.79d15d0e8d59b80000000000000ffc3dp+0
|
152 |
| - if azz == hf64!("0x1.9b78223aa307cp+1") { |
153 |
| - y1 = hf64!("0x1.79d15d0e8d59cp+0").copysign(zz); |
| 146 | + if azz == F::AZMAGIC1 { |
| 147 | + y1 = F::AZMAGIC2.copysign(zz); |
154 | 148 | }
|
155 | 149 |
|
156 | 150 | // ~ 0x1.de87aa837820e80000000000001c0f08p+0
|
157 |
| - if azz == hf64!("0x1.a202bfc89ddffp+2") { |
158 |
| - y1 = hf64!("0x1.de87aa837820fp+0").copysign(zz); |
| 151 | + if azz == F::AZMAGIC3 { |
| 152 | + y1 = F::AZMAGIC4.copysign(zz); |
159 | 153 | }
|
160 | 154 |
|
161 | 155 | if round != Round::Nearest {
|
162 |
| - let wlist = [ |
163 |
| - (hf64!("0x1.3a9ccd7f022dbp+0"), hf64!("0x1.1236160ba9b93p+0")), // ~ 0x1.1236160ba9b930000000000001e7e8fap+0 |
164 |
| - (hf64!("0x1.7845d2faac6fep+0"), hf64!("0x1.23115e657e49cp+0")), // ~ 0x1.23115e657e49c0000000000001d7a799p+0 |
165 |
| - (hf64!("0x1.d1ef81cbbbe71p+0"), hf64!("0x1.388fb44cdcf5ap+0")), // ~ 0x1.388fb44cdcf5a0000000000002202c55p+0 |
166 |
| - (hf64!("0x1.0a2014f62987cp+1"), hf64!("0x1.46bcbf47dc1e8p+0")), // ~ 0x1.46bcbf47dc1e8000000000000303aa2dp+0 |
167 |
| - (hf64!("0x1.fe18a044a5501p+1"), hf64!("0x1.95decfec9c904p+0")), // ~ 0x1.95decfec9c9040000000000000159e8ep+0 |
168 |
| - (hf64!("0x1.a6bb8c803147bp+2"), hf64!("0x1.e05335a6401dep+0")), // ~ 0x1.e05335a6401de00000000000027ca017p+0 |
169 |
| - (hf64!("0x1.ac8538a031cbdp+2"), hf64!("0x1.e281d87098de8p+0")), // ~ 0x1.e281d87098de80000000000000ee9314p+0 |
170 |
| - ]; |
171 |
| - |
172 |
| - for (a, b) in wlist { |
| 156 | + for (a, b) in F::WLIST { |
173 | 157 | if azz == a {
|
174 |
| - let tmp = if round as u64 + sign == 2 { hf64!("0x1p-52") } else { 0.0 }; |
| 158 | + let tmp = if F::Int::from(round as u8) + sign == F::Int::cast_from(2) { |
| 159 | + F::TWO_POW_NEG_SIG_BITS |
| 160 | + } else { |
| 161 | + F::ZERO |
| 162 | + }; |
175 | 163 | y1 = (b + tmp).copysign(zz);
|
176 | 164 | }
|
177 | 165 | }
|
178 | 166 | }
|
179 | 167 | }
|
180 | 168 | }
|
181 | 169 |
|
182 |
| - let mut cvt3: u64 = y1.to_bits(); |
183 |
| - cvt3 = cvt3.wrapping_add(((et.wrapping_sub(342).wrapping_sub(1023)) as u64) << 52); |
184 |
| - let m0: u64 = cvt3 << 30; |
| 170 | + let mut cvt3 = y1.to_bits(); |
| 171 | + cvt3 = cvt3.wrapping_add((F::Int::cast_from(et.wrapping_sub(342).wrapping_sub(1023))) << 52); |
| 172 | + let m0 = cvt3 << 30; |
185 | 173 | let m1 = m0 >> 63;
|
186 | 174 |
|
187 |
| - if (m0 ^ m1) <= (1u64 << 30) { |
| 175 | + if (m0 ^ m1) <= (one << 30) { |
188 | 176 | cold_path();
|
189 | 177 |
|
190 |
| - let mut cvt4: u64 = y1.to_bits(); |
191 |
| - cvt4 = (cvt4 + (164 << 15)) & 0xffffffffffff0000u64; |
| 178 | + let mut cvt4 = y1.to_bits(); |
| 179 | + cvt4 = (cvt4 + (F::Int::cast_from(164) << 15)) & F::Int::cast_from(0xffffffffffff0000u64); |
192 | 180 |
|
193 |
| - if ((f64::from_bits(cvt4) - y1) - dy).abs() < hf64!("0x1p-60") || (zz).abs() == 1.0 { |
194 |
| - cvt3 = (cvt3 + (1u64 << 15)) & 0xffffffffffff0000u64; |
| 181 | + let magic3 = F::from_parts(false, (-60 + F::EXP_BIAS as i32) as u32, zero); |
| 182 | + if ((F::from_bits(cvt4) - y1) - dy).abs() < magic3 || (zz).abs() == F::ONE { |
| 183 | + cvt3 = (cvt3 + (one << 15)) & F::Int::cast_from(0xffffffffffff0000u64); |
195 | 184 | }
|
196 | 185 | }
|
197 | 186 |
|
198 |
| - FpResult::ok(f64::from_bits(cvt3)) |
| 187 | + FpResult::ok(F::from_bits(cvt3)) |
199 | 188 | }
|
200 | 189 |
|
201 |
| -fn fmaf64(x: f64, y: f64, z: f64) -> f64 { |
202 |
| - #[cfg(intrinsics_enabled)] |
203 |
| - { |
204 |
| - return unsafe { core::intrinsics::fmaf64(x, y, z) }; |
205 |
| - } |
| 190 | +trait CbrtHelper: Float { |
| 191 | + /// 2^(n / 3) for n = [0, 1, 2] |
| 192 | + const ESCALE: [Self; 3]; |
| 193 | + /// The polynomial `c0+c1*x+c2*x^2+c3*x^3` approximates `x^(1/3)` on `[1,2]` |
| 194 | + /// with maximal error < 9.2e-5 (attained at x=2) |
| 195 | + const C: [Self; 4]; |
| 196 | + const U0: Self; |
| 197 | + const U1: Self; |
| 198 | + const RSC: [Self; 6]; |
| 199 | + const OFF: [Self; 4]; |
| 200 | + const WLIST: [(Self, Self); 7]; |
| 201 | + const AZMAGIC1: Self; |
| 202 | + const AZMAGIC2: Self; |
| 203 | + const AZMAGIC3: Self; |
| 204 | + const AZMAGIC4: Self; |
| 205 | + fn fma(self, y: Self, z: Self) -> Self; |
| 206 | +} |
206 | 207 |
|
207 |
| - #[cfg(not(intrinsics_enabled))] |
208 |
| - { |
209 |
| - return super::fma(x, y, z); |
| 208 | +impl CbrtHelper for f64 { |
| 209 | + const ESCALE: [Self; 3] = [ |
| 210 | + 1.0, |
| 211 | + hf64!("0x1.428a2f98d728bp+0"), /* 2^(1/3) */ |
| 212 | + hf64!("0x1.965fea53d6e3dp+0"), /* 2^(2/3) */ |
| 213 | + ]; |
| 214 | + |
| 215 | + const C: [Self; 4] = [ |
| 216 | + hf64!("0x1.1b0babccfef9cp-1"), |
| 217 | + hf64!("0x1.2c9a3e94d1da5p-1"), |
| 218 | + hf64!("-0x1.4dc30b1a1ddbap-3"), |
| 219 | + hf64!("0x1.7a8d3e4ec9b07p-6"), |
| 220 | + ]; |
| 221 | + |
| 222 | + const U0: Self = hf64!("0x1.5555555555555p-2"); |
| 223 | + |
| 224 | + const U1: Self = hf64!("0x1.c71c71c71c71cp-3"); |
| 225 | + |
| 226 | + const RSC: [Self; 6] = [1.0, -1.0, 0.5, -0.5, 0.25, -0.25]; |
| 227 | + |
| 228 | + const OFF: [Self; 4] = [hf64!("0x1p-53"), 0.0, 0.0, 0.0]; |
| 229 | + |
| 230 | + const WLIST: [(Self, Self); 7] = [ |
| 231 | + (hf64!("0x1.3a9ccd7f022dbp+0"), hf64!("0x1.1236160ba9b93p+0")), // ~ 0x1.1236160ba9b930000000000001e7e8fap+0 |
| 232 | + (hf64!("0x1.7845d2faac6fep+0"), hf64!("0x1.23115e657e49cp+0")), // ~ 0x1.23115e657e49c0000000000001d7a799p+0 |
| 233 | + (hf64!("0x1.d1ef81cbbbe71p+0"), hf64!("0x1.388fb44cdcf5ap+0")), // ~ 0x1.388fb44cdcf5a0000000000002202c55p+0 |
| 234 | + (hf64!("0x1.0a2014f62987cp+1"), hf64!("0x1.46bcbf47dc1e8p+0")), // ~ 0x1.46bcbf47dc1e8000000000000303aa2dp+0 |
| 235 | + (hf64!("0x1.fe18a044a5501p+1"), hf64!("0x1.95decfec9c904p+0")), // ~ 0x1.95decfec9c9040000000000000159e8ep+0 |
| 236 | + (hf64!("0x1.a6bb8c803147bp+2"), hf64!("0x1.e05335a6401dep+0")), // ~ 0x1.e05335a6401de00000000000027ca017p+0 |
| 237 | + (hf64!("0x1.ac8538a031cbdp+2"), hf64!("0x1.e281d87098de8p+0")), // ~ 0x1.e281d87098de80000000000000ee9314p+0 |
| 238 | + ]; |
| 239 | + |
| 240 | + const AZMAGIC1: Self = hf64!("0x1.9b78223aa307cp+1"); |
| 241 | + const AZMAGIC2: Self = hf64!("0x1.79d15d0e8d59cp+0"); |
| 242 | + const AZMAGIC3: Self = hf64!("0x1.a202bfc89ddffp+2"); |
| 243 | + const AZMAGIC4: Self = hf64!("0x1.de87aa837820fp+0"); |
| 244 | + |
| 245 | + fn fma(self, y: Self, z: Self) -> Self { |
| 246 | + #[cfg(intrinsics_enabled)] |
| 247 | + { |
| 248 | + return unsafe { core::intrinsics::fmaf64(self, y, z) }; |
| 249 | + } |
| 250 | + |
| 251 | + #[cfg(not(intrinsics_enabled))] |
| 252 | + { |
| 253 | + return super::fma(self, y, z); |
| 254 | + } |
210 | 255 | }
|
211 | 256 | }
|
212 | 257 |
|
|
0 commit comments