|
1 |
| -#![allow(unused)] |
2 |
| -#![allow(clippy::all)] |
3 |
| - |
4 |
| -use super::super::support::Hexf; |
5 | 1 | use super::super::{CastFrom, CastInto, Float, IntTy, MinInt};
|
6 | 2 |
|
7 |
| -#[cfg(not(optimizations_enabled))] |
8 |
| -extern crate std; |
9 |
| -#[cfg(not(optimizations_enabled))] |
10 |
| -use std::dbg; |
11 |
| - |
12 |
| -#[cfg(optimizations_enabled)] |
13 |
| -macro_rules! dbg { |
14 |
| - ($($tt:tt)*) => {}; |
15 |
| -} |
16 |
| - |
17 | 3 | /// Scale the exponent.
|
18 | 4 | ///
|
19 | 5 | /// From N3220:
|
|
36 | 22 | F::Int: CastFrom<i32>,
|
37 | 23 | F::Int: CastFrom<u32>,
|
38 | 24 | {
|
39 |
| - dbg!(); |
40 | 25 | let zero = IntTy::<F>::ZERO;
|
41 | 26 |
|
42 | 27 | // Bits including the implicit bit
|
|
55 | 40 | // 2 ^ sig_total_bits, representation of what can be accounted for with subnormals
|
56 | 41 | let f_exp_subnorm = F::from_parts(false, sig_total_bits + F::EXP_BIAS, zero);
|
57 | 42 |
|
58 |
| - dbg!(exp_max, exp_min, sig_total_bits, sig_total_bits + F::EXP_BIAS); |
59 |
| - dbg!(Hexf(f_exp_max), Hexf(f_exp_min), Hexf(f_exp_subnorm)); |
60 |
| - dbg!(Hexf(x), n); |
61 |
| - |
62 | 43 | // The goal is to multiply `x` by a scale factor that applies `n`. However, there are cases
|
63 | 44 | // where `2^n` is not representable by `F` but the result should be, e.g. `x = 2^Emin` with
|
64 | 45 | // `n = -EMin + 2`. To get around this, reduce the magnitude of the final scale operation by
|
@@ -86,56 +67,62 @@ where
|
86 | 67 |
|
87 | 68 | let mul = f_exp_min * f_exp_subnorm;
|
88 | 69 | let add = -exp_min - sig_total_bits as i32;
|
89 |
| - dbg!(Hexf(mul), add); |
| 70 | + |
| 71 | + // Worse case negative `n`: `x` is the maximum positive value, the result is `F::MIN`. |
| 72 | + // This can be reached by three scaling multiplications (two here and one final). |
| 73 | + debug_assert!(-exp_min + F::SIG_BITS as i32 + exp_max <= add * 2 + -exp_min); |
90 | 74 |
|
91 | 75 | x *= mul;
|
92 | 76 | n += add;
|
93 | 77 |
|
94 | 78 | if n < exp_min {
|
95 | 79 | x *= mul;
|
96 | 80 | n += add;
|
| 81 | + |
97 | 82 | if n < exp_min {
|
98 | 83 | x *= mul;
|
99 | 84 | n += add;
|
| 85 | + |
100 | 86 | if n < exp_min {
|
101 | 87 | n = exp_min;
|
102 |
| - dbg!(Hexf(x), n); |
103 | 88 | }
|
104 | 89 | }
|
105 | 90 | }
|
106 | 91 | } else {
|
107 |
| - let add = (n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32); |
108 |
| - let mul = F::from_parts(false, (F::EXP_BIAS as i32 + add) as u32, zero); |
109 |
| - let add = -add; |
110 |
| - dbg!(add, Hexf(mul)); |
| 92 | + // `f16` is unique compared to other float types in that the difference between the |
| 93 | + // minimum exponent and the significand bits (`add = -exp_min - sig_total_bits`) is |
| 94 | + // small, only three. The above method depend on decrementing `n` by `add` two times; |
| 95 | + // for other float types this works out because `add` is a substantial fraction of |
| 96 | + // the exponent range. For `f16`, however, 3 is relatively small compared to the |
| 97 | + // exponent range (which is 39), so that would require a lot of rounds. |
| 98 | + // |
| 99 | + // Work aroudn this by using a different algorithm that scales by the max possible |
| 100 | + // value that does not exceed the minimum normal exponent. |
| 101 | + |
| 102 | + let add = -(n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32); |
| 103 | + let mul = F::from_parts(false, (F::EXP_BIAS as i32 - add) as u32, zero); |
111 | 104 |
|
112 | 105 | x *= mul;
|
113 | 106 | n += add;
|
114 |
| - dbg!(Hexf(x), n); |
115 | 107 |
|
116 | 108 | if n < exp_min {
|
117 | 109 | let add = (n + sig_total_bits as i32).clamp(exp_min, sig_total_bits as i32);
|
118 | 110 | let mul = F::from_parts(false, (F::EXP_BIAS as i32 + add) as u32, zero);
|
119 | 111 | let add = -add;
|
120 |
| - dbg!(add, Hexf(mul)); |
121 | 112 |
|
122 | 113 | x *= mul;
|
123 | 114 | n += add;
|
124 |
| - dbg!(Hexf(x), n); |
125 | 115 |
|
126 | 116 | if n < exp_min {
|
127 | 117 | n = exp_min;
|
128 |
| - dbg!(Hexf(x), n); |
129 | 118 | }
|
130 | 119 | }
|
131 | 120 | // f16
|
132 | 121 | }
|
133 | 122 | }
|
134 |
| - dbg!(Hexf(x), n); |
| 123 | + |
135 | 124 | let scale = F::from_parts(false, (F::EXP_BIAS as i32 + n) as u32, zero);
|
136 |
| - let ret = x * scale; |
137 |
| - dbg!(Hexf(scale), Hexf(ret)); |
138 |
| - ret |
| 125 | + x * scale |
139 | 126 | }
|
140 | 127 |
|
141 | 128 | #[cfg(test)]
|
|
0 commit comments