Improve the comments for hexadecimal float parsing. (bytecodealliance#1888)

sunfishcode · web-flow · commit 6a76d04f30a6 · 2024-10-31T16:04:54.000Z
* Improve the comments for hexadecimal float parsing.

Replace the comments requesting better comments with better comments,
and tidy up the code.

* Fix a typo.
diff --git a/crates/wast/src/token.rs b/crates/wast/src/token.rs
@@ -435,6 +435,7 @@ macro_rules! float {
             let signif_bits = width - 1 - $exp_bits;
             let signif_mask = (1 << exp_offset) - 1;
             let bias = (1 << ($exp_bits - 1)) - 1;
+            let msb = 1 << neg_offset;
 
             let (hex, integral, fractional, exponent_str) = match val {
                 // Infinity is when the exponent bits are all set and
@@ -497,53 +498,77 @@ macro_rules! float {
                 return Some(float.to_bits());
             }
 
-            // Parsing hex floats is... hard! I don't really know what most of
-            // this below does. It was copied from Gecko's implementation in
-            // `WasmTextToBinary.cpp`. Would love comments on this if you have
-            // them!
-            let fractional = fractional.as_ref().map(|s| &**s).unwrap_or("");
-            let negative = integral.starts_with('-');
+            // Parse a hexadecimal floating-point value.
+            //
+            // The main loop here is simpler than for parsing decimal floats,
+            // because we can just parse hexadecimal digits and then shift
+            // their bits into place in the significand. But in addition to
+            // that, we also need to handle non-normalized representations,
+            // where the integral part is not "1", to convert them to
+            // normalized results, to round, in case we get more digits than
+            // the target format supports, and to handle overflow and subnormal
+            // cases.
+
+            // Get slices of digits for the integral and fractional parts. We
+            // can trivially skip any leading zeros in the integral part.
+            let is_negative = integral.starts_with('-');
             let integral = integral.trim_start_matches('-').trim_start_matches('0');
+            let fractional = fractional.as_ref().map(|s| &**s).unwrap_or("");
 
-            // Do a bunch of work up front to locate the first non-zero digit
-            // to determine the initial exponent. There's a number of
-            // adjustments depending on where the digit was found, but the
-            // general idea here is that I'm not really sure why things are
-            // calculated the way they are but it should match Gecko.
+            // Locate the first non-zero digit to determine the initial exponent.
+            //
+            // If there's no integral part, skip past leading zeros so that
+            // something like "0x.0000000000000000000002" doesn't cause us to hit
+            // a shift overflow when we try to shift the value into place. We'll
+            // adjust the exponent below to account for these skipped zeros.
             let fractional_no_leading = fractional.trim_start_matches('0');
             let fractional_iter = if integral.is_empty() {
                 fractional_no_leading.chars()
             } else {
                 fractional.chars()
             };
+
+            // Create a unified iterator over the digits of the integral part
+            // followed by the digits of the fractional part. The boolean value
+            // indicates which of these parts we're in.
             let mut digits = integral.chars()
                 .map(|c| (to_hex(c) as $int, false))
                 .chain(fractional_iter.map(|c| (to_hex(c) as $int, true)));
+
+            // Compute the number of leading zeros in the first non-zero digit,
+            // since if the first digit is not "1" we'll need to adjust for
+            // normalization.
             let lead_nonzero_digit = match digits.next() {
                 Some((c, _)) => c,
-                // No digits? Must be `+0` or `-0`, being careful to handle the
-                // sign encoding here.
-                None if negative => return Some(1 << (width - 1)),
+                // No non-zero digits? Must be `+0` or `-0`, being careful to
+                // handle the sign encoding here.
+                None if is_negative => return Some(msb),
                 None => return Some(0),
             };
-            let mut significand = 0 as $int;
+            let lz = (lead_nonzero_digit as u8).leading_zeros() as i32 - 4;
+
+            // Prepare for the main parsing loop. Calculate the initial values
+            // of `exponent` and `significand` based on what we've seen so far.
             let mut exponent = if !integral.is_empty() {
                 1
             } else {
+                // Adjust the exponent digits to account for any leading zeros
+                // in the fractional part that we skipped above.
                 -((fractional.len() - fractional_no_leading.len() + 1) as i32) + 1
             };
-            let lz = (lead_nonzero_digit as u8).leading_zeros() as i32 - 4;
-            exponent = exponent.checked_mul(4)?.checked_sub(lz + 1)?;
             let mut significand_pos = (width - (4 - (lz as usize))) as isize;
-            assert!(significand_pos >= 0);
-            significand |= lead_nonzero_digit << significand_pos;
+            let mut significand: $int = lead_nonzero_digit << significand_pos;
+            let mut discarded_extra_nonzero = false;
+
+            assert!(significand_pos >= 0, "$int should be at least 4 bits wide");
+
+            // Adjust for leading zeros in the first digit.
+            exponent = exponent.checked_mul(4)?.checked_sub(lz + 1)?;
 
             // Now that we've got an anchor in the string we parse the remaining
-            // digits. Again, not entirely sure why everything is the way it is
-            // here! This is copied frmo gecko.
-            let mut discarded_extra_nonzero = false;
-            for (digit, is_fractional) in digits {
-                if !is_fractional {
+            // hexadecimal digits.
+            for (digit, in_fractional) in digits {
+                if !in_fractional {
                     exponent += 4;
                 }
                 if significand_pos > -4 {
@@ -560,12 +585,19 @@ macro_rules! float {
                 }
             }
 
+            debug_assert!(significand != 0, "The case of no non-zero digits should have been handled above");
+
+            // Parse the exponent string, which despite this being a hexadecimal
+            // syntax, is a decimal number, and add it the exponent we've
+            // computed from the potentially non-normalized significand.
             exponent = exponent.checked_add(match exponent_str {
                 Some(s) => s.parse::<i32>().ok()?,
                 None => 0,
             })?;
-            debug_assert!(significand != 0);
 
+            // Encode the exponent and significand. Also calculate the bits of
+            // the significand which are discarded, as we'll use them to
+            // determine if we need to round up.
             let (encoded_exponent, encoded_significand, discarded_significand) =
                 if exponent <= -bias {
                     // Underflow to subnormal or zero.
@@ -598,26 +630,29 @@ macro_rules! float {
                     )
                 };
 
+            // Combine the encoded exponent and encoded significand to produce
+            // the raw result, except for the sign bit, which we'll apply at
+            // the end.
             let bits = encoded_exponent | encoded_significand;
 
-            // Apply rounding. If this overflows the significand, it carries
-            // into the exponent bit according to the magic of the IEEE 754
-            // encoding.
-            //
-            // Or rather, the comment above is what Gecko says so it's copied
-            // here too.
-            let msb = 1 << (width - 1);
+            // Apply rounding. Do an integer add of `0` or `1` on the raw
+            // result, depending on whether rounding is needed. Rounding can
+            // lead to a floating-point overflow, but we don't need to
+            // special-case that here because it turns out that IEEE 754 floats
+            // are encoded such that when an integer add of `1` carries into
+            // the bits of the exponent field, it produces the correct encoding
+            // for infinity.
             let bits = bits
                 + (((discarded_significand & msb != 0)
                     && ((discarded_significand & !msb != 0) ||
                          discarded_extra_nonzero ||
                          // ties to even
                          (encoded_significand & 1 != 0))) as $int);
 
-            // Just before we return the bits be sure to handle the sign bit we
+            // Just before we return the bits, be sure to handle the sign bit we
             // found at the beginning.
-            let bits = if negative {
-                bits | (1 << (width - 1))
+            let bits = if is_negative {
+                bits | msb
             } else {
                 bits
             };