Merge #141

bors[bot] · ejmahler · cuviper · web-flow · commit b3d48f48d4b8 · 2020-11-02T20:31:27.000Z
141: Use the _addcarry and _subborrow intrinsics when available r=cuviper a=ejmahler

When compiling for x86_64 with "u64_digit" enabled, some benchmarks are improved by using `_addcarry_u64` instead of the custom-written `adc` function, and using `_subborrow_u64` instead of the custom-written `sbb` function.

The fib and fib2 benchmarks improved the most, most benchmarks improved a little, and a few were worse within the margin of error.

The only benchmark that did legitimately worse was the `gcd_euclid` family, but there's a comment after those benchmarks saying `// Integer for BigUint now uses Stein for gcd`. The Stein benchmarks showed improvements with this change.

Looking at the generated assembly, the compiler was generating adcq instructions both before and after the change, but post-change the code using adc is a little shorter. It's possible that the intrinsic provided just enough of a hint to the compiler that it was able to optimize some things away. The compiler wasn't generating sbb instructions at all, so this adds them -- and one nice thing is that this change eliminates signed->unsigned conversions.

Let me know if you'd prefer a different way to organize the platform-specific code.

Co-authored-by: Elliott Mahler <join.together@gmail.com>
Co-authored-by: Josh Stone <cuviper@gmail.com>
diff --git a/build.rs b/build.rs
@@ -6,14 +6,31 @@ use std::path::Path;
 
 fn main() {
     let pointer_width = env::var("CARGO_CFG_TARGET_POINTER_WIDTH");
-    if pointer_width.as_ref().map(String::as_str) == Ok("64") {
+    let u64_digit = pointer_width.as_ref().map(String::as_str) == Ok("64");
+    if u64_digit {
         autocfg::emit("u64_digit");
     }
     let ac = autocfg::new();
-    if ac.probe_path("std::convert::TryFrom") || ac.probe_path("core::convert::TryFrom") {
+    let std = if ac.probe_sysroot_crate("std") {
+        "std"
+    } else {
+        "core"
+    };
+    if ac.probe_path(&format!("{}::convert::TryFrom", std)) {
         autocfg::emit("has_try_from");
     }
 
+    if let Ok(target_arch) = env::var("CARGO_CFG_TARGET_ARCH") {
+        if target_arch == "x86_64" || target_arch == "x86" {
+            let digit = if u64_digit { "u64" } else { "u32" };
+
+            let addcarry = format!("{}::arch::{}::_addcarry_{}", std, target_arch, digit);
+            if ac.probe_path(&addcarry) {
+                autocfg::emit("use_addcarry");
+            }
+        }
+    }
+
     autocfg::rerun_path("build.rs");
 
     write_radix_bases().unwrap();
diff --git a/src/algorithms.rs b/src/algorithms.rs
@@ -5,35 +5,80 @@ use core::iter::repeat;
 use core::mem;
 use num_traits::{One, PrimInt, Zero};
 
+#[cfg(all(use_addcarry, target_arch = "x86_64"))]
+use core::arch::x86_64 as arch;
+
+#[cfg(all(use_addcarry, target_arch = "x86"))]
+use core::arch::x86 as arch;
+
 use crate::biguint::biguint_from_vec;
 use crate::biguint::BigUint;
 
 use crate::bigint::BigInt;
 use crate::bigint::Sign;
 use crate::bigint::Sign::{Minus, NoSign, Plus};
 
-use crate::big_digit::{self, BigDigit, DoubleBigDigit, SignedDoubleBigDigit};
+use crate::big_digit::{self, BigDigit, DoubleBigDigit};
 
-// Generic functions for add/subtract/multiply with carry/borrow:
+// only needed for the fallback implementation of `sbb`
+#[cfg(not(use_addcarry))]
+use crate::big_digit::SignedDoubleBigDigit;
+
+// Generic functions for add/subtract/multiply with carry/borrow. These are specialized
+// for some platforms to take advantage of intrinsics, etc.
 
 // Add with carry:
+#[cfg(all(use_addcarry, u64_digit))]
 #[inline]
-fn adc(a: BigDigit, b: BigDigit, acc: &mut DoubleBigDigit) -> BigDigit {
-    *acc += DoubleBigDigit::from(a);
-    *acc += DoubleBigDigit::from(b);
-    let lo = *acc as BigDigit;
-    *acc >>= big_digit::BITS;
-    lo
+fn adc(carry: u8, a: u64, b: u64, out: &mut u64) -> u8 {
+    // Safety: There are absolutely no safety concerns with calling `_addcarry_u64`.
+    // It's just unsafe for API consistency with other intrinsics.
+    unsafe { arch::_addcarry_u64(carry, a, b, out) }
+}
+
+#[cfg(all(use_addcarry, not(u64_digit)))]
+#[inline]
+fn adc(carry: u8, a: u32, b: u32, out: &mut u32) -> u8 {
+    // Safety: There are absolutely no safety concerns with calling `_addcarry_u32`.
+    // It's just unsafe for API consistency with other intrinsics.
+    unsafe { arch::_addcarry_u32(carry, a, b, out) }
+}
+
+// fallback for environments where we don't have an addcarry intrinsic
+#[cfg(not(use_addcarry))]
+#[inline]
+fn adc(carry: u8, a: BigDigit, b: BigDigit, out: &mut BigDigit) -> u8 {
+    let sum = DoubleBigDigit::from(a) + DoubleBigDigit::from(b) + DoubleBigDigit::from(carry);
+    *out = sum as BigDigit;
+    (sum >> big_digit::BITS) as u8
 }
 
 // Subtract with borrow:
+#[cfg(all(use_addcarry, u64_digit))]
 #[inline]
-fn sbb(a: BigDigit, b: BigDigit, acc: &mut SignedDoubleBigDigit) -> BigDigit {
-    *acc += SignedDoubleBigDigit::from(a);
-    *acc -= SignedDoubleBigDigit::from(b);
-    let lo = *acc as BigDigit;
-    *acc >>= big_digit::BITS;
-    lo
+fn sbb(borrow: u8, a: u64, b: u64, out: &mut u64) -> u8 {
+    // Safety: There are absolutely no safety concerns with calling `_subborrow_u64`.
+    // It's just unsafe for API consistency with other intrinsics.
+    unsafe { arch::_subborrow_u64(borrow, a, b, out) }
+}
+
+#[cfg(all(use_addcarry, not(u64_digit)))]
+#[inline]
+fn sbb(borrow: u8, a: u32, b: u32, out: &mut u32) -> u8 {
+    // Safety: There are absolutely no safety concerns with calling `_subborrow_u32`.
+    // It's just unsafe for API consistency with other intrinsics.
+    unsafe { arch::_subborrow_u32(borrow, a, b, out) }
+}
+
+// fallback for environments where we don't have a subborrow intrinsic
+#[cfg(not(use_addcarry))]
+#[inline]
+fn sbb(borrow: u8, a: BigDigit, b: BigDigit, out: &mut BigDigit) -> u8 {
+    let difference = SignedDoubleBigDigit::from(a)
+        - SignedDoubleBigDigit::from(b)
+        - SignedDoubleBigDigit::from(borrow);
+    *out = difference as BigDigit;
+    u8::from(difference < 0)
 }
 
 #[inline]
@@ -140,12 +185,12 @@ pub(crate) fn __add2(a: &mut [BigDigit], b: &[BigDigit]) -> BigDigit {
     let (a_lo, a_hi) = a.split_at_mut(b.len());
 
     for (a, b) in a_lo.iter_mut().zip(b) {
-        *a = adc(*a, *b, &mut carry);
+        carry = adc(carry, *a, *b, a);
     }
 
     if carry != 0 {
         for a in a_hi {
-            *a = adc(*a, 0, &mut carry);
+            carry = adc(carry, *a, 0, a);
             if carry == 0 {
                 break;
             }
@@ -174,12 +219,12 @@ pub(crate) fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
     let (b_lo, b_hi) = b.split_at(len);
 
     for (a, b) in a_lo.iter_mut().zip(b_lo) {
-        *a = sbb(*a, *b, &mut borrow);
+        borrow = sbb(borrow, *a, *b, a);
     }
 
     if borrow != 0 {
         for a in a_hi {
-            *a = sbb(*a, 0, &mut borrow);
+            borrow = sbb(borrow, *a, 0, a);
             if borrow == 0 {
                 break;
             }
@@ -195,16 +240,16 @@ pub(crate) fn sub2(a: &mut [BigDigit], b: &[BigDigit]) {
 
 // Only for the Sub impl. `a` and `b` must have same length.
 #[inline]
-pub(crate) fn __sub2rev(a: &[BigDigit], b: &mut [BigDigit]) -> BigDigit {
+pub(crate) fn __sub2rev(a: &[BigDigit], b: &mut [BigDigit]) -> u8 {
     debug_assert!(b.len() == a.len());
 
     let mut borrow = 0;
 
     for (ai, bi) in a.iter().zip(b) {
-        *bi = sbb(*ai, *bi, &mut borrow);
+        borrow = sbb(borrow, *ai, *bi, bi);
     }
 
-    borrow as BigDigit
+    borrow
 }
 
 pub(crate) fn sub2rev(a: &[BigDigit], b: &mut [BigDigit]) {
@@ -259,11 +304,14 @@ pub(crate) fn mac_digit(acc: &mut [BigDigit], b: &[BigDigit], c: BigDigit) {
         *a = mac_with_carry(*a, b, c, &mut carry);
     }
 
-    let mut a = a_hi.iter_mut();
-    while carry != 0 {
-        let a = a.next().expect("carry overflow during multiplication!");
-        *a = adc(*a, 0, &mut carry);
-    }
+    let (carry_hi, carry_lo) = big_digit::from_doublebigdigit(carry);
+
+    let final_carry = if carry_hi == 0 {
+        __add2(a_hi, &[carry_lo])
+    } else {
+        __add2(a_hi, &[carry_hi, carry_lo])
+    };
+    assert_eq!(final_carry, 0, "carry overflow during multiplication!");
 }
 
 /// Subtract a multiple.