From 51a7148b3e2c4da7c41a3b6a56f9a7527fb4dd13 Mon Sep 17 00:00:00 2001
From: David Himmelstrup <lemmih@gmail.com>
Date: Wed, 28 Jul 2021 09:52:13 +0800
Subject: [PATCH 1/4] SmallVec backend for BigUint.

---
 Cargo.toml                    |  2 ++
 src/bigint.rs                 |  4 ++-
 src/bigint/bits.rs            | 22 ++++++------
 src/bigint/multiplication.rs  |  8 ++++-
 src/bigrand.rs                | 25 +++++---------
 src/biguint.rs                | 64 +++++++++++++++++++++++++++++------
 src/biguint/addition.rs       | 20 +++++++++++
 src/biguint/arbitrary.rs      |  2 +-
 src/biguint/division.rs       | 14 ++++++--
 src/biguint/monty.rs          |  6 ++--
 src/biguint/multiplication.rs | 19 +++++++++--
 src/biguint/shift.rs          | 16 +++++----
 src/biguint/subtraction.rs    |  9 +++++
 src/lib.rs                    |  4 +++
 14 files changed, 162 insertions(+), 53 deletions(-)
diff --git a/Cargo.toml b/Cargo.toml
index 3de8c72c..292fed8a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,6 +16,7 @@ edition = "2018"
 
 [features]
 default = ["std"]
+union = ["smallvec/union"]
 std = ["num-integer/std", "num-traits/std"]
 
 [package.metadata.docs.rs]
@@ -38,6 +39,7 @@ harness = false
 name = "shootout-pidigits"
 
 [dependencies]
+smallvec = { version = "1.6.1" }
 
 [dependencies.num-integer]
 version = "0.1.42"
diff --git a/src/bigint.rs b/src/bigint.rs
index 891eeb46..d8a6a3b8 100644
--- a/src/bigint.rs
+++ b/src/bigint.rs
@@ -14,6 +14,8 @@ use core::{i64, u64};
 use num_integer::{Integer, Roots};
 use num_traits::{Num, One, Pow, Signed, Zero};
 
+use smallvec::SmallVec;
+
 use self::Sign::{Minus, NoSign, Plus};
 
 use crate::big_digit::BigDigit;
@@ -538,7 +540,7 @@ impl IntDigits for BigInt {
         self.data.digits()
     }
     #[inline]
-    fn digits_mut(&mut self) -> &mut Vec<BigDigit> {
+    fn digits_mut(&mut self) -> &mut SmallVec<[BigDigit; BigUint::INLINED]> {
         self.data.digits_mut()
     }
     #[inline]
diff --git a/src/bigint/bits.rs b/src/bigint/bits.rs
index 686def4d..fee8a0f2 100644
--- a/src/bigint/bits.rs
+++ b/src/bigint/bits.rs
@@ -3,7 +3,9 @@ use super::Sign::{Minus, NoSign, Plus};
 
 use crate::big_digit::{self, BigDigit, DoubleBigDigit};
 use crate::biguint::IntDigits;
-use crate::std_alloc::Vec;
+use crate::BigUint;
+
+use smallvec::SmallVec;
 
 use core::cmp::Ordering::{Equal, Greater, Less};
 use core::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign};
@@ -36,7 +38,7 @@ fn negate_carry(a: BigDigit, acc: &mut DoubleBigDigit) -> BigDigit {
 // + 1 & -ff = ...0 01 & ...f 01 = ...0 01 = + 1
 // +ff & - 1 = ...0 ff & ...f ff = ...0 ff = +ff
 // answer is pos, has length of a
-fn bitand_pos_neg(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
+fn bitand_pos_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
     let mut carry_b = 1;
     for (ai, &bi) in a.iter_mut().zip(b.iter()) {
         let twos_b = negate_carry(bi, &mut carry_b);
@@ -48,7 +50,7 @@ fn bitand_pos_neg(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
 // - 1 & +ff = ...f ff & ...0 ff = ...0 ff = +ff
 // -ff & + 1 = ...f 01 & ...0 01 = ...0 01 = + 1
 // answer is pos, has length of b
-fn bitand_neg_pos(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
+fn bitand_neg_pos(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
     let mut carry_a = 1;
     for (ai, &bi) in a.iter_mut().zip(b.iter()) {
         let twos_a = negate_carry(*ai, &mut carry_a);
@@ -69,7 +71,7 @@ fn bitand_neg_pos(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
 // -ff & - 1 = ...f 01 & ...f ff = ...f 01 = - ff
 // -ff & -fe = ...f 01 & ...f 02 = ...f 00 = -100
 // answer is neg, has length of longest with a possible carry
-fn bitand_neg_neg(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
+fn bitand_neg_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
     let mut carry_a = 1;
     let mut carry_b = 1;
     let mut carry_and = 1;
@@ -173,7 +175,7 @@ impl<'a> BitAndAssign<&'a BigInt> for BigInt {
 // + 1 | -ff = ...0 01 | ...f 01 = ...f 01 = -ff
 // +ff | - 1 = ...0 ff | ...f ff = ...f ff = - 1
 // answer is neg, has length of b
-fn bitor_pos_neg(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
+fn bitor_pos_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
     let mut carry_b = 1;
     let mut carry_or = 1;
     for (ai, &bi) in a.iter_mut().zip(b.iter()) {
@@ -202,7 +204,7 @@ fn bitor_pos_neg(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
 // - 1 | +ff = ...f ff | ...0 ff = ...f ff = - 1
 // -ff | + 1 = ...f 01 | ...0 01 = ...f 01 = -ff
 // answer is neg, has length of a
-fn bitor_neg_pos(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
+fn bitor_neg_pos(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
     let mut carry_a = 1;
     let mut carry_or = 1;
     for (ai, &bi) in a.iter_mut().zip(b.iter()) {
@@ -224,7 +226,7 @@ fn bitor_neg_pos(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
 // - 1 | -ff = ...f ff | ...f 01 = ...f ff = -1
 // -ff | - 1 = ...f 01 | ...f ff = ...f ff = -1
 // answer is neg, has length of shortest
-fn bitor_neg_neg(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
+fn bitor_neg_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
     let mut carry_a = 1;
     let mut carry_b = 1;
     let mut carry_or = 1;
@@ -308,7 +310,7 @@ impl<'a> BitOrAssign<&'a BigInt> for BigInt {
 // + 1 ^ -ff = ...0 01 ^ ...f 01 = ...f 00 = -100
 // +ff ^ - 1 = ...0 ff ^ ...f ff = ...f 00 = -100
 // answer is neg, has length of longest with a possible carry
-fn bitxor_pos_neg(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
+fn bitxor_pos_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
     let mut carry_b = 1;
     let mut carry_xor = 1;
     for (ai, &bi) in a.iter_mut().zip(b.iter()) {
@@ -341,7 +343,7 @@ fn bitxor_pos_neg(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
 // - 1 ^ +ff = ...f ff ^ ...0 ff = ...f 00 = -100
 // -ff ^ + 1 = ...f 01 ^ ...0 01 = ...f 00 = -100
 // answer is neg, has length of longest with a possible carry
-fn bitxor_neg_pos(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
+fn bitxor_neg_pos(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
     let mut carry_a = 1;
     let mut carry_xor = 1;
     for (ai, &bi) in a.iter_mut().zip(b.iter()) {
@@ -374,7 +376,7 @@ fn bitxor_neg_pos(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
 // - 1 ^ -ff = ...f ff ^ ...f 01 = ...0 fe = +fe
 // -ff & - 1 = ...f 01 ^ ...f ff = ...0 fe = +fe
 // answer is pos, has length of longest
-fn bitxor_neg_neg(a: &mut Vec<BigDigit>, b: &[BigDigit]) {
+fn bitxor_neg_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
     let mut carry_a = 1;
     let mut carry_b = 1;
     for (ai, &bi) in a.iter_mut().zip(b.iter()) {
diff --git a/src/bigint/multiplication.rs b/src/bigint/multiplication.rs
index a2d97081..a73d4fa1 100644
--- a/src/bigint/multiplication.rs
+++ b/src/bigint/multiplication.rs
@@ -29,9 +29,15 @@ macro_rules! impl_mul {
             #[inline]
             fn mul(self, other: $Other) -> BigInt {
                 // automatically match value/ref
+                // from_biguint optimizes poorly if it cannot tell NoSign is impossible.
+                if self.is_zero() || other.is_zero() {
+                    return BigInt::zero();
+                }
+                let new_sign = if (self.sign() == Minus) ^ (other.sign() == Minus) { Minus } else { Plus };
                 let BigInt { data: x, .. } = self;
                 let BigInt { data: y, .. } = other;
-                BigInt::from_biguint(self.sign * other.sign, x * y)
+                BigInt::from_biguint(new_sign, x * y)
+                // BigInt::from_biguint(self.sign * other.sign, x * y)
             }
         }
     )*}
diff --git a/src/bigrand.rs b/src/bigrand.rs
index cb440327..2e62d235 100644
--- a/src/bigrand.rs
+++ b/src/bigrand.rs
@@ -3,11 +3,12 @@
 use rand::distributions::uniform::{SampleBorrow, SampleUniform, UniformSampler};
 use rand::prelude::*;
 
+use crate::big_digit::BigDigit;
 use crate::BigInt;
 use crate::BigUint;
 use crate::Sign::*;
 
-use crate::biguint::biguint_from_vec;
+use crate::biguint::biguint_from_smallvec;
 
 use num_integer::Integer;
 use num_traits::{ToPrimitive, Zero};
@@ -37,12 +38,12 @@ pub trait RandBigInt {
     fn gen_bigint_range(&mut self, lbound: &BigInt, ubound: &BigInt) -> BigInt;
 }
 
-fn gen_bits<R: Rng + ?Sized>(rng: &mut R, data: &mut [u32], rem: u64) {
+fn gen_bits<R: Rng + ?Sized>(rng: &mut R, data: &mut [BigDigit], rem: u64) {
     // `fill` is faster than many `gen::<u32>` calls
     rng.fill(data);
     if rem > 0 {
         let last = data.len() - 1;
-        data[last] >>= 32 - rem;
+        data[last] >>= crate::big_digit::BITS as u64 - rem;
     }
 }
 
@@ -60,28 +61,20 @@ impl<R: Rng + ?Sized> RandBigInt for R {
 
     #[cfg(u64_digit)]
     fn gen_biguint(&mut self, bit_size: u64) -> BigUint {
-        use core::slice;
+        use smallvec::smallvec;
 
-        let (digits, rem) = bit_size.div_rem(&32);
+        let (digits, rem) = bit_size.div_rem(&64);
         let len = (digits + (rem > 0) as u64)
             .to_usize()
             .expect("capacity overflow");
-        let native_digits = bit_size.div_ceil(&64);
-        let native_len = native_digits.to_usize().expect("capacity overflow");
-        let mut data = vec![0u64; native_len];
-        unsafe {
-            // Generate bits in a `&mut [u32]` slice for value stability
-            let ptr = data.as_mut_ptr() as *mut u32;
-            debug_assert!(native_len * 2 >= len);
-            let data = slice::from_raw_parts_mut(ptr, len);
-            gen_bits(self, data, rem);
-        }
+        let mut data = smallvec![0u64; len];
+        gen_bits(self, data.as_mut_slice(), rem);
         #[cfg(target_endian = "big")]
         for digit in &mut data {
             // swap u32 digits into u64 endianness
             *digit = (*digit << 32) | (*digit >> 32);
         }
-        biguint_from_vec(data)
+        biguint_from_smallvec(data)
     }
 
     fn gen_bigint(&mut self, bit_size: u64) -> BigInt {
diff --git a/src/biguint.rs b/src/biguint.rs
index 271a8837..bcbc4968 100644
--- a/src/biguint.rs
+++ b/src/biguint.rs
@@ -13,6 +13,8 @@ use core::{u32, u64, u8};
 use num_integer::{Integer, Roots};
 use num_traits::{Num, One, Pow, ToPrimitive, Unsigned, Zero};
 
+use smallvec::{smallvec, SmallVec};
+
 mod addition;
 mod division;
 mod multiplication;
@@ -36,7 +38,7 @@ pub use self::iter::{U32Digits, U64Digits};
 
 /// A big unsigned integer type.
 pub struct BigUint {
-    data: Vec<BigDigit>,
+    data: SmallVec<[BigDigit; BigUint::INLINED]>,
 }
 
 // Note: derived `Clone` doesn't specialize `clone_from`,
@@ -44,8 +46,21 @@ pub struct BigUint {
 impl Clone for BigUint {
     #[inline]
     fn clone(&self) -> Self {
+        // #[inline(never)]
+        // fn cold_clone(a: &BigUint) -> BigUint {
+        //     BigUint {
+        //         data: SmallVec::from_slice(&a.data), // This uses memcpy rather than repeated calls to .clone().
+        //     }
+        // }
+        // if self.data.spilled() {
+        //     cold_clone(self)
+        // } else {
+        //     BigUint {
+        //         data: unsafe { std::ptr::read(&self.data) },
+        //     }
+        // }
         BigUint {
-            data: self.data.clone(),
+            data: SmallVec::from_slice(&self.data), // This uses memcpy rather than repeated calls to .clone().
         }
     }
 
@@ -146,7 +161,9 @@ impl fmt::Octal for BigUint {
 impl Zero for BigUint {
     #[inline]
     fn zero() -> BigUint {
-        BigUint { data: Vec::new() }
+        BigUint {
+            data: SmallVec::new(),
+        }
     }
 
     #[inline]
@@ -163,7 +180,7 @@ impl Zero for BigUint {
 impl One for BigUint {
     #[inline]
     fn one() -> BigUint {
-        BigUint { data: vec![1] }
+        BigUint { data: smallvec![1] }
     }
 
     #[inline]
@@ -218,6 +235,17 @@ impl Integer for BigUint {
     /// The result is always positive.
     #[inline]
     fn gcd(&self, other: &Self) -> Self {
+        // use core::convert::TryInto;
+        // if let Some(x) = self.to_u64() {
+        //     if let Some(y) = other.to_u64() {
+        //         return BigUint::from(x.gcd(&y));
+        //     }
+        // }
+        // if let Some(x) = self.to_u128() {
+        //     if let Some(y) = other.to_u128() {
+        //         return BigUint::from(x.gcd(&y));
+        //     }
+        // }
         #[inline]
         fn twos(x: &BigUint) -> u64 {
             x.trailing_zeros().unwrap_or(0)
@@ -512,10 +540,23 @@ pub trait ToBigUint {
 /// The digits are in little-endian base matching `BigDigit`.
 #[inline]
 pub(crate) fn biguint_from_vec(digits: Vec<BigDigit>) -> BigUint {
+    BigUint {
+        data: SmallVec::from_vec(digits),
+    }
+    .normalized()
+}
+
+/// Creates and initializes a `BigUint`.
+///
+/// The digits are in little-endian base matching `BigDigit`.
+#[inline]
+pub(crate) fn biguint_from_smallvec(digits: SmallVec<[BigDigit; BigUint::INLINED]>) -> BigUint {
     BigUint { data: digits }.normalized()
 }
 
 impl BigUint {
+    pub(crate) const INLINED: usize = 2;
+
     /// Creates and initializes a `BigUint`.
     ///
     /// The base 2<sup>32</sup> digits are ordered least significant digit first.
@@ -850,9 +891,12 @@ impl BigUint {
             let len = self.data.iter().rposition(|&d| d != 0).map_or(0, |i| i + 1);
             self.data.truncate(len);
         }
-        if self.data.len() < self.data.capacity() / 4 {
-            self.data.shrink_to_fit();
-        }
+        // Shrinking hurts performance of many algorithms which do not care about deallocating working memory.
+        // For example, 'to_str_radix' consumes a BigUint by dividing out digits. The possibility of shrinking
+        // the BigUint in the inner loop significantly lowers performance.
+        // if self.data.len() < self.data.capacity() / 4 {
+        //     self.data.shrink_to_fit();
+        // }
     }
 
     /// Returns a normalized `BigUint`.
@@ -958,7 +1002,7 @@ impl BigUint {
 
 pub(crate) trait IntDigits {
     fn digits(&self) -> &[BigDigit];
-    fn digits_mut(&mut self) -> &mut Vec<BigDigit>;
+    fn digits_mut(&mut self) -> &mut SmallVec<[BigDigit; BigUint::INLINED]>;
     fn normalize(&mut self);
     fn capacity(&self) -> usize;
     fn len(&self) -> usize;
@@ -970,7 +1014,7 @@ impl IntDigits for BigUint {
         &self.data
     }
     #[inline]
-    fn digits_mut(&mut self) -> &mut Vec<BigDigit> {
+    fn digits_mut(&mut self) -> &mut SmallVec<[BigDigit; BigUint::INLINED]> {
         &mut self.data
     }
     #[inline]
@@ -1036,7 +1080,7 @@ fn test_from_slice() {
 fn test_from_slice() {
     fn check(slice: &[u32], data: &[BigDigit]) {
         assert_eq!(
-            BigUint::from_slice(slice).data,
+            BigUint::from_slice(slice).data.as_slice(),
             data,
             "from {:?}, to {:?}",
             slice,
diff --git a/src/biguint/addition.rs b/src/biguint/addition.rs
index e54f8cb1..a5262cd7 100644
--- a/src/biguint/addition.rs
+++ b/src/biguint/addition.rs
@@ -89,7 +89,14 @@ forward_val_assign!(impl AddAssign for BigUint, add_assign);
 impl<'a> Add<&'a BigUint> for BigUint {
     type Output = BigUint;
 
+    #[inline]
     fn add(mut self, other: &BigUint) -> BigUint {
+        if !other.data.spilled() {
+            use num_traits::ToPrimitive;
+            if let Some(x) = other.to_u64() {
+                return self + x;
+            }
+        }
         self += other;
         self
     }
@@ -97,6 +104,13 @@ impl<'a> Add<&'a BigUint> for BigUint {
 impl<'a> AddAssign<&'a BigUint> for BigUint {
     #[inline]
     fn add_assign(&mut self, other: &BigUint) {
+        if !other.data.spilled() {
+            use num_traits::ToPrimitive;
+            if let Some(x) = other.to_u64() {
+                self.add_assign(x);
+                return;
+            }
+        }
         let self_len = self.data.len();
         let carry = if self_len < other.data.len() {
             let lo_carry = __add2(&mut self.data[..], &other.data[..self_len]);
@@ -148,6 +162,12 @@ impl Add<u64> for BigUint {
 
     #[inline]
     fn add(mut self, other: u64) -> BigUint {
+        use num_traits::ToPrimitive;
+        if !self.data.spilled() {
+            if let Some(x) = self.to_u64() {
+                return BigUint::from(x as u128 + other as u128);
+            }
+        }
         self += other;
         self
     }
diff --git a/src/biguint/arbitrary.rs b/src/biguint/arbitrary.rs
index 6fa91c0f..217e807d 100644
--- a/src/biguint/arbitrary.rs
+++ b/src/biguint/arbitrary.rs
@@ -14,7 +14,7 @@ impl quickcheck::Arbitrary for BigUint {
 
     fn shrink(&self) -> Box<dyn Iterator<Item = Self>> {
         // Use shrinker from Vec
-        Box::new(self.data.shrink().map(biguint_from_vec))
+        Box::new(self.data.clone().into_vec().shrink().map(biguint_from_vec))
     }
 }
 
diff --git a/src/biguint/division.rs b/src/biguint/division.rs
index 343705e1..a488babe 100644
--- a/src/biguint/division.rs
+++ b/src/biguint/division.rs
@@ -6,6 +6,8 @@ use super::BigUint;
 use crate::big_digit::{self, BigDigit, DoubleBigDigit};
 use crate::UsizePromotion;
 
+use smallvec::smallvec;
+
 use core::cmp::Ordering::{Equal, Greater, Less};
 use core::mem;
 use core::ops::{Div, DivAssign, Rem, RemAssign};
@@ -45,6 +47,12 @@ pub(super) fn div_rem_digit(mut a: BigUint, b: BigDigit) -> (BigUint, BigDigit)
         panic!("attempt to divide by zero")
     }
 
+    if !a.data.spilled() {
+        if let Some(x) = a.to_u64() {
+            return (BigUint::from(x / b), x % b);
+        }
+    }
+
     let mut rem = 0;
 
     if b <= big_digit::HALF {
@@ -125,7 +133,7 @@ fn div_rem(mut u: BigUint, mut d: BigUint) -> (BigUint, BigUint) {
     }
 
     if d.data.len() == 1 {
-        if d.data == [1] {
+        if d.data[0] == 1 {
             return (u, Zero::zero());
         }
         let (div, rem) = div_rem_digit(u, d.data[0]);
@@ -172,7 +180,7 @@ pub(super) fn div_rem_ref(u: &BigUint, d: &BigUint) -> (BigUint, BigUint) {
     }
 
     if d.data.len() == 1 {
-        if d.data == [1] {
+        if d.data[0] == 1 {
             return (u.clone(), Zero::zero());
         }
 
@@ -240,7 +248,7 @@ fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) {
 
     let q_len = a.data.len() - b.data.len() + 1;
     let mut q = BigUint {
-        data: vec![0; q_len],
+        data: smallvec![0; q_len],
     };
 
     for j in (0..q_len).rev() {
diff --git a/src/biguint/monty.rs b/src/biguint/monty.rs
index a5c79aa9..160aa7a8 100644
--- a/src/biguint/monty.rs
+++ b/src/biguint/monty.rs
@@ -6,6 +6,8 @@ use num_traits::{One, Zero};
 use crate::big_digit::{self, BigDigit, DoubleBigDigit, SignedDoubleBigDigit};
 use crate::biguint::BigUint;
 
+use smallvec::SmallVec;
+
 struct MontyReducer {
     n0inv: BigDigit,
 }
@@ -75,13 +77,13 @@ fn montgomery(x: &BigUint, y: &BigUint, m: &BigUint, k: BigDigit, n: usize) -> B
     }
 
     if c == 0 {
-        z.data = z.data[n..].to_vec();
+        z.data = SmallVec::from_vec(z.data[n..].to_vec());
     } else {
         {
             let (mut first, second) = z.data.split_at_mut(n);
             sub_vv(&mut first, &second, &m.data);
         }
-        z.data = z.data[..n].to_vec();
+        z.data = SmallVec::from_vec(z.data[..n].to_vec());
     }
 
     z
diff --git a/src/biguint/multiplication.rs b/src/biguint/multiplication.rs
index 581c9e17..82b73a14 100644
--- a/src/biguint/multiplication.rs
+++ b/src/biguint/multiplication.rs
@@ -8,6 +8,8 @@ use crate::big_digit::{self, BigDigit, DoubleBigDigit};
 use crate::Sign::{self, Minus, NoSign, Plus};
 use crate::{BigInt, UsizePromotion};
 
+use smallvec::smallvec;
+
 use core::cmp::Ordering;
 use core::iter::Product;
 use core::ops::{Mul, MulAssign};
@@ -169,7 +171,9 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) {
         // We reuse the same BigUint for all the intermediate multiplies and have to size p
         // appropriately here: x1.len() >= x0.len and y1.len() >= y0.len():
         let len = x1.len() + y1.len();
-        let mut p = BigUint { data: vec![0; len] };
+        let mut p = BigUint {
+            data: smallvec![0; len],
+        };
 
         // p2 = x1 * y1
         mac3(&mut p.data, x1, y1);
@@ -345,7 +349,9 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) {
 
 fn mul3(x: &[BigDigit], y: &[BigDigit]) -> BigUint {
     let len = x.len() + y.len();
-    let mut prod = BigUint { data: vec![0; len] };
+    let mut prod = BigUint {
+        data: smallvec![0; len],
+    };
 
     mac3(&mut prod.data, x, y);
     prod.normalized()
@@ -403,6 +409,10 @@ macro_rules! impl_mul {
             #[inline]
             fn mul(self, other: $Other) -> BigUint {
                 match (&*self.data, &*other.data) {
+                    (&[a], &[b]) => {
+                        use crate::big_digit::*;
+                        BigUint::from(a as DoubleBigDigit * b as DoubleBigDigit)
+                    },
                     // multiply by zero
                     (&[], _) | (_, &[]) => BigUint::zero(),
                     // multiply by a scalar
@@ -432,6 +442,11 @@ macro_rules! impl_mul_assign {
                     (&[], _) => {},
                     (_, &[]) => self.set_zero(),
                     // multiply by a scalar
+                    (&[a], &[b]) => {
+                        use crate::big_digit::*;
+                        *self = BigUint::from(a as DoubleBigDigit * b as DoubleBigDigit);
+                    },
+                    // multiply by a scalar
                     (_, &[digit]) => *self *= digit,
                     (&[digit], _) => *self = other * digit,
                     // full multiplication
diff --git a/src/biguint/shift.rs b/src/biguint/shift.rs
index 05964d2a..6802df1a 100644
--- a/src/biguint/shift.rs
+++ b/src/biguint/shift.rs
@@ -1,7 +1,9 @@
-use super::{biguint_from_vec, BigUint};
+use super::{biguint_from_smallvec, BigUint};
 
 use crate::big_digit;
-use crate::std_alloc::{Cow, Vec};
+use crate::std_alloc::Cow;
+
+use smallvec::SmallVec;
 
 use core::mem;
 use core::ops::{Shl, ShlAssign, Shr, ShrAssign};
@@ -26,9 +28,9 @@ fn biguint_shl2(n: Cow<'_, BigUint>, digits: usize, shift: u8) -> BigUint {
         0 => n.into_owned().data,
         _ => {
             let len = digits.saturating_add(n.data.len() + 1);
-            let mut data = Vec::with_capacity(len);
+            let mut data = SmallVec::with_capacity(len);
             data.resize(digits, 0);
-            data.extend(n.data.iter());
+            data.extend(n.data.iter().copied());
             data
         }
     };
@@ -46,7 +48,7 @@ fn biguint_shl2(n: Cow<'_, BigUint>, digits: usize, shift: u8) -> BigUint {
         }
     }
 
-    biguint_from_vec(data)
+    biguint_from_smallvec(data)
 }
 
 #[inline]
@@ -70,7 +72,7 @@ fn biguint_shr2(n: Cow<'_, BigUint>, digits: usize, shift: u8) -> BigUint {
         return n;
     }
     let mut data = match n {
-        Cow::Borrowed(n) => n.data[digits..].to_vec(),
+        Cow::Borrowed(n) => SmallVec::from_slice(&n.data[digits..]),
         Cow::Owned(mut n) => {
             n.data.drain(..digits);
             n.data
@@ -87,7 +89,7 @@ fn biguint_shr2(n: Cow<'_, BigUint>, digits: usize, shift: u8) -> BigUint {
         }
     }
 
-    biguint_from_vec(data)
+    biguint_from_smallvec(data)
 }
 
 macro_rules! impl_shift {
diff --git a/src/biguint/subtraction.rs b/src/biguint/subtraction.rs
index 67005175..f98f0e3e 100644
--- a/src/biguint/subtraction.rs
+++ b/src/biguint/subtraction.rs
@@ -117,6 +117,7 @@ impl<'a> Sub<&'a BigUint> for BigUint {
     }
 }
 impl<'a> SubAssign<&'a BigUint> for BigUint {
+    #[inline]
     fn sub_assign(&mut self, other: &'a BigUint) {
         sub2(&mut self.data[..], &other.data[..]);
         self.normalize();
@@ -127,6 +128,14 @@ impl<'a> Sub<BigUint> for &'a BigUint {
     type Output = BigUint;
 
     fn sub(self, mut other: BigUint) -> BigUint {
+        use num_traits::ToPrimitive;
+        if !self.data.spilled() {
+            // On underflow `checked_sub` yields `None` and we fall through to the general path.
+            let fast = self.to_u64().and_then(|x| other.to_u64().and_then(|y| x.checked_sub(y)));
+            if let Some(diff) = fast {
+                return BigUint::from(diff);
+            }
+        }
         let other_len = other.data.len();
         if other_len < self.data.len() {
             let lo_borrow = __sub2rev(&self.data[..other_len], &mut other.data);
diff --git a/src/lib.rs b/src/lib.rs
index b88c5df2..f7b5638b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -292,3 +292,7 @@ mod big_digit {
         DoubleBigDigit::from(lo) | (DoubleBigDigit::from(hi) << BITS)
     }
 }
+
+pub fn mul_test(a: &BigUint, b: &BigUint) -> BigUint {
+    a * b // NOTE(review): thin wrapper over `Mul`; confirm this helper is meant to be public API.
+}

From 8a0dcc25affe5614f40fc87d5e0d8d7850c94a28 Mon Sep 17 00:00:00 2001
From: David Himmelstrup <lemmih@gmail.com>
Date: Wed, 28 Jul 2021 09:52:36 +0800
Subject: [PATCH 2/4] Fix bad performance with SmallVec in 'shr_assign'.

---
 src/biguint/shift.rs | 41 +++++++++++++++++++++++++++++++++++++++--
 1 file changed, 39 insertions(+), 2 deletions(-)

diff --git a/src/biguint/shift.rs b/src/biguint/shift.rs
index 6802df1a..b0ef92fb 100644
--- a/src/biguint/shift.rs
+++ b/src/biguint/shift.rs
@@ -92,6 +92,44 @@ fn biguint_shr2(n: Cow<'_, BigUint>, digits: usize, shift: u8) -> BigUint {
     biguint_from_smallvec(data)
 }
 
+use crate::big_digit::BigDigit;
+/// Shifts `n` right by `shift` bits in place; panics if `shift` is negative.
+#[inline]
+pub(crate) fn biguint_shr_mut<T: PrimInt>(n: &mut BigUint, shift: T) {
+    assert!(shift >= T::zero(), "attempt to shift right with negative");
+    if n.is_zero() || shift.is_zero() {
+        return;
+    }
+    let bits = T::from(big_digit::BITS).unwrap();
+    let digits = (shift / bits).to_usize().unwrap_or(core::usize::MAX);
+    let shift = (shift % bits).to_u8().unwrap();
+    slice_shr2(&mut n.data, digits, shift);
+    // `digits` may exceed the length (all bits shifted out); saturate so `n` becomes zero.
+    n.data.truncate(n.data.len().saturating_sub(digits));
+    n.normalize();
+}
+
+/// Shifts `data` right in place by `digits` digits plus `shift` bits; no-op if `digits >= len`.
+fn slice_shr2(data: &mut [BigDigit], digits: usize, shift: u8) {
+    let len = data.len();
+    if digits >= len {
+        return;
+    }
+    // Slide the surviving digits to the front; the stale tail is left for the caller to drop.
+    data.copy_within(digits.., 0);
+    if shift == 0 {
+        return;
+    }
+    let carry_shift = big_digit::BITS as u8 - shift;
+    let mut carry = 0;
+    // Walk from the most significant digit down, carrying the bits shifted out of each digit.
+    for digit in data[..len - digits].iter_mut().rev() {
+        let shifted_out = *digit << carry_shift;
+        *digit = (*digit >> shift) | carry;
+        carry = shifted_out;
+    }
+}
+
 macro_rules! impl_shift {
     (@ref $Shx:ident :: $shx:ident, $ShxAssign:ident :: $shx_assign:ident, $rhs:ty) => {
         impl<'b> $Shx<&'b $rhs> for BigUint {
@@ -162,8 +200,7 @@ macro_rules! impl_shift {
         impl ShrAssign<$rhs> for BigUint {
             #[inline]
             fn shr_assign(&mut self, rhs: $rhs) {
-                let n = mem::replace(self, BigUint::zero());
-                *self = n >> rhs;
+                biguint_shr_mut(self, rhs)
             }
         }
         impl_shift! { @ref Shr::shr, ShrAssign::shr_assign, $rhs }

From a0a92a9a6cfe5df99cde80e0e7073e8bc691ae03 Mon Sep 17 00:00:00 2001
From: David Himmelstrup <lemmih@gmail.com>
Date: Wed, 28 Jul 2021 10:05:09 +0800
Subject: [PATCH 3/4] Abstract the SmallVec vector backend.

---
 src/bigint.rs        |  6 ++----
 src/bigint/bits.rs   | 23 ++++++++++-------------
 src/biguint.rs       | 10 ++++++----
 src/biguint/shift.rs |  6 +++---
 4 files changed, 21 insertions(+), 24 deletions(-)

diff --git a/src/bigint.rs b/src/bigint.rs
index d8a6a3b8..a4056345 100644
--- a/src/bigint.rs
+++ b/src/bigint.rs
@@ -14,13 +14,11 @@ use core::{i64, u64};
 use num_integer::{Integer, Roots};
 use num_traits::{Num, One, Pow, Signed, Zero};
 
-use smallvec::SmallVec;
-
 use self::Sign::{Minus, NoSign, Plus};
 
 use crate::big_digit::BigDigit;
 use crate::biguint::to_str_radix_reversed;
-use crate::biguint::{BigUint, IntDigits, U32Digits, U64Digits};
+use crate::biguint::{BigDigitVec, BigUint, IntDigits, U32Digits, U64Digits};
 
 mod addition;
 mod division;
@@ -540,7 +538,7 @@ impl IntDigits for BigInt {
         self.data.digits()
     }
     #[inline]
-    fn digits_mut(&mut self) -> &mut SmallVec<[BigDigit; BigUint::INLINED]> {
+    fn digits_mut(&mut self) -> &mut BigDigitVec {
         self.data.digits_mut()
     }
     #[inline]
diff --git a/src/bigint/bits.rs b/src/bigint/bits.rs
index fee8a0f2..412fb6c2 100644
--- a/src/bigint/bits.rs
+++ b/src/bigint/bits.rs
@@ -2,10 +2,7 @@ use super::BigInt;
 use super::Sign::{Minus, NoSign, Plus};
 
 use crate::big_digit::{self, BigDigit, DoubleBigDigit};
-use crate::biguint::IntDigits;
-use crate::BigUint;
-
-use smallvec::SmallVec;
+use crate::biguint::{BigDigitVec, IntDigits};
 
 use core::cmp::Ordering::{Equal, Greater, Less};
 use core::ops::{BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign};
@@ -38,7 +35,7 @@ fn negate_carry(a: BigDigit, acc: &mut DoubleBigDigit) -> BigDigit {
 // + 1 & -ff = ...0 01 & ...f 01 = ...0 01 = + 1
 // +ff & - 1 = ...0 ff & ...f ff = ...0 ff = +ff
 // answer is pos, has length of a
-fn bitand_pos_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
+fn bitand_pos_neg(a: &mut BigDigitVec, b: &[BigDigit]) {
     let mut carry_b = 1;
     for (ai, &bi) in a.iter_mut().zip(b.iter()) {
         let twos_b = negate_carry(bi, &mut carry_b);
@@ -50,7 +47,7 @@ fn bitand_pos_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]
 // - 1 & +ff = ...f ff & ...0 ff = ...0 ff = +ff
 // -ff & + 1 = ...f 01 & ...0 01 = ...0 01 = + 1
 // answer is pos, has length of b
-fn bitand_neg_pos(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
+fn bitand_neg_pos(a: &mut BigDigitVec, b: &[BigDigit]) {
     let mut carry_a = 1;
     for (ai, &bi) in a.iter_mut().zip(b.iter()) {
         let twos_a = negate_carry(*ai, &mut carry_a);
@@ -71,7 +68,7 @@ fn bitand_neg_pos(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]
 // -ff & - 1 = ...f 01 & ...f ff = ...f 01 = - ff
 // -ff & -fe = ...f 01 & ...f 02 = ...f 00 = -100
 // answer is neg, has length of longest with a possible carry
-fn bitand_neg_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
+fn bitand_neg_neg(a: &mut BigDigitVec, b: &[BigDigit]) {
     let mut carry_a = 1;
     let mut carry_b = 1;
     let mut carry_and = 1;
@@ -175,7 +172,7 @@ impl<'a> BitAndAssign<&'a BigInt> for BigInt {
 // + 1 | -ff = ...0 01 | ...f 01 = ...f 01 = -ff
 // +ff | - 1 = ...0 ff | ...f ff = ...f ff = - 1
 // answer is neg, has length of b
-fn bitor_pos_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
+fn bitor_pos_neg(a: &mut BigDigitVec, b: &[BigDigit]) {
     let mut carry_b = 1;
     let mut carry_or = 1;
     for (ai, &bi) in a.iter_mut().zip(b.iter()) {
@@ -204,7 +201,7 @@ fn bitor_pos_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit])
 // - 1 | +ff = ...f ff | ...0 ff = ...f ff = - 1
 // -ff | + 1 = ...f 01 | ...0 01 = ...f 01 = -ff
 // answer is neg, has length of a
-fn bitor_neg_pos(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
+fn bitor_neg_pos(a: &mut BigDigitVec, b: &[BigDigit]) {
     let mut carry_a = 1;
     let mut carry_or = 1;
     for (ai, &bi) in a.iter_mut().zip(b.iter()) {
@@ -226,7 +223,7 @@ fn bitor_neg_pos(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit])
 // - 1 | -ff = ...f ff | ...f 01 = ...f ff = -1
 // -ff | - 1 = ...f 01 | ...f ff = ...f ff = -1
 // answer is neg, has length of shortest
-fn bitor_neg_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
+fn bitor_neg_neg(a: &mut BigDigitVec, b: &[BigDigit]) {
     let mut carry_a = 1;
     let mut carry_b = 1;
     let mut carry_or = 1;
@@ -310,7 +307,7 @@ impl<'a> BitOrAssign<&'a BigInt> for BigInt {
 // + 1 ^ -ff = ...0 01 ^ ...f 01 = ...f 00 = -100
 // +ff ^ - 1 = ...0 ff ^ ...f ff = ...f 00 = -100
 // answer is neg, has length of longest with a possible carry
-fn bitxor_pos_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
+fn bitxor_pos_neg(a: &mut BigDigitVec, b: &[BigDigit]) {
     let mut carry_b = 1;
     let mut carry_xor = 1;
     for (ai, &bi) in a.iter_mut().zip(b.iter()) {
@@ -343,7 +340,7 @@ fn bitxor_pos_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]
 // - 1 ^ +ff = ...f ff ^ ...0 ff = ...f 00 = -100
 // -ff ^ + 1 = ...f 01 ^ ...0 01 = ...f 00 = -100
 // answer is neg, has length of longest with a possible carry
-fn bitxor_neg_pos(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
+fn bitxor_neg_pos(a: &mut BigDigitVec, b: &[BigDigit]) {
     let mut carry_a = 1;
     let mut carry_xor = 1;
     for (ai, &bi) in a.iter_mut().zip(b.iter()) {
@@ -376,7 +373,7 @@ fn bitxor_neg_pos(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]
 // - 1 ^ -ff = ...f ff ^ ...f 01 = ...0 fe = +fe
 // -ff & - 1 = ...f 01 ^ ...f ff = ...0 fe = +fe
 // answer is pos, has length of longest
-fn bitxor_neg_neg(a: &mut SmallVec<[BigDigit; BigUint::INLINED]>, b: &[BigDigit]) {
+fn bitxor_neg_neg(a: &mut BigDigitVec, b: &[BigDigit]) {
     let mut carry_a = 1;
     let mut carry_b = 1;
     for (ai, &bi) in a.iter_mut().zip(b.iter()) {
diff --git a/src/biguint.rs b/src/biguint.rs
index bcbc4968..10d9e30f 100644
--- a/src/biguint.rs
+++ b/src/biguint.rs
@@ -38,9 +38,11 @@ pub use self::iter::{U32Digits, U64Digits};
 
 /// A big unsigned integer type.
 pub struct BigUint {
-    data: SmallVec<[BigDigit; BigUint::INLINED]>,
+    data: BigDigitVec,
 }
 
+pub(crate) type BigDigitVec = SmallVec<[BigDigit; BigUint::INLINED]>;
+
 // Note: derived `Clone` doesn't specialize `clone_from`,
 // but we want to keep the allocation in `data`.
 impl Clone for BigUint {
@@ -550,7 +552,7 @@ pub(crate) fn biguint_from_vec(digits: Vec<BigDigit>) -> BigUint {
 ///
 /// The digits are in little-endian base matching `BigDigit`.
 #[inline]
-pub(crate) fn biguint_from_smallvec(digits: SmallVec<[BigDigit; BigUint::INLINED]>) -> BigUint {
+pub(crate) fn biguint_from_bigdigitvec(digits: BigDigitVec) -> BigUint {
     BigUint { data: digits }.normalized()
 }
 
@@ -1002,7 +1004,7 @@ impl BigUint {
 
 pub(crate) trait IntDigits {
     fn digits(&self) -> &[BigDigit];
-    fn digits_mut(&mut self) -> &mut SmallVec<[BigDigit; BigUint::INLINED]>;
+    fn digits_mut(&mut self) -> &mut BigDigitVec;
     fn normalize(&mut self);
     fn capacity(&self) -> usize;
     fn len(&self) -> usize;
@@ -1014,7 +1016,7 @@ impl IntDigits for BigUint {
         &self.data
     }
     #[inline]
-    fn digits_mut(&mut self) -> &mut SmallVec<[BigDigit; BigUint::INLINED]> {
+    fn digits_mut(&mut self) -> &mut BigDigitVec {
         &mut self.data
     }
     #[inline]
diff --git a/src/biguint/shift.rs b/src/biguint/shift.rs
index b0ef92fb..b390c145 100644
--- a/src/biguint/shift.rs
+++ b/src/biguint/shift.rs
@@ -1,4 +1,4 @@
-use super::{biguint_from_smallvec, BigUint};
+use super::{biguint_from_bigdigitvec, BigUint};
 
 use crate::big_digit;
 use crate::std_alloc::Cow;
@@ -48,7 +48,7 @@ fn biguint_shl2(n: Cow<'_, BigUint>, digits: usize, shift: u8) -> BigUint {
         }
     }
 
-    biguint_from_smallvec(data)
+    biguint_from_bigdigitvec(data)
 }
 
 #[inline]
@@ -89,7 +89,7 @@ fn biguint_shr2(n: Cow<'_, BigUint>, digits: usize, shift: u8) -> BigUint {
         }
     }
 
-    biguint_from_smallvec(data)
+    biguint_from_bigdigitvec(data)
 }
 
 use crate::big_digit::BigDigit;

From a0523f54c23b2d88716f6ddfe834ec4fc97247a8 Mon Sep 17 00:00:00 2001
From: David Himmelstrup <lemmih@gmail.com>
Date: Sat, 28 Aug 2021 16:29:51 +0800
Subject: [PATCH 4/4] Make 'smallvec' an optional dependency with 'union'
 enabled.

---
 Cargo.toml                    |  3 +--
 src/bigrand.rs                |  8 +++---
 src/biguint.rs                | 29 ++++++---------------
 src/biguint/addition.rs       |  8 +++---
 src/biguint/division.rs       |  6 ++---
 src/biguint/monty.rs          |  6 ++---
 src/biguint/multiplication.rs |  6 ++---
 src/biguint/shift.rs          |  6 ++---
 src/biguint/subtraction.rs    |  4 ++-
 src/lib.rs                    | 49 +++++++++++++++++++++++++++++++++++
 10 files changed, 81 insertions(+), 44 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 292fed8a..03afc7ca 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,7 +16,6 @@ edition = "2018"
 
 [features]
 default = ["std"]
-union = ["smallvec/union"]
 std = ["num-integer/std", "num-traits/std"]
 
 [package.metadata.docs.rs]
@@ -39,7 +38,7 @@ harness = false
 name = "shootout-pidigits"
 
 [dependencies]
-smallvec = { version = "1.6.1" }
+smallvec = { version = "1.6.1", optional = true, features = ["union"] }
 
 [dependencies.num-integer]
 version = "0.1.42"
diff --git a/src/bigrand.rs b/src/bigrand.rs
index 2e62d235..8dd42ec8 100644
--- a/src/bigrand.rs
+++ b/src/bigrand.rs
@@ -8,7 +8,7 @@ use crate::BigInt;
 use crate::BigUint;
 use crate::Sign::*;
 
-use crate::biguint::biguint_from_smallvec;
+use crate::biguint::biguint_from_vec;
 
 use num_integer::Integer;
 use num_traits::{ToPrimitive, Zero};
@@ -61,20 +61,18 @@ impl<R: Rng + ?Sized> RandBigInt for R {
 
     #[cfg(u64_digit)]
     fn gen_biguint(&mut self, bit_size: u64) -> BigUint {
-        use smallvec::smallvec;
-
         let (digits, rem) = bit_size.div_rem(&64);
         let len = (digits + (rem > 0) as u64)
             .to_usize()
             .expect("capacity overflow");
-        let mut data = smallvec![0u64; len];
+        let mut data = vec![0u64; len];
         gen_bits(self, data.as_mut_slice(), rem);
         #[cfg(target_endian = "big")]
         for digit in &mut data {
             // swap u32 digits into u64 endianness
             *digit = (*digit << 32) | (*digit >> 32);
         }
-        biguint_from_smallvec(data)
+        biguint_from_vec(data)
     }
 
     fn gen_bigint(&mut self, bit_size: u64) -> BigInt {
diff --git a/src/biguint.rs b/src/biguint.rs
index 10d9e30f..18d3bbce 100644
--- a/src/biguint.rs
+++ b/src/biguint.rs
@@ -13,7 +13,7 @@ use core::{u32, u64, u8};
 use num_integer::{Integer, Roots};
 use num_traits::{Num, One, Pow, ToPrimitive, Unsigned, Zero};
 
-use smallvec::{smallvec, SmallVec};
+use crate::backend;
 
 mod addition;
 mod division;
@@ -41,28 +41,15 @@ pub struct BigUint {
     data: BigDigitVec,
 }
 
-pub(crate) type BigDigitVec = SmallVec<[BigDigit; BigUint::INLINED]>;
+pub(crate) type BigDigitVec = backend::Vec<BigDigit>;
 
 // Note: derived `Clone` doesn't specialize `clone_from`,
 // but we want to keep the allocation in `data`.
 impl Clone for BigUint {
     #[inline]
     fn clone(&self) -> Self {
-        // #[inline(never)]
-        // fn cold_clone(a: &BigUint) -> BigUint {
-        //     BigUint {
-        //         data: SmallVec::from_slice(&a.data), // This uses memcpy rather than repeated calls to .clone().
-        //     }
-        // }
-        // if self.data.spilled() {
-        //     cold_clone(self)
-        // } else {
-        //     BigUint {
-        //         data: unsafe { std::ptr::read(&self.data) },
-        //     }
-        // }
         BigUint {
-            data: SmallVec::from_slice(&self.data), // This uses memcpy rather than repeated calls to .clone().
+            data: backend::clone(&self.data),
         }
     }
 
@@ -164,7 +151,7 @@ impl Zero for BigUint {
     #[inline]
     fn zero() -> BigUint {
         BigUint {
-            data: SmallVec::new(),
+            data: backend::Vec::new(),
         }
     }
 
@@ -182,7 +169,9 @@ impl Zero for BigUint {
 impl One for BigUint {
     #[inline]
     fn one() -> BigUint {
-        BigUint { data: smallvec![1] }
+        BigUint {
+            data: backend::vec![1],
+        }
     }
 
     #[inline]
@@ -543,7 +532,7 @@ pub trait ToBigUint {
 #[inline]
 pub(crate) fn biguint_from_vec(digits: Vec<BigDigit>) -> BigUint {
     BigUint {
-        data: SmallVec::from_vec(digits),
+        data: backend::from_vec(digits),
     }
     .normalized()
 }
@@ -557,8 +546,6 @@ pub(crate) fn biguint_from_bigdigitvec(digits: BigDigitVec) -> BigUint {
 }
 
 impl BigUint {
-    pub(crate) const INLINED: usize = 2;
-
     /// Creates and initializes a `BigUint`.
     ///
     /// The base 2<sup>32</sup> digits are ordered least significant digit first.
diff --git a/src/biguint/addition.rs b/src/biguint/addition.rs
index a5262cd7..4a8c48d1 100644
--- a/src/biguint/addition.rs
+++ b/src/biguint/addition.rs
@@ -5,6 +5,8 @@ use super::{BigUint, IntDigits};
 use crate::big_digit::{self, BigDigit};
 use crate::UsizePromotion;
 
+use crate::backend;
+
 use core::iter::Sum;
 use core::ops::{Add, AddAssign};
 use num_traits::{CheckedAdd, Zero};
@@ -91,7 +93,7 @@ impl<'a> Add<&'a BigUint> for BigUint {
 
     #[inline]
     fn add(mut self, other: &BigUint) -> BigUint {
-        if !other.data.spilled() {
+        if backend::inlined(&other.data) {
             use num_traits::ToPrimitive;
             if let Some(x) = other.to_u64() {
                 return self + x;
@@ -104,7 +106,7 @@ impl<'a> Add<&'a BigUint> for BigUint {
 impl<'a> AddAssign<&'a BigUint> for BigUint {
     #[inline]
     fn add_assign(&mut self, other: &BigUint) {
-        if !other.data.spilled() {
+        if backend::inlined(&other.data) {
             use num_traits::ToPrimitive;
             if let Some(x) = other.to_u64() {
                 self.add_assign(x);
@@ -163,7 +165,7 @@ impl Add<u64> for BigUint {
     #[inline]
     fn add(mut self, other: u64) -> BigUint {
         use num_traits::ToPrimitive;
-        if !self.data.spilled() {
+        if backend::inlined(&self.data) {
             if let Some(x) = self.to_u64() {
                 return BigUint::from(x as u128 + other as u128);
             }
diff --git a/src/biguint/division.rs b/src/biguint/division.rs
index a488babe..0d6aa15c 100644
--- a/src/biguint/division.rs
+++ b/src/biguint/division.rs
@@ -6,7 +6,7 @@ use super::BigUint;
 use crate::big_digit::{self, BigDigit, DoubleBigDigit};
 use crate::UsizePromotion;
 
-use smallvec::smallvec;
+use crate::backend;
 
 use core::cmp::Ordering::{Equal, Greater, Less};
 use core::mem;
@@ -47,7 +47,7 @@ pub(super) fn div_rem_digit(mut a: BigUint, b: BigDigit) -> (BigUint, BigDigit)
         panic!("attempt to divide by zero")
     }
 
-    if !a.data.spilled() {
+    if backend::inlined(&a.data) {
         if let Some(x) = a.to_u64() {
             return (BigUint::from(x / b), x % b);
         }
@@ -248,7 +248,7 @@ fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) {
 
     let q_len = a.data.len() - b.data.len() + 1;
     let mut q = BigUint {
-        data: smallvec![0; q_len],
+        data: backend::vec![0; q_len],
     };
 
     for j in (0..q_len).rev() {
diff --git a/src/biguint/monty.rs b/src/biguint/monty.rs
index 160aa7a8..ff7796ec 100644
--- a/src/biguint/monty.rs
+++ b/src/biguint/monty.rs
@@ -6,7 +6,7 @@ use num_traits::{One, Zero};
 use crate::big_digit::{self, BigDigit, DoubleBigDigit, SignedDoubleBigDigit};
 use crate::biguint::BigUint;
 
-use smallvec::SmallVec;
+use crate::backend;
 
 struct MontyReducer {
     n0inv: BigDigit,
@@ -77,13 +77,13 @@ fn montgomery(x: &BigUint, y: &BigUint, m: &BigUint, k: BigDigit, n: usize) -> B
     }
 
     if c == 0 {
-        z.data = SmallVec::from_vec(z.data[n..].to_vec());
+        z.data = backend::from_slice(&z.data[n..]);
     } else {
         {
             let (mut first, second) = z.data.split_at_mut(n);
             sub_vv(&mut first, &second, &m.data);
         }
-        z.data = SmallVec::from_vec(z.data[..n].to_vec());
+        z.data = backend::from_slice(&z.data[..n]);
     }
 
     z
diff --git a/src/biguint/multiplication.rs b/src/biguint/multiplication.rs
index 82b73a14..be1007b8 100644
--- a/src/biguint/multiplication.rs
+++ b/src/biguint/multiplication.rs
@@ -8,7 +8,7 @@ use crate::big_digit::{self, BigDigit, DoubleBigDigit};
 use crate::Sign::{self, Minus, NoSign, Plus};
 use crate::{BigInt, UsizePromotion};
 
-use smallvec::smallvec;
+use crate::backend;
 
 use core::cmp::Ordering;
 use core::iter::Product;
@@ -172,7 +172,7 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) {
         // appropriately here: x1.len() >= x0.len and y1.len() >= y0.len():
         let len = x1.len() + y1.len();
         let mut p = BigUint {
-            data: smallvec![0; len],
+            data: backend::vec![0; len],
         };
 
         // p2 = x1 * y1
@@ -350,7 +350,7 @@ fn mac3(mut acc: &mut [BigDigit], mut b: &[BigDigit], mut c: &[BigDigit]) {
 fn mul3(x: &[BigDigit], y: &[BigDigit]) -> BigUint {
     let len = x.len() + y.len();
     let mut prod = BigUint {
-        data: smallvec![0; len],
+        data: backend::vec![0; len],
     };
 
     mac3(&mut prod.data, x, y);
diff --git a/src/biguint/shift.rs b/src/biguint/shift.rs
index b390c145..265ca2fe 100644
--- a/src/biguint/shift.rs
+++ b/src/biguint/shift.rs
@@ -3,7 +3,7 @@ use super::{biguint_from_bigdigitvec, BigUint};
 use crate::big_digit;
 use crate::std_alloc::Cow;
 
-use smallvec::SmallVec;
+use crate::backend;
 
 use core::mem;
 use core::ops::{Shl, ShlAssign, Shr, ShrAssign};
@@ -28,7 +28,7 @@ fn biguint_shl2(n: Cow<'_, BigUint>, digits: usize, shift: u8) -> BigUint {
         0 => n.into_owned().data,
         _ => {
             let len = digits.saturating_add(n.data.len() + 1);
-            let mut data = SmallVec::with_capacity(len);
+            let mut data = backend::Vec::with_capacity(len);
             data.resize(digits, 0);
             data.extend(n.data.iter().copied());
             data
@@ -72,7 +72,7 @@ fn biguint_shr2(n: Cow<'_, BigUint>, digits: usize, shift: u8) -> BigUint {
         return n;
     }
     let mut data = match n {
-        Cow::Borrowed(n) => SmallVec::from_slice(&n.data[digits..]),
+        Cow::Borrowed(n) => backend::from_slice(&n.data[digits..]),
         Cow::Owned(mut n) => {
             n.data.drain(..digits);
             n.data
diff --git a/src/biguint/subtraction.rs b/src/biguint/subtraction.rs
index f98f0e3e..0b73aa04 100644
--- a/src/biguint/subtraction.rs
+++ b/src/biguint/subtraction.rs
@@ -5,6 +5,8 @@ use super::BigUint;
 use crate::big_digit::{self, BigDigit};
 use crate::UsizePromotion;
 
+use crate::backend;
+
 use core::cmp::Ordering::{Equal, Greater, Less};
 use core::ops::{Sub, SubAssign};
 use num_traits::{CheckedSub, Zero};
@@ -129,7 +131,7 @@ impl<'a> Sub<BigUint> for &'a BigUint {
 
     fn sub(self, mut other: BigUint) -> BigUint {
         use num_traits::ToPrimitive;
-        if !self.data.spilled() {
+        if backend::inlined(&self.data) {
             if let Some(x) = self.to_u64() {
                 if let Some(y) = other.to_u64() {
                     return BigUint::from(x - y);
diff --git a/src/lib.rs b/src/lib.rs
index f7b5638b..057eac74 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -98,6 +98,8 @@ mod std_alloc {
     #[cfg(any(feature = "quickcheck"))]
     pub(crate) use std::boxed::Box;
     pub(crate) use std::string::String;
+    #[cfg(not(feature = "smallvec"))]
+    pub(crate) use std::vec;
     pub(crate) use std::vec::Vec;
 }
 
@@ -111,6 +113,8 @@ mod std_alloc {
     #[cfg(any(feature = "quickcheck"))]
     pub(crate) use alloc::boxed::Box;
     pub(crate) use alloc::string::String;
+    #[cfg(not(feature = "smallvec"))]
+    pub(crate) use alloc::vec;
     pub(crate) use alloc::vec::Vec;
 }
 
@@ -118,6 +122,51 @@ use core::fmt;
 #[cfg(feature = "std")]
 use std::error::Error;
 
+#[cfg(feature = "smallvec")]
+mod backend {
+    //! Digit storage for the `smallvec` feature: a `SmallVec` exposed under `Vec`-style names.
+    const INLINED: usize = 2; // array length used for the `SmallVec` backing type below
+    // Re-export `smallvec!` as `vec!` and alias `Vec<T>`, matching the names of the `Vec` backend.
+    pub(crate) use smallvec::{smallvec as vec, SmallVec};
+    pub(crate) type Vec<T> = SmallVec<[T; INLINED]>;
+    /// Returns a new vector holding a copy of the elements of `vec`.
+    pub(crate) fn clone<T: Copy>(vec: &Vec<T>) -> Vec<T> {
+        Vec::from_slice(vec) // This uses memcpy rather than repeated calls to .clone().
+    }
+    /// Builds a backend vector by copying the elements of `slice`.
+    pub(crate) fn from_slice<T: Copy>(slice: &[T]) -> Vec<T> {
+        Vec::from_slice(slice)
+    }
+    /// Converts an owned `std_alloc::Vec` into a backend vector via `SmallVec::from_vec`.
+    pub(crate) fn from_vec<T: Copy>(vec: crate::std_alloc::Vec<T>) -> Vec<T> {
+        Vec::from_vec(vec)
+    }
+    /// Returns `true` while `vec` has not spilled, i.e. `SmallVec::spilled()` is `false`.
+    pub(crate) fn inlined<T: Copy>(vec: &Vec<T>) -> bool {
+        !vec.spilled()
+    }
+}
+
+#[cfg(not(feature = "smallvec"))]
+mod backend {
+    pub(crate) use crate::std_alloc::{vec, Vec}; // plain `Vec` storage when `smallvec` is off
+    pub(crate) fn clone<T: Copy>(vec: &Vec<T>) -> Vec<T> {
+        vec.clone() // same signature as the `smallvec` backend's `clone`
+    }
+    /// Builds a `Vec` by copying the elements of `slice`.
+    pub(crate) fn from_slice<T: Copy>(slice: &[T]) -> Vec<T> {
+        slice.to_vec()
+    }
+    /// Returns `vec` unchanged; the backend vector already is a `std_alloc::Vec`.
+    pub(crate) fn from_vec<T: Copy>(vec: Vec<T>) -> Vec<T> {
+        vec
+    }
+    /// Always `false`, so branches guarded by `backend::inlined` are never taken here.
+    pub(crate) fn inlined<T: Copy>(_vec: &Vec<T>) -> bool {
+        false
+    }
+}
+
 #[macro_use]
 mod macros;