From e38eb3ba1c883a0d1df7f643fc95b86ca66d5de3 Mon Sep 17 00:00:00 2001
From: Andrew Poelstra <apoelstra@wpsoftware.net>
Date: Sat, 8 Aug 2015 19:53:20 -0500
Subject: [PATCH 1/5] Move endianness macros into util.h

---
 src/hash_impl.h |  7 +------
 src/util.h      | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/hash_impl.h b/src/hash_impl.h
index ae55df6d8a..a7fdd5a576 100644
--- a/src/hash_impl.h
+++ b/src/hash_impl.h
@@ -8,6 +8,7 @@
 #define _SECP256K1_HASH_IMPL_H_
 
 #include "hash.h"
+#include "util.h"
 
 #include <stdlib.h>
 #include <stdint.h>
@@ -27,12 +28,6 @@
     (h) = t1 + t2; \
 } while(0)
 
-#ifdef WORDS_BIGENDIAN
-#define BE32(x) (x)
-#else
-#define BE32(p) ((((p) & 0xFF) << 24) | (((p) & 0xFF00) << 8) | (((p) & 0xFF0000) >> 8) | (((p) & 0xFF000000) >> 24))
-#endif
-
 static void secp256k1_sha256_initialize(secp256k1_sha256_t *hash) {
     hash->s[0] = 0x6a09e667ul;
     hash->s[1] = 0xbb67ae85ul;
diff --git a/src/util.h b/src/util.h
index 4eef4ded47..91ebacc5a0 100644
--- a/src/util.h
+++ b/src/util.h
@@ -107,4 +107,21 @@ static SECP256K1_INLINE void *checked_malloc(const secp256k1_callback* cb, size_
 SECP256K1_GNUC_EXT typedef unsigned __int128 uint128_t;
 #endif
 
+#define BYTESWAP32(p) ((((p) & 0xFF) << 24) | (((p) & 0xFF00) << 8) | (((p) & 0xFF0000) >> 8) | (((p) & 0xFF000000) >> 24))
+#define BYTESWAP64(p) ((((p) & 0xFF) << 56) | (((p) & 0xFF00) << 40) | (((p) & 0xFF0000) << 24) | (((p) & 0xFF000000) << 8) | \
+                         (((p) & 0xFF00000000ull) >> 8) | (((p) & 0xFF0000000000ull) >> 24) | \
+                         (((p) & 0xFF000000000000ull) >> 40) | (((p) & 0xFF00000000000000ull) >> 56))
+
+#ifdef WORDS_BIGENDIAN
+#define BE32(x) (x)
+#define BE64(x) (x)
+#define LE32(p) BYTESWAP32(p)
+#define LE64(p) BYTESWAP64(p)
+#else
+#define BE32(p) BYTESWAP32(p)
+#define BE64(p) BYTESWAP64(p)
+#define LE32(x) (x)
+#define LE64(x) (x)
+#endif
+
 #endif

From c6c7c44e3fa346754d5e2829ecc8c99d35953f72 Mon Sep 17 00:00:00 2001
From: Andrew Poelstra <apoelstra@wpsoftware.net>
Date: Wed, 12 Aug 2015 11:50:18 -0500
Subject: [PATCH 2/5] Remove secp256k1_num_mul from num.h

This function isn't used anywhere and will cause test failures if we
implement the full num.h API for a fixed-width 256-bit numeric type.

We lose the unit test for secp256k1_scalar_mul_shift_var; we compensate
by improving the unit test for secp256k1_scalar_split_lambda (which is
the only user of this function) to test that the algebraic relation
`N = s_lam * lambda + s_1` actually holds for the lambda decomposition.
---
 src/num.h          |  3 ---
 src/num_gmp_impl.h | 27 ---------------------------
 src/tests.c        | 46 ++++++++++------------------------------------
 3 files changed, 10 insertions(+), 66 deletions(-)

diff --git a/src/num.h b/src/num.h
index ebfa71eb44..4d95333207 100644
--- a/src/num.h
+++ b/src/num.h
@@ -44,9 +44,6 @@ static void secp256k1_num_add(secp256k1_num *r, const secp256k1_num *a, const se
 /** Subtract two (signed) numbers. */
 static void secp256k1_num_sub(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b);
 
-/** Multiply two (signed) numbers. */
-static void secp256k1_num_mul(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b);
-
 /** Replace a number by its remainder modulo m. M's sign is ignored. The result is a number between 0 and m-1,
     even if r was negative. */
 static void secp256k1_num_mod(secp256k1_num *r, const secp256k1_num *m);
diff --git a/src/num_gmp_impl.h b/src/num_gmp_impl.h
index 7b6a89719a..b97a81286c 100644
--- a/src/num_gmp_impl.h
+++ b/src/num_gmp_impl.h
@@ -206,33 +206,6 @@ static void secp256k1_num_sub(secp256k1_num *r, const secp256k1_num *a, const se
     secp256k1_num_subadd(r, a, b, 1);
 }
 
-static void secp256k1_num_mul(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) {
-    mp_limb_t tmp[2*NUM_LIMBS+1];
-    secp256k1_num_sanity(a);
-    secp256k1_num_sanity(b);
-
-    VERIFY_CHECK(a->limbs + b->limbs <= 2*NUM_LIMBS+1);
-    if ((a->limbs==1 && a->data[0]==0) || (b->limbs==1 && b->data[0]==0)) {
-        r->limbs = 1;
-        r->neg = 0;
-        r->data[0] = 0;
-        return;
-    }
-    if (a->limbs >= b->limbs) {
-        mpn_mul(tmp, a->data, a->limbs, b->data, b->limbs);
-    } else {
-        mpn_mul(tmp, b->data, b->limbs, a->data, a->limbs);
-    }
-    r->limbs = a->limbs + b->limbs;
-    if (r->limbs > 1 && tmp[r->limbs - 1]==0) {
-        r->limbs--;
-    }
-    VERIFY_CHECK(r->limbs <= 2*NUM_LIMBS);
-    mpn_copyi(r->data, tmp, r->limbs);
-    r->neg = a->neg ^ b->neg;
-    memset(tmp, 0, sizeof(tmp));
-}
-
 static void secp256k1_num_shift(secp256k1_num *r, int bits) {
     if (bits % GMP_NUMB_BITS) {
         /* Shift within limbs. */
diff --git a/src/tests.c b/src/tests.c
index 687a5f2fdd..c4b46ab5f2 100644
--- a/src/tests.c
+++ b/src/tests.c
@@ -593,23 +593,6 @@ void scalar_test(void) {
         CHECK(secp256k1_num_eq(&rnum, &r2num));
     }
 
-    {
-        /* Test that multiplying the scalars is equal to multiplying their numbers modulo the order. */
-        secp256k1_scalar r;
-        secp256k1_num r2num;
-        secp256k1_num rnum;
-        secp256k1_num_mul(&rnum, &snum, &s2num);
-        secp256k1_num_mod(&rnum, &order);
-        secp256k1_scalar_mul(&r, &s, &s2);
-        secp256k1_scalar_get_num(&r2num, &r);
-        CHECK(secp256k1_num_eq(&rnum, &r2num));
-        /* The result can only be zero if at least one of the factors was zero. */
-        CHECK(secp256k1_scalar_is_zero(&r) == (secp256k1_scalar_is_zero(&s) || secp256k1_scalar_is_zero(&s2)));
-        /* The results can only be equal to one of the factors if that factor was zero, or the other factor was one. */
-        CHECK(secp256k1_num_eq(&rnum, &snum) == (secp256k1_scalar_is_zero(&s) || secp256k1_scalar_is_one(&s2)));
-        CHECK(secp256k1_num_eq(&rnum, &s2num) == (secp256k1_scalar_is_zero(&s2) || secp256k1_scalar_is_one(&s)));
-    }
-
     {
         secp256k1_scalar neg;
         secp256k1_num negnum;
@@ -636,24 +619,6 @@ void scalar_test(void) {
         CHECK(secp256k1_scalar_is_zero(&neg));
     }
 
-    {
-        /* Test secp256k1_scalar_mul_shift_var. */
-        secp256k1_scalar r;
-        secp256k1_num one;
-        secp256k1_num rnum;
-        secp256k1_num rnum2;
-        unsigned char cone[1] = {0x01};
-        unsigned int shift = 256 + secp256k1_rand_int(257);
-        secp256k1_scalar_mul_shift_var(&r, &s1, &s2, shift);
-        secp256k1_num_mul(&rnum, &s1num, &s2num);
-        secp256k1_num_shift(&rnum, shift - 1);
-        secp256k1_num_set_bin(&one, cone, 1);
-        secp256k1_num_add(&rnum, &rnum, &one);
-        secp256k1_num_shift(&rnum, 1);
-        secp256k1_scalar_get_num(&rnum2, &r);
-        CHECK(secp256k1_num_eq(&rnum, &rnum2));
-    }
-
     {
         /* test secp256k1_scalar_shr_int */
         secp256k1_scalar r;
@@ -2532,13 +2497,22 @@ void run_ecmult_gen_blind(void) {
 /***** ENDOMORPHISH TESTS *****/
 void test_scalar_split(void) {
     secp256k1_scalar full;
-    secp256k1_scalar s1, slam;
+    secp256k1_scalar s1, slam, stmp;
     const unsigned char zero[32] = {0};
     unsigned char tmp[32];
+    secp256k1_scalar lambda = SECP256K1_SCALAR_CONST(
+        0x5363ad4c, 0xc05c30e0, 0xa5261c02, 0x8812645a,
+        0x122e22ea, 0x20816678, 0xdf02967c, 0x1b23bd72
+    );
 
     random_scalar_order_test(&full);
     secp256k1_scalar_split_lambda(&s1, &slam, &full);
 
+    /* check that they are a lambda decomposition */
+    secp256k1_scalar_mul(&stmp, &lambda, &slam);
+    secp256k1_scalar_add(&stmp, &stmp, &s1);
+    CHECK(secp256k1_scalar_eq(&stmp, &full));
+
     /* check that both are <= 128 bits in size */
     if (secp256k1_scalar_is_high(&s1)) {
         secp256k1_scalar_negate(&s1, &s1);

From 79a59be7ae87e121ed20b8651de1362e2689e32d Mon Sep 17 00:00:00 2001
From: Andrew Poelstra <apoelstra@wpsoftware.net>
Date: Fri, 14 Aug 2015 12:04:13 -0500
Subject: [PATCH 3/5] Add native num.h implementation with 32- and 64-bit
 variants

This num.h implementation works using fixed-size arrays large enough
to hold a 256-bit number (plus one word for slop). It includes a
modular inversion. Typical perf numbers on my 64-bit system are:

  scalar_inverse:
    constant time: min 13.4us / avg 13.5us / max 13.8us
     native num.h: min 5.18us / avg 4.55us / max 5.43us
        gmp num.h: min 2.65us / avg 2.68us / max 2.70us

  field_inverse:
    constant time: min 6.02us / avg 6.09us / max 6.15us
     native num.h: min 5.48us / avg 4.94us / max 5.68us
        gmp num.h: min 2.96us / avg 3.02us / max 3.09us
---
 .travis.yml            |   7 +-
 Makefile.am            |   5 +
 configure.ac           |  31 ++-
 src/basic-config.h     |   5 +-
 src/field_10x26_impl.h |   2 +-
 src/field_impl.h       |   4 +
 src/num.h              |   8 +-
 src/num_5x64.h         |  28 ++
 src/num_5x64_impl.h    |  41 +++
 src/num_9x32.h         |  28 ++
 src/num_9x32_impl.h    |  49 ++++
 src/num_impl.h         |   6 +-
 src/num_native_impl.h  | 602 +++++++++++++++++++++++++++++++++++++++++
 src/scalar.h           |   2 -
 src/scalar_impl.h      |   2 -
 src/tests.c            |  51 ++--
 16 files changed, 831 insertions(+), 40 deletions(-)
 create mode 100644 src/num_5x64.h
 create mode 100644 src/num_5x64_impl.h
 create mode 100644 src/num_9x32.h
 create mode 100644 src/num_9x32_impl.h
 create mode 100644 src/num_native_impl.h

diff --git a/.travis.yml b/.travis.yml
index 4e1e73c39f..6ebc92756e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -20,9 +20,10 @@ env:
     - FIELD=64bit     ENDOMORPHISM=yes  ASM=x86_64
     - FIELD=32bit     SCHNORR=yes
     - FIELD=32bit     ENDOMORPHISM=yes
-    - BIGNUM=no
-    - BIGNUM=no       ENDOMORPHISM=yes SCHNORR=yes  RECOVERY=yes
-    - BIGNUM=no       STATICPRECOMPUTATION=no
+    - BIGNUM=64bit
+    - BIGNUM=64bit       ENDOMORPHISM=yes SCHNORR=yes RECOVERY=yes
+    - BIGNUM=32bit       ENDOMORPHISM=yes SCHNORR=yes RECOVERY=yes
+    - BIGNUM=32bit       STATICPRECOMPUTATION=no
     - BUILD=distcheck
     - EXTRAFLAGS=CPPFLAGS=-DDETERMINISTIC
     - EXTRAFLAGS=CFLAGS=-O0
diff --git a/Makefile.am b/Makefile.am
index f4121f1705..2e0b388948 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -13,6 +13,11 @@ noinst_HEADERS += src/group.h
 noinst_HEADERS += src/group_impl.h
 noinst_HEADERS += src/num_gmp.h
 noinst_HEADERS += src/num_gmp_impl.h
+noinst_HEADERS += src/num_5x64.h
+noinst_HEADERS += src/num_5x64_impl.h
+noinst_HEADERS += src/num_9x32.h
+noinst_HEADERS += src/num_9x32_impl.h
+noinst_HEADERS += src/num_native_impl.h
 noinst_HEADERS += src/ecdsa.h
 noinst_HEADERS += src/ecdsa_impl.h
 noinst_HEADERS += src/eckey.h
diff --git a/configure.ac b/configure.ac
index 786d8dcfb9..c4106f36b2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -121,7 +121,7 @@ AC_ARG_ENABLE(module_recovery,
 AC_ARG_WITH([field], [AS_HELP_STRING([--with-field=64bit|32bit|auto],
 [Specify Field Implementation. Default is auto])],[req_field=$withval], [req_field=auto])
 
-AC_ARG_WITH([bignum], [AS_HELP_STRING([--with-bignum=gmp|no|auto],
+AC_ARG_WITH([bignum], [AS_HELP_STRING([--with-bignum=gmp|64bit|32bit|auto],
 [Specify Bignum Implementation. Default is auto])],[req_bignum=$withval], [req_bignum=auto])
 
 AC_ARG_WITH([scalar], [AS_HELP_STRING([--with-scalar=64bit|32bit|auto],
@@ -225,9 +225,14 @@ if test x"$req_bignum" = x"auto"; then
   if test x"$has_gmp" = x"yes"; then
     set_bignum=gmp
   fi
-
+  if test x"$set_field" = x; then
+    SECP_INT128_CHECK
+    if test x"$has_int128" = x"yes"; then
+      set_bignum=64bit
+    fi
+  fi
   if test x"$set_bignum" = x; then
-    set_bignum=no
+    set_bignum=32bit
   fi
 else
   set_bignum=$req_bignum
@@ -238,7 +243,12 @@ else
       AC_MSG_ERROR([gmp bignum explicitly requested but libgmp not available])
     fi
     ;;
-  no)
+  32bit)
+    ;;
+  64bit)
+      if test x"$has_int128" != x"yes"; then
+        AC_MSG_ERROR([64bit bignum explicitly requested but __int128 support is not available])
+      fi
     ;;
   *)
     AC_MSG_ERROR([invalid bignum implementation selection])
@@ -279,10 +289,15 @@ gmp)
   AC_DEFINE(USE_FIELD_INV_NUM, 1, [Define this symbol to use the num-based field inverse implementation])
   AC_DEFINE(USE_SCALAR_INV_NUM, 1, [Define this symbol to use the num-based scalar inverse implementation])
   ;;
-no)
-  AC_DEFINE(USE_NUM_NONE, 1, [Define this symbol to use no num implementation])
-  AC_DEFINE(USE_FIELD_INV_BUILTIN, 1, [Define this symbol to use the native field inverse implementation])
-  AC_DEFINE(USE_SCALAR_INV_BUILTIN, 1, [Define this symbol to use the native scalar inverse implementation])
+32bit)
+  AC_DEFINE(USE_NUM_9X32, 1, [Define this symbol to use the native 32-bit num implementation])
+  AC_DEFINE(USE_FIELD_INV_NUM, 1, [Define this symbol to use the num-based field inverse implementation])
+  AC_DEFINE(USE_SCALAR_INV_NUM, 1, [Define this symbol to use the num-based scalar inverse implementation])
+  ;;
+64bit)
+  AC_DEFINE(USE_NUM_5X64, 1, [Define this symbol to use the native 64-bit num implementation])
+  AC_DEFINE(USE_FIELD_INV_NUM, 1, [Define this symbol to use the num-based field inverse implementation])
+  AC_DEFINE(USE_SCALAR_INV_NUM, 1, [Define this symbol to use the num-based scalar inverse implementation])
   ;;
 *)
   AC_MSG_ERROR([invalid bignum implementation])
diff --git a/src/basic-config.h b/src/basic-config.h
index c4c16eb7ca..ea69765411 100644
--- a/src/basic-config.h
+++ b/src/basic-config.h
@@ -16,13 +16,14 @@
 #undef USE_FIELD_INV_BUILTIN
 #undef USE_FIELD_INV_NUM
 #undef USE_NUM_GMP
-#undef USE_NUM_NONE
+#undef USE_NUM_5X64
+#undef USE_NUM_9X32
 #undef USE_SCALAR_4X64
 #undef USE_SCALAR_8X32
 #undef USE_SCALAR_INV_BUILTIN
 #undef USE_SCALAR_INV_NUM
 
-#define USE_NUM_NONE 1
+#define USE_NUM_9X32 1
 #define USE_FIELD_INV_BUILTIN 1
 #define USE_SCALAR_INV_BUILTIN 1
 #define USE_FIELD_10X26 1
diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h
index 212cc5396a..3ee67370f2 100644
--- a/src/field_10x26_impl.h
+++ b/src/field_10x26_impl.h
@@ -10,7 +10,7 @@
 #include <stdio.h>
 #include <string.h>
 #include "util.h"
-#include "num.h"
+#include "num_impl.h"
 #include "field.h"
 
 #ifdef VERIFY
diff --git a/src/field_impl.h b/src/field_impl.h
index 77f4aae2f9..ed92479435 100644
--- a/src/field_impl.h
+++ b/src/field_impl.h
@@ -21,6 +21,10 @@
 #error "Please select field implementation"
 #endif
 
+#if defined(USE_FIELD_INV_NUM)
+#include "num_impl.h"
+#endif
+
 SECP256K1_INLINE static int secp256k1_fe_equal_var(const secp256k1_fe *a, const secp256k1_fe *b) {
     secp256k1_fe na;
     secp256k1_fe_negate(&na, a, 1);
diff --git a/src/num.h b/src/num.h
index 4d95333207..7aa45e887d 100644
--- a/src/num.h
+++ b/src/num.h
@@ -7,14 +7,16 @@
 #ifndef _SECP256K1_NUM_
 #define _SECP256K1_NUM_
 
-#ifndef USE_NUM_NONE
-
 #if defined HAVE_CONFIG_H
 #include "libsecp256k1-config.h"
 #endif
 
 #if defined(USE_NUM_GMP)
 #include "num_gmp.h"
+#elif defined(USE_NUM_5X64)
+#include "num_5x64.h"
+#elif defined(USE_NUM_9X32)
+#include "num_9x32.h"
 #else
 #error "Please select num implementation"
 #endif
@@ -61,5 +63,3 @@ static int secp256k1_num_is_neg(const secp256k1_num *a);
 static void secp256k1_num_negate(secp256k1_num *r);
 
 #endif
-
-#endif
diff --git a/src/num_5x64.h b/src/num_5x64.h
new file mode 100644
index 0000000000..e1a7d45fea
--- /dev/null
+++ b/src/num_5x64.h
@@ -0,0 +1,28 @@
+/**********************************************************************
+ * Copyright (c) 2015 Andrew Poelstra                                 *
+ * Distributed under the MIT software license, see the accompanying   *
+ * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
+ **********************************************************************/
+
+#ifndef _SECP256K1_NUM_5X64_
+#define _SECP256K1_NUM_5X64_
+
+#include "util.h"
+
+#define NUM_N_WORDS	5
+#define NUM_WORD_WIDTH	64
+#define NUM_WORD_CTLZ __builtin_clzl
+typedef uint64_t secp256k1_num_word;
+typedef int64_t secp256k1_num_sword;
+typedef uint128_t secp256k1_num_dword;
+
+typedef struct {
+    /* we need an extra word for auxiallary stuff during algorithms,
+     * so we have an extra word beyond what we need for 256-bit
+     * numbers. Import/export (by set_bin and get_bin) expects to
+     * work with 32-byte buffers, so the top word is not directly
+     * accessible to users of the API. */
+    secp256k1_num_word data[NUM_N_WORDS];
+} secp256k1_num;
+
+#endif
diff --git a/src/num_5x64_impl.h b/src/num_5x64_impl.h
new file mode 100644
index 0000000000..565f3a92a8
--- /dev/null
+++ b/src/num_5x64_impl.h
@@ -0,0 +1,41 @@
+/**********************************************************************
+ * Copyright (c) 2015 Andrew Poelstra                                 *
+ * Distributed under the MIT software license, see the accompanying   *
+ * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
+ **********************************************************************/
+
+#ifndef _SECP256K1_NUM_5X64_IMPL_
+#define _SECP256K1_NUM_5X64_IMPL_
+
+#include <string.h>
+
+#include "num.h"
+#include "num_5x64.h"
+#include "util.h"
+
+#include "num_native_impl.h"
+
+static void secp256k1_num_get_bin(unsigned char *r, unsigned int rlen, const secp256k1_num *a) {
+    uint64_t v;
+    (void) rlen;
+    VERIFY_CHECK(rlen >= 32);
+
+    v = BE64(a->data[3]); memcpy(&r[0], &v, sizeof(v));
+    v = BE64(a->data[2]); memcpy(&r[8], &v, sizeof(v));
+    v = BE64(a->data[1]); memcpy(&r[16], &v, sizeof(v));
+    v = BE64(a->data[0]); memcpy(&r[24], &v, sizeof(v));
+}
+
+static void secp256k1_num_set_bin(secp256k1_num *r, const unsigned char *a, unsigned int alen) {
+    uint64_t v;
+    (void) alen;
+    VERIFY_CHECK(alen >= 32);
+
+    r->data[4] = 0;
+    memcpy(&v, &a[0], sizeof(v)); r->data[3] = BE64(v);
+    memcpy(&v, &a[8], sizeof(v)); r->data[2] = BE64(v);
+    memcpy(&v, &a[16], sizeof(v)); r->data[1] = BE64(v);
+    memcpy(&v, &a[24], sizeof(v)); r->data[0] = BE64(v);
+}
+
+#endif
diff --git a/src/num_9x32.h b/src/num_9x32.h
new file mode 100644
index 0000000000..e69a36f5f5
--- /dev/null
+++ b/src/num_9x32.h
@@ -0,0 +1,28 @@
+/**********************************************************************
+ * Copyright (c) 2015 Andrew Poelstra                                 *
+ * Distributed under the MIT software license, see the accompanying   *
+ * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
+ **********************************************************************/
+
+#ifndef _SECP256K1_NUM_9X32_
+#define _SECP256K1_NUM_9X32_
+
+#include "util.h"
+
+#define NUM_N_WORDS	9
+#define NUM_WORD_WIDTH	32
+#define NUM_WORD_CTLZ __builtin_clz
+typedef uint32_t secp256k1_num_word;
+typedef int32_t secp256k1_num_sword;
+typedef uint64_t secp256k1_num_dword;
+
+typedef struct {
+    /* we need an extra word for auxiallary stuff during algorithms,
+     * so we have an extra word beyond what we need for 256-bit
+     * numbers. Import/export (by set_bin and get_bin) expects to
+     * work with 32-byte buffers, so the top word is not directly
+     * accessible to users of the API. */
+    secp256k1_num_word data[NUM_N_WORDS];
+} secp256k1_num;
+
+#endif
diff --git a/src/num_9x32_impl.h b/src/num_9x32_impl.h
new file mode 100644
index 0000000000..fa6b0cbf70
--- /dev/null
+++ b/src/num_9x32_impl.h
@@ -0,0 +1,49 @@
+/**********************************************************************
+ * Copyright (c) 2015 Andrew Poelstra                                 *
+ * Distributed under the MIT software license, see the accompanying   *
+ * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
+ **********************************************************************/
+
+#ifndef _SECP256K1_NUM_9X32_IMPL_
+#define _SECP256K1_NUM_9X32_IMPL_
+
+#include <string.h>
+
+#include "num.h"
+#include "num_9x32.h"
+#include "util.h"
+
+#include "num_native_impl.h"
+
+static void secp256k1_num_get_bin(unsigned char *r, unsigned int rlen, const secp256k1_num *a) {
+    uint32_t v;
+    (void) rlen;
+    VERIFY_CHECK(rlen >= 32);
+
+    v = BE32(a->data[7]); memcpy(&r[0], &v, sizeof(v));
+    v = BE32(a->data[6]); memcpy(&r[4], &v, sizeof(v));
+    v = BE32(a->data[5]); memcpy(&r[8], &v, sizeof(v));
+    v = BE32(a->data[4]); memcpy(&r[12], &v, sizeof(v));
+    v = BE32(a->data[3]); memcpy(&r[16], &v, sizeof(v));
+    v = BE32(a->data[2]); memcpy(&r[20], &v, sizeof(v));
+    v = BE32(a->data[1]); memcpy(&r[24], &v, sizeof(v));
+    v = BE32(a->data[0]); memcpy(&r[28], &v, sizeof(v));
+}
+
+static void secp256k1_num_set_bin(secp256k1_num *r, const unsigned char *a, unsigned int alen) {
+    uint32_t v;
+    (void) alen;
+    VERIFY_CHECK(alen >= 32);
+
+    r->data[8] = 0;
+    memcpy(&v, &a[0], sizeof(v)); r->data[7] = BE32(v);
+    memcpy(&v, &a[4], sizeof(v)); r->data[6] = BE32(v);
+    memcpy(&v, &a[8], sizeof(v)); r->data[5] = BE32(v);
+    memcpy(&v, &a[12], sizeof(v)); r->data[4] = BE32(v);
+    memcpy(&v, &a[16], sizeof(v)); r->data[3] = BE32(v);
+    memcpy(&v, &a[20], sizeof(v)); r->data[2] = BE32(v);
+    memcpy(&v, &a[24], sizeof(v)); r->data[1] = BE32(v);
+    memcpy(&v, &a[28], sizeof(v)); r->data[0] = BE32(v);
+}
+
+#endif
diff --git a/src/num_impl.h b/src/num_impl.h
index 0b0e3a072a..3328696411 100644
--- a/src/num_impl.h
+++ b/src/num_impl.h
@@ -15,8 +15,10 @@
 
 #if defined(USE_NUM_GMP)
 #include "num_gmp_impl.h"
-#elif defined(USE_NUM_NONE)
-/* Nothing. */
+#elif defined(USE_NUM_5X64)
+#include "num_5x64_impl.h"
+#elif defined(USE_NUM_9X32)
+#include "num_9x32_impl.h"
 #else
 #error "Please select num implementation"
 #endif
diff --git a/src/num_native_impl.h b/src/num_native_impl.h
new file mode 100644
index 0000000000..5cb468b56e
--- /dev/null
+++ b/src/num_native_impl.h
@@ -0,0 +1,602 @@
+/**********************************************************************
+ * Copyright (c) 2015 Andrew Poelstra                                 *
+ * Distributed under the MIT software license, see the accompanying   *
+ * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
+ **********************************************************************/
+
+#ifndef _SECP256K1_NUM_NATIVE_IMPL_
+#define _SECP256K1_NUM_NATIVE_IMPL_
+
+/* secp256k1_num functions whose implementations are the same for both
+ * 32- and 64-bit words. */
+
+SECP256K1_INLINE static void secp256k1_num_copy(secp256k1_num *r, const secp256k1_num *a) {
+    memcpy(r, a, sizeof(*a));
+}
+
+SECP256K1_INLINE static int secp256k1_num_cmp(const secp256k1_num *a, const secp256k1_num *b)
+{
+    int i;
+    for (i = NUM_N_WORDS - 1; i >= 0; --i) {
+        if (a->data[i] > b->data[i])
+            return 1;
+        else if (a->data[i] < b->data[i])
+            return -1;
+    }
+    return 0;
+}
+
+SECP256K1_INLINE static int secp256k1_num_eq(const secp256k1_num *a, const secp256k1_num *b)
+{
+    int i;
+    for (i = 0; i < NUM_N_WORDS; ++i)
+        if (a->data[i] != b->data[i])
+            return 0;
+    return 1;
+}
+
+static void secp256k1_num_add(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b)
+{
+    unsigned carry = 0;
+    int i;
+    for (i = 0; i < NUM_N_WORDS; ++i) {
+        unsigned carry1, carry2;
+        secp256k1_num_word temp = a->data[i];
+        r->data[i] = temp + b->data[i];
+        carry1 = r->data[i] < temp;
+        r->data[i] += carry;
+        carry2 = r->data[i] < carry;
+        carry = carry1 | carry2;
+    }
+}
+
+/* this is a helper for div_mod; it left-shifts `b` by `shift` words before subtracting */
+static void secp256k1_num_sub_shift_word(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b, int shift, int max) {
+    unsigned borrow = 0;
+    int i;
+    for (i = shift; i < max; ++i) {
+        unsigned borrow1, borrow2;
+        secp256k1_num_word temp = a->data[i];
+        r->data[i] = temp - b->data[i - shift];
+        borrow1 = r->data[i] > temp;
+        r->data[i] -= borrow;
+        borrow2 = r->data[i] > (r->data[i] + borrow);
+        borrow = borrow1 | borrow2;
+    }
+}
+
+SECP256K1_INLINE static void secp256k1_num_sub(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *b) {
+    secp256k1_num_sub_shift_word(r, a, b, 0, NUM_N_WORDS);
+}
+
+static void secp256k1_num_negate(secp256k1_num *r)
+{
+    int i;
+    unsigned borrow = 0;
+    for (i = 0; i < NUM_N_WORDS; ++i) {
+        r->data[i] = -r->data[i] - borrow;
+        borrow |= r->data[i] != 0;
+    }
+}
+
+static void secp256k1_num_shift(secp256k1_num *r, int bits) {
+    int s_word = bits / NUM_WORD_WIDTH;
+    int s_bits = bits % NUM_WORD_WIDTH;
+    int i;
+
+    for (i = s_word; i < NUM_N_WORDS; ++i) {
+        r->data[i - s_word] = r->data[i] >> s_bits;
+        if (i < NUM_N_WORDS - 1 && s_bits > 0) {
+            r->data[i - s_word] |= r->data[i + 1] << (NUM_WORD_WIDTH - s_bits);
+        }
+    }
+    for (i = NUM_N_WORDS - s_word; i < NUM_N_WORDS; ++i)
+        r->data[i] = 0;
+}
+
+SECP256K1_INLINE static int secp256k1_num_is_zero(const secp256k1_num *a) {
+    int i;
+    for (i = 0; i < NUM_N_WORDS - 1; ++i)
+        if (a->data[i] != 0)
+            return 0;
+    return 1;
+}
+
+SECP256K1_INLINE static int secp256k1_num_is_neg(const secp256k1_num *a) {
+    return a->data[NUM_N_WORDS - 1] >> (NUM_WORD_WIDTH - 1);
+}
+
+/* div_mod and helpers */
+static void secp256k1_num_div_get_shifts(int *word_shift, int *bit_shift, const secp256k1_num *a) {
+    int i;
+    /* These defaults will never be used, since this function is never called with a == 0,
+     * but we need them here to avoid gcc warnings about uninitialized variables. */
+    *word_shift = NUM_N_WORDS;
+    *bit_shift = 0;
+    for (i = NUM_N_WORDS - 1; i >= 0; --i) {
+        if (a->data[i] != 0) {
+            *word_shift = NUM_N_WORDS - 1 - i;
+            *bit_shift = NUM_WORD_CTLZ(a->data[i]);
+            return;
+        }
+    }
+    /* Only way to get here is if we were passed zero as divisor */
+    VERIFY_CHECK(0);
+}
+
+static void secp256k1_num_mul_word_shift(secp256k1_num *r, const secp256k1_num *a, secp256k1_num_word w, int shift_w) {
+    int i;
+    secp256k1_num_word carry = 0;
+    for (i = 0; i < shift_w; ++i) {
+        r->data[i] = 0;
+    }
+    for (i = shift_w; i < NUM_N_WORDS; ++i) {
+        secp256k1_num_dword temp = ((secp256k1_num_dword) a->data[i - shift_w]) * w + carry;
+        r->data[i] = temp;
+        carry = temp >> NUM_WORD_WIDTH;
+    }
+}
+
+static void secp256k1_num_lin_comb(secp256k1_num *ra, secp256k1_num *rb, const secp256k1_num *a, const secp256k1_num *b, secp256k1_num_sword *mat) {
+    int i;
+    secp256k1_num_sword carrya = 0, carryb = 0;
+    for (i = 0; i < NUM_N_WORDS; ++i) {
+        secp256k1_num_dword dworda = (secp256k1_num_dword) mat[0] * a->data[i] + (secp256k1_num_dword) mat[1] * b->data[i] + carrya;
+        secp256k1_num_dword dwordb = (secp256k1_num_dword) mat[2] * a->data[i] + (secp256k1_num_dword) mat[3] * b->data[i] + carryb;
+        carrya = dworda >> NUM_WORD_WIDTH;
+        carryb = dwordb >> NUM_WORD_WIDTH;
+        ra->data[i] = dworda;
+        rb->data[i] = dwordb;
+    }
+}
+
+static void secp256k1_num_lshift_word(secp256k1_num *r, const secp256k1_num *a, int bits) {
+    if (bits > 0) {
+        int i;
+        r->data[NUM_N_WORDS - 1] = a->data[NUM_N_WORDS - 1] << bits;
+        for (i = NUM_N_WORDS - 2; i >= 0; --i) {
+            r->data[i + 1] += a->data[i] >> (NUM_WORD_WIDTH - bits);
+            r->data[i] = a->data[i] << bits;
+        }
+    } else {
+        memcpy(r, a, sizeof(*a));
+    }
+}
+
+SECP256K1_INLINE static void secp256k1_num_leading_digit(secp256k1_num_word *rdigit, int *rindex, const secp256k1_num *a) {
+    *rindex = NUM_N_WORDS;
+    *rdigit = 0;
+    while (*rindex > 0 && *rdigit == 0) {
+        --*rindex;
+        *rdigit = a->data[*rindex];
+    }
+}
+
+/* this is not public since it computes multiplication mod 2^320 in the 64-bit
+ * implementation, mod 2^288 in 32-bit. in other words overflow is undefined
+ * behaviour wrt the num.h API, and it's really easy outside of specific cases.
+ * Also b may be negative but a may not be. */
+static void secp256k1_num_mul(secp256k1_num *r, const secp256k1_num * SECP256K1_RESTRICT a, const secp256k1_num * SECP256K1_RESTRICT b) {
+    if (secp256k1_num_is_neg(b)) {
+        /* if b is negative, we negate it, then negate the result */
+        int i;
+        memset(r, -1, sizeof(*r));
+        /* loop through a */
+        for (i = 0; i < NUM_N_WORDS; ++i) {
+            if (a->data[i] > 0) {
+                int j;
+                unsigned borrow = 0;
+                secp256k1_num_word carry = 0;
+                /* loop through b */
+                for (j = 0; i + j < NUM_N_WORDS; ++j) {
+                    secp256k1_num_word b_word = -b->data[j] - borrow;
+                    secp256k1_num_dword prod = ((secp256k1_num_dword) a->data[i]) * b_word + carry;
+                    carry = (prod >> NUM_WORD_WIDTH) + (r->data[i + j] < (secp256k1_num_word) prod);
+                    /* Subtract starting from 0xFF..FF rather than add starting
+                     * from zero to bit-invert the individual words... */
+                    r->data[i + j] -= prod;
+                    borrow |= b_word != 0;
+                }
+            }
+        }
+        /* ...then add one to complete the twos-complement negation of r */
+        i = 0;
+        do {
+            ++r->data[i];
+        } while(r->data[i++] == 0);
+    } else {
+        int i;
+        memset(r, 0, sizeof(*r));
+        /* loop through a */
+        for (i = 0; i < NUM_N_WORDS; ++i) {
+            if (a->data[i] > 0) {
+                int j;
+                secp256k1_num_word carry = 0;
+                /* loop through b */
+                for (j = 0; i + j < NUM_N_WORDS; ++j) {
+                    secp256k1_num_dword prod = ((secp256k1_num_dword) a->data[i]) * b->data[j] + carry;
+                    r->data[i + j] += prod;
+                    carry = (prod >> NUM_WORD_WIDTH) + (r->data[i + j] < (secp256k1_num_word) prod);
+                }
+            }
+        }
+    }
+}
+
+/* computes r, q such that a = q*m + r, where a is a bignum and m is a word */
+static int secp256k1_num_div_mod_1(secp256k1_num *rq, secp256k1_num *rr, const secp256k1_num *a, secp256k1_num_word m) {
+    /* This is a simplifed version of `secp256k1_num_div_mod` which takes a
+     * one-word modulus. Because the divisor does not need to be truncated,
+     * we can take top_{1,2}_words_of_dividend / top_word_of_divisor to get
+     * the exact digits of the quotient without any correction steps.
+     */
+    int i;
+    int ret = 0;
+    secp256k1_num sub = {{0}};
+
+    memset(rq, 0, sizeof(*rq));
+    *rr = *a;
+    /* Special case: top word of dividend is greater than divisor */
+    if (rr->data[NUM_N_WORDS - 1] >= m) {
+        rq->data[NUM_N_WORDS - 1] = rr->data[NUM_N_WORDS - 1] / m;
+        sub.data[NUM_N_WORDS - 1] = m * rq->data[NUM_N_WORDS - 1];
+        secp256k1_num_sub(rr, rr, &sub);
+        ret = NUM_N_WORDS - 1;
+    }
+    /* Loop, dividing top -two- words of dividend by the divisor */
+    for (i = NUM_N_WORDS - 2; i >= 0; --i) {
+        secp256k1_num_dword topr = (((secp256k1_num_dword) rr->data[i + 1]) << NUM_WORD_WIDTH) + rr->data[i];
+        rq->data[i] = topr / m;
+        if (rq->data[i] > 0) {
+            /* Since our product is only two words, we can collapse all of
+             * `secp256k1_num_mul_word_shift` to this. */
+            secp256k1_num_dword prod = ((secp256k1_num_dword) m) * rq->data[i];
+            sub.data[i + 1] = prod >> NUM_WORD_WIDTH;
+            sub.data[i] = prod;
+            secp256k1_num_sub(rr, rr, &sub);
+            sub.data[i + 1] = sub.data[i] = 0;
+            if (ret == 0)
+                ret = i;
+        }
+    }
+    return ret;
+}
+
+/* computes r, q such that a = q*m + r
+ * Returns the index of the highest nonzero digit of the quotient (or 0 if the quotient is 0) */
+static int secp256k1_num_div_mod(secp256k1_num *rq, secp256k1_num *rr, const secp256k1_num *a, const secp256k1_num *m) {
+    /* This division algorithm is a standard one which I believe originally
+     * occurs in Knuth 4.3.1. This code derived from the derivation and
+     * explanation given by William Hart at
+     *     http://wbhart.blogspot.de/2010/10/bsdnt-divrem-discussion.html
+     * It is a long division algorithm which computes each word of the
+     * result by dividing the top 1 or 2 words of the dividend by the top
+     * word of the divisor; the result is guaranteed to be within 2 of the
+     * correct word, so a quick correction can be done. Call the result
+     * q. We then subtract q times the divisor from the dividend and continue
+     * until it is smaller than the divisor. These q's are the words of our
+     * quotient and the remaining dividend is our remainder.
+     *
+     * The "core" of the algorithm is the dword-by-word division, which is
+     * something a lot of research has gone into. For now we use the
+     * uint128_t type and trust the gcc developers to have done something
+     * fast, but there may be opportunity for optimization there.
+     *
+     * To get this "off by at most 2" guarantee on the division X/Y of the
+     * truncated dividend by the truncated divisor, we require that
+     *
+     *    B * Y >= X    and    Y >= B/2
+     *
+     * where B = 2^64 is our word size. The first inequality simply makes
+     * sure we're computing only one word at a time; if it does not hold,
+     * we just divide X by B to reduce it from two words to one. The second
+     * one is more important; its justification is given in the above link.
+     * Essentially it says that Y must have its high bit set to 1. We obtain
+     * this by left-shifting both dividend and divisor before starting the
+     * division.
+     *
+     * Notice that Y never changes: the dividend is reduced to the remainder
+     * throughout the division but the quotient is untouched. Therefore this
+     * shift only needs to happen once.
+     */
+    int shift_w, shift_b;
+    secp256k1_num_word rem_high;  /* This slides as rem decreases */
+    secp256k1_num_word div_high;  /* This one stays constant */
+    secp256k1_num div;
+    int output_idx;
+    int ret = 0;
+    int i;
+
+    /* anything / 0 is undefined */
+    VERIFY_CHECK(!secp256k1_num_is_zero(m));
+    /* 0 / anything is equal to 0 remainder 0 */
+    if (secp256k1_num_is_zero(a)) {
+        memmove(rq, a, sizeof *a);
+        memmove(rr, a, sizeof *a);
+        return 0;
+    }
+
+    /* Shift divisor and dividend to get high bit of divisor to 1 */
+    secp256k1_num_div_get_shifts(&shift_w, &shift_b, m);
+    VERIFY_CHECK(shift_w < NUM_N_WORDS);
+    VERIFY_CHECK(shift_b < NUM_WORD_WIDTH);
+    /* Special case division by one word, which can be done without correction
+     * steps or normalization */
+    if (shift_w == NUM_N_WORDS - 1) {
+        return secp256k1_num_div_mod_1(rq, rr, a, m->data[0]);
+    }
+
+#ifdef VERIFY
+    /* If the high word of rr was too large (which is a little hard to do because
+     * it can't be set directly and we don't expose a multiply function, but can
+     * happen if the divisor's high word is very small), the following shift may be
+     * destructive. This cannot be prevented while using fixed-size arrays (except
+     * by creating a larger num_t type for use only inside this function) so we
+     * declare it to be a bug in the caller. The library should only call this
+     * function with a divisor which is <= 2^256. */
+{
+    secp256k1_num temp;
+    secp256k1_num_lshift_word(&temp, a, shift_b);
+    secp256k1_num_shift(&temp, shift_b);
+    CHECK(secp256k1_num_eq(&temp, a));
+}
+#endif
+
+    /* Do the shifts after the special case, both to save time, and because if
+     * either `a` or `m` aliases either of `rr` or `rq`, the values passed into
+     * `secp256k1_num_div_mod_1` above would be corrupted by them. */
+    secp256k1_num_lshift_word(&div, m, shift_b);
+    secp256k1_num_lshift_word(rr, a, shift_b);
+
+    /* Locate highest word of the dividend and quotient. Note that after our left
+     * shift, we could be using all five words of the dividend, even if it originally
+     * had the top word clear. */
+    secp256k1_num_leading_digit(&rem_high, &i, rr);
+    div_high = div.data[NUM_N_WORDS - 1 - shift_w];
+
+    /* Compute index of high word of the quotient. Notice that this is
+     * maximum 4. To get five output words (e.g. when dividing a five
+     * word number by a small one-word number) we necessarily use the
+     * special case described in the next comment. */
+    output_idx = shift_w - (NUM_N_WORDS - 1 - i);
+
+    /* Zero out the whole quotient since we will only set its low words later on */
+    memset(rq, 0, sizeof(*rq));
+
+    /* If we're dividing a small number by a bigger one, we know the answer is 0 */
+    if (output_idx < 0)
+        goto finish_div;
+
+    /* Normally we need the top two words of the remainder. However, if
+     * the highest word alone exceeds the top word of the quotient, we
+     * need only this word (and using two would give us a "word" that
+     * exceeded our base). This can only happen on the first iteration,
+     * so we special-case it here before starting the real algorithm. */
+    if (rem_high >= div_high) {
+        secp256k1_num sub;
+        secp256k1_num_word q = rem_high / div_high;
+        secp256k1_num_mul_word_shift(&sub, &div, q, output_idx);
+        /* Correct for error in the quotient. This was a while loop, but as it is
+         * mathematically guaranteed to iterate at most twice, we can unroll it
+         * into a pair of nested ifs. Note that we are guaranteed q > 0 here by
+         * virtue of being in this if block. */
+        if (secp256k1_num_cmp (&sub, rr) > 0) {
+            secp256k1_num_sub_shift_word (&sub, &sub, &div, output_idx, i + 1);
+            --q;
+            if (secp256k1_num_cmp (&sub, rr) >= 0) {
+                secp256k1_num_sub_shift_word (&sub, &sub, &div, output_idx, i + 1);
+                --q;
+            }
+        }
+        /* Reduce remainder */
+        secp256k1_num_sub_shift_word(rr, rr, &sub, 0, i + 1);
+        rq->data[output_idx] = q;
+        if (q != 0) {
+            ret = output_idx;
+        }
+    }
+
+    /* Loop down through the words of the dividend */
+    while (output_idx > 0) {
+        secp256k1_num_dword rem_high2 = ((secp256k1_num_dword) rr->data[i] << NUM_WORD_WIDTH) + rr->data[i - 1];
+        /* This is identical to the special-case above, except with
+         * rem_high2 in place of rem_high */
+        secp256k1_num_word q = rem_high2 / div_high;
+        /* The q we just computed is at most 2 greater than the correct quotient.
+         * If the correct quotient is (uint64_t) -1 or (uint64_t) -2, this means
+         * q may overflow the uint64_t type. We catch this and do a pre-correction. */
+        if (rr->data[i] == div_high)
+            q = (secp256k1_num_word) -1;
+        if (q > 0) {
+            secp256k1_num sub;
+            /* We then compute the amount to subtract, and do the "real" corrections. */
+            secp256k1_num_mul_word_shift(&sub, &div, q, output_idx - 1);
+            /* Correct for error in the quotient. This was a while loop, but as it is
+             * mathematically guaranteed to iterate at most twice, we can unroll it
+             * into a pair of nested ifs. */
+            if (secp256k1_num_cmp (&sub, rr) >= 0) {
+                secp256k1_num_sub_shift_word (&sub, &sub, &div, output_idx - 1, i + 1);
+                --q;
+                if (secp256k1_num_cmp (&sub, rr) >= 0) {
+                    secp256k1_num_sub_shift_word (&sub, &sub, &div, output_idx - 1, i + 1);
+                    --q;
+                }
+            }
+            rq->data[output_idx - 1] = q;
+            if (ret == 0 && q != 0)
+                ret = output_idx - 1;
+            secp256k1_num_sub_shift_word(rr, rr, &sub, 0, i + 1);
+        }
+        --output_idx;
+        --i;
+    }
+
+finish_div:
+    /* Correct the remainder for the left-shifting we did at the beginning */
+    secp256k1_num_shift (rr, shift_b);
+    return ret;
+}
+/* end div_mod */
+
+SECP256K1_INLINE static void secp256k1_num_mod(secp256k1_num *r, const secp256k1_num *m) {
+    secp256k1_num quot;
+    VERIFY_CHECK(!secp256k1_num_is_zero(m));
+    secp256k1_num_div_mod(&quot, r, r, m);
+}
+
+/* start mod inverse */
+/* As described in http://www.imsc.res.in/~kapil/crypto/notes/node11.html,
+ * if the division step in Euclid's gcd algorithm is a single word, there
+ * is a good chance we can compute it just by dividing the top words of
+ * the dividend and divisor. In this case we can avoid a long-division.
+ * We do this as many times as we can, only ever using the top words, then
+ * return the resulting linear transformation that will then be applied
+ * to the full 256-bit values. */
+static void secp256k1_num_gcd_lehmer(secp256k1_num_sword *rmat, secp256k1_num_word x, secp256k1_num_word y) {
+    rmat[0] = 1; rmat[1] = 0; rmat[2] = 0; rmat[3] = 1;
+    /* Iterate, left-multiplying it by [[0 1] [1 -w]] as many times as we can. */
+    while (1) {
+        /* Check whether we can iterate; that is, whether the ratio of the
+         * leading digits is guaranteed to be the ratio of the full numbers.
+         * This will be true if w1 == w2. Note that we need to check for
+         * division by 0 since it is possible one is undefined. */
+        secp256k1_num_sword w1, w2;
+        if (y + rmat[2] != 0 && y + rmat[3] != 0) {
+            w1 = (x + rmat[0]) / (y + rmat[2]);
+            w2 = (x + rmat[1]) / (y + rmat[3]);
+        } else break;
+
+        if (w1 == w2) {
+            /* If so, update the matrix... */
+            const secp256k1_num_sword c = rmat[2];
+            const secp256k1_num_sword d = rmat[3];
+            rmat[2] = rmat[0] - w1 * c;
+            rmat[3] = rmat[1] - w1 * d;
+            rmat[0] = c;
+            rmat[1] = d;
+            /* ...and the digits */
+            w2 = x;       /* w2 is used as a temp here */
+            x = y;
+            y = w2 - w1*y;
+        } else break;
+    }
+}
+
+/* Iterate through x and y doing the whole gcd with int math */
+static void secp256k1_num_gcd_word(secp256k1_num_sword *rmat, secp256k1_num_word x, secp256k1_num_word y) {
+    rmat[0] = 1; rmat[1] = 0; rmat[2] = 0; rmat[3] = 1;
+    /* Iterate, left-multiplying it by [[0 1] [1 -w]] as many times as we can. */
+    while (y != 0) {
+        secp256k1_num_word q = x / y;
+        /* Update the matrix... */
+        const secp256k1_num_word t = x;
+        const secp256k1_num_word c = rmat[2];
+        const secp256k1_num_word d = rmat[3];
+        rmat[2] = rmat[0] - q * c;
+        rmat[3] = rmat[1] - q * d;
+        rmat[0] = c;
+        rmat[1] = d;
+        /* ...and the digits */
+        x = y;
+        y = t - q*y;
+    }
+}
+
+static void secp256k1_num_mod_inverse(secp256k1_num *rr, const secp256k1_num *a, const secp256k1_num *m)
+{
+    secp256k1_num r2 = {{1, 0, 0, 0, 0}};
+    secp256k1_num b1 = *m, b2 = *a;
+    secp256k1_num quot;
+
+    secp256k1_num_sword mat[4];
+    memset(rr, 0, sizeof(*rr));
+
+    while(!secp256k1_num_is_zero(&b2)) {
+        int high_idx;
+        secp256k1_num temp;
+
+        /** lehmer step **/
+        int index[2];
+        secp256k1_num_word x, y;
+        /* Euclid's GCD algorithm works by applying the matrix
+         * [[0 1] [1 -w]] repeatedly to the vectors [b1 b2] and
+         * [rr r2], where w is the integer quotient b1/b2.
+         * This optimization, due to Lehmer, lets us shortcut several
+         * iterations by computing w using only the leading digits
+         * of b1 and b2. We multiply the resective matrices together
+         * and apply the resulting transformation b1 and b2, avoiding
+         * bignum operations until the very end.
+         * 
+         * Lehmer's algorithm quits when it becomes unsure whether it
+         * can compute the next quotient w or not; then we need to do
+         * an (expensive) long division to compute the next quotient
+         * properly. Turns out that by applying the resulting linear
+         * transformation then just restarting Lehmer (without any long
+         * division) you can get some more iterations. So we put this
+         * in a while loop. */
+        secp256k1_num_leading_digit(&x, &index[0], &b1);
+        secp256k1_num_leading_digit(&y, &index[1], &b2);
+        while (index[0] == index[1]) {
+            /* Check whether we can just finish the whole thing with
+             * machine words */
+            if (index[0] == 0) {
+                secp256k1_num_gcd_word(mat, x, y);
+                /* Don't need to update [b1 b2] */
+                secp256k1_num_lin_comb(rr, &r2, rr, &r2, mat);
+                goto done;
+            }
+            secp256k1_num_gcd_lehmer(mat, x, y);
+            if (mat[1] == 0)
+                break;
+            secp256k1_num_lin_comb(&b1, &b2, &b1, &b2, mat);
+            secp256k1_num_lin_comb(rr, &r2, rr, &r2, mat);
+
+            /* Because we know our gcd will be one for our applications,
+             * the last iteration before b2 == 0 will have b2 == 1, a
+             * condition under which the Lehmer iteration is unable to
+             * compute the next digit (it computes the next digit as
+             * (b1 + 1) / b2 and b1 / (b2 + 1), which reduce to (b1 + 1)
+             * vs b1 / 2, which of course will differ. What this means is
+             * that we do not need to check for b2 == 0 before going to
+             * the next step; we know it won't be. */
+            VERIFY_CHECK(!secp256k1_num_is_zero(&b2));
+            secp256k1_num_leading_digit(&x, &index[0], &b1);
+            secp256k1_num_leading_digit(&y, &index[1], &b2);
+        }
+        /** long division step **/
+        /* b2 <- b1 - b2 * quot */
+        temp = b2;
+        if (index[1] == 0)
+            high_idx = secp256k1_num_div_mod_1 (&quot, &b2, &b1, b2.data[0]);
+        else
+            high_idx = secp256k1_num_div_mod (&quot, &b2, &b1, &b2);
+        b1 = temp;
+        /* Even though we had to do a long division, chances are that
+         * the quotient is one digit, and we can use lin_comb to compute
+         * the next iteration rather than num_mul and num_sub. */
+        /* r2 <- rr - r2 * quot */
+        if (high_idx == 0 && (secp256k1_num_sword) quot.data[0] >= 0) {
+            /* It is possible to do some Lehmer iterations on the matrix
+             * [[ 0 1 ] [ 1 -w ]] before applying it to [rr r2]. We'd
+             * also need to update [b1 b2], which has already been
+             * iterated on. We'd accomplish this by right-multiplying
+             * by [[ 0 1 ] [ 1 -w ]]^{-1} = [[ w 1 ] [ 1 0 ]] before
+             * applying the matrix. to [b1 b2]. However, it turns out
+             * this is a perf *loss*. I'm unsure why. So we don't.  */
+            mat[0] = 0; mat[1] = 1; mat[2] = 1; mat[3] = -quot.data[0];
+            secp256k1_num_lin_comb(rr, &r2, rr, &r2, mat);
+        } else {
+            temp = r2;
+            secp256k1_num_mul(&r2, &quot, &temp);
+            secp256k1_num_sub(&r2, rr, &r2);
+            *rr = temp;
+        }
+    }
+done:
+    if (secp256k1_num_is_neg(rr)) {
+        secp256k1_num_add(rr, rr, m);
+    }
+}
+/* end mod inverse */
+
+#endif
diff --git a/src/scalar.h b/src/scalar.h
index b590ccd6dd..7a5d4fbf62 100644
--- a/src/scalar.h
+++ b/src/scalar.h
@@ -80,13 +80,11 @@ static int secp256k1_scalar_is_high(const secp256k1_scalar *a);
  * Returns -1 if the number was negated, 1 otherwise */
 static int secp256k1_scalar_cond_negate(secp256k1_scalar *a, int flag);
 
-#ifndef USE_NUM_NONE
 /** Convert a scalar to a number. */
 static void secp256k1_scalar_get_num(secp256k1_num *r, const secp256k1_scalar *a);
 
 /** Get the order of the group as a number. */
 static void secp256k1_scalar_order_get_num(secp256k1_num *r);
-#endif
 
 /** Compare two scalars. */
 static int secp256k1_scalar_eq(const secp256k1_scalar *a, const secp256k1_scalar *b);
diff --git a/src/scalar_impl.h b/src/scalar_impl.h
index 88ea97de86..601095d692 100644
--- a/src/scalar_impl.h
+++ b/src/scalar_impl.h
@@ -24,7 +24,6 @@
 #error "Please select scalar implementation"
 #endif
 
-#ifndef USE_NUM_NONE
 static void secp256k1_scalar_get_num(secp256k1_num *r, const secp256k1_scalar *a) {
     unsigned char c[32];
     secp256k1_scalar_get_b32(c, a);
@@ -41,7 +40,6 @@ static void secp256k1_scalar_order_get_num(secp256k1_num *r) {
     };
     secp256k1_num_set_bin(r, order, 32);
 }
-#endif
 
 static void secp256k1_scalar_inverse(secp256k1_scalar *r, const secp256k1_scalar *x) {
     secp256k1_scalar *t;
diff --git a/src/tests.c b/src/tests.c
index c4b46ab5f2..b92d7f2585 100644
--- a/src/tests.c
+++ b/src/tests.c
@@ -433,7 +433,6 @@ void run_rand_int(void) {
 
 /***** NUM TESTS *****/
 
-#ifndef USE_NUM_NONE
 void random_num_negate(secp256k1_num *num) {
     if (secp256k1_rand_bits(1)) {
         secp256k1_num_negate(num);
@@ -486,6 +485,7 @@ void test_num_add_sub(void) {
     }
     secp256k1_num_add(&n1p2, &n1, &n2); /* n1p2 = R1 + R2 */
     secp256k1_num_add(&n2p1, &n2, &n1); /* n2p1 = R2 + R1 */
+    CHECK(secp256k1_num_eq(&n1p2, &n2p1));
     secp256k1_num_sub(&n1m2, &n1, &n2); /* n1m2 = R1 - R2 */
     secp256k1_num_sub(&n2m1, &n2, &n1); /* n2m1 = R2 - R1 */
     CHECK(secp256k1_num_eq(&n1p2, &n2p1));
@@ -500,14 +500,45 @@ void test_num_add_sub(void) {
     CHECK(secp256k1_num_eq(&n2p1, &n1));
 }
 
+void test_num_mod(void) {
+    int i;
+    secp256k1_scalar s;
+    secp256k1_num order, n;
+
+    /* check that 0 mod anything is 0 */
+    random_scalar_order_test(&s); 
+    secp256k1_scalar_get_num(&order, &s);
+    secp256k1_scalar_set_int(&s, 0);
+    secp256k1_scalar_get_num(&n, &s);
+    secp256k1_num_mod(&n, &order);
+    CHECK(secp256k1_num_is_zero(&n));
+
+    /* check that anything mod 1 is 0 */
+    secp256k1_scalar_set_int(&s, 1);
+    secp256k1_scalar_get_num(&order, &s);
+    secp256k1_scalar_get_num(&n, &s);
+    secp256k1_num_mod(&n, &order);
+    CHECK(secp256k1_num_is_zero(&n));
+
+    /* check that increasing the number past 2^256 does not break this */
+    random_scalar_order_test(&s); 
+    secp256k1_scalar_get_num(&n, &s);
+    /* multiply by 2^8, which'll test this case with high probability */
+    for (i = 0; i < 8; ++i) {
+        secp256k1_num_add(&n, &n, &n);
+    }
+    secp256k1_num_mod(&n, &order);
+    CHECK(secp256k1_num_is_zero(&n));
+}
+
 void run_num_smalltests(void) {
     int i;
     for (i = 0; i < 100*count; i++) {
         test_num_negate();
         test_num_add_sub();
+        test_num_mod();
     }
 }
-#endif
 
 /***** SCALAR TESTS *****/
 
@@ -515,10 +546,8 @@ void scalar_test(void) {
     secp256k1_scalar s;
     secp256k1_scalar s1;
     secp256k1_scalar s2;
-#ifndef USE_NUM_NONE
     secp256k1_num snum, s1num, s2num;
     secp256k1_num order, half_order;
-#endif
     unsigned char c[32];
 
     /* Set 's' to a random scalar, with value 'snum'. */
@@ -531,7 +560,6 @@ void scalar_test(void) {
     random_scalar_order_test(&s2);
     secp256k1_scalar_get_b32(c, &s2);
 
-#ifndef USE_NUM_NONE
     secp256k1_scalar_get_num(&snum, &s);
     secp256k1_scalar_get_num(&s1num, &s1);
     secp256k1_scalar_get_num(&s2num, &s2);
@@ -539,7 +567,6 @@ void scalar_test(void) {
     secp256k1_scalar_order_get_num(&order);
     half_order = order;
     secp256k1_num_shift(&half_order, 1);
-#endif
 
     {
         int i;
@@ -580,7 +607,6 @@ void scalar_test(void) {
         CHECK(secp256k1_scalar_eq(&n, &s));
     }
 
-#ifndef USE_NUM_NONE
     {
         /* Test that adding the scalars together is equal to adding their numbers together modulo the order. */
         secp256k1_num rnum;
@@ -590,6 +616,7 @@ void scalar_test(void) {
         secp256k1_num_mod(&rnum, &order);
         secp256k1_scalar_add(&r, &s, &s2);
         secp256k1_scalar_get_num(&r2num, &r);
+        CHECK(secp256k1_num_cmp(&rnum, &r2num) == 0);
         CHECK(secp256k1_num_eq(&rnum, &r2num));
     }
 
@@ -610,6 +637,7 @@ void scalar_test(void) {
         CHECK((secp256k1_scalar_is_high(&s) == secp256k1_scalar_is_high(&neg)) == secp256k1_scalar_is_zero(&s));
         secp256k1_scalar_get_num(&negnum2, &neg);
         /* Negating a scalar should be equal to (order - n) mod order on the number. */
+        CHECK(secp256k1_num_cmp(&negnum, &negnum2) == 0);
         CHECK(secp256k1_num_eq(&negnum, &negnum2));
         secp256k1_scalar_add(&neg, &neg, &s);
         /* Adding a number to its negation should result in zero. */
@@ -632,22 +660,17 @@ void scalar_test(void) {
             CHECK(expected == low);
         }
     }
-#endif
 
     {
         /* Test that scalar inverses are equal to the inverse of their number modulo the order. */
         if (!secp256k1_scalar_is_zero(&s)) {
             secp256k1_scalar inv;
-#ifndef USE_NUM_NONE
             secp256k1_num invnum;
             secp256k1_num invnum2;
-#endif
             secp256k1_scalar_inverse(&inv, &s);
-#ifndef USE_NUM_NONE
             secp256k1_num_mod_inverse(&invnum, &snum, &order);
             secp256k1_scalar_get_num(&invnum2, &inv);
             CHECK(secp256k1_num_eq(&invnum, &invnum2));
-#endif
             secp256k1_scalar_mul(&inv, &inv, &s);
             /* Multiplying a scalar with its inverse must result in one. */
             CHECK(secp256k1_scalar_is_one(&inv));
@@ -779,7 +802,6 @@ void run_scalar_tests(void) {
         CHECK(secp256k1_scalar_is_zero(&o));
     }
 
-#ifndef USE_NUM_NONE
     {
         /* A scalar with value of the curve order should be 0. */
         secp256k1_num order;
@@ -792,7 +814,6 @@ void run_scalar_tests(void) {
         CHECK(overflow == 1);
         CHECK(secp256k1_scalar_is_zero(&zero));
     }
-#endif
 
     {
         /* Does check_overflow check catch all ones? */
@@ -4280,10 +4301,8 @@ int main(int argc, char **argv) {
     run_hmac_sha256_tests();
     run_rfc6979_hmac_sha256_tests();
 
-#ifndef USE_NUM_NONE
     /* num tests */
     run_num_smalltests();
-#endif
 
     /* scalar tests */
     run_scalar_tests();

From 953894aa487eb02ac5c79c02a8b1904d0392b414 Mon Sep 17 00:00:00 2001
From: Peter Dettman <peter.dettman@gmail.com>
Date: Fri, 3 Jul 2015 21:51:52 +0930
Subject: [PATCH 4/5] Add Jacobi symbol test via GMP

Also add native Jacobi symbol test (Andrew)

Rebased-by: Andrew Poelstra
---
 src/bench_internal.c  | 14 +++++++
 src/num.h             |  6 +++
 src/num_5x64.h        |  1 +
 src/num_9x32.h        |  1 +
 src/num_gmp_impl.h    | 26 +++++++++++++
 src/num_native_impl.h | 86 +++++++++++++++++++++++++++++++++++++++++++
 src/tests.c           | 78 +++++++++++++++++++++++++++++++++++++++
 7 files changed, 212 insertions(+)

diff --git a/src/bench_internal.c b/src/bench_internal.c
index 7809f5f8cf..c84dbf9796 100644
--- a/src/bench_internal.c
+++ b/src/bench_internal.c
@@ -299,6 +299,19 @@ void bench_context_sign(void* arg) {
     }
 }
 
+void bench_num_jacobi(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+    secp256k1_num nx, norder;
+
+    secp256k1_scalar_get_num(&nx, &data->scalar_x);
+    secp256k1_scalar_order_get_num(&norder);
+    secp256k1_scalar_get_num(&norder, &data->scalar_y);
+
+    for (i = 0; i < 2000; i++) {
+        secp256k1_num_jacobi(&nx, &norder);
+    }
+}
 
 int have_flag(int argc, char** argv, char *flag) {
     char** argm = argv + argc;
@@ -350,5 +363,6 @@ int main(int argc, char **argv) {
     if (have_flag(argc, argv, "context") || have_flag(argc, argv, "verify")) run_benchmark("context_verify", bench_context_verify, bench_setup, NULL, &data, 10, 20);
     if (have_flag(argc, argv, "context") || have_flag(argc, argv, "sign")) run_benchmark("context_sign", bench_context_sign, bench_setup, NULL, &data, 10, 200);
 
+    if (have_flag(argc, argv, "num") || have_flag(argc, argv, "jacobi")) run_benchmark("num_jacobi", bench_num_jacobi, bench_setup, NULL, &data, 10, 200000);
     return 0;
 }
diff --git a/src/num.h b/src/num.h
index 7aa45e887d..5c7467fc9a 100644
--- a/src/num.h
+++ b/src/num.h
@@ -34,6 +34,9 @@ static void secp256k1_num_set_bin(secp256k1_num *r, const unsigned char *a, unsi
 /** Compute a modular inverse. The input must be less than the modulus. */
 static void secp256k1_num_mod_inverse(secp256k1_num *r, const secp256k1_num *a, const secp256k1_num *m);
 
+/** Compute the jacobi symbol (a|b). b must be positive and odd. */
+static int secp256k1_num_jacobi(const secp256k1_num *a, const secp256k1_num *b);
+
 /** Compare the absolute value of two numbers. */
 static int secp256k1_num_cmp(const secp256k1_num *a, const secp256k1_num *b);
 
@@ -56,6 +59,9 @@ static void secp256k1_num_shift(secp256k1_num *r, int bits);
 /** Check whether a number is zero. */
 static int secp256k1_num_is_zero(const secp256k1_num *a);
 
+/** Check whether a number is one. */
+static int secp256k1_num_is_one(const secp256k1_num *a);
+
 /** Check whether a number is strictly negative. */
 static int secp256k1_num_is_neg(const secp256k1_num *a);
 
diff --git a/src/num_5x64.h b/src/num_5x64.h
index e1a7d45fea..0677b74e8d 100644
--- a/src/num_5x64.h
+++ b/src/num_5x64.h
@@ -12,6 +12,7 @@
 #define NUM_N_WORDS	5
 #define NUM_WORD_WIDTH	64
 #define NUM_WORD_CTLZ __builtin_clzl
+#define NUM_WORD_CTZ __builtin_ctzl
 typedef uint64_t secp256k1_num_word;
 typedef int64_t secp256k1_num_sword;
 typedef uint128_t secp256k1_num_dword;
diff --git a/src/num_9x32.h b/src/num_9x32.h
index e69a36f5f5..664ee5caf4 100644
--- a/src/num_9x32.h
+++ b/src/num_9x32.h
@@ -12,6 +12,7 @@
 #define NUM_N_WORDS	9
 #define NUM_WORD_WIDTH	32
 #define NUM_WORD_CTLZ __builtin_clz
+#define NUM_WORD_CTZ __builtin_ctz
 typedef uint32_t secp256k1_num_word;
 typedef int32_t secp256k1_num_sword;
 typedef uint64_t secp256k1_num_dword;
diff --git a/src/num_gmp_impl.h b/src/num_gmp_impl.h
index b97a81286c..d3130dcf7d 100644
--- a/src/num_gmp_impl.h
+++ b/src/num_gmp_impl.h
@@ -144,6 +144,32 @@ static void secp256k1_num_mod_inverse(secp256k1_num *r, const secp256k1_num *a,
     memset(v, 0, sizeof(v));
 }
 
+static int secp256k1_num_jacobi(const secp256k1_num *a, const secp256k1_num *b) {
+    int ret;
+    mpz_t ga, gb;
+    secp256k1_num_sanity(a);
+    secp256k1_num_sanity(b);
+    VERIFY_CHECK(!b->neg && (b->limbs > 0) && (b->data[0] & 1));
+
+    mpz_inits(ga, gb, NULL);
+
+    mpz_import(gb, b->limbs, -1, sizeof(mp_limb_t), 0, 0, b->data);
+    mpz_import(ga, a->limbs, -1, sizeof(mp_limb_t), 0, 0, a->data);
+    if (a->neg) {
+        mpz_neg(ga, ga);
+    }
+
+    ret = mpz_jacobi(ga, gb);
+
+    mpz_clears(ga, gb, NULL);
+
+    return ret;
+}
+
+static int secp256k1_num_is_one(const secp256k1_num *a) {
+    return (a->limbs == 1 && a->data[0] == 1);
+}
+
 static int secp256k1_num_is_zero(const secp256k1_num *a) {
     return (a->limbs == 1 && a->data[0] == 0);
 }
diff --git a/src/num_native_impl.h b/src/num_native_impl.h
index 5cb468b56e..c928e50e0f 100644
--- a/src/num_native_impl.h
+++ b/src/num_native_impl.h
@@ -102,6 +102,16 @@ SECP256K1_INLINE static int secp256k1_num_is_zero(const secp256k1_num *a) {
     return 1;
 }
 
+SECP256K1_INLINE static int secp256k1_num_is_one(const secp256k1_num *a) {
+    int i;
+    if (a->data[0] != 1)
+        return 0;
+    for (i = 1; i < NUM_N_WORDS - 1; ++i)
+        if (a->data[i] != 0)
+            return 0;
+    return 1;
+}
+
 SECP256K1_INLINE static int secp256k1_num_is_neg(const secp256k1_num *a) {
     return a->data[NUM_N_WORDS - 1] >> (NUM_WORD_WIDTH - 1);
 }
@@ -599,4 +609,80 @@ static void secp256k1_num_mod_inverse(secp256k1_num *rr, const secp256k1_num *a,
 }
 /* end mod inverse */
 
+/* start jacobi symbol */
+/* Compute a number modulo some power of 2 */
+SECP256K1_INLINE static int secp256k1_num_mod_2(const secp256k1_num *a, int m) {
+    VERIFY_CHECK(m > 0);
+    VERIFY_CHECK((m & (m - 1)) == 0);  /* check that m is a power of 2 */
+    /* Since our words are powers of 2 we only need to mod the lowest digit */
+    return a->data[0] % m;
+}
+
+static int secp256k1_num_jacobi_1(secp256k1_num_word a, secp256k1_num_word b) {
+    int ret = 1;
+    secp256k1_num_word t;
+    /* Iterate, left-multiplying it by [[0 1] [1 -w]] as many times as we can. */
+    while (1) {
+        a %= b;
+        if (a == 0)
+            return 0;
+        if (a % 2 == 0) {
+            int shift = NUM_WORD_CTZ(a);
+            a >>= shift;
+            if ((b % 8 == 3 || b % 8 == 5) && shift % 2 == 1)
+                ret *= -1;
+        }
+        if (a == 1)
+            break;
+        if (b % 4 == 3 && a % 4 == 3)
+            ret *= -1;
+        t = a; a = b; b = t;
+    }
+    return ret;
+}
+
+/* Compute the Jacobi symbol (a|b) assuming b is an odd prime */
+static int secp256k1_num_jacobi(const secp256k1_num *a, const secp256k1_num *b) {
+    secp256k1_num top = *a, bot = *b, scratch;
+    secp256k1_num_word x, y;
+    int index[2];
+    int ret = 1;
+
+    while (1) {
+        int mod8 = secp256k1_num_mod_2(&bot, 8);
+        secp256k1_num_leading_digit(&x, &index[0], &top);
+        secp256k1_num_leading_digit(&y, &index[1], &bot);
+
+        if (index[0] == 0 && index[1] == 0)
+            return ret * secp256k1_num_jacobi_1(x, y);
+
+        /* Algorithm from https://en.wikipedia.org/wiki/Jacobi_symbol#Calculating_the_Jacobi_symbol */
+        secp256k1_num_div_mod(&scratch, &top, &top, &bot); /* top <- top mod bottom */
+
+        /* are we done? */
+        if (secp256k1_num_is_zero(&top))
+            return 0;
+
+        /* cast out powers of two from the "numerator" */
+        while (secp256k1_num_mod_2(&top, 2) == 0) {
+            int shift = NUM_WORD_CTZ(top.data[0]);
+            secp256k1_num_shift(&top, shift);
+            if ((mod8 == 3 || mod8 == 5) && shift % 2 == 1)
+                ret *= -1;
+        }
+
+        /* are we done? */
+        if (secp256k1_num_is_one(&top))
+            return ret;
+        /* if not, iterate */
+        if (mod8 % 4 == 3 && secp256k1_num_mod_2(&top, 4) == 3)
+            ret *= -1;
+
+        scratch = top;
+        top = bot;
+        bot = scratch;
+    }
+}
+/* end jacobi symbol */
+
 #endif
diff --git a/src/tests.c b/src/tests.c
index b92d7f2585..66951fce27 100644
--- a/src/tests.c
+++ b/src/tests.c
@@ -472,6 +472,8 @@ void test_num_negate(void) {
 }
 
 void test_num_add_sub(void) {
+    int i;
+    secp256k1_scalar s;
     secp256k1_num n1;
     secp256k1_num n2;
     secp256k1_num n1p2, n2p1, n1m2, n2m1;
@@ -498,6 +500,18 @@ void test_num_add_sub(void) {
     CHECK(!secp256k1_num_eq(&n2p1, &n1));
     secp256k1_num_sub(&n2p1, &n2p1, &n2); /* n2p1 = R2 + R1 - R2 = R1 */
     CHECK(secp256k1_num_eq(&n2p1, &n1));
+
+    /* check is_one */
+    secp256k1_scalar_set_int(&s, 1);
+    secp256k1_scalar_get_num(&n1, &s);
+    CHECK(secp256k1_num_is_one(&n1));
+    /* check that 2^n + 1 is never 1 */
+    secp256k1_scalar_get_num(&n2, &s);
+    for (i = 0; i < 250; ++i) {
+        secp256k1_num_add(&n1, &n1, &n1);    /* n1 *= 2 */
+        secp256k1_num_add(&n1p2, &n1, &n2);  /* n1p2 = n1 + 1 */
+        CHECK(!secp256k1_num_is_one(&n1p2));
+    }
 }
 
 void test_num_mod(void) {
@@ -531,12 +545,74 @@ void test_num_mod(void) {
     CHECK(secp256k1_num_is_zero(&n));
 }
 
+void test_num_jacobi(void) {
+    secp256k1_scalar sqr;
+    secp256k1_scalar small;
+    secp256k1_scalar five;  /* five is not a quadratic residue */
+    secp256k1_num order, n;
+    int i;
+    /* squares mod 5 are 1, 4 */
+    const int jacobi5[10] = { 0, 1, -1, -1, 1, 0, 1, -1, -1, 1 };
+
+    /* check some small values with 5 as the order */
+    secp256k1_scalar_set_int(&five, 5);
+    secp256k1_scalar_get_num(&order, &five);
+    for (i = 0; i < 10; ++i) {
+        secp256k1_scalar_set_int(&small, i);
+        secp256k1_scalar_get_num(&n, &small);
+        CHECK(secp256k1_num_jacobi(&n, &order) == jacobi5[i]);
+    }
+
+    /** test large values with 5 as group order */
+    secp256k1_scalar_get_num(&order, &five);
+    /* we first need a scalar which is not a multiple of 5 */
+    do {
+        secp256k1_num fiven;
+        random_scalar_order_test(&sqr); 
+        secp256k1_scalar_get_num(&fiven, &five);
+        secp256k1_scalar_get_num(&n, &sqr);
+        secp256k1_num_mod(&n, &fiven);
+    } while (secp256k1_num_is_zero(&n));
+    /* next force it to be a residue. 2 is a nonresidue mod 5 so we can
+     * just multiply by two, i.e. add the number to itself */
+    if (secp256k1_num_jacobi(&n, &order) == -1) {
+        secp256k1_num_add(&n, &n, &n);
+    }
+
+    /* test residue */
+    CHECK(secp256k1_num_jacobi(&n, &order) == 1);
+    /* test nonresidue */
+    secp256k1_num_add(&n, &n, &n);
+    CHECK(secp256k1_num_jacobi(&n, &order) == -1);
+
+    /** test with secp group order as order */
+    secp256k1_scalar_order_get_num(&order);
+    random_scalar_order_test(&sqr); 
+    secp256k1_scalar_sqr(&sqr, &sqr);
+    /* test residue */
+    secp256k1_scalar_get_num(&n, &sqr);
+    CHECK(secp256k1_num_jacobi(&n, &order) == 1);
+    /* test nonresidue */
+    secp256k1_scalar_mul(&sqr, &sqr, &five);
+    secp256k1_scalar_get_num(&n, &sqr);
+    CHECK(secp256k1_num_jacobi(&n, &order) == -1);
+    /* test multiple of the order*/
+    CHECK(secp256k1_num_jacobi(&order, &order) == 0);
+
+    /* check one less than the order */
+    secp256k1_scalar_set_int(&small, 1);
+    secp256k1_scalar_get_num(&n, &small);
+    secp256k1_num_sub(&n, &order, &n);
+    CHECK(secp256k1_num_jacobi(&n, &order) == 1);  /* sage confirms this is 1 */
+}
+
 void run_num_smalltests(void) {
     int i;
     for (i = 0; i < 100*count; i++) {
         test_num_negate();
         test_num_add_sub();
         test_num_mod();
+        test_num_jacobi();
     }
 }
 
@@ -677,6 +753,8 @@ void scalar_test(void) {
             secp256k1_scalar_inverse(&inv, &inv);
             /* Inverting one must result in one. */
             CHECK(secp256k1_scalar_is_one(&inv));
+            secp256k1_scalar_get_num(&invnum, &inv);
+            CHECK(secp256k1_num_is_one(&invnum));
         }
     }
 

From 5f7c1c939dfe0544d9d00a9fdb826cc2cc03c8bc Mon Sep 17 00:00:00 2001
From: Andrew Poelstra <apoelstra@wpsoftware.net>
Date: Sat, 7 Nov 2015 11:51:48 -0600
Subject: [PATCH 5/5] Simplify divmod handling of "top dividend word exceeds
 top divisor word" case

Originally we computed q by dividing the top words of the dividend and
divisor, then corrected it by at most 2. However, the ratio of the top
words can only ever be 1, so rather than having three cases (original
correct; need to subtract 1; need to subtract 2), we actually only have
two (1 is correct; 0 is correct).

This simplifies the algorithm a little bit and gets us to full test coverage,
which we didn't have before since the "correct by 2" case was impossible to
hit.
---
 src/num_native_impl.h | 35 +++++++++++++++++++----------------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/src/num_native_impl.h b/src/num_native_impl.h
index c928e50e0f..48f34a840f 100644
--- a/src/num_native_impl.h
+++ b/src/num_native_impl.h
@@ -383,25 +383,28 @@ static int secp256k1_num_div_mod(secp256k1_num *rq, secp256k1_num *rr, const sec
      * exceeded our base). This can only happen on the first iteration,
      * so we special-case it here before starting the real algorithm. */
     if (rem_high >= div_high) {
+        /* Our shifting above guarantees us that div_high >= B/2, where B is our
+         * word base. By definitien rem_high < B, as rem_high is one word. So we
+         * have inside this if block that:
+         *     B > rem_high >= div_high >= B/2
+         *
+         * The inner inequality forces q = rem_high/div_high >= 1; then the
+         * outer inequalities force q <= 1. So q always equals 1 here.
+         */
         secp256k1_num sub;
-        secp256k1_num_word q = rem_high / div_high;
+        secp256k1_num_word q = 1;
         secp256k1_num_mul_word_shift(&sub, &div, q, output_idx);
-        /* Correct for error in the quotient. This was a while loop, but as it is
-         * mathematically guaranteed to iterate at most twice, we can unroll it
-         * into a pair of nested ifs. Note that we are guaranteed q > 0 here by
-         * virtue of being in this if block. */
+        /* This said, rem_high/div_high = 1 does not necessarily imply that the
+         * actual quotient is 1. As described in the above block comment, this
+         * is an overshoot of at most 2, i.e. maybe the actual quotient is zero.
+         * In this case our reduction is a no-op! So rather than correcting q,
+         * as we do in the analogous check in the loop below, we do nothing. */
         if (secp256k1_num_cmp (&sub, rr) > 0) {
-            secp256k1_num_sub_shift_word (&sub, &sub, &div, output_idx, i + 1);
-            --q;
-            if (secp256k1_num_cmp (&sub, rr) >= 0) {
-                secp256k1_num_sub_shift_word (&sub, &sub, &div, output_idx, i + 1);
-                --q;
-            }
-        }
-        /* Reduce remainder */
-        secp256k1_num_sub_shift_word(rr, rr, &sub, 0, i + 1);
-        rq->data[output_idx] = q;
-        if (q != 0) {
+            /* Do nothing. */
+        } else {
+            /* Reduce remainder */
+            secp256k1_num_sub_shift_word(rr, rr, &sub, 0, i + 1);
+            rq->data[output_idx] = q;
             ret = output_idx;
         }
     }