diff --git a/.gitignore b/.gitignore
index e0b7b7a48a..6596b41d34 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 bench_inv
 bench_ecdh
+bench_ecdh_opt
 bench_sign
 bench_verify
 bench_schnorr_verify
diff --git a/include/secp256k1_ecdh.h b/include/secp256k1_ecdh.h
index 4b84d7a963..5a6da554a2 100644
--- a/include/secp256k1_ecdh.h
+++ b/include/secp256k1_ecdh.h
@@ -24,6 +24,14 @@ SECP256K1_API SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdh(
   const unsigned char *privkey
 ) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(4);
 
+SECP256K1_API SECP256K1_WARN_UNUSED_RESULT int secp256k1_ecdh_opt( /* ECDH on a serialized public key; returns 1 on success, 0 on failure */
+  const secp256k1_context* ctx,   /* context object; declared non-NULL below */
+  unsigned char *result,          /* out: receives the SHA-256 digest of the shared point in compressed form (callers pass 32 bytes) */
+  const unsigned char *pub,       /* in: serialized public key; 33-byte 0x02/0x03 encodings take the optimized path */
+  size_t publen,                  /* in: length of pub in bytes */
+  const unsigned char *privkey    /* in: 32-byte secret scalar; a zero or overflowing value makes the call return 0 */
+) SECP256K1_ARG_NONNULL(1) SECP256K1_ARG_NONNULL(2) SECP256K1_ARG_NONNULL(3) SECP256K1_ARG_NONNULL(5);
+
 # ifdef __cplusplus
 }
 # endif
diff --git a/src/bench_ecdh.c b/src/bench_ecdh.c
index 5a7c6376e0..7e0be39555 100644
--- a/src/bench_ecdh.c
+++ b/src/bench_ecdh.c
@@ -13,13 +13,13 @@
 
 typedef struct {
     secp256k1_context *ctx;
-    secp256k1_pubkey point;
+    secp256k1_pubkey pubkey;
     unsigned char scalar[32];
-} bench_ecdh_t;
+} bench_ecdh_data;
 
 static void bench_ecdh_setup(void* arg) {
     int i;
-    bench_ecdh_t *data = (bench_ecdh_t*)arg;
+    bench_ecdh_data *data = (bench_ecdh_data*)arg;
     const unsigned char point[] = {
         0x03,
         0x54, 0x94, 0xc1, 0x5d, 0x32, 0x09, 0x97, 0x06,
@@ -28,25 +28,25 @@ static void bench_ecdh_setup(void* arg) {
         0xa2, 0xba, 0xd1, 0x84, 0xf8, 0x83, 0xc6, 0x9f
     };
 
-    data->ctx = secp256k1_context_create(0);
+    data->ctx = secp256k1_context_create(SECP256K1_CONTEXT_NONE);
     for (i = 0; i < 32; i++) {
         data->scalar[i] = i + 1;
     }
-    CHECK(secp256k1_ec_pubkey_parse(data->ctx, &data->point, point, sizeof(point)) == 1);
+    CHECK(secp256k1_ec_pubkey_parse(data->ctx, &data->pubkey, point, sizeof(point)) == 1);
 }
 
 static void bench_ecdh(void* arg) {
     int i;
     unsigned char res[32];
-    bench_ecdh_t *data = (bench_ecdh_t*)arg;
+    bench_ecdh_data *data = (bench_ecdh_data*)arg;
 
     for (i = 0; i < 20000; i++) {
-        CHECK(secp256k1_ecdh(data->ctx, res, &data->point, data->scalar) == 1);
+        CHECK(secp256k1_ecdh(data->ctx, res, &data->pubkey, data->scalar) == 1);
     }
 }
 
 int main(void) {
-    bench_ecdh_t data;
+    bench_ecdh_data data;
 
     run_benchmark("ecdh", bench_ecdh, bench_ecdh_setup, NULL, &data, 10, 20000);
     return 0;
diff --git a/src/bench_ecdh_opt.c b/src/bench_ecdh_opt.c
new file mode 100644
index 0000000000..45ac2bcb47
--- /dev/null
+++ b/src/bench_ecdh_opt.c
@@ -0,0 +1,54 @@
+/**********************************************************************
+ * Copyright (c) 2015 Pieter Wuille, Andrew Poelstra                  *
+ * Distributed under the MIT software license, see the accompanying   *
+ * file COPYING or http://www.opensource.org/licenses/mit-license.php.*
+ **********************************************************************/
+
+#include <string.h>
+
+#include "include/secp256k1.h"
+#include "include/secp256k1_ecdh.h"
+#include "util.h"
+#include "bench.h"
+
+typedef struct { /* State shared between bench_ecdh_setup and the timed bench_ecdh loop. */
+    secp256k1_context *ctx;      /* context created by the setup function */
+    unsigned char point[33];     /* serialized public key bytes passed to secp256k1_ecdh_opt */
+    unsigned char scalar[32];    /* secret scalar bytes passed to secp256k1_ecdh_opt */
+} bench_ecdh_data;
+
+static void bench_ecdh_setup(void* arg) { /* Fills the bench_ecdh_data behind arg: context, scalar bytes and serialized point. */
+    int i;
+    bench_ecdh_data *data = (bench_ecdh_data*)arg;
+    const unsigned char point[] = { /* 33 bytes: a 0x03 prefix followed by 32 bytes */
+        0x03,
+        0x54, 0x94, 0xc1, 0x5d, 0x32, 0x09, 0x97, 0x06,
+        0xc2, 0x39, 0x5f, 0x94, 0x34, 0x87, 0x45, 0xfd,
+        0x75, 0x7c, 0xe3, 0x0e, 0x4e, 0x8c, 0x90, 0xfb,
+        0xa2, 0xba, 0xd1, 0x84, 0xf8, 0x83, 0xc6, 0x9f
+    };
+
+    data->ctx = secp256k1_context_create(SECP256K1_CONTEXT_VERIFY);
+    for (i = 0; i < 32; i++) {
+        data->scalar[i] = i + 1; /* scalar bytes are 1, 2, ..., 32 */
+    }
+    CHECK(sizeof(point) == sizeof(data->point)); /* guards the memcpy below against a size mismatch */
+    memcpy(data->point, point, sizeof(point));
+}
+
+static void bench_ecdh(void* arg) { /* Timed body: 20000 secp256k1_ecdh_opt calls on the prepared inputs, each required to return 1. */
+    bench_ecdh_data *state = (bench_ecdh_data*)arg;
+    unsigned char digest[32];
+    int remaining;
+
+    for (remaining = 20000; remaining > 0; remaining--) {
+        CHECK(secp256k1_ecdh_opt(state->ctx, digest, state->point, sizeof(state->point), state->scalar) == 1);
+    }
+}
+
+int main(void) { /* Benchmark entry point: runs bench_ecdh under run_benchmark with bench_ecdh_setup as its setup hook. */
+    bench_ecdh_data data;
+    /* Report as "ecdh_opt" so the output is not confused with bench_ecdh, which already uses the label "ecdh". */
+    run_benchmark("ecdh_opt", bench_ecdh, bench_ecdh_setup, NULL, &data, 10, 20000);
+    return 0;
+}
diff --git a/src/bench_internal.c b/src/bench_internal.c
index 7809f5f8cf..d4efd2f8a6 100644
--- a/src/bench_internal.c
+++ b/src/bench_internal.c
@@ -191,6 +191,26 @@ void bench_field_sqrt_var(void* arg) {
     }
 }
 
+void bench_field_rsqrt_var(void* arg) { /* Times 20000 chained secp256k1_fe_rsqrt_var calls. */
+    bench_inv_t *state = (bench_inv_t*)arg;
+    int remaining = 20000;
+
+    while (remaining-- > 0) {
+        secp256k1_fe_rsqrt_var(&state->fe_x, &state->fe_y, &state->fe_x);
+        secp256k1_fe_add(&state->fe_x, &state->fe_y); /* fold both outputs into the next iteration's input */
+    }
+}
+
+void bench_field_par_rsqrt_inv_var(void* arg) { /* Times 20000 chained secp256k1_fe_par_rsqrt_inv_var calls. */
+    bench_inv_t *state = (bench_inv_t*)arg;
+    int remaining = 20000;
+
+    while (remaining-- > 0) {
+        secp256k1_fe_par_rsqrt_inv_var(&state->fe_x, &state->fe_y, &state->fe_x, &state->fe_y);
+        secp256k1_fe_add(&state->fe_x, &state->fe_y); /* fold both outputs into the next iteration's first input */
+    }
+}
+
 void bench_group_double_var(void* arg) {
     int i;
     bench_inv_t *data = (bench_inv_t*)arg;
@@ -334,6 +354,8 @@ int main(int argc, char **argv) {
     if (have_flag(argc, argv, "field") || have_flag(argc, argv, "inverse")) run_benchmark("field_inverse", bench_field_inverse, bench_setup, NULL, &data, 10, 20000);
     if (have_flag(argc, argv, "field") || have_flag(argc, argv, "inverse")) run_benchmark("field_inverse_var", bench_field_inverse_var, bench_setup, NULL, &data, 10, 20000);
     if (have_flag(argc, argv, "field") || have_flag(argc, argv, "sqrt")) run_benchmark("field_sqrt_var", bench_field_sqrt_var, bench_setup, NULL, &data, 10, 20000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "sqrt")) run_benchmark("field_rsqrt_var", bench_field_rsqrt_var, bench_setup, NULL, &data, 10, 20000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "sqrt") || have_flag(argc, argv, "inverse")) run_benchmark("field_par_rsqrt_inv_var", bench_field_par_rsqrt_inv_var, bench_setup, NULL, &data, 10, 20000);
 
     if (have_flag(argc, argv, "group") || have_flag(argc, argv, "double")) run_benchmark("group_double_var", bench_group_double_var, bench_setup, NULL, &data, 10, 200000);
     if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_var", bench_group_add_var, bench_setup, NULL, &data, 10, 200000);
diff --git a/src/field.h b/src/field.h
index 2d52af5e36..e28fdf71b1 100644
--- a/src/field.h
+++ b/src/field.h
@@ -39,13 +39,11 @@ static void secp256k1_fe_normalize_weak(secp256k1_fe *r);
 /** Normalize a field element, without constant-time guarantee. */
 static void secp256k1_fe_normalize_var(secp256k1_fe *r);
 
-/** Verify whether a field element represents zero i.e. would normalize to a zero value. The field
- *  implementation may optionally normalize the input, but this should not be relied upon. */
-static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r);
+/** Verify whether a field element represents zero i.e. would normalize to a zero value. */
+static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r);
 
-/** Verify whether a field element represents zero i.e. would normalize to a zero value. The field
- *  implementation may optionally normalize the input, but this should not be relied upon. */
-static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r);
+/** Verify whether a field element represents zero i.e. would normalize to a zero value. */
+static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r);
 
 /** Set a field element equal to a small integer. Resulting field element is normalized. */
 static void secp256k1_fe_set_int(secp256k1_fe *r, int a);
@@ -88,12 +86,28 @@ static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp2
 static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a);
 
 /** If a has a square root, it is computed in r and 1 is returned. If a does not
- *  have a square root, the root of its negation is computed and 0 is returned.
+ *  have a square root, the root of -a is computed and 0 is returned.
  *  The input's magnitude can be at most 8. The output magnitude is 1 (but not
  *  guaranteed to be normalized). The result in r will always be a square
  *  itself. */
 static int secp256k1_fe_sqrt_var(secp256k1_fe *r, const secp256k1_fe *a);
 
+/** If a has a square root, the square root is computed in rs, its reciprocal square root is
+ *  calculated in rr, and 1 is returned. If a does not have a square root, the root (and recip. root)
+ *  of -a are computed and 0 is returned. The input's magnitude can be at most 8. The
+ *  outputs' magnitudes are 1 (but not guaranteed to be normalized). The result in rs will always be
+ *  a square itself. The result in rr will be a square if, and only if, a is a square. rs and rr
+ *  must be distinct objects (a is copied first, so it may alias either output). */
+static int secp256k1_fe_rsqrt_var(secp256k1_fe *rs, secp256k1_fe *rr, const secp256k1_fe *a);
+
+/** Parallel reciprocal square root and inverse. Sets ri to be the (modular) inverse of b. If a has a
+ *  square root, the reciprocal of its square root is computed in rr and 1 is returned. If a does not
+ *  have a square root, the reciprocal root of -a is computed and 0 is returned. The inputs'
+ *  magnitudes can be at most 8. The outputs' magnitudes are 1 (but not guaranteed to be normalized).
+ *  The result in rr will be a square if, and only if, a is a square. Neither a nor b may be zero,
+ *  and rr and ri must be distinct objects. */
+static int secp256k1_fe_par_rsqrt_inv_var(secp256k1_fe *rr,  secp256k1_fe *ri, const secp256k1_fe *a, const secp256k1_fe *b);
+
 /** Sets a field element to be the (modular) inverse of another. Requires the input's magnitude to be
  *  at most 8. The output magnitude is 1 (but not guaranteed to be normalized). */
 static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a);
diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h
index 212cc5396a..8751bb3cc0 100644
--- a/src/field_10x26_impl.h
+++ b/src/field_10x26_impl.h
@@ -188,7 +188,7 @@ static void secp256k1_fe_normalize_var(secp256k1_fe *r) {
 #endif
 }
 
-static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {
+static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r) {
     uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4],
              t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];
 
@@ -217,7 +217,7 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {
     return (z0 == 0) | (z1 == 0x3FFFFFFUL);
 }
 
-static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r) {
+static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r) {
     uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
     uint32_t z0, z1;
     uint32_t x;
diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h
index b31e24ab81..3d14962e02 100644
--- a/src/field_5x52_impl.h
+++ b/src/field_5x52_impl.h
@@ -167,7 +167,7 @@ static void secp256k1_fe_normalize_var(secp256k1_fe *r) {
 #endif
 }
 
-static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {
+static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r) {
     uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];
 
     /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */
@@ -190,7 +190,7 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {
     return (z0 == 0) | (z1 == 0xFFFFFFFFFFFFFULL);
 }
 
-static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r) {
+static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r) {
     uint64_t t0, t1, t2, t3, t4;
     uint64_t z0, z1;
     uint64_t x;
diff --git a/src/field_impl.h b/src/field_impl.h
index 77f4aae2f9..b9608bb8a6 100644
--- a/src/field_impl.h
+++ b/src/field_impl.h
@@ -28,29 +28,21 @@ SECP256K1_INLINE static int secp256k1_fe_equal_var(const secp256k1_fe *a, const
     return secp256k1_fe_normalizes_to_zero_var(&na);
 }
 
-static int secp256k1_fe_sqrt_var(secp256k1_fe *r, const secp256k1_fe *a) {
-    /** Given that p is congruent to 3 mod 4, we can compute the square root of
-     *  a mod p as the (p+1)/4'th power of a.
-     *
-     *  As (p+1)/4 is an even number, it will have the same result for a and for
-     *  (-a). Only one of these two numbers actually has a square root however,
-     *  so we test at the end by squaring and comparing to the input.
-     *  Also because (p+1)/4 is an even number, the computed square root is
-     *  itself always a square (a ** ((p+1)/4) is the square of a ** ((p+1)/8)).
-     */
-    secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1;
+static void secp256k1_fe_common_exp(secp256k1_fe *r1, secp256k1_fe *r2, const secp256k1_fe *a) {
+    secp256k1_fe t, x, x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223;
     int j;
 
-    /** The binary representation of (p + 1)/4 has 3 blocks of 1s, with lengths in
-     *  { 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
-     *  1, [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
-     */
+    CHECK(r1 != r2);
+
+    x = *a;
+
+    secp256k1_fe_sqr(&x2, &x);
+    secp256k1_fe_mul(&x2, &x2, &x);
 
-    secp256k1_fe_sqr(&x2, a);
-    secp256k1_fe_mul(&x2, &x2, a);
+    *r2 = x2;
 
     secp256k1_fe_sqr(&x3, &x2);
-    secp256k1_fe_mul(&x3, &x3, a);
+    secp256k1_fe_mul(&x3, &x3, &x);
 
     x6 = x3;
     for (j=0; j<3; j++) {
@@ -108,112 +100,112 @@ static int secp256k1_fe_sqrt_var(secp256k1_fe *r, const secp256k1_fe *a) {
 
     /* The final result is then assembled using a sliding window over the blocks. */
 
-    t1 = x223;
+    t = x223;
     for (j=0; j<23; j++) {
-        secp256k1_fe_sqr(&t1, &t1);
+        secp256k1_fe_sqr(&t, &t);
     }
-    secp256k1_fe_mul(&t1, &t1, &x22);
-    for (j=0; j<6; j++) {
-        secp256k1_fe_sqr(&t1, &t1);
+    secp256k1_fe_mul(&t, &t, &x22);
+
+    for (j=0; j<5; j++) {
+        secp256k1_fe_sqr(&t, &t);
     }
-    secp256k1_fe_mul(&t1, &t1, &x2);
-    secp256k1_fe_sqr(&t1, &t1);
-    secp256k1_fe_sqr(r, &t1);
+    *r1 = t;
+}
+
+static int secp256k1_fe_sqrt_var(secp256k1_fe *r, const secp256k1_fe *a) {
+    secp256k1_fe t, x, x2;
+
+    x = *a;
+
+    secp256k1_fe_common_exp(&t, &x2, &x);
+
+    secp256k1_fe_sqr(&t, &t);
+    secp256k1_fe_mul(&t, &t, &x2);
+    secp256k1_fe_sqr(&t, &t);
+    secp256k1_fe_sqr(&t, &t);
+
+    *r = t;
 
     /* Check that a square root was actually calculated */
 
-    secp256k1_fe_sqr(&t1, r);
-    return secp256k1_fe_equal_var(&t1, a);
+    secp256k1_fe_sqr(&t, &t);
+    return secp256k1_fe_equal_var(&t, &x);
 }
 
-static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) {
-    secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1;
+static int secp256k1_fe_rsqrt_var(secp256k1_fe *rs, secp256k1_fe *rr, const secp256k1_fe *a) {
+    secp256k1_fe t, x, x2;
     int j;
 
-    /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in
-     *  { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
-     *  [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
-     */
+    CHECK(rs != rr);
 
-    secp256k1_fe_sqr(&x2, a);
-    secp256k1_fe_mul(&x2, &x2, a);
+    x = *a;
 
-    secp256k1_fe_sqr(&x3, &x2);
-    secp256k1_fe_mul(&x3, &x3, a);
+    secp256k1_fe_common_exp(&t, &x2, &x);
 
-    x6 = x3;
+    secp256k1_fe_mul(&t, &t, &x);
     for (j=0; j<3; j++) {
-        secp256k1_fe_sqr(&x6, &x6);
+        secp256k1_fe_sqr(&t, &t);
     }
-    secp256k1_fe_mul(&x6, &x6, &x3);
+    secp256k1_fe_mul(&t, &t, &x2);
 
-    x9 = x6;
-    for (j=0; j<3; j++) {
-        secp256k1_fe_sqr(&x9, &x9);
-    }
-    secp256k1_fe_mul(&x9, &x9, &x3);
+    *rr = t;
 
-    x11 = x9;
-    for (j=0; j<2; j++) {
-        secp256k1_fe_sqr(&x11, &x11);
-    }
-    secp256k1_fe_mul(&x11, &x11, &x2);
+    secp256k1_fe_mul(&t, &t, &x);
 
-    x22 = x11;
-    for (j=0; j<11; j++) {
-        secp256k1_fe_sqr(&x22, &x22);
-    }
-    secp256k1_fe_mul(&x22, &x22, &x11);
+    *rs = t;
 
-    x44 = x22;
-    for (j=0; j<22; j++) {
-        secp256k1_fe_sqr(&x44, &x44);
-    }
-    secp256k1_fe_mul(&x44, &x44, &x22);
+    /* Check that a square root was actually calculated */
 
-    x88 = x44;
-    for (j=0; j<44; j++) {
-        secp256k1_fe_sqr(&x88, &x88);
-    }
-    secp256k1_fe_mul(&x88, &x88, &x44);
+    secp256k1_fe_sqr(&t, &t);
+    return secp256k1_fe_equal_var(&t, &x);
+}
 
-    x176 = x88;
-    for (j=0; j<88; j++) {
-        secp256k1_fe_sqr(&x176, &x176);
-    }
-    secp256k1_fe_mul(&x176, &x176, &x88);
+static int secp256k1_fe_par_rsqrt_inv_var(secp256k1_fe *rr, secp256k1_fe *ri, const secp256k1_fe *a, const secp256k1_fe *b) {
 
-    x220 = x176;
-    for (j=0; j<44; j++) {
-        secp256k1_fe_sqr(&x220, &x220);
-    }
-    secp256k1_fe_mul(&x220, &x220, &x44);
+    secp256k1_fe b2, ab2, ab4, sqrt, recip, t;
+    int ret;
 
-    x223 = x220;
-    for (j=0; j<3; j++) {
-        secp256k1_fe_sqr(&x223, &x223);
-    }
-    secp256k1_fe_mul(&x223, &x223, &x3);
+    CHECK(rr != ri);
 
-    /* The final result is then assembled using a sliding window over the blocks. */
+    /* Zero inputs could possibly be handled with conditional moves, if necessary */
+    CHECK(!secp256k1_fe_normalizes_to_zero(a) && !secp256k1_fe_normalizes_to_zero(b));
 
-    t1 = x223;
-    for (j=0; j<23; j++) {
-        secp256k1_fe_sqr(&t1, &t1);
-    }
-    secp256k1_fe_mul(&t1, &t1, &x22);
-    for (j=0; j<5; j++) {
-        secp256k1_fe_sqr(&t1, &t1);
-    }
-    secp256k1_fe_mul(&t1, &t1, a);
+    /* Calculate the reciprocal sqrt of a.b^4 */
+
+    secp256k1_fe_sqr(&b2, b);
+    secp256k1_fe_mul(&ab2, &b2, a);
+    secp256k1_fe_mul(&ab4, &ab2, &b2);
+
+    ret = secp256k1_fe_rsqrt_var(&sqrt, &recip, &ab4);
+
+    /* Inverse */
+    secp256k1_fe_sqr(&t, &recip);
+    secp256k1_fe_mul(&t, &t, &ab2);
+    secp256k1_fe_mul(&t, &t, b);
+    secp256k1_fe_sqr(&t, &t);
+    secp256k1_fe_mul(ri, b, &t);
+
+    /* Reciprocal */
+    secp256k1_fe_mul(rr, &recip, &b2);
+
+    return ret;
+}
+
+static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) {
+    secp256k1_fe t, x2;
+    int j;
+
+    secp256k1_fe_common_exp(&t, &x2, a);
+
+    secp256k1_fe_mul(&t, &t, a);
     for (j=0; j<3; j++) {
-        secp256k1_fe_sqr(&t1, &t1);
+        secp256k1_fe_sqr(&t, &t);
     }
-    secp256k1_fe_mul(&t1, &t1, &x2);
+    secp256k1_fe_mul(&t, &t, &x2);
     for (j=0; j<2; j++) {
-        secp256k1_fe_sqr(&t1, &t1);
+        secp256k1_fe_sqr(&t, &t);
     }
-    secp256k1_fe_mul(r, a, &t1);
+    secp256k1_fe_mul(r, a, &t);
 }
 
 static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) {
diff --git a/src/group.h b/src/group.h
index ebfe1ca70c..27fe0bb244 100644
--- a/src/group.h
+++ b/src/group.h
@@ -53,6 +53,8 @@ static int secp256k1_ge_set_xquad_var(secp256k1_ge *r, const secp256k1_fe *x);
  *  for Y. Return value indicates whether the result is valid. */
 static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int odd);
 
+static void secp256k1_ge_set_xo_iso(secp256k1_ge *r, secp256k1_fe *rk, const secp256k1_fe *x); /* Sets rk = x^3 + 7 and r = (rk*x, rk^2) with the infinity flag cleared; no square root is computed. */
+
 /** Check whether a group element is the point at infinity. */
 static int secp256k1_ge_is_infinity(const secp256k1_ge *a);
 
@@ -64,6 +66,8 @@ static void secp256k1_ge_neg(secp256k1_ge *r, const secp256k1_ge *a);
 /** Set a group element equal to another which is given in jacobian coordinates */
 static void secp256k1_ge_set_gej(secp256k1_ge *r, secp256k1_gej *a);
 
+static void secp256k1_ge_set_gej_zinv(secp256k1_ge *r, const secp256k1_gej *a, const secp256k1_fe *zi); /* NOTE(review): definition not visible in this patch; presumably sets r from a using zi in place of the inverse of a's z coordinate - confirm */
+
 /** Set a batch of group elements equal to the inputs given in jacobian coordinates */
 static void secp256k1_ge_set_all_gej_var(size_t len, secp256k1_ge *r, const secp256k1_gej *a, const secp256k1_callback *cb);
 
diff --git a/src/group_impl.h b/src/group_impl.h
index 42e2f6e6eb..982429a367 100644
--- a/src/group_impl.h
+++ b/src/group_impl.h
@@ -185,7 +185,22 @@ static int secp256k1_ge_set_xo_var(secp256k1_ge *r, const secp256k1_fe *x, int o
         secp256k1_fe_negate(&r->y, &r->y, 1);
     }
     return 1;
+}
+
+static void secp256k1_ge_set_xo_iso(secp256k1_ge *r, secp256k1_fe *rk, const secp256k1_fe *x) { /* Sets rk = K = x^3 + 7 and r = (K*x, K^2), which satisfies y^2 = x^3 + 7*K^3; no square root is taken. */
+    secp256k1_fe t;
+
+    secp256k1_fe_sqr(&t, x);
+    secp256k1_fe_mul(&t, &t, x);        /* t = X^3 */
+    secp256k1_fe_set_int(rk, 7);
+    secp256k1_fe_add(rk, &t);           /* K = X^3 + 7 (2) */
 
+    /* Perhaps redundant as the equation cannot give 0 (K == 0 would correspond to a curve point with y == 0) */
+    CHECK(!secp256k1_fe_normalizes_to_zero(rk));
+    /* NOTE(review): CHECK is also the assertion macro used by the tests and benchmarks; confirm it is intended in library code. */
+    r->infinity = 0;
+    secp256k1_fe_mul(&r->x, rk, x);     /* r->x = K*X (1) */
+    secp256k1_fe_sqr(&r->y, rk);        /* r->y = K^2 (1) */
 }
 
 static void secp256k1_gej_set_ge(secp256k1_gej *r, const secp256k1_ge *a) {
diff --git a/src/modules/ecdh/Makefile.am.include b/src/modules/ecdh/Makefile.am.include
index 670b9c1152..09c9aa1f8b 100644
--- a/src/modules/ecdh/Makefile.am.include
+++ b/src/modules/ecdh/Makefile.am.include
@@ -2,7 +2,9 @@ include_HEADERS += include/secp256k1_ecdh.h
 noinst_HEADERS += src/modules/ecdh/main_impl.h
 noinst_HEADERS += src/modules/ecdh/tests_impl.h
 if USE_BENCHMARK
-noinst_PROGRAMS += bench_ecdh
+noinst_PROGRAMS += bench_ecdh bench_ecdh_opt
 bench_ecdh_SOURCES = src/bench_ecdh.c
 bench_ecdh_LDADD = libsecp256k1.la $(SECP_LIBS)
+bench_ecdh_opt_SOURCES = src/bench_ecdh_opt.c
+bench_ecdh_opt_LDADD = libsecp256k1.la $(SECP_LIBS)
 endif
diff --git a/src/modules/ecdh/main_impl.h b/src/modules/ecdh/main_impl.h
index c23e4f82f7..a56aaa7921 100644
--- a/src/modules/ecdh/main_impl.h
+++ b/src/modules/ecdh/main_impl.h
@@ -10,7 +10,26 @@
 #include "include/secp256k1_ecdh.h"
 #include "ecmult_const_impl.h"
 
-int secp256k1_ecdh(const secp256k1_context* ctx, unsigned char *result, const secp256k1_pubkey *point, const unsigned char *scalar) {
+static void secp256k1_ecdh_hash(unsigned char *result, secp256k1_ge *pt) { /* Writes SHA-256(prefix byte || 32-byte x of pt) to result; pt's coordinates are normalized in place. */
+    unsigned char x[32];
+    unsigned char y[1];
+    secp256k1_sha256_t sha;
+
+    /* Compute a hash of the point in compressed form
+     * Note we cannot use secp256k1_eckey_pubkey_serialize here since it does not
+     * expect its output to be secret and has a timing sidechannel. */
+    secp256k1_fe_normalize(&pt->x); /* modifies the caller's point */
+    secp256k1_fe_normalize(&pt->y);
+    secp256k1_fe_get_b32(x, &pt->x);
+    y[0] = 0x02 | secp256k1_fe_is_odd(&pt->y); /* prefix byte: 0x02 with the oddness of y OR-ed in */
+
+    secp256k1_sha256_initialize(&sha);
+    secp256k1_sha256_write(&sha, y, sizeof(y)); /* the prefix byte is hashed first, then the x bytes */
+    secp256k1_sha256_write(&sha, x, sizeof(x));
+    secp256k1_sha256_finalize(&sha, result);
+}
+
+int secp256k1_ecdh(const secp256k1_context *ctx, unsigned char *result, const secp256k1_pubkey *point, const unsigned char *scalar) {
     int ret = 0;
     int overflow = 0;
     secp256k1_gej res;
@@ -21,29 +40,14 @@ int secp256k1_ecdh(const secp256k1_context* ctx, unsigned char *result, const se
     ARG_CHECK(scalar != NULL);
     (void)ctx;
 
-    secp256k1_pubkey_load(ctx, &pt, point);
     secp256k1_scalar_set_b32(&s, scalar, &overflow);
     if (overflow || secp256k1_scalar_is_zero(&s)) {
         ret = 0;
     } else {
-        unsigned char x[32];
-        unsigned char y[1];
-        secp256k1_sha256_t sha;
-
+        secp256k1_pubkey_load(ctx, &pt, point);
         secp256k1_ecmult_const(&res, &pt, &s);
         secp256k1_ge_set_gej(&pt, &res);
-        /* Compute a hash of the point in compressed form
-         * Note we cannot use secp256k1_eckey_pubkey_serialize here since it does not
-         * expect its output to be secret and has a timing sidechannel. */
-        secp256k1_fe_normalize(&pt.x);
-        secp256k1_fe_normalize(&pt.y);
-        secp256k1_fe_get_b32(x, &pt.x);
-        y[0] = 0x02 | secp256k1_fe_is_odd(&pt.y);
-
-        secp256k1_sha256_initialize(&sha);
-        secp256k1_sha256_write(&sha, y, sizeof(y));
-        secp256k1_sha256_write(&sha, x, sizeof(x));
-        secp256k1_sha256_finalize(&sha, result);
+        secp256k1_ecdh_hash(result, &pt);
         ret = 1;
     }
 
@@ -51,4 +55,70 @@ int secp256k1_ecdh(const secp256k1_context* ctx, unsigned char *result, const se
     return ret;
 }
 
+int secp256k1_ecdh_opt(const secp256k1_context *ctx, unsigned char *result, const unsigned char *pub, size_t publen, const unsigned char *scalar) { /* ECDH on a serialized key; for 33-byte 0x02/0x03 keys the y coordinate is never recovered up front. Returns 1 on success, 0 otherwise. */
+    int ret = 0;
+    int overflow = 0;
+    secp256k1_fe k, t, zi;
+    secp256k1_gej res;
+    secp256k1_ge pt;
+    secp256k1_scalar s;
+    ARG_CHECK(result != NULL);
+    ARG_CHECK(pub != NULL);
+    ARG_CHECK(scalar != NULL);
+    (void)ctx;
+    /* Anything other than a 33-byte 0x02/0x03 encoding is parsed normally and handed to secp256k1_ecdh. */
+    if (!(publen == 33 && (pub[0] == 0x02 || pub[0] == 0x03))) {
+        secp256k1_pubkey pubkey;
+        if (!secp256k1_ec_pubkey_parse(ctx, &pubkey, pub, publen)) {
+            return 0;
+        }
+        return secp256k1_ecdh(ctx, result, &pubkey, scalar);
+    }
+
+    if (!secp256k1_fe_set_b32(&t, &pub[1])) { /* t = x coordinate from the 32 bytes after the prefix; reject if the conversion reports failure */
+        return 0;
+    }
+
+    secp256k1_scalar_set_b32(&s, scalar, &overflow);
+    if (overflow || secp256k1_scalar_is_zero(&s)) {
+        ret = 0;
+    } else {
+        /*
+         * Construct the point (k*x, k^2) with k = x^3 + 7; it lies on y^2 = x^3 + 7*k^3, an isomorphism described by u^2 == k (possibly on the twist)
+         */
+        secp256k1_ge_set_xo_iso(&pt, &k, &t);
+        secp256k1_ecmult_const(&res, &pt, &s);
+
+        /*
+         * Set 't' to the reciprocal sqrt of 'k' and 'zi' to the inverse of 'res.z'.
+         *
+         * TODO Infinity is a possibility because the twist has smaller order - the sqrt
+         *      would not be found in that case anyway, but need to handle possible 0 in res.z?
+         *      (Probably need cmovs based on res.infinity into 'ret' and 'res.z', to replace
+         *      the simple test here.)
+         * TODO Need secp256k1_fe_par_rsqrt_inv_var to be constant-time (and maybe handle 0?)
+         */
+        if (res.infinity || !secp256k1_fe_par_rsqrt_inv_var(&t, &zi, &k, &res.z)) { /* a 0 return means k has no square root, i.e. no y satisfies y^2 = x^3 + 7 */
+            ret = 0;
+        }
+        else {
+            secp256k1_fe_mul(&zi, &zi, &t); /* zi = 1 / (res.z * sqrt(k)) */
+
+            /* Set (+/-)t = "pub.y" (from compressed input); adjust the sign of 'zi' accordingly */
+            secp256k1_fe_mul(&t, &t, &k); /* t = k / sqrt(k) = sqrt(k) */
+            secp256k1_fe_normalize(&t); /* normalized before the parity test */
+            if (secp256k1_fe_is_odd(&t) != (pub[0] == 0x03)) {
+                secp256k1_fe_negate(&zi, &zi, 1);
+            }
+
+            secp256k1_ge_set_gej_zinv(&pt, &res, &zi); /* pt is reused for the result, converted with the adjusted inverse */
+            secp256k1_ecdh_hash(result, &pt);
+            ret = 1;
+        }
+    }
+
+    secp256k1_scalar_clear(&s); /* only reached after s has been set; the early returns above happen before that */
+    return ret;
+}
+
 #endif
diff --git a/src/modules/ecdh/tests_impl.h b/src/modules/ecdh/tests_impl.h
index 7badc9033f..54d12f5c24 100644
--- a/src/modules/ecdh/tests_impl.h
+++ b/src/modules/ecdh/tests_impl.h
@@ -41,7 +41,43 @@ void test_ecdh_generator_basepoint(void) {
     }
 }
 
-void test_bad_scalar(void) {
+void test_ecdh_opt_generator_basepoint(void) { /* Checks secp256k1_ecdh_opt against an explicit hash of s*G for 100 random scalars s. */
+    unsigned char s_one[32] = { 0 };
+    secp256k1_pubkey point[2];
+    int i;
+
+    s_one[31] = 1; /* big-endian 1, so point[0] below is the public key for secret key 1 */
+    /* Check against pubkey creation when the basepoint is the generator */
+    for (i = 0; i < 100; ++i) {
+        secp256k1_sha256_t sha;
+        unsigned char s_b32[32];
+        unsigned char output_ecdh[32];
+        unsigned char output_ser[32];
+        unsigned char point_ser[33];
+        size_t point_ser_len = sizeof(point_ser);
+        secp256k1_scalar s;
+
+        random_scalar_order(&s);
+        secp256k1_scalar_get_b32(s_b32, &s);
+
+        /* compute using ECDH function */
+        CHECK(secp256k1_ec_pubkey_create(ctx, &point[0], s_one) == 1);
+        CHECK(secp256k1_ec_pubkey_serialize(ctx, point_ser, &point_ser_len, &point[0], SECP256K1_EC_COMPRESSED) == 1);
+        CHECK(point_ser_len == sizeof(point_ser)); /* 33 bytes, so the call below takes the optimized path */
+        CHECK(secp256k1_ecdh_opt(ctx, output_ecdh, point_ser, point_ser_len, s_b32) == 1);
+        /* compute "explicitly" */
+        CHECK(secp256k1_ec_pubkey_create(ctx, &point[1], s_b32) == 1);
+        CHECK(secp256k1_ec_pubkey_serialize(ctx, point_ser, &point_ser_len, &point[1], SECP256K1_EC_COMPRESSED) == 1); /* point_ser is reused for the expected point */
+        CHECK(point_ser_len == sizeof(point_ser));
+        secp256k1_sha256_initialize(&sha);
+        secp256k1_sha256_write(&sha, point_ser, point_ser_len);
+        secp256k1_sha256_finalize(&sha, output_ser);
+        /* compare */
+        CHECK(memcmp(output_ecdh, output_ser, sizeof(output_ser)) == 0);
+    }
+}
+
+void test_ecdh_bad_scalar(void) {
     unsigned char s_zero[32] = { 0 };
     unsigned char s_overflow[32] = {
         0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
@@ -67,9 +103,41 @@ void test_bad_scalar(void) {
     CHECK(secp256k1_ecdh(ctx, output, &point, s_overflow) == 1);
 }
 
+void test_ecdh_opt_bad_scalar(void) { /* Checks that secp256k1_ecdh_opt rejects a zero and an out-of-range scalar but accepts a valid one. */
+    unsigned char s_zero[32] = { 0 };
+    unsigned char s_overflow[32] = { /* expected to be rejected below; the value one less is expected to be accepted */
+        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+        0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe,
+        0xba, 0xae, 0xdc, 0xe6, 0xaf, 0x48, 0xa0, 0x3b,
+        0xbf, 0xd2, 0x5e, 0x8c, 0xd0, 0x36, 0x41, 0x41
+    };
+    unsigned char s_rand[32] = { 0 };
+    unsigned char output[32];
+    secp256k1_scalar rand;
+    secp256k1_pubkey point;
+    unsigned char point_ser[33];
+    size_t point_ser_len = sizeof(point_ser);
+
+    /* Create random point */
+    random_scalar_order(&rand);
+    secp256k1_scalar_get_b32(s_rand, &rand);
+    CHECK(secp256k1_ec_pubkey_create(ctx, &point, s_rand) == 1);
+    CHECK(secp256k1_ec_pubkey_serialize(ctx, point_ser, &point_ser_len, &point, SECP256K1_EC_COMPRESSED) == 1);
+    CHECK(point_ser_len == sizeof(point_ser));
+
+    /* Try to multiply it by bad values */
+    CHECK(secp256k1_ecdh_opt(ctx, output, point_ser, point_ser_len, s_zero) == 0);
+    CHECK(secp256k1_ecdh_opt(ctx, output, point_ser, point_ser_len, s_overflow) == 0);
+    /* ...and a good one */
+    s_overflow[31] -= 1; /* decrement the last byte: 0x41 -> 0x40 */
+    CHECK(secp256k1_ecdh_opt(ctx, output, point_ser, point_ser_len, s_overflow) == 1);
+}
+
 void run_ecdh_tests(void) {
     test_ecdh_generator_basepoint();
-    test_bad_scalar();
+    test_ecdh_opt_generator_basepoint();
+    test_ecdh_bad_scalar();
+    test_ecdh_opt_bad_scalar();
 }
 
 #endif
diff --git a/src/tests.c b/src/tests.c
index 3abfe1254c..49e96af24a 100644
--- a/src/tests.c
+++ b/src/tests.c
@@ -1647,9 +1647,10 @@ void test_sqrt(const secp256k1_fe *a, const secp256k1_fe *k) {
     if (k != NULL) {
         /* Check that the returned root is +/- the given known answer */
         secp256k1_fe_negate(&r2, &r1, 1);
-        secp256k1_fe_add(&r1, k); secp256k1_fe_add(&r2, k);
-        secp256k1_fe_normalize(&r1); secp256k1_fe_normalize(&r2);
-        CHECK(secp256k1_fe_is_zero(&r1) || secp256k1_fe_is_zero(&r2));
+        CHECK(check_fe_equal(k, &r1) || check_fe_equal(k, &r2));
+
+        /* Our sqrt guarantees to return the root that is itself a square */
+        CHECK(secp256k1_fe_sqrt_var(&r1, &r1) != 0);
     }
 }
 
@@ -1687,6 +1688,107 @@ void run_sqrt(void) {
     }
 }
 
+void test_rsqrt(const secp256k1_fe *a, const secp256k1_fe *k) { /* k is a known root of a, or NULL when a is expected to have no root. */
+    secp256k1_fe r1, r2;
+    int v = secp256k1_fe_rsqrt_var(&r1, &r2, a); /* r1 = root output (rs), r2 = reciprocal root output (rr) */
+    CHECK((v == 0) == (k == NULL));
+
+    secp256k1_fe_mul(&r2, &r2, a); /* rr * a must equal rs, whether or not a root exists */
+    CHECK(check_fe_equal(&r1, &r2));
+
+    if (k != NULL) {
+        /* Check that the returned root is +/- the given known answer */
+        secp256k1_fe_negate(&r2, &r1, 1);
+        CHECK(check_fe_equal(k, &r1) || check_fe_equal(k, &r2));
+
+        /* Our sqrt guarantees to return the root that is itself a square */
+        CHECK(secp256k1_fe_rsqrt_var(&r1, &r2, &r1) != 0); /* the input aliases the first output here */
+    }
+}
+
+void run_rsqrt(void) { /* Drives test_rsqrt over zero, small squares, random squares and values expected to have no root. */
+    secp256k1_fe ns, x, s, t;
+    int i;
+
+    /* Check sqrt(0) is 0 */
+    secp256k1_fe_set_int(&x, 0);
+    secp256k1_fe_sqr(&s, &x);
+    test_rsqrt(&s, &x);
+
+    /* Check sqrt of small squares (and their negatives) */
+    for (i = 1; i <= 100; i++) {
+        secp256k1_fe_set_int(&x, i);
+        secp256k1_fe_sqr(&s, &x);
+        test_rsqrt(&s, &x);
+        secp256k1_fe_negate(&t, &s, 1);
+        test_rsqrt(&t, NULL); /* the negated square is expected to have no root */
+    }
+
+    /* Consistency checks for large random values */
+    for (i = 0; i < 10; i++) {
+        int j;
+        random_fe_non_square(&ns);
+        for (j = 0; j < count; j++) {
+            random_fe(&x);
+            secp256k1_fe_sqr(&s, &x);
+            test_rsqrt(&s, &x);
+            secp256k1_fe_negate(&t, &s, 1);
+            test_rsqrt(&t, NULL);
+            secp256k1_fe_mul(&t, &s, &ns); /* square times the non-square ns: expected to have no root */
+            test_rsqrt(&t, NULL);
+        }
+    }
+}
+
+void test_par_rsqrt_inv(const secp256k1_fe *a, const secp256k1_fe *k) { /* k is a known root of a, or NULL when a is expected to have no root. */
+    secp256k1_fe r1, r2, x, xi, t;
+    int v;
+
+    random_fe_non_zero(&x); /* random element to invert alongside the reciprocal root */
+    v = secp256k1_fe_par_rsqrt_inv_var(&r2, &xi, a, &x);
+    CHECK((v == 0) == (k == NULL));
+
+    /* Derive the square root from the reciprocal square root */
+    secp256k1_fe_mul(&r1, &r2, a);
+
+    if (k != NULL) {
+        /* Check that the calculated root is +/- the given known answer */
+        secp256k1_fe_negate(&t, &r1, 1);
+        CHECK(check_fe_equal(k, &r1) || check_fe_equal(k, &t));
+    }
+
+    CHECK(check_fe_inverse(&x, &xi)); /* the inverse is checked whether or not a had a root */
+}
+
+void run_par_rsqrt_inv(void) { /* Drives test_par_rsqrt_inv; zero is never passed because the function under test CHECKs for non-zero inputs. */
+    secp256k1_fe ns, x, s, t;
+    int i;
+
+    /* Check reciprocal sqrt (and the parallel inverse) for small squares and their negatives */
+    for (i = 1; i <= 100; i++) {
+        secp256k1_fe_set_int(&x, i);
+        secp256k1_fe_sqr(&s, &x);
+        test_par_rsqrt_inv(&s, &x);
+        secp256k1_fe_negate(&t, &s, 1);
+        test_par_rsqrt_inv(&t, NULL); /* the negated square is expected to have no root */
+    }
+
+    /* Consistency checks for large random values */
+    for (i = 0; i < 10; i++) {
+        int j;
+        random_fe_non_square(&ns);
+        for (j = 0; j < count; j++) {
+            random_fe_non_zero(&x);
+            secp256k1_fe_sqr(&s, &x);
+            test_par_rsqrt_inv(&s, &x);
+            secp256k1_fe_negate(&t, &s, 1);
+            test_par_rsqrt_inv(&t, NULL);
+            secp256k1_fe_mul(&t, &s, &ns); /* square times the non-square ns: expected to have no root */
+            test_par_rsqrt_inv(&t, NULL);
+        }
+    }
+}
+
 /***** GROUP TESTS *****/
 
 void ge_equals_ge(const secp256k1_ge *a, const secp256k1_ge *b) {
@@ -4323,6 +4425,8 @@ int main(int argc, char **argv) {
     run_field_convert();
     run_sqr();
     run_sqrt();
+    run_rsqrt();
+    run_par_rsqrt_inv();
 
     /* group tests */
     run_ge();