Reciprocal square root, parallel r-sqrt/inv

peterdettman · peterdettman · commit 61df4707cb5b · 2015-11-29T17:16:04.000+07:00
- Factor out common exponentiation prelude for _fe_sqrt_var/fe_inv
- Allow _fe_sqrt_var output to overwrite input
- Make argument for _fe_normalizes_to_zero(_var) const
- Add _fe_rsqrt_var for calculating reciprocal square root (1/sqrt(x))
- Add _fe_par_rsqrt_inv_var to calculate the r-sqrt of one input and
  the inverse of a second input, with a single exponentiation.
diff --git a/src/bench_internal.c b/src/bench_internal.c
@@ -191,6 +191,26 @@ void bench_field_sqrt_var(void* arg) {
     }
 }
 
+void bench_field_rsqrt_var(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_fe_rsqrt_var(&data->fe_x, &data->fe_y, &data->fe_x);
+        secp256k1_fe_add(&data->fe_x, &data->fe_y);
+    }
+}
+
+void bench_field_par_rsqrt_inv_var(void* arg) {
+    int i;
+    bench_inv_t *data = (bench_inv_t*)arg;
+
+    for (i = 0; i < 20000; i++) {
+        secp256k1_fe_par_rsqrt_inv_var(&data->fe_x, &data->fe_y, &data->fe_x, &data->fe_y);
+        secp256k1_fe_add(&data->fe_x, &data->fe_y);
+    }
+}
+
 void bench_group_double_var(void* arg) {
     int i;
     bench_inv_t *data = (bench_inv_t*)arg;
@@ -334,6 +354,8 @@ int main(int argc, char **argv) {
     if (have_flag(argc, argv, "field") || have_flag(argc, argv, "inverse")) run_benchmark("field_inverse", bench_field_inverse, bench_setup, NULL, &data, 10, 20000);
     if (have_flag(argc, argv, "field") || have_flag(argc, argv, "inverse")) run_benchmark("field_inverse_var", bench_field_inverse_var, bench_setup, NULL, &data, 10, 20000);
     if (have_flag(argc, argv, "field") || have_flag(argc, argv, "sqrt")) run_benchmark("field_sqrt_var", bench_field_sqrt_var, bench_setup, NULL, &data, 10, 20000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "sqrt")) run_benchmark("field_rsqrt_var", bench_field_rsqrt_var, bench_setup, NULL, &data, 10, 20000);
+    if (have_flag(argc, argv, "field") || have_flag(argc, argv, "sqrt") || have_flag(argc, argv, "inverse")) run_benchmark("field_par_rsqrt_inv_var", bench_field_par_rsqrt_inv_var, bench_setup, NULL, &data, 10, 20000);
 
     if (have_flag(argc, argv, "group") || have_flag(argc, argv, "double")) run_benchmark("group_double_var", bench_group_double_var, bench_setup, NULL, &data, 10, 200000);
     if (have_flag(argc, argv, "group") || have_flag(argc, argv, "add")) run_benchmark("group_add_var", bench_group_add_var, bench_setup, NULL, &data, 10, 200000);
diff --git a/src/field.h b/src/field.h
@@ -39,13 +39,11 @@ static void secp256k1_fe_normalize_weak(secp256k1_fe *r);
 /** Normalize a field element, without constant-time guarantee. */
 static void secp256k1_fe_normalize_var(secp256k1_fe *r);
 
-/** Verify whether a field element represents zero i.e. would normalize to a zero value. The field
- *  implementation may optionally normalize the input, but this should not be relied upon. */
-static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r);
+/** Verify whether a field element represents zero i.e. would normalize to a zero value. */
+static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r);
 
-/** Verify whether a field element represents zero i.e. would normalize to a zero value. The field
- *  implementation may optionally normalize the input, but this should not be relied upon. */
-static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r);
+/** Verify whether a field element represents zero i.e. would normalize to a zero value. */
+static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r);
 
 /** Set a field element equal to a small integer. Resulting field element is normalized. */
 static void secp256k1_fe_set_int(secp256k1_fe *r, int a);
@@ -88,12 +86,28 @@ static void secp256k1_fe_mul(secp256k1_fe *r, const secp256k1_fe *a, const secp2
 static void secp256k1_fe_sqr(secp256k1_fe *r, const secp256k1_fe *a);
 
 /** If a has a square root, it is computed in r and 1 is returned. If a does not
- *  have a square root, the root of its negation is computed and 0 is returned.
+ *  have a square root, the root of -a is computed and 0 is returned.
  *  The input's magnitude can be at most 8. The output magnitude is 1 (but not
  *  guaranteed to be normalized). The result in r will always be a square
  *  itself. */
 static int secp256k1_fe_sqrt_var(secp256k1_fe *r, const secp256k1_fe *a);
 
+/** If a has a square root, the square root is computed in rs, its reciprocal square root is
+ *  calculated in rr, and 1 is returned. If a does not have a square root, the root (and recip. root)
+ *  of -a are computed and 0 is returned. The input's magnitude can be at most 8. The
+ *  outputs' magnitudes are 1 (but not guaranteed to be normalized). The result in rs will always be
+ *  a square itself. The result in rr will be a square if, and only if, a is a square.
+ */
+static int secp256k1_fe_rsqrt_var(secp256k1_fe *rs, secp256k1_fe *rr, const secp256k1_fe *a);
+
+/** Parallel reciprocal square root and inverse. Sets ri to be the (modular) inverse of b. If a has a
+ *  square root, the reciprocal of its square root is computed in rr and 1 is returned. If a does not
+ *  have a square root, the reciprocal root of -a is computed and 0 is returned. The inputs'
+ *  magnitudes can be at most 8. The outputs' magnitudes are 1 (but not guaranteed to be normalized).
+ *  The result in rr will be a square if, and only if, a is a square.
+ */
+static int secp256k1_fe_par_rsqrt_inv_var(secp256k1_fe *rr,  secp256k1_fe *ri, const secp256k1_fe *a, const secp256k1_fe *b);
+
 /** Sets a field element to be the (modular) inverse of another. Requires the input's magnitude to be
  *  at most 8. The output magnitude is 1 (but not guaranteed to be normalized). */
 static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a);
diff --git a/src/field_10x26_impl.h b/src/field_10x26_impl.h
@@ -188,7 +188,7 @@ static void secp256k1_fe_normalize_var(secp256k1_fe *r) {
 #endif
 }
 
-static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {
+static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r) {
     uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4],
              t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];
 
@@ -217,7 +217,7 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {
     return (z0 == 0) | (z1 == 0x3FFFFFFUL);
 }
 
-static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r) {
+static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r) {
     uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
     uint32_t z0, z1;
     uint32_t x;
diff --git a/src/field_5x52_impl.h b/src/field_5x52_impl.h
@@ -167,7 +167,7 @@ static void secp256k1_fe_normalize_var(secp256k1_fe *r) {
 #endif
 }
 
-static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {
+static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r) {
     uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];
 
     /* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */
@@ -190,7 +190,7 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {
     return (z0 == 0) | (z1 == 0xFFFFFFFFFFFFFULL);
 }
 
-static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r) {
+static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r) {
     uint64_t t0, t1, t2, t3, t4;
     uint64_t z0, z1;
     uint64_t x;
diff --git a/src/field_impl.h b/src/field_impl.h
@@ -28,29 +28,21 @@ SECP256K1_INLINE static int secp256k1_fe_equal_var(const secp256k1_fe *a, const
     return secp256k1_fe_normalizes_to_zero_var(&na);
 }
 
-static int secp256k1_fe_sqrt_var(secp256k1_fe *r, const secp256k1_fe *a) {
-    /** Given that p is congruent to 3 mod 4, we can compute the square root of
-     *  a mod p as the (p+1)/4'th power of a.
-     *
-     *  As (p+1)/4 is an even number, it will have the same result for a and for
-     *  (-a). Only one of these two numbers actually has a square root however,
-     *  so we test at the end by squaring and comparing to the input.
-     *  Also because (p+1)/4 is an even number, the computed square root is
-     *  itself always a square (a ** ((p+1)/4) is the square of a ** ((p+1)/8)).
-     */
-    secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1;
+static void secp256k1_fe_common_exp(secp256k1_fe *r1, secp256k1_fe *r2, const secp256k1_fe *a) {
+    secp256k1_fe t, x, x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223;
     int j;
 
-    /** The binary representation of (p + 1)/4 has 3 blocks of 1s, with lengths in
-     *  { 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
-     *  1, [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
-     */
+    CHECK(r1 != r2);
+
+    x = *a;
+
+    secp256k1_fe_sqr(&x2, &x);
+    secp256k1_fe_mul(&x2, &x2, &x);
 
-    secp256k1_fe_sqr(&x2, a);
-    secp256k1_fe_mul(&x2, &x2, a);
+    *r2 = x2;
 
     secp256k1_fe_sqr(&x3, &x2);
-    secp256k1_fe_mul(&x3, &x3, a);
+    secp256k1_fe_mul(&x3, &x3, &x);
 
     x6 = x3;
     for (j=0; j<3; j++) {
@@ -108,112 +100,112 @@ static int secp256k1_fe_sqrt_var(secp256k1_fe *r, const secp256k1_fe *a) {
 
     /* The final result is then assembled using a sliding window over the blocks. */
 
-    t1 = x223;
+    t = x223;
     for (j=0; j<23; j++) {
-        secp256k1_fe_sqr(&t1, &t1);
+        secp256k1_fe_sqr(&t, &t);
     }
-    secp256k1_fe_mul(&t1, &t1, &x22);
-    for (j=0; j<6; j++) {
-        secp256k1_fe_sqr(&t1, &t1);
+    secp256k1_fe_mul(&t, &t, &x22);
+
+    for (j=0; j<5; j++) {
+        secp256k1_fe_sqr(&t, &t);
     }
-    secp256k1_fe_mul(&t1, &t1, &x2);
-    secp256k1_fe_sqr(&t1, &t1);
-    secp256k1_fe_sqr(r, &t1);
+    *r1 = t;
+}
+
+static int secp256k1_fe_sqrt_var(secp256k1_fe *r, const secp256k1_fe *a) {
+    secp256k1_fe t, x, x2;
+
+    x = *a;
+
+    secp256k1_fe_common_exp(&t, &x2, &x);
+
+    secp256k1_fe_sqr(&t, &t);
+    secp256k1_fe_mul(&t, &t, &x2);
+    secp256k1_fe_sqr(&t, &t);
+    secp256k1_fe_sqr(&t, &t);
+
+    *r = t;
 
     /* Check that a square root was actually calculated */
 
-    secp256k1_fe_sqr(&t1, r);
-    return secp256k1_fe_equal_var(&t1, a);
+    secp256k1_fe_sqr(&t, &t);
+    return secp256k1_fe_equal_var(&t, &x);
 }
 
-static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) {
-    secp256k1_fe x2, x3, x6, x9, x11, x22, x44, x88, x176, x220, x223, t1;
+static int secp256k1_fe_rsqrt_var(secp256k1_fe *rs, secp256k1_fe *rr, const secp256k1_fe *a) {
+    secp256k1_fe t, x, x2;
     int j;
 
-    /** The binary representation of (p - 2) has 5 blocks of 1s, with lengths in
-     *  { 1, 2, 22, 223 }. Use an addition chain to calculate 2^n - 1 for each block:
-     *  [1], [2], 3, 6, 9, 11, [22], 44, 88, 176, 220, [223]
-     */
+    CHECK(rs != rr);
 
-    secp256k1_fe_sqr(&x2, a);
-    secp256k1_fe_mul(&x2, &x2, a);
+    x = *a;
 
-    secp256k1_fe_sqr(&x3, &x2);
-    secp256k1_fe_mul(&x3, &x3, a);
+    secp256k1_fe_common_exp(&t, &x2, &x);
 
-    x6 = x3;
+    secp256k1_fe_mul(&t, &t, &x);
     for (j=0; j<3; j++) {
-        secp256k1_fe_sqr(&x6, &x6);
+        secp256k1_fe_sqr(&t, &t);
     }
-    secp256k1_fe_mul(&x6, &x6, &x3);
+    secp256k1_fe_mul(&t, &t, &x2);
 
-    x9 = x6;
-    for (j=0; j<3; j++) {
-        secp256k1_fe_sqr(&x9, &x9);
-    }
-    secp256k1_fe_mul(&x9, &x9, &x3);
+    *rr = t;
 
-    x11 = x9;
-    for (j=0; j<2; j++) {
-        secp256k1_fe_sqr(&x11, &x11);
-    }
-    secp256k1_fe_mul(&x11, &x11, &x2);
+    secp256k1_fe_mul(&t, &t, &x);
 
-    x22 = x11;
-    for (j=0; j<11; j++) {
-        secp256k1_fe_sqr(&x22, &x22);
-    }
-    secp256k1_fe_mul(&x22, &x22, &x11);
+    *rs = t;
 
-    x44 = x22;
-    for (j=0; j<22; j++) {
-        secp256k1_fe_sqr(&x44, &x44);
-    }
-    secp256k1_fe_mul(&x44, &x44, &x22);
+    /* Check that a square root was actually calculated */
 
-    x88 = x44;
-    for (j=0; j<44; j++) {
-        secp256k1_fe_sqr(&x88, &x88);
-    }
-    secp256k1_fe_mul(&x88, &x88, &x44);
+    secp256k1_fe_sqr(&t, &t);
+    return secp256k1_fe_equal_var(&t, &x);
+}
 
-    x176 = x88;
-    for (j=0; j<88; j++) {
-        secp256k1_fe_sqr(&x176, &x176);
-    }
-    secp256k1_fe_mul(&x176, &x176, &x88);
+static int secp256k1_fe_par_rsqrt_inv_var(secp256k1_fe *rr, secp256k1_fe *ri, const secp256k1_fe *a, const secp256k1_fe *b) {
 
-    x220 = x176;
-    for (j=0; j<44; j++) {
-        secp256k1_fe_sqr(&x220, &x220);
-    }
-    secp256k1_fe_mul(&x220, &x220, &x44);
+    secp256k1_fe b2, ab2, ab4, sqrt, recip, t;
+    int ret;
 
-    x223 = x220;
-    for (j=0; j<3; j++) {
-        secp256k1_fe_sqr(&x223, &x223);
-    }
-    secp256k1_fe_mul(&x223, &x223, &x3);
+    CHECK(rr != ri);
 
-    /* The final result is then assembled using a sliding window over the blocks. */
+    /* Zero inputs could possibly be handled with conditional moves, if necessary */
+    CHECK(!secp256k1_fe_normalizes_to_zero(a) && !secp256k1_fe_normalizes_to_zero(b));
 
-    t1 = x223;
-    for (j=0; j<23; j++) {
-        secp256k1_fe_sqr(&t1, &t1);
-    }
-    secp256k1_fe_mul(&t1, &t1, &x22);
-    for (j=0; j<5; j++) {
-        secp256k1_fe_sqr(&t1, &t1);
-    }
-    secp256k1_fe_mul(&t1, &t1, a);
+    /* Calculate the reciprocal sqrt of a.b^4 */
+
+    secp256k1_fe_sqr(&b2, b);
+    secp256k1_fe_mul(&ab2, &b2, a);
+    secp256k1_fe_mul(&ab4, &ab2, &b2);
+
+    ret = secp256k1_fe_rsqrt_var(&sqrt, &recip, &ab4);
+
+    /* Inverse */
+    secp256k1_fe_sqr(&t, &recip);
+    secp256k1_fe_mul(&t, &t, &ab2);
+    secp256k1_fe_mul(&t, &t, b);
+    secp256k1_fe_sqr(&t, &t);
+    secp256k1_fe_mul(ri, b, &t);
+
+    /* Reciprocal */
+    secp256k1_fe_mul(rr, &recip, &b2);
+
+    return ret;
+}
+
+static void secp256k1_fe_inv(secp256k1_fe *r, const secp256k1_fe *a) {
+    secp256k1_fe t, x2;
+    int j;
+
+    secp256k1_fe_common_exp(&t, &x2, a);
+
+    secp256k1_fe_mul(&t, &t, a);
     for (j=0; j<3; j++) {
-        secp256k1_fe_sqr(&t1, &t1);
+        secp256k1_fe_sqr(&t, &t);
     }
-    secp256k1_fe_mul(&t1, &t1, &x2);
+    secp256k1_fe_mul(&t, &t, &x2);
     for (j=0; j<2; j++) {
-        secp256k1_fe_sqr(&t1, &t1);
+        secp256k1_fe_sqr(&t, &t);
     }
-    secp256k1_fe_mul(r, a, &t1);
+    secp256k1_fe_mul(r, a, &t);
 }
 
 static void secp256k1_fe_inv_var(secp256k1_fe *r, const secp256k1_fe *a) {
diff --git a/src/tests.c b/src/tests.c

Original file line number	Diff line number	Diff line change
`@@ -188,7 +188,7 @@ static void secp256k1_fe_normalize_var(secp256k1_fe *r) {`
`188`	`188`	`#endif`
`189`	`189`	`}`
`190`	`190`
`191`		`-static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {`
	`191`	`+static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r) {`
`192`	`192`	`uint32_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4],`
`193`	`193`	`t5 = r->n[5], t6 = r->n[6], t7 = r->n[7], t8 = r->n[8], t9 = r->n[9];`
`194`	`194`
`@@ -217,7 +217,7 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {`
`217`	`217`	`return (z0 == 0) \| (z1 == 0x3FFFFFFUL);`
`218`	`218`	`}`
`219`	`219`
`220`		`-static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r) {`
	`220`	`+static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r) {`
`221`	`221`	`uint32_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;`
`222`	`222`	`uint32_t z0, z1;`
`223`	`223`	`uint32_t x;`
Original file line number	Diff line number	Diff line change
`@@ -167,7 +167,7 @@ static void secp256k1_fe_normalize_var(secp256k1_fe *r) {`
`167`	`167`	`#endif`
`168`	`168`	`}`
`169`	`169`
`170`		`-static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {`
	`170`	`+static int secp256k1_fe_normalizes_to_zero(const secp256k1_fe *r) {`
`171`	`171`	`uint64_t t0 = r->n[0], t1 = r->n[1], t2 = r->n[2], t3 = r->n[3], t4 = r->n[4];`
`172`	`172`
`173`	`173`	`/* z0 tracks a possible raw value of 0, z1 tracks a possible raw value of P */`
`@@ -190,7 +190,7 @@ static int secp256k1_fe_normalizes_to_zero(secp256k1_fe *r) {`
`190`	`190`	`return (z0 == 0) \| (z1 == 0xFFFFFFFFFFFFFULL);`
`191`	`191`	`}`
`192`	`192`
`193`		`-static int secp256k1_fe_normalizes_to_zero_var(secp256k1_fe *r) {`
	`193`	`+static int secp256k1_fe_normalizes_to_zero_var(const secp256k1_fe *r) {`
`194`	`194`	`uint64_t t0, t1, t2, t3, t4;`
`195`	`195`	`uint64_t z0, z1;`
`196`	`196`	`uint64_t x;`