Commit 4372e28

Rewrite _scalar_reduce_512

1 parent be6944a
File tree: 1 file changed, +115 −60 lines changed

src/scalar_4x64_impl.h (115 additions, 60 deletions)
@@ -483,77 +483,132 @@ static void secp256k1_scalar_reduce_512(secp256k1_scalar *r, const uint64_t *l)
     : "=g"(c)
     : "g"(p0), "g"(p1), "g"(p2), "g"(p3), "g"(p4), "D"(r), "i"(SECP256K1_N_C_0), "i"(SECP256K1_N_C_1)
     : "rax", "rdx", "r8", "r9", "r10", "cc", "memory");
+
+    /* Final reduction of r. */
+    secp256k1_scalar_reduce(r, c + secp256k1_scalar_check_overflow(r));
 #else
-    uint128_t c;
-    uint64_t c0, c1, c2;
-    uint64_t n0 = l[4], n1 = l[5], n2 = l[6], n3 = l[7];
-    uint64_t m0, m1, m2, m3, m4, m5;
-    uint32_t m6;
-    uint64_t p0, p1, p2, p3;
-    uint32_t p4;
+    uint128_t c, u, v;
+    uint64_t n4 = l[4], n5 = l[5], n6 = l[6], n7 = l[7];
+    uint64_t m0, m1, m2, m3, m4, m5, m6;
+    uint64_t p0, p1, p2, p3, p4;
 
     /* Reduce 512 bits into 385. */
     /* m[0..6] = l[0..3] + n[0..3] * SECP256K1_N_C. */
-    c0 = l[0]; c1 = 0; c2 = 0;
-    muladd_fast(n0, SECP256K1_N_C_0);
-    extract_fast(m0);
-    sumadd_fast(l[1]);
-    muladd(n1, SECP256K1_N_C_0);
-    muladd(n0, SECP256K1_N_C_1);
-    extract(m1);
-    sumadd(l[2]);
-    muladd(n2, SECP256K1_N_C_0);
-    muladd(n1, SECP256K1_N_C_1);
-    sumadd(n0);
-    extract(m2);
-    sumadd(l[3]);
-    muladd(n3, SECP256K1_N_C_0);
-    muladd(n2, SECP256K1_N_C_1);
-    sumadd(n1);
-    extract(m3);
-    muladd(n3, SECP256K1_N_C_1);
-    sumadd(n2);
-    extract(m4);
-    sumadd_fast(n3);
-    extract_fast(m5);
-    VERIFY_CHECK(c0 <= 1);
-    m6 = c0;
+    c = (uint128_t)n4 * SECP256K1_N_C_0;
+    c += l[0];
+    m0 = (uint64_t)c; c >>= 64;
+
+    u = (uint128_t)n4 * SECP256K1_N_C_1;
+    u += l[1];
+    c += (uint128_t)n5 * SECP256K1_N_C_0;
+    c += (uint64_t)u; u >>= 64;
+    m1 = (uint64_t)c; c >>= 64;
+
+    c += n4;
+    u += (uint128_t)n5 * SECP256K1_N_C_1;
+    u += l[2];
+    v = (uint128_t)n6 * SECP256K1_N_C_0;
+    c += (uint64_t)u; u >>= 64;
+    c += (uint64_t)v; v >>= 64;
+    m2 = (uint64_t)c; c >>= 64;
+
+    c += n5;
+    u += (uint128_t)n6 * SECP256K1_N_C_1;
+    u += l[3];
+    v += (uint128_t)n7 * SECP256K1_N_C_0;
+    c += (uint64_t)u; u >>= 64;
+    c += (uint64_t)v; v >>= 64;
+    m3 = (uint64_t)c; c >>= 64;
+
+    c += n6;
+    u += (uint128_t)n7 * SECP256K1_N_C_1;
+    c += (uint64_t)u; u >>= 64;
+    c += (uint64_t)v;
+    m4 = (uint64_t)c; c >>= 64;
+
+    c += n7;
+    c += (uint64_t)u;
+    m5 = (uint64_t)c; c >>= 64;
+
+    /* The carry into m6 is 0 or 1; we negate it for use as a mask. */
+    m6 = -(uint64_t)c;
+    VERIFY_CHECK(-m6 <= 1);
 
     /* Reduce 385 bits into 258. */
     /* p[0..4] = m[0..3] + m[4..6] * SECP256K1_N_C. */
-    c0 = m0; c1 = 0; c2 = 0;
-    muladd_fast(m4, SECP256K1_N_C_0);
-    extract_fast(p0);
-    sumadd_fast(m1);
-    muladd(m5, SECP256K1_N_C_0);
-    muladd(m4, SECP256K1_N_C_1);
-    extract(p1);
-    sumadd(m2);
-    muladd(m6, SECP256K1_N_C_0);
-    muladd(m5, SECP256K1_N_C_1);
-    sumadd(m4);
-    extract(p2);
-    sumadd_fast(m3);
-    muladd_fast(m6, SECP256K1_N_C_1);
-    sumadd_fast(m5);
-    extract_fast(p3);
-    p4 = c0 + m6;
-    VERIFY_CHECK(p4 <= 2);
+    c = (uint128_t)m4 * SECP256K1_N_C_0;
+    c += m0;
+    p0 = (uint64_t)c; c >>= 64;
+
+    u = (uint128_t)m4 * SECP256K1_N_C_1;
+    u += m1;
+    c += (uint128_t)m5 * SECP256K1_N_C_0;
+    c += (uint64_t)u; u >>= 64;
+    p1 = (uint64_t)c; c >>= 64;
+
+    c += m4;
+    u += (uint128_t)m5 * SECP256K1_N_C_1;
+    u += m2;
+    c += (m6 & SECP256K1_N_C_0);
+    c += (uint64_t)u; u >>= 64;
+    p2 = (uint64_t)c; c >>= 64;
+
+    c += m5;
+    c += (m6 & SECP256K1_N_C_1);
+    c += m3;
+    c += (uint64_t)u;
+    p3 = (uint64_t)c; c >>= 64;
+
+    p4 = (uint64_t)c - m6;
+    VERIFY_CHECK(p4 <= 3);
+
+    /* Effectively add an extra SECP256K1_N_C during the next pass.
+     * Values that would have landed in the range [SECP256K1_N, 2^256)
+     * will instead "wrap" and carry back into p4. */
+    ++p4;
 
     /* Reduce 258 bits into 256. */
     /* r[0..3] = p[0..3] + p[4] * SECP256K1_N_C. */
-    c = p0 + (uint128_t)SECP256K1_N_C_0 * p4;
-    r->d[0] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
-    c += p1 + (uint128_t)SECP256K1_N_C_1 * p4;
-    r->d[1] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
-    c += p2 + (uint128_t)p4;
-    r->d[2] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
-    c += p3;
-    r->d[3] = c & 0xFFFFFFFFFFFFFFFFULL; c >>= 64;
+    c = (uint128_t)SECP256K1_N_C_0 * p4;
+    c += p0;
+    p0 = (uint64_t)c; c >>= 64;
+    c += (uint128_t)SECP256K1_N_C_1 * p4;
+    c += p1;
+    p1 = (uint64_t)c; c >>= 64;
+    c += p4;
+    c += p2;
+    p2 = (uint64_t)c; c >>= 64;
+    c += p3;
+    p3 = (uint64_t)c; c >>= 64;
+    VERIFY_CHECK((uint64_t)c <= 1);
+    p4 = (uint64_t)c;
+
+    /* Recover the extra SECP256K1_N_C from the previous pass.
+     * If p4 is 1, it becomes a 0 mask and the final pass is a no-op.
+     * If p4 is 0, the decrement creates a UINT64_MAX mask that enables the
+     * addition of SECP256K1_N in the final pass, which must produce a
+     * final carry, balancing the accounts. */
+    --p4;
+
+    c = p4 & SECP256K1_N_0;
+    c += p0;
+    p0 = (uint64_t)c; c >>= 64;
+    c += p4 & SECP256K1_N_1;
+    c += p1;
+    p1 = (uint64_t)c; c >>= 64;
+    c += p4 & SECP256K1_N_2;
+    c += p2;
+    p2 = (uint64_t)c; c >>= 64;
+    c += p4 & SECP256K1_N_3;
+    c += p3;
+    p3 = (uint64_t)c;
+    VERIFY_CHECK((uint64_t)(c >> 64) + p4 == 0);
+
+    r->d[0] = p0;
+    r->d[1] = p1;
+    r->d[2] = p2;
+    r->d[3] = p3;
 #endif
-
-    /* Final reduction of r. */
-    secp256k1_scalar_reduce(r, c + secp256k1_scalar_check_overflow(r));
 }
 
 static void secp256k1_scalar_mul_512(uint64_t l[8], const secp256k1_scalar *a, const secp256k1_scalar *b) {
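A note on the identity behind all three passes: SECP256K1_N_C is defined as 2^256 - SECP256K1_N, so 2^256 is congruent to SECP256K1_N_C modulo the group order. Folding the high limbs of a wide value into the low limbs by multiplying them by SECP256K1_N_C therefore preserves the value modulo N while shrinking its width (512 -> 385 -> 258 -> 256 bits above). The toy analogue below, with an 8-bit word standing in for 2^256 and a made-up modulus standing in for N, checks the identity exhaustively; it is an illustration only, not code from the commit.

    #include <stdint.h>
    #include <stdio.h>

    /* Toy analogue of the folding identity, with 8-bit "limbs": for any
     * modulus n < 2^8 and n_c = 2^8 - n, we have 2^8 == n_c (mod n), so a
     * 16-bit value hi*2^8 + lo is congruent to hi*n_c + lo (mod n). The
     * real code applies the same identity with 2^256 and SECP256K1_N_C. */
    int main(void) {
        const uint16_t n = 251;            /* toy modulus, not the real N */
        const uint16_t n_c = 256 - n;      /* toy analogue of SECP256K1_N_C */
        for (uint32_t x = 0; x < 65536; x++) {
            uint16_t hi = x >> 8, lo = x & 0xFF;
            uint32_t folded = (uint32_t)hi * n_c + lo;  /* one folding pass */
            if (folded % n != x % n) {                  /* identity must hold */
                printf("mismatch at %u\n", x);
                return 1;
            }
        }
        printf("folding identity verified for all 16-bit inputs\n");
        return 0;
    }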

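The rewrite also replaces the muladd/extract macro chains with plain uint128_t accumulators c, u and v, draining the low 64 bits of the secondary accumulators into the main one at each limb (`c += (uint64_t)u; u >>= 64;`) so that no single 128-bit sum can overflow. Here is a minimal sketch of the same pattern on a 2x2-limb multiply, assuming a compiler with unsigned __int128 as the non-asm path does; the names mul_2x2, a, b and r are made up for illustration.

    #include <stdint.h>
    #include <stdio.h>

    typedef unsigned __int128 uint128_t;

    /* Two-accumulator pattern: column sums of 64x64->128 products are split
     * across c and u so that no single uint128_t overflows, and u is drained
     * 64 bits at a time into c. */
    static void mul_2x2(uint64_t r[4], const uint64_t a[2], const uint64_t b[2]) {
        uint128_t c, u;

        c = (uint128_t)a[0] * b[0];
        r[0] = (uint64_t)c; c >>= 64;

        u = (uint128_t)a[0] * b[1];
        c += (uint128_t)a[1] * b[0];
        c += (uint64_t)u; u >>= 64;      /* drain low 64 bits of u into c */
        r[1] = (uint64_t)c; c >>= 64;

        c += (uint128_t)a[1] * b[1];
        c += (uint64_t)u;
        r[2] = (uint64_t)c; c >>= 64;

        r[3] = (uint64_t)c;
    }

    int main(void) {
        const uint64_t a[2] = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
        const uint64_t b[2] = { 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL };
        uint64_t r[4];
        mul_2x2(r, a, b);
        /* Expect (2^128 - 1)^2 = 2^256 - 2^129 + 1. */
        printf("%016llx %016llx %016llx %016llx\n",
               (unsigned long long)r[3], (unsigned long long)r[2],
               (unsigned long long)r[1], (unsigned long long)r[0]);
        return 0;
    }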
0 commit comments
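The `m6 = -(uint64_t)c` line converts a 0/1 carry into an all-zeros/all-ones mask, which later enables the branch-free conditional additions `c += (m6 & SECP256K1_N_C_0)` and `c += (m6 & SECP256K1_N_C_1)`. A standalone sketch of the idiom follows; the helper name add_if_carry and the constant k are made up for illustration.

    #include <stdint.h>
    #include <assert.h>

    /* Negation-as-mask: a carry bit of 0 or 1 becomes a mask of 0 or
     * UINT64_MAX, so "add k if the carry was set" needs no branch. The
     * constant k stands in for SECP256K1_N_C_0 / SECP256K1_N_C_1. */
    static uint64_t add_if_carry(uint64_t acc, uint64_t carry, uint64_t k) {
        uint64_t mask = -carry;        /* 0 -> 0, 1 -> 0xFFFF...FF */
        return acc + (mask & k);       /* branch-free conditional add */
    }

    int main(void) {
        assert(add_if_carry(10, 0, 7) == 10);
        assert(add_if_carry(10, 1, 7) == 17);
        return 0;
    }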
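Finally, the ++p4/--p4 pair implements a branch-free conditional subtraction of N: the increment adds one extra SECP256K1_N_C (equivalently, subtracts N within the 2^256 window), and the decrement turns the resulting carry into a mask that re-adds N exactly when the value was already below N, which must itself carry out and balance the accounts. The toy below replays this accounting with an 8-bit word in place of 2^256 and a made-up modulus; every input ends up fully reduced in one pass.

    #include <stdint.h>
    #include <stdio.h>

    /* Toy analogue of the ++p4 / --p4 trick: unconditionally add n_c
     * (= 2^8 - n), so values in [n, 2^8) wrap and carry, while values in
     * [0, n) do not and the masked re-addition of n restores them. */
    int main(void) {
        const uint16_t n = 251, n_c = 256 - n;
        for (uint16_t x = 0; x < 256; x++) {      /* x plays the role of p[0..3] */
            uint16_t t = x + n_c;                 /* "++p4" over-correction */
            uint8_t lo = (uint8_t)t;
            uint8_t carry = t >> 8;               /* 1 iff x >= n */
            uint8_t mask = carry - 1;             /* "--p4": 0 if carried, 0xFF if not */
            uint16_t r = (uint16_t)lo + (mask & n);
            if ((uint8_t)r != x % n) {            /* low byte must equal x mod n */
                printf("mismatch at %u\n", x);
                return 1;
            }
        }
        printf("over-add/mask-restore verified for all 8-bit inputs\n");
        return 0;
    }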