Skip to content

Commit f371201

Browse files
committed
Scalar 4x64 performance improvements
1 parent 1199492 commit f371201

File tree

1 file changed

+58
-23
lines changed

1 file changed

+58
-23
lines changed

src/scalar_4x64_impl.h

Lines changed: 58 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -853,30 +853,65 @@ static void secp256k1_scalar_sqr_512(uint64_t l[8], const secp256k1_scalar *a) {
853853
: "S"(l), "D"(a->d)
854854
: "rax", "rdx", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "cc", "memory");
855855
#else
856-
/* 160 bit accumulator. */
857-
uint64_t c0 = 0, c1 = 0;
858-
uint32_t c2 = 0;
859856

860-
/* l[0..7] = a[0..3] * b[0..3]. */
861-
muladd_fast(a->d[0], a->d[0]);
862-
extract_fast(l[0]);
863-
muladd2(a->d[0], a->d[1]);
864-
extract(l[1]);
865-
muladd2(a->d[0], a->d[2]);
866-
muladd(a->d[1], a->d[1]);
867-
extract(l[2]);
868-
muladd2(a->d[0], a->d[3]);
869-
muladd2(a->d[1], a->d[2]);
870-
extract(l[3]);
871-
muladd2(a->d[1], a->d[3]);
872-
muladd(a->d[2], a->d[2]);
873-
extract(l[4]);
874-
muladd2(a->d[2], a->d[3]);
875-
extract(l[5]);
876-
muladd_fast(a->d[3], a->d[3]);
877-
extract_fast(l[6]);
878-
VERIFY_CHECK(c1 == 0);
879-
l[7] = c0;
857+
const uint64_t *d = &a->d[0];
858+
uint64_t d0 = d[0], d1 = d[1], d2 = d[2], d3 = d[3];
859+
860+
uint128_t c, u, v;
861+
uint64_t w;
862+
863+
c = (uint128_t)d0 * d0;
864+
l[0] = (uint64_t)c; c >>= 64;
865+
866+
u = (uint128_t)d0 * d1;
867+
w = (uint64_t)u; u >>= 64;
868+
c += w;
869+
c += w;
870+
l[1] = (uint64_t)c; c >>= 64;
871+
872+
v = (uint128_t)d1 * d1;
873+
c += (uint64_t)v; v >>= 64;
874+
u += (uint128_t)d0 * d2;
875+
w = (uint64_t)u; u >>= 64;
876+
c += w;
877+
c += w;
878+
l[2] = (uint64_t)c; c >>= 64;
879+
880+
c += (uint64_t)v;
881+
v = (uint128_t)d1 * d2;
882+
u += (uint128_t)d0 * d3;
883+
u += (uint64_t)v; v >>= 64;
884+
w = (uint64_t)u; u >>= 64;
885+
c += w;
886+
c += w;
887+
l[3] = (uint64_t)c; c >>= 64;
888+
889+
u += (uint64_t)v;
890+
v = (uint128_t)d2 * d2;
891+
c += (uint64_t)v; v >>= 64;
892+
u += (uint128_t)d1 * d3;
893+
w = (uint64_t)u; u >>= 64;
894+
c += w;
895+
c += w;
896+
l[4] = (uint64_t)c; c >>= 64;
897+
898+
c += (uint64_t)v;
899+
u += (uint128_t)d2 * d3;
900+
w = (uint64_t)u; u >>= 64;
901+
c += w;
902+
c += w;
903+
l[5] = (uint64_t)c; c >>= 64;
904+
905+
v = (uint128_t)d3 * d3;
906+
c += (uint64_t)v; v >>= 64;
907+
w = (uint64_t)u;
908+
c += w;
909+
c += w;
910+
l[6] = (uint64_t)c; c >>= 64;
911+
912+
VERIFY_CHECK(((v + c) >> 64) == 0);
913+
l[7] = (uint64_t)v + (uint64_t)c;
914+
880915
#endif
881916
}
882917

0 commit comments

Comments
 (0)