From f423d2e14db3a2367c1c9e37f618bca7b4d077ef Mon Sep 17 00:00:00 2001 From: Elichai Turkel Date: Fri, 27 Sep 2019 13:37:25 -0400 Subject: [PATCH 1/2] Add memory fence to bench_internal --- src/bench.h | 9 +++++++++ src/bench_internal.c | 30 ++++++++++++++++++++++++++++-- 2 files changed, 37 insertions(+), 2 deletions(-) diff --git a/src/bench.h b/src/bench.h index 5b59783f68..dbab03ea97 100644 --- a/src/bench.h +++ b/src/bench.h @@ -12,6 +12,15 @@ #include #include "sys/time.h" + +/* A memory fence to prevent compiler optimizations + It tells the optimizer that it can do whatever it wants with *p so the optimizer can't optimize *p out. + The nice thing is that because the assembly is actually empty it doesn't add any instrcutions + *Notice: This is a best effort, nothing promise us it will always work.* */ +static void memory_fence(void *p) { + __asm__ __volatile__("": : "g"(p) : "memory"); +} + static double gettimedouble(void) { struct timeval tv; gettimeofday(&tv, NULL); diff --git a/src/bench_internal.c b/src/bench_internal.c index a8f4e9e12f..fe8f468dc0 100644 --- a/src/bench_internal.c +++ b/src/bench_internal.c @@ -27,6 +27,7 @@ typedef struct { int wnaf[256]; } bench_inv; + void bench_setup(void* arg) { bench_inv *data = (bench_inv*)arg; @@ -62,6 +63,7 @@ void bench_scalar_add(void* arg) { for (i = 0; i < 2000000; i++) { secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); + memory_fence(data); } } @@ -71,6 +73,7 @@ void bench_scalar_negate(void* arg) { for (i = 0; i < 2000000; i++) { secp256k1_scalar_negate(&data->scalar_x, &data->scalar_x); + memory_fence(data); } } @@ -80,6 +83,7 @@ void bench_scalar_sqr(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_scalar_sqr(&data->scalar_x, &data->scalar_x); + memory_fence(data); } } @@ -89,6 +93,7 @@ void bench_scalar_mul(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_scalar_mul(&data->scalar_x, &data->scalar_x, &data->scalar_y); + memory_fence(data); } } @@ -101,6 +106,7 @@ void bench_scalar_split(void* arg) { secp256k1_scalar l, r; secp256k1_scalar_split_lambda(&l, &r, &data->scalar_x); secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); + memory_fence(data); } } #endif @@ -112,6 +118,7 @@ void bench_scalar_inverse(void* arg) { for (i = 0; i < 2000; i++) { secp256k1_scalar_inverse(&data->scalar_x, &data->scalar_x); secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); + memory_fence(data); } } @@ -122,6 +129,7 @@ void bench_scalar_inverse_var(void* arg) { for (i = 0; i < 2000; i++) { secp256k1_scalar_inverse_var(&data->scalar_x, &data->scalar_x); secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); + memory_fence(data); } } @@ -131,6 +139,7 @@ void bench_field_normalize(void* arg) { for (i = 0; i < 2000000; i++) { secp256k1_fe_normalize(&data->fe_x); + memory_fence(data); } } @@ -140,6 +149,7 @@ void bench_field_normalize_weak(void* arg) { for (i = 0; i < 2000000; i++) { secp256k1_fe_normalize_weak(&data->fe_x); + memory_fence(data); } } @@ -149,6 +159,7 @@ void bench_field_mul(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_fe_mul(&data->fe_x, &data->fe_x, &data->fe_y); + memory_fence(data); } } @@ -158,6 +169,7 @@ void bench_field_sqr(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_fe_sqr(&data->fe_x, &data->fe_x); + memory_fence(data); } } @@ -168,6 +180,7 @@ void bench_field_inverse(void* arg) { for (i = 0; i < 20000; i++) { secp256k1_fe_inv(&data->fe_x, &data->fe_x); secp256k1_fe_add(&data->fe_x, &data->fe_y); + 
memory_fence(data); } } @@ -178,6 +191,7 @@ void bench_field_inverse_var(void* arg) { for (i = 0; i < 20000; i++) { secp256k1_fe_inv_var(&data->fe_x, &data->fe_x); secp256k1_fe_add(&data->fe_x, &data->fe_y); + memory_fence(data); } } @@ -190,6 +204,7 @@ void bench_field_sqrt(void* arg) { t = data->fe_x; secp256k1_fe_sqrt(&data->fe_x, &t); secp256k1_fe_add(&data->fe_x, &data->fe_y); + memory_fence(data); } } @@ -199,6 +214,7 @@ void bench_group_double_var(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_gej_double_var(&data->gej_x, &data->gej_x, NULL); + memory_fence(data); } } @@ -208,6 +224,7 @@ void bench_group_add_var(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_gej_add_var(&data->gej_x, &data->gej_x, &data->gej_y, NULL); + memory_fence(data); } } @@ -217,6 +234,7 @@ void bench_group_add_affine(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_gej_add_ge(&data->gej_x, &data->gej_x, &data->ge_y); + memory_fence(data); } } @@ -226,6 +244,7 @@ void bench_group_add_affine_var(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_gej_add_ge_var(&data->gej_x, &data->gej_x, &data->ge_y, NULL); + memory_fence(data); } } @@ -235,6 +254,7 @@ void bench_group_jacobi_var(void* arg) { for (i = 0; i < 20000; i++) { secp256k1_gej_has_quad_y_var(&data->gej_x); + memory_fence(data); } } @@ -245,6 +265,7 @@ void bench_ecmult_wnaf(void* arg) { for (i = 0; i < 20000; i++) { secp256k1_ecmult_wnaf(data->wnaf, 256, &data->scalar_x, WINDOW_A); secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); + memory_fence(data); } } @@ -255,6 +276,7 @@ void bench_wnaf_const(void* arg) { for (i = 0; i < 20000; i++) { secp256k1_wnaf_const(data->wnaf, &data->scalar_x, WINDOW_A, 256); secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); + memory_fence(data); } } @@ -268,6 +290,7 @@ void bench_sha256(void* arg) { secp256k1_sha256_initialize(&sha); secp256k1_sha256_write(&sha, data->data, 32); secp256k1_sha256_finalize(&sha, data->data); + memory_fence(data); } } @@ -280,6 +303,7 @@ void bench_hmac_sha256(void* arg) { secp256k1_hmac_sha256_initialize(&hmac, data->data, 32); secp256k1_hmac_sha256_write(&hmac, data->data, 32); secp256k1_hmac_sha256_finalize(&hmac, data->data); + memory_fence(data); } } @@ -291,6 +315,7 @@ void bench_rfc6979_hmac_sha256(void* arg) { for (i = 0; i < 20000; i++) { secp256k1_rfc6979_hmac_sha256_initialize(&rng, data->data, 64); secp256k1_rfc6979_hmac_sha256_generate(&rng, data->data, 32); + memory_fence(data); } } @@ -312,7 +337,7 @@ void bench_context_sign(void* arg) { #ifndef USE_NUM_NONE void bench_num_jacobi(void* arg) { - int i; + int i, j; bench_inv *data = (bench_inv*)arg; secp256k1_num nx, norder; @@ -321,7 +346,8 @@ void bench_num_jacobi(void* arg) { secp256k1_scalar_get_num(&norder, &data->scalar_y); for (i = 0; i < 200000; i++) { - secp256k1_num_jacobi(&nx, &norder); + j = secp256k1_num_jacobi(&nx, &norder); + memory_fence(&j); } } #endif From a5198c52ba9f3eac992fba8000aec8f155039575 Mon Sep 17 00:00:00 2001 From: Elichai Turkel Date: Tue, 22 Oct 2019 00:00:11 +0300 Subject: [PATCH 2/2] Remove memory fences where we can do better --- src/bench.h | 12 ++++- src/bench_internal.c | 109 +++++++++++++++++++++++-------------------- 2 files changed, 70 insertions(+), 51 deletions(-) diff --git a/src/bench.h b/src/bench.h index dbab03ea97..9ba7cf121a 100644 --- a/src/bench.h +++ b/src/bench.h @@ -13,11 +13,21 @@ #include "sys/time.h" +# if defined(__GNUC__) +# define ALWAYS_INLINE SECP256K1_INLINE __attribute__((__always_inline__)) +# elif 
defined(_MSC_VER) && !defined(__clang__) +# define ALWAYS_INLINE SECP256K1_INLINE __forceinline +# elif defined(__CLANG__) && __has_attribute(__always_inline__) +# define ALWAYS_INLINE SECP256K1_INLINE __attribute__((__always_inline__)) +# else +# define ALWAYS_INLINE SECP256K1_INLINE +# endif + /* A memory fence to prevent compiler optimizations It tells the optimizer that it can do whatever it wants with *p so the optimizer can't optimize *p out. The nice thing is that because the assembly is actually empty it doesn't add any instrcutions *Notice: This is a best effort, nothing promise us it will always work.* */ -static void memory_fence(void *p) { +ALWAYS_INLINE static void memory_fence(void *p) { __asm__ __volatile__("": : "g"(p) : "memory"); } diff --git a/src/bench_internal.c b/src/bench_internal.c index fe8f468dc0..163da9731b 100644 --- a/src/bench_internal.c +++ b/src/bench_internal.c @@ -27,7 +27,6 @@ typedef struct { int wnaf[256]; } bench_inv; - void bench_setup(void* arg) { bench_inv *data = (bench_inv*)arg; @@ -58,13 +57,13 @@ void bench_setup(void* arg) { } void bench_scalar_add(void* arg) { - int i; + int i, j = 0; bench_inv *data = (bench_inv*)arg; for (i = 0; i < 2000000; i++) { - secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); - memory_fence(data); + j += secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); } + CHECK(j <= 2000000); } void bench_scalar_negate(void* arg) { @@ -73,8 +72,8 @@ void bench_scalar_negate(void* arg) { for (i = 0; i < 2000000; i++) { secp256k1_scalar_negate(&data->scalar_x, &data->scalar_x); - memory_fence(data); } + memory_fence(data); } void bench_scalar_sqr(void* arg) { @@ -83,8 +82,8 @@ void bench_scalar_sqr(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_scalar_sqr(&data->scalar_x, &data->scalar_x); - memory_fence(data); } + memory_fence(data); } void bench_scalar_mul(void* arg) { @@ -93,44 +92,45 @@ void bench_scalar_mul(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_scalar_mul(&data->scalar_x, &data->scalar_x, &data->scalar_y); - memory_fence(data); } + memory_fence(data); } #ifdef USE_ENDOMORPHISM void bench_scalar_split(void* arg) { - int i; + int i, j = 0; bench_inv *data = (bench_inv*)arg; for (i = 0; i < 20000; i++) { - secp256k1_scalar l, r; - secp256k1_scalar_split_lambda(&l, &r, &data->scalar_x); - secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); - memory_fence(data); + secp256k1_scalar_split_lambda(&data->scalar_x, &data->scalar_y, &data->scalar_x); + j += secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); } + CHECK(j <= 2000); } #endif void bench_scalar_inverse(void* arg) { - int i; + int i, j = 0; bench_inv *data = (bench_inv*)arg; for (i = 0; i < 2000; i++) { secp256k1_scalar_inverse(&data->scalar_x, &data->scalar_x); - secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); - memory_fence(data); + j += secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); } + memory_fence(data); + CHECK(j <= 2000); } void bench_scalar_inverse_var(void* arg) { - int i; + int i, j = 0; bench_inv *data = (bench_inv*)arg; for (i = 0; i < 2000; i++) { secp256k1_scalar_inverse_var(&data->scalar_x, &data->scalar_x); - secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); - memory_fence(data); + j += secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); } + memory_fence(data); + CHECK(j <= 2000); } void bench_field_normalize(void* arg) { @@ -139,8 +139,8 @@ void 
bench_field_normalize(void* arg) { for (i = 0; i < 2000000; i++) { secp256k1_fe_normalize(&data->fe_x); - memory_fence(data); } + memory_fence(data); } void bench_field_normalize_weak(void* arg) { @@ -149,8 +149,8 @@ void bench_field_normalize_weak(void* arg) { for (i = 0; i < 2000000; i++) { secp256k1_fe_normalize_weak(&data->fe_x); - memory_fence(data); } + memory_fence(data); } void bench_field_mul(void* arg) { @@ -159,8 +159,8 @@ void bench_field_mul(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_fe_mul(&data->fe_x, &data->fe_x, &data->fe_y); - memory_fence(data); } + memory_fence(data); } void bench_field_sqr(void* arg) { @@ -169,8 +169,8 @@ void bench_field_sqr(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_fe_sqr(&data->fe_x, &data->fe_x); - memory_fence(data); } + memory_fence(data); } void bench_field_inverse(void* arg) { @@ -180,8 +180,8 @@ void bench_field_inverse(void* arg) { for (i = 0; i < 20000; i++) { secp256k1_fe_inv(&data->fe_x, &data->fe_x); secp256k1_fe_add(&data->fe_x, &data->fe_y); - memory_fence(data); } + memory_fence(data); } void bench_field_inverse_var(void* arg) { @@ -191,21 +191,22 @@ void bench_field_inverse_var(void* arg) { for (i = 0; i < 20000; i++) { secp256k1_fe_inv_var(&data->fe_x, &data->fe_x); secp256k1_fe_add(&data->fe_x, &data->fe_y); - memory_fence(data); } + memory_fence(data); } void bench_field_sqrt(void* arg) { - int i; + int i, j=0; bench_inv *data = (bench_inv*)arg; secp256k1_fe t; for (i = 0; i < 20000; i++) { t = data->fe_x; - secp256k1_fe_sqrt(&data->fe_x, &t); + j += secp256k1_fe_sqrt(&data->fe_x, &t); secp256k1_fe_add(&data->fe_x, &data->fe_y); - memory_fence(data); } + memory_fence(data); + CHECK(j <= 20000); } void bench_group_double_var(void* arg) { @@ -214,8 +215,8 @@ void bench_group_double_var(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_gej_double_var(&data->gej_x, &data->gej_x, NULL); - memory_fence(data); } + memory_fence(data); } void bench_group_add_var(void* arg) { @@ -224,8 +225,8 @@ void bench_group_add_var(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_gej_add_var(&data->gej_x, &data->gej_x, &data->gej_y, NULL); - memory_fence(data); } + memory_fence(data); } void bench_group_add_affine(void* arg) { @@ -234,8 +235,8 @@ void bench_group_add_affine(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_gej_add_ge(&data->gej_x, &data->gej_x, &data->ge_y); - memory_fence(data); } + memory_fence(data); } void bench_group_add_affine_var(void* arg) { @@ -244,40 +245,42 @@ void bench_group_add_affine_var(void* arg) { for (i = 0; i < 200000; i++) { secp256k1_gej_add_ge_var(&data->gej_x, &data->gej_x, &data->ge_y, NULL); - memory_fence(data); } + memory_fence(data); } void bench_group_jacobi_var(void* arg) { - int i; + int i, j = 0; bench_inv *data = (bench_inv*)arg; for (i = 0; i < 20000; i++) { - secp256k1_gej_has_quad_y_var(&data->gej_x); - memory_fence(data); + j += secp256k1_gej_has_quad_y_var(&data->gej_x); } + CHECK(j == 20000); } void bench_ecmult_wnaf(void* arg) { - int i; + int i, bits=0, overflow = 0; bench_inv *data = (bench_inv*)arg; for (i = 0; i < 20000; i++) { - secp256k1_ecmult_wnaf(data->wnaf, 256, &data->scalar_x, WINDOW_A); - secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); - memory_fence(data); + bits += secp256k1_ecmult_wnaf(data->wnaf, 256, &data->scalar_x, WINDOW_A); + overflow += secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); } + CHECK(overflow >= 0); + CHECK(bits <= 256*20000); } void bench_wnaf_const(void* arg) { - int i; + int 
i, bits = 0, overflow = 0; bench_inv *data = (bench_inv*)arg; for (i = 0; i < 20000; i++) { - secp256k1_wnaf_const(data->wnaf, &data->scalar_x, WINDOW_A, 256); - secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); - memory_fence(data); + bits += secp256k1_wnaf_const(data->wnaf, &data->scalar_x, WINDOW_A, 256); + overflow += secp256k1_scalar_add(&data->scalar_x, &data->scalar_x, &data->scalar_y); } + CHECK(overflow >= 0); + CHECK(bits <= 256*20000); } @@ -290,8 +293,8 @@ void bench_sha256(void* arg) { secp256k1_sha256_initialize(&sha); secp256k1_sha256_write(&sha, data->data, 32); secp256k1_sha256_finalize(&sha, data->data); - memory_fence(data); } + memory_fence(data); } void bench_hmac_sha256(void* arg) { @@ -303,8 +306,8 @@ void bench_hmac_sha256(void* arg) { secp256k1_hmac_sha256_initialize(&hmac, data->data, 32); secp256k1_hmac_sha256_write(&hmac, data->data, 32); secp256k1_hmac_sha256_finalize(&hmac, data->data); - memory_fence(data); } + memory_fence(data); } void bench_rfc6979_hmac_sha256(void* arg) { @@ -315,29 +318,35 @@ void bench_rfc6979_hmac_sha256(void* arg) { for (i = 0; i < 20000; i++) { secp256k1_rfc6979_hmac_sha256_initialize(&rng, data->data, 64); secp256k1_rfc6979_hmac_sha256_generate(&rng, data->data, 32); - memory_fence(data); } + memory_fence(data); } void bench_context_verify(void* arg) { int i; + secp256k1_context* ctx; (void)arg; for (i = 0; i < 20; i++) { - secp256k1_context_destroy(secp256k1_context_create(SECP256K1_CONTEXT_VERIFY)); + ctx = secp256k1_context_create(SECP256K1_CONTEXT_VERIFY); + memory_fence(ctx); + secp256k1_context_destroy(ctx); } } void bench_context_sign(void* arg) { int i; + secp256k1_context* ctx; (void)arg; for (i = 0; i < 200; i++) { - secp256k1_context_destroy(secp256k1_context_create(SECP256K1_CONTEXT_SIGN)); + ctx = secp256k1_context_create(SECP256K1_CONTEXT_SIGN); + memory_fence(ctx); + secp256k1_context_destroy(ctx); } } #ifndef USE_NUM_NONE void bench_num_jacobi(void* arg) { - int i, j; + int i, j = 0; bench_inv *data = (bench_inv*)arg; secp256k1_num nx, norder; @@ -346,9 +355,9 @@ void bench_num_jacobi(void* arg) { secp256k1_scalar_get_num(&norder, &data->scalar_y); for (i = 0; i < 200000; i++) { - j = secp256k1_num_jacobi(&nx, &norder); - memory_fence(&j); + j += secp256k1_num_jacobi(&nx, &norder); } + CHECK(j <= 200000); } #endif
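
A minimal stand-alone sketch (not part of the patches) of the technique the first commit relies on: an empty asm statement that takes the benchmark state as an operand and declares a "memory" clobber, so the optimizer must assume *p is read and written there and cannot delete the loop body as dead code. The helper name demo_fence and the toy workload are illustrative; it assumes a GCC- or Clang-compatible compiler.

#include <stdint.h>

/* Same idea as memory_fence() in the patch: the asm body is empty, so no
 * instructions are emitted, but the "memory" clobber plus the "g"(p) operand
 * make the compiler treat *p as read and written at this point. */
static inline void demo_fence(void *p) {
    __asm__ __volatile__("" : : "g"(p) : "memory");
}

int main(void) {
    uint64_t acc = 12345;
    int i;
    for (i = 0; i < 2000000; i++) {
        /* Toy stand-in for a benchmarked primitive. Nothing else reads acc,
         * so without the fence an optimizing compiler is free to drop the
         * whole loop. */
        acc = acc * 6364136223846793005ULL + 1442695040888963407ULL;
        demo_fence(&acc);
    }
    return 0;
}

The second commit additionally marks the fence ALWAYS_INLINE (always_inline on GCC/Clang, __forceinline on MSVC) so the helper never appears as an out-of-line call inside the measured loop.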
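
Where every iteration feeds its result into the next one (for example secp256k1_fe_sqr(&data->fe_x, &data->fe_x)), the second commit moves the fence out of the loop: each iteration's input is the previous iteration's output, so once the final value is observed by a single fence after the loop, none of the intermediate computations can be discarded, and the loop body stays free of barriers. A hedged sketch of that shape, with a toy squaring chain standing in for the field and group operations:

#include <stdint.h>

static inline void demo_fence(void *p) {
    __asm__ __volatile__("" : : "g"(p) : "memory");
}

/* Each iteration reads the value the previous one wrote, so keeping the final
 * result alive with one fence after the loop keeps every intermediate step
 * alive as well; only the operation being measured remains in the loop. */
static void bench_square_chain(uint64_t *x) {
    int i;
    for (i = 0; i < 200000; i++) {
        *x = (*x) * (*x) + 1;
    }
    demo_fence(x);
}

int main(void) {
    uint64_t state = 3;
    bench_square_chain(&state);
    return 0;
}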
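
Where the benchmarked call returns a value (the overflow flag of secp256k1_scalar_add, secp256k1_fe_sqrt, secp256k1_gej_has_quad_y_var, secp256k1_num_jacobi, the wnaf routines), the second commit drops the per-iteration fence and instead sums the return values and CHECKs a bound that always holds, giving the calls an observable use without adding anything to the loop. A self-contained sketch of the pattern; toy_op and the local CHECK definition are stand-ins modelled on the library's macro:

#include <stdio.h>
#include <stdlib.h>

/* Modelled on the CHECK macro used in libsecp256k1's tests and benchmarks. */
#define CHECK(cond) do { \
    if (!(cond)) { \
        fprintf(stderr, "check failed: %s\n", #cond); \
        abort(); \
    } \
} while (0)

/* Stand-in for a call such as secp256k1_scalar_add, whose return value
 * (an overflow flag) is 0 or 1 on a 32-bit unsigned int. */
static int toy_op(unsigned int *x) {
    *x = *x * 2654435761u + 1u;
    return (int)(*x >> 31);
}

int main(void) {
    unsigned int state = 1;
    int i, j = 0;
    for (i = 0; i < 2000000; i++) {
        j += toy_op(&state); /* the sum is checked below, so the call is not dead */
    }
    /* Always true (each term is 0 or 1), but it gives j a real use. Like the
     * fence, this is best effort rather than a guarantee. */
    CHECK(j <= 2000000);
    return 0;
}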