Skip to content

Commit 32e906d

Browse files
committed
magic for lround
1 parent f9e81b5 commit 32e906d

File tree

1 file changed

+9
-11
lines changed

1 file changed

+9
-11
lines changed

cp-algo/util/simd.hpp

+9-11
Original file line numberDiff line numberDiff line change
@@ -20,31 +20,29 @@ namespace cp_algo {
2020
#endif
2121
}
2222

23-
i64x4 lround(dx4 a) {
24-
#ifdef __AVX2__
25-
return __builtin_convertvector(_mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), i64x4);
26-
#else
27-
return __builtin_convertvector(a < 0 ? a - 0.5 : a + 0.5, i64x4);
28-
#endif
23+
i64x4 lround(dx4 x) {
24+
// https://stackoverflow.com/a/77376595
25+
static constexpr dx4 magic = dx4() + double(3ULL << 51);
26+
return i64x4(x + magic) - i64x4(magic);
2927
}
3028

3129
dx4 round(dx4 a) {
3230
#ifdef __AVX2__
3331
return _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
3432
#else
35-
return __builtin_convertvector(lround(a), Simd);
33+
return __builtin_convertvector(lround(a), dx4);
3634
#endif
3735
}
3836

3937
u64x4 montgomery_reduce(u64x4 x, u64x4 mod, u64x4 imod) {
40-
#ifdef __AVX2__
38+
#ifndef __AVX2__
4139
auto x_ninv = _mm256_mul_epu32(__m256i(x), __m256i(imod));
4240
auto x_res = _mm256_add_epi64(__m256i(x), _mm256_mul_epu32(x_ninv, __m256i(mod)));
4341
return u64x4(_mm256_bsrli_epi128(x_res, 4));
4442
#else
45-
auto x_ninv = x * imod;
46-
auto x_res = x + ((x_ninv << 32) >> 32) * mod;
47-
return u64x4(x_res >> 32);
43+
44+
auto x_ninv = u64x4(u32x8(x) * u32x8(imod));
45+
return (x + x_ninv * mod) >> 32;
4846
#endif
4947
}
5048

0 commit comments

Comments
 (0)