@@ -10,27 +10,25 @@ namespace cp_algo {
     using u64x4 = simd<uint64_t, 4>;
     using u32x8 = simd<uint32_t, 8>;
     using u32x4 = simd<uint32_t, 4>;
+    using dx4 = simd<double, 4>;
 
-    template<typename Simd>
-    Simd abs(Simd a) {
+    dx4 abs(dx4 a) {
 #ifdef __AVX2__
-        return _mm256_and_pd(a, Simd{} + 1/0.);
+        return _mm256_and_pd(a, dx4{} + 1/0.);
 #else
         return a < 0 ? -a : a;
 #endif
     }
 
-    template<typename Simd>
-    i64x4 lround(Simd a) {
+    i64x4 lround(dx4 a) {
 #ifdef __AVX2__
         return __builtin_convertvector(_mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), i64x4);
 #else
         return __builtin_convertvector(a < 0 ? a - 0.5 : a + 0.5, i64x4);
 #endif
     }
 
-    template<typename Simd>
-    Simd round(Simd a) {
+    dx4 round(dx4 a) {
 #ifdef __AVX2__
         return _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 #else
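Reviewer note (not part of the commit): the non-AVX2 fallbacks above lean on GCC/Clang vector extensions, where comparisons and the conditional operator act lane-wise. A minimal self-contained sketch, with the dx4/i64x4 aliases redeclared locally to stand in for the library's simd alias:

    #include <cstdint>
    #include <cstdio>

    // Local stand-ins for cp_algo's simd<double, 4> and simd<int64_t, 4>.
    using dx4 [[gnu::vector_size(32)]] = double;
    using i64x4 [[gnu::vector_size(32)]] = int64_t;

    int main() {
        dx4 a = {-1.5, 2.5, -0.25, 3.0};
        // Lane-wise, branchless select: each lane picks -a[i] or a[i].
        dx4 abs_a = a < 0 ? -a : a;
        // Fallback lround: bias by +-0.5, then truncate toward zero.
        i64x4 r = __builtin_convertvector(a < 0 ? a - 0.5 : a + 0.5, i64x4);
        for (int i = 0; i < 4; i++)
            std::printf("a=%g |a|=%g lround=%lld\n", a[i], abs_a[i], (long long)r[i]);
    }

One behavioral difference worth knowing: the fallback rounds halfway cases away from zero (2.5 -> 3), while the AVX2 path's _MM_FROUND_TO_NEAREST_INT rounds them to even (2.5 -> 2).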
@@ -55,6 +53,14 @@ namespace cp_algo {
         return montgomery_reduce(u64x4(_mm256_mul_epu32(__m256i(x), __m256i(y))), mod, imod);
 #else
         return montgomery_reduce(x * y, mod, imod);
+#endif
+    }
+
+    dx4 rotate_right(dx4 x) {
+#ifdef __AVX2__
+        return _mm256_permute4x64_pd(x, _MM_SHUFFLE(2, 1, 0, 3));
+#else
+        return __builtin_shufflevector(x, x, 3, 0, 1, 2);
 #endif
     }
 }
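Both branches of the new rotate_right compute the same permutation: destination lane i of _mm256_permute4x64_pd takes source lane (imm >> 2*i) & 3, so _MM_SHUFFLE(2, 1, 0, 3) selects lanes {3, 0, 1, 2}, a rotation right by one lane, matching the __builtin_shufflevector indices exactly. A quick hypothetical check (assumes GCC 12+/Clang for the builtin; not part of the commit):

    #include <immintrin.h>
    #include <cstdio>

    using dx4 [[gnu::vector_size(32)]] = double;

    int main() {
        dx4 x = {0, 1, 2, 3};
    #ifdef __AVX2__
        dx4 r = _mm256_permute4x64_pd(x, _MM_SHUFFLE(2, 1, 0, 3));
    #else
        dx4 r = __builtin_shufflevector(x, x, 3, 0, 1, 2);
    #endif
        std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // prints: 3 0 1 2
    }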
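For context on the second hunk: the AVX2 branch feeds montgomery_reduce with _mm256_mul_epu32, a 32x32 -> 64-bit widening multiply of each 64-bit lane's low half that maps to a single vpmuludq, whereas the generic x * y is a full 64x64 -> 64 multiply that compiles to several instructions without AVX-512. The two agree whenever both operands fit in 32 bits, presumably the invariant here (values reduced modulo a 32-bit Montgomery modulus). A hypothetical demo, mirroring the library's cast style:

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    using u64x4 [[gnu::vector_size(32)]] = uint64_t;

    int main() {
    #ifdef __AVX2__
        u64x4 x = {123456789, 5, 6, 7};
        u64x4 y = {987654321, 50, 60, 70};
        // One vpmuludq: low 32 bits of each lane, widened to 64-bit products.
        u64x4 wide = u64x4(_mm256_mul_epu32(__m256i(x), __m256i(y)));
        // Generic per-lane 64-bit multiply; equal here since inputs fit in 32 bits.
        u64x4 full = x * y;
        for (int i = 0; i < 4; i++)
            std::printf("%llu %llu\n", (unsigned long long)wide[i], (unsigned long long)full[i]);
    #endif
    }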