@@ -10,27 +10,25 @@ namespace cp_algo {
     using u64x4 = simd<uint64_t, 4>;
     using u32x8 = simd<uint32_t, 8>;
     using u32x4 = simd<uint32_t, 4>;
+    using dx4 = simd<double, 4>;
 
-    template<typename Simd>
-    Simd abs(Simd a) {
+    dx4 abs(dx4 a) {
 #ifdef __AVX2__
-        return _mm256_and_pd(a, Simd{} + 1/0.);
+        return _mm256_and_pd(a, dx4{} + 1/0.);
 #else
         return a < 0 ? -a : a;
 #endif
     }
 
-    template<typename Simd>
-    i64x4 lround(Simd a) {
+    i64x4 lround(dx4 a) {
 #ifdef __AVX2__
         return __builtin_convertvector(_mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC), i64x4);
 #else
         return __builtin_convertvector(a < 0 ? a - 0.5 : a + 0.5, i64x4);
 #endif
     }
 
-    template<typename Simd>
-    Simd round(Simd a) {
+    dx4 round(dx4 a) {
 #ifdef __AVX2__
         return _mm256_round_pd(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 #else
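Reviewer note (not part of the commit): the non-AVX2 fallbacks above lean on GCC/Clang vector extensions, where comparisons and the conditional operator act lane-wise. A minimal self-contained sketch, with the dx4/i64x4 aliases redeclared locally to stand in for the library's simd alias:

    #include <cstdint>
    #include <cstdio>

    // Local stand-ins for cp_algo's simd<double, 4> and simd<int64_t, 4>.
    using dx4 [[gnu::vector_size(32)]] = double;
    using i64x4 [[gnu::vector_size(32)]] = int64_t;

    int main() {
        dx4 a = {-1.5, 2.5, -0.25, 3.0};
        // Lane-wise, branchless select: each lane picks -a[i] or a[i].
        dx4 abs_a = a < 0 ? -a : a;
        // Fallback lround: bias by +-0.5, then truncate toward zero.
        i64x4 r = __builtin_convertvector(a < 0 ? a - 0.5 : a + 0.5, i64x4);
        for (int i = 0; i < 4; i++)
            std::printf("a=%g |a|=%g lround=%lld\n", a[i], abs_a[i], (long long)r[i]);
    }

One behavioral difference worth knowing: the fallback rounds halfway cases away from zero (2.5 -> 3), while the AVX2 path's _MM_FROUND_TO_NEAREST_INT rounds them to even (2.5 -> 2).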
@@ -55,6 +53,14 @@ namespace cp_algo {
         return montgomery_reduce(u64x4(_mm256_mul_epu32(__m256i(x), __m256i(y))), mod, imod);
 #else
         return montgomery_reduce(x * y, mod, imod);
+#endif
+    }
+
+    dx4 rotate_right(dx4 x) {
+#ifdef __AVX2__
+        return _mm256_permute4x64_pd(x, _MM_SHUFFLE(2, 1, 0, 3));
+#else
+        return __builtin_shufflevector(x, x, 3, 0, 1, 2);
 #endif
     }
 }
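Both branches of the new rotate_right compute the same permutation: destination lane i of _mm256_permute4x64_pd takes source lane (imm >> 2*i) & 3, so _MM_SHUFFLE(2, 1, 0, 3) selects lanes {3, 0, 1, 2}, a rotation right by one lane, matching the __builtin_shufflevector indices exactly. A quick hypothetical check (assumes GCC 12+/Clang for the builtin; not part of the commit):

    #include <immintrin.h>
    #include <cstdio>

    using dx4 [[gnu::vector_size(32)]] = double;

    int main() {
        dx4 x = {0, 1, 2, 3};
    #ifdef __AVX2__
        dx4 r = _mm256_permute4x64_pd(x, _MM_SHUFFLE(2, 1, 0, 3));
    #else
        dx4 r = __builtin_shufflevector(x, x, 3, 0, 1, 2);
    #endif
        std::printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]); // prints: 3 0 1 2
    }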
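For context on the second hunk: the AVX2 branch feeds montgomery_reduce with _mm256_mul_epu32, a 32x32 -> 64-bit widening multiply of each 64-bit lane's low half that maps to a single vpmuludq, whereas the generic x * y is a full 64x64 -> 64 multiply that compiles to several instructions without AVX-512. The two agree whenever both operands fit in 32 bits, presumably the invariant here (values reduced modulo a 32-bit Montgomery modulus). A hypothetical demo, mirroring the library's cast style:

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    using u64x4 [[gnu::vector_size(32)]] = uint64_t;

    int main() {
    #ifdef __AVX2__
        u64x4 x = {123456789, 5, 6, 7};
        u64x4 y = {987654321, 50, 60, 70};
        // One vpmuludq: low 32 bits of each lane, widened to 64-bit products.
        u64x4 wide = u64x4(_mm256_mul_epu32(__m256i(x), __m256i(y)));
        // Generic per-lane 64-bit multiply; equal here since inputs fit in 32 bits.
        u64x4 full = x * y;
        for (int i = 0; i < 4; i++)
            std::printf("%llu %llu\n", (unsigned long long)wide[i], (unsigned long long)full[i]);
    #endif
    }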