Commit 6ee23c9

Merge pull request opencv#19486 from fpetrogalli:dotprod_fast-3.4
* [hal][neon] Optimize the v_dotprod_fast intrinsics for aarch64.

  On Armv8 in AArch64 execution mode, we can skip the sequence
  v<op>_<ty>(vget_high_<ty>(x), vget_high_<ty>(y)) in favour of
  v<op>_high_<ty>(x, y). This gives recent compilers a better chance
  to use fewer data-movement operations and to allocate registers
  more effectively. See for example: https://godbolt.org/z/bPq7vd

* [hal][neon] Fix build failure on armv7.

* [hal][neon] Address review comments in PR.

  PR: opencv#19486

* [hal][neon] Define macro to check for the AArch64 execution state of Armv8.

* [hal][neon] Fix macro definition for AArch64.

  The fix is needed to prevent warnings when building for Armv7.
1 parent cac5b0f commit 6ee23c9
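
To make the rewrite concrete, here is a minimal standalone sketch (not part of the commit; the helper names dot_pairwise_generic and dot_pairwise_aarch64 are hypothetical) contrasting the two instruction sequences for the 16-bit case:

// Minimal sketch, assuming an Armv8 target with NEON; helper names are
// hypothetical and only illustrate the rewrite described above.
#include <arm_neon.h>

// Portable form: extract the high halves, then widen-multiply-accumulate.
int32x4_t dot_pairwise_generic(int16x8_t x, int16x8_t y)
{
    int32x4_t p = vmull_s16(vget_low_s16(x), vget_low_s16(y));
    return vmlal_s16(p, vget_high_s16(x), vget_high_s16(y));
}

#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
// AArch64 form: vmlal_high_s16 reads the upper halves of x and y directly,
// removing the explicit vget_high_s16 data movement.
int32x4_t dot_pairwise_aarch64(int16x8_t x, int16x8_t y)
{
    int32x4_t p = vmull_s16(vget_low_s16(x), vget_low_s16(y));
    return vmlal_high_s16(p, x, y);
}
#endif

On AArch64, the *_high_* intrinsics operate on the upper 64 bits of the Q registers in place, which is what gives the compiler the better codegen opportunity described in the commit message.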

File tree

1 file changed: +37 -1 lines

modules/core/include/opencv2/core/hal/intrin_neon.hpp

Lines changed: 37 additions & 1 deletion
@@ -62,6 +62,22 @@ CV_CPU_OPTIMIZATION_HAL_NAMESPACE_BEGIN
 #define CV_SIMD128_64F 0
 #endif

+// The following macro checks if the code is being compiled for the
+// AArch64 execution state of Armv8, to enable the 128-bit
+// intrinsics. The macro `__ARM_64BIT_STATE` is the one recommended by
+// the Arm C Language Extension (ACLE) specifications [1] to check the
+// availability of 128-bit intrinsics, and it is supported by clang
+// and gcc. The macro `_M_ARM64` is the equivalent one for Microsoft
+// Visual Studio [2].
+//
+// [1] https://developer.arm.com/documentation/101028/0012/13--Advanced-SIMD--Neon--intrinsics
+// [2] https://docs.microsoft.com/en-us/cpp/preprocessor/predefined-macros
+#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
+#define CV_NEON_AARCH64 1
+#else
+#define CV_NEON_AARCH64 0
+#endif
+
 // TODO
 #define CV_NEON_DOT 0

@@ -726,41 +742,61 @@ inline v_float64x2 v_dotprod_expand(const v_int32x4& a, const v_int32x4& b,
 // 16 >> 32
 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b)
 {
+#if CV_NEON_AARCH64
+    int32x4_t p = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
+    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
+#else
     int16x4_t a0 = vget_low_s16(a.val);
     int16x4_t a1 = vget_high_s16(a.val);
     int16x4_t b0 = vget_low_s16(b.val);
     int16x4_t b1 = vget_high_s16(b.val);
     int32x4_t p = vmull_s16(a0, b0);
     return v_int32x4(vmlal_s16(p, a1, b1));
+#endif
 }
 inline v_int32x4 v_dotprod_fast(const v_int16x8& a, const v_int16x8& b, const v_int32x4& c)
 {
+#if CV_NEON_AARCH64
+    int32x4_t p = vmlal_s16(c.val, vget_low_s16(a.val), vget_low_s16(b.val));
+    return v_int32x4(vmlal_high_s16(p, a.val, b.val));
+#else
     int16x4_t a0 = vget_low_s16(a.val);
     int16x4_t a1 = vget_high_s16(a.val);
     int16x4_t b0 = vget_low_s16(b.val);
     int16x4_t b1 = vget_high_s16(b.val);
     int32x4_t p = vmlal_s16(c.val, a0, b0);
     return v_int32x4(vmlal_s16(p, a1, b1));
+#endif
 }

 // 32 >> 64
 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b)
 {
+#if CV_NEON_AARCH64
+    int64x2_t p = vmull_s32(vget_low_s32(a.val), vget_low_s32(b.val));
+    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
+#else
     int32x2_t a0 = vget_low_s32(a.val);
     int32x2_t a1 = vget_high_s32(a.val);
     int32x2_t b0 = vget_low_s32(b.val);
     int32x2_t b1 = vget_high_s32(b.val);
     int64x2_t p = vmull_s32(a0, b0);
     return v_int64x2(vmlal_s32(p, a1, b1));
+#endif
 }
 inline v_int64x2 v_dotprod_fast(const v_int32x4& a, const v_int32x4& b, const v_int64x2& c)
 {
+#if CV_NEON_AARCH64
+    int64x2_t p = vmlal_s32(c.val, vget_low_s32(a.val), vget_low_s32(b.val));
+    return v_int64x2(vmlal_high_s32(p, a.val, b.val));
+#else
     int32x2_t a0 = vget_low_s32(a.val);
     int32x2_t a1 = vget_high_s32(a.val);
     int32x2_t b0 = vget_low_s32(b.val);
     int32x2_t b1 = vget_high_s32(b.val);
     int64x2_t p = vmlal_s32(c.val, a0, b0);
     return v_int64x2(vmlal_s32(p, a1, b1));
+#endif
 }

 // 8 >> 32
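
As a quick sanity check of the rewritten 32 >> 64 path, a small standalone program (hypothetical, not part of the commit) can compare the *_high_* form against the vget_high form on an AArch64 machine; both must produce identical lanes:

// Standalone sanity check (hypothetical, ARM targets only): both forms of
// the 32 >> 64 dot product must produce identical results.
#include <arm_neon.h>
#include <cassert>
#include <cstdio>

int main()
{
    const int32_t av[4] = {1, -2, 3, 40000};
    const int32_t bv[4] = {5, 6, -7, 80000};
    int32x4_t a = vld1q_s32(av), b = vld1q_s32(bv);

    // Split form: extract the high halves, then widen-multiply-accumulate.
    int64x2_t p0 = vmull_s32(vget_low_s32(a), vget_low_s32(b));
    int64x2_t r0 = vmlal_s32(p0, vget_high_s32(a), vget_high_s32(b));

#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
    // *_high_* form used by the patch: no explicit half extraction.
    int64x2_t p1 = vmull_s32(vget_low_s32(a), vget_low_s32(b));
    int64x2_t r1 = vmlal_high_s32(p1, a, b);
    assert(vgetq_lane_s64(r0, 0) == vgetq_lane_s64(r1, 0));
    assert(vgetq_lane_s64(r0, 1) == vgetq_lane_s64(r1, 1));
#endif
    printf("lanes: %lld %lld\n",
           (long long)vgetq_lane_s64(r0, 0), (long long)vgetq_lane_s64(r0, 1));
    return 0;
}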
@@ -1292,7 +1328,7 @@ inline int64 v_reduce_sum(const v_int64x2& a)
 #if CV_SIMD128_64F
 inline double v_reduce_sum(const v_float64x2& a)
 {
-    return vgetq_lane_f64(a.val, 0) + vgetq_lane_f64(a.val, 1);
+    return vaddvq_f64(a.val);
 }
 #endif
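
The v_reduce_sum change replaces two lane extractions plus a scalar add with vaddvq_f64, AArch64's across-vector add. A minimal equivalence sketch (hypothetical, AArch64 only, since float64x2_t does not exist on Armv7 NEON):

// Hypothetical check: vaddvq_f64 sums both double lanes in one step and
// matches the previous lane-extract-and-add form.
#include <arm_neon.h>
#include <cassert>

#if defined(__ARM_64BIT_STATE) || defined(_M_ARM64)
double reduce_old(float64x2_t v) { return vgetq_lane_f64(v, 0) + vgetq_lane_f64(v, 1); }
double reduce_new(float64x2_t v) { return vaddvq_f64(v); }

int main()
{
    const double d[2] = {1.5, -0.25};
    float64x2_t v = vld1q_f64(d);
    assert(reduce_old(v) == reduce_new(v));
    return 0;
}
#else
int main() { return 0; } // no float64x2_t outside AArch64
#endif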
