File tree Expand file tree Collapse file tree 3 files changed +15
-12
lines changed Expand file tree Collapse file tree 3 files changed +15
-12
lines changed Original file line number Diff line number Diff line change @@ -104,6 +104,15 @@ class ClockCounter
104
104
#endif
105
105
106
106
107
#if defined(__SSE2__)
// Lane-wise absolute value of four packed single-precision floats.
//
// Clears bit 31 (the IEEE-754 sign bit) of every 32-bit lane by AND-ing with
// 0x7FFFFFFF; all remaining bits pass through untouched, so -0.0f becomes
// +0.0f and the exponent/mantissa of each lane are preserved exactly.
//
//   x       input vector of four floats
//   return  vector whose lane i holds |x[i]|
inline __m128 _mm_abs_ps (const __m128 &x)
{
    // Built as a plain local instead of a function-local static: a static with
    // a non-constant initializer is guarded by a thread-safe "already
    // initialized?" check on every call, which is pure overhead for a helper
    // used inside SIMD inner loops. The mask is written as 0x7FFFFFFF directly
    // so no unsigned-to-int narrowing of ~0x80000000 is involved.
    const __m128 abs_mask = _mm_castsi128_ps (_mm_set1_epi32 (0x7FFFFFFF));
    return _mm_and_ps (x, abs_mask);
}
#endif
114
+
115
+
107
116
// //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
108
117
// Exception handle
109
118
Original file line number Diff line number Diff line change @@ -88,18 +88,15 @@ void BM3D_Basic_Process::CollaborativeFilter(int plane,
88
88
const ptrdiff_t simd_residue = srcGroup.size () % simd_step;
89
89
const ptrdiff_t simd_width = srcGroup.size () - simd_residue;
90
90
91
- static const __m128 zero_ps = _mm_setzero_ps ();
92
91
__m128i cmp_sum = _mm_setzero_si128 ();
93
92
94
93
for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, thrp += simd_step)
95
94
{
96
95
const __m128 s1 = _mm_load_ps (srcp);
97
- const __m128 t1p = _mm_load_ps (thrp);
98
- const __m128 t1n = _mm_sub_ps (zero_ps, t1p);
96
+ const __m128 t1 = _mm_load_ps (thrp);
99
97
100
- const __m128 cmp1 = _mm_cmpgt_ps (s1, t1p);
101
- const __m128 cmp2 = _mm_cmplt_ps (s1, t1n);
102
- const __m128 cmp = _mm_or_ps (cmp1, cmp2);
98
+ const __m128 s1abs = _mm_abs_ps (s1);
99
+ const __m128 cmp = _mm_cmpgt_ps (s1abs, t1);
103
100
104
101
const __m128 d1 = _mm_and_ps (cmp, s1);
105
102
_mm_store_ps (srcp, d1);
Original file line number Diff line number Diff line change @@ -88,18 +88,15 @@ void VBM3D_Basic_Process::CollaborativeFilter(int plane,
88
88
const ptrdiff_t simd_residue = srcGroup.size () % simd_step;
89
89
const ptrdiff_t simd_width = srcGroup.size () - simd_residue;
90
90
91
- static const __m128 zero_ps = _mm_setzero_ps ();
92
91
__m128i cmp_sum = _mm_setzero_si128 ();
93
92
94
93
for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, thrp += simd_step)
95
94
{
96
95
const __m128 s1 = _mm_load_ps (srcp);
97
- const __m128 t1p = _mm_load_ps (thrp);
98
- const __m128 t1n = _mm_sub_ps (zero_ps, t1p);
96
+ const __m128 t1 = _mm_load_ps (thrp);
99
97
100
- const __m128 cmp1 = _mm_cmpgt_ps (s1, t1p);
101
- const __m128 cmp2 = _mm_cmplt_ps (s1, t1n);
102
- const __m128 cmp = _mm_or_ps (cmp1, cmp2);
98
+ const __m128 s1abs = _mm_abs_ps (s1);
99
+ const __m128 cmp = _mm_cmpgt_ps (s1abs, t1);
103
100
104
101
const __m128 d1 = _mm_and_ps (cmp, s1);
105
102
_mm_store_ps (srcp, d1);
You can’t perform that action at this time.
0 commit comments