Skip to content

Commit 1263139

Browse files
committed
Optimized the SSE2 path for the hard-thresholding filter
Thanks for the idea from MonoS: MonoS@421fa78
1 parent 1fc6137 commit 1263139

File tree

3 files changed

+15
-12
lines changed

3 files changed

+15
-12
lines changed

include/Helper.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,15 @@ class ClockCounter
104104
#endif
105105

106106

107+
#if defined(__SSE2__)
108+
/// Lane-wise absolute value of four packed single-precision floats.
///
/// Clears the sign bit (bit 31) of each 32-bit lane by AND-ing with
/// 0x7FFFFFFF, leaving the exponent and mantissa bits untouched. As a
/// consequence -0.0f becomes +0.0f and NaN payloads are preserved.
///
/// @param x  Four packed floats.
/// @return   `x` with the sign bit of every lane cleared.
inline __m128 _mm_abs_ps(const __m128 &x)
{
    // Deliberately a plain local, not a function-local `static`: the
    // initializer is not a constant expression, so a static could carry a
    // thread-safe-init guard that is checked on every call. A local
    // broadcast constant is folded by the compiler instead.
    const __m128 sign_clear_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7FFFFFFF));
    return _mm_and_ps(x, sign_clear_mask);
}
113+
#endif
114+
115+
107116
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
108117
// Exception handle
109118

source/BM3D_Basic.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,18 +88,15 @@ void BM3D_Basic_Process::CollaborativeFilter(int plane,
8888
const ptrdiff_t simd_residue = srcGroup.size() % simd_step;
8989
const ptrdiff_t simd_width = srcGroup.size() - simd_residue;
9090

91-
static const __m128 zero_ps = _mm_setzero_ps();
9291
__m128i cmp_sum = _mm_setzero_si128();
9392

9493
for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, thrp += simd_step)
9594
{
9695
const __m128 s1 = _mm_load_ps(srcp);
97-
const __m128 t1p = _mm_load_ps(thrp);
98-
const __m128 t1n = _mm_sub_ps(zero_ps, t1p);
96+
const __m128 t1 = _mm_load_ps(thrp);
9997

100-
const __m128 cmp1 = _mm_cmpgt_ps(s1, t1p);
101-
const __m128 cmp2 = _mm_cmplt_ps(s1, t1n);
102-
const __m128 cmp = _mm_or_ps(cmp1, cmp2);
98+
const __m128 s1abs = _mm_abs_ps(s1);
99+
const __m128 cmp = _mm_cmpgt_ps(s1abs, t1);
103100

104101
const __m128 d1 = _mm_and_ps(cmp, s1);
105102
_mm_store_ps(srcp, d1);

source/VBM3D_Basic.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,18 +88,15 @@ void VBM3D_Basic_Process::CollaborativeFilter(int plane,
8888
const ptrdiff_t simd_residue = srcGroup.size() % simd_step;
8989
const ptrdiff_t simd_width = srcGroup.size() - simd_residue;
9090

91-
static const __m128 zero_ps = _mm_setzero_ps();
9291
__m128i cmp_sum = _mm_setzero_si128();
9392

9493
for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, thrp += simd_step)
9594
{
9695
const __m128 s1 = _mm_load_ps(srcp);
97-
const __m128 t1p = _mm_load_ps(thrp);
98-
const __m128 t1n = _mm_sub_ps(zero_ps, t1p);
96+
const __m128 t1 = _mm_load_ps(thrp);
9997

100-
const __m128 cmp1 = _mm_cmpgt_ps(s1, t1p);
101-
const __m128 cmp2 = _mm_cmplt_ps(s1, t1n);
102-
const __m128 cmp = _mm_or_ps(cmp1, cmp2);
98+
const __m128 s1abs = _mm_abs_ps(s1);
99+
const __m128 cmp = _mm_cmpgt_ps(s1abs, t1);
103100

104101
const __m128 d1 = _mm_and_ps(cmp, s1);
105102
_mm_store_ps(srcp, d1);

0 commit comments

Comments
 (0)