Skip to content

Commit f9ec2f8

Browse files
committed
Merge pull request #16 from HolyWu/master
Added SSE2 instructions to the SIMD implementation of the hard-thresholding filter for the basic estimate
2 parents 4371c27 + b489aea commit f9ec2f8

File tree

2 files changed

+10
-2
lines changed

2 files changed

+10
-2
lines changed

source/BM3D_Basic.cpp

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -83,7 +83,7 @@ void BM3D_Basic_Process::CollaborativeFilter(int plane,
8383
auto thrp = d.f[plane].thrTable[GroupSize - 1].get();
8484
const auto upper = srcp + srcGroup.size();
8585

86-
#if defined(__SSE4_1__)
86+
#if defined(__SSE2__)
8787
static const ptrdiff_t simd_step = 4;
8888
const ptrdiff_t simd_residue = srcGroup.size() % simd_step;
8989
const ptrdiff_t simd_width = srcGroup.size() - simd_residue;
@@ -101,7 +101,11 @@ void BM3D_Basic_Process::CollaborativeFilter(int plane,
101101
const __m128 cmp2 = _mm_cmplt_ps(s1, t1n);
102102
const __m128 cmp = _mm_or_ps(cmp1, cmp2);
103103

104+
#if defined(__SSE4_1__)
104105
const __m128 d1 = _mm_blendv_ps(zero_ps, s1, cmp);
106+
#else
107+
const __m128 d1 = _mm_or_ps(_mm_and_ps(cmp, s1), _mm_andnot_ps(cmp, zero_ps));
108+
#endif
105109
_mm_store_ps(srcp, d1);
106110
cmp_sum = _mm_sub_epi32(cmp_sum, _mm_castps_si128(cmp));
107111
}

source/VBM3D_Basic.cpp

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -83,7 +83,7 @@ void VBM3D_Basic_Process::CollaborativeFilter(int plane,
8383
auto thrp = d.f[plane].thrTable[GroupSize - 1].get();
8484
const auto upper = srcp + srcGroup.size();
8585

86-
#if defined(__SSE4_1__)
86+
#if defined(__SSE2__)
8787
static const ptrdiff_t simd_step = 4;
8888
const ptrdiff_t simd_residue = srcGroup.size() % simd_step;
8989
const ptrdiff_t simd_width = srcGroup.size() - simd_residue;
@@ -101,7 +101,11 @@ void VBM3D_Basic_Process::CollaborativeFilter(int plane,
101101
const __m128 cmp2 = _mm_cmplt_ps(s1, t1n);
102102
const __m128 cmp = _mm_or_ps(cmp1, cmp2);
103103

104+
#if defined(__SSE4_1__)
104105
const __m128 d1 = _mm_blendv_ps(zero_ps, s1, cmp);
106+
#else
107+
const __m128 d1 = _mm_or_ps(_mm_and_ps(cmp, s1), _mm_andnot_ps(cmp, zero_ps));
108+
#endif
105109
_mm_store_ps(srcp, d1);
106110
cmp_sum = _mm_sub_epi32(cmp_sum, _mm_castps_si128(cmp));
107111
}

0 commit comments

Comments
 (0)