Skip to content

Commit 4371c27

Browse files
committed
Added SSE2 optimization for Wiener filtering in final estimate
1 parent 9b7f3ac commit 4371c27

File tree

2 files changed

+70
-10
lines changed

2 files changed

+70
-10
lines changed

source/BM3D_Final.cpp

Lines changed: 35 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -68,13 +68,43 @@ void BM3D_Final_Process::CollaborativeFilter(int plane,
6868
// Apply empirical Wiener filtering to the source group guided by the reference group
6969
const FLType sigmaSquare = d.f[plane].wienerSigmaSqr[GroupSize - 1];
7070

71-
Block_For_each(srcGroup, refGroup, [&](FLType &x, FLType y)
71+
auto srcp = srcGroup.data();
72+
auto refp = refGroup.data();
73+
const auto upper = srcp + srcGroup.size();
74+
75+
#if defined(__SSE2__)
76+
static const ptrdiff_t simd_step = 4;
77+
const ptrdiff_t simd_residue = srcGroup.size() % simd_step;
78+
const ptrdiff_t simd_width = srcGroup.size() - simd_residue;
79+
80+
const __m128 sgm_sqr = _mm_set_ps1(sigmaSquare);
81+
__m128 l2wiener_sum = _mm_setzero_ps();
82+
83+
for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, refp += simd_step)
84+
{
85+
const __m128 s1 = _mm_load_ps(srcp);
86+
const __m128 r1 = _mm_load_ps(refp);
87+
const __m128 r1sqr = _mm_mul_ps(r1, r1);
88+
89+
const __m128 wiener = _mm_mul_ps(r1sqr, _mm_rcp_ps(_mm_add_ps(r1sqr, sgm_sqr)));
90+
91+
const __m128 d1 = _mm_mul_ps(s1, wiener);
92+
_mm_store_ps(srcp, d1);
93+
l2wiener_sum = _mm_add_ps(l2wiener_sum, _mm_mul_ps(wiener, wiener));
94+
}
95+
96+
alignas(16) FLType l2wiener_sum_f32[4];
97+
_mm_store_ps(l2wiener_sum_f32, l2wiener_sum);
98+
L2Wiener += l2wiener_sum_f32[0] + l2wiener_sum_f32[1] + l2wiener_sum_f32[2] + l2wiener_sum_f32[3];
99+
#endif
100+
101+
for (; srcp < upper; ++srcp, ++refp)
72102
{
73-
FLType ySquare = y * y;
74-
FLType wienerCoef = ySquare / (ySquare + sigmaSquare);
75-
x *= wienerCoef;
103+
const FLType refSquare = *refp * *refp;
104+
const FLType wienerCoef = refSquare / (refSquare + sigmaSquare);
105+
*srcp *= wienerCoef;
76106
L2Wiener += wienerCoef * wienerCoef;
77-
});
107+
}
78108

79109
// Apply backward 3D transform to the filtered group
80110
d.f[plane].bp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data());

source/VBM3D_Final.cpp

Lines changed: 35 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -68,13 +68,43 @@ void VBM3D_Final_Process::CollaborativeFilter(int plane,
6868
// Apply empirical Wiener filtering to the source group guided by the reference group
6969
const FLType sigmaSquare = d.f[plane].wienerSigmaSqr[GroupSize - 1];
7070

71-
Block_For_each(srcGroup, refGroup, [&](FLType &x, FLType y)
71+
auto srcp = srcGroup.data();
72+
auto refp = refGroup.data();
73+
const auto upper = srcp + srcGroup.size();
74+
75+
#if defined(__SSE2__)
76+
static const ptrdiff_t simd_step = 4;
77+
const ptrdiff_t simd_residue = srcGroup.size() % simd_step;
78+
const ptrdiff_t simd_width = srcGroup.size() - simd_residue;
79+
80+
const __m128 sgm_sqr = _mm_set_ps1(sigmaSquare);
81+
__m128 l2wiener_sum = _mm_setzero_ps();
82+
83+
for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, refp += simd_step)
84+
{
85+
const __m128 s1 = _mm_load_ps(srcp);
86+
const __m128 r1 = _mm_load_ps(refp);
87+
const __m128 r1sqr = _mm_mul_ps(r1, r1);
88+
89+
const __m128 wiener = _mm_mul_ps(r1sqr, _mm_rcp_ps(_mm_add_ps(r1sqr, sgm_sqr)));
90+
91+
const __m128 d1 = _mm_mul_ps(s1, wiener);
92+
_mm_store_ps(srcp, d1);
93+
l2wiener_sum = _mm_add_ps(l2wiener_sum, _mm_mul_ps(wiener, wiener));
94+
}
95+
96+
alignas(16) FLType l2wiener_sum_f32[4];
97+
_mm_store_ps(l2wiener_sum_f32, l2wiener_sum);
98+
L2Wiener += l2wiener_sum_f32[0] + l2wiener_sum_f32[1] + l2wiener_sum_f32[2] + l2wiener_sum_f32[3];
99+
#endif
100+
101+
for (; srcp < upper; ++srcp, ++refp)
72102
{
73-
FLType ySquare = y * y;
74-
FLType wienerCoef = ySquare / (ySquare + sigmaSquare);
75-
x *= wienerCoef;
103+
const FLType refSquare = *refp * *refp;
104+
const FLType wienerCoef = refSquare / (refSquare + sigmaSquare);
105+
*srcp *= wienerCoef;
76106
L2Wiener += wienerCoef * wienerCoef;
77-
});
107+
}
78108

79109
// Apply backward 3D transform to the filtered group
80110
d.f[plane].bp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data());

0 commit comments

Comments
 (0)