@@ -68,13 +68,43 @@ void VBM3D_Final_Process::CollaborativeFilter(int plane,
68
68
// Apply empirical Wiener filtering to the source group guided by the reference group
69
69
const FLType sigmaSquare = d.f [plane].wienerSigmaSqr [GroupSize - 1 ];
70
70
71
- Block_For_each (srcGroup, refGroup, [&](FLType &x, FLType y)
71
+ auto srcp = srcGroup.data ();
72
+ auto refp = refGroup.data ();
73
+ const auto upper = srcp + srcGroup.size ();
74
+
75
+ #if defined(__SSE2__)
76
+ static const ptrdiff_t simd_step = 4 ;
77
+ const ptrdiff_t simd_residue = srcGroup.size () % simd_step;
78
+ const ptrdiff_t simd_width = srcGroup.size () - simd_residue;
79
+
80
+ const __m128 sgm_sqr = _mm_set_ps1 (sigmaSquare);
81
+ __m128 l2wiener_sum = _mm_setzero_ps ();
82
+
83
+ for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, refp += simd_step)
84
+ {
85
+ const __m128 s1 = _mm_load_ps (srcp);
86
+ const __m128 r1 = _mm_load_ps (refp);
87
+ const __m128 r1sqr = _mm_mul_ps (r1, r1);
88
+
89
+ const __m128 wiener = _mm_mul_ps (r1sqr, _mm_rcp_ps (_mm_add_ps (r1sqr, sgm_sqr)));
90
+
91
+ const __m128 d1 = _mm_mul_ps (s1, wiener);
92
+ _mm_store_ps (srcp, d1);
93
+ l2wiener_sum = _mm_add_ps (l2wiener_sum, _mm_mul_ps (wiener, wiener));
94
+ }
95
+
96
+ alignas (16 ) FLType l2wiener_sum_f32[4 ];
97
+ _mm_store_ps (l2wiener_sum_f32, l2wiener_sum);
98
+ L2Wiener += l2wiener_sum_f32[0 ] + l2wiener_sum_f32[1 ] + l2wiener_sum_f32[2 ] + l2wiener_sum_f32[3 ];
99
+ #endif
100
+
101
+ for (; srcp < upper; ++srcp, ++refp)
72
102
{
73
- FLType ySquare = y * y ;
74
- FLType wienerCoef = ySquare / (ySquare + sigmaSquare);
75
- x *= wienerCoef;
103
+ const FLType refSquare = *refp * *refp ;
104
+ const FLType wienerCoef = refSquare / (refSquare + sigmaSquare);
105
+ *srcp *= wienerCoef;
76
106
L2Wiener += wienerCoef * wienerCoef;
77
- });
107
+ }
78
108
79
109
// Apply backward 3D transform to the filtered group
80
110
d.f [plane].bp [GroupSize - 1 ].execute_r2r (srcGroup.data (), srcGroup.data ());
0 commit comments