Commit fb3b297

Authored by Anna Khakimova
Merge pull request opencv#18466 from anna-khakimova:ak/simd_addw_bitwise
GAPI: SIMD optimization for AddWeighted kernel.

* Add, sub, absdiff kernels optimization
* AddW kernel
* And, or kernels
* AddWeighted refactoring and SIMD opt for AbsDiffC kernel
* Remove SIMD opt of AbsDiffC kernel
* Refactoring
* Applied comments
* Refactoring. Step 2
* Applied comments. Step 2
1 parent 050327a commit fb3b297
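
For reference, the per-element operation this commit vectorizes is the usual weighted sum with saturation, out[i] = saturate(in1[i]*alpha + in2[i]*beta + gamma). Below is a minimal scalar sketch of that reference behaviour; it is not code from this commit, the helper name is illustrative, and it assumes cv::saturate_cast's usual rounding of float inputs.

#include <opencv2/core/saturate.hpp>

// Scalar reference for addWeighted over one row: weighted sum of the two inputs,
// then saturation to the destination type (uchar/short/ushort/float, ...).
template<typename DST, typename SRC>
static void addweighted_ref(const SRC* in1, const SRC* in2, DST* out, int length,
                            float alpha, float beta, float gamma)
{
    for (int i = 0; i < length; ++i)
        out[i] = cv::saturate_cast<DST>(in1[i] * alpha + in2[i] * beta + gamma);
}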

File tree

1 file changed: +131 -2 lines changed

modules/gapi/src/backends/fluid/gfluidcore.cpp

Lines changed: 131 additions & 2 deletions
@@ -97,6 +97,130 @@ static inline DST divr(SRC1 x, SRC2 y, float scale=1)
 // Fluid kernels: addWeighted
 //
 //---------------------------
+#if CV_SSE2
+CV_ALWAYS_INLINE v_float32 v_load_f32(const ushort* in)
+{
+    return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(in)));
+}
+
+CV_ALWAYS_INLINE v_float32 v_load_f32(const short* in)
+{
+    return v_cvt_f32(vx_load_expand(in));
+}
+
+CV_ALWAYS_INLINE v_float32 v_load_f32(const uchar* in)
+{
+    return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(in)));
+}
+
+CV_ALWAYS_INLINE void addw_short_store(short* out, const v_int32& c1, const v_int32& c2)
+{
+    vx_store(out, v_pack(c1, c2));
+}
+
+CV_ALWAYS_INLINE void addw_short_store(ushort* out, const v_int32& c1, const v_int32& c2)
+{
+    vx_store(out, v_pack_u(c1, c2));
+}
+
+template<typename SRC, typename DST>
+CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], DST out[],
+                               const float _alpha, const float _beta,
+                               const float _gamma, int length)
+{
+    static_assert(((std::is_same<SRC, ushort>::value) && (std::is_same<DST, ushort>::value)) ||
+                  ((std::is_same<SRC, short>::value) && (std::is_same<DST, short>::value)),
+                  "This templated overload is only for short and ushort type combinations.");
+
+    constexpr int nlanes = (std::is_same<DST, ushort>::value) ? static_cast<int>(v_uint16::nlanes) :
+                                                                static_cast<int>(v_int16::nlanes);
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 alpha = vx_setall_f32(_alpha);
+    v_float32 beta = vx_setall_f32(_beta);
+    v_float32 gamma = vx_setall_f32(_gamma);
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = v_load_f32(&in1[x]);
+            v_float32 a2 = v_load_f32(&in1[x + nlanes / 2]);
+            v_float32 b1 = v_load_f32(&in2[x]);
+            v_float32 b2 = v_load_f32(&in2[x + nlanes / 2]);
+
+            addw_short_store(&out[x], v_round(v_fma(a1, alpha, v_fma(b1, beta, gamma))),
+                                      v_round(v_fma(a2, alpha, v_fma(b2, beta, gamma))));
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process one more time (unaligned tail)
+        }
+        break;
+    }
+    return x;
+}
+
+template<typename SRC>
+CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], uchar out[],
+                               const float _alpha, const float _beta,
+                               const float _gamma, int length)
+{
+    constexpr int nlanes = v_uint8::nlanes;
+
+    if (length < nlanes)
+        return 0;
+
+    v_float32 alpha = vx_setall_f32(_alpha);
+    v_float32 beta = vx_setall_f32(_beta);
+    v_float32 gamma = vx_setall_f32(_gamma);
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = v_load_f32(&in1[x]);
+            v_float32 a2 = v_load_f32(&in1[x + nlanes / 4]);
+            v_float32 a3 = v_load_f32(&in1[x + nlanes / 2]);
+            v_float32 a4 = v_load_f32(&in1[x + 3 * nlanes / 4]);
+            v_float32 b1 = v_load_f32(&in2[x]);
+            v_float32 b2 = v_load_f32(&in2[x + nlanes / 4]);
+            v_float32 b3 = v_load_f32(&in2[x + nlanes / 2]);
+            v_float32 b4 = v_load_f32(&in2[x + 3 * nlanes / 4]);
+
+            v_int32 sum1 = v_round(v_fma(a1, alpha, v_fma(b1, beta, gamma))),
+                    sum2 = v_round(v_fma(a2, alpha, v_fma(b2, beta, gamma))),
+                    sum3 = v_round(v_fma(a3, alpha, v_fma(b3, beta, gamma))),
+                    sum4 = v_round(v_fma(a4, alpha, v_fma(b4, beta, gamma)));
+
+            vx_store(&out[x], v_pack_u(v_pack(sum1, sum2), v_pack(sum3, sum4)));
+        }
+
+        if (x < length)
+        {
+            x = length - nlanes;
+            continue;  // process one more time (unaligned tail)
+        }
+        break;
+    }
+    return x;
+}
+
+template<typename SRC>
+CV_ALWAYS_INLINE int addw_simd(const SRC*, const SRC*, float*,
+                               const float, const float,
+                               const float, int)
+{
+    //Cases when dst type is float are successfully vectorized with compiler.
+    return 0;
+}
+#endif // CV_SSE2
 
 template<typename DST, typename SRC1, typename SRC2>
 static void run_addweighted(Buffer &dst, const View &src1, const View &src2,
@@ -117,8 +241,13 @@ static void run_addweighted(Buffer &dst, const View &src1, const View &src2,
     auto _beta  = static_cast<float>( beta  );
     auto _gamma = static_cast<float>( gamma );
 
-    for (int l=0; l < length; l++)
-        out[l] = addWeighted<DST>(in1[l], in2[l], _alpha, _beta, _gamma);
+    int x = 0;
+#if CV_SSE2
+    x = addw_simd(in1, in2, out, _alpha, _beta, _gamma, length);
+#endif
+
+    for (; x < length; ++x)
+        out[x] = addWeighted<DST>(in1[x], in2[x], _alpha, _beta, _gamma);
 }
 
 GAPI_FLUID_KERNEL(GFluidAddW, cv::gapi::core::GAddW, false)
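
A note on the loop structure used in both addw_simd overloads above: when length is not a multiple of nlanes, the code steps x back to length - nlanes and re-runs the last full vector, so a few elements near the end are computed twice. That is safe here because each output element depends only on its own inputs. A minimal standalone sketch of the same "overlapping tail" pattern follows; it is illustrative only and not part of the commit.

#include <algorithm>

// Generic overlapping-tail pattern: process full blocks of nlanes elements,
// then redo one final block starting at length - nlanes to cover the remainder.
static int process_with_overlapping_tail(const float* in, float* out, int length, int nlanes)
{
    if (length < nlanes)
        return 0;                    // shorter than one vector: caller falls back to scalar code

    int x = 0;
    for (;;)
    {
        for (; x <= length - nlanes; x += nlanes)
        {
            // stand-in for the vector body: here it just copies nlanes elements
            std::copy(in + x, in + x + nlanes, out + x);
        }
        if (x < length)
        {
            x = length - nlanes;     // step back; the overlapping elements are recomputed
            continue;
        }
        break;
    }
    return x;                        // equals length, so a scalar tail loop has nothing left to do
}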

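For context, here is a hedged usage sketch showing how the GFluidAddW kernel (and hence this SIMD path) is typically reached from user code. It assumes the standard G-API core API and the Fluid kernel package; matrix sizes and weights are arbitrary.

#include <opencv2/core.hpp>
#include <opencv2/gapi.hpp>
#include <opencv2/gapi/core.hpp>
#include <opencv2/gapi/fluid/core.hpp>

int main()
{
    // Build a small graph: c = addWeighted(a, alpha, b, beta, gamma)
    cv::GMat a, b;
    cv::GMat c = cv::gapi::addWeighted(a, 0.7, b, 0.3, 10.0);
    cv::GComputation graph(cv::GIn(a, b), cv::GOut(c));

    cv::Mat in1(480, 640, CV_8UC1, cv::Scalar(64));
    cv::Mat in2(480, 640, CV_8UC1, cv::Scalar(128));
    cv::Mat out;

    // Select the Fluid core kernel package so GFluidAddW handles the addWeighted node.
    graph.apply(cv::gin(in1, in2), cv::gout(out),
                cv::compile_args(cv::gapi::core::fluid::kernels()));
    return 0;
}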