@@ -97,6 +97,130 @@ static inline DST divr(SRC1 x, SRC2 y, float scale=1)
// Fluid kernels: addWeighted
//
// ---------------------------
+ #if CV_SSE2
+ CV_ALWAYS_INLINE v_float32 v_load_f32(const ushort* in)
+ {
+     return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(in)));
+ }
+
+ CV_ALWAYS_INLINE v_float32 v_load_f32(const short* in)
+ {
+     return v_cvt_f32(vx_load_expand(in));
+ }
+
+ CV_ALWAYS_INLINE v_float32 v_load_f32(const uchar* in)
+ {
+     return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(in)));
+ }
+
+ CV_ALWAYS_INLINE void addw_short_store(short* out, const v_int32& c1, const v_int32& c2)
+ {
+     vx_store(out, v_pack(c1, c2));
+ }
+
+ CV_ALWAYS_INLINE void addw_short_store(ushort* out, const v_int32& c1, const v_int32& c2)
+ {
+     vx_store(out, v_pack_u(c1, c2));
+ }
+
+ template<typename SRC, typename DST>
+ CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], DST out[],
+                                const float _alpha, const float _beta,
+                                const float _gamma, int length)
+ {
+     static_assert(((std::is_same<SRC, ushort>::value) && (std::is_same<DST, ushort>::value)) ||
+                   ((std::is_same<SRC, short>::value)  && (std::is_same<DST, short>::value)),
+                   "This templated overload is only for short and ushort type combinations.");
+
+     constexpr int nlanes = (std::is_same<DST, ushort>::value) ? static_cast<int>(v_uint16::nlanes) :
+                                                                 static_cast<int>(v_int16::nlanes);
+
+     if (length < nlanes)
+         return 0;
+
+     v_float32 alpha = vx_setall_f32(_alpha);
+     v_float32 beta  = vx_setall_f32(_beta);
+     v_float32 gamma = vx_setall_f32(_gamma);
+
+     int x = 0;
+     for (;;)
+     {
+         for (; x <= length - nlanes; x += nlanes)
+         {
+             v_float32 a1 = v_load_f32(&in1[x]);
+             v_float32 a2 = v_load_f32(&in1[x + nlanes / 2]);
+             v_float32 b1 = v_load_f32(&in2[x]);
+             v_float32 b2 = v_load_f32(&in2[x + nlanes / 2]);
+
+             addw_short_store(&out[x], v_round(v_fma(a1, alpha, v_fma(b1, beta, gamma))),
+                                       v_round(v_fma(a2, alpha, v_fma(b2, beta, gamma))));
+         }
+
+         if (x < length)
+         {
+             x = length - nlanes;
+             continue;  // process one more time (unaligned tail)
+         }
+         break;
+     }
+     return x;
+ }
+
+ template<typename SRC>
+ CV_ALWAYS_INLINE int addw_simd(const SRC in1[], const SRC in2[], uchar out[],
+                                const float _alpha, const float _beta,
+                                const float _gamma, int length)
+ {
+     constexpr int nlanes = v_uint8::nlanes;
+
+     if (length < nlanes)
+         return 0;
+
+     v_float32 alpha = vx_setall_f32(_alpha);
+     v_float32 beta  = vx_setall_f32(_beta);
+     v_float32 gamma = vx_setall_f32(_gamma);
+
+     int x = 0;
+     for (;;)
+     {
+         for (; x <= length - nlanes; x += nlanes)
+         {
+             v_float32 a1 = v_load_f32(&in1[x]);
+             v_float32 a2 = v_load_f32(&in1[x + nlanes / 4]);
+             v_float32 a3 = v_load_f32(&in1[x + nlanes / 2]);
+             v_float32 a4 = v_load_f32(&in1[x + 3 * nlanes / 4]);
+             v_float32 b1 = v_load_f32(&in2[x]);
+             v_float32 b2 = v_load_f32(&in2[x + nlanes / 4]);
+             v_float32 b3 = v_load_f32(&in2[x + nlanes / 2]);
+             v_float32 b4 = v_load_f32(&in2[x + 3 * nlanes / 4]);
+
+             v_int32 sum1 = v_round(v_fma(a1, alpha, v_fma(b1, beta, gamma))),
+                     sum2 = v_round(v_fma(a2, alpha, v_fma(b2, beta, gamma))),
+                     sum3 = v_round(v_fma(a3, alpha, v_fma(b3, beta, gamma))),
+                     sum4 = v_round(v_fma(a4, alpha, v_fma(b4, beta, gamma)));
+
+             vx_store(&out[x], v_pack_u(v_pack(sum1, sum2), v_pack(sum3, sum4)));
+         }
+
+         if (x < length)
+         {
+             x = length - nlanes;
+             continue;  // process one more time (unaligned tail)
+         }
+         break;
+     }
+     return x;
+ }
+
+ template<typename SRC>
+ CV_ALWAYS_INLINE int addw_simd(const SRC*, const SRC*, float*,
+                                const float, const float,
+                                const float, int)
+ {
+     // Cases when dst type is float are successfully vectorized with compiler.
+     return 0;
+ }
+ #endif  // CV_SSE2

template<typename DST, typename SRC1, typename SRC2>
static void run_addweighted(Buffer &dst, const View &src1, const View &src2,
@@ -117,8 +241,13 @@ static void run_addweighted(Buffer &dst, const View &src1, const View &src2,
    auto _beta  = static_cast<float>( beta  );
    auto _gamma = static_cast<float>( gamma );

-     for (int l=0; l < length; l++)
-         out[l] = addWeighted<DST>(in1[l], in2[l], _alpha, _beta, _gamma);
+     int x = 0;
+ #if CV_SSE2
+     x = addw_simd(in1, in2, out, _alpha, _beta, _gamma, length);
+ #endif
+
+     for (; x < length; ++x)
+         out[x] = addWeighted<DST>(in1[x], in2[x], _alpha, _beta, _gamma);
}

GAPI_FLUID_KERNEL(GFluidAddW, cv::gapi::core::GAddW, false)