Skip to content

Commit b639795

Browse files
committed
Merge pull request #2403 from alalek:fix_core_simd_emulator
2 parents 7a817da + ee3ef10 commit b639795

File tree

2 files changed

+19
-14
lines changed

2 files changed

+19
-14
lines changed

modules/optflow/src/sparse_matching_gpc.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,7 @@
4141
//M*/
4242

4343
#include "precomp.hpp"
44+
#undef CV_FORCE_SIMD128_CPP // mixed HAL SIMD/SSE code
4445
#include "opencv2/core/core_c.h"
4546
#include "opencv2/core/private.hpp"
4647
#include "opencv2/flann/miniflann.hpp"

modules/xphoto/src/learning_based_color_balance.cpp

Lines changed: 18 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -262,26 +262,26 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
262262
uint sumB = 0, sumG = 0, sumR = 0;
263263
uchar *src_ptr = src.ptr<uchar>();
264264
#if CV_SIMD128
265-
v_uint8x16 v_inB, v_inG, v_inR, v_mask;
266-
v_uint16x8 v_sR1, v_sR2, v_sG1, v_sG2, v_sB1, v_sB2, v_sum;
267-
v_uint16x8 v_max_sum = v_setall_u16(0), v_max_mask, v_brightestR, v_brightestG, v_brightestB;
268-
v_uint32x4 v_uint1, v_uint2, v_SB = v_setzero_u32(), v_SG = v_setzero_u32(), v_SR = v_setzero_u32();
265+
v_uint16x8 v_max_sum = v_setall_u16(0), v_brightestR = v_setall_u16(0), v_brightestG = v_setall_u16(0), v_brightestB = v_setall_u16(0);
266+
v_uint32x4 v_SB = v_setzero_u32(), v_SG = v_setzero_u32(), v_SR = v_setzero_u32();
269267
for (; i < src_len - 15; i += 16)
270268
{
269+
v_uint8x16 v_inB, v_inG, v_inR;
271270
v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR);
272-
v_mask = v_load(mask_ptr + i);
271+
v_uint8x16 v_mask = v_load(mask_ptr + i);
273272

274273
v_inB &= v_mask;
275274
v_inG &= v_mask;
276275
v_inR &= v_mask;
277276

277+
v_uint16x8 v_sR1, v_sR2, v_sG1, v_sG2, v_sB1, v_sB2;
278278
v_expand(v_inB, v_sB1, v_sB2);
279279
v_expand(v_inG, v_sG1, v_sG2);
280280
v_expand(v_inR, v_sR1, v_sR2);
281281

282282
// update the brightest (R,G,B) tuple (process left half):
283-
v_sum = v_sB1 + v_sG1 + v_sR1;
284-
v_max_mask = (v_sum > v_max_sum);
283+
v_uint16x8 v_sum = v_sB1 + v_sG1 + v_sR1;
284+
v_uint16x8 v_max_mask = (v_sum > v_max_sum);
285285
v_max_sum = v_max(v_sum, v_max_sum);
286286
v_brightestB = (v_sB1 & v_max_mask) + (v_brightestB & (~v_max_mask));
287287
v_brightestG = (v_sG1 & v_max_mask) + (v_brightestG & (~v_max_mask));
@@ -299,6 +299,8 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
299299
v_sB1 = v_sB1 + v_sB2;
300300
v_sG1 = v_sG1 + v_sG2;
301301
v_sR1 = v_sR1 + v_sR2;
302+
303+
v_uint32x4 v_uint1, v_uint2;
302304
v_expand(v_sB1, v_uint1, v_uint2);
303305
v_SB += v_uint1 + v_uint2;
304306
v_expand(v_sG1, v_uint1, v_uint2);
@@ -351,27 +353,28 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
351353
uint64 sumB = 0, sumG = 0, sumR = 0;
352354
ushort *src_ptr = src.ptr<ushort>();
353355
#if CV_SIMD128
354-
v_uint16x8 v_inB, v_inG, v_inR, v_mask, v_mask_lower = v_setall_u16(255);
355-
v_uint32x4 v_iR1, v_iR2, v_iG1, v_iG2, v_iB1, v_iB2, v_sum;
356-
v_uint32x4 v_max_sum = v_setall_u32(0), v_max_mask, v_brightestR, v_brightestG, v_brightestB;
357-
v_uint64x2 v_uint64_1, v_uint64_2, v_SB = v_setzero_u64(), v_SG = v_setzero_u64(), v_SR = v_setzero_u64();
356+
const v_uint16x8 v_mask_lower = v_setall_u16(255);
357+
v_uint32x4 v_max_sum = v_setall_u32(0), v_brightestR = v_setall_u32(0), v_brightestG = v_setall_u32(0), v_brightestB = v_setall_u32(0);
358+
v_uint64x2 v_SB = v_setzero_u64(), v_SG = v_setzero_u64(), v_SR = v_setzero_u64();
358359
for (; i < src_len - 7; i += 8)
359360
{
361+
v_uint16x8 v_inB, v_inG, v_inR;
360362
v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR);
361-
v_mask = v_load_expand(mask_ptr + i);
363+
v_uint16x8 v_mask = v_load_expand(mask_ptr + i);
362364
v_mask = v_mask | ((v_mask & v_mask_lower) << 8);
363365

364366
v_inB &= v_mask;
365367
v_inG &= v_mask;
366368
v_inR &= v_mask;
367369

370+
v_uint32x4 v_iR1, v_iR2, v_iG1, v_iG2, v_iB1, v_iB2;
368371
v_expand(v_inB, v_iB1, v_iB2);
369372
v_expand(v_inG, v_iG1, v_iG2);
370373
v_expand(v_inR, v_iR1, v_iR2);
371374

372375
// update the brightest (R,G,B) tuple (process left half):
373-
v_sum = v_iB1 + v_iG1 + v_iR1;
374-
v_max_mask = (v_sum > v_max_sum);
376+
v_uint32x4 v_sum = v_iB1 + v_iG1 + v_iR1;
377+
v_uint32x4 v_max_mask = (v_sum > v_max_sum);
375378
v_max_sum = v_max(v_sum, v_max_sum);
376379
v_brightestB = (v_iB1 & v_max_mask) + (v_brightestB & (~v_max_mask));
377380
v_brightestG = (v_iG1 & v_max_mask) + (v_brightestG & (~v_max_mask));
@@ -389,6 +392,7 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average
389392
v_iB1 = v_iB1 + v_iB2;
390393
v_iG1 = v_iG1 + v_iG2;
391394
v_iR1 = v_iR1 + v_iR2;
395+
v_uint64x2 v_uint64_1, v_uint64_2;
392396
v_expand(v_iB1, v_uint64_1, v_uint64_2);
393397
v_SB += v_uint64_1 + v_uint64_2;
394398
v_expand(v_iG1, v_uint64_1, v_uint64_2);

0 commit comments

Comments
 (0)