@@ -229,25 +229,25 @@ class TrackerInvoker : public cv::ParallelLoopBody
229
229
v_int16x8 v01 = v_reinterpret_as_s16 (v_load_expand (Jptr + x + cn));
230
230
v_int16x8 v10 = v_reinterpret_as_s16 (v_load_expand (Jptr1 + x));
231
231
v_int16x8 v11 = v_reinterpret_as_s16 (v_load_expand (Jptr1 + x + cn));
232
- v_int16x8 vmask = v_reinterpret_as_s16 (v_load_expand (maskPtr + x)) * vmax_val_16;
232
+ v_int16x8 vmask = v_mul ( v_reinterpret_as_s16 (v_load_expand (maskPtr + x)), vmax_val_16) ;
233
233
234
234
v_int32x4 t0, t1;
235
235
v_int16x8 t00, t01, t10, t11;
236
236
v_zip (v00, v01, t00, t01);
237
237
v_zip (v10, v11, t10, t11);
238
238
239
239
// subpixel interpolation
240
- t0 = v_dotprod (t00, vqw0, vdelta) + v_dotprod (t10, vqw1);
241
- t1 = v_dotprod (t01, vqw0, vdelta) + v_dotprod (t11, vqw1);
242
- t0 = t0 >> ( W_BITS - 5 );
243
- t1 = t1 >> ( W_BITS - 5 );
240
+ t0 = v_add ( v_dotprod (t00, vqw0, vdelta), v_dotprod (t10, vqw1) );
241
+ t1 = v_add ( v_dotprod (t01, vqw0, vdelta), v_dotprod (t11, vqw1) );
242
+ t0 = v_shr (t0, W_BITS - 5 );
243
+ t1 = v_shr (t1, W_BITS - 5 );
244
244
245
245
// diff = J - I
246
- diff0 = v_pack (t0, t1) - vI ;
246
+ diff0 = v_sub ( v_pack (t0, t1), vI) ;
247
247
// I*gain.x + gain.x
248
248
v_mul_expand (vI, vgain_value, t0, t1);
249
- diff0 = diff0 + v_pack (t0 >> bitShift, t1 >> bitShift) + vconst_value;
250
- diff0 = diff0 & vmask;
249
+ diff0 = v_add ( v_add ( diff0, v_pack (v_shr (t0, bitShift), v_shr (t1, bitShift))), vconst_value) ;
250
+ diff0 = v_and ( diff0, vmask) ;
251
251
v_zip (diff0, diff0, diff2, diff1);
252
252
253
253
v_int32x4 diff0_0;
@@ -259,16 +259,16 @@ class TrackerInvoker : public cv::ParallelLoopBody
259
259
v_zip (vIxy_0, vIxy_1, v10, v11);
260
260
v_zip (diff2, diff1, v00, v01);
261
261
262
- vqb0 += v_cvt_f32 (v_dotprod (v00, v10));
263
- vqb1 += v_cvt_f32 (v_dotprod (v01, v11));
262
+ vqb0 = v_add (vqb0, v_cvt_f32 (v_dotprod (v00, v10) ));
263
+ vqb1 = v_add (vqb1, v_cvt_f32 (v_dotprod (v01, v11) ));
264
264
265
265
v_int32x4 vI0, vI1;
266
266
v_expand (vI, vI0, vI1);
267
- vqb2 += v_cvt_f32 (diff0_0 * vI0);
268
- vqb2 += v_cvt_f32 (diff0_1 * vI1);
267
+ vqb2 = v_add (vqb2, v_cvt_f32 (v_mul ( diff0_0, vI0)) );
268
+ vqb2 = v_add (vqb2, v_cvt_f32 (v_mul ( diff0_1, vI1)) );
269
269
270
- vqb3 += v_cvt_f32 (diff0_0);
271
- vqb3 += v_cvt_f32 (diff0_1);
270
+ vqb3 = v_add (vqb3, v_cvt_f32 (diff0_0) );
271
+ vqb3 = v_add (vqb3, v_cvt_f32 (diff0_1) );
272
272
273
273
if (j == 0 )
274
274
{
@@ -285,17 +285,17 @@ class TrackerInvoker : public cv::ParallelLoopBody
285
285
vAxx = v_muladd (fx, fx, vAxx);
286
286
287
287
// sumIx and sumIy
288
- vsumIx += fx ;
289
- vsumIy += fy ;
288
+ vsumIx = v_add (vsumIx, fx) ;
289
+ vsumIy = v_add (vsumIy, fy) ;
290
290
291
- vsumW1 += vI_ps * fx ;
292
- vsumW2 += vI_ps * fy ;
291
+ vsumW1 = v_add (vsumW1, v_mul ( vI_ps, fx)) ;
292
+ vsumW2 = v_add (vsumW2, v_mul ( vI_ps, fy)) ;
293
293
294
294
// sumI
295
- vsumI += vI_ps;
295
+ vsumI = v_add (vsumI, vI_ps) ;
296
296
297
297
// sumDI
298
- vsumDI += vI_ps * vI_ps;
298
+ vsumDI = v_add (vsumDI, v_mul ( vI_ps, vI_ps)) ;
299
299
300
300
v01 = v_reinterpret_as_s16 (v_interleave_pairs (v_reinterpret_as_s32 (v_interleave_pairs (vIxy_1))));
301
301
v_expand (v01, t1, t0);
@@ -309,17 +309,17 @@ class TrackerInvoker : public cv::ParallelLoopBody
309
309
vAxx = v_muladd (fx, fx, vAxx);
310
310
311
311
// sumIx and sumIy
312
- vsumIx += fx ;
313
- vsumIy += fy ;
312
+ vsumIx = v_add (vsumIx, fx) ;
313
+ vsumIy = v_add (vsumIy, fy) ;
314
314
315
- vsumW1 += vI_ps * fx ;
316
- vsumW2 += vI_ps * fy ;
315
+ vsumW1 = v_add (vsumW1, v_mul ( vI_ps, fx)) ;
316
+ vsumW2 = v_add (vsumW2, v_mul ( vI_ps, fy)) ;
317
317
318
318
// sumI
319
- vsumI += vI_ps;
319
+ vsumI = v_add (vsumI, vI_ps) ;
320
320
321
321
// sumDI
322
- vsumDI += vI_ps * vI_ps;
322
+ vsumDI = v_add (vsumDI, v_mul ( vI_ps, vI_ps)) ;
323
323
}
324
324
}
325
325
#else
@@ -388,7 +388,7 @@ class TrackerInvoker : public cv::ParallelLoopBody
388
388
389
389
#if CV_SIMD128
390
390
float CV_DECL_ALIGNED (16 ) bbuf[4 ];
391
- v_store_aligned (bbuf, vqb0 + vqb1);
391
+ v_store_aligned (bbuf, v_add ( vqb0, vqb1) );
392
392
b1 = bbuf[0 ] + bbuf[2 ];
393
393
b2 = bbuf[1 ] + bbuf[3 ];
394
394
b3 = v_reduce_sum (vqb2);
@@ -696,19 +696,19 @@ class TrackerInvoker : public cv::ParallelLoopBody
696
696
v_int16x8 v01 = v_reinterpret_as_s16 (v_load_expand (Jptr + x + cn));
697
697
v_int16x8 v10 = v_reinterpret_as_s16 (v_load_expand (Jptr1 + x));
698
698
v_int16x8 v11 = v_reinterpret_as_s16 (v_load_expand (Jptr1 + x + cn));
699
- v_int16x8 vmask = v_reinterpret_as_s16 (v_load_expand (maskPtr + x)) * vmax_val_16;
699
+ v_int16x8 vmask = v_mul ( v_reinterpret_as_s16 (v_load_expand (maskPtr + x)), vmax_val_16) ;
700
700
701
701
v_int32x4 t0, t1;
702
702
v_int16x8 t00, t01, t10, t11;
703
703
v_zip (v00, v01, t00, t01);
704
704
v_zip (v10, v11, t10, t11);
705
705
706
- t0 = v_dotprod (t00, vqw0, vdelta) + v_dotprod (t10, vqw1);
707
- t1 = v_dotprod (t01, vqw0, vdelta) + v_dotprod (t11, vqw1);
708
- t0 = t0 >> ( W_BITS - 5 );
709
- t1 = t1 >> ( W_BITS - 5 );
710
- diff0 = v_pack (t0, t1) - diff0;
711
- diff0 = diff0 & vmask;
706
+ t0 = v_add ( v_dotprod (t00, vqw0, vdelta), v_dotprod (t10, vqw1) );
707
+ t1 = v_add ( v_dotprod (t01, vqw0, vdelta), v_dotprod (t11, vqw1) );
708
+ t0 = v_shr (t0, W_BITS - 5 );
709
+ t1 = v_shr (t1, W_BITS - 5 );
710
+ diff0 = v_sub ( v_pack (t0, t1), diff0) ;
711
+ diff0 = v_and ( diff0, vmask) ;
712
712
713
713
v_zip (diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ...
714
714
@@ -717,8 +717,8 @@ class TrackerInvoker : public cv::ParallelLoopBody
717
717
v_zip (vIxy_0, vIxy_1, v10, v11);
718
718
v_zip (diff2, diff1, v00, v01);
719
719
720
- vqb0 += v_cvt_f32 (v_dotprod (v00, v10));
721
- vqb1 += v_cvt_f32 (v_dotprod (v01, v11));
720
+ vqb0 = v_add (vqb0, v_cvt_f32 (v_dotprod (v00, v10) ));
721
+ vqb1 = v_add (vqb1, v_cvt_f32 (v_dotprod (v01, v11) ));
722
722
}
723
723
#else
724
724
for ( ; x < winSize.width *cn; x++, dIptr += 2 )
@@ -737,7 +737,7 @@ class TrackerInvoker : public cv::ParallelLoopBody
737
737
738
738
#if CV_SIMD128
739
739
float CV_DECL_ALIGNED (16 ) bbuf[4 ];
740
- v_store_aligned (bbuf, vqb0 + vqb1);
740
+ v_store_aligned (bbuf, v_add ( vqb0, vqb1) );
741
741
b1 = bbuf[0 ] + bbuf[2 ];
742
742
b2 = bbuf[1 ] + bbuf[3 ];
743
743
#endif
0 commit comments