@@ -2703,39 +2703,30 @@ void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0,
2703
2703
{
2704
2704
CALL_HAL (warpAffineBlocklineNN, cv_hal_warpAffineBlocklineNN, adelta, bdelta, xy, X0, Y0, bw);
2705
2705
2706
- const int AB_BITS = MAX (10 , ( int ) INTER_BITS);
2706
+ constexpr int AB_BITS = MAX (10 , static_cast < int >( INTER_BITS) );
2707
2707
int x1 = 0 ;
2708
- #if CV_TRY_SSE4_1
2709
- bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
2710
- if ( useSSE4_1 )
2711
- opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41 (adelta, bdelta, xy, X0, Y0, bw);
2712
- else
2713
- #endif
2708
+ #if (CV_SIMD || CV_SIMD_SCALABLE)
2714
2709
{
2715
- #if CV_SIMD128
2710
+ const v_int32 v_X0 = vx_setall_s32 (X0);
2711
+ const v_int32 v_Y0 = vx_setall_s32 (Y0);
2712
+ const int step = VTraits<v_int16>::vlanes ();
2713
+ for (; x1 <= bw - step; x1 += step)
2716
2714
{
2717
- v_int32x4 v_X0 = v_setall_s32 (X0), v_Y0 = v_setall_s32 (Y0);
2718
- int span = VTraits<v_uint16x8>::vlanes ();
2719
- for ( ; x1 <= bw - span; x1 += span )
2720
- {
2721
- v_int16x8 v_dst[2 ];
2722
- #define CV_CONVERT_MAP (ptr,offset,shift ) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
2723
- v_shr<AB_BITS>(v_add (shift,v_load (ptr + offset + 4 ))))
2724
- v_dst[0 ] = CV_CONVERT_MAP (adelta, x1, v_X0);
2725
- v_dst[1 ] = CV_CONVERT_MAP (bdelta, x1, v_Y0);
2726
- #undef CV_CONVERT_MAP
2727
- v_store_interleave (xy + (x1 << 1 ), v_dst[0 ], v_dst[1 ]);
2728
- }
2729
- }
2730
- #endif
2731
- for ( ; x1 < bw; x1++ )
2732
- {
2733
- int X = (X0 + adelta[x1]) >> AB_BITS;
2734
- int Y = (Y0 + bdelta[x1]) >> AB_BITS;
2735
- xy[x1*2 ] = saturate_cast<short >(X);
2736
- xy[x1*2 +1 ] = saturate_cast<short >(Y);
2715
+ v_int16 v_X = v_pack (v_shr<AB_BITS>(v_add (v_X0, vx_load (adelta + x1))),
2716
+ v_shr<AB_BITS>(v_add (v_X0, vx_load (adelta + x1 + step / 2 ))));
2717
+ v_int16 v_Y = v_pack (v_shr<AB_BITS>(v_add (v_Y0, vx_load (bdelta + x1))),
2718
+ v_shr<AB_BITS>(v_add (v_Y0, vx_load (bdelta + x1 + step / 2 ))));
2719
+ v_store_interleave (xy + 2 * x1, v_X, v_Y);
2737
2720
}
2738
2721
}
2722
+ #endif
2723
+ for (; x1 < bw; x1++)
2724
+ {
2725
+ const int X = (X0 + adelta[x1]) >> AB_BITS;
2726
+ const int Y = (Y0 + bdelta[x1]) >> AB_BITS;
2727
+ xy[x1 * 2 ] = saturate_cast<short >(X);
2728
+ xy[x1 * 2 + 1 ] = saturate_cast<short >(Y);
2729
+ }
2739
2730
}
2740
2731
2741
2732
void warpAffineBlockline (int *adelta, int *bdelta, short * xy, short * alpha, int X0, int Y0, int bw)
0 commit comments