Skip to content

Commit ee95bfe

Browse files
authored
Merge pull request opencv#26203 from FantasqueX:generic-simd-warpAffineBlocklineNN
Use generic SIMD in warpAffineBlocklineNN
2 parents ddc03c0 + 45b9398 commit ee95bfe

File tree

3 files changed

+19
-65
lines changed

3 files changed

+19
-65
lines changed

modules/imgproc/src/imgwarp.cpp

Lines changed: 19 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -2703,39 +2703,30 @@ void warpAffineBlocklineNN(int *adelta, int *bdelta, short* xy, int X0, int Y0,
27032703
{
27042704
CALL_HAL(warpAffineBlocklineNN, cv_hal_warpAffineBlocklineNN, adelta, bdelta, xy, X0, Y0, bw);
27052705

2706-
const int AB_BITS = MAX(10, (int)INTER_BITS);
2706+
constexpr int AB_BITS = MAX(10, static_cast<int>(INTER_BITS));
27072707
int x1 = 0;
2708-
#if CV_TRY_SSE4_1
2709-
bool useSSE4_1 = CV_CPU_HAS_SUPPORT_SSE4_1;
2710-
if( useSSE4_1 )
2711-
opt_SSE4_1::WarpAffineInvoker_Blockline_SSE41(adelta, bdelta, xy, X0, Y0, bw);
2712-
else
2713-
#endif
2708+
#if (CV_SIMD || CV_SIMD_SCALABLE)
27142709
{
2715-
#if CV_SIMD128
2710+
const v_int32 v_X0 = vx_setall_s32(X0);
2711+
const v_int32 v_Y0 = vx_setall_s32(Y0);
2712+
const int step = VTraits<v_int16>::vlanes();
2713+
for (; x1 <= bw - step; x1 += step)
27162714
{
2717-
v_int32x4 v_X0 = v_setall_s32(X0), v_Y0 = v_setall_s32(Y0);
2718-
int span = VTraits<v_uint16x8>::vlanes();
2719-
for( ; x1 <= bw - span; x1 += span )
2720-
{
2721-
v_int16x8 v_dst[2];
2722-
#define CV_CONVERT_MAP(ptr,offset,shift) v_pack(v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset))),\
2723-
v_shr<AB_BITS>(v_add(shift,v_load(ptr + offset + 4))))
2724-
v_dst[0] = CV_CONVERT_MAP(adelta, x1, v_X0);
2725-
v_dst[1] = CV_CONVERT_MAP(bdelta, x1, v_Y0);
2726-
#undef CV_CONVERT_MAP
2727-
v_store_interleave(xy + (x1 << 1), v_dst[0], v_dst[1]);
2728-
}
2729-
}
2730-
#endif
2731-
for( ; x1 < bw; x1++ )
2732-
{
2733-
int X = (X0 + adelta[x1]) >> AB_BITS;
2734-
int Y = (Y0 + bdelta[x1]) >> AB_BITS;
2735-
xy[x1*2] = saturate_cast<short>(X);
2736-
xy[x1*2+1] = saturate_cast<short>(Y);
2715+
v_int16 v_X = v_pack(v_shr<AB_BITS>(v_add(v_X0, vx_load(adelta + x1))),
2716+
v_shr<AB_BITS>(v_add(v_X0, vx_load(adelta + x1 + step / 2))));
2717+
v_int16 v_Y = v_pack(v_shr<AB_BITS>(v_add(v_Y0, vx_load(bdelta + x1))),
2718+
v_shr<AB_BITS>(v_add(v_Y0, vx_load(bdelta + x1 + step / 2))));
2719+
v_store_interleave(xy + 2 * x1, v_X, v_Y);
27372720
}
27382721
}
2722+
#endif
2723+
for (; x1 < bw; x1++)
2724+
{
2725+
const int X = (X0 + adelta[x1]) >> AB_BITS;
2726+
const int Y = (Y0 + bdelta[x1]) >> AB_BITS;
2727+
xy[x1 * 2] = saturate_cast<short>(X);
2728+
xy[x1 * 2 + 1] = saturate_cast<short>(Y);
2729+
}
27392730
}
27402731

27412732
void warpAffineBlockline(int *adelta, int *bdelta, short* xy, short* alpha, int X0, int Y0, int bw)

modules/imgproc/src/imgwarp.hpp

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -74,7 +74,6 @@ namespace opt_SSE4_1
7474
void convertMaps_nninterpolate32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, int width);
7575
void convertMaps_32f1c16s_SSE41(const float* src1f, const float* src2f, short* dst1, ushort* dst2, int width);
7676
void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, int width);
77-
void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw);
7877

7978
class WarpPerspectiveLine_SSE4
8079
{

modules/imgproc/src/imgwarp.sse4_1.cpp

Lines changed: 0 additions & 36 deletions
Original file line number | Diff line number | Diff line change
@@ -173,42 +173,6 @@ void convertMaps_32f2c16s_SSE41(const float* src1f, short* dst1, ushort* dst2, i
173173
}
174174
}
175175

176-
void WarpAffineInvoker_Blockline_SSE41(int *adelta, int *bdelta, short* xy, int X0, int Y0, int bw)
177-
{
178-
const int AB_BITS = MAX(10, (int)INTER_BITS);
179-
int x1 = 0;
180-
181-
__m128i v_X0 = _mm_set1_epi32(X0);
182-
__m128i v_Y0 = _mm_set1_epi32(Y0);
183-
for (; x1 <= bw - 16; x1 += 16)
184-
{
185-
__m128i v_x0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1))), AB_BITS),
186-
_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 4))), AB_BITS));
187-
__m128i v_x1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 8))), AB_BITS),
188-
_mm_srai_epi32(_mm_add_epi32(v_X0, _mm_loadu_si128((__m128i const *)(adelta + x1 + 12))), AB_BITS));
189-
190-
__m128i v_y0 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1))), AB_BITS),
191-
_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 4))), AB_BITS));
192-
__m128i v_y1 = _mm_packs_epi32(_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 8))), AB_BITS),
193-
_mm_srai_epi32(_mm_add_epi32(v_Y0, _mm_loadu_si128((__m128i const *)(bdelta + x1 + 12))), AB_BITS));
194-
195-
_mm_interleave_epi16(v_x0, v_x1, v_y0, v_y1);
196-
197-
_mm_storeu_si128((__m128i *)(xy + x1 * 2), v_x0);
198-
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 8), v_x1);
199-
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 16), v_y0);
200-
_mm_storeu_si128((__m128i *)(xy + x1 * 2 + 24), v_y1);
201-
}
202-
for (; x1 < bw; x1++)
203-
{
204-
int X = (X0 + adelta[x1]) >> AB_BITS;
205-
int Y = (Y0 + bdelta[x1]) >> AB_BITS;
206-
xy[x1 * 2] = saturate_cast<short>(X);
207-
xy[x1 * 2 + 1] = saturate_cast<short>(Y);
208-
}
209-
}
210-
211-
212176
class WarpPerspectiveLine_SSE4_Impl CV_FINAL : public WarpPerspectiveLine_SSE4
213177
{
214178
public:

0 commit comments

Comments
 (0)