Skip to content

Commit 7ab3a80

Browse files
author
Anna Khakimova
authored
Merge pull request opencv#19233 from anna-khakimova:ak/simd_absdiffc
GAPI: SIMD optimization for AbsDiffC kernel
* SIMD optimization for AbsDiffC kernel
* Applied comments
* Applying comments and refactoring: Remove new univ intrinsics.
* Performance experiment
* Applied comments.Step2
* Applied comments. Step3
1 parent e5518ee commit 7ab3a80

File tree

4 files changed

+299
-21
lines changed

4 files changed

+299
-21
lines changed

modules/gapi/include/opencv2/gapi/core.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -298,8 +298,8 @@ namespace core {
298298
}
299299
};
300300

301-
G_TYPED_KERNEL(GAbsDiffC, <GMat(GMat, GScalar)>, "org.opencv.core.matrixop.absdiffC") {
302-
static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
301+
G_TYPED_KERNEL(GAbsDiffC, <GMat(GMat,GScalar)>, "org.opencv.core.matrixop.absdiffC") {
302+
static GMatDesc outMeta(const GMatDesc& a, const GScalarDesc&) {
303303
return a;
304304
}
305305
};

modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,9 @@ INSTANTIATE_TEST_CASE_P(AbsDiffPerfTestFluid, AbsDiffPerfTest,
147147

148148
INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestFluid, AbsDiffCPerfTest,
149149
Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
150-
Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
150+
Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_8UC2,
151+
CV_16UC2, CV_16SC2, CV_8UC3, CV_16UC3,
152+
CV_16SC3, CV_8UC4, CV_16UC4, CV_16SC4),
151153
Values(cv::compile_args(CORE_FLUID))));
152154

153155
// INSTANTIATE_TEST_CASE_P(SumPerfTestFluid, SumPerfTest,

modules/gapi/src/backends/fluid/gfluidcore.cpp

Lines changed: 291 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ static inline DST divr(SRC1 x, SRC2 y, float scale=1)
9797
// Fluid kernels: addWeighted
9898
//
9999
//---------------------------
100-
#if CV_SSE2
100+
#if CV_SIMD
101101
CV_ALWAYS_INLINE v_float32 v_load_f32(const ushort* in)
102102
{
103103
return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(in)));
@@ -112,7 +112,9 @@ CV_ALWAYS_INLINE v_float32 v_load_f32(const uchar* in)
112112
{
113113
return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(in)));
114114
}
115+
#endif
115116

117+
#if CV_SSE2
116118
CV_ALWAYS_INLINE void addw_short_store(short* out, const v_int32& c1, const v_int32& c2)
117119
{
118120
vx_store(out, v_pack(c1, c2));
@@ -972,6 +974,262 @@ static void run_arithm_s(DST out[], const SRC in[], int width, int chan,
972974
CV_Error(cv::Error::StsBadArg, "unsupported number of channels");
973975
}
974976

977+
#if CV_SIMD
978+
CV_ALWAYS_INLINE void absdiffc_short_store_c1c2c4(short* out_ptr, const v_int32& c1, const v_int32& c2)
979+
{
980+
vx_store(out_ptr, v_pack(c1, c2));
981+
}
982+
983+
CV_ALWAYS_INLINE void absdiffc_short_store_c1c2c4(ushort* out_ptr, const v_int32& c1, const v_int32& c2)
984+
{
985+
vx_store(out_ptr, v_pack_u(c1, c2));
986+
}
987+
988+
template<typename T>
989+
CV_ALWAYS_INLINE int absdiffc_simd_c1c2c4(const T in[], T out[],
990+
const v_float32& s, const int length)
991+
{
992+
static_assert((std::is_same<T, ushort>::value) || (std::is_same<T, short>::value),
993+
"This templated overload is only for short or ushort type combinations.");
994+
995+
constexpr int nlanes = (std::is_same<T, ushort>::value) ? static_cast<int>(v_uint16::nlanes) :
996+
static_cast<int>(v_int16::nlanes);
997+
if (length < nlanes)
998+
return 0;
999+
1000+
int x = 0;
1001+
for (;;)
1002+
{
1003+
for (; x <= length - nlanes; x += nlanes)
1004+
{
1005+
v_float32 a1 = v_load_f32(in + x);
1006+
v_float32 a2 = v_load_f32(in + x + nlanes / 2);
1007+
1008+
absdiffc_short_store_c1c2c4(&out[x], v_round(v_absdiff(a1, s)),
1009+
v_round(v_absdiff(a2, s)));
1010+
}
1011+
1012+
if (x < length && (in != out))
1013+
{
1014+
x = length - nlanes;
1015+
continue; // process unaligned tail
1016+
}
1017+
break;
1018+
}
1019+
return x;
1020+
}
1021+
1022+
template<>
1023+
CV_ALWAYS_INLINE int absdiffc_simd_c1c2c4<uchar>(const uchar in[], uchar out[],
1024+
const v_float32& s, const int length)
1025+
{
1026+
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
1027+
1028+
if (length < nlanes)
1029+
return 0;
1030+
1031+
int x = 0;
1032+
for (;;)
1033+
{
1034+
for (; x <= length - nlanes; x += nlanes)
1035+
{
1036+
v_float32 a1 = v_load_f32(in + x);
1037+
v_float32 a2 = v_load_f32(in + x + nlanes / 4);
1038+
v_float32 a3 = v_load_f32(in + x + nlanes / 2);
1039+
v_float32 a4 = v_load_f32(in + x + 3 * nlanes / 4);
1040+
1041+
vx_store(&out[x], v_pack_u(v_pack(v_round(v_absdiff(a1, s)),
1042+
v_round(v_absdiff(a2, s))),
1043+
v_pack(v_round(v_absdiff(a3, s)),
1044+
v_round(v_absdiff(a4, s)))));
1045+
}
1046+
1047+
if (x < length && (in != out))
1048+
{
1049+
x = length - nlanes;
1050+
continue; // process unaligned tail
1051+
}
1052+
break;
1053+
}
1054+
return x;
1055+
}
1056+
1057+
CV_ALWAYS_INLINE void absdiffc_short_store_c3(short* out_ptr, const v_int32& c1,
1058+
const v_int32& c2, const v_int32& c3,
1059+
const v_int32& c4, const v_int32& c5,
1060+
const v_int32& c6)
1061+
{
1062+
constexpr int nlanes = static_cast<int>(v_int16::nlanes);
1063+
vx_store(out_ptr, v_pack(c1, c2));
1064+
vx_store(out_ptr + nlanes, v_pack(c3, c4));
1065+
vx_store(out_ptr + 2*nlanes, v_pack(c5, c6));
1066+
}
1067+
1068+
CV_ALWAYS_INLINE void absdiffc_short_store_c3(ushort* out_ptr, const v_int32& c1,
1069+
const v_int32& c2, const v_int32& c3,
1070+
const v_int32& c4, const v_int32& c5,
1071+
const v_int32& c6)
1072+
{
1073+
constexpr int nlanes = static_cast<int>(v_uint16::nlanes);
1074+
vx_store(out_ptr, v_pack_u(c1, c2));
1075+
vx_store(out_ptr + nlanes, v_pack_u(c3, c4));
1076+
vx_store(out_ptr + 2*nlanes, v_pack_u(c5, c6));
1077+
}
1078+
1079+
template<typename T>
1080+
CV_ALWAYS_INLINE int absdiffc_simd_c3_impl(const T in[], T out[],
1081+
const v_float32& s1, const v_float32& s2,
1082+
const v_float32& s3, const int length)
1083+
{
1084+
static_assert((std::is_same<T, ushort>::value) || (std::is_same<T, short>::value),
1085+
"This templated overload is only for short or ushort type combinations.");
1086+
1087+
constexpr int nlanes = (std::is_same<T, ushort>::value) ? static_cast<int>(v_uint16::nlanes):
1088+
static_cast<int>(v_int16::nlanes);
1089+
1090+
if (length < 3 * nlanes)
1091+
return 0;
1092+
1093+
int x = 0;
1094+
for (;;)
1095+
{
1096+
for (; x <= length - 3 * nlanes; x += 3 * nlanes)
1097+
{
1098+
v_float32 a1 = v_load_f32(in + x);
1099+
v_float32 a2 = v_load_f32(in + x + nlanes / 2);
1100+
v_float32 a3 = v_load_f32(in + x + nlanes);
1101+
v_float32 a4 = v_load_f32(in + x + 3 * nlanes / 2);
1102+
v_float32 a5 = v_load_f32(in + x + 2 * nlanes);
1103+
v_float32 a6 = v_load_f32(in + x + 5 * nlanes / 2);
1104+
1105+
absdiffc_short_store_c3(&out[x], v_round(v_absdiff(a1, s1)),
1106+
v_round(v_absdiff(a2, s2)),
1107+
v_round(v_absdiff(a3, s3)),
1108+
v_round(v_absdiff(a4, s1)),
1109+
v_round(v_absdiff(a5, s2)),
1110+
v_round(v_absdiff(a6, s3)));
1111+
}
1112+
1113+
if (x < length && (in != out))
1114+
{
1115+
x = length - 3 * nlanes;
1116+
continue; // process unaligned tail
1117+
}
1118+
break;
1119+
}
1120+
return x;
1121+
}
1122+
1123+
template<>
1124+
CV_ALWAYS_INLINE int absdiffc_simd_c3_impl<uchar>(const uchar in[], uchar out[],
1125+
const v_float32& s1, const v_float32& s2,
1126+
const v_float32& s3, const int length)
1127+
{
1128+
constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
1129+
1130+
if (length < 3 * nlanes)
1131+
return 0;
1132+
1133+
int x = 0;
1134+
1135+
for (;;)
1136+
{
1137+
for (; x <= length - 3 * nlanes; x += 3 * nlanes)
1138+
{
1139+
vx_store(&out[x],
1140+
v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x), s1)),
1141+
v_round(v_absdiff(v_load_f32(in + x + nlanes/4), s2))),
1142+
v_pack(v_round(v_absdiff(v_load_f32(in + x + nlanes/2), s3)),
1143+
v_round(v_absdiff(v_load_f32(in + x + 3*nlanes/4), s1)))));
1144+
1145+
vx_store(&out[x + nlanes],
1146+
v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x + nlanes), s2)),
1147+
v_round(v_absdiff(v_load_f32(in + x + 5*nlanes/4), s3))),
1148+
v_pack(v_round(v_absdiff(v_load_f32(in + x + 3*nlanes/2), s1)),
1149+
v_round(v_absdiff(v_load_f32(in + x + 7*nlanes/4), s2)))));
1150+
1151+
vx_store(&out[x + 2 * nlanes],
1152+
v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x + 2*nlanes), s3)),
1153+
v_round(v_absdiff(v_load_f32(in + x + 9*nlanes/4), s1))),
1154+
v_pack(v_round(v_absdiff(v_load_f32(in + x + 5*nlanes/2), s2)),
1155+
v_round(v_absdiff(v_load_f32(in + x + 11*nlanes/4), s3)))));
1156+
}
1157+
1158+
if (x < length && (in != out))
1159+
{
1160+
x = length - 3 * nlanes;
1161+
continue; // process unaligned tail
1162+
}
1163+
break;
1164+
}
1165+
return x;
1166+
}
1167+
1168+
template<typename T>
1169+
CV_ALWAYS_INLINE int absdiffc_simd_channels(const T in[], const float scalar[], T out[],
1170+
const int width, int chan)
1171+
{
1172+
int length = width * chan;
1173+
v_float32 s = vx_load(scalar);
1174+
1175+
return absdiffc_simd_c1c2c4(in, out, s, length);
1176+
}
1177+
1178+
template<typename T>
1179+
CV_ALWAYS_INLINE int absdiffc_simd_c3(const T in[], const float scalar[], T out[], int width)
1180+
{
1181+
constexpr int chan = 3;
1182+
int length = width * chan;
1183+
1184+
v_float32 s1 = vx_load(scalar);
1185+
#if CV_SIMD_WIDTH == 32
1186+
v_float32 s2 = vx_load(scalar + 2);
1187+
v_float32 s3 = vx_load(scalar + 1);
1188+
#else
1189+
v_float32 s2 = vx_load(scalar + 1);
1190+
v_float32 s3 = vx_load(scalar + 2);
1191+
#endif
1192+
1193+
return absdiffc_simd_c3_impl(in, out, s1, s2, s3, length);
1194+
}
1195+
1196+
template<typename T>
1197+
CV_ALWAYS_INLINE int absdiffc_simd(const T in[], const float scalar[], T out[], int width, int chan)
1198+
{
1199+
switch (chan)
1200+
{
1201+
case 1:
1202+
case 2:
1203+
case 4:
1204+
return absdiffc_simd_channels(in, scalar, out, width, chan);
1205+
case 3:
1206+
return absdiffc_simd_c3(in, scalar, out, width);
1207+
default:
1208+
break;
1209+
}
1210+
1211+
return 0;
1212+
}
1213+
#endif // CV_SIMD
1214+
1215+
template<typename DST, typename SRC>
1216+
static void run_absdiffc(Buffer &dst, const View &src, const float scalar[])
1217+
{
1218+
const auto *in = src.InLine<SRC>(0);
1219+
auto *out = dst.OutLine<DST>();
1220+
1221+
int width = dst.length();
1222+
int chan = dst.meta().chan;
1223+
1224+
int w = 0;
1225+
#if CV_SIMD
1226+
w = absdiffc_simd(in, scalar, out, width, chan);
1227+
#endif
1228+
1229+
for (; w < width*chan; ++w)
1230+
out[w] = absdiff<DST>(in[w], scalar[w%chan]);
1231+
}
1232+
9751233
template<typename DST, typename SRC>
9761234
static void run_arithm_s(Buffer &dst, const View &src, const float scalar[4], Arithm arithm,
9771235
float scale=1)
@@ -990,11 +1248,6 @@ static void run_arithm_s(Buffer &dst, const View &src, const float scalar[4], Ar
9901248

9911249
switch (arithm)
9921250
{
993-
case ARITHM_ABSDIFF:
994-
for (int w=0; w < width; w++)
995-
for (int c=0; c < chan; c++)
996-
out[chan*w + c] = absdiff<DST>(in[chan*w + c], scalar[c]);
997-
break;
9981251
case ARITHM_ADD:
9991252
if (usemyscal)
10001253
{
@@ -1089,26 +1342,47 @@ static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], A
10891342
}
10901343
}
10911344

1092-
GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, false)
1345+
GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
10931346
{
10941347
static const int Window = 1;
10951348

1096-
static void run(const View &src, const cv::Scalar &_scalar, Buffer &dst)
1349+
static void run(const View &src, const cv::Scalar& _scalar, Buffer &dst, Buffer& scratch)
10971350
{
1098-
const float scalar[4] = {
1099-
static_cast<float>(_scalar[0]),
1100-
static_cast<float>(_scalar[1]),
1101-
static_cast<float>(_scalar[2]),
1102-
static_cast<float>(_scalar[3])
1103-
};
1351+
if (dst.y() == 0)
1352+
{
1353+
const int chan = src.meta().chan;
1354+
float* sc = scratch.OutLine<float>();
1355+
1356+
for (int i = 0; i < scratch.length(); ++i)
1357+
sc[i] = static_cast<float>(_scalar[i % chan]);
1358+
}
1359+
1360+
const float* scalar = scratch.OutLine<float>();
11041361

11051362
// DST SRC OP __VA_ARGS__
1106-
UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
1107-
UNARY_(ushort, ushort, run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
1108-
UNARY_( short, short, run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
1363+
UNARY_(uchar, uchar, run_absdiffc, dst, src, scalar);
1364+
UNARY_(ushort, ushort, run_absdiffc, dst, src, scalar);
1365+
UNARY_(short, short, run_absdiffc, dst, src, scalar);
11091366

11101367
CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
11111368
}
1369+
1370+
static void initScratch(const GMatDesc&, const GScalarDesc&, Buffer& scratch)
1371+
{
1372+
#if CV_SIMD
1373+
constexpr int buflen = static_cast<int>(v_float32::nlanes) + 2; // buffer size
1374+
#else
1375+
constexpr int buflen = 4;
1376+
#endif
1377+
cv::Size bufsize(buflen, 1);
1378+
GMatDesc bufdesc = { CV_32F, 1, bufsize };
1379+
Buffer buffer(bufdesc);
1380+
scratch = std::move(buffer);
1381+
}
1382+
1383+
static void resetScratch(Buffer& /* scratch */)
1384+
{
1385+
}
11121386
};
11131387

11141388
GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, false)

modules/gapi/test/cpu/gapi_core_tests_fluid.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,9 @@ INSTANTIATE_TEST_CASE_P(AbsDiffTestFluid, AbsDiffTest,
105105
Values(CORE_FLUID)));
106106

107107
INSTANTIATE_TEST_CASE_P(AbsDiffCTestFluid, AbsDiffCTest,
108-
Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
108+
Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_8UC2,
109+
CV_16UC2, CV_16SC2, CV_8UC3, CV_16UC3,
110+
CV_16SC3, CV_8UC4, CV_16UC4, CV_16SC4),
109111
Values(cv::Size(1280, 720),
110112
cv::Size(640, 480),
111113
cv::Size(128, 128)),

0 commit comments

Comments
 (0)