@@ -97,7 +97,7 @@ static inline DST divr(SRC1 x, SRC2 y, float scale=1)
 // Fluid kernels: addWeighted
 //
 // ---------------------------
-#if CV_SSE2
+#if CV_SIMD
 CV_ALWAYS_INLINE v_float32 v_load_f32(const ushort* in)
 {
     return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(in)));
@@ -112,7 +112,9 @@ CV_ALWAYS_INLINE v_float32 v_load_f32(const uchar* in)
 {
     return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(in)));
 }
+#endif
 
+#if CV_SSE2
 CV_ALWAYS_INLINE void addw_short_store(short* out, const v_int32& c1, const v_int32& c2)
 {
     vx_store(out, v_pack(c1, c2));
@@ -972,6 +974,262 @@ static void run_arithm_s(DST out[], const SRC in[], int width, int chan,
         CV_Error(cv::Error::StsBadArg, "unsupported number of channels");
 }
 
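+// SIMD (universal intrinsics) helpers for absdiff with a scalar (absDiffC).
+// The absdiffc_simd* routines below return the number of elements already
+// processed; the caller finishes the rest of the row with scalar code.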
+#if CV_SIMD
+CV_ALWAYS_INLINE void absdiffc_short_store_c1c2c4(short* out_ptr, const v_int32& c1, const v_int32& c2)
+{
+    vx_store(out_ptr, v_pack(c1, c2));
+}
+
+CV_ALWAYS_INLINE void absdiffc_short_store_c1c2c4(ushort* out_ptr, const v_int32& c1, const v_int32& c2)
+{
+    vx_store(out_ptr, v_pack_u(c1, c2));
+}
+
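+// 1-, 2- and 4-channel case: the channel count always divides the vector width,
+// so a single scalar vector s matches every iteration. The last (possibly
+// partial) vector is handled by stepping back to length - nlanes and re-running
+// one overlapping iteration; this is done only when in != out, since otherwise
+// the overlap would re-read results that were already written.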
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd_c1c2c4(const T in[], T out[],
+                                          const v_float32& s, const int length)
+{
+    static_assert((std::is_same<T, ushort>::value) || (std::is_same<T, short>::value),
+                  "This templated overload is only for short or ushort type combinations.");
+
+    constexpr int nlanes = (std::is_same<T, ushort>::value) ? static_cast<int>(v_uint16::nlanes) :
+                                                              static_cast<int>(v_int16::nlanes);
+    if (length < nlanes)
+        return 0;
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = v_load_f32(in + x);
+            v_float32 a2 = v_load_f32(in + x + nlanes / 2);
+
+            absdiffc_short_store_c1c2c4(&out[x], v_round(v_absdiff(a1, s)),
+                                                 v_round(v_absdiff(a2, s)));
+        }
+
+        if (x < length && (in != out))
+        {
+            x = length - nlanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+template<>
+CV_ALWAYS_INLINE int absdiffc_simd_c1c2c4<uchar>(const uchar in[], uchar out[],
+                                                 const v_float32& s, const int length)
+{
+    constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
+
+    if (length < nlanes)
+        return 0;
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = v_load_f32(in + x);
+            v_float32 a2 = v_load_f32(in + x + nlanes / 4);
+            v_float32 a3 = v_load_f32(in + x + nlanes / 2);
+            v_float32 a4 = v_load_f32(in + x + 3 * nlanes / 4);
+
+            vx_store(&out[x], v_pack_u(v_pack(v_round(v_absdiff(a1, s)),
+                                              v_round(v_absdiff(a2, s))),
+                                       v_pack(v_round(v_absdiff(a3, s)),
+                                              v_round(v_absdiff(a4, s)))));
+        }
+
+        if (x < length && (in != out))
+        {
+            x = length - nlanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+CV_ALWAYS_INLINE void absdiffc_short_store_c3(short* out_ptr, const v_int32& c1,
+                                              const v_int32& c2, const v_int32& c3,
+                                              const v_int32& c4, const v_int32& c5,
+                                              const v_int32& c6)
+{
+    constexpr int nlanes = static_cast<int>(v_int16::nlanes);
+    vx_store(out_ptr,              v_pack(c1, c2));
+    vx_store(out_ptr +     nlanes, v_pack(c3, c4));
+    vx_store(out_ptr + 2 * nlanes, v_pack(c5, c6));
+}
+
+CV_ALWAYS_INLINE void absdiffc_short_store_c3(ushort* out_ptr, const v_int32& c1,
+                                              const v_int32& c2, const v_int32& c3,
+                                              const v_int32& c4, const v_int32& c5,
+                                              const v_int32& c6)
+{
+    constexpr int nlanes = static_cast<int>(v_uint16::nlanes);
+    vx_store(out_ptr,              v_pack_u(c1, c2));
+    vx_store(out_ptr +     nlanes, v_pack_u(c3, c4));
+    vx_store(out_ptr + 2 * nlanes, v_pack_u(c5, c6));
+}
+
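+// 3-channel case for short/ushort: one iteration consumes 3 * nlanes interleaved
+// values, so the scalar pattern rotates across the three pre-shifted vectors
+// s1, s2, s3 prepared by absdiffc_simd_c3() below.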
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd_c3_impl(const T in[], T out[],
+                                           const v_float32& s1, const v_float32& s2,
+                                           const v_float32& s3, const int length)
+{
+    static_assert((std::is_same<T, ushort>::value) || (std::is_same<T, short>::value),
+                  "This templated overload is only for short or ushort type combinations.");
+
+    constexpr int nlanes = (std::is_same<T, ushort>::value) ? static_cast<int>(v_uint16::nlanes) :
+                                                              static_cast<int>(v_int16::nlanes);
+
+    if (length < 3 * nlanes)
+        return 0;
+
+    int x = 0;
+    for (;;)
+    {
+        for (; x <= length - 3 * nlanes; x += 3 * nlanes)
+        {
+            v_float32 a1 = v_load_f32(in + x);
+            v_float32 a2 = v_load_f32(in + x + nlanes / 2);
+            v_float32 a3 = v_load_f32(in + x + nlanes);
+            v_float32 a4 = v_load_f32(in + x + 3 * nlanes / 2);
+            v_float32 a5 = v_load_f32(in + x + 2 * nlanes);
+            v_float32 a6 = v_load_f32(in + x + 5 * nlanes / 2);
+
+            absdiffc_short_store_c3(&out[x], v_round(v_absdiff(a1, s1)),
+                                             v_round(v_absdiff(a2, s2)),
+                                             v_round(v_absdiff(a3, s3)),
+                                             v_round(v_absdiff(a4, s1)),
+                                             v_round(v_absdiff(a5, s2)),
+                                             v_round(v_absdiff(a6, s3)));
+        }
+
+        if (x < length && (in != out))
+        {
+            x = length - 3 * nlanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+template<>
+CV_ALWAYS_INLINE int absdiffc_simd_c3_impl<uchar>(const uchar in[], uchar out[],
+                                                  const v_float32& s1, const v_float32& s2,
+                                                  const v_float32& s3, const int length)
+{
+    constexpr int nlanes = static_cast<int>(v_uint8::nlanes);
+
+    if (length < 3 * nlanes)
+        return 0;
+
+    int x = 0;
+
+    for (;;)
+    {
+        for (; x <= length - 3 * nlanes; x += 3 * nlanes)
+        {
+            vx_store(&out[x],
+                     v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x), s1)),
+                                     v_round(v_absdiff(v_load_f32(in + x + nlanes / 4), s2))),
+                              v_pack(v_round(v_absdiff(v_load_f32(in + x + nlanes / 2), s3)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 3 * nlanes / 4), s1)))));
+
+            vx_store(&out[x + nlanes],
+                     v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x + nlanes), s2)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 5 * nlanes / 4), s3))),
+                              v_pack(v_round(v_absdiff(v_load_f32(in + x + 3 * nlanes / 2), s1)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 7 * nlanes / 4), s2)))));
+
+            vx_store(&out[x + 2 * nlanes],
+                     v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x + 2 * nlanes), s3)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 9 * nlanes / 4), s1))),
+                              v_pack(v_round(v_absdiff(v_load_f32(in + x + 5 * nlanes / 2), s2)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 11 * nlanes / 4), s3)))));
+        }
+
+        if (x < length && (in != out))
+        {
+            x = length - 3 * nlanes;
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;
+}
+
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd_channels(const T in[], const float scalar[], T out[],
+                                            const int width, int chan)
+{
+    int length = width * chan;
+    v_float32 s = vx_load(scalar);
+
+    return absdiffc_simd_c1c2c4(in, out, s, length);
+}
+
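+// Prepare three views of the replicated scalar, one per starting channel phase:
+// every v_float32 load advances the channel phase by v_float32::nlanes % 3
+// (2 for 256-bit SIMD, 1 for 128-bit), hence the different +1 / +2 offsets below.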
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd_c3(const T in[], const float scalar[], T out[], int width)
+{
+    constexpr int chan = 3;
+    int length = width * chan;
+
+    v_float32 s1 = vx_load(scalar);
+#if CV_SIMD_WIDTH == 32
+    v_float32 s2 = vx_load(scalar + 2);
+    v_float32 s3 = vx_load(scalar + 1);
+#else
+    v_float32 s2 = vx_load(scalar + 1);
+    v_float32 s3 = vx_load(scalar + 2);
+#endif
+
+    return absdiffc_simd_c3_impl(in, out, s1, s2, s3, length);
+}
+
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd(const T in[], const float scalar[], T out[], int width, int chan)
+{
+    switch (chan)
+    {
+    case 1:
+    case 2:
+    case 4:
+        return absdiffc_simd_channels(in, scalar, out, width, chan);
+    case 3:
+        return absdiffc_simd_c3(in, scalar, out, width);
+    default:
+        break;
+    }
+
+    return 0;
+}
+#endif  // CV_SIMD
+
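+// Row entry point used by the fluid kernel: the SIMD path handles the bulk of
+// the row (when CV_SIMD is available), then a scalar loop finishes whatever is
+// left of the width * chan elements.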
+template<typename DST, typename SRC>
+static void run_absdiffc(Buffer &dst, const View &src, const float scalar[])
+{
+    const auto *in  = src.InLine<SRC>(0);
+          auto *out = dst.OutLine<DST>();
+
+    int width = dst.length();
+    int chan  = dst.meta().chan;
+
+    int w = 0;
+#if CV_SIMD
+    w = absdiffc_simd(in, scalar, out, width, chan);
+#endif
+
+    for (; w < width*chan; ++w)
+        out[w] = absdiff<DST>(in[w], scalar[w%chan]);
+}
+
 template<typename DST, typename SRC>
 static void run_arithm_s(Buffer &dst, const View &src, const float scalar[4], Arithm arithm,
                          float scale=1)
@@ -990,11 +1248,6 @@ static void run_arithm_s(Buffer &dst, const View &src, const float scalar[4], Ar
 
     switch (arithm)
     {
-    case ARITHM_ABSDIFF:
-        for (int w=0; w < width; w++)
-            for (int c=0; c < chan; c++)
-                out[chan*w + c] = absdiff<DST>(in[chan*w + c], scalar[c]);
-        break;
     case ARITHM_ADD:
         if (usemyscal)
         {
@@ -1089,26 +1342,47 @@ static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], A
     }
 }
 
-GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, false)
+GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)
 {
     static const int Window = 1;
 
-    static void run(const View &src, const cv::Scalar &_scalar, Buffer &dst)
+    static void run(const View &src, const cv::Scalar &_scalar, Buffer &dst, Buffer &scratch)
     {
-        const float scalar[4] = {
-            static_cast<float>(_scalar[0]),
-            static_cast<float>(_scalar[1]),
-            static_cast<float>(_scalar[2]),
-            static_cast<float>(_scalar[3])
-        };
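+        // On the first row only, fill the scratch buffer with the scalar values
+        // replicated cyclically over the channels: sc[i] = _scalar[i % chan].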
+        if (dst.y() == 0)
+        {
+            const int chan = src.meta().chan;
+            float *sc = scratch.OutLine<float>();
+
+            for (int i = 0; i < scratch.length(); ++i)
+                sc[i] = static_cast<float>(_scalar[i % chan]);
+        }
+
+        const float *scalar = scratch.OutLine<float>();
 
         //     DST     SRC     OP            __VA_ARGS__
-        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
-        UNARY_(ushort, ushort, run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
-        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
+        UNARY_(uchar , uchar , run_absdiffc, dst, src, scalar);
+        UNARY_(ushort, ushort, run_absdiffc, dst, src, scalar);
+        UNARY_( short,  short, run_absdiffc, dst, src, scalar);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
+
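+    // The scratch buffer holds the scalar replicated over the channels (filled
+    // on the first row in run() above). It is v_float32::nlanes + 2 floats long
+    // so that the shifted loads vx_load(scalar + 1) / vx_load(scalar + 2) used
+    // by the 3-channel path stay inside the buffer.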
+    static void initScratch(const GMatDesc&, const GScalarDesc&, Buffer &scratch)
+    {
+#if CV_SIMD
+        constexpr int buflen = static_cast<int>(v_float32::nlanes) + 2; // buffer size
+#else
+        constexpr int buflen = 4;
+#endif
+        cv::Size bufsize(buflen, 1);
+        GMatDesc bufdesc = {CV_32F, 1, bufsize};
+        Buffer buffer(bufdesc);
+        scratch = std::move(buffer);
+    }
+
+    static void resetScratch(Buffer& /* scratch */)
+    {
+    }
 };
 
 GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, false)
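
For context, a minimal usage sketch (not part of the patch; it assumes the standard
G-API headers and the fluid core kernel package): cv::gapi::absDiffC() creates a
GAbsDiffC node, and compiling the graph with the fluid kernels selects the
GFluidAbsDiffC implementation above for it.

    #include <opencv2/gapi.hpp>
    #include <opencv2/gapi/core.hpp>
    #include <opencv2/gapi/fluid/core.hpp>

    int main()
    {
        // Graph: absolute difference of an image and a per-channel scalar
        cv::GMat in;
        cv::GScalar c;
        cv::GComputation graph(cv::GIn(in, c), cv::GOut(cv::gapi::absDiffC(in, c)));

        // Run it with the fluid backend so GFluidAbsDiffC is used
        cv::Mat src(480, 640, CV_8UC3, cv::Scalar::all(64)), dst;
        graph.apply(cv::gin(src, cv::Scalar(1, 2, 3)), cv::gout(dst),
                    cv::compile_args(cv::gapi::core::fluid::kernels()));
        return 0;
    }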