Merge pull request opencv#19233 from anna-khakimova:ak/simd_absdiffc

Anna Khakimova · web-flow · commit 7ab3a80d0ab7 · 2021-02-08T13:02:35.000Z
GAPI: SIMD optimization for AbsDiffC kernel

* SIMD optimization for AbsDiffC kernel

* Applied comments

* Applying comments and refactoring: Remove new universal intrinsics.

* Performance experiment

* Applied comments. Step2

* Applied comments. Step3
diff --git a/modules/gapi/include/opencv2/gapi/core.hpp b/modules/gapi/include/opencv2/gapi/core.hpp
@@ -298,8 +298,8 @@ namespace core {
         }
     };
 
-    G_TYPED_KERNEL(GAbsDiffC, <GMat(GMat, GScalar)>, "org.opencv.core.matrixop.absdiffC") {
-        static GMatDesc outMeta(GMatDesc a, GScalarDesc) {
+    G_TYPED_KERNEL(GAbsDiffC, <GMat(GMat,GScalar)>, "org.opencv.core.matrixop.absdiffC") {
+        static GMatDesc outMeta(const GMatDesc& a, const GScalarDesc&) {  // descriptors are now taken by const reference; the scalar descriptor is unnamed and unused, and `a` is returned as the output descriptor
             return a;
         }
     };
diff --git a/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp b/modules/gapi/perf/cpu/gapi_core_perf_tests_fluid.cpp
@@ -147,7 +147,9 @@ INSTANTIATE_TEST_CASE_P(AbsDiffPerfTestFluid, AbsDiffPerfTest,
 
 INSTANTIATE_TEST_CASE_P(AbsDiffCPerfTestFluid, AbsDiffCPerfTest,
     Combine(Values(szSmall128, szVGA, sz720p, sz1080p),
-            Values(CV_8UC1, CV_8UC3, CV_16UC1, CV_16SC1),
+            Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_8UC2,
+                   CV_16UC2, CV_16SC2, CV_8UC3, CV_16UC3,
+                   CV_16SC3, CV_8UC4, CV_16UC4, CV_16SC4),
             Values(cv::compile_args(CORE_FLUID))));
 
 // INSTANTIATE_TEST_CASE_P(SumPerfTestFluid, SumPerfTest,
diff --git a/modules/gapi/src/backends/fluid/gfluidcore.cpp b/modules/gapi/src/backends/fluid/gfluidcore.cpp
@@ -97,7 +97,7 @@ static inline DST divr(SRC1 x, SRC2 y, float scale=1)
 // Fluid kernels: addWeighted
 //
 //---------------------------
-#if CV_SSE2
+#if CV_SIMD
 CV_ALWAYS_INLINE v_float32 v_load_f32(const ushort* in)
 {
     return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand(in)));
@@ -112,7 +112,9 @@ CV_ALWAYS_INLINE v_float32 v_load_f32(const uchar* in)
 {
     return v_cvt_f32(v_reinterpret_as_s32(vx_load_expand_q(in)));
 }
+#endif
 
+#if CV_SSE2
 CV_ALWAYS_INLINE void addw_short_store(short* out, const v_int32& c1, const v_int32& c2)
 {
     vx_store(out, v_pack(c1, c2));
@@ -972,6 +974,262 @@ static void run_arithm_s(DST out[], const SRC in[], int width, int chan,
         CV_Error(cv::Error::StsBadArg, "unsupported number of channels");
 }
 
+#if CV_SIMD
+CV_ALWAYS_INLINE void absdiffc_short_store_c1c2c4(short* out_ptr, const v_int32& c1, const v_int32& c2)  // store helper for signed 16-bit output
+{
+    vx_store(out_ptr, v_pack(c1, c2));  // narrow c1 followed by c2 into one 16-bit vector (v_pack) and write it at out_ptr
+}
+
+CV_ALWAYS_INLINE void absdiffc_short_store_c1c2c4(ushort* out_ptr, const v_int32& c1, const v_int32& c2)  // store helper for unsigned 16-bit output
+{
+    vx_store(out_ptr, v_pack_u(c1, c2));  // narrow c1 followed by c2 from signed 32-bit to unsigned 16-bit (v_pack_u) and write the vector at out_ptr
+}
+
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd_c1c2c4(const T in[], T out[],  // SIMD pass writing out[i] = round(|in[i] - s|) lane-wise for 16-bit rows; returns the number of leading elements written
+                                          const v_float32& s, const int length)
+{
+    static_assert((std::is_same<T, ushort>::value) || (std::is_same<T, short>::value),
+                  "This templated overload is only for short or ushort type combinations.");
+
+    constexpr int nlanes = (std::is_same<T, ushort>::value) ? static_cast<int>(v_uint16::nlanes) :  // number of 16-bit elements consumed per iteration
+                                                              static_cast<int>(v_int16::nlanes);
+    if (length < nlanes)
+        return 0;  // not even one full vector: report nothing processed
+
+    int x = 0;
+    for (;;)  // runs the inner loop once, then at most one more time for the tail
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = v_load_f32(in + x);  // first half of the block, widened to float
+            v_float32 a2 = v_load_f32(in + x + nlanes / 2);  // second half of the block
+
+            absdiffc_short_store_c1c2c4(&out[x], v_round(v_absdiff(a1, s)),  // round both halves to int32 and store them as one 16-bit vector
+                                                 v_round(v_absdiff(a2, s)));
+        }
+
+        if (x < length && (in != out))  // leftover elements; redone only when not in-place, since the overlapped inputs would already be overwritten
+        {
+            x = length - nlanes;  // step back so the final vector ends exactly at `length`
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;  // == length after a tail pass, otherwise the end of the last full vector
+}
+
+template<>
+CV_ALWAYS_INLINE int absdiffc_simd_c1c2c4<uchar>(const uchar in[], uchar out[],  // 8-bit specialization: out[i] = round(|in[i] - s|) lane-wise; returns the number of leading elements written
+                                                 const v_float32& s, const int length)
+{
+    constexpr int nlanes = static_cast<int>(v_uint8::nlanes);  // number of 8-bit elements consumed per iteration
+
+    if (length < nlanes)
+        return 0;  // not even one full vector: report nothing processed
+
+    int x = 0;
+    for (;;)  // runs the inner loop once, then at most one more time for the tail
+    {
+        for (; x <= length - nlanes; x += nlanes)
+        {
+            v_float32 a1 = v_load_f32(in + x);  // four quarter-blocks, each widened from 8-bit to float
+            v_float32 a2 = v_load_f32(in + x + nlanes / 4);
+            v_float32 a3 = v_load_f32(in + x + nlanes / 2);
+            v_float32 a4 = v_load_f32(in + x + 3 * nlanes / 4);
+
+            vx_store(&out[x], v_pack_u(v_pack(v_round(v_absdiff(a1, s)),  // int32 -> int16 (v_pack) -> uint8 (v_pack_u), then a single 8-bit store
+                                              v_round(v_absdiff(a2, s))),
+                                       v_pack(v_round(v_absdiff(a3, s)),
+                                              v_round(v_absdiff(a4, s)))));
+        }
+
+        if (x < length && (in != out))  // leftover elements; redone only when not in-place, since the overlapped inputs would already be overwritten
+        {
+            x = length - nlanes;  // step back so the final vector ends exactly at `length`
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;  // == length after a tail pass, otherwise the end of the last full vector
+}
+
+CV_ALWAYS_INLINE void absdiffc_short_store_c3(short* out_ptr, const v_int32& c1,  // store helper for signed 16-bit output: writes three consecutive vectors from six int32 vectors
+                                              const v_int32& c2, const v_int32& c3,
+                                              const v_int32& c4, const v_int32& c5,
+                                              const v_int32& c6)
+{
+    constexpr int nlanes = static_cast<int>(v_int16::nlanes);  // distance in elements between consecutive stores
+    vx_store(out_ptr, v_pack(c1, c2));  // elements [0, nlanes)
+    vx_store(out_ptr + nlanes, v_pack(c3, c4));  // elements [nlanes, 2*nlanes)
+    vx_store(out_ptr + 2*nlanes, v_pack(c5, c6));  // elements [2*nlanes, 3*nlanes)
+}
+
+CV_ALWAYS_INLINE void absdiffc_short_store_c3(ushort* out_ptr, const v_int32& c1,  // store helper for unsigned 16-bit output: writes three consecutive vectors from six int32 vectors
+                                              const v_int32& c2, const v_int32& c3,
+                                              const v_int32& c4, const v_int32& c5,
+                                              const v_int32& c6)
+{
+    constexpr int nlanes = static_cast<int>(v_uint16::nlanes);  // distance in elements between consecutive stores
+    vx_store(out_ptr, v_pack_u(c1, c2));  // elements [0, nlanes), narrowed signed 32-bit -> unsigned 16-bit
+    vx_store(out_ptr + nlanes, v_pack_u(c3, c4));  // elements [nlanes, 2*nlanes)
+    vx_store(out_ptr + 2*nlanes, v_pack_u(c5, c6));  // elements [2*nlanes, 3*nlanes)
+}
+
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd_c3_impl(const T in[], T out[],  // 16-bit SIMD pass that rotates through s1, s2, s3 on consecutive float vectors; returns the number of leading elements written
+                                           const v_float32& s1, const v_float32& s2,
+                                           const v_float32& s3, const int length)
+{
+    static_assert((std::is_same<T, ushort>::value) || (std::is_same<T, short>::value),
+                  "This templated overload is only for short or ushort type combinations.");
+
+    constexpr int nlanes = (std::is_same<T, ushort>::value) ? static_cast<int>(v_uint16::nlanes):  // 16-bit elements per vector; one iteration consumes 3 * nlanes
+                                                              static_cast<int>(v_int16::nlanes);
+
+    if (length < 3 * nlanes)
+        return 0;  // less than one full 3-vector group: report nothing processed
+
+    int x = 0;
+    for (;;)  // runs the inner loop once, then at most one more time for the tail
+    {
+        for (; x <= length - 3 * nlanes; x += 3 * nlanes)
+        {
+            v_float32 a1 = v_load_f32(in + x);  // six half-vector loads covering 3 * nlanes consecutive elements
+            v_float32 a2 = v_load_f32(in + x + nlanes / 2);
+            v_float32 a3 = v_load_f32(in + x + nlanes);
+            v_float32 a4 = v_load_f32(in + x + 3 * nlanes / 2);
+            v_float32 a5 = v_load_f32(in + x + 2 * nlanes);
+            v_float32 a6 = v_load_f32(in + x + 5 * nlanes / 2);
+
+            absdiffc_short_store_c3(&out[x], v_round(v_absdiff(a1, s1)),
+                                             v_round(v_absdiff(a2, s2)),
+                                             v_round(v_absdiff(a3, s3)),
+                                             v_round(v_absdiff(a4, s1)),  // the scalar-vector rotation restarts at s1 for the fourth load
+                                             v_round(v_absdiff(a5, s2)),
+                                             v_round(v_absdiff(a6, s3)));
+        }
+
+        if (x < length && (in != out))  // leftover elements; redone only when not in-place, since the overlapped inputs would already be overwritten
+        {
+            x = length - 3 * nlanes;  // step back so the final group ends exactly at `length`
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;  // == length after a tail pass, otherwise the end of the last full group
+}
+
+template<>
+CV_ALWAYS_INLINE int absdiffc_simd_c3_impl<uchar>(const uchar in[], uchar out[],  // 8-bit specialization that rotates through s1, s2, s3 on consecutive float vectors; returns the number of leading elements written
+                                                  const v_float32& s1, const v_float32& s2,
+                                                  const v_float32& s3, const int length)
+{
+    constexpr int nlanes = static_cast<int>(v_uint8::nlanes);  // 8-bit elements per vector; one iteration consumes 3 * nlanes
+
+    if (length < 3 * nlanes)
+        return 0;  // less than one full 3-vector group: report nothing processed
+
+    int x = 0;
+
+    for (;;)  // runs the inner loop once, then at most one more time for the tail
+    {
+        for (; x <= length - 3 * nlanes; x += 3 * nlanes)
+        {
+            vx_store(&out[x],  // first output vector: quarter-loads 0..3 paired with s1, s2, s3, s1
+                     v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x), s1)),
+                                     v_round(v_absdiff(v_load_f32(in + x + nlanes/4), s2))),
+                              v_pack(v_round(v_absdiff(v_load_f32(in + x + nlanes/2), s3)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 3*nlanes/4), s1)))));
+
+            vx_store(&out[x + nlanes],  // second output vector: quarter-loads 4..7 paired with s2, s3, s1, s2
+                     v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x + nlanes), s2)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 5*nlanes/4), s3))),
+                              v_pack(v_round(v_absdiff(v_load_f32(in + x + 3*nlanes/2), s1)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 7*nlanes/4), s2)))));
+
+            vx_store(&out[x + 2 * nlanes],  // third output vector: quarter-loads 8..11 paired with s3, s1, s2, s3
+                     v_pack_u(v_pack(v_round(v_absdiff(v_load_f32(in + x + 2*nlanes), s3)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 9*nlanes/4), s1))),
+                              v_pack(v_round(v_absdiff(v_load_f32(in + x + 5*nlanes/2), s2)),
+                                     v_round(v_absdiff(v_load_f32(in + x + 11*nlanes/4), s3)))));
+        }
+
+        if (x < length && (in != out))  // leftover elements; redone only when not in-place, since the overlapped inputs would already be overwritten
+        {
+            x = length - 3 * nlanes;  // step back so the final group ends exactly at `length`
+            continue;  // process unaligned tail
+        }
+        break;
+    }
+    return x;  // == length after a tail pass, otherwise the end of the last full group
+}
+
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd_channels(const T in[], const float scalar[], T out[],  // entry used for 1, 2 and 4 channels: one scalar vector serves the whole row
+                                            const int width, int chan)
+{
+    int length = width * chan;  // total number of elements in the row
+    v_float32 s = vx_load(scalar);  // first v_float32::nlanes floats of scalar[]; NOTE(review): relies on scalar[] repeating the per-channel values so every vector sees the same pattern - confirm against the code that fills it
+
+    return absdiffc_simd_c1c2c4(in, out, s, length);  // number of leading elements written by the SIMD pass
+}
+
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd_c3(const T in[], const float scalar[], T out[], int width)  // 3-channel entry: builds three scalar vectors at different offsets into scalar[] and runs the c3 SIMD pass
+{
+    constexpr int chan = 3;
+    int length = width * chan;  // total number of elements in the row
+
+    v_float32 s1 = vx_load(scalar);  // pattern for a float vector that starts on channel 0
+#if CV_SIMD_WIDTH == 32
+    v_float32 s2 = vx_load(scalar + 2);  // presumably 8 float lanes here: the second vector starts at element 8, and 8 % 3 == 2
+    v_float32 s3 = vx_load(scalar + 1);  // the third vector starts at element 16, and 16 % 3 == 1
+#else
+    v_float32 s2 = vx_load(scalar + 1);  // with 4 (or 16) float lanes the second vector starts on channel 1
+    v_float32 s3 = vx_load(scalar + 2);  // and the third on channel 2; reads up to scalar[nlanes + 1], so scalar[] must hold at least nlanes + 2 floats
+#endif
+
+    return absdiffc_simd_c3_impl(in, out, s1, s2, s3, length);  // number of leading elements written by the SIMD pass
+}
+
+template<typename T>
+CV_ALWAYS_INLINE int absdiffc_simd(const T in[], const float scalar[], T out[], int width, int chan)  // dispatches on channel count; returns the number of leading elements written by SIMD code
+{
+    switch (chan)
+    {
+    case 1:
+    case 2:
+    case 4:
+        return absdiffc_simd_channels(in, scalar, out, width, chan);  // 1, 2 and 4 channels share one implementation
+    case 3:
+        return absdiffc_simd_c3(in, scalar, out, width);  // 3 channels need rotated scalar vectors
+    default:
+        break;
+    }
+
+    return 0;  // any other channel count: nothing processed here
+}
+#endif  // CV_SIMD
+
+template<typename DST, typename SRC>
+static void run_absdiffc(Buffer &dst, const View &src, const float scalar[])  // computes one output line: out[i] = absdiff<DST>(in[i], scalar[i % chan])
+{
+    const auto *in = src.InLine<SRC>(0);  // input line 0 of the view
+    auto *out = dst.OutLine<DST>();  // output line to fill
+
+    int width = dst.length();
+    int chan = dst.meta().chan;
+
+    int w = 0;  // index of the first element not yet written
+#if CV_SIMD
+    w = absdiffc_simd(in, scalar, out, width, chan);  // vector path; returns how many leading elements it wrote (possibly 0)
+#endif
+
+    for (; w < width*chan; ++w)  // scalar loop finishes whatever the vector path left, or the whole line without SIMD
+        out[w] = absdiff<DST>(in[w], scalar[w%chan]);
+}
+
 template<typename DST, typename SRC>
 static void run_arithm_s(Buffer &dst, const View &src, const float scalar[4], Arithm arithm,
                          float scale=1)
@@ -990,11 +1248,6 @@ static void run_arithm_s(Buffer &dst, const View &src, const float scalar[4], Ar
 
     switch (arithm)
     {
-    case ARITHM_ABSDIFF:
-        for (int w=0; w < width; w++)
-            for (int c=0; c < chan; c++)
-                out[chan*w + c] = absdiff<DST>(in[chan*w + c], scalar[c]);
-        break;
     case ARITHM_ADD:
         if (usemyscal)
         {
@@ -1089,26 +1342,47 @@ static void run_arithm_rs(Buffer &dst, const View &src, const float scalar[4], A
     }
 }
 
-GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, false)
+GAPI_FLUID_KERNEL(GFluidAbsDiffC, cv::gapi::core::GAbsDiffC, true)  // third argument becomes true together with the new scratch parameter and initScratch/resetScratch; presumably the "uses scratch" flag - confirm against GAPI_FLUID_KERNEL
 {
     static const int Window = 1;
 
-    static void run(const View &src, const cv::Scalar &_scalar, Buffer &dst)
+    static void run(const View &src, const cv::Scalar& _scalar, Buffer &dst, Buffer& scratch)  // processes one line; scratch carries the scalar expanded to floats
     {
-        const float scalar[4] = {
-            static_cast<float>(_scalar[0]),
-            static_cast<float>(_scalar[1]),
-            static_cast<float>(_scalar[2]),
-            static_cast<float>(_scalar[3])
-        };
+        if (dst.y() == 0)  // fill the scratch only when dst.y() is 0 (presumably the first output line - confirm)
+        {
+            const int chan = src.meta().chan;
+            float* sc = scratch.OutLine<float>();
+
+            for (int i = 0; i < scratch.length(); ++i)
+                sc[i] = static_cast<float>(_scalar[i % chan]);  // per-channel values repeated cyclically over the whole scratch
+        }
+
+        const float* scalar = scratch.OutLine<float>();  // read-only view of the expanded scalar for this call
 
         //     DST     SRC     OP            __VA_ARGS__
-        UNARY_(uchar , uchar , run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
-        UNARY_(ushort, ushort, run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
-        UNARY_( short,  short, run_arithm_s, dst, src, scalar, ARITHM_ABSDIFF);
+        UNARY_(uchar, uchar, run_absdiffc, dst, src, scalar);
+        UNARY_(ushort, ushort, run_absdiffc, dst, src, scalar);
+        UNARY_(short, short, run_absdiffc, dst, src, scalar);
 
         CV_Error(cv::Error::StsBadArg, "unsupported combination of types");
     }
+
+    static void initScratch(const GMatDesc&, const GScalarDesc&, Buffer& scratch)  // allocates the one-row float scratch; both descriptors are unused
+    {
+#if CV_SIMD
+        constexpr int buflen = static_cast<int>(v_float32::nlanes) + 2; // buffer size: one float vector plus 2 extra floats, enough for vx_load(scalar + 2) in the 3-channel path
+#else
+        constexpr int buflen = 4;  // without SIMD only scalar[w % chan] is read by run_absdiffc
+#endif
+        cv::Size bufsize(buflen, 1);
+        GMatDesc bufdesc = { CV_32F, 1, bufsize };  // CV_32F, single channel, buflen x 1
+        Buffer buffer(bufdesc);
+        scratch = std::move(buffer);
+    }
+
+    static void resetScratch(Buffer& /* scratch */)  // intentionally empty: nothing is reset here
+    {
+    }
 };
 
 GAPI_FLUID_KERNEL(GFluidAddC, cv::gapi::core::GAddC, false)
diff --git a/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp b/modules/gapi/test/cpu/gapi_core_tests_fluid.cpp
@@ -105,7 +105,9 @@ INSTANTIATE_TEST_CASE_P(AbsDiffTestFluid, AbsDiffTest,
                                 Values(CORE_FLUID)));
 
 INSTANTIATE_TEST_CASE_P(AbsDiffCTestFluid, AbsDiffCTest,
-                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1),
+                        Combine(Values(CV_8UC1, CV_16UC1, CV_16SC1, CV_8UC2,
+                                       CV_16UC2, CV_16SC2, CV_8UC3, CV_16UC3,
+                                       CV_16SC3, CV_8UC4, CV_16UC4, CV_16SC4),
                                 Values(cv::Size(1280, 720),
                                        cv::Size(640, 480),
                                        cv::Size(128, 128)),

Original file line number	Diff line number	Diff line change
`@@ -298,8 +298,8 @@ namespace core {`
`298`	`298`	`}`
`299`	`299`	`};`
`300`	`300`
`301`		`- G_TYPED_KERNEL(GAbsDiffC, <GMat(GMat, GScalar)>, "org.opencv.core.matrixop.absdiffC") {`
`302`		`- static GMatDesc outMeta(GMatDesc a, GScalarDesc) {`
	`301`	`+ G_TYPED_KERNEL(GAbsDiffC, <GMat(GMat,GScalar)>, "org.opencv.core.matrixop.absdiffC") {`
	`302`	`+ static GMatDesc outMeta(const GMatDesc& a, const GScalarDesc&) {`
`303`	`303`	`return a;`
`304`	`304`	`}`
`305`	`305`	`};`