Merge pull request opencv#26088 from plctlab:rvp_pt2

asmorkalov · web-flow · commit 7de3a8e960d7 · 2024-09-11T12:18:42.000+03:00
3rdparty: NDSRVP - Part 2: Filter
diff --git a/3rdparty/ndsrvp/include/imgproc.hpp b/3rdparty/ndsrvp/include/imgproc.hpp
@@ -5,6 +5,8 @@
 #ifndef OPENCV_NDSRVP_IMGPROC_HPP
 #define OPENCV_NDSRVP_IMGPROC_HPP
 
+struct cvhalFilter2D;
+
 namespace cv {
 
 namespace ndsrvp {
@@ -71,6 +73,34 @@ int threshold(const uchar* src_data, size_t src_step,
 #undef cv_hal_threshold
 #define cv_hal_threshold (cv::ndsrvp::threshold)
 
+// ################ filter ################
+
+int filterInit(cvhalFilter2D **context, // NOTE(review): pointer-to-pointer, presumably receives the created context — confirm in filter.cpp
+    uchar *kernel_data, size_t kernel_step,
+    int kernel_type, int kernel_width,
+    int kernel_height, int max_width, int max_height,
+    int src_type, int dst_type, int borderType,
+    double delta, int anchor_x, int anchor_y,
+    bool allowSubmatrix, bool allowInplace);
+
+#undef cv_hal_filterInit
+#define cv_hal_filterInit (cv::ndsrvp::filterInit) // replaces any earlier cv_hal_filterInit mapping with cv::ndsrvp::filterInit
+
+int filter(cvhalFilter2D *context, // NOTE(review): presumably the context obtained through filterInit — confirm in filter.cpp
+    const uchar *src_data, size_t src_step,
+    uchar *dst_data, size_t dst_step,
+    int width, int height,
+    int full_width, int full_height,
+    int offset_x, int offset_y);
+
+#undef cv_hal_filter
+#define cv_hal_filter (cv::ndsrvp::filter) // replaces any earlier cv_hal_filter mapping with cv::ndsrvp::filter
+
+int filterFree(cvhalFilter2D *context); // NOTE(review): presumably releases what filterInit set up — confirm in filter.cpp
+
+#undef cv_hal_filterFree
+#define cv_hal_filterFree (cv::ndsrvp::filterFree) // replaces any earlier cv_hal_filterFree mapping with cv::ndsrvp::filterFree
+
 } // namespace ndsrvp
 
 } // namespace cv
diff --git a/3rdparty/ndsrvp/src/cvutils.cpp b/3rdparty/ndsrvp/src/cvutils.cpp
@@ -73,6 +73,40 @@ int borderInterpolate(int p, int len, int borderType)
     return p;
 }
 
+int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType) // border-maps four 16-bit coordinates at once for an axis of length len
+{
+    int16x4_t vzero = (int16x4_t){0, 0, 0, 0};
+    int16x4_t vone = (int16x4_t){1, 1, 1, 1};
+    int16x4_t vlen = (int16x4_t){len, len, len, len}; // len broadcast to all four lanes
+    if(borderType == CV_HAL_BORDER_REPLICATE)
+        vp = (int16x4_t)__nds__bpick(0, __nds__bpick((long)(vlen - 1), (long)vp, (long)(vp >= vlen)), (long)(vp < 0)); // NOTE(review): reads as "lanes < 0 -> 0, lanes >= len -> len - 1" if bpick(a, b, mask) takes a where mask bits are set — confirm operand order
+    else if(borderType == CV_HAL_BORDER_REFLECT || borderType == CV_HAL_BORDER_REFLECT_101)
+    {
+        int16x4_t vdelta = (borderType == CV_HAL_BORDER_REFLECT_101) ? vone : vzero; // 1 per lane for REFLECT_101, 0 for REFLECT
+        if(len == 1)
+            return vzero; // with len == 1 every lane is reported as index 0
+        do
+        {
+            int16x4_t vneg = -vp - 1 + vdelta; // candidate value for lanes below 0
+            int16x4_t vpos = vlen - 1 - (vp - vlen) - vdelta; // candidate value for lanes at or beyond len
+            vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0)); // per-lane choice between vneg, vpos and the unchanged value
+        }
+        while( (long)(vp >= vlen) || (long)(vp < 0) ); // repeat while any lane's comparison mask is non-zero
+    }
+    else if(borderType == CV_HAL_BORDER_WRAP)
+    {
+        ndsrvp_assert(len > 0); // the division and modulo below need a positive length
+        int16x4_t vneg = vp - ((vp - vlen + 1) / vlen) * vlen; // candidate value for lanes below 0
+        int16x4_t vpos = vp % vlen; // candidate value for lanes at or beyond len
+        vp = (int16x4_t)__nds__bpick((long)vneg, __nds__bpick((long)vpos, (long)vp, (long)(vp >= vlen)), (long)(vp < 0));
+    }
+    else if(borderType == CV_HAL_BORDER_CONSTANT)
+        vp = (int16x4_t)__nds__bpick((long)-vone, (long)vp, (long)(vp < 0 || vp >= vlen)); // NOTE(review): out-of-range lanes presumably become -1 (same bpick operand-order assumption) — confirm
+    else
+        ndsrvp_error(Error::StsBadArg, "borderInterpolate_vector(): Unknown/unsupported border type");
+    return vp;
+}
+
 } // namespace ndsrvp
 
 } // namespace cv
diff --git a/3rdparty/ndsrvp/src/cvutils.hpp b/3rdparty/ndsrvp/src/cvutils.hpp
@@ -14,6 +14,7 @@
 #include <iostream>
 #include <string>
 #include <array>
+#include <vector>
 #include <climits>
 #include <algorithm>
 
@@ -26,16 +27,26 @@ namespace ndsrvp {
 void* fastMalloc(size_t size);
 void fastFree(void* ptr);
 int borderInterpolate(int p, int len, int borderType);
+int16x4_t borderInterpolate_vector(int16x4_t vp, short len, int borderType); // four-lane (int16x4_t) variant, defined in cvutils.cpp
 
 #ifndef MAX
 #  define MAX(a,b)  ((a) < (b) ? (b) : (a))
 #endif
 
+#ifndef MIN
+#  define MIN(a,b)  ((a) > (b) ? (b) : (a)) // yields b when a > b, otherwise a; one of the arguments is evaluated twice
+#endif
+
 #define CV_MAT_CN_MASK          ((CV_CN_MAX - 1) << CV_CN_SHIFT)
 #define CV_MAT_CN(flags)        ((((flags) & CV_MAT_CN_MASK) >> CV_CN_SHIFT) + 1)
 
+#define CV_ELEM_SIZE1(type) ((0x28442211 >> CV_MAT_DEPTH(type)*4) & 15) // nibble lookup: depth d selects the d-th hex digit (counted from the right) of 0x28442211; presumably bytes per channel
+#define CV_ELEM_SIZE(type) (CV_MAT_CN(type)*CV_ELEM_SIZE1(type)) // channel count multiplied by the per-channel value above
+
 #define CV_MALLOC_ALIGN 64
 
+inline size_t getElemSize(int type) { return static_cast<size_t>(CV_ELEM_SIZE(type)); } // CV_ELEM_SIZE(type) as a size_t-returning function
+
 // error codes
 
 enum Error{
@@ -69,6 +80,135 @@ inline int32x2_t vclip(int32x2_t x, int32x2_t a, int32x2_t b)
     return (int32x2_t)__nds__bpick((long)a, __nds__bpick((long)(b - 1), (long)x, (long)(x < b)), (long)(x >= a));
 }
 
+// expand
+
+/* Zero-extends the eight bytes packed in vs to eight ushort values written to dst, in order (source byte per lane):
+    [0] [1] [2] [3] [4] [5] [6] [7]
+810 [  0  ] [  1  ] [  4  ] [  5  ]
+832 [  2  ] [  3  ] [  6  ] [  7  ]
+bb  [  0  ] [  1  ] [  2  ] [  3  ]
+tt  [  4  ] [  5  ] [  6  ] [  7  ]
+*/
+
+inline void ndsrvp_u8_u16_expand8(const unsigned long vs, ushort* dst)
+{
+    const unsigned long pairs_10 = __nds__zunpkd810(vs); // row "810" of the diagram
+    const unsigned long pairs_32 = __nds__zunpkd832(vs); // row "832" of the diagram
+    *(unsigned long*)(dst + 0) = __nds__pkbb32(pairs_32, pairs_10); // row "bb": elements 0..3
+    *(unsigned long*)(dst + 4) = __nds__pktt32(pairs_32, pairs_10); // row "tt": elements 4..7
+}
+
+/* Zero-extends the eight bytes packed in vs to ushort, storing them in the order 0 2 1 3 | 4 6 5 7 (source byte per lane):
+    [0] [1] [2] [3] [4] [5] [6] [7]
+820 [  0  ] [  2  ] [  4  ] [  6  ]
+831 [  1  ] [  3  ] [  5  ] [  7  ]
+bb  [  0  ] [  2  ] [  1  ] [  3  ]
+tt  [  4  ] [  6  ] [  5  ] [  7  ]
+*/
+
+inline void ndsrvp_u8_u16_eswap8(const unsigned long vs, ushort* dst)
+{
+    const unsigned long even_bytes = __nds__zunpkd820(vs); // row "820" of the diagram
+    const unsigned long odd_bytes = __nds__zunpkd831(vs); // row "831" of the diagram
+    *(unsigned long*)(dst + 0) = __nds__pkbb32(odd_bytes, even_bytes); // row "bb"
+    *(unsigned long*)(dst + 4) = __nds__pktt32(odd_bytes, even_bytes); // row "tt"
+}
+
+/* Zero-extends the eight bytes packed in vs to eight uint values written to dst, in order (source byte per lane):
+    [0] [1] [2] [3] [4] [5] [6] [7]
+820 [  0  ] [  2  ] [  4  ] [  6  ]
+831 [  1  ] [  3  ] [  5  ] [  7  ]
+bb  [  0  ] [  2  ] [  1  ] [  3  ]
+tt  [  4  ] [  6  ] [  5  ] [  7  ]
+bbbb[      0      ] [      1      ]
+bbtt[      2      ] [      3      ]
+ttbb[      4      ] [      5      ]
+tttt[      6      ] [      7      ]
+*/
+
+
+inline void ndsrvp_u8_u32_expand8(const unsigned long vs, uint* dst)
+{
+    const unsigned long even_bytes = __nds__zunpkd820(vs); // row "820" of the diagram
+    const unsigned long odd_bytes = __nds__zunpkd831(vs); // row "831" of the diagram
+    const unsigned long low_quad = __nds__pkbb32(odd_bytes, even_bytes); // row "bb"
+    const unsigned long high_quad = __nds__pktt32(odd_bytes, even_bytes); // row "tt"
+    *(unsigned long*)(dst + 0) = __nds__pkbb16(0, low_quad); // row "bbbb": elements 0, 1
+    *(unsigned long*)(dst + 2) = __nds__pktt16(0, low_quad); // row "bbtt": elements 2, 3
+    *(unsigned long*)(dst + 4) = __nds__pkbb16(0, high_quad); // row "ttbb": elements 4, 5
+    *(unsigned long*)(dst + 6) = __nds__pktt16(0, high_quad); // row "tttt": elements 6, 7
+}
+
+// float replacement
+
+inline void ndsrvp_f32_add8(const float* a, const float* b, float* c)
+{
+    // Element-wise sum of two 8-float arrays: c[i] = a[i] + b[i] for i = 0..7.
+    // Plain scalar arithmetic; lanes are written in ascending index order,
+    // one store per lane.
+    const int lane_count = 8;
+    for (int lane = 0; lane < lane_count; ++lane)
+    {
+        c[lane] = a[lane] + b[lane];
+    }
+}
+
+/* Multiplies eight floats a[0..7] by the eight bytes packed in b using integer operations on the raw float bits; result in c[0..7].
+    [1] [8] [23]    32-bit float fields: sign, exponent, fraction
+    [24] [8]        NOTE(review): presumably 24-bit significand times 8-bit factor; no zero/inf/NaN special-casing is visible below — confirm intended domain
+*/
+
+inline void ndsrvp_f32_u8_mul8(const float* a, const unsigned long b, float* c) // experimental, not bit exact
+{
+    const int mask_frac = 0x007FFFFF; // low 23 bits (the fraction field)
+    const int mask_sign = 0x7FFFFFFF; // every bit except bit 31
+    const int mask_lead = 0x40000000; // bit 30, OR-ed in below as an explicit leading one
+    const int ofs_exp = 23; // bit position where the exponent field starts
+
+    uint32x2_t va01 = *(uint32x2_t*)a; // raw bit patterns of the inputs, two floats per vector
+    uint32x2_t va23 = *(uint32x2_t*)(a + 2);
+    uint32x2_t va45 = *(uint32x2_t*)(a + 4);
+    uint32x2_t va67 = *(uint32x2_t*)(a + 6);
+
+    uint32x2_t vaexp01 = va01 >> ofs_exp; // upper 9 bits (sign + exponent) moved down to bit 0
+    uint32x2_t vaexp23 = va23 >> ofs_exp;
+    uint32x2_t vaexp45 = va45 >> ofs_exp;
+    uint32x2_t vaexp67 = va67 >> ofs_exp;
+
+    uint32x2_t vafrac01 = ((va01 << 7) & mask_sign) | mask_lead; // fraction moved to bits 7..29, bit 31 cleared, bit 30 forced to 1
+    uint32x2_t vafrac23 = ((va23 << 7) & mask_sign) | mask_lead;
+    uint32x2_t vafrac45 = ((va45 << 7) & mask_sign) | mask_lead;
+    uint32x2_t vafrac67 = ((va67 << 7) & mask_sign) | mask_lead;
+
+    int16x4_t vb[2]; // fake signed for signed multiply
+    ndsrvp_u8_u16_eswap8(b, (ushort*)vb); // bytes of b widened to 16 bit in eswap8's interleaved order
+
+    vafrac01 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac01, (unsigned long)vb[0]); // NOTE(review): presumably a rounding 32x16 multiply using the bottom 16-bit lane of each word — confirm against the intrinsic reference
+    vafrac23 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac23, (unsigned long)vb[0]); // NOTE(review): same, presumably using the top 16-bit lane — confirm
+    vafrac45 = (uint32x2_t)__nds__kmmwb2_u((long)vafrac45, (unsigned long)vb[1]);
+    vafrac67 = (uint32x2_t)__nds__kmmwt2_u((long)vafrac67, (unsigned long)vb[1]);
+
+    uint32x2_t vaclz01 = __nds__v_clz32(vafrac01) - 8; // left shift that would bring the product's leading one to bit 23
+    uint32x2_t vaclz23 = __nds__v_clz32(vafrac23) - 8;
+    uint32x2_t vaclz45 = __nds__v_clz32(vafrac45) - 8;
+    uint32x2_t vaclz67 = __nds__v_clz32(vafrac67) - 8;
+
+    vaexp01 += 8 - vaclz01; // exponent correction derived from the same shift amount
+    vaexp23 += 8 - vaclz23;
+    vaexp45 += 8 - vaclz45;
+    vaexp67 += 8 - vaclz67;
+
+    vafrac01 <<= vaclz01; // renormalise the product
+    vafrac23 <<= vaclz23;
+    vafrac45 <<= vaclz45;
+    vafrac67 <<= vaclz67;
+
+    *(uint32x2_t*)c = (vaexp01 << ofs_exp) | (vafrac01 & mask_frac); // reassemble: sign/exponent bits back at offset 23, fraction from the low 23 bits
+    *(uint32x2_t*)(c + 2) = (vaexp23 << ofs_exp) | (vafrac23 & mask_frac);
+    *(uint32x2_t*)(c + 4) = (vaexp45 << ofs_exp) | (vafrac45 & mask_frac);
+    *(uint32x2_t*)(c + 6) = (vaexp67 << ofs_exp) | (vafrac67 & mask_frac);
+}
+
 // saturate
 
 template<typename _Tp> static inline _Tp saturate_cast(int v)    { return _Tp(v); }
@@ -94,6 +234,26 @@ template<> inline short saturate_cast<short>(double v)     { return saturate_cas
 template<> inline int saturate_cast<int>(float v)     { return (int)lrintf(v); }
 template<> inline int saturate_cast<int>(double v)     { return (int)lrint(v); }
 
+inline double cast_ptr_to_double(const uchar* v, int depth) { // reads one element of the given depth at v and widens it to double; unknown depths give 0
+    switch (depth) {
+        case CV_8U: return (double)*v;
+        case CV_8S: return (double)*(const signed char*)v; // explicitly signed: plain char is unsigned under the RISC-V ABI
+        case CV_16U: return (double)*(const ushort*)v;
+        case CV_16S: return (double)*(const short*)v;
+        case CV_32S: return (double)*(const int*)v;
+        case CV_32F: return (double)*(const float*)v;
+        case CV_64F: return *(const double*)v;
+        case CV_16F: { const int h = *(const ushort*)v, e = (h >> 10) & 31, m = h & 1023; // 2-byte IEEE binary16: 1 sign, 5 exponent, 10 fraction bits
+            const double mag = e == 31 ? (m ? NAN : INFINITY) : ldexp(e ? m + 1024.0 : (double)m, (e ? e : 1) - 25); return (h & 0x8000) ? -mag : mag; }
+        default: return 0;
+    }
+}
+template <typename _Tp>
+inline _Tp data_at(const uchar* data, int step, int y, int x, int cn) // returns element x * cn of the row that starts y * step bytes after data, read as _Tp
+{
+    return ((const _Tp*)(data + (long long)y * step))[x * cn]; // widen before multiplying: int y * step can overflow on large images
+}
+
 // align
 
 inline long align(size_t v, int n)
diff --git a/3rdparty/ndsrvp/src/filter.cpp b/3rdparty/ndsrvp/src/filter.cpp