From 37c7495cefdd79f069505a3e88501125ca6fced8 Mon Sep 17 00:00:00 2001 From: Zhirui Dai Date: Sun, 1 Oct 2023 17:22:19 -0700 Subject: [PATCH 1/6] fix build with GNU13 --- modules/sfm/src/libmv_light/libmv/numeric/numeric.h | 1 + 1 file changed, 1 insertion(+) diff --git a/modules/sfm/src/libmv_light/libmv/numeric/numeric.h b/modules/sfm/src/libmv_light/libmv/numeric/numeric.h index 9e7927e0bbc..41a55634dcd 100644 --- a/modules/sfm/src/libmv_light/libmv/numeric/numeric.h +++ b/modules/sfm/src/libmv_light/libmv/numeric/numeric.h @@ -32,6 +32,7 @@ #include #include #include +#include #if !defined(__MINGW32__) # if defined(_WIN32) || defined(__APPLE__) || \ From 36bb8727192340f038e1222c7599670ebc5cd88c Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Fri, 13 Oct 2023 22:47:22 +0300 Subject: [PATCH 2/6] CI: enable RISC-V for 4.x branch --- .github/workflows/PR-4.x.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/PR-4.x.yaml b/.github/workflows/PR-4.x.yaml index 250b5e157a6..f05e94bfa32 100644 --- a/.github/workflows/PR-4.x.yaml +++ b/.github/workflows/PR-4.x.yaml @@ -26,3 +26,6 @@ jobs: macOS-X64: uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-macOS-x86_64.yaml@main + + Linux-RISC-V-Clang: + uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-RISCV.yaml From b9a460b85733c75a700c61b52ccee1fe653e21db Mon Sep 17 00:00:00 2001 From: Liutong HAN Date: Sun, 8 Oct 2023 16:01:23 +0800 Subject: [PATCH 3/6] Use new Universal Intrinsic API. --- modules/optflow/src/rlof/berlof_invoker.hpp | 224 +++++++++--------- modules/optflow/src/rlof/plk_invoker.hpp | 74 +++--- modules/optflow/src/rlof/rlof_invoker.hpp | 132 +++++------ modules/optflow/src/rlof/rlof_invokerbase.hpp | 76 +++--- modules/optflow/src/rlof/rlof_localflow.cpp | 8 +- modules/rgbd/src/colored_tsdf.cpp | 120 +++++----- modules/rgbd/src/fast_icp.cpp | 48 ++-- modules/rgbd/src/hash_tsdf.cpp | 32 +-- modules/rgbd/src/tsdf.cpp | 90 ++++--- modules/rgbd/src/tsdf_functions.cpp | 83 +++---- modules/rgbd/src/tsdf_functions.hpp | 2 +- modules/rgbd/src/utils.hpp | 2 +- modules/ximgproc/src/anisodiff.cpp | 20 +- modules/ximgproc/src/fgs_filter.cpp | 58 ++--- .../xphoto/src/grayworld_white_balance.cpp | 56 ++--- .../src/learning_based_color_balance.cpp | 82 +++---- 16 files changed, 549 insertions(+), 558 deletions(-) diff --git a/modules/optflow/src/rlof/berlof_invoker.hpp b/modules/optflow/src/rlof/berlof_invoker.hpp index 8fde6e457c1..e51f8091cbe 100644 --- a/modules/optflow/src/rlof/berlof_invoker.hpp +++ b/modules/optflow/src/rlof/berlof_invoker.hpp @@ -296,7 +296,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -304,35 +304,35 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + 
t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_int16x8 diff[4] = { - ((v11 << 5) - vI) & vmask, - ((v01 << 5) - vI) & vmask, - ((v10 << 5) - vI) & vmask, - ((v00 << 5) - vI) & vmask + v_and(v_sub(v_shl<5>(v11), vI), vmask), + v_and(v_sub(v_shl<5>(v01), vI), vmask), + v_and(v_sub(v_shl<5>(v10), vI), vmask), + v_and(v_sub(v_shl<5>(v00), vI), vmask) }; - diff0 = diff0 - vI; - diff0 = diff0 & vmask; + diff0 = v_sub(diff0, vI); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -342,7 +342,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for (unsigned int mmi = 0; mmi < 4; mmi++) { // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff[mmi]; + diff0 = v_and(vset2, diff[mmi]); // It == val ? sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -350,16 +350,16 @@ class TrackerInvoker : public cv::ParallelLoopBody // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... 
v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); } if (j == 0) { @@ -387,8 +387,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -402,8 +402,8 @@ class TrackerInvoker : public cv::ParallelLoopBody fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -544,7 +544,7 @@ class TrackerInvoker : public cv::ParallelLoopBody float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; } @@ -960,7 +960,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -968,38 +968,38 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI; + v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI); v_int16x8 diff[4] = { - ((v11 << 5) + diff_value) & vmask, - ((v01 << 5) + diff_value) & vmask, - ((v10 << 5) + diff_value) & vmask, - ((v00 << 5) + diff_value) & vmask + v_and(v_add(v_shl<5>(v11), diff_value), vmask), + v_and(v_add(v_shl<5>(v01), diff_value), vmask), + v_and(v_add(v_shl<5>(v10), diff_value), vmask), + v_and(v_add(v_shl<5>(v00), diff_value), vmask) }; - diff0 = diff0 + diff_value; - diff0 = diff0 & vmask; + diff0 = v_add(diff0, diff_value); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? 
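A note on the pattern in this hunk: the operator comparisons and mask combinations (<, >, &) become the named intrinsics v_lt, v_gt and v_and, while the per-lane choice stays in v_select. Below is a minimal, self-contained sketch of that branchless residual clipping written only with the new function-style API. It is not code from this patch; clipInt16Residuals is a hypothetical helper name, and it assumes an OpenCV 4.x build with CV_SIMD128 enabled.

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
using namespace cv;

// Clip 8 signed 16-bit residuals r against thresholds sigma0 < sigma1:
//   |r| >= sigma1          -> 0
//   sigma0 < |r| < sigma1  -> r - sign(r) * sigma1
//   |r| <= sigma0          -> r unchanged
static void clipInt16Residuals(const short* src, short* dst,
                               short sigma0, short sigma1)
{
    v_int16x8 r       = v_load(src);
    v_int16x8 vparam0 = v_setall_s16(sigma0);
    v_int16x8 vparam1 = v_setall_s16(sigma1);
    v_int16x8 vneg1   = v_setall_s16((short)-sigma1);
    v_int16x8 vzero   = v_setzero_s16();

    // comparisons now go through named functions and return all-ones/all-zeros masks
    v_int16x8 vabs  = v_reinterpret_as_s16(v_abs(r));       // |r| (v_abs yields unsigned lanes)
    v_int16x8 vset2 = v_lt(vabs, vparam1);                  // |r| <  sigma1 ?
    v_int16x8 vset1 = v_and(vset2, v_gt(vabs, vparam0));    // sigma0 < |r| < sigma1 ?
    v_int16x8 vpos  = v_gt(r, vzero);                       // r > 0 ?

    // r -/+ sigma1, chosen per lane with v_select instead of operator arithmetic
    v_int16x8 vclipped = v_add(r, v_select(vpos, vneg1, vparam1));

    v_int16x8 out = v_and(vset2, r);          // zero out the |r| >= sigma1 lanes
    out = v_select(vset1, vclipped, out);     // saturated value in the middle band
    v_store(dst, out);
}
#endif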
- v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -1009,7 +1009,7 @@ class TrackerInvoker : public cv::ParallelLoopBody for (unsigned int mmi = 0; mmi < 4; mmi++) { // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff[mmi]; + diff0 = v_and(vset2, diff[mmi]); // It == val ? sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -1017,22 +1017,22 @@ class TrackerInvoker : public cv::ParallelLoopBody // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); - vqb2[mmi] += v_cvt_f32(diff0_0 * vI0); - vqb2[mmi] += v_cvt_f32(diff0_1 * vI1); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3[mmi] += v_cvt_f32(diff0_0); - vqb3[mmi] += v_cvt_f32(diff0_1); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0)); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1)); } if (j == 0) { @@ -1060,29 +1060,29 @@ class TrackerInvoker : public cv::ParallelLoopBody v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - v_float32x4 vI_tale = vI_ps * vtale_0; - vsumI += vI_tale; + v_float32x4 vI_tale = v_mul(vI_ps, vtale_0); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_0; + vsumW = v_add(vsumW, vtale_0); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -1092,29 +1092,29 @@ class TrackerInvoker : public cv::ParallelLoopBody fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * 
fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - vI_tale = vI_ps * vtale_1; - vsumI += vI_tale; + vI_tale = v_mul(vI_ps, vtale_1); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_1; + vsumW = v_add(vsumW, vtale_1); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); } } @@ -1304,7 +1304,7 @@ class TrackerInvoker : public cv::ParallelLoopBody float CV_DECL_ALIGNED(16) bbuf[4]; for(int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; _b2[mmi] = v_reduce_sum(vqb2[mmi]); @@ -1655,14 +1655,14 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int16x8 diff[4] = { - ((v00 << 5) - vI) & vmask, - ((v01 << 5) - vI) & vmask, - ((v10 << 5) - vI) & vmask, - ((v11 << 5) - vI) & vmask, + v_and(v_sub(v_shl<5>(v00), vI), vmask), + v_and(v_sub(v_shl<5>(v01), vI), vmask), + v_and(v_sub(v_shl<5>(v10), vI), vmask), + v_and(v_sub(v_shl<5>(v11), vI), vmask), }; v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... @@ -1672,8 +1672,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(diff[mmi], diff[mmi], diff1, diff0); v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff1, diff0, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); } } #else @@ -1704,7 +1704,7 @@ class TrackerInvoker : public cv::ParallelLoopBody float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b1[mmi] = bbuf[0] + bbuf[2]; _b2[mmi] = bbuf[1] + bbuf[3]; } @@ -2071,7 +2071,7 @@ namespace radial { v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -2079,21 +2079,21 @@ namespace radial { v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); diff0 = v_pack(t0, t1); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - v_int16x8 diff_value = v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value - vI; + v_int16x8 diff_value = v_sub(v_add(v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift)), vconst_value), vI); v_int16x8 diff[4] = { - ((v11 << 5) + 
diff_value) & vmask, - ((v01 << 5) + diff_value) & vmask, - ((v10 << 5) + diff_value) & vmask, - ((v00 << 5) + diff_value) & vmask + v_and(v_add(v_shl<5>(v11), diff_value), vmask), + v_and(v_add(v_shl<5>(v01), diff_value), vmask), + v_and(v_add(v_shl<5>(v10), diff_value), vmask), + v_and(v_add(v_shl<5>(v00), diff_value), vmask) }; v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... v_int16x8 vIxy_1 = v_reinterpret_as_s16(v_load(dIptr + 8)); @@ -2109,14 +2109,14 @@ namespace radial { v_zip(diff[mmi], diff[mmi], diff2, diff1); v_zip(diff2, diff1, v00, v01); - vqb0[mmi] += v_cvt_f32(v_dotprod(v00, v10)); - vqb1[mmi] += v_cvt_f32(v_dotprod(v01, v11)); + vqb0[mmi] = v_add(vqb0[mmi], v_cvt_f32(v_dotprod(v00, v10))); + vqb1[mmi] = v_add(vqb1[mmi], v_cvt_f32(v_dotprod(v01, v11))); - vqb2[mmi] += v_cvt_f32(diff0_0 * vI0); - vqb2[mmi] += v_cvt_f32(diff0_1 * vI1); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2[mmi] = v_add(vqb2[mmi], v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3[mmi] += v_cvt_f32(diff0_0); - vqb3[mmi] += v_cvt_f32(diff0_1); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_0)); + vqb3[mmi] = v_add(vqb3[mmi], v_cvt_f32(diff0_1)); } if (j == 0) { @@ -2133,17 +2133,17 @@ namespace radial { vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -2158,17 +2158,17 @@ namespace radial { vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); } } @@ -2299,7 +2299,7 @@ namespace radial { float CV_DECL_ALIGNED(16) bbuf[4]; for (int mmi = 0; mmi < 4; mmi++) { - v_store_aligned(bbuf, vqb0[mmi] + vqb1[mmi]); + v_store_aligned(bbuf, v_add(vqb0[mmi], vqb1[mmi])); _b0[mmi] = bbuf[0] + bbuf[2]; _b1[mmi] = bbuf[1] + bbuf[3]; _b2[mmi] = v_reduce_sum(vqb2[mmi]); diff --git a/modules/optflow/src/rlof/plk_invoker.hpp b/modules/optflow/src/rlof/plk_invoker.hpp index 5ea85de889e..71cf50c8205 100644 --- a/modules/optflow/src/rlof/plk_invoker.hpp +++ b/modules/optflow/src/rlof/plk_invoker.hpp @@ -229,7 +229,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -237,17 +237,17 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> 
(W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); // diff = J - I - diff0 = v_pack(t0, t1) - vI; + diff0 = v_sub(v_pack(t0, t1), vI); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value; - diff0 = diff0 & vmask; + diff0 = v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value); + diff0 = v_and(diff0, vmask); v_zip(diff0, diff0, diff2, diff1); v_int32x4 diff0_0; @@ -259,16 +259,16 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); v_int32x4 vI0, vI1; v_expand(vI, vI0, vI1); - vqb2 += v_cvt_f32(diff0_0 * vI0); - vqb2 += v_cvt_f32(diff0_1 * vI1); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3 += v_cvt_f32(diff0_0); - vqb3 += v_cvt_f32(diff0_1); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_0)); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_1)); if (j == 0) { @@ -285,17 +285,17 @@ class TrackerInvoker : public cv::ParallelLoopBody vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -309,17 +309,17 @@ class TrackerInvoker : public cv::ParallelLoopBody vAxx = v_muladd(fx, fx, vAxx); // sumIx und sumIy - vsumIx += fx; - vsumIy += fy; + vsumIx = v_add(vsumIx, fx); + vsumIy = v_add(vsumIy, fy); - vsumW1 += vI_ps * fx; - vsumW2 += vI_ps * fy; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fx)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fy)); // sumI - vsumI += vI_ps; + vsumI = v_add(vsumI, vI_ps); // sumDI - vsumDI += vI_ps * vI_ps; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_ps)); } } #else @@ -388,7 +388,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; b3 = v_reduce_sum(vqb2); @@ -696,19 +696,19 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - diff0 = v_pack(t0, t1) - diff0; - diff0 = diff0 & vmask; + t0 = 
v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + diff0 = v_sub(v_pack(t0, t1), diff0); + diff0 = v_and(diff0, vmask); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... @@ -717,8 +717,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); } #else for( ; x < winSize.width*cn; x++, dIptr += 2 ) @@ -737,7 +737,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; #endif diff --git a/modules/optflow/src/rlof/rlof_invoker.hpp b/modules/optflow/src/rlof/rlof_invoker.hpp index 9bee35fc6a3..5597d882491 100644 --- a/modules/optflow/src/rlof/rlof_invoker.hpp +++ b/modules/optflow/src/rlof/rlof_invoker.hpp @@ -246,35 +246,35 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - diff0 = v_pack(t0, t1) - diff0; - diff0 = diff0 & vmask; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + diff0 = v_sub(v_pack(t0, t1), diff0); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff0; + diff0 = v_and(vset2, diff0); // It == val ? 
sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -282,7 +282,7 @@ class TrackerInvoker : public cv::ParallelLoopBody // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - diff0 = v_pack(diff_int_0 >> s2bitShift, diff_int_1 >> s2bitShift); + diff0 = v_pack(v_shr(diff_int_0, s2bitShift), v_shr(diff_int_1, s2bitShift)); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... v_int16x8 vIxy_0 = v_reinterpret_as_s16(v_load(dIptr)); // Ix0 Iy0 Ix1 Iy1 ... @@ -290,8 +290,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); if (j == 0) { v_int32x4 vset1_0, vset1_1, vset2_0, vset2_1; @@ -316,8 +316,8 @@ class TrackerInvoker : public cv::ParallelLoopBody v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -330,8 +330,8 @@ class TrackerInvoker : public cv::ParallelLoopBody fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); @@ -431,7 +431,7 @@ class TrackerInvoker : public cv::ParallelLoopBody #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 += bbuf[0] + bbuf[2]; b2 += bbuf[1] + bbuf[3]; #endif @@ -769,7 +769,7 @@ class TrackerInvoker : public cv::ParallelLoopBody v_int16x8 v01 = v_reinterpret_as_s16(v_load_expand(Jptr + x + cn)); v_int16x8 v10 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x)); v_int16x8 v11 = v_reinterpret_as_s16(v_load_expand(Jptr1 + x + cn)); - v_int16x8 vmask = v_reinterpret_as_s16(v_load_expand(maskPtr + x)) * vmax_val_16; + v_int16x8 vmask = v_mul(v_reinterpret_as_s16(v_load_expand(maskPtr + x)), vmax_val_16); v_int32x4 t0, t1; v_int16x8 t00, t01, t10, t11; @@ -777,33 +777,33 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(v10, v11, t10, t11); //subpixel interpolation - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); // diff = J - I - diff0 = v_pack(t0, t1) - vI; + diff0 = v_sub(v_pack(t0, t1), vI); // I*gain.x + gain.x v_mul_expand(vI, vgain_value, t0, t1); - diff0 = diff0 + v_pack(t0 >> bitShift, t1 >> bitShift) + vconst_value; - diff0 = diff0 & vmask; + diff0 = v_add(v_add(diff0, v_pack(v_shr(t0, bitShift), v_shr(t1, bitShift))), vconst_value); + diff0 = v_and(diff0, vmask); - v_int16x8 vscale_diff_is_pos = diff0 > vscale; - veta = veta + (vscale_diff_is_pos & v_setall_s16(2)) + v_setall_s16(-1); + v_int16x8 vscale_diff_is_pos = v_gt(diff0, vscale); + veta = v_add(v_add(veta, v_and(vscale_diff_is_pos, v_setall_s16(2))), v_setall_s16(-1)); // since there is no abs vor int16x8 we have to do this hack v_int16x8 vabs_diff = 
v_reinterpret_as_s16(v_abs(diff0)); v_int16x8 vset2, vset1; // |It| < sigma1 ? - vset2 = vabs_diff < vparam1; + vset2 = v_lt(vabs_diff, vparam1); // It > 0 ? - v_int16x8 vdiff_is_pos = diff0 > vzero; + v_int16x8 vdiff_is_pos = v_gt(diff0, vzero); // sigma0 < |It| < sigma1 ? - vset1 = vset2 & (vabs_diff > vparam0); + vset1 = v_and(vset2, v_gt(vabs_diff, vparam0)); // val = |It| -/+ sigma1 - v_int16x8 vtmp_param1 = diff0 + v_select(vdiff_is_pos, vneg_param1, vparam1); + v_int16x8 vtmp_param1 = v_add(diff0, v_select(vdiff_is_pos, vneg_param1, vparam1)); // It == 0 ? |It| > sigma13 - diff0 = vset2 & diff0; + diff0 = v_and(vset2, diff0); // It == val ? sigma0 < |It| < sigma1 diff0 = v_select(vset1, vtmp_param1, diff0); @@ -811,8 +811,8 @@ class TrackerInvoker : public cv::ParallelLoopBody // diff = diff * sigma2 v_int32x4 diff_int_0, diff_int_1; v_mul_expand(diff0, tale_, diff_int_0, diff_int_1); - v_int32x4 diff0_0 = diff_int_0 >> s2bitShift; - v_int32x4 diff0_1 = diff_int_1 >> s2bitShift; + v_int32x4 diff0_0 = v_shr(diff_int_0, s2bitShift); + v_int32x4 diff0_1 = v_shr(diff_int_1, s2bitShift); diff0 = v_pack(diff0_0, diff0_1); v_zip(diff0, diff0, diff2, diff1); // It0 It0 It1 It1 ... @@ -821,16 +821,16 @@ class TrackerInvoker : public cv::ParallelLoopBody v_zip(vIxy_0, vIxy_1, v10, v11); v_zip(diff2, diff1, v00, v01); - vqb0 += v_cvt_f32(v_dotprod(v00, v10)); - vqb1 += v_cvt_f32(v_dotprod(v01, v11)); + vqb0 = v_add(vqb0, v_cvt_f32(v_dotprod(v00, v10))); + vqb1 = v_add(vqb1, v_cvt_f32(v_dotprod(v01, v11))); v_int32x4 vI0, vI1; v_expand(vI, vI0, vI1); - vqb2 += v_cvt_f32(diff0_0 * vI0); - vqb2 += v_cvt_f32(diff0_1 * vI1); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_0, vI0))); + vqb2 = v_add(vqb2, v_cvt_f32(v_mul(diff0_1, vI1))); - vqb3 += v_cvt_f32(diff0_0); - vqb3 += v_cvt_f32(diff0_1); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_0)); + vqb3 = v_add(vqb3, v_cvt_f32(diff0_1)); if (j == 0) { @@ -858,29 +858,29 @@ class TrackerInvoker : public cv::ParallelLoopBody v_float32x4 fx = v_cvt_f32(t1); // A11 - A22 - v_float32x4 fxtale = fx * vtale_0; - v_float32x4 fytale = fy * vtale_0; + v_float32x4 fxtale = v_mul(fx, vtale_0); + v_float32x4 fytale = v_mul(fy, vtale_0); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - v_float32x4 vI_tale = vI_ps * vtale_0; - vsumI += vI_tale; + v_float32x4 vI_tale = v_mul(vI_ps, vtale_0); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_0; + vsumW = v_add(vsumW, vtale_0); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); v01 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(vIxy_1)))); v_expand(v01, t1, t0); @@ -890,29 +890,29 @@ class TrackerInvoker : public cv::ParallelLoopBody fx = v_cvt_f32(t1); // A11 - A22 - fxtale = fx * vtale_1; - fytale = fy * vtale_1; + fxtale = v_mul(fx, vtale_1); + fytale = v_mul(fy, vtale_1); vAyy = v_muladd(fy, fytale, vAyy); vAxy = v_muladd(fx, fytale, vAxy); vAxx = v_muladd(fx, fxtale, vAxx); // sumIx und sumIy - vsumIx += fxtale; - vsumIy += fytale; + vsumIx = v_add(vsumIx, fxtale); + vsumIy = v_add(vsumIy, fytale); - vsumW1 += vI_ps * fxtale; - vsumW2 += vI_ps * fytale; + vsumW1 = v_add(vsumW1, v_mul(vI_ps, fxtale)); + vsumW2 = 
v_add(vsumW2, v_mul(vI_ps, fytale)); // sumI - vI_tale = vI_ps * vtale_1; - vsumI += vI_tale; + vI_tale = v_mul(vI_ps, vtale_1); + vsumI = v_add(vsumI, vI_tale); // sumW - vsumW += vtale_1; + vsumW = v_add(vsumW, vtale_1); // sumDI - vsumDI += vI_ps * vI_tale; + vsumDI = v_add(vsumDI, v_mul(vI_ps, vI_tale)); } } #else @@ -1017,7 +1017,7 @@ class TrackerInvoker : public cv::ParallelLoopBody } #if CV_SIMD128 float CV_DECL_ALIGNED(16) bbuf[4]; - v_store_aligned(bbuf, vqb0 + vqb1); + v_store_aligned(bbuf, v_add(vqb0, vqb1)); b1 = bbuf[0] + bbuf[2]; b2 = bbuf[1] + bbuf[3]; b3 = v_reduce_sum(vqb2); diff --git a/modules/optflow/src/rlof/rlof_invokerbase.hpp b/modules/optflow/src/rlof/rlof_invokerbase.hpp index c6f77f6d62c..2db4234ecd8 100644 --- a/modules/optflow/src/rlof/rlof_invokerbase.hpp +++ b/modules/optflow/src/rlof/rlof_invokerbase.hpp @@ -71,15 +71,15 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, for (; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2) { - v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32; - v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32; + v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32); + v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32); if (x + 4 > winSize.width) { - vmask0 = vmask0 & vmask_border_0; + vmask0 = v_and(vmask0, vmask_border_0); } if (x + 8 > winSize.width) { - vmask1 = vmask1 & vmask_border_1; + vmask1 = v_and(vmask1, vmask_border_1); } v_int32x4 t0, t1; @@ -91,10 +91,10 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5) & vmask0; - t1 = t1 >> (W_BITS - 5) & vmask1; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_and(v_shr(t0, W_BITS - 5), vmask0); + t1 = v_and(v_shr(t1, W_BITS - 5), vmask1); v_store(Iptr + x, v_pack(t0, t1)); v00 = v_reinterpret_as_s16(v_load(dsrc)); @@ -105,12 +105,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - v00 = v00 & v_reinterpret_as_s16(vmask0); + v00 = v_and(v00, v_reinterpret_as_s16(vmask0)); v_store(dIptr, v00); v00 = v_reinterpret_as_s16(v_load(dsrc + 4 * 2)); @@ -121,12 +121,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
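The hunks in copyWinBuffers keep the original fixed-point bilinear sampling scheme and only swap the operators for v_add, v_shr and v_and. As a rough standalone sketch of that sampling step, assuming a single-channel pair of 16-bit rows and plain W_BITS scaling (sampleRow8 is a hypothetical name; the real invokers shift by W_BITS - 5 to keep a few extra fractional bits in the window buffer):

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
using namespace cv;

static const int W_BITS = 14;   // same fixed-point precision the invokers use

// Bilinearly interpolate 8 consecutive pixels between two source rows; the four
// integer weights are Q(W_BITS) fixed point and sum to 1 << W_BITS.
static void sampleRow8(const short* row0, const short* row1, short* dst,
                       int iw00, int iw01, int iw10, int iw11)
{
    // pack the weights pairwise, like the vqw0/vqw1 vectors in the hunks above
    v_int16x8 vqw0 = v_reinterpret_as_s16(v_setall_s32(iw00 + (iw01 << 16)));
    v_int16x8 vqw1 = v_reinterpret_as_s16(v_setall_s32(iw10 + (iw11 << 16)));
    v_int32x4 vdelta = v_setall_s32(1 << (W_BITS - 1));     // rounding term

    v_int16x8 v00 = v_load(row0), v01 = v_load(row0 + 1);   // top row, left/right neighbours
    v_int16x8 v10 = v_load(row1), v11 = v_load(row1 + 1);   // bottom row, left/right neighbours

    v_int16x8 t00, t01, t10, t11;
    v_zip(v00, v01, t00, t01);      // interleave horizontal neighbours pairwise
    v_zip(v10, v11, t10, t11);

    // per output pixel: p00*iw00 + p01*iw01 + p10*iw10 + p11*iw11 + rounding
    v_int32x4 t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1));
    v_int32x4 t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1));

    // drop the fixed-point scale and repack to 16 bit
    t0 = v_shr(t0, W_BITS);
    t1 = v_shr(t1, W_BITS);
    v_store(dst, v_pack(t0, t1));
}
#endif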
- v00 = v00 & v_reinterpret_as_s16(vmask1); + v00 = v_and(v00, v_reinterpret_as_s16(vmask1)); v_store(dIptr + 4 * 2, v00); } #else @@ -187,15 +187,15 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, #if CV_SIMD128 for (int x = 0; x <= winSize.width*cn; x += 8, dsrc += 8 * 2, dsrc1 += 8 * 2, dIptr += 8 * 2) { - v_int32x4 vmask0 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)) * vmax_val_32; - v_int32x4 vmask1 = v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)) * vmax_val_32; + v_int32x4 vmask0 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x)), vmax_val_32); + v_int32x4 vmask1 = v_mul(v_reinterpret_as_s32(v_load_expand_q(maskPtr + x + 4)), vmax_val_32); if (x + 4 > winSize.width) { - vmask0 = vmask0 & vmask_border0; + vmask0 = v_and(vmask0, vmask_border0); } if (x + 8 > winSize.width) { - vmask1 = vmask1 & vmask_border1; + vmask1 = v_and(vmask1, vmask_border1); } v_int32x4 t0, t1; @@ -207,12 +207,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta) + v_dotprod(t11, vqw1); - t0 = t0 >> (W_BITS - 5); - t1 = t1 >> (W_BITS - 5); - t0 = t0 & vmask0; - t1 = t1 & vmask1; + t0 = v_add(v_dotprod(t00, vqw0, vdelta), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS - 5); + t1 = v_shr(t1, W_BITS - 5); + t0 = v_and(t0, vmask0); + t1 = v_and(t1, vmask1); v_store(Iptr + x, v_pack(t0, t1)); v00 = v_reinterpret_as_s16(v_load(dsrc)); @@ -223,12 +223,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... - v00 = v00 & v_reinterpret_as_s16(vmask0); + v00 = v_and(v00, v_reinterpret_as_s16(vmask0)); v_store(dIptr, v00); v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00)))); @@ -249,12 +249,12 @@ static inline void copyWinBuffers(int iw00, int iw01, int iw10, int iw11, v_zip(v00, v01, t00, t01); v_zip(v10, v11, t10, t11); - t0 = v_dotprod(t00, vqw0, vdelta_d) + v_dotprod(t10, vqw1); - t1 = v_dotprod(t01, vqw0, vdelta_d) + v_dotprod(t11, vqw1); - t0 = t0 >> W_BITS; - t1 = t1 >> W_BITS; + t0 = v_add(v_dotprod(t00, vqw0, vdelta_d), v_dotprod(t10, vqw1)); + t1 = v_add(v_dotprod(t01, vqw0, vdelta_d), v_dotprod(t11, vqw1)); + t0 = v_shr(t0, W_BITS); + t1 = v_shr(t1, W_BITS); v00 = v_pack(t0, t1); // Ix0 Iy0 Ix1 Iy1 ... 
- v00 = v00 & v_reinterpret_as_s16(vmask1); + v00 = v_and(v00, v_reinterpret_as_s16(vmask1)); v_store(dIptr + 4 * 2, v00); v00 = v_reinterpret_as_s16(v_interleave_pairs(v_reinterpret_as_s32(v_interleave_pairs(v00)))); diff --git a/modules/optflow/src/rlof/rlof_localflow.cpp b/modules/optflow/src/rlof/rlof_localflow.cpp index 8f3c728201a..3bc264f3e34 100644 --- a/modules/optflow/src/rlof/rlof_localflow.cpp +++ b/modules/optflow/src/rlof/rlof_localflow.cpp @@ -52,8 +52,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) v_int16x8 s1 = v_reinterpret_as_s16(v_load_expand(srow1 + x)); v_int16x8 s2 = v_reinterpret_as_s16(v_load_expand(srow2 + x)); - v_int16x8 t1 = s2 - s0; - v_int16x8 t0 = v_mul_wrap(s0 + s2, c3) + v_mul_wrap(s1, c10); + v_int16x8 t1 = v_sub(s2, s0); + v_int16x8 t0 = v_add(v_mul_wrap(v_add(s0, s2), c3), v_mul_wrap(s1, c10)); v_store(trow0 + x, t0); v_store(trow1 + x, t1); @@ -90,8 +90,8 @@ static void calcSharrDeriv(const cv::Mat& src, cv::Mat& dst) v_int16x8 s3 = v_load(trow1 + x); v_int16x8 s4 = v_load(trow1 + x + cn); - v_int16x8 t0 = s1 - s0; - v_int16x8 t1 = v_mul_wrap(s2 + s4, c3) + v_mul_wrap(s3, c10); + v_int16x8 t0 = v_sub(s1, s0); + v_int16x8 t1 = v_add(v_mul_wrap(v_add(s2, s4), c3), v_mul_wrap(s3, c10)); v_store_interleave((drow + x * 2), t0, t1); } diff --git a/modules/rgbd/src/colored_tsdf.cpp b/modules/rgbd/src/colored_tsdf.cpp index 0247f66d010..7ce2c7428d0 100644 --- a/modules/rgbd/src/colored_tsdf.cpp +++ b/modules/rgbd/src/colored_tsdf.cpp @@ -194,21 +194,21 @@ inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const { // tx, ty, tz = floor(p) v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const RGBTsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int iz = v_get0(ip); int coordBase = ix * xdim + iy * ydim + iz * zdim; @@ -218,15 +218,15 @@ inline float ColoredTSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6])); v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7])); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx * (v1 - v0); } @@ -276,27 +276,27 @@ inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const { - if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || - v_check_any(p >= v_float32x4((float)(volResolution.x - 2), 
+ if (v_check_any(v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) || + v_check_any(v_ge(p, v_float32x4((float)(volResolution.x - 2), (float)(volResolution.y - 2), - (float)(volResolution.z - 2), 1.f)) + (float)(volResolution.z - 2), 1.f))) ) return nanv; v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const RGBTsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coordBase = ix * xdim + iy * ydim + iz * zdim; @@ -314,23 +314,23 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) co v_float32x4 v0246(vx[0], vx[2], vx[4], vx[6]); v_float32x4 v1357(vx[1], vx[3], vx[5], vx[7]); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); nv = v0 + tx * (v1 - v0); } v_float32x4 n = v_load_aligned(an); - v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n * n))); + v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(v_mul(n, n)))); - return Norm.get0() < 0.0001f ? nanv : n / Norm; + return v_get0(Norm) < 0.0001f ? 
nanv : v_div(n, Norm); } #else inline Point3f ColoredTSDFVolumeCPU::getNormalVoxel(const Point3f& p) const @@ -388,15 +388,15 @@ inline float ColoredTSDFVolumeCPU::interpolateColor(float tx, float ty, float tz v_float32x4 v0246, v1357; v_load_deinterleave(vx, v0246, v1357); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx * (v1 - v0); } @@ -427,10 +427,10 @@ inline Point3f ColoredTSDFVolumeCPU::getColorVoxel(const Point3f& _p) const } inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) const { - if (v_check_any(p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || - v_check_any(p >= v_float32x4((float)(volResolution.x - 2), + if (v_check_any(v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) || + v_check_any(v_ge(p, v_float32x4((float)(volResolution.x - 2), (float)(volResolution.y - 2), - (float)(volResolution.z - 2), 1.f)) + (float)(volResolution.z - 2), 1.f))) ) return nanv; @@ -439,9 +439,9 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) con const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const RGBTsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coordBase = ix * xdim + iy * ydim + iz * zdim; float CV_DECL_ALIGNED(16) rgb[4]; @@ -456,12 +456,12 @@ inline v_float32x4 ColoredTSDFVolumeCPU::getColorVoxel(const v_float32x4& p) con } v_float32x4 vsi(voxelSizeInv, voxelSizeInv, voxelSizeInv, voxelSizeInv); - v_float32x4 ptVox = p * vsi; + v_float32x4 ptVox = v_mul(p, vsi); v_int32x4 iptVox = v_floor(ptVox); - v_float32x4 t = ptVox - v_cvt_f32(iptVox); - float tx = t.get0(); t = v_rotate_right<1>(t); - float ty = t.get0(); t = v_rotate_right<1>(t); - float tz = t.get0(); + v_float32x4 t = v_sub(ptVox, v_cvt_f32(iptVox)); + float tx = v_get0(t); t = v_rotate_right<1>(t); + float ty = v_get0(t); t = v_rotate_right<1>(t); + float tz = v_get0(t); rgb[0] = interpolateColor(tx, ty, tz, r); rgb[1] = interpolateColor(tx, ty, tz, g); rgb[2] = interpolateColor(tx, ty, tz, b); @@ -583,21 +583,21 @@ struct ColorRaycastInvoker : ParallelLoopBody // get direction through pixel in volume space: // 1. reproject (x, y) on projecting plane where z = 1.f - v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy) * vfxy; + v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.F, 0.F), vcxy), vfxy); planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f)); // 2. rotate to volume space planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32()); // 3. 
normalize - v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed * planed))); - v_float32x4 dir = planed * invNorm; + v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed)))); + v_float32x4 dir = v_mul(planed, invNorm); // compute intersection of ray with all six bbox planes - v_float32x4 rayinv = v_setall_f32(1.f) / dir; + v_float32x4 rayinv = v_div(v_setall_f32(1.F), dir); // div by zero should be eliminated by these products - v_float32x4 tbottom = rayinv * (boxDown - orig); - v_float32x4 ttop = rayinv * (boxUp - orig); + v_float32x4 tbottom = v_mul(rayinv, v_sub(boxDown, orig)); + v_float32x4 ttop = v_mul(rayinv, v_sub(boxUp, orig)); // re-order intersections to find smallest and largest on each axis v_float32x4 minAx = v_min(ttop, tbottom); @@ -618,14 +618,14 @@ struct ColorRaycastInvoker : ParallelLoopBody if (tmin < tmax) { // interpolation optimized a little - orig *= invVoxelSize; - dir *= invVoxelSize; + orig = v_mul(orig, invVoxelSize); + dir = v_mul(dir, invVoxelSize); int xdim = volume.volDims[0]; int ydim = volume.volDims[1]; int zdim = volume.volDims[2]; - v_float32x4 rayStep = dir * v_setall_f32(tstep); - v_float32x4 next = (orig + dir * v_setall_f32(tmin)); + v_float32x4 rayStep = v_mul(dir, v_setall_f32(this->tstep)); + v_float32x4 next = (v_add(orig, v_mul(dir, v_setall_f32(tmin)))); float f = volume.interpolateVoxel(next), fnext = f; //raymarch @@ -633,11 +633,11 @@ struct ColorRaycastInvoker : ParallelLoopBody int nSteps = cvFloor((tmax - tmin) / tstep); for (; steps < nSteps; steps++) { - next += rayStep; + next = v_add(next, rayStep); v_int32x4 ip = v_round(next); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coord = ix * xdim + iy * ydim + iz * zdim; fnext = tsdfToFloat(volume.volume.at(coord).tsdf); @@ -657,7 +657,7 @@ struct ColorRaycastInvoker : ParallelLoopBody // linearly interpolate t between two f values if (f > 0.f && fnext < 0.f) { - v_float32x4 tp = next - rayStep; + v_float32x4 tp = v_sub(next, rayStep); float ft = volume.interpolateVoxel(tp); float ftdt = volume.interpolateVoxel(next); float ts = tmin + tstep * (steps - ft / (ftdt - ft)); @@ -665,7 +665,7 @@ struct ColorRaycastInvoker : ParallelLoopBody // avoid division by zero if (!cvIsNaN(ts) && !cvIsInf(ts)) { - v_float32x4 pv = (orig + dir * v_setall_f32(ts)); + v_float32x4 pv = (v_add(orig, v_mul(dir, v_setall_f32(ts)))); v_float32x4 nv = volume.getNormalVoxel(pv); v_float32x4 cv = volume.getColorVoxel(pv); @@ -675,9 +675,7 @@ struct ColorRaycastInvoker : ParallelLoopBody //convert pv and nv to camera space normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32()); // interpolation optimized a little - point = v_matmuladd(pv * v_float32x4(volume.voxelSize, - volume.voxelSize, - volume.voxelSize, 1.f), + point = v_matmuladd(v_mul(pv, v_float32x4(this->volume.voxelSize, this->volume.voxelSize, this->volume.voxelSize, 1.F)), volRot0, volRot1, volRot2, volTrans); } } diff --git a/modules/rgbd/src/fast_icp.cpp b/modules/rgbd/src/fast_icp.cpp index 07eb84d19fd..5f3ab3f0a19 100644 --- a/modules/rgbd/src/fast_icp.cpp +++ b/modules/rgbd/src/fast_icp.cpp @@ -138,7 +138,7 @@ bool ICPImpl::estimateTransformT(cv::Affine3f& transform, #if USE_INTRINSICS static inline bool fastCheck(const v_float32x4& p0, const v_float32x4& p1) { - float check = (p0.get0() + 
p1.get0()); + float check = (v_get0(p0) + v_get0(p1)); return !cvIsNaN(check); } @@ -160,7 +160,7 @@ static inline v_float32x4 crossProduct(const v_float32x4& a, const v_float32x4& v_float32x4 ayzx, azxy, byzx, bzxy; getCrossPerm(a, ayzx, azxy); getCrossPerm(b, byzx, bzxy); - return ayzx*bzxy - azxy*byzx; + return v_sub(v_mul(ayzx, bzxy), v_mul(azxy, byzx)); } #else static inline bool fastCheck(const Point3f& p) @@ -235,11 +235,11 @@ struct GetAbInvoker : ParallelLoopBody //find correspondence by projecting the point v_float32x4 oldCoords; - float pz = (v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP))).get0()); + float pz = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(newP)))); // x, y, 0, 0 - oldCoords = v_muladd(newP/v_setall_f32(pz), vfxy, vcxy); + oldCoords = v_muladd(v_div(newP, v_setall_f32(pz)), vfxy, vcxy); - if(!v_check_all((oldCoords >= v_setzero_f32()) & (oldCoords < vframe))) + if(!v_check_all(v_and(v_ge(oldCoords, v_setzero_f32()), v_lt(oldCoords, vframe)))) continue; // bilinearly interpolate oldPts and oldNrm under oldCoords point @@ -247,12 +247,12 @@ struct GetAbInvoker : ParallelLoopBody v_float32x4 oldN; { v_int32x4 ixy = v_floor(oldCoords); - v_float32x4 txy = oldCoords - v_cvt_f32(ixy); - int xi = ixy.get0(); - int yi = v_rotate_right<1>(ixy).get0(); - v_float32x4 tx = v_setall_f32(txy.get0()); + v_float32x4 txy = v_sub(oldCoords, v_cvt_f32(ixy)); + int xi = v_get0(ixy); + int yi = v_get0(v_rotate_right<1>(ixy)); + v_float32x4 tx = v_setall_f32(v_get0(txy)); txy = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(txy))); - v_float32x4 ty = v_setall_f32(txy.get0()); + v_float32x4 ty = v_setall_f32(v_get0(txy)); const float* prow0 = (const float*)oldPts[yi+0]; const float* prow1 = (const float*)oldPts[yi+1]; @@ -275,23 +275,23 @@ struct GetAbInvoker : ParallelLoopBody // NaN check is done later - v_float32x4 p0 = p00 + tx*(p01 - p00); - v_float32x4 p1 = p10 + tx*(p11 - p10); - oldP = p0 + ty*(p1 - p0); + v_float32x4 p0 = v_add(p00, v_mul(tx, v_sub(p01, p00))); + v_float32x4 p1 = v_add(p10, v_mul(tx, v_sub(p11, p10))); + oldP = v_add(p0, v_mul(ty, v_sub(p1, p0))); - v_float32x4 n0 = n00 + tx*(n01 - n00); - v_float32x4 n1 = n10 + tx*(n11 - n10); - oldN = n0 + ty*(n1 - n0); + v_float32x4 n0 = v_add(n00, v_mul(tx, v_sub(n01, n00))); + v_float32x4 n1 = v_add(n10, v_mul(tx, v_sub(n11, n10))); + oldN = v_add(n0, v_mul(ty, v_sub(n1, n0))); } bool oldPNcheck = fastCheck(oldP, oldN); //filter by distance - v_float32x4 diff = newP - oldP; - bool distCheck = !(v_reduce_sum(diff*diff) > sqThresh); + v_float32x4 diff = v_sub(newP, oldP); + bool distCheck = !(v_reduce_sum(v_mul(diff, diff)) > sqThresh); //filter by angle - bool angleCheck = !(abs(v_reduce_sum(newN*oldN)) < cosThresh); + bool angleCheck = !(abs(v_reduce_sum(v_mul(newN, oldN))) < cosThresh); if(!(oldPNcheck && distCheck && angleCheck)) continue; @@ -299,17 +299,17 @@ struct GetAbInvoker : ParallelLoopBody // build point-wise vector ab = [ A | b ] v_float32x4 VxNv = crossProduct(newP, oldN); Point3f VxN; - VxN.x = VxNv.get0(); - VxN.y = v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0(); - VxN.z = v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32())).get0(); + VxN.x = v_get0(VxNv); + VxN.y = v_get0(v_reinterpret_as_f32(v_extract<1>(v_reinterpret_as_u32(VxNv), v_setzero_u32()))); + VxN.z = v_get0(v_reinterpret_as_f32(v_extract<2>(v_reinterpret_as_u32(VxNv), v_setzero_u32()))); - float dotp = 
-v_reduce_sum(oldN*diff); + float dotp = -v_reduce_sum(v_mul(oldN, diff)); // build point-wise upper-triangle matrix [ab^T * ab] w/o last row // which is [A^T*A | A^T*b] // and gather sum - v_float32x4 vd = VxNv | v_float32x4(0, 0, 0, dotp); + v_float32x4 vd = v_or(VxNv, v_float32x4(0, 0, 0, dotp)); v_float32x4 n = oldN; v_float32x4 nyzx; { diff --git a/modules/rgbd/src/hash_tsdf.cpp b/modules/rgbd/src/hash_tsdf.cpp index 9cf05e55691..e47af731dba 100644 --- a/modules/rgbd/src/hash_tsdf.cpp +++ b/modules/rgbd/src/hash_tsdf.cpp @@ -425,15 +425,15 @@ inline float interpolate(float tx, float ty, float tz, float vx[8]) v_float32x4 v0246, v1357; v_load_deinterleave(vx, v0246, v1357); - v_float32x4 vxx = v0246 + v_setall_f32(tz) * (v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty) * (v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx * (v1 - v0); } @@ -598,9 +598,9 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const v_float32x8 czp = v_lut(vals, idxzp); v_float32x8 czn = v_lut(vals, idxzn); - v_float32x8 vcxv = cxn - cxp; - v_float32x8 vcyv = cyn - cyp; - v_float32x8 vczv = czn - czp; + v_float32x8 vcxv = v_sub(cxn, cxp); + v_float32x8 vcyv = v_sub(cyn, cyp); + v_float32x8 vczv = v_sub(czn, czp); v_store(cxv, vcxv); v_store(cyv, vcyv); @@ -615,9 +615,9 @@ Point3f HashTSDFVolumeCPU::getNormalVoxel(const Point3f &point) const v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4); v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4); - v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1; - v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1; - v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1; + v_float32x4 cxv0 = v_sub(cxn0, cxp0); v_float32x4 cxv1 = v_sub(cxn1, cxp1); + v_float32x4 cyv0 = v_sub(cyn0, cyp0); v_float32x4 cyv1 = v_sub(cyn1, cyp1); + v_float32x4 czv0 = v_sub(czn0, czp0); v_float32x4 czv1 = v_sub(czn1, czp1); v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1); v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1); @@ -1523,9 +1523,9 @@ Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const v_float32x8 czp = v_lut(vals, idxzp); v_float32x8 czn = v_lut(vals, idxzn); - v_float32x8 vcxv = cxn - cxp; - v_float32x8 vcyv = cyn - cyp; - v_float32x8 vczv = czn - czp; + v_float32x8 vcxv = v_sub(cxn, cxp); + v_float32x8 vcyv = v_sub(cyn, cyp); + v_float32x8 vczv = v_sub(czn, czp); v_store(cxv, vcxv); v_store(cyv, vcyv); @@ -1540,9 +1540,9 @@ Point3f HashTSDFVolumeGPU::getNormalVoxel(const Point3f& point) const v_float32x4 czp0 = v_lut(vals, idxzp + 0); v_float32x4 czp1 = v_lut(vals, idxzp + 4); v_float32x4 czn0 = v_lut(vals, idxzn + 0); v_float32x4 czn1 = v_lut(vals, idxzn + 4); - v_float32x4 cxv0 = cxn0 - cxp0; v_float32x4 cxv1 = cxn1 - cxp1; - v_float32x4 cyv0 = cyn0 - cyp0; v_float32x4 cyv1 = cyn1 - cyp1; - v_float32x4 czv0 = czn0 - czp0; v_float32x4 czv1 = czn1 - czp1; + v_float32x4 cxv0 = v_sub(cxn0, cxp0); v_float32x4 cxv1 = v_sub(cxn1, cxp1); + v_float32x4 cyv0 = v_sub(cyn0, cyp0); v_float32x4 cyv1 = v_sub(cyn1, cyp1); + 
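For reference, the trilinear interpolation that the colored_tsdf, tsdf and hash_tsdf hunks keep rewriting is essentially the interpolate() helper shown above, reproduced here as a standalone sketch in the new function-style API (assumes an OpenCV 4.x build with CV_SIMD128; the eight corner values are laid out with z varying fastest, as in the volume code):

#include <opencv2/core/hal/intrin.hpp>

#if CV_SIMD128
using namespace cv;

static float trilinear(float tx, float ty, float tz, const float vx[8])
{
    v_float32x4 v0246, v1357;
    v_load_deinterleave(vx, v0246, v1357);      // even lanes: z = 0 corners, odd lanes: z = 1 corners

    // collapse along z: v0 + tz * (v1 - v0), four (x, y) cells at once
    v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246)));

    // collapse along y, pairing lanes with a rotation instead of a shuffle
    v_float32x4 v00_10 = vxx;
    v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx)));
    v_float32x4 v0_1   = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10)));

    // collapse along x on the two remaining lanes, extracted with v_get0
    float v0 = v_get0(v0_1);
    v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1)));
    float v1 = v_get0(v0_1);
    return v0 + tx * (v1 - v0);
}
#endif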
v_float32x4 czv0 = v_sub(czn0, czp0); v_float32x4 czv1 = v_sub(czn1, czp1); v_store(cxv + 0, cxv0); v_store(cxv + 4, cxv1); v_store(cyv + 0, cyv0); v_store(cyv + 4, cyv1); diff --git a/modules/rgbd/src/tsdf.cpp b/modules/rgbd/src/tsdf.cpp index 7b76985eb43..73a39a65929 100644 --- a/modules/rgbd/src/tsdf.cpp +++ b/modules/rgbd/src/tsdf.cpp @@ -146,21 +146,21 @@ inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const { // tx, ty, tz = floor(p) v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const TsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int iz = v_get0(ip); int coordBase = ix*xdim + iy*ydim + iz*zdim; @@ -170,15 +170,15 @@ inline float TSDFVolumeCPU::interpolateVoxel(const v_float32x4& p) const v_float32x4 v0246 = tsdfToFloat_INTR(v_int32x4(vx[0], vx[2], vx[4], vx[6])); v_float32x4 v1357 = tsdfToFloat_INTR(v_int32x4(vx[1], vx[3], vx[5], vx[7])); - v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); return v0 + tx*(v1 - v0); } @@ -228,27 +228,27 @@ inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& _p) const inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const { - if(v_check_any (p < v_float32x4(1.f, 1.f, 1.f, 0.f)) || - v_check_any (p >= v_float32x4((float)(volResolution.x-2), + if(v_check_any (v_lt(p, v_float32x4(1.f, 1.f, 1.f, 0.f))) || + v_check_any (v_ge(p, v_float32x4((float)(volResolution.x-2), (float)(volResolution.y-2), - (float)(volResolution.z-2), 1.f)) + (float)(volResolution.z-2), 1.f))) ) return nanv; v_int32x4 ip = v_floor(p); - v_float32x4 t = p - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(p, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float ty = t.get0(); + float ty = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - float tz = t.get0(); + float tz = v_get0(t); const int xdim = volDims[0], ydim = volDims[1], zdim = volDims[2]; const TsdfVoxel* volData = volume.ptr(); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coordBase = ix*xdim + iy*ydim + iz*zdim; @@ -266,23 +266,23 @@ inline v_float32x4 TSDFVolumeCPU::getNormalVoxel(const v_float32x4& p) const v_float32x4 v0246 (vx[0], vx[2], vx[4], vx[6]); v_float32x4 v1357 (vx[1], vx[3], 
vx[5], vx[7]); - v_float32x4 vxx = v0246 + v_setall_f32(tz)*(v1357 - v0246); + v_float32x4 vxx = v_add(v0246, v_mul(v_setall_f32(tz), v_sub(v1357, v0246))); v_float32x4 v00_10 = vxx; v_float32x4 v01_11 = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vxx))); - v_float32x4 v0_1 = v00_10 + v_setall_f32(ty)*(v01_11 - v00_10); - float v0 = v0_1.get0(); + v_float32x4 v0_1 = v_add(v00_10, v_mul(v_setall_f32(ty), v_sub(v01_11, v00_10))); + float v0 = v_get0(v0_1); v0_1 = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(v0_1))); - float v1 = v0_1.get0(); + float v1 = v_get0(v0_1); nv = v0 + tx*(v1 - v0); } v_float32x4 n = v_load_aligned(an); - v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(n*n))); + v_float32x4 Norm = v_sqrt(v_setall_f32(v_reduce_sum(v_mul(n, n)))); - return Norm.get0() < 0.0001f ? nanv : n/Norm; + return v_get0(Norm) < 0.0001f ? nanv : v_div(n, Norm); } #else inline Point3f TSDFVolumeCPU::getNormalVoxel(const Point3f& p) const @@ -394,21 +394,21 @@ struct RaycastInvoker : ParallelLoopBody // get direction through pixel in volume space: // 1. reproject (x, y) on projecting plane where z = 1.f - v_float32x4 planed = (v_float32x4((float)x, (float)y, 0.f, 0.f) - vcxy)*vfxy; + v_float32x4 planed = v_mul(v_sub(v_float32x4((float)x, (float)y, 0.F, 0.F), vcxy), vfxy); planed = v_combine_low(planed, v_float32x4(1.f, 0.f, 0.f, 0.f)); // 2. rotate to volume space planed = v_matmuladd(planed, camRot0, camRot1, camRot2, v_setzero_f32()); // 3. normalize - v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(planed*planed))); - v_float32x4 dir = planed*invNorm; + v_float32x4 invNorm = v_invsqrt(v_setall_f32(v_reduce_sum(v_mul(planed, planed)))); + v_float32x4 dir = v_mul(planed, invNorm); // compute intersection of ray with all six bbox planes - v_float32x4 rayinv = v_setall_f32(1.f)/dir; + v_float32x4 rayinv = v_div(v_setall_f32(1.F), dir); // div by zero should be eliminated by these products - v_float32x4 tbottom = rayinv*(boxDown - orig); - v_float32x4 ttop = rayinv*(boxUp - orig); + v_float32x4 tbottom = v_mul(rayinv, v_sub(boxDown, orig)); + v_float32x4 ttop = v_mul(rayinv, v_sub(boxUp, orig)); // re-order intersections to find smallest and largest on each axis v_float32x4 minAx = v_min(ttop, tbottom); @@ -429,14 +429,14 @@ struct RaycastInvoker : ParallelLoopBody if(tmin < tmax) { // interpolation optimized a little - orig *= invVoxelSize; - dir *= invVoxelSize; + orig = v_mul(orig, invVoxelSize); + dir = v_mul(dir, invVoxelSize); int xdim = volume.volDims[0]; int ydim = volume.volDims[1]; int zdim = volume.volDims[2]; - v_float32x4 rayStep = dir * v_setall_f32(tstep); - v_float32x4 next = (orig + dir * v_setall_f32(tmin)); + v_float32x4 rayStep = v_mul(dir, v_setall_f32(this->tstep)); + v_float32x4 next = (v_add(orig, v_mul(dir, v_setall_f32(tmin)))); float f = volume.interpolateVoxel(next), fnext = f; //raymarch @@ -444,11 +444,11 @@ struct RaycastInvoker : ParallelLoopBody int nSteps = cvFloor((tmax - tmin)/tstep); for(; steps < nSteps; steps++) { - next += rayStep; + next = v_add(next, rayStep); v_int32x4 ip = v_round(next); - int ix = ip.get0(); ip = v_rotate_right<1>(ip); - int iy = ip.get0(); ip = v_rotate_right<1>(ip); - int iz = ip.get0(); + int ix = v_get0(ip); ip = v_rotate_right<1>(ip); + int iy = v_get0(ip); ip = v_rotate_right<1>(ip); + int iz = v_get0(ip); int coord = ix*xdim + iy*ydim + iz*zdim; fnext = tsdfToFloat(volume.volume.at(coord).tsdf); @@ -468,7 +468,7 @@ struct RaycastInvoker : ParallelLoopBody // linearly interpolate t 
between two f values if(f > 0.f && fnext < 0.f) { - v_float32x4 tp = next - rayStep; + v_float32x4 tp = v_sub(next, rayStep); float ft = volume.interpolateVoxel(tp); float ftdt = volume.interpolateVoxel(next); float ts = tmin + tstep*(steps - ft/(ftdt - ft)); @@ -476,7 +476,7 @@ struct RaycastInvoker : ParallelLoopBody // avoid division by zero if(!cvIsNaN(ts) && !cvIsInf(ts)) { - v_float32x4 pv = (orig + dir*v_setall_f32(ts)); + v_float32x4 pv = (v_add(orig, v_mul(dir, v_setall_f32(ts)))); v_float32x4 nv = volume.getNormalVoxel(pv); if(!isNaN(nv)) @@ -484,9 +484,7 @@ struct RaycastInvoker : ParallelLoopBody //convert pv and nv to camera space normal = v_matmuladd(nv, volRot0, volRot1, volRot2, v_setzero_f32()); // interpolation optimized a little - point = v_matmuladd(pv*v_float32x4(volume.voxelSize, - volume.voxelSize, - volume.voxelSize, 1.f), + point = v_matmuladd(v_mul(pv, v_float32x4(this->volume.voxelSize, this->volume.voxelSize, this->volume.voxelSize, 1.F)), volRot0, volRot1, volRot2, volTrans); } } diff --git a/modules/rgbd/src/tsdf_functions.cpp b/modules/rgbd/src/tsdf_functions.cpp index 3f8bc26f011..09ca986a1bf 100644 --- a/modules/rgbd/src/tsdf_functions.cpp +++ b/modules/rgbd/src/tsdf_functions.cpp @@ -235,35 +235,33 @@ void integrateVolumeUnit( // optimization of the following: //Point3f volPt = Point3f(x, y, z)*voxelSize; //Point3f camSpacePt = vol2cam * volPt; - camSpacePt += zStep; + camSpacePt = v_add(camSpacePt, zStep); - float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0(); + float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt)))); if (zCamSpace <= 0.f) continue; - v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace); + v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace)); v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy); // leave only first 2 lanes - projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & - v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); + projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(4294967295U, 4294967295U, 0, 0))); depthType v; // bilinearly interpolate depth at projected { const v_float32x4& pt = projected; // check coords >= 0 and < imgSize - v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) | - v_reinterpret_as_u32(pt >= upLimits); - limits = limits | v_rotate_right<1>(limits); - if (limits.get0()) + v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(pt, v_setzero_f32())), v_reinterpret_as_u32(v_ge(pt, upLimits))); + limits = v_or(limits, v_rotate_right<1>(limits)); + if (v_get0(limits)) continue; // xi, yi = floor(pt) v_int32x4 ip = v_floor(pt); v_int32x4 ipshift = ip; - int xi = ipshift.get0(); + int xi = v_get0(ipshift); ipshift = v_rotate_right<1>(ipshift); - int yi = ipshift.get0(); + int yi = v_get0(ipshift); const depthType* row0 = depth[yi + 0]; const depthType* row1 = depth[yi + 1]; @@ -277,17 +275,17 @@ void integrateVolumeUnit( // assume correct depth is positive // don't fix missing data - if (v_check_all(vall > v_setzero_f32())) + if (v_check_all(v_gt(vall, v_setzero_f32()))) { - v_float32x4 t = pt - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(pt, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - v_float32x4 ty = v_setall_f32(t.get0()); + v_float32x4 ty = v_setall_f32(v_get0(t)); // vx is y-interpolated between rows 0 and 1 - v_float32x4 vx = v001 + ty * (v101 - v001); - float v0 
= vx.get0(); + v_float32x4 vx = v_add(v001, v_mul(ty, v_sub(v101, v001))); + float v0 = v_get0(vx); vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx))); - float v1 = vx.get0(); + float v1 = v_get0(vx); v = v0 + tx * (v1 - v0); } else @@ -295,8 +293,8 @@ void integrateVolumeUnit( } // norm(camPixVec) produces double which is too slow - int _u = (int)projected.get0(); - int _v = (int)v_rotate_right<1>(projected).get0(); + int _u = (int)v_get0(projected); + int _v = (int)v_get0(v_rotate_right<1>(projected)); if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows)) continue; float pixNorm = pixNorms.at(_v, _u); @@ -500,35 +498,33 @@ void integrateRGBVolumeUnit( // optimization of the following: //Point3f volPt = Point3f(x, y, z)*voxelSize; //Point3f camSpacePt = vol2cam * volPt; - camSpacePt += zStep; + camSpacePt = v_add(camSpacePt, zStep); - float zCamSpace = v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt))).get0(); + float zCamSpace = v_get0(v_reinterpret_as_f32(v_rotate_right<2>(v_reinterpret_as_u32(camSpacePt)))); if (zCamSpace <= 0.f) continue; - v_float32x4 camPixVec = camSpacePt / v_setall_f32(zCamSpace); + v_float32x4 camPixVec = v_div(camSpacePt, v_setall_f32(zCamSpace)); v_float32x4 projected = v_muladd(camPixVec, vfxy, vcxy); // leave only first 2 lanes - projected = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & - v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); + projected = v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(4294967295U, 4294967295U, 0, 0))); depthType v; // bilinearly interpolate depth at projected { const v_float32x4& pt = projected; // check coords >= 0 and < imgSize - v_uint32x4 limits = v_reinterpret_as_u32(pt < v_setzero_f32()) | - v_reinterpret_as_u32(pt >= upLimits); - limits = limits | v_rotate_right<1>(limits); - if (limits.get0()) + v_uint32x4 limits = v_or(v_reinterpret_as_u32(v_lt(pt, v_setzero_f32())), v_reinterpret_as_u32(v_ge(pt, upLimits))); + limits = v_or(limits,v_rotate_right<1>(limits)); + if (v_get0(limits)) continue; // xi, yi = floor(pt) v_int32x4 ip = v_floor(pt); v_int32x4 ipshift = ip; - int xi = ipshift.get0(); + int xi = v_get0(ipshift); ipshift = v_rotate_right<1>(ipshift); - int yi = ipshift.get0(); + int yi = v_get0(ipshift); const depthType* row0 = depth[yi + 0]; const depthType* row1 = depth[yi + 1]; @@ -542,17 +538,17 @@ void integrateRGBVolumeUnit( // assume correct depth is positive // don't fix missing data - if (v_check_all(vall > v_setzero_f32())) + if (v_check_all(v_gt(vall, v_setzero_f32()))) { - v_float32x4 t = pt - v_cvt_f32(ip); - float tx = t.get0(); + v_float32x4 t = v_sub(pt, v_cvt_f32(ip)); + float tx = v_get0(t); t = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(t))); - v_float32x4 ty = v_setall_f32(t.get0()); + v_float32x4 ty = v_setall_f32(v_get0(t)); // vx is y-interpolated between rows 0 and 1 - v_float32x4 vx = v001 + ty * (v101 - v001); - float v0 = vx.get0(); + v_float32x4 vx = v_add(v001, v_mul(ty, v_sub(v101, v001))); + float v0 = v_get0(vx); vx = v_reinterpret_as_f32(v_rotate_right<1>(v_reinterpret_as_u32(vx))); - float v1 = vx.get0(); + float v1 = v_get0(vx); v = v0 + tx * (v1 - v0); } else @@ -561,14 +557,13 @@ void integrateRGBVolumeUnit( v_float32x4 projectedRGB = v_muladd(camPixVec, rgb_vfxy, rgb_vcxy); // leave only first 2 lanes - projectedRGB = v_reinterpret_as_f32(v_reinterpret_as_u32(projected) & - v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0)); + projectedRGB = 
v_reinterpret_as_f32(v_and(v_reinterpret_as_u32(projected), v_uint32x4(0xFFFFFFFF, 0xFFFFFFFF, 0, 0))); // norm(camPixVec) produces double which is too slow - int _u = (int)projected.get0(); - int _v = (int)v_rotate_right<1>(projected).get0(); - int rgb_u = (int)projectedRGB.get0(); - int rgb_v = (int)v_rotate_right<1>(projectedRGB).get0(); + int _u = (int)v_get0(projected); + int _v = (int)v_get0(v_rotate_right<1>(projected)); + int rgb_u = (int)v_get0(projectedRGB); + int rgb_v = (int)v_get0(v_rotate_right<1>(projectedRGB)); if (!(_u >= 0 && _u < depth.cols && _v >= 0 && _v < depth.rows && rgb_v >= 0 && rgb_v < color.rows && rgb_u >= 0 && rgb_u < color.cols)) diff --git a/modules/rgbd/src/tsdf_functions.hpp b/modules/rgbd/src/tsdf_functions.hpp index c763f9275e6..6038ab60c05 100644 --- a/modules/rgbd/src/tsdf_functions.hpp +++ b/modules/rgbd/src/tsdf_functions.hpp @@ -20,7 +20,7 @@ namespace kinfu inline v_float32x4 tsdfToFloat_INTR(const v_int32x4& num) { v_float32x4 num128 = v_setall_f32(-1.f / 128.f); - return v_cvt_f32(num) * num128; + return v_mul(v_cvt_f32(num), num128); } #endif diff --git a/modules/rgbd/src/utils.hpp b/modules/rgbd/src/utils.hpp index 0b963675390..2bb69713d6a 100644 --- a/modules/rgbd/src/utils.hpp +++ b/modules/rgbd/src/utils.hpp @@ -68,7 +68,7 @@ inline bool isNaN(cv::Point3f p) #if USE_INTRINSICS static inline bool isNaN(const cv::v_float32x4& p) { - return cv::v_check_any(p != p); + return cv::v_check_any(v_ne(p, p)); } #endif diff --git a/modules/ximgproc/src/anisodiff.cpp b/modules/ximgproc/src/anisodiff.cpp index 996b4ac5b77..2b230a71242 100644 --- a/modules/ximgproc/src/anisodiff.cpp +++ b/modules/ximgproc/src/anisodiff.cpp @@ -74,8 +74,8 @@ inline v_uint8x16 v_finalize_pix_ch(const v_int16x8& c0, const v_int16x8& c1, v_expand_f32(c0, f0, f1); v_expand_f32(c1, f2, f3); - v_int16x8 d0 = v_pack(v_round(s0*alpha + f0), v_round(s1*alpha + f1)); - v_int16x8 d1 = v_pack(v_round(s2*alpha + f2), v_round(s3*alpha + f3)); + v_int16x8 d0 = v_pack(v_round(v_add(v_mul(s0, alpha), f0)), v_round(v_add(v_mul(s1, alpha), f1))); + v_int16x8 d1 = v_pack(v_round(v_add(v_mul(s2, alpha), f2)), v_round(v_add(v_mul(s3, alpha), f3))); return v_pack_u(d0, d1); } @@ -135,12 +135,12 @@ class ADBody : public ParallelLoopBody v_expand_s(p1, p10, p11); v_expand_s(p2, p20, p21); - v_int16x8 d00 = p00 - c00, d01 = p01 - c01; - v_int16x8 d10 = p10 - c10, d11 = p11 - c11; - v_int16x8 d20 = p20 - c20, d21 = p21 - c21; + v_int16x8 d00 = v_sub(p00, c00), d01 = v_sub(p01, c01); + v_int16x8 d10 = v_sub(p10, c10), d11 = v_sub(p11, c11); + v_int16x8 d20 = v_sub(p20, c20), d21 = v_sub(p21, c21); - v_uint16x8 n0 = v_abs(d00) + v_abs(d10) + v_abs(d20); - v_uint16x8 n1 = v_abs(d01) + v_abs(d11) + v_abs(d21); + v_uint16x8 n0 = v_add(v_add(v_abs(d00), v_abs(d10)), v_abs(d20)); + v_uint16x8 n1 = v_add(v_add(v_abs(d01), v_abs(d11)), v_abs(d21)); ushort CV_DECL_ALIGNED(16) nbuf[16]; v_store(nbuf, n0); @@ -153,13 +153,13 @@ class ADBody : public ParallelLoopBody v_expand_f32(d00, fd0, fd1); v_expand_f32(d01, fd2, fd3); - s00 += fd0*w0; s01 += fd1*w1; s02 += fd2*w2; s03 += fd3*w3; + s00 = v_add(s00, v_mul(fd0, w0)); s01 = v_add(s01, v_mul(fd1, w1)); s02 = v_add(s02, v_mul(fd2, w2)); s03 = v_add(s03, v_mul(fd3, w3)); v_expand_f32(d10, fd0, fd1); v_expand_f32(d11, fd2, fd3); - s10 += fd0*w0; s11 += fd1*w1; s12 += fd2*w2; s13 += fd3*w3; + s10 = v_add(s10, v_mul(fd0, w0)); s11 = v_add(s11, v_mul(fd1, w1)); s12 = v_add(s12, v_mul(fd2, w2)); s13 = v_add(s13, v_mul(fd3, w3)); v_expand_f32(d20, fd0, fd1); 
v_expand_f32(d21, fd2, fd3); - s20 += fd0*w0; s21 += fd1*w1; s22 += fd2*w2; s23 += fd3*w3; + s20 = v_add(s20, v_mul(fd0, w0)); s21 = v_add(s21, v_mul(fd1, w1)); s22 = v_add(s22, v_mul(fd2, w2)); s23 = v_add(s23, v_mul(fd3, w3)); } c0 = v_finalize_pix_ch(c00, c01, s00, s01, s02, s03, v_alpha); diff --git a/modules/ximgproc/src/fgs_filter.cpp b/modules/ximgproc/src/fgs_filter.cpp index 5e168da5dad..804e9f00a02 100644 --- a/modules/ximgproc/src/fgs_filter.cpp +++ b/modules/ximgproc/src/fgs_filter.cpp @@ -303,15 +303,15 @@ void FastGlobalSmootherFilterImpl::process_4row_block(Mat* cur,int i) v_float32x4 aux0,aux1,aux2,aux3; #define PROC4(Chor_in,cur_in,coef_prev_in,interD_prev_in,cur_prev_in,interD_out,cur_out,coef_cur_out)\ - coef_cur_out = lambda_reg*Chor_in;\ - aux0 = interD_prev_in*coef_prev_in;\ - aux1 = coef_cur_out+coef_prev_in;\ - aux1 = one_reg-aux1;\ - aux0 = aux1-aux0;\ - interD_out = coef_cur_out/aux0;\ - aux1 = cur_prev_in*coef_prev_in;\ - aux1 = cur_in - aux1;\ - cur_out = aux1/aux0; + coef_cur_out = v_mul(lambda_reg, Chor_in);\ + aux0 = v_mul(interD_prev_in, coef_prev_in);\ + aux1 = v_add(coef_cur_out, coef_prev_in);\ + aux1 = v_sub(one_reg, aux1);\ + aux0 = v_sub(aux1, aux0);\ + interD_out = v_div(coef_cur_out, aux0);\ + aux1 = v_mul(cur_prev_in, coef_prev_in);\ + aux1 = v_sub(cur_in, aux1);\ + cur_out = v_div(aux1, aux0); for(;j v_mul_wrap(v_thresh, v_max1)); - v_m2 = ~(v_mul_wrap(v_max2 - v_min2, v_255) > v_mul_wrap(v_thresh, v_max2)); + v_m1 = v_not(v_gt(v_mul_wrap(v_sub(v_max1, v_min1), v_255), v_mul_wrap(v_thresh, v_max1))); + v_m2 = v_not(v_gt(v_mul_wrap(v_sub(v_max2, v_min2), v_255), v_mul_wrap(v_thresh, v_max2))); // Apply masks - v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2); - v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2); - v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2); + v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2)); + v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2)); + v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2)); // Split and add to the sums: v_expand(v_iB1, v_uint1, v_uint2); - v_SB += v_uint1 + v_uint2; + v_SB = v_add(v_SB, v_add(v_uint1, v_uint2)); v_expand(v_iG1, v_uint1, v_uint2); - v_SG += v_uint1 + v_uint2; + v_SG = v_add(v_SG, v_add(v_uint1, v_uint2)); v_expand(v_iR1, v_uint1, v_uint2); - v_SR += v_uint1 + v_uint2; + v_SR = v_add(v_SR, v_add(v_uint1, v_uint2)); } sumB = v_reduce_sum(v_SB); @@ -197,21 +197,21 @@ void calculateChannelSums(uint64 &sumB, uint64 &sumG, uint64 &sumR, ushort *src_ v_expand(v_max_val, v_max1, v_max2); // Calculate masks - v_m1 = ~((v_max1 - v_min1) * v_65535 > v_thresh * v_max1); - v_m2 = ~((v_max2 - v_min2) * v_65535 > v_thresh * v_max2); + v_m1 = v_not(v_gt(v_mul(v_sub(v_max1, v_min1), v_65535), v_mul(v_thresh, v_max1))); + v_m2 = v_not(v_gt(v_mul(v_sub(v_max2, v_min2), v_65535), v_mul(v_thresh, v_max2))); // Apply masks - v_iB1 = (v_iB1 & v_m1) + (v_iB2 & v_m2); - v_iG1 = (v_iG1 & v_m1) + (v_iG2 & v_m2); - v_iR1 = (v_iR1 & v_m1) + (v_iR2 & v_m2); + v_iB1 = v_add(v_and(v_iB1, v_m1), v_and(v_iB2, v_m2)); + v_iG1 = v_add(v_and(v_iG1, v_m1), v_and(v_iG2, v_m2)); + v_iR1 = v_add(v_and(v_iR1, v_m1), v_and(v_iR2, v_m2)); // Split and add to the sums: v_expand(v_iB1, v_u64_1, v_u64_2); - v_SB += v_u64_1 + v_u64_2; + v_SB = v_add(v_SB, v_add(v_u64_1, v_u64_2)); v_expand(v_iG1, v_u64_1, v_u64_2); - v_SG += v_u64_1 + v_u64_2; + v_SG = v_add(v_SG, v_add(v_u64_1, v_u64_2)); v_expand(v_iR1, v_u64_1, v_u64_2); - v_SR += v_u64_1 + v_u64_2; + v_SR = v_add(v_SR, v_add(v_u64_1, v_u64_2)); } // Perform final reduction @@ -282,12 +282,12 @@ 
void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai v_expand(v_inR, v_sR1, v_sR2); // Multiply by gains - v_sB1 = v_mul_wrap(v_sB1, v_gainB) >> 8; - v_sB2 = v_mul_wrap(v_sB2, v_gainB) >> 8; - v_sG1 = v_mul_wrap(v_sG1, v_gainG) >> 8; - v_sG2 = v_mul_wrap(v_sG2, v_gainG) >> 8; - v_sR1 = v_mul_wrap(v_sR1, v_gainR) >> 8; - v_sR2 = v_mul_wrap(v_sR2, v_gainR) >> 8; + v_sB1 = v_shr(v_mul_wrap(v_sB1, v_gainB), 8); + v_sB2 = v_shr(v_mul_wrap(v_sB2, v_gainB), 8); + v_sG1 = v_shr(v_mul_wrap(v_sG1, v_gainG), 8); + v_sG2 = v_shr(v_mul_wrap(v_sG2, v_gainG), 8); + v_sR1 = v_shr(v_mul_wrap(v_sR1, v_gainR), 8); + v_sR2 = v_shr(v_mul_wrap(v_sR2, v_gainR), 8); // Pack into vectors of v_uint8x16 v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2)); @@ -325,12 +325,12 @@ void applyChannelGains(InputArray _src, OutputArray _dst, float gainB, float gai v_expand(v_inR, v_sR1, v_sR2); // Multiply by scaling factors - v_sB1 = (v_sB1 * v_gainB) >> 16; - v_sB2 = (v_sB2 * v_gainB) >> 16; - v_sG1 = (v_sG1 * v_gainG) >> 16; - v_sG2 = (v_sG2 * v_gainG) >> 16; - v_sR1 = (v_sR1 * v_gainR) >> 16; - v_sR2 = (v_sR2 * v_gainR) >> 16; + v_sB1 = v_shr(v_mul(v_sB1, v_gainB), 16); + v_sB2 = v_shr(v_mul(v_sB2, v_gainB), 16); + v_sG1 = v_shr(v_mul(v_sG1, v_gainG), 16); + v_sG2 = v_shr(v_mul(v_sG2, v_gainG), 16); + v_sR1 = v_shr(v_mul(v_sR1, v_gainR), 16); + v_sR2 = v_shr(v_mul(v_sR2, v_gainR), 16); // Pack into vectors of v_uint16x8 v_store_interleave(&dst_data[i], v_pack(v_sB1, v_sB2), v_pack(v_sG1, v_sG2), v_pack(v_sR1, v_sR2)); diff --git a/modules/xphoto/src/learning_based_color_balance.cpp b/modules/xphoto/src/learning_based_color_balance.cpp index bd408e6cb49..de1958dcc60 100644 --- a/modules/xphoto/src/learning_based_color_balance.cpp +++ b/modules/xphoto/src/learning_based_color_balance.cpp @@ -192,7 +192,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src) v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_local_max = v_max(v_inB, v_max(v_inG, v_inR)); v_global_max = v_max(v_local_max, v_global_max); - v_mask = (v_local_max < v_thresh); + v_mask = (v_lt(v_local_max, v_thresh)); v_store(mask_ptr + i, v_mask); } uchar global_max[16]; @@ -225,7 +225,7 @@ void LearningBasedWBImpl::preprocessing(Mat &src) v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_local_max = v_max(v_inB, v_max(v_inG, v_inR)); v_global_max = v_max(v_local_max, v_global_max); - v_mask = (v_local_max < v_thresh); + v_mask = (v_lt(v_local_max, v_thresh)); v_pack_store(mask_ptr + i, v_mask); } ushort global_max[8]; @@ -270,9 +270,9 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_uint8x16 v_mask = v_load(mask_ptr + i); - v_inB &= v_mask; - v_inG &= v_mask; - v_inR &= v_mask; + v_inB = v_and(v_inB, v_mask); + v_inG = v_and(v_inG, v_mask); + v_inR = v_and(v_inR, v_mask); v_uint16x8 v_sR1, v_sR2, v_sG1, v_sG2, v_sB1, v_sB2; v_expand(v_inB, v_sB1, v_sB2); @@ -280,33 +280,33 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_expand(v_inR, v_sR1, v_sR2); // update the brightest (R,G,B) tuple (process left half): - v_uint16x8 v_sum = v_sB1 + v_sG1 + v_sR1; - v_uint16x8 v_max_mask = (v_sum > v_max_sum); + v_uint16x8 v_sum = v_add(v_add(v_sB1, v_sG1), v_sR1); + v_uint16x8 v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_sB1 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_sG1 & 
v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_sR1 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_sB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_sG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_sR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update the brightest (R,G,B) tuple (process right half): - v_sum = v_sB2 + v_sG2 + v_sR2; - v_max_mask = (v_sum > v_max_sum); + v_sum = v_add(v_add(v_sB2, v_sG2), v_sR2); + v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_sB2 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_sG2 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_sR2 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_sB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_sG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_sR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update sums: - v_sB1 = v_sB1 + v_sB2; - v_sG1 = v_sG1 + v_sG2; - v_sR1 = v_sR1 + v_sR2; + v_sB1 = v_add(v_sB1, v_sB2); + v_sG1 = v_add(v_sG1, v_sG2); + v_sR1 = v_add(v_sR1, v_sR2); v_uint32x4 v_uint1, v_uint2; v_expand(v_sB1, v_uint1, v_uint2); - v_SB += v_uint1 + v_uint2; + v_SB = v_add(v_SB, v_add(v_uint1, v_uint2)); v_expand(v_sG1, v_uint1, v_uint2); - v_SG += v_uint1 + v_uint2; + v_SG = v_add(v_SG, v_add(v_uint1, v_uint2)); v_expand(v_sR1, v_uint1, v_uint2); - v_SR += v_uint1 + v_uint2; + v_SR = v_add(v_SR, v_add(v_uint1, v_uint2)); } sumB = v_reduce_sum(v_SB); sumG = v_reduce_sum(v_SG); @@ -361,11 +361,11 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_uint16x8 v_inB, v_inG, v_inR; v_load_deinterleave(src_ptr + 3 * i, v_inB, v_inG, v_inR); v_uint16x8 v_mask = v_load_expand(mask_ptr + i); - v_mask = v_mask | ((v_mask & v_mask_lower) << 8); + v_mask = v_or(v_mask, v_shl<8>(v_and(v_mask, v_mask_lower))); - v_inB &= v_mask; - v_inG &= v_mask; - v_inR &= v_mask; + v_inB = v_and(v_inB, v_mask); + v_inG = v_and(v_inG, v_mask); + v_inR = v_and(v_inR, v_mask); v_uint32x4 v_iR1, v_iR2, v_iG1, v_iG2, v_iB1, v_iB2; v_expand(v_inB, v_iB1, v_iB2); @@ -373,32 +373,32 @@ void LearningBasedWBImpl::getAverageAndBrightestColorChromaticity(Vec2f &average v_expand(v_inR, v_iR1, v_iR2); // update the brightest (R,G,B) tuple (process left half): - v_uint32x4 v_sum = v_iB1 + v_iG1 + v_iR1; - v_uint32x4 v_max_mask = (v_sum > v_max_sum); + v_uint32x4 v_sum = v_add(v_add(v_iB1, v_iG1), v_iR1); + v_uint32x4 v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_iB1 & v_max_mask) + (v_brightestB & (~v_max_mask)); - v_brightestG = (v_iG1 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_iR1 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_iB1, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_iG1, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_iR1, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update the brightest (R,G,B) tuple (process right half): - v_sum = v_iB2 + v_iG2 + v_iR2; - v_max_mask = (v_sum > v_max_sum); + v_sum = v_add(v_add(v_iB2, v_iG2), v_iR2); + v_max_mask = (v_gt(v_sum, v_max_sum)); v_max_sum = v_max(v_sum, v_max_sum); - v_brightestB = (v_iB2 & v_max_mask) + (v_brightestB & 
(~v_max_mask)); - v_brightestG = (v_iG2 & v_max_mask) + (v_brightestG & (~v_max_mask)); - v_brightestR = (v_iR2 & v_max_mask) + (v_brightestR & (~v_max_mask)); + v_brightestB = v_add(v_and(v_iB2, v_max_mask), v_and(v_brightestB, v_not(v_max_mask))); + v_brightestG = v_add(v_and(v_iG2, v_max_mask), v_and(v_brightestG, v_not(v_max_mask))); + v_brightestR = v_add(v_and(v_iR2, v_max_mask), v_and(v_brightestR, v_not(v_max_mask))); // update sums: - v_iB1 = v_iB1 + v_iB2; - v_iG1 = v_iG1 + v_iG2; - v_iR1 = v_iR1 + v_iR2; + v_iB1 = v_add(v_iB1, v_iB2); + v_iG1 = v_add(v_iG1, v_iG2); + v_iR1 = v_add(v_iR1, v_iR2); v_uint64x2 v_uint64_1, v_uint64_2; v_expand(v_iB1, v_uint64_1, v_uint64_2); - v_SB += v_uint64_1 + v_uint64_2; + v_SB = v_add(v_SB, v_add(v_uint64_1, v_uint64_2)); v_expand(v_iG1, v_uint64_1, v_uint64_2); - v_SG += v_uint64_1 + v_uint64_2; + v_SG = v_add(v_SG, v_add(v_uint64_1, v_uint64_2)); v_expand(v_iR1, v_uint64_1, v_uint64_2); - v_SR += v_uint64_1 + v_uint64_2; + v_SR = v_add(v_SR, v_add(v_uint64_1, v_uint64_2)); } uint64 sum_arr[2]; v_store(sum_arr, v_SB); From 0bbfa6c3b21cf3c1872c6fa0152673e68820b0ff Mon Sep 17 00:00:00 2001 From: Maksim Shabunin Date: Tue, 17 Oct 2023 16:30:31 +0300 Subject: [PATCH 4/6] CI: fixed RISC-V workflow path --- .github/workflows/PR-4.x.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/PR-4.x.yaml b/.github/workflows/PR-4.x.yaml index f05e94bfa32..bd90f16ca28 100644 --- a/.github/workflows/PR-4.x.yaml +++ b/.github/workflows/PR-4.x.yaml @@ -28,4 +28,4 @@ jobs: uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-macOS-x86_64.yaml@main Linux-RISC-V-Clang: - uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-RISCV.yaml + uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-4.x-RISCV.yaml@main From f40bc0cb4cc6b015dc9fde6a18846ade0a8e42cd Mon Sep 17 00:00:00 2001 From: cudawarped <12133430+cudawarped@users.noreply.github.com> Date: Fri, 22 Sep 2023 11:30:21 +0300 Subject: [PATCH 5/6] cudacodec: VideoWriter add container output --- .../cudacodec/include/opencv2/cudacodec.hpp | 38 ++++----- modules/cudacodec/src/NvEncoder.cpp | 10 --- modules/cudacodec/src/NvEncoder.h | 11 +++ modules/cudacodec/src/video_writer.cpp | 82 +++++++++++++------ modules/cudacodec/test/test_video.cpp | 60 ++++++++++---- 5 files changed, 134 insertions(+), 67 deletions(-) diff --git a/modules/cudacodec/include/opencv2/cudacodec.hpp b/modules/cudacodec/include/opencv2/cudacodec.hpp index 42325c64613..d6421c2b8a4 100644 --- a/modules/cudacodec/include/opencv2/cudacodec.hpp +++ b/modules/cudacodec/include/opencv2/cudacodec.hpp @@ -184,18 +184,18 @@ struct CV_EXPORTS_W_SIMPLE EncoderParams public: CV_WRAP EncoderParams() : nvPreset(ENC_PRESET_P3), tuningInfo(ENC_TUNING_INFO_HIGH_QUALITY), encodingProfile(ENC_CODEC_PROFILE_AUTOSELECT), rateControlMode(ENC_PARAMS_RC_VBR), multiPassEncoding(ENC_MULTI_PASS_DISABLED), constQp({ 0,0,0 }), averageBitRate(0), maxBitRate(0), - targetQuality(30), gopLength(0) {}; - + targetQuality(30), gopLength(250), idrPeriod(250) {}; CV_PROP_RW EncodePreset nvPreset; CV_PROP_RW EncodeTuningInfo tuningInfo; CV_PROP_RW EncodeProfile encodingProfile; CV_PROP_RW EncodeParamsRcMode rateControlMode; CV_PROP_RW EncodeMultiPass multiPassEncoding; - CV_PROP_RW EncodeQp constQp; //!< QP's for ENC_PARAMS_RC_CONSTQP. - CV_PROP_RW int averageBitRate; //!< target bitrate for ENC_PARAMS_RC_VBR and ENC_PARAMS_RC_CBR. 
- CV_PROP_RW int maxBitRate; //!< upper bound on bitrate for ENC_PARAMS_RC_VBR and ENC_PARAMS_RC_CONSTQP. - CV_PROP_RW uint8_t targetQuality; //!< value 0 - 51 where video quality decreases as targetQuality increases, used with ENC_PARAMS_RC_VBR. - CV_PROP_RW int gopLength; + CV_PROP_RW EncodeQp constQp; //!< QP's for \ref ENC_PARAMS_RC_CONSTQP. + CV_PROP_RW int averageBitRate; //!< target bitrate for \ref ENC_PARAMS_RC_VBR and \ref ENC_PARAMS_RC_CBR. + CV_PROP_RW int maxBitRate; //!< upper bound on bitrate for \ref ENC_PARAMS_RC_VBR and \ref ENC_PARAMS_RC_CONSTQP. + CV_PROP_RW uint8_t targetQuality; //!< value 0 - 51 where video quality decreases as targetQuality increases, used with \ref ENC_PARAMS_RC_VBR. + CV_PROP_RW int gopLength; //!< the number of pictures in one GOP, ensuring \ref idrPeriod >= \ref gopLength. + CV_PROP_RW int idrPeriod; //!< IDR interval, ensuring \ref idrPeriod >= \ref gopLength. }; CV_EXPORTS bool operator==(const EncoderParams& lhs, const EncoderParams& rhs); @@ -209,7 +209,7 @@ class CV_EXPORTS_W EncoderCallback { @param vPacket The raw bitstream for one or more frames. */ - virtual void onEncoded(std::vector> vPacket) = 0; + virtual void onEncoded(const std::vector>& vPacket) = 0; /** @brief Callback function to that the encoding has finished. * */ @@ -218,14 +218,14 @@ class CV_EXPORTS_W EncoderCallback { virtual ~EncoderCallback() {} }; -/** @brief Video writer interface. +/** @brief Video writer interface, see createVideoWriter(). -Available when built with WITH_NVCUVENC=ON while Nvidia's Video Codec SDK is installed. +Available if Nvidia's Video Codec SDK is installed. -Encoding support is dependent on the GPU, refer to the Nvidia Video Codec SDK Video Encode and Decode GPU Support Matrix for details. +Only Codec::H264 and Codec::HEVC are supported with encoding support dependent on the GPU, refer to the Nvidia Video Codec SDK Video Encode and Decode GPU Support Matrix for details. @note - - An example on how to use the videoWriter class can be found at + - An example on how to use the VideoWriter class can be found at opencv_source_code/samples/gpu/video_writer.cpp */ class CV_EXPORTS_W VideoWriter @@ -253,9 +253,9 @@ class CV_EXPORTS_W VideoWriter /** @brief Creates video writer. -@param fileName Name of the output video file. Only raw h264 or hevc files are supported. +@param fileName Name of the output video file. @param frameSize Size of the input video frames. -@param codec Codec. +@param codec Supports Codec::H264 and Codec::HEVC. @param fps Framerate of the created video stream. @param colorFormat OpenCv color format of the frames to be encoded. @param encoderCallback Callbacks for video encoder. See cudacodec::EncoderCallback. Required for working with the encoded video stream. @@ -266,9 +266,9 @@ CV_EXPORTS_W Ptr createVideoWriter(const String& fileNam /** @brief Creates video writer. -@param fileName Name of the output video file. Only raw h264 or hevc files are supported. +@param fileName Name of the output video file. @param frameSize Size of the input video frames. -@param codec Codec. +@param codec Supports Codec::H264 and Codec::HEVC. @param fps Framerate of the created video stream. @param colorFormat OpenCv color format of the frames to be encoded. @param params Additional encoding parameters. @@ -361,14 +361,14 @@ enum class VideoReaderProps { #endif }; -/** @brief Video reader interface. +/** @brief Video reader interface, see createVideoReader(). -Available when built with WITH_NVCUVID=ON while Nvidia's Video Codec SDK is installed. 
+Available if Nvidia's Video Codec SDK is installed. Decoding support is dependent on the GPU, refer to the Nvidia Video Codec SDK Video Encode and Decode GPU Support Matrix for details. @note - - An example on how to use the videoReader class can be found at + - An example on how to use the VideoReader interface can be found at opencv_source_code/samples/gpu/video_reader.cpp */ class CV_EXPORTS_W VideoReader diff --git a/modules/cudacodec/src/NvEncoder.cpp b/modules/cudacodec/src/NvEncoder.cpp index 249f6f1c61e..aa9d2a67c17 100644 --- a/modules/cudacodec/src/NvEncoder.cpp +++ b/modules/cudacodec/src/NvEncoder.cpp @@ -7,16 +7,6 @@ #include "NvEncoder.h" namespace cv { namespace cudacodec { -#ifndef _WIN32 -#include -static inline bool operator==(const GUID& guid1, const GUID& guid2) { - return !memcmp(&guid1, &guid2, sizeof(GUID)); -} - -static inline bool operator!=(const GUID& guid1, const GUID& guid2) { - return !(guid1 == guid2); -} -#endif NvEncoder::NvEncoder(NV_ENC_DEVICE_TYPE eDeviceType, void* pDevice, uint32_t nWidth, uint32_t nHeight, NV_ENC_BUFFER_FORMAT eBufferFormat, uint32_t nExtraOutputDelay) : diff --git a/modules/cudacodec/src/NvEncoder.h b/modules/cudacodec/src/NvEncoder.h index dd13d2c1501..c5a53712e14 100644 --- a/modules/cudacodec/src/NvEncoder.h +++ b/modules/cudacodec/src/NvEncoder.h @@ -15,6 +15,17 @@ namespace cv { namespace cudacodec { +#ifndef _WIN32 +#include + static inline bool operator==(const GUID& guid1, const GUID& guid2) { + return !memcmp(&guid1, &guid2, sizeof(GUID)); + } + + static inline bool operator!=(const GUID& guid1, const GUID& guid2) { + return !(guid1 == guid2); + } +#endif + #define NVENC_THROW_ERROR( errorStr, errorCode ) \ do \ { \ diff --git a/modules/cudacodec/src/video_writer.cpp b/modules/cudacodec/src/video_writer.cpp index db3e2e36306..8b5c703f759 100644 --- a/modules/cudacodec/src/video_writer.cpp +++ b/modules/cudacodec/src/video_writer.cpp @@ -59,7 +59,6 @@ GUID CodecGuid(const Codec codec); void FrameRate(const double fps, uint32_t& frameRateNum, uint32_t& frameRateDen); GUID EncodingProfileGuid(const EncodeProfile encodingProfile); GUID EncodingPresetGuid(const EncodePreset nvPreset); -bool Equal(const GUID& g1, const GUID& g2); bool operator==(const EncoderParams& lhs, const EncoderParams& rhs) { @@ -68,12 +67,48 @@ bool operator==(const EncoderParams& lhs, const EncoderParams& rhs) rhs.averageBitRate, rhs.maxBitRate, rhs.targetQuality, rhs.gopLength); }; +class FFmpegVideoWriter : public EncoderCallback +{ +public: + FFmpegVideoWriter(const String& fileName, const Codec codec, const int fps, const Size sz, const int idrPeriod); + ~FFmpegVideoWriter(); + void onEncoded(const std::vector>& vPacket); + void onEncodingFinished(); +private: + cv::VideoWriter writer; +}; + +FFmpegVideoWriter::FFmpegVideoWriter(const String& fileName, const Codec codec, const int fps, const Size sz, const int idrPeriod) { + if (!videoio_registry::hasBackend(CAP_FFMPEG)) + CV_Error(Error::StsNotImplemented, "FFmpeg backend not found"); + const int fourcc = codec == Codec::H264 ? 
cv::VideoWriter::fourcc('a', 'v', 'c', '1') : cv::VideoWriter::fourcc('h', 'e', 'v', '1'); + writer.open(fileName, fourcc, fps, sz, { VideoWriterProperties::VIDEOWRITER_PROP_RAW_VIDEO, 1, VideoWriterProperties::VIDEOWRITER_PROP_KEY_INTERVAL, idrPeriod }); + if (!writer.isOpened()) + CV_Error(Error::StsUnsupportedFormat, "Unsupported video sink"); +} + +void FFmpegVideoWriter::onEncodingFinished() { + writer.release(); +} + +FFmpegVideoWriter::~FFmpegVideoWriter() { + onEncodingFinished(); +} + +void FFmpegVideoWriter::onEncoded(const std::vector>& vPacket) { + for (auto& packet : vPacket) { + Mat wrappedPacket(1, packet.size(), CV_8UC1, (void*)packet.data()); + writer.write(wrappedPacket); + } +} + + class RawVideoWriter : public EncoderCallback { public: - RawVideoWriter(String fileName); + RawVideoWriter(const String fileName); ~RawVideoWriter(); - void onEncoded(std::vector> vPacket); + void onEncoded(const std::vector>& vPacket); void onEncodingFinished(); private: std::ofstream fpOut; @@ -93,9 +128,9 @@ RawVideoWriter::~RawVideoWriter() { onEncodingFinished(); } -void RawVideoWriter::onEncoded(std::vector> vPacket) { +void RawVideoWriter::onEncoded(const std::vector>& vPacket) { for (auto& packet : vPacket) - fpOut.write(reinterpret_cast(packet.data()), packet.size()); + fpOut.write(reinterpret_cast(packet.data()), packet.size()); } class VideoWriterImpl : public VideoWriter @@ -172,12 +207,6 @@ VideoWriterImpl::VideoWriterImpl(const Ptr& encoderCallBack_, c Init(codec, fps, frameSz); } -VideoWriterImpl::VideoWriterImpl(const Ptr& encoderCallback, const Size frameSz, const Codec codec, const double fps, - const ColorFormat colorFormat, const Stream& stream) : - VideoWriterImpl(encoderCallback, frameSz, codec, fps, colorFormat, EncoderParams(), stream) -{ -} - void VideoWriterImpl::release() { pEnc->EndEncode(vPacket); encoderCallback->onEncoded(vPacket); @@ -271,12 +300,6 @@ GUID EncodingPresetGuid(const EncodePreset nvPreset) { CV_Error(Error::StsUnsupportedFormat, msg); } -bool Equal(const GUID& g1, const GUID& g2) { - if (std::tie(g1.Data1, g1.Data2, g1.Data3, g1.Data4) == std::tie(g2.Data1, g2.Data2, g2.Data3, g2.Data4)) - return true; - return false; -} - void VideoWriterImpl::InitializeEncoder(const GUID codec, const double fps) { NV_ENC_INITIALIZE_PARAMS initializeParams = {}; @@ -293,10 +316,10 @@ void VideoWriterImpl::InitializeEncoder(const GUID codec, const double fps) initializeParams.encodeConfig->rcParams.maxBitRate = encoderParams.maxBitRate; initializeParams.encodeConfig->rcParams.targetQuality = encoderParams.targetQuality; initializeParams.encodeConfig->gopLength = encoderParams.gopLength; - if (Equal(codec, NV_ENC_CODEC_H264_GUID)) - initializeParams.encodeConfig->encodeCodecConfig.h264Config.idrPeriod = encoderParams.gopLength; - else if (Equal(codec, NV_ENC_CODEC_HEVC_GUID)) - initializeParams.encodeConfig->encodeCodecConfig.hevcConfig.idrPeriod = encoderParams.gopLength; + if (codec == NV_ENC_CODEC_H264_GUID) + initializeParams.encodeConfig->encodeCodecConfig.h264Config.idrPeriod = encoderParams.idrPeriod; + else if (codec == NV_ENC_CODEC_HEVC_GUID) + initializeParams.encodeConfig->encodeCodecConfig.hevcConfig.idrPeriod = encoderParams.idrPeriod; pEnc->CreateEncoder(&initializeParams); } @@ -371,14 +394,25 @@ EncoderParams VideoWriterImpl::getEncoderParams() const { Ptr createVideoWriter(const String& fileName, const Size frameSize, const Codec codec, const double fps, const ColorFormat colorFormat, Ptr encoderCallback, const Stream& stream) { - encoderCallback = 
encoderCallback ? encoderCallback : new RawVideoWriter(fileName); - return makePtr(encoderCallback, frameSize, codec, fps, colorFormat, stream); + return createVideoWriter(fileName, frameSize, codec, fps, colorFormat, EncoderParams(), encoderCallback, stream); } Ptr createVideoWriter(const String& fileName, const Size frameSize, const Codec codec, const double fps, const ColorFormat colorFormat, const EncoderParams& params, Ptr encoderCallback, const Stream& stream) { - encoderCallback = encoderCallback ? encoderCallback : new RawVideoWriter(fileName); + CV_Assert(params.idrPeriod >= params.gopLength); + if (!encoderCallback) { + // required until PR for raw video encapsulation is merged and windows dll is updated +#ifndef WIN32 // remove #define and keep code once merged + try { + encoderCallback = new FFmpegVideoWriter(fileName, codec, fps, frameSize, params.idrPeriod); + } + catch (...) +#endif + { + encoderCallback = new RawVideoWriter(fileName); + } + } return makePtr(encoderCallback, frameSize, codec, fps, colorFormat, params, stream); } diff --git a/modules/cudacodec/test/test_video.cpp b/modules/cudacodec/test/test_video.cpp index ead5fa944ca..45365dab230 100644 --- a/modules/cudacodec/test/test_video.cpp +++ b/modules/cudacodec/test/test_video.cpp @@ -639,7 +639,13 @@ CUDA_TEST_P(TransCode, H264ToH265) constexpr cv::cudacodec::ColorFormat colorFormat = cv::cudacodec::ColorFormat::NV_NV12; constexpr double fps = 25; const cudacodec::Codec codec = cudacodec::Codec::HEVC; - const std::string ext = ".h265"; + // required until PR for raw video encapsulation is merged and windows dll is updated +#ifdef WIN32 + const std::string ext = ".hevc"; +#else + // use this after update + const std::string ext = ".mp4"; +#endif const std::string outputFile = cv::tempfile(ext.c_str()); constexpr int nFrames = 5; Size frameSz; @@ -716,7 +722,13 @@ CUDA_TEST_P(Write, Writer) const cudacodec::Codec codec = GET_PARAM(2); const double fps = GET_PARAM(3); const cv::cudacodec::ColorFormat colorFormat = GET_PARAM(4); + // required until PR for raw video encapsulation is merged and windows dll is updated +#ifdef WIN32 const std::string ext = codec == cudacodec::Codec::H264 ? 
".h264" : ".hevc"; +#else + // use this after update + const std::string ext = ".mp4"; +#endif const std::string outputFile = cv::tempfile(ext.c_str()); constexpr int nFrames = 5; Size frameSz; @@ -750,7 +762,7 @@ CUDA_TEST_P(Write, Writer) const int width = static_cast(cap.get(CAP_PROP_FRAME_WIDTH)); const int height = static_cast(cap.get(CAP_PROP_FRAME_HEIGHT)); ASSERT_EQ(frameSz, Size(width, height)); - ASSERT_TRUE(abs(fps - cap.get(CAP_PROP_FPS)) < 0.5); + ASSERT_EQ(fps, cap.get(CAP_PROP_FPS)); Mat frame; for (int i = 0; i < nFrames; ++i) { cap >> frame; @@ -761,24 +773,22 @@ CUDA_TEST_P(Write, Writer) } #define DEVICE_SRC true, false -#define FPS 10, 29.7 +#define FPS 10, 29 #define CODEC cv::cudacodec::Codec::H264, cv::cudacodec::Codec::HEVC #define COLOR_FORMAT cv::cudacodec::ColorFormat::BGR, cv::cudacodec::ColorFormat::RGB, cv::cudacodec::ColorFormat::BGRA, \ cv::cudacodec::ColorFormat::RGBA, cv::cudacodec::ColorFormat::GRAY INSTANTIATE_TEST_CASE_P(CUDA_Codec, Write, testing::Combine(ALL_DEVICES, testing::Values(DEVICE_SRC), testing::Values(CODEC), testing::Values(FPS), testing::Values(COLOR_FORMAT))); - -struct EncoderParams : testing::TestWithParam +PARAM_TEST_CASE(EncoderParams, cv::cuda::DeviceInfo, int) { cv::cuda::DeviceInfo devInfo; cv::cudacodec::EncoderParams params; virtual void SetUp() { - devInfo = GetParam(); + devInfo = GET_PARAM(0); cv::cuda::setDevice(devInfo.deviceID()); // Fixed params for CBR test - params.nvPreset = cv::cudacodec::EncodePreset::ENC_PRESET_P7; params.tuningInfo = cv::cudacodec::EncodeTuningInfo::ENC_TUNING_INFO_HIGH_QUALITY; params.encodingProfile = cv::cudacodec::EncodeProfile::ENC_H264_PROFILE_MAIN; params.rateControlMode = cv::cudacodec::EncodeParamsRcMode::ENC_PARAMS_RC_CBR; @@ -787,19 +797,25 @@ struct EncoderParams : testing::TestWithParam params.maxBitRate = 0; params.targetQuality = 0; params.gopLength = 5; + params.idrPeriod = GET_PARAM(1); } }; - CUDA_TEST_P(EncoderParams, Writer) { const std::string inputFile = std::string(cvtest::TS::ptr()->get_data_path()) + "../highgui/video/big_buck_bunny.mp4"; constexpr double fps = 25.0; constexpr cudacodec::Codec codec = cudacodec::Codec::H264; + // required until PR for raw video encapsulation is merged and windows dll is updated +#ifdef WIN32 const std::string ext = ".h264"; +#else + // use this after update + const std::string ext = ".mp4"; +#endif const std::string outputFile = cv::tempfile(ext.c_str()); Size frameSz; - constexpr int nFrames = 5; + const int nFrames = max(params.gopLength, params.idrPeriod) + 1; { cv::VideoCapture reader(inputFile); ASSERT_TRUE(reader.isOpened()); @@ -829,20 +845,36 @@ CUDA_TEST_P(EncoderParams, Writer) const int height = static_cast(cap.get(CAP_PROP_FRAME_HEIGHT)); ASSERT_EQ(frameSz, Size(width, height)); ASSERT_EQ(fps, cap.get(CAP_PROP_FPS)); - const bool checkGop = videoio_registry::hasBackend(CAP_FFMPEG); - Mat frame; + const bool checkFrameType = videoio_registry::hasBackend(CAP_FFMPEG); + VideoCapture capRaw; + int idrPeriod = 0; + if (checkFrameType) { + capRaw.open(outputFile, CAP_FFMPEG, { CAP_PROP_FORMAT, -1 }); + ASSERT_TRUE(capRaw.isOpened()); + idrPeriod = params.idrPeriod == 0 ? 
params.gopLength : params.idrPeriod; + } + const double frameTypeIAsciiCode = 73.0; // see CAP_PROP_FRAME_TYPE + Mat frame, frameRaw; for (int i = 0; i < nFrames; ++i) { cap >> frame; ASSERT_FALSE(frame.empty()); - if (checkGop && (cap.get(CAP_PROP_FRAME_TYPE) == 73)) { - ASSERT_TRUE(i % params.gopLength == 0); + if (checkFrameType) { + capRaw >> frameRaw; + ASSERT_FALSE(frameRaw.empty()); + const bool intraFrameReference = cap.get(CAP_PROP_FRAME_TYPE) == frameTypeIAsciiCode; + const bool intraFrameActual = i % params.gopLength == 0; + ASSERT_EQ(intraFrameActual, intraFrameReference); + const bool keyFrameActual = capRaw.get(CAP_PROP_LRF_HAS_KEY_FRAME) == 1.0; + const bool keyFrameReference = i % idrPeriod == 0; + ASSERT_EQ(keyFrameActual, keyFrameReference); } } } ASSERT_EQ(0, remove(outputFile.c_str())); } -INSTANTIATE_TEST_CASE_P(CUDA_Codec, EncoderParams, ALL_DEVICES); +#define IDR_PERIOD testing::Values(5,10) +INSTANTIATE_TEST_CASE_P(CUDA_Codec, EncoderParams, testing::Combine(ALL_DEVICES, IDR_PERIOD)); #endif // HAVE_NVCUVENC From 51de5807e87bdbdaefd8a5c61dfa0bf293ecd2db Mon Sep 17 00:00:00 2001 From: Alexander Smorkalov Date: Thu, 26 Oct 2023 13:25:50 +0300 Subject: [PATCH 6/6] Set of build fixes for ## 3580, 3574. --- modules/cudaarithm/test/test_event.cpp | 2 +- modules/datasets/src/tinyxml2/tinyxml2.h | 2 +- modules/xphoto/src/oilpainting.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/modules/cudaarithm/test/test_event.cpp b/modules/cudaarithm/test/test_event.cpp index 375c51d0d35..ffe0f7b681e 100644 --- a/modules/cudaarithm/test/test_event.cpp +++ b/modules/cudaarithm/test/test_event.cpp @@ -91,7 +91,7 @@ CUDA_TEST_P(AsyncEvent, Timing) const double elTimeMs = Event::elapsedTime(startEvent, stopEvent); ASSERT_GT(elTimeMs, 0); } - catch (cv::Exception ex) { + catch (const cv::Exception& ex) { failed = true; } ASSERT_EQ(failed, shouldFail.at(i)); diff --git a/modules/datasets/src/tinyxml2/tinyxml2.h b/modules/datasets/src/tinyxml2/tinyxml2.h index 95ae3bcc057..89a16c65b75 100644 --- a/modules/datasets/src/tinyxml2/tinyxml2.h +++ b/modules/datasets/src/tinyxml2/tinyxml2.h @@ -212,7 +212,7 @@ template class DynArray { public: - DynArray< T, INIT >() { + DynArray() { _mem = _pool; _allocated = INIT; _size = 0; diff --git a/modules/xphoto/src/oilpainting.cpp b/modules/xphoto/src/oilpainting.cpp index 21e62414c32..daeffd386a7 100644 --- a/modules/xphoto/src/oilpainting.cpp +++ b/modules/xphoto/src/oilpainting.cpp @@ -58,7 +58,7 @@ class ParallelOilPainting : public ParallelLoopBody int dynRatio; public: - ParallelOilPainting(Mat& img, Mat &d, Mat &iLuminance, int r,int k) : + ParallelOilPainting(Mat& img, Mat &d, Mat &iLuminance, int r,int k) : imgSrc(img), dst(d), imgLuminance(iLuminance),