opencv
diff --git a/‎.github/workflows/PR-5.x.yaml
Lines changed: 3 additions & 0 deletions b/‎.github/workflows/PR-5.x.yaml
Lines changed: 3 additions & 0 deletions
diff --git a/‎modules/bgsegm/test/test_backgroundsubtractor_gbh.cpp
Lines changed: 4 additions & 2 deletions b/‎modules/bgsegm/test/test_backgroundsubtractor_gbh.cpp
Lines changed: 4 additions & 2 deletions
diff --git a/‎modules/cudaarithm/src/cuda/polar_cart.cu
Lines changed: 2 additions & 16 deletions b/‎modules/cudaarithm/src/cuda/polar_cart.cu
Lines changed: 2 additions & 16 deletions
diff --git a/‎modules/cudaarithm/src/cuda/split_merge.cu
Lines changed: 6 additions & 3 deletions b/‎modules/cudaarithm/src/cuda/split_merge.cu
Lines changed: 6 additions & 3 deletions
diff --git a/‎modules/cudaarithm/src/reductions.cpp
Lines changed: 14 additions & 2 deletions b/‎modules/cudaarithm/src/reductions.cpp
Lines changed: 14 additions & 2 deletions
diff --git a/‎modules/cudafilters/src/cuda/median_filter.cu
Lines changed: 79 additions & 0 deletions b/‎modules/cudafilters/src/cuda/median_filter.cu
Lines changed: 79 additions & 0 deletions
@@ -15,6 +15,9 @@ jobs:
   Ubuntu2204-x64:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-5.x-U22.yaml@main
 
+  Ubuntu2404-x64:
+    uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-5.x-U24.yaml@main
+
   Ubuntu2004-x64-CUDA:
     uses: opencv/ci-gha-workflow/.github/workflows/OCV-Contrib-PR-5.x-U20-Cuda.yaml@main
 
 
@@ -29,8 +29,10 @@ void CV_BackgroundSubtractorTest::run(int)
 {
     int code = cvtest::TS::OK;
     RNG& rng = ts->get_rng();
-    int type = ((unsigned int)rng)%7;  //!< pick a random type, 0 - 6, defined in types_c.h
-    int channels = 1 + ((unsigned int)rng)%4;  //!< random number of channels from 1 to 4.
+    int type = ((unsigned int)rng) % 3;
+    type = (type == 0) ? CV_8U : (type == 1) ? CV_16U : CV_32F; // 8U, 16U, 32F
+    int channels = 1 + ((unsigned int)rng)%3;  // 1..3; without the offset a 0-channel type could be drawn
+    channels = (channels == 2) ? 4 : channels; // 1, 3, 4
     int channelsAndType = CV_MAKETYPE(type,channels);
     int width = 2 + ((unsigned int)rng)%98; //!< Mat will be 2 to 100 in width and height
     int height = 2 + ((unsigned int)rng)%98;
 
@@ -133,23 +133,9 @@ void cv::cuda::cartToPolar(InputArray _x, InputArray _y, OutputArray _mag, Outpu
     GpuMat_<float> anglec(angle.reshape(1));
 
     if (angleInDegrees)
-    {
-        gridTransformTuple(zipPtr(xc, yc),
-                           tie(magc, anglec),
-                           make_tuple(
-                               binaryTupleAdapter<0, 1>(magnitude_func<float>()),
-                               binaryTupleAdapter<0, 1>(direction_func<float, true>())),
-                           stream);
-    }
+        gridTransformBinary(xc, yc, magc, anglec, magnitude_func<float>(), direction_func<float, true>(), stream);
     else
-    {
-        gridTransformTuple(zipPtr(xc, yc),
-                           tie(magc, anglec),
-                           make_tuple(
-                               binaryTupleAdapter<0, 1>(magnitude_func<float>()),
-                               binaryTupleAdapter<0, 1>(direction_func<float, false>())),
-                           stream);
-    }
+        gridTransformBinary(xc, yc, magc, anglec, magnitude_func<float>(), direction_func<float, false>(), stream);
 
     syncOutput(mag, _mag, stream);
     syncOutput(angle, _angle, stream);
 
@@ -67,7 +67,8 @@ namespace
     {
         static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
         {
-            gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1])),
+            const std::array<GlobPtrSz<T>, 2> d_src = {globPtr<T>(src[0]), globPtr<T>(src[1])};
+            gridMerge(d_src,
                     globPtr<typename MakeVec<T, 2>::type>(dst),
                     stream);
         }
@@ -77,7 +78,8 @@ namespace
     {
         static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
         {
-            gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2])),
+            const std::array<GlobPtrSz<T>, 3> d_src = {globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2])};
+            gridMerge(d_src,
                     globPtr<typename MakeVec<T, 3>::type>(dst),
                     stream);
         }
@@ -87,7 +89,8 @@ namespace
     {
         static void call(const GpuMat* src, GpuMat& dst, Stream& stream)
         {
-            gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2]), globPtr<T>(src[3])),
+            const std::array<GlobPtrSz<T>, 4 > d_src = {globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2]), globPtr<T>(src[3])};
+            gridMerge(d_src,
                     globPtr<typename MakeVec<T, 4>::type>(dst),
                     stream);
         }
 
@@ -151,7 +151,12 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, Stream& stream)
     sz.width  = gsrc.cols;
     sz.height = gsrc.rows;
 
+#if (CUDA_VERSION >= 12040)
+    size_t bufSize;
+#else
     int bufSize;
+#endif
+
 #if (CUDA_VERSION <= 4020)
     nppSafeCall( nppiMeanStdDev8uC1RGetBufferHostSize(sz, &bufSize) );
 #else
@@ -162,7 +167,8 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, Stream& stream)
 #endif
 
     BufferPool pool(stream);
-    GpuMat buf = pool.getBuffer(1, bufSize, gsrc.type());
+    CV_Assert(bufSize <= std::numeric_limits<int>::max());
+    GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), gsrc.type());
 
     // detail: https://github.com/opencv/opencv/issues/11063
     //NppStreamHandler h(StreamAccessor::getStream(stream));
@@ -227,7 +233,12 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, InputArray mask, Stre
     sz.width  = gsrc.cols;
     sz.height = gsrc.rows;
 
+#if (CUDA_VERSION >= 12040)
+    size_t bufSize;
+#else
     int bufSize;
+#endif
+
 #if (CUDA_VERSION <= 4020)
         nppSafeCall( nppiMeanStdDev8uC1MRGetBufferHostSize(sz, &bufSize) );
 #else
@@ -238,7 +249,8 @@ void cv::cuda::meanStdDev(InputArray src, OutputArray dst, InputArray mask, Stre
 #endif
 
     BufferPool pool(stream);
-    GpuMat buf = pool.getBuffer(1, bufSize, gsrc.type());
+    CV_Assert(bufSize <= std::numeric_limits<int>::max());
+    GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), gsrc.type());
 
     if(gsrc.type() == CV_8UC1)
         nppSafeCall( nppiMean_StdDev_8u_C1MR(gsrc.ptr<Npp8u>(), static_cast<int>(gsrc.step), gmask.ptr<Npp8u>(), static_cast<int>(gmask.step),
 
@@ -53,6 +53,17 @@
 #include "opencv2/core/cuda/saturate_cast.hpp"
 #include "opencv2/core/cuda/border_interpolate.hpp"
 
+
+// The wavelet-matrix median filter relies on the CUB library,
+// which has shipped with the CUDA Toolkit since CUDA 11.
+#include "wavelet_matrix_feature_support_checks.h"
+#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+#include "wavelet_matrix_multi.cuh"
+#include "wavelet_matrix_2d.cuh"
+#include "wavelet_matrix_float_supporter.cuh"
+#endif
+
+
 namespace cv { namespace cuda { namespace device
 {
         __device__ void histogramAddAndSub8(int* H, const int * hist_colAdd,const int * hist_colSub){
@@ -334,4 +345,72 @@ namespace cv { namespace cuda { namespace device
 
 }}}
 
+
+#ifdef __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+namespace cv { namespace cuda { namespace device
+    {
+        using namespace wavelet_matrix_median;
+
+        /**
+         * Runs the wavelet-matrix median filter on an image with CH_NUM channels.
+         *
+         * Each source row is copied device-to-device into a buffer that holds
+         * src.cols * CH_NUM elements per row, the wavelet matrix is constructed
+         * from that buffer, and median2d() writes its result through dst.ptr()
+         * using dst.step / sizeof(T) as the output row stride.
+         *
+         * @param src    input image; rows are read one by one through src.ptr(y)
+         * @param dst    output image; must have as many columns as src and a step
+         *               that is a multiple of sizeof(T)
+         * @param radius filter radius forwarded to median2d()
+         * @param stream stream forwarded to construct() and median2d(); when it is
+         *               the null stream the device is synchronized before returning
+         *
+         * NOTE: the row copies are issued with cudaMemcpy, not enqueued on `stream`.
+         */
+        template<int CH_NUM, typename T>
+        void medianFiltering_wavelet_matrix_gpu(const PtrStepSz<T> src, PtrStepSz<T> dst, int radius,cudaStream_t stream){
+
+            constexpr bool is_float = std::is_same<T, float>::value;
+            constexpr static int WORD_SIZE = 32;
+            constexpr static int ThW = (std::is_same<T, uint8_t>::value ?  8 : 4);
+            constexpr static int ThH = (std::is_same<T, uint8_t>::value ? 64 : 256);
+            using XYIdxT = uint32_t;
+            using XIdxT = uint16_t;
+            // float pixels are handled through a uint32_t wavelet matrix plus a result table
+            using WM_T = typename std::conditional<is_float, uint32_t, T>::type;
+            using MedianResT = typename std::conditional<is_float, T, std::nullptr_t>::type;
+            using WM2D_IMPL = WaveletMatrix2dCu5C<WM_T, CH_NUM, WaveletMatrixMultiCu4G<XIdxT, 512>, 512, WORD_SIZE>;
+
+            CV_Assert(src.cols == dst.cols);
+            CV_Assert(dst.step % sizeof(T) == 0);
+
+            WM2D_IMPL WM_cuda(src.rows, src.cols, is_float, false);
+            // Let the filter write straight into the caller's destination buffer.
+            WM_cuda.res_cu =  reinterpret_cast<WM_T*>(dst.ptr());
+
+            // Number of T elements in one packed row (all channels).
+            const size_t line_num = src.cols * CH_NUM;
+            if (is_float) {
+                WMMedianFloatSupporter::WMMedianFloatSupporter<float, CH_NUM, XYIdxT> float_supporter(src.rows, src.cols);
+                float_supporter.alloc();
+                for (int y = 0; y < src.rows; ++y) {
+                    // Check the copy status so a failed transfer is reported here instead of
+                    // surfacing later as an unrelated error or as a silently wrong result.
+                    cudaSafeCall( cudaMemcpy(float_supporter.val_in_cu + y * line_num, src.ptr(y), line_num * sizeof(T), cudaMemcpyDeviceToDevice) );
+                }
+                const auto p = WM_cuda.get_nowcu_and_buf_byte_div32();
+                float_supporter.sort_and_set((XYIdxT*)p.first, p.second);
+                WM_cuda.construct(nullptr, stream, true);
+                WM_cuda.template median2d<ThW, ThH, MedianResT, false>(radius, dst.step / sizeof(T), (MedianResT*)float_supporter.get_res_table(), stream);
+            } else {
+                for (int y = 0; y < src.rows; ++y) {
+                    // Same status check as in the float branch.
+                    cudaSafeCall( cudaMemcpy(WM_cuda.src_cu + y * line_num, src.ptr(y), line_num * sizeof(T), cudaMemcpyDeviceToDevice) );
+                }
+                WM_cuda.construct(nullptr, stream);
+                WM_cuda.template median2d<ThW, ThH, MedianResT, false>(radius, dst.step / sizeof(T), nullptr, stream);
+            }
+            // Detach the borrowed destination pointer before WM_cuda goes out of scope.
+            WM_cuda.res_cu = nullptr;
+            if (!stream) {
+                cudaSafeCall( cudaDeviceSynchronize() );
+            }
+        }
+
+        /**
+         * Selects the compile-time channel count for the wavelet-matrix median
+         * filter from the runtime value `num_channels`.
+         *
+         * Supported channel counts are 1, 3 and 4; any other value trips the
+         * assertion in the default branch.
+         *
+         * @param src          input image
+         * @param dst          output image
+         * @param radius       filter radius, forwarded unchanged
+         * @param num_channels number of channels per pixel (1, 3 or 4)
+         * @param stream       stream forwarded unchanged
+         */
+        template<typename T>
+        void medianFiltering_wavelet_matrix_gpu(const PtrStepSz<T> src, PtrStepSz<T> dst, int radius, const int num_channels, cudaStream_t stream){
+            switch (num_channels) {
+            case 1:
+                medianFiltering_wavelet_matrix_gpu<1>(src, dst, radius, stream);
+                break;
+            case 3:
+                medianFiltering_wavelet_matrix_gpu<3>(src, dst, radius, stream);
+                break;
+            case 4:
+                medianFiltering_wavelet_matrix_gpu<4>(src, dst, radius, stream);
+                break;
+            default:
+                // The condition is false whenever this branch is reached, so the
+                // assertion reports the unsupported channel count.
+                CV_Assert(num_channels == 1 || num_channels == 3 || num_channels == 4);
+                break;
+            }
+        }
+
+        template void medianFiltering_wavelet_matrix_gpu(const PtrStepSz<uint8_t>  src, PtrStepSz<uint8_t>  dst, int radius, const int num_channels, cudaStream_t stream);
+        template void medianFiltering_wavelet_matrix_gpu(const PtrStepSz<uint16_t> src, PtrStepSz<uint16_t> dst, int radius, const int num_channels, cudaStream_t stream);
+        template void medianFiltering_wavelet_matrix_gpu(const PtrStepSz<float>    src, PtrStepSz<float>    dst, int radius, const int num_channels, cudaStream_t stream);
+}}}
+#endif // __OPENCV_USE_WAVELET_MATRIX_FOR_MEDIAN_FILTER_CUDA__
+
 #endif
Original file line number	Diff line number	Diff line change
`@@ -67,7 +67,8 @@ namespace`
`67`	`67`	`{`
`68`	`68`	`static void call(const GpuMat* src, GpuMat& dst, Stream& stream)`
`69`	`69`	`{`
`70`		`- gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1])),`
	`70`	`+ const std::array<GlobPtrSz<T>, 2> d_src = {globPtr<T>(src[0]), globPtr<T>(src[1])};`
	`71`	`+ gridMerge(d_src,`
`71`	`72`	`globPtr<typename MakeVec<T, 2>::type>(dst),`
`72`	`73`	`stream);`
`73`	`74`	`}`
`@@ -77,7 +78,8 @@ namespace`
`77`	`78`	`{`
`78`	`79`	`static void call(const GpuMat* src, GpuMat& dst, Stream& stream)`
`79`	`80`	`{`
`80`		`- gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2])),`
	`81`	`+ const std::array<GlobPtrSz<T>, 3> d_src = {globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2])};`
	`82`	`+ gridMerge(d_src,`
`81`	`83`	`globPtr<typename MakeVec<T, 3>::type>(dst),`
`82`	`84`	`stream);`
`83`	`85`	`}`
`@@ -87,7 +89,8 @@ namespace`
`87`	`89`	`{`
`88`	`90`	`static void call(const GpuMat* src, GpuMat& dst, Stream& stream)`
`89`	`91`	`{`
`90`		`- gridMerge(zipPtr(globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2]), globPtr<T>(src[3])),`
	`92`	`+ const std::array<GlobPtrSz<T>, 4 > d_src = {globPtr<T>(src[0]), globPtr<T>(src[1]), globPtr<T>(src[2]), globPtr<T>(src[3])};`
	`93`	`+ gridMerge(d_src,`
`91`	`94`	`globPtr<typename MakeVec<T, 4>::type>(dst),`
`92`	`95`	`stream);`
`93`	`96`	`}`