From e18fc48565ed99325801853a0db4a1cfaef3d47f Mon Sep 17 00:00:00 2001
From: Wang Qiang <37444407+wangqiang9@users.noreply.github.com>
Date: Thu, 30 May 2024 16:40:19 +0800
Subject: [PATCH 1/3] Fixed a clerical error in qrcode.py

---
 modules/wechat_qrcode/samples/qrcode.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modules/wechat_qrcode/samples/qrcode.py b/modules/wechat_qrcode/samples/qrcode.py
index 7713734f993..405a523e07c 100644
--- a/modules/wechat_qrcode/samples/qrcode.py
+++ b/modules/wechat_qrcode/samples/qrcode.py
@@ -22,7 +22,7 @@
 except:
     print("---------------------------------------------------------------")
     print("Failed to initialize WeChatQRCode.")
-    print("Please, download 'detector.*' and 'sr.*' from")
+    print("Please, download 'detect.*' and 'sr.*' from")
     print("https://github.com/WeChatCV/opencv_3rdparty/tree/wechat_qrcode")
     print("and put them into the current directory.")
     print("---------------------------------------------------------------")

From 14e34187c5ee215a46c90878a0489a123e7dfdfb Mon Sep 17 00:00:00 2001
From: Gregor Burger <gregor.burger@bhs-technologies.com>
Date: Fri, 7 Jun 2024 08:25:10 +0200
Subject: [PATCH 2/3] also link to CUDA::cufft_static in case of
 BUILD_SHARED_LIBS=OFF

---
 modules/cudaarithm/CMakeLists.txt | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/modules/cudaarithm/CMakeLists.txt b/modules/cudaarithm/CMakeLists.txt
index 6ee7a9f96bb..b1aacd68696 100644
--- a/modules/cudaarithm/CMakeLists.txt
+++ b/modules/cudaarithm/CMakeLists.txt
@@ -9,6 +9,9 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-d
 set(extra_dependencies "")
 set(optional_dependencies "")
 if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
+  if(UNIX AND NOT BUILD_SHARED_LIBS AND CUDA_VERSION_STRING VERSION_GREATER_EQUAL 9.2 AND CMAKE_VERSION VERSION_GREATER_EQUAL 3.23)
+    set(CUDA_FFT_LIB_EXT "_static_nocallback")
+  endif()
   list(APPEND extra_dependencies CUDA::cudart_static CUDA::nppial${CUDA_LIB_EXT} CUDA::nppc${CUDA_LIB_EXT} CUDA::nppitc${CUDA_LIB_EXT} CUDA::nppig${CUDA_LIB_EXT} CUDA::nppist${CUDA_LIB_EXT} CUDA::nppidei${CUDA_LIB_EXT})
   if(HAVE_CUBLAS)
     list(APPEND optional_dependencies CUDA::cublas${CUDA_LIB_EXT})
@@ -18,7 +21,8 @@ if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE)
   endif()
   if(HAVE_CUFFT)
     # static version requires seperable compilation which is incompatible with opencv's current library structure
-    list(APPEND optional_dependencies CUDA::cufft)
+    # the cufft_static_nocallback variant does not require separable compilation; callbacks are currently not used.
+    list(APPEND optional_dependencies CUDA::cufft${CUDA_FFT_LIB_EXT})
   endif()
 else()
   if(HAVE_CUBLAS)

From b2c0ce0f2b3c9b908eed39f59d7eba735799a90e Mon Sep 17 00:00:00 2001
From: Pierre Chatelier <chacha21@users.noreply.github.com>
Date: Fri, 12 Jul 2024 15:15:35 +0200
Subject: [PATCH 3/3] Merge pull request #3731 from
 chacha21:cuda_separable_filter_single

supports empty kernels in cuda::SeparableLinearFilters #3731

[#25408](https://github.com/opencv/opencv/issues/25408)

When only a 1D convolution is needed (row or column filter only), `cuda::LinearFilter` might be slower than `cuda::SeparableLinearFilter`.
Using `cuda::SeparableLinearFilter` for a 1D convolution can be done by passing a `(1)` kernel for the ignored dimension.
By supporting empty kernels in `cuda::SeparableLinearFilter`, that `(1)` kernel is no longer needed.
Additionally, the internal `buf_` used to store the intermediate convolution result can be skipped when only a single convolution is needed.

In "legacy" usage (row+col kernels), there is no regression in `cuda::SeparableLinearFilter` performance.
As soon as an empty kernel is used, performance improves substantially.

The devil is in the details: in-place processing is supported and may still need the intermediate buffer, but there is no regression in that case either.

- [X] I agree to contribute to the project under Apache 2 License.
- [X] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV
- [X] The PR is proposed to the proper branch
- [X] There is a reference to the original bug report and related work
- [X] There is accuracy test, performance test and test data in opencv_extra repository, if applicable
      Patch to opencv_extra has the same branch name.
- [X] The feature is well documented and sample code can be built with the project CMake
---
 .../include/opencv2/cudafilters.hpp           |  4 +-
 modules/cudafilters/src/filtering.cpp         | 70 ++++++++++++----
 modules/cudafilters/test/test_filters.cpp     | 80 +++++++++++++++++++
 3 files changed, 138 insertions(+), 16 deletions(-)

diff --git a/modules/cudafilters/include/opencv2/cudafilters.hpp b/modules/cudafilters/include/opencv2/cudafilters.hpp
index 1519869f4ca..2aa9c846462 100644
--- a/modules/cudafilters/include/opencv2/cudafilters.hpp
+++ b/modules/cudafilters/include/opencv2/cudafilters.hpp
@@ -142,12 +142,14 @@ CV_EXPORTS_W Ptr<Filter> createLaplacianFilter(int srcType, int dstType, int ksi
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 // Separable Linear Filter
 
-/** @brief Creates a separable linear filter.
+/** @brief Creates a separable linear filter. In-place processing is supported.
 
 @param srcType Source array type.
 @param dstType Destination array type.
 @param rowKernel Horizontal filter coefficients. Support kernels with size \<= 32 .
+Pass noArray() to skip the row filtering.
 @param columnKernel Vertical filter coefficients. Support kernels with size \<= 32 .
+Pass noArray() to skip the column filtering.
 @param anchor Anchor position within the kernel. Negative values mean that anchor is positioned at
 the aperture center.
 @param rowBorderMode Pixel extrapolation method in the vertical direction For details, see
diff --git a/modules/cudafilters/src/filtering.cpp b/modules/cudafilters/src/filtering.cpp
index 2ae789c856d..185751f861f 100644
--- a/modules/cudafilters/src/filtering.cpp
+++ b/modules/cudafilters/src/filtering.cpp
@@ -386,28 +386,38 @@ namespace
         const int cn = CV_MAT_CN(srcType);
         const int ddepth = CV_MAT_DEPTH(dstType);
 
-        Mat rowKernel = _rowKernel.getMat();
-        Mat columnKernel = _columnKernel.getMat();
+        CV_Assert( _rowKernel.empty() || _rowKernel.isMat() );
+        CV_Assert( _columnKernel.empty() || _columnKernel.isMat() );
+        Mat rowKernel = _rowKernel.empty() ? cv::Mat() : _rowKernel.getMat();
+        Mat columnKernel = _columnKernel.empty() ? cv::Mat() : _columnKernel.getMat();
 
         CV_Assert( sdepth <= CV_64F && cn <= 4 );
-        CV_Assert( rowKernel.channels() == 1 );
-        CV_Assert( columnKernel.channels() == 1 );
+        CV_Assert( rowKernel.empty() || rowKernel.channels() == 1 );
+        CV_Assert( columnKernel.empty() || columnKernel.channels() == 1 );
         CV_Assert( rowBorderMode == BORDER_REFLECT101 || rowBorderMode == BORDER_REPLICATE || rowBorderMode == BORDER_CONSTANT || rowBorderMode == BORDER_REFLECT || rowBorderMode == BORDER_WRAP );
         CV_Assert( columnBorderMode == BORDER_REFLECT101 || columnBorderMode == BORDER_REPLICATE || columnBorderMode == BORDER_CONSTANT || columnBorderMode == BORDER_REFLECT || columnBorderMode == BORDER_WRAP );
 
         Mat kernel32F;
 
-        rowKernel.convertTo(kernel32F, CV_32F);
-        rowKernel_.upload(kernel32F.reshape(1, 1));
+        if (!rowKernel.empty())
+        {
+            rowKernel.convertTo(kernel32F, CV_32F);
+            rowKernel_.upload(kernel32F.reshape(1, 1));
+        }
 
-        columnKernel.convertTo(kernel32F, CV_32F);
-        columnKernel_.upload(kernel32F.reshape(1, 1));
+        if (!columnKernel.empty())
+        {
+            columnKernel.convertTo(kernel32F, CV_32F);
+            columnKernel_.upload(kernel32F.reshape(1, 1));
+        }
 
-        CV_Assert( rowKernel_.cols > 0 && rowKernel_.cols <= 32 );
-        CV_Assert( columnKernel_.cols > 0 && columnKernel_.cols <= 32 );
+        CV_Assert( rowKernel_.empty() || (rowKernel_.cols > 0 && rowKernel_.cols <= 32 ));
+        CV_Assert( columnKernel_.empty() || (columnKernel_.cols > 0 && columnKernel_.cols <= 32 ));
 
-        normalizeAnchor(anchor_.x, rowKernel_.cols);
-        normalizeAnchor(anchor_.y, columnKernel_.cols);
+        if (!rowKernel_.empty())
+          normalizeAnchor(anchor_.x, rowKernel_.cols);
+        if (!columnKernel_.empty())
+          normalizeAnchor(anchor_.y, columnKernel_.cols);
 
         bufType_ = CV_MAKE_TYPE(CV_32F, cn);
 
@@ -426,15 +436,45 @@ namespace
         _dst.create(src.size(), dstType_);
         GpuMat dst = _dst.getGpuMat();
 
-        ensureSizeIsEnough(src.size(), bufType_, buf_);
+        const bool isInPlace = (src.data == dst.data);
+        const bool hasRowKernel = !rowKernel_.empty();
+        const bool hasColKernel = !columnKernel_.empty();
+        const bool hasSingleKernel = (hasRowKernel ^ hasColKernel);
+        const bool needsSrcAdaptation = !hasRowKernel &&  hasColKernel && (srcType_ != bufType_);
+        const bool needsDstAdaptation =  hasRowKernel && !hasColKernel && (dstType_ != bufType_);
+        const bool needsBufForIntermediateStorage = (hasRowKernel && hasColKernel) || (hasSingleKernel && isInPlace);
+        const bool needsBuf = needsSrcAdaptation || needsDstAdaptation || needsBufForIntermediateStorage;
+        if (needsBuf)
+            ensureSizeIsEnough(src.size(), bufType_, buf_);
+
+        if (needsSrcAdaptation)
+            src.convertTo(buf_, bufType_, _stream);
+        GpuMat& srcAdapted = needsSrcAdaptation ? buf_ : src;
 
         DeviceInfo devInfo;
         const int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion();
 
         cudaStream_t stream = StreamAccessor::getStream(_stream);
 
-        rowFilter_(src, buf_, rowKernel_.ptr<float>(), rowKernel_.cols, anchor_.x, rowBorderMode_, cc, stream);
-        columnFilter_(buf_, dst, columnKernel_.ptr<float>(), columnKernel_.cols, anchor_.y, columnBorderMode_, cc, stream);
+        if (!hasRowKernel && !hasColKernel && !isInPlace)
+            srcAdapted.convertTo(dst, dstType_, _stream);
+        else if (hasRowKernel || hasColKernel)
+        {
+            GpuMat& rowFilterSrc = srcAdapted;
+            GpuMat& rowFilterDst = !hasRowKernel ? srcAdapted : needsBuf ? buf_ : dst;
+            GpuMat& colFilterSrc = hasColKernel && needsBuf ? buf_ : srcAdapted;
+            GpuMat& colFilterTo = dst;
+
+            if (hasRowKernel)
+                rowFilter_(rowFilterSrc, rowFilterDst, rowKernel_.ptr<float>(), rowKernel_.cols, anchor_.x, rowBorderMode_, cc, stream);
+            else if (hasColKernel && (needsBufForIntermediateStorage && !needsSrcAdaptation))
+                rowFilterSrc.convertTo(buf_, bufType_, _stream);
+
+            if (hasColKernel)
+                columnFilter_(colFilterSrc, colFilterTo, columnKernel_.ptr<float>(), columnKernel_.cols, anchor_.y, columnBorderMode_, cc, stream);
+            else if (needsBuf)
+                buf_.convertTo(dst, dstType_, _stream);
+        }
     }
 }
 
diff --git a/modules/cudafilters/test/test_filters.cpp b/modules/cudafilters/test/test_filters.cpp
index 432b5d2a5ac..b70ea298ea0 100644
--- a/modules/cudafilters/test/test_filters.cpp
+++ b/modules/cudafilters/test/test_filters.cpp
@@ -281,6 +281,86 @@ INSTANTIATE_TEST_CASE_P(CUDA_Filters, SeparableLinearFilter, testing::Combine(
                     BorderType(cv::BORDER_REFLECT)),
     WHOLE_SUBMAT));
 
+PARAM_TEST_CASE(SeparableLinearFilterWithEmptyKernels, cv::cuda::DeviceInfo, MatDepth, Channels, MatDepth, bool, bool, bool) // fixture params: device, src depth, channels, dst depth, in-place, use row kernel, use col kernel
+{
+    cv::cuda::DeviceInfo devInfo; // CUDA device selected in SetUp()
+    bool inPlace;                 // param 4: apply the filter with dst initialised from src
+    bool useRowKernel;            // param 5: pass a real row kernel
+    bool useColKernel;            // param 6: pass a real column kernel
+    // Image geometry, types and filter settings; filled in by SetUp().
+    cv::Size size;                // test image size
+    int srcDepth;                 // param 1
+    int cn;                       // param 2: channel count
+    int dstDepth;                 // param 3
+    cv::Size ksize;               // size of the row kernel built by the test
+    cv::Point anchor;             // kernel anchor
+    int borderType;               // border extrapolation mode
+    int srcType;                  // CV_MAKE_TYPE(srcDepth, cn)
+    int dstType;                  // CV_MAKE_TYPE(dstDepth, cn)
+    // Reads the test parameters and derives the fixed settings.
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        srcDepth = GET_PARAM(1);
+        cn = GET_PARAM(2);
+        dstDepth = GET_PARAM(3);
+        inPlace = GET_PARAM(4);
+        useRowKernel = GET_PARAM(5);
+        useColKernel = GET_PARAM(6);
+        // Fixed settings shared by every parameter combination.
+        size = cv::Size(640, 480);
+        ksize = cv::Size(3, 1);
+        anchor = cv::Point(-1, -1); // negative anchor: documented as the aperture center
+        borderType = cv::BORDER_REPLICATE;
+        // Make the device under test current before anything is uploaded.
+        cv::cuda::setDevice(devInfo.deviceID());
+        // Full matrix types combine the parameterised depth with the channel count.
+        srcType = CV_MAKE_TYPE(srcDepth, cn);
+        dstType = CV_MAKE_TYPE(dstDepth, cn);
+    }
+};
+
+CUDA_TEST_P(SeparableLinearFilterWithEmptyKernels, Accuracy) // empty kernels must match the 1x1 "dummy kernel" workaround
+{
+    cv::Mat src = randomMat(size, srcType);
+    cv::Mat rowKernel = (cv::Mat_<float>(ksize) << -1, 0, 1);
+    cv::Mat colKernel = rowKernel.t();
+    cv::Mat oneKernel = cv::Mat::ones(cv::Size(1, 1), CV_32FC1);
+    cv::Mat noKernel = cv::Mat();
+    // Reference filter: a 1x1 kernel of ones stands in for the direction that is not filtered.
+    cv::Ptr<cv::cuda::Filter> sepFilterDummyKernels =
+        cv::cuda::createSeparableLinearFilter(srcType, dstType,
+            useRowKernel ? rowKernel : oneKernel,
+            useColKernel ? colKernel : oneKernel,
+            anchor, borderType, borderType);
+    // Filter under test: an empty Mat is passed for the direction that is not filtered.
+    cv::Ptr<cv::cuda::Filter> sepFilterEmptyKernels =
+        cv::cuda::createSeparableLinearFilter(srcType, dstType,
+            useRowKernel ? rowKernel : noKernel,
+            useColKernel ? colKernel : noKernel,
+            anchor, borderType, borderType);
+    // Each filter gets its own upload of src; in the in-place case dst is initialised from that src.
+    cv::cuda::GpuMat src_sep_dummyK = loadMat(src);
+    cv::cuda::GpuMat dst_sep_dummyK = inPlace ? src_sep_dummyK : cv::cuda::GpuMat();
+    cv::cuda::GpuMat src_sep_emptyK = loadMat(src);
+    cv::cuda::GpuMat dst_sep_emptyK = inPlace ? src_sep_emptyK : cv::cuda::GpuMat();
+    // Run both variants on identical input.
+    sepFilterDummyKernels->apply(src_sep_dummyK, dst_sep_dummyK);
+    sepFilterEmptyKernels->apply(src_sep_emptyK, dst_sep_emptyK);
+    // Looser tolerance for integer sources than for floating-point sources.
+    EXPECT_MAT_NEAR(dst_sep_dummyK, dst_sep_emptyK, src.depth() < CV_32F ? 1.0 : 1e-2);
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_Filters, SeparableLinearFilterWithEmptyKernels, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32F)),
+    IMAGE_CHANNELS,
+    testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32F)),
+    testing::Values(false, true), // in-place: dst is initialised from src when true
+    testing::Values(false, true), // use row kernel: otherwise the row direction is skipped
+    testing::Values(false, true)  // use col kernel: otherwise the column direction is skipped
+    ));
+
 /////////////////////////////////////////////////////////////////////////////////////////////////
 // Sobel