From e18fc48565ed99325801853a0db4a1cfaef3d47f Mon Sep 17 00:00:00 2001 From: Wang Qiang <37444407+wangqiang9@users.noreply.github.com> Date: Thu, 30 May 2024 16:40:19 +0800 Subject: [PATCH 1/3] Fixed a clerical error in qrcode.py --- modules/wechat_qrcode/samples/qrcode.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/wechat_qrcode/samples/qrcode.py b/modules/wechat_qrcode/samples/qrcode.py index 7713734f993..405a523e07c 100644 --- a/modules/wechat_qrcode/samples/qrcode.py +++ b/modules/wechat_qrcode/samples/qrcode.py @@ -22,7 +22,7 @@ except: print("---------------------------------------------------------------") print("Failed to initialize WeChatQRCode.") - print("Please, download 'detector.*' and 'sr.*' from") + print("Please, download 'detect.*' and 'sr.*' from") print("https://github.com/WeChatCV/opencv_3rdparty/tree/wechat_qrcode") print("and put them into the current directory.") print("---------------------------------------------------------------") From 14e34187c5ee215a46c90878a0489a123e7dfdfb Mon Sep 17 00:00:00 2001 From: Gregor Burger Date: Fri, 7 Jun 2024 08:25:10 +0200 Subject: [PATCH 2/3] also link to CUDA::cufft_static in case of BUILD_SHARED_LIBS=OFF --- modules/cudaarithm/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modules/cudaarithm/CMakeLists.txt b/modules/cudaarithm/CMakeLists.txt index 6ee7a9f96bb..b1aacd68696 100644 --- a/modules/cudaarithm/CMakeLists.txt +++ b/modules/cudaarithm/CMakeLists.txt @@ -9,6 +9,9 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4127 /wd4324 /wd4512 -Wundef -Wmissing-d set(extra_dependencies "") set(optional_dependencies "") if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) + if(UNIX AND NOT BUILD_SHARED_LIBS AND CUDA_VERSION_STRING VERSION_GREATER_EQUAL 9.2 AND CMAKE_VERSION VERSION_GREATER_EQUAL 3.23) + set(CUDA_FFT_LIB_EXT "_static_nocallback") + endif() list(APPEND extra_dependencies CUDA::cudart_static CUDA::nppial${CUDA_LIB_EXT} CUDA::nppc${CUDA_LIB_EXT} 
CUDA::nppitc${CUDA_LIB_EXT} CUDA::nppig${CUDA_LIB_EXT} CUDA::nppist${CUDA_LIB_EXT} CUDA::nppidei${CUDA_LIB_EXT}) if(HAVE_CUBLAS) list(APPEND optional_dependencies CUDA::cublas${CUDA_LIB_EXT}) @@ -18,7 +21,8 @@ if(ENABLE_CUDA_FIRST_CLASS_LANGUAGE) endif() if(HAVE_CUFFT) # static version requires seperable compilation which is incompatible with opencv's current library structure - list(APPEND optional_dependencies CUDA::cufft) + # the cufft_static_nocallback variant does not requires seperable compilation. callbacks are currently not used. + list(APPEND optional_dependencies CUDA::cufft${CUDA_FFT_LIB_EXT}) endif() else() if(HAVE_CUBLAS) From b2c0ce0f2b3c9b908eed39f59d7eba735799a90e Mon Sep 17 00:00:00 2001 From: Pierre Chatelier Date: Fri, 12 Jul 2024 15:15:35 +0200 Subject: [PATCH 3/3] Merge pull request #3731 from chacha21:cuda_separable_filter_single supports empty kernels in cuda::SeparableLinearFilters #3731 [#25408](https://github.com/opencv/opencv/issues/25408) When only 1D convolution is needed (row or column filter only), `cuda::LinearFilter` might be slower than `cuda::SeparableLinearFilter` Using `cuda::SeparableLinearFilter` for 1D convolution can be done by using a `(1)` kernel for the ignored dimension. By supporting empty kernels in `cuda::SeparableLinearFilter`, there is no need for that `(1)` kernel any more. Additionally, the inner `_buf ` used to store the intermediate convolution result can be saved when a single convolution is needed. In "legacy" usage (row+col kernels), there is no regression in `cuda::SeparableLinearFilter` performance. As soon as an empty kernel is used, the performance is largely increased. Devil in the details : the "in-place" processing is supported and might need intermediate buf, but still no regression. - [X] I agree to contribute to the project under Apache 2 License. 
- [X] To the best of my knowledge, the proposed patch is not based on a code under GPL or another license that is incompatible with OpenCV - [X] The PR is proposed to the proper branch - [X] There is a reference to the original bug report and related work - [X] There is accuracy test, performance test and test data in opencv_extra repository, if applicable Patch to opencv_extra has the same branch name. - [X] The feature is well documented and sample code can be built with the project CMake --- .../include/opencv2/cudafilters.hpp | 4 +- modules/cudafilters/src/filtering.cpp | 70 ++++++++++++---- modules/cudafilters/test/test_filters.cpp | 80 +++++++++++++++++++ 3 files changed, 138 insertions(+), 16 deletions(-) diff --git a/modules/cudafilters/include/opencv2/cudafilters.hpp b/modules/cudafilters/include/opencv2/cudafilters.hpp index 1519869f4ca..2aa9c846462 100644 --- a/modules/cudafilters/include/opencv2/cudafilters.hpp +++ b/modules/cudafilters/include/opencv2/cudafilters.hpp @@ -142,12 +142,14 @@ CV_EXPORTS_W Ptr createLaplacianFilter(int srcType, int dstType, int ksi //////////////////////////////////////////////////////////////////////////////////////////////////// // Separable Linear Filter -/** @brief Creates a separable linear filter. +/** @brief Creates a separable linear filter. In-place processing is supported. @param srcType Source array type. @param dstType Destination array type. @param rowKernel Horizontal filter coefficients. Support kernels with size \<= 32 . +noArray() is supported to ignore the row filtering. @param columnKernel Vertical filter coefficients. Support kernels with size \<= 32 . +noArray() is supported to ignore the column filtering. @param anchor Anchor position within the kernel. Negative values mean that anchor is positioned at the aperture center. 
@param rowBorderMode Pixel extrapolation method in the vertical direction For details, see diff --git a/modules/cudafilters/src/filtering.cpp b/modules/cudafilters/src/filtering.cpp index 2ae789c856d..185751f861f 100644 --- a/modules/cudafilters/src/filtering.cpp +++ b/modules/cudafilters/src/filtering.cpp @@ -386,28 +386,38 @@ namespace const int cn = CV_MAT_CN(srcType); const int ddepth = CV_MAT_DEPTH(dstType); - Mat rowKernel = _rowKernel.getMat(); - Mat columnKernel = _columnKernel.getMat(); + CV_Assert( _rowKernel.empty() || _rowKernel.isMat() ); + CV_Assert( _columnKernel.empty() || _columnKernel.isMat() ); + Mat rowKernel = _rowKernel.empty() ? cv::Mat() : _rowKernel.getMat(); + Mat columnKernel = _columnKernel.empty() ? cv::Mat() : _columnKernel.getMat(); CV_Assert( sdepth <= CV_64F && cn <= 4 ); - CV_Assert( rowKernel.channels() == 1 ); - CV_Assert( columnKernel.channels() == 1 ); + CV_Assert( rowKernel.empty() || rowKernel.channels() == 1 ); + CV_Assert( columnKernel.empty() || columnKernel.channels() == 1 ); CV_Assert( rowBorderMode == BORDER_REFLECT101 || rowBorderMode == BORDER_REPLICATE || rowBorderMode == BORDER_CONSTANT || rowBorderMode == BORDER_REFLECT || rowBorderMode == BORDER_WRAP ); CV_Assert( columnBorderMode == BORDER_REFLECT101 || columnBorderMode == BORDER_REPLICATE || columnBorderMode == BORDER_CONSTANT || columnBorderMode == BORDER_REFLECT || columnBorderMode == BORDER_WRAP ); Mat kernel32F; - rowKernel.convertTo(kernel32F, CV_32F); - rowKernel_.upload(kernel32F.reshape(1, 1)); + if (!rowKernel.empty()) + { + rowKernel.convertTo(kernel32F, CV_32F); + rowKernel_.upload(kernel32F.reshape(1, 1)); + } - columnKernel.convertTo(kernel32F, CV_32F); - columnKernel_.upload(kernel32F.reshape(1, 1)); + if (!columnKernel.empty()) + { + columnKernel.convertTo(kernel32F, CV_32F); + columnKernel_.upload(kernel32F.reshape(1, 1)); + } - CV_Assert( rowKernel_.cols > 0 && rowKernel_.cols <= 32 ); - CV_Assert( columnKernel_.cols > 0 && columnKernel_.cols <= 
32 ); + CV_Assert( rowKernel_.empty() || (rowKernel_.cols > 0 && rowKernel_.cols <= 32 )); + CV_Assert( columnKernel_.empty() || (columnKernel_.cols > 0 && columnKernel_.cols <= 32 )); - normalizeAnchor(anchor_.x, rowKernel_.cols); - normalizeAnchor(anchor_.y, columnKernel_.cols); + if (!rowKernel_.empty()) + normalizeAnchor(anchor_.x, rowKernel_.cols); + if (!columnKernel_.empty()) + normalizeAnchor(anchor_.y, columnKernel_.cols); bufType_ = CV_MAKE_TYPE(CV_32F, cn); @@ -426,15 +436,45 @@ namespace _dst.create(src.size(), dstType_); GpuMat dst = _dst.getGpuMat(); - ensureSizeIsEnough(src.size(), bufType_, buf_); + const bool isInPlace = (src.data == dst.data); + const bool hasRowKernel = !rowKernel_.empty(); + const bool hasColKernel = !columnKernel_.empty(); + const bool hasSingleKernel = (hasRowKernel ^ hasColKernel); + const bool needsSrcAdaptation = !hasRowKernel && hasColKernel && (srcType_ != bufType_); + const bool needsDstAdaptation = hasRowKernel && !hasColKernel && (dstType_ != bufType_); + const bool needsBufForIntermediateStorage = (hasRowKernel && hasColKernel) || (hasSingleKernel && isInPlace); + const bool needsBuf = needsSrcAdaptation || needsDstAdaptation || needsBufForIntermediateStorage; + if (needsBuf) + ensureSizeIsEnough(src.size(), bufType_, buf_); + + if (needsSrcAdaptation) + src.convertTo(buf_, bufType_, _stream); + GpuMat& srcAdapted = needsSrcAdaptation ? buf_ : src; DeviceInfo devInfo; const int cc = devInfo.majorVersion() * 10 + devInfo.minorVersion(); cudaStream_t stream = StreamAccessor::getStream(_stream); - rowFilter_(src, buf_, rowKernel_.ptr(), rowKernel_.cols, anchor_.x, rowBorderMode_, cc, stream); - columnFilter_(buf_, dst, columnKernel_.ptr(), columnKernel_.cols, anchor_.y, columnBorderMode_, cc, stream); + if (!hasRowKernel && !hasColKernel && !isInPlace) + srcAdapted.convertTo(dst, dstType_, _stream); + else if (hasRowKernel || hasColKernel) + { + GpuMat& rowFilterSrc = srcAdapted; + GpuMat& rowFilterDst = !hasRowKernel ? 
srcAdapted : needsBuf ? buf_ : dst; + GpuMat& colFilterSrc = hasColKernel && needsBuf ? buf_ : srcAdapted; + GpuMat& colFilterTo = dst; + + if (hasRowKernel) + rowFilter_(rowFilterSrc, rowFilterDst, rowKernel_.ptr(), rowKernel_.cols, anchor_.x, rowBorderMode_, cc, stream); + else if (hasColKernel && (needsBufForIntermediateStorage && !needsSrcAdaptation)) + rowFilterSrc.convertTo(buf_, bufType_, _stream); + + if (hasColKernel) + columnFilter_(colFilterSrc, colFilterTo, columnKernel_.ptr(), columnKernel_.cols, anchor_.y, columnBorderMode_, cc, stream); + else if (needsBuf) + buf_.convertTo(dst, dstType_, _stream); + } } } diff --git a/modules/cudafilters/test/test_filters.cpp b/modules/cudafilters/test/test_filters.cpp index 432b5d2a5ac..b70ea298ea0 100644 --- a/modules/cudafilters/test/test_filters.cpp +++ b/modules/cudafilters/test/test_filters.cpp @@ -281,6 +281,86 @@ INSTANTIATE_TEST_CASE_P(CUDA_Filters, SeparableLinearFilter, testing::Combine( BorderType(cv::BORDER_REFLECT)), WHOLE_SUBMAT)); +PARAM_TEST_CASE(SeparableLinearFilterWithEmptyKernels, cv::cuda::DeviceInfo, MatDepth, Channels, MatDepth, bool, bool, bool) +{ + cv::cuda::DeviceInfo devInfo; + bool inPlace; + bool useRowKernel; + bool useColKernel; + + cv::Size size; + int srcDepth; + int cn; + int dstDepth; + cv::Size ksize; + cv::Point anchor; + int borderType; + int srcType; + int dstType; + + virtual void SetUp() + { + devInfo = GET_PARAM(0); + srcDepth = GET_PARAM(1); + cn = GET_PARAM(2); + dstDepth = GET_PARAM(3); + inPlace = GET_PARAM(4); + useRowKernel = GET_PARAM(5); + useColKernel = GET_PARAM(6); + + size = cv::Size(640, 480); + ksize = cv::Size(3, 1); + anchor = cv::Point(-1, -1); + borderType = cv::BORDER_REPLICATE; + + cv::cuda::setDevice(devInfo.deviceID()); + + srcType = CV_MAKE_TYPE(srcDepth, cn); + dstType = CV_MAKE_TYPE(dstDepth, cn); + } +}; + +CUDA_TEST_P(SeparableLinearFilterWithEmptyKernels, Accuracy) +{ + cv::Mat src = randomMat(size, srcType); + cv::Mat rowKernel = 
(cv::Mat_(ksize) << -1, 0, 1); + cv::Mat colKernel = rowKernel.t(); + cv::Mat oneKernel = cv::Mat::ones(cv::Size(1, 1), CV_32FC1); + cv::Mat noKernel = cv::Mat(); + + cv::Ptr sepFilterDummyKernels = + cv::cuda::createSeparableLinearFilter(srcType, dstType, + useRowKernel ? rowKernel : oneKernel, + useColKernel ? colKernel : oneKernel, + cv::Point(-1, -1), cv::BORDER_REPLICATE, cv::BORDER_REPLICATE); + + cv::Ptr sepFilterEmptyKernels = + cv::cuda::createSeparableLinearFilter(srcType, dstType, + useRowKernel ? rowKernel : noKernel, + useColKernel ? colKernel : noKernel, + cv::Point(-1, -1), cv::BORDER_REPLICATE, cv::BORDER_REPLICATE); + + cv::cuda::GpuMat src_sep_dummyK = loadMat(src); + cv::cuda::GpuMat dst_sep_dummyK = inPlace ? src_sep_dummyK : cv::cuda::GpuMat(); + cv::cuda::GpuMat src_sep_emptyK = loadMat(src); + cv::cuda::GpuMat dst_sep_emptyK = inPlace ? src_sep_emptyK : cv::cuda::GpuMat(); + + sepFilterDummyKernels->apply(src_sep_dummyK, dst_sep_dummyK); + sepFilterEmptyKernels->apply(src_sep_emptyK, dst_sep_emptyK); + + EXPECT_MAT_NEAR(dst_sep_dummyK, dst_sep_emptyK, src.depth() < CV_32F ? 1.0 : 1e-2); +} + +INSTANTIATE_TEST_CASE_P(CUDA_Filters, SeparableLinearFilterWithEmptyKernels, testing::Combine( + ALL_DEVICES, + testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32F)), + IMAGE_CHANNELS, + testing::Values(MatDepth(CV_8U), MatDepth(CV_16U), MatDepth(CV_16S), MatDepth(CV_32F)), + testing::Values(false, true),//in-place + testing::Values(false, true),//use row kernel + testing::Values(false, true)//use col kernel + )); + ///////////////////////////////////////////////////////////////////////////////////////////////// // Sobel