Commit 8402454

Merge pull request #3970 from troelsy:4.x
Fix race condition in Otsu's method #3970

I found that in some cases Otsu's method can have two or more equal thresholds. In the CUDA implementation, this would create a race condition where the last thread to write the output gets to choose which Otsu threshold to use. The resulting image from cv::cuda::threshold would be the same, but the returned threshold would vary from run to run.

I solve this by first polling with `__syncthreads_count` to check whether there are multiple matches. If not, we can return the threshold as before. If there is more than one match, the kernel does a minimum reduction to find the lowest threshold that matches the Otsu score. It doesn't matter which threshold you choose as long as it is consistent, so I choose the smallest one.

I'm unsure when `__syncthreads_count` was introduced, but it is compatible with CC ≥ 2.0. The CUDA Toolkit archive only goes back to version 8.0, but it was already documented there (https://docs.nvidia.com/cuda/archive).

### Pull Request Readiness Checklist

See details at https://github.com/opencv/opencv/wiki/How_to_contribute#making-a-good-pull-request

- [x] I agree to contribute to the project under Apache 2 License.
- [x] To the best of my knowledge, the proposed patch is not based on code under GPL or another license that is incompatible with OpenCV.
- [ ] The PR is proposed to the proper branch.
- [ ] There is a reference to the original bug report and related work.
- [ ] There is an accuracy test, performance test and test data in the opencv_extra repository, if applicable. The patch to opencv_extra has the same branch name.
- [ ] The feature is well documented and sample code can be built with the project CMake.
1 parent 735b7a0 commit 8402454
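A minimal, self-contained sketch of the tie-breaking pattern described in the commit message. This is not the OpenCV implementation (which reuses the `blockReduce` helper visible in the diff below); the kernel name `argmin_lowest_index`, the plain shared-memory tree reductions, and the single 256-thread block are illustrative assumptions.

```cuda
// Hypothetical standalone kernel (not OpenCV code): 256 threads each hold one
// score; ties on the minimum score are broken deterministically by picking the
// smallest thread index (i.e. the smallest candidate threshold).
#include <cuda_runtime.h>

__global__ void argmin_lowest_index(const float *scores, unsigned int *out_index)
{
    const unsigned int n = 256;          // one thread per candidate threshold
    __shared__ float s_score[n];
    __shared__ unsigned int s_index[n];

    const unsigned int tid = threadIdx.x;
    const float my_score = scores[tid];

    // Block-wide minimum of the scores (shared-memory tree reduction).
    s_score[tid] = my_score;
    __syncthreads();
    for (unsigned int stride = n / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride)
            s_score[tid] = fminf(s_score[tid], s_score[tid + stride]);
        __syncthreads();
    }
    const float min_score = s_score[0];

    // Count how many threads hit the minimum. __syncthreads_count is itself a
    // barrier and returns the same count to every thread in the block.
    const bool has_match = (my_score == min_score);
    const int matches = __syncthreads_count(has_match);

    if (matches == 1)
    {
        if (has_match)
            *out_index = tid;            // unique winner, write directly
        return;                          // uniform branch: all threads leave together
    }

    // Several threads tie on the minimum score: reduce their indices to the
    // smallest one so the result no longer depends on scheduling order.
    s_index[tid] = has_match ? tid : n;  // n is larger than any valid index
    __syncthreads();
    for (unsigned int stride = n / 2; stride > 0; stride >>= 1)
    {
        if (tid < stride)
            s_index[tid] = min(s_index[tid], s_index[tid + stride]);
        __syncthreads();
    }
    if (tid == s_index[0])
        *out_index = tid;                // exactly one thread writes
}
```

A launch such as `argmin_lowest_index<<<1, 256>>>(d_scores, d_index);` always yields the smallest tied index. Because the count returned by `__syncthreads_count` is uniform across the block, the tie-breaking branch cannot diverge, and which tied candidate wins no longer depends on write ordering.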

2 files changed: +79 −11 lines


modules/cudaarithm/src/cuda/threshold.cu

Lines changed: 35 additions & 11 deletions
```diff
@@ -98,7 +98,7 @@ namespace
 
 __global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long long *sums)
 {
-    const uint32_t n_bins = 256;
+    const uint n_bins = 256;
 
     __shared__ uint shared_memory_ts[n_bins];
     __shared__ unsigned long long shared_memory_s[n_bins];
@@ -109,7 +109,7 @@ __global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long l
     uint threshold_sum_above = 0;
     unsigned long long sum_above = 0;
 
-    if (bin_idx >= threshold)
+    if (bin_idx > threshold)
     {
         uint value = histogram[bin_idx];
         threshold_sum_above = value;
@@ -129,7 +129,7 @@ __global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long l
 __global__ void
 otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums)
 {
-    const uint32_t n_bins = 256;
+    const uint n_bins = 256;
 
     __shared__ signed long long shared_memory_a[n_bins];
     __shared__ signed long long shared_memory_b[n_bins];
@@ -147,7 +147,7 @@ otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned
 
     float threshold_variance_above_f32 = 0;
     float threshold_variance_below_f32 = 0;
-    if (bin_idx >= threshold)
+    if (bin_idx > threshold)
     {
         float mean = (float) sum_above / n_samples_above;
         float sigma = bin_idx - mean;
@@ -172,11 +172,35 @@ otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned
     }
 }
 
+template <uint n_thresholds>
+__device__ bool has_lowest_score(
+    uint threshold, float original_score, float score, uint *shared_memory
+) {
+    // It may happen that multiple threads have the same minimum score. In that case, we want to find the thread with
+    // the lowest threshold. This is done by calling '__syncthreads_count' to count how many threads have a score
+    // that matches the minimum score found. Since this is rare, we will optimize towards the common case where only
+    // one thread has the minimum score. If multiple threads have the same minimum score, we will find the minimum
+    // threshold that satisfies the condition
+    bool has_match = original_score == score;
+    uint matches = __syncthreads_count(has_match);
+
+    if(matches > 1) {
+        // If this thread has a match, we use it; otherwise we give it a value that is larger than the maximum
+        // threshold, so it will never get picked
+        uint min_threshold = has_match ? threshold : n_thresholds;
+
+        blockReduce<n_thresholds>(shared_memory, min_threshold, threshold, minimum<uint>());
+
+        return min_threshold == threshold;
+    } else {
+        return has_match;
+    }
+}
 
 __global__ void
 otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance)
 {
-    const uint32_t n_thresholds = 256;
+    const uint n_thresholds = 256;
 
     __shared__ float shared_memory[n_thresholds];
 
@@ -190,8 +214,8 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance)
     float threshold_mean_below = (float)n_samples_below / n_samples;
 
     float2 variances = variance[threshold];
-    float variance_above = variances.x / n_samples_above;
-    float variance_below = variances.y / n_samples_below;
+    float variance_above = n_samples_above > 0 ? variances.x / n_samples_above : 0.0f;
+    float variance_below = n_samples_below > 0 ? variances.y / n_samples_below : 0.0f;
 
     float above = threshold_mean_above * variance_above;
     float below = threshold_mean_below * variance_below;
@@ -209,11 +233,11 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance)
 
     score = shared_memory[0];
 
-    // We found the minimum score, but we need to find the threshold. If we find the thread with the minimum score, we
-    // know which threshold it is
-    if (original_score == score)
+    // We found the minimum score, but in some cases multiple threads can have the same score, so we need to find the
+    // lowest threshold
+    if (has_lowest_score<n_thresholds>(threshold, original_score, score, (uint *) shared_memory))
     {
-        *otsu_threshold = threshold - 1;
+        *otsu_threshold = threshold;
     }
 }
 
```

modules/cudaarithm/test/test_element_operations.cpp

Lines changed: 44 additions & 0 deletions
```diff
@@ -2625,6 +2625,50 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, ThresholdOtsu, testing::Combine(
     WHOLE_SUBMAT));
 
 
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+// ThresholdOtsuMulti Multiple Valid Thresholds
+
+PARAM_TEST_CASE(ThresholdOtsuMulti, cv::cuda::DeviceInfo, cv::Size, MatType, Channels)
+{
+    cv::cuda::DeviceInfo devInfo;
+    cv::Size size;
+    int type;
+    int channel;
+
+    virtual void SetUp()
+    {
+        devInfo = GET_PARAM(0);
+        size = GET_PARAM(1);
+        type = GET_PARAM(2);
+        channel = GET_PARAM(3);
+
+        cv::cuda::setDevice(devInfo.deviceID());
+    }
+};
+
+CUDA_TEST_P(ThresholdOtsuMulti, Accuracy)
+{
+    cv::Mat src = Mat(size, CV_MAKE_TYPE(type, channel), cv::Scalar(0));
+    src.colRange(src.cols / 2, src.cols).setTo(255);
+
+    cv::cuda::GpuMat dst = createMat(src.size(), src.type());
+    double otsu_gpu = cv::cuda::threshold(loadMat(src), dst, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU);
+
+    cv::Mat dst_gold;
+    double otsu_cpu = cv::threshold(src, dst_gold, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU);
+
+    ASSERT_DOUBLE_EQ(otsu_gpu, otsu_cpu);
+    ASSERT_DOUBLE_EQ(otsu_gpu, 0);
+    EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
+}
+
+INSTANTIATE_TEST_CASE_P(CUDA_Arithm, ThresholdOtsuMulti, testing::Combine(
+    ALL_DEVICES,
+    testing::Values(cv::Size(1, 100)),
+    testing::Values(MatDepth(CV_8U)),
+    testing::Values(Channels(1))));
+
+
 ////////////////////////////////////////////////////////////////////////////////
 // InRange
 
```
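For context, a hypothetical host-side snippet (not part of this PR or of the test above) that exercises the tie case end to end: a half-black, half-white image admits several equally good Otsu thresholds, so after this fix repeated calls to cv::cuda::threshold should return the same value on every run. It assumes OpenCV is built with the cudaarithm module and a CUDA-capable device is available.

```cpp
// Hypothetical determinism check. Before this fix, the two returned thresholds
// could differ between runs even though the thresholded images were identical.
#include <opencv2/core.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>

int main()
{
    // Half-black / half-white image: many thresholds separate the two classes
    // equally well, which is the tie case described in the commit message.
    cv::Mat src(100, 100, CV_8UC1, cv::Scalar(0));
    src.rowRange(50, 100).setTo(255);

    cv::cuda::GpuMat d_src(src), d_dst;
    const double t1 = cv::cuda::threshold(d_src, d_dst, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU);
    const double t2 = cv::cuda::threshold(d_src, d_dst, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU);

    std::cout << "otsu #1: " << t1 << ", otsu #2: " << t2 << std::endl;
    return t1 == t2 ? 0 : 1;  // expected to agree after this fix
}
```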