Skip to content

Fix race condition in Otsu's method #3970

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jul 17, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 35 additions & 11 deletions modules/cudaarithm/src/cuda/threshold.cu
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ namespace

__global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long long *sums)
{
const uint32_t n_bins = 256;
const uint n_bins = 256;

__shared__ uint shared_memory_ts[n_bins];
__shared__ unsigned long long shared_memory_s[n_bins];
Expand All @@ -109,7 +109,7 @@ __global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long l
uint threshold_sum_above = 0;
unsigned long long sum_above = 0;

if (bin_idx >= threshold)
if (bin_idx > threshold)
{
uint value = histogram[bin_idx];
threshold_sum_above = value;
Expand All @@ -129,7 +129,7 @@ __global__ void otsu_sums(uint *histogram, uint *threshold_sums, unsigned long l
__global__ void
otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned long long *sums)
{
const uint32_t n_bins = 256;
const uint n_bins = 256;

__shared__ signed long long shared_memory_a[n_bins];
__shared__ signed long long shared_memory_b[n_bins];
Expand All @@ -147,7 +147,7 @@ otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned

float threshold_variance_above_f32 = 0;
float threshold_variance_below_f32 = 0;
if (bin_idx >= threshold)
if (bin_idx > threshold)
{
float mean = (float) sum_above / n_samples_above;
float sigma = bin_idx - mean;
Expand All @@ -172,11 +172,35 @@ otsu_variance(float2 *variance, uint *histogram, uint *threshold_sums, unsigned
}
}

template <uint n_thresholds>
__device__ bool has_lowest_score(
uint threshold, float original_score, float score, uint *shared_memory
) {
// It may happen that multiple threads have the same minimum score. In that case, we want to find the thread with
// the lowest threshold. This is done by calling '__syncthreads_count' to count how many threads have a score
// that matches to the minimum score found. Since this is rare, we will optimize towards the common case where only
// one thread has the minimum score. If multiple threads have the same minimum score, we will find the minimum
// threshold that satifies the condition
bool has_match = original_score == score;
uint matches = __syncthreads_count(has_match);

if(matches > 1) {
// If this thread has a match, we use it; otherwise we give it a value that is larger than the maximum
// threshold, so it will never get picked
uint min_threshold = has_match ? threshold : n_thresholds;

blockReduce<n_thresholds>(shared_memory, min_threshold, threshold, minimum<uint>());

return min_threshold == threshold;
} else {
return has_match;
}
}

__global__ void
otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance)
{
const uint32_t n_thresholds = 256;
const uint n_thresholds = 256;

__shared__ float shared_memory[n_thresholds];

Expand All @@ -190,8 +214,8 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance)
float threshold_mean_below = (float)n_samples_below / n_samples;

float2 variances = variance[threshold];
float variance_above = variances.x / n_samples_above;
float variance_below = variances.y / n_samples_below;
float variance_above = n_samples_above > 0 ? variances.x / n_samples_above : 0.0f;
float variance_below = n_samples_below > 0 ? variances.y / n_samples_below : 0.0f;

float above = threshold_mean_above * variance_above;
float below = threshold_mean_below * variance_below;
Expand All @@ -209,11 +233,11 @@ otsu_score(uint *otsu_threshold, uint *threshold_sums, float2 *variance)

score = shared_memory[0];

// We found the minimum score, but we need to find the threshold. If we find the thread with the minimum score, we
// know which threshold it is
if (original_score == score)
// We found the minimum score, but in some cases multiple threads can have the same score, so we need to find the
// lowest threshold
if (has_lowest_score<n_thresholds>(threshold, original_score, score, (uint *) shared_memory))
{
*otsu_threshold = threshold - 1;
*otsu_threshold = threshold;
}
}

Expand Down
44 changes: 44 additions & 0 deletions modules/cudaarithm/test/test_element_operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2625,6 +2625,50 @@ INSTANTIATE_TEST_CASE_P(CUDA_Arithm, ThresholdOtsu, testing::Combine(
WHOLE_SUBMAT));


///////////////////////////////////////////////////////////////////////////////////////////////////////
// ThresholdOtsuMulti Multiple Valid Thresholds

PARAM_TEST_CASE(ThresholdOtsuMulti, cv::cuda::DeviceInfo, cv::Size, MatType, Channels)
{
cv::cuda::DeviceInfo devInfo;
cv::Size size;
int type;
int channel;

virtual void SetUp()
{
devInfo = GET_PARAM(0);
size = GET_PARAM(1);
type = GET_PARAM(2);
channel = GET_PARAM(3);

cv::cuda::setDevice(devInfo.deviceID());
}
};

CUDA_TEST_P(ThresholdOtsuMulti, Accuracy)
{
cv::Mat src = Mat(size, CV_MAKE_TYPE(type, channel), cv::Scalar(0));
src.colRange(src.cols / 2, src.cols).setTo(255);

cv::cuda::GpuMat dst = createMat(src.size(), src.type());
double otsu_gpu = cv::cuda::threshold(loadMat(src), dst, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU);

cv::Mat dst_gold;
double otsu_cpu = cv::threshold(src, dst_gold, 0, 255, cv::THRESH_BINARY | cv::THRESH_OTSU);

ASSERT_DOUBLE_EQ(otsu_gpu, otsu_cpu);
ASSERT_DOUBLE_EQ(otsu_gpu, 0);
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}

INSTANTIATE_TEST_CASE_P(CUDA_Arithm, ThresholdOtsuMulti, testing::Combine(
ALL_DEVICES,
testing::Values(cv::Size(1, 100)),
testing::Values(MatDepth(CV_8U)),
testing::Values(Channels(1))));


////////////////////////////////////////////////////////////////////////////////
// InRange

Expand Down
Loading