Skip to content

Commit f617f18

Browse files
committed
bit-exact cuda::equalizeHist
1 parent 2c1f348 commit f617f18

File tree

3 files changed

+217
-25
lines changed

3 files changed

+217
-25
lines changed

modules/cudaimgproc/src/cuda/hist.cu

Lines changed: 131 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -257,18 +257,15 @@ namespace hist
257257

258258
namespace hist
259259
{
260-
__constant__ int c_lut[256];
261-
262260
struct EqualizeHist : unary_function<uchar, uchar>
263261
{
264-
float scale;
262+
const uchar* lut;
265263

266-
__host__ EqualizeHist(float _scale) : scale(_scale) {}
264+
__host__ EqualizeHist(const uchar* _lut) : lut(_lut) {}
267265

268266
__device__ __forceinline__ uchar operator ()(uchar val) const
269267
{
270-
const int lut = c_lut[val];
271-
return __float2int_rn(scale * lut);
268+
return lut[val];
272269
}
273270
};
274271
}
@@ -283,16 +280,137 @@ namespace cv { namespace cuda { namespace device
283280

284281
namespace hist
285282
{
286-
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
283+
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const uchar* lut, cudaStream_t stream)
287284
{
288-
if (stream == 0)
289-
cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
290-
else
291-
cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );
285+
device::transform(src, dst, EqualizeHist(lut), WithOutMask(), stream);
286+
}
287+
288+
__global__ void buildLutKernel(int* hist, unsigned char* lut, int size)
289+
{
290+
__shared__ int warp_smem[8];
291+
__shared__ int hist_smem[8][33];
292+
293+
#define HIST_SMEM_NO_BANK_CONFLICT(idx) hist_smem[(idx) >> 5][(idx) & 31]
294+
295+
const int tId = threadIdx.x;
296+
const int warpId = threadIdx.x / 32;
297+
const int laneId = threadIdx.x % 32;
298+
299+
// Step1 - Find minimum non-zero value in hist and make it zero
300+
HIST_SMEM_NO_BANK_CONFLICT(tId) = hist[tId];
301+
int nonZeroIdx = HIST_SMEM_NO_BANK_CONFLICT(tId) > 0 ? tId : 256;
302+
303+
__syncthreads();
304+
305+
for (int delta = 16; delta > 0; delta /= 2)
306+
{
307+
#if __CUDACC_VER_MAJOR__ >= 9
308+
int shflVal = __shfl_down_sync(0xFFFFFFFF, nonZeroIdx, delta);
309+
#else
310+
int shflVal = __shfl_down(nonZeroIdx, delta);
311+
#endif
312+
if (laneId < delta)
313+
nonZeroIdx = min(nonZeroIdx, shflVal);
314+
}
315+
316+
if (laneId == 0)
317+
warp_smem[warpId] = nonZeroIdx;
292318

293-
const float scale = 255.0f / (src.cols * src.rows);
319+
__syncthreads();
320+
321+
if (tId < 8)
322+
{
323+
int warpVal = warp_smem[tId];
324+
for (int delta = 4; delta > 0; delta /= 2)
325+
{
326+
#if __CUDACC_VER_MAJOR__ >= 9
327+
int shflVal = __shfl_down_sync(0x000000FF, warpVal, delta);
328+
#else
329+
int shflVal = __shfl_down(warpVal, delta);
330+
#endif
331+
if (tId < delta)
332+
warpVal = min(warpVal, shflVal);
333+
}
334+
if (tId == 0)
335+
{
336+
warp_smem[0] = warpVal; // warpVal - minimum index
337+
}
338+
}
339+
340+
__syncthreads();
341+
342+
const int minNonZeroIdx = warp_smem[0];
343+
const int minNonZeroVal = HIST_SMEM_NO_BANK_CONFLICT(minNonZeroIdx);
344+
if (minNonZeroVal == size)
345+
{
346+
// This is a special case: the whole image has the same color
347+
348+
lut[tId] = 0;
349+
if (tId == minNonZeroIdx)
350+
lut[tId] = minNonZeroIdx;
351+
return;
352+
}
294353

295-
device::transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
354+
if (tId == 0)
355+
HIST_SMEM_NO_BANK_CONFLICT(minNonZeroIdx) = 0;
356+
357+
__syncthreads();
358+
359+
// Step2 - Inclusive sum
360+
// Algorithm from GPU Gems 3 (A Work-Efficient Parallel Scan)
361+
// https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
362+
363+
// Step2 Phase1 - The Up-Sweep Phase
364+
for (int delta = 1; delta < 256; delta *= 2)
365+
{
366+
if (tId < 128 / delta)
367+
{
368+
int idx = 255 - 2 * tId * delta;
369+
HIST_SMEM_NO_BANK_CONFLICT(idx) += HIST_SMEM_NO_BANK_CONFLICT(idx - delta);
370+
}
371+
__syncthreads();
372+
}
373+
374+
// Step2 Phase2 - The Down-Sweep Phase
375+
if (tId == 0)
376+
HIST_SMEM_NO_BANK_CONFLICT(255) = 0;
377+
378+
for (int delta = 128; delta >= 1; delta /= 2)
379+
{
380+
if (tId < 128 / delta)
381+
{
382+
int rootIdx = 255 - tId * delta * 2;
383+
int leftIdx = rootIdx - delta;
384+
int tmp = HIST_SMEM_NO_BANK_CONFLICT(leftIdx);
385+
HIST_SMEM_NO_BANK_CONFLICT(leftIdx) = HIST_SMEM_NO_BANK_CONFLICT(rootIdx);
386+
HIST_SMEM_NO_BANK_CONFLICT(rootIdx) += tmp;
387+
}
388+
__syncthreads();
389+
}
390+
391+
// Step2 Phase3 - Convert exclusive sum to inclusive sum
392+
int tmp = HIST_SMEM_NO_BANK_CONFLICT(tId);
393+
__syncthreads();
394+
if (tId >= 1)
395+
HIST_SMEM_NO_BANK_CONFLICT(tId - 1) = tmp;
396+
if (tId == 255)
397+
HIST_SMEM_NO_BANK_CONFLICT(tId) = tmp + hist[tId];
398+
__syncthreads();
399+
400+
// Step3 - Scale values to build lut
401+
402+
lut[tId] = saturate_cast<unsigned char>(HIST_SMEM_NO_BANK_CONFLICT(tId) * (255.0f / (size - minNonZeroVal)));
403+
404+
#undef HIST_SMEM_NO_BANK_CONFLICT
405+
}
406+
407+
void buildLut(PtrStepSzi hist, PtrStepSzb lut, int size, cudaStream_t stream)
408+
{
409+
buildLutKernel<<<1, 256, 0, stream>>>(hist.data, lut.data, size);
410+
cudaSafeCall( cudaGetLastError() );
411+
412+
if (stream == 0)
413+
cudaSafeCall( cudaDeviceSynchronize() );
296414
}
297415
}
298416

modules/cudaimgproc/src/histogram.cpp

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,8 @@ void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, St
102102

103103
namespace hist
104104
{
105-
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream);
105+
void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const uchar* lut, cudaStream_t stream);
106+
void buildLut(PtrStepSzi hist, PtrStepSzb lut, int size, cudaStream_t stream);
106107
}
107108

108109
void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream)
@@ -114,26 +115,21 @@ void cv::cuda::equalizeHist(InputArray _src, OutputArray _dst, Stream& _stream)
114115
_dst.create(src.size(), src.type());
115116
GpuMat dst = _dst.getGpuMat();
116117

117-
int intBufSize;
118-
nppSafeCall( nppsIntegralGetBufferSize_32s(256, &intBufSize) );
119-
120-
size_t bufSize = intBufSize + 2 * 256 * sizeof(int);
118+
size_t bufSize = 256 * sizeof(int) + 256 * sizeof(uchar);
121119

122120
BufferPool pool(_stream);
123121
GpuMat buf = pool.getBuffer(1, static_cast<int>(bufSize), CV_8UC1);
124122

125123
GpuMat hist(1, 256, CV_32SC1, buf.data);
126-
GpuMat lut(1, 256, CV_32SC1, buf.data + 256 * sizeof(int));
127-
GpuMat intBuf(1, intBufSize, CV_8UC1, buf.data + 2 * 256 * sizeof(int));
124+
GpuMat lut(1, 256, CV_8UC1, buf.data + 256 * sizeof(int));
128125

129126
cuda::calcHist(src, hist, _stream);
130127

131128
cudaStream_t stream = StreamAccessor::getStream(_stream);
132-
NppStreamHandler h(stream);
133129

134-
nppSafeCall( nppsIntegral_32s(hist.ptr<Npp32s>(), lut.ptr<Npp32s>(), 256, intBuf.ptr<Npp8u>()) );
130+
hist::buildLut(hist, lut, src.rows * src.cols, stream);
135131

136-
hist::equalizeHist(src, dst, lut.ptr<int>(), stream);
132+
hist::equalizeHist(src, dst, lut.data, stream);
137133
}
138134

139135
////////////////////////////////////////////////////////////////////////

modules/cudaimgproc/test/test_histogram.cpp

Lines changed: 80 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ CUDA_TEST_P(EqualizeHist, Async)
208208
cv::Mat dst_gold;
209209
cv::equalizeHist(src, dst_gold);
210210

211-
EXPECT_MAT_NEAR(dst_gold, dst, 3.0);
211+
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
212212
}
213213

214214
CUDA_TEST_P(EqualizeHist, Accuracy)
@@ -221,13 +221,91 @@ CUDA_TEST_P(EqualizeHist, Accuracy)
221221
cv::Mat dst_gold;
222222
cv::equalizeHist(src, dst_gold);
223223

224-
EXPECT_MAT_NEAR(dst_gold, dst, 3.0);
224+
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
225225
}
226226

227227
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, EqualizeHist, testing::Combine(
228228
ALL_DEVICES,
229229
DIFFERENT_SIZES));
230230

231+
TEST(EqualizeHistIssue, Issue18035)
232+
{
233+
std::vector<std::string> imgPaths;
234+
imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/3MP.png");
235+
imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/5MP.png");
236+
imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/airplane.png");
237+
imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/baboon.png");
238+
imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/box.png");
239+
imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/box_in_scene.png");
240+
imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/fruits.png");
241+
imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/fruits_ecc.png");
242+
imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/graffiti.png");
243+
imgPaths.push_back(std::string(cvtest::TS::ptr()->get_data_path()) + "../cv/shared/lena.png");
244+
245+
for (int i = 0; i < imgPaths.size(); ++i)
246+
{
247+
std::string imgPath = imgPaths[i];
248+
cv::Mat src = cv::imread(imgPath, cv::IMREAD_GRAYSCALE);
249+
src = src / 30;
250+
251+
cv::cuda::GpuMat d_src, dst;
252+
d_src.upload(src);
253+
cv::cuda::equalizeHist(d_src, dst);
254+
255+
cv::Mat dst_gold;
256+
cv::equalizeHist(src, dst_gold);
257+
258+
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
259+
}
260+
}
261+
262+
PARAM_TEST_CASE(EqualizeHistExtreme, cv::cuda::DeviceInfo, cv::Size, int)
263+
{
264+
cv::cuda::DeviceInfo devInfo;
265+
cv::Size size;
266+
int val;
267+
268+
virtual void SetUp()
269+
{
270+
devInfo = GET_PARAM(0);
271+
size = GET_PARAM(1);
272+
val = GET_PARAM(2);
273+
274+
cv::cuda::setDevice(devInfo.deviceID());
275+
}
276+
};
277+
278+
CUDA_TEST_P(EqualizeHistExtreme, Case1)
279+
{
280+
cv::Mat src(size, CV_8UC1, val);
281+
282+
cv::cuda::GpuMat dst;
283+
cv::cuda::equalizeHist(loadMat(src), dst);
284+
285+
cv::Mat dst_gold;
286+
cv::equalizeHist(src, dst_gold);
287+
288+
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
289+
}
290+
291+
CUDA_TEST_P(EqualizeHistExtreme, Case2)
292+
{
293+
cv::Mat src = randomMat(size, CV_8UC1, val);
294+
295+
cv::cuda::GpuMat dst;
296+
cv::cuda::equalizeHist(loadMat(src), dst);
297+
298+
cv::Mat dst_gold;
299+
cv::equalizeHist(src, dst_gold);
300+
301+
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
302+
}
303+
304+
INSTANTIATE_TEST_CASE_P(CUDA_ImgProc, EqualizeHistExtreme, testing::Combine(
305+
ALL_DEVICES,
306+
DIFFERENT_SIZES,
307+
testing::Range(0, 256)));
308+
231309
///////////////////////////////////////////////////////////////////////////////////////////////////////
232310
// CLAHE
233311

0 commit comments

Comments
 (0)