
Commit 4f66f86

Merge pull request #3475 from cudawarped:cuda_fix_unaligned_hist
Modified histogram kernels to work with non-aligned data
2 parents 70f8706 + 10e29b2 commit 4f66f86
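
The kernels in hist.cu read each row as 32-bit words. When calcHist or histEven is called on a ROI whose x offset is not a multiple of four, the row pointers are not 4-byte aligned and the word loads fail; this commit adds a templated non-aligned path that consumes a byte-wise head and tail around the aligned body, and forwards the ROI's x offset from histogram.cpp. A minimal repro sketch, not part of the commit, showing the call pattern that exercises the new path (the Rect geometry is only illustrative):

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaimgproc.hpp>

    int main()
    {
        cv::cuda::GpuMat whole(64, 64, CV_8UC1, cv::Scalar(0));
        // A ROI starting at x = 1 leaves each row pointer 1 byte past a word boundary.
        cv::cuda::GpuMat roi = whole(cv::Rect(1, 0, 61, 64));
        cv::cuda::GpuMat hist;
        // Before this commit the kernel cast the row pointer to unsigned int* and
        // issued misaligned 4-byte loads; now the unaligned bytes are handled
        // separately and the result is a correct 1x256 CV_32S histogram.
        cv::cuda::calcHist(roi, hist);
        return 0;
    }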

File tree

3 files changed: +156 -96 lines


modules/cudaimgproc/src/cuda/hist.cu

Lines changed: 90 additions & 70 deletions
@@ -52,38 +52,41 @@ using namespace cv::cuda::device;
 
 namespace hist
 {
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();
 
-        if (y < rows)
-        {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
+        if (y < rows) {
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
+            }
 
-                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    Emulation::smem::atomicAdd(&shist[data], 1);
-                }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
             }
         }
 
@@ -94,61 +97,70 @@ namespace hist
             ::atomicAdd(hist + tid, histVal);
     }
 
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
-
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
+        if(offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
     }
 
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
+    template<bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();
 
         if (y < rows)
         {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
-            const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
+            const uchar* rowPtr = &src[y * srcStep];
+            const uchar* maskRowPtr = &mask[y * maskStep];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[rowPtr[x]], 1);
+                }
+            }
 
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
-                unsigned int m = maskRowPtr[x];
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * srcStep] : &src[alignedOffset + y * maskStep]);
+            const unsigned int* maskRowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &mask[y * maskStep] : &mask[alignedOffset + y * maskStep]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                const unsigned int m = maskRowPtrIntAligned[x];
 
-                if ((m >> 0) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                if ((m >> 0) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
 
-                if ((m >> 8) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+                if ((m >> 8) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
 
-                if ((m >> 16) & 0xFFU)
+                if ((m >> 16) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
 
-                if ((m >> 24) & 0xFFU)
+                if ((m >> 24) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    unsigned int m = ((const uchar*)maskRowPtr)[x];
-
-                    if (m)
-                        Emulation::smem::atomicAdd(&shist[data], 1);
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
                 }
             }
         }
@@ -160,12 +172,15 @@ namespace hist
             ::atomicAdd(hist + tid, histVal);
     }
 
-    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
 
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
+        if(offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
@@ -186,42 +201,44 @@ namespace hist
         }
     }
 
-    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
-                               int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
+    template<bool fourByteAligned>
+    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, int* hist, const int binCount, const int binSize,
+        const int lowerLevel, const int upperLevel, const int offsetX)
     {
         extern __shared__ int shist[];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         if (tid < binCount)
             shist[tid] = 0;
-
         __syncthreads();
 
         if (y < rows)
         {
-            const uchar* rowPtr = src + y * step;
-            const uint* rowPtr4 = (uint*) rowPtr;
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                const uint data = rowPtr4[x];
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
+            }
 
-                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
-                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    const uchar data = rowPtr[x];
-                    histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
-                }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
             }
         }
 
@@ -236,7 +253,7 @@ namespace hist
         }
     }
 
-    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
@@ -245,7 +262,10 @@ namespace hist
 
         const size_t smem_size = binCount * sizeof(int);
 
-        histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
+        if(offsetX)
+            histEven8u<false><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
+        else
+            histEven8u<true><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
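
Each kernel now splits a row into three ranges: an uncoalesced head of at most three bytes that brings the pointer to the next word boundary, a coalesced body of 4-byte words, and an uncoalesced tail. A host-side sketch, not part of the commit, of the same index arithmetic; it assumes offsetX is the sub-word remainder of the ROI's x offset, i.e. a value in 0..3:

    #include <algorithm>
    #include <cassert>

    // Mirrors alignedOffset / cols_4 / iTailStart from the kernels above.
    void partitionRow(int cols, int offsetX)
    {
        const bool fourByteAligned = (offsetX == 0);
        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
        const int head = fourByteAligned ? 0 : std::min(alignedOffset, cols);
        const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
        const int tailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
        // Head, word body and tail cover every pixel of the row exactly once.
        assert(head + cols_4 * 4 + std::max(cols - tailStart, 0) == cols);
    }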

modules/cudaimgproc/src/histogram.cpp

Lines changed: 10 additions & 6 deletions
@@ -68,8 +68,8 @@ void cv::cuda::histRange(InputArray, GpuMat*, const GpuMat*, Stream&) { throw_no
 
 namespace hist
 {
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream);
-    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream);
+    void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream);
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream);
 }
 
 void cv::cuda::calcHist(InputArray _src, OutputArray _hist, Stream& stream)
@@ -91,10 +91,12 @@ void cv::cuda::calcHist(InputArray _src, InputArray _mask, OutputArray _hist, St
 
     hist.setTo(Scalar::all(0), stream);
 
+    Point ofs; Size wholeSize;
+    src.locateROI(wholeSize, ofs);
     if (mask.empty())
-        hist::histogram256(src, hist.ptr<int>(), StreamAccessor::getStream(stream));
+        hist::histogram256(src, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
     else
-        hist::histogram256(src, mask, hist.ptr<int>(), StreamAccessor::getStream(stream));
+        hist::histogram256(src, mask, hist.ptr<int>(), ofs.x, StreamAccessor::getStream(stream));
 }
 
 ////////////////////////////////////////////////////////////////////////
@@ -494,16 +496,18 @@ void cv::cuda::evenLevels(OutputArray _levels, int nLevels, int lowerLevel, int
 
 namespace hist
 {
-    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream);
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream);
 }
 
 namespace
 {
     void histEven8u(const GpuMat& src, GpuMat& hist, int histSize, int lowerLevel, int upperLevel, cudaStream_t stream)
     {
+        Point ofs; Size wholeSize;
+        src.locateROI(wholeSize, ofs);
        hist.create(1, histSize, CV_32S);
        cudaSafeCall( cudaMemsetAsync(hist.data, 0, histSize * sizeof(int), stream) );
-        hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, stream);
+        hist::histEven8u(src, hist.ptr<int>(), histSize, lowerLevel, upperLevel, ofs.x, stream);
     }
 }
 
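
On the host side, locateROI recovers the ROI's offset inside the parent allocation, and that x offset selects the aligned or unaligned kernel instantiation. A usage sketch, not from the commit (the third changed file is not shown above), illustrating the caller-visible effect:

    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaimgproc.hpp>

    void histOnRoi(const cv::cuda::GpuMat& whole)
    {
        CV_Assert(whole.type() == CV_8UC1 && whole.cols > 3);
        cv::cuda::GpuMat roi = whole(cv::Rect(3, 0, whole.cols - 3, whole.rows));

        cv::Size wholeSize; cv::Point ofs;
        roi.locateROI(wholeSize, ofs);      // ofs.x == 3 when whole is a full allocation

        cv::cuda::GpuMat histRoi, histCopy;
        cv::cuda::calcHist(roi, histRoi);          // non-aligned rows: <false> kernel path
        cv::cuda::calcHist(roi.clone(), histCopy); // clone() repacks the data: aligned path
        // histRoi and histCopy now hold identical 1x256 CV_32S histograms.
    }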
