@@ -52,38 +52,41 @@ using namespace cv::cuda::device;
 
 namespace hist
 {
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
+    template <bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         shist[tid] = 0;
         __syncthreads();
 
-        if (y < rows)
-        {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
+        if (y < rows) {
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
+            }
 
-                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
-                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
                 Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    Emulation::smem::atomicAdd(&shist[data], 1);
-                }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
             }
         }
 
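
A minimal host-side sketch (illustrative, not part of this patch) of how the new offsetX argument could be derived. The helper name computeOffsetX is hypothetical; the assumption is that offsetX is the byte misalignment of the view's first row modulo 4 and that the pitch is a multiple of 4 (true for pitched GpuMat allocations), so every row shares the same misalignment and one value covers the whole image.

#include <cstdint>
#include <opencv2/core/cuda.hpp>

// Hypothetical helper (assumption, not code from this patch): the misalignment,
// in bytes, of the view's first row relative to a 4-byte boundary. With a step
// that is a multiple of 4, every row of the view shares this misalignment.
static int computeOffsetX(const cv::cuda::GpuMat& src)
{
    return static_cast<int>(reinterpret_cast<std::uintptr_t>(src.data) % 4);
}
// offsetX == 0 selects histogram256Kernel<true> (no head/tail handling);
// offsetX in 1..3 selects histogram256Kernel<false>, which consumes
// alignedOffset = 4 - offsetX bytes before the coalesced uint loads.
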
@@ -94,61 +97,70 @@ namespace hist
             ::atomicAdd(hist + tid, histVal);
     }
 
-    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
-
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
+        if (offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
             cudaSafeCall( cudaDeviceSynchronize() );
     }
 
-    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
+    template <bool fourByteAligned>
+    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist, const int offsetX = 0)
     {
         __shared__ int shist[256];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
        shist[tid] = 0;
        __syncthreads();
 
         if (y < rows)
         {
-            const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
-            const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);
+            const uchar* rowPtr = &src[y * srcStep];
+            const uchar* maskRowPtr = &mask[y * maskStep];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[rowPtr[x]], 1);
+                }
+            }
 
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                unsigned int data = rowPtr[x];
-                unsigned int m = maskRowPtr[x];
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * srcStep] : &src[alignedOffset + y * srcStep]);
+            const unsigned int* maskRowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &mask[y * maskStep] : &mask[alignedOffset + y * maskStep]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                const unsigned int m = maskRowPtrIntAligned[x];
 
-                if ((m >> 0) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
+                if ((m >> 0) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 0) & 0xFFU], 1);
 
-                if ((m >> 8) & 0xFFU)
-                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
+                if ((m >> 8) & 0xFFU)
+                    Emulation::smem::atomicAdd(&shist[(data >> 8) & 0xFFU], 1);
 
-                if ((m >> 16) & 0xFFU)
+                if ((m >> 16) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
 
-                if ((m >> 24) & 0xFFU)
+                if ((m >> 24) & 0xFFU)
                     Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    unsigned int data = ((const uchar*)rowPtr)[x];
-                    unsigned int m = ((const uchar*)maskRowPtr)[x];
-
-                    if (m)
-                        Emulation::smem::atomicAdd(&shist[data], 1);
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++) {
+                    if (maskRowPtr[x])
+                        Emulation::smem::atomicAdd(&shist[static_cast<int>(rowPtr[x])], 1);
                 }
             }
         }
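
The head/body/tail index arithmetic is the same in each rewritten kernel; the stand-alone check below (illustrative only, plain host C++, not part of the patch) confirms that the uncoalesced head, the cols_4 coalesced words and the uncoalesced tail touch every byte of a row exactly once for any misalignment, including rows shorter than the head.

#include <algorithm>
#include <cassert>
#include <vector>

// Verify the split used by the kernels above for offsetX in 1..3.
void checkPartition(int cols, int offsetX)
{
    const int alignedOffset = 4 - offsetX;                  // bytes to the next 4-byte boundary
    const int cols_4 = (cols - alignedOffset) / 4;          // full words read coalesced
    const int iTailStart = cols_4 * 4 + alignedOffset;

    std::vector<int> hits(cols, 0);
    for (int x = 0; x < std::min(alignedOffset, cols); ++x) // uncoalesced head
        ++hits[x];
    for (int x = 0; x < cols_4 * 4; ++x)                    // coalesced body, byte view
        ++hits[alignedOffset + x];
    for (int x = iTailStart; x < cols; ++x)                 // uncoalesced tail
        ++hits[x];

    for (int x = 0; x < cols; ++x)
        assert(hits[x] == 1);                               // each byte counted once
}

For example, checkPartition(2, 1) (a row shorter than the head) and checkPartition(642, 3) (non-empty head, body and tail) both pass.
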
@@ -160,12 +172,15 @@ namespace hist
             ::atomicAdd(hist + tid, histVal);
     }
 
-    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
+    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
 
-        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
+        if (offsetX)
+            histogram256Kernel<false><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
+        else
+            histogram256Kernel<true><<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
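
The unaligned path is exercised by views: a ROI whose x-origin is not a multiple of 4 yields row pointers that are not word-aligned. A hedged usage sketch, assuming the public cv::cuda::calcHist wrapper forwards that misalignment to the new offsetX parameter:

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>

void roiHistogramExample()
{
    cv::cuda::GpuMat img(480, 640, CV_8UC1);
    img.setTo(cv::Scalar::all(0));

    // A view starting 1 byte into every row: its row pointers are not 4-byte
    // aligned, so the <false> kernel instantiation is the one expected to run.
    cv::cuda::GpuMat roi = img(cv::Rect(1, 0, 637, 480));

    cv::cuda::GpuMat hist;              // filled as a 1x256 CV_32SC1 histogram
    cv::cuda::calcHist(roi, hist);
}
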
@@ -186,42 +201,44 @@ namespace hist
         }
     }
 
-    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
-                               int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
+    template <bool fourByteAligned>
+    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols, int* hist, const int binCount, const int binSize,
+                               const int lowerLevel, const int upperLevel, const int offsetX)
     {
         extern __shared__ int shist[];
 
         const int y = blockIdx.x * blockDim.y + threadIdx.y;
         const int tid = threadIdx.y * blockDim.x + threadIdx.x;
-
+        const int alignedOffset = fourByteAligned ? 0 : 4 - offsetX;
         if (tid < binCount)
             shist[tid] = 0;
-
         __syncthreads();
 
         if (y < rows)
         {
-            const uchar* rowPtr = src + y * step;
-            const uint* rowPtr4 = (uint*) rowPtr;
-
-            const int cols_4 = cols / 4;
-            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
-            {
-                const uint data = rowPtr4[x];
+            const uchar* rowPtr = &src[y * step];
+            // load uncoalesced head
+            if (!fourByteAligned && threadIdx.x == 0) {
+                for (int x = 0; x < min(alignedOffset, cols); x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
+            }
 
-                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
-                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
+            // coalesced loads
+            const unsigned int* rowPtrIntAligned = (const unsigned int*)(fourByteAligned ? &src[y * step] : &src[alignedOffset + y * step]);
+            const int cols_4 = fourByteAligned ? cols / 4 : (cols - alignedOffset) / 4;
+            for (int x = threadIdx.x; x < cols_4; x += blockDim.x) {
+                const unsigned int data = rowPtrIntAligned[x];
+                histEvenInc(shist, (data >> 0) & 0xFFU, binSize, lowerLevel, upperLevel);
+                histEvenInc(shist, (data >> 8) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
                 histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
             }
 
-            if (cols % 4 != 0 && threadIdx.x == 0)
-            {
-                for (int x = cols_4 * 4; x < cols; ++x)
-                {
-                    const uchar data = rowPtr[x];
-                    histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
-                }
+            // load uncoalesced tail
+            if (threadIdx.x == 0) {
+                const int iTailStart = fourByteAligned ? cols_4 * 4 : cols_4 * 4 + alignedOffset;
+                for (int x = iTailStart; x < cols; x++)
+                    histEvenInc(shist, rowPtr[x], binSize, lowerLevel, upperLevel);
            }
        }
 
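
For orientation, the per-byte work this kernel delegates to histEvenInc amounts to even binning of each value into bins of width binSize. The reference below is a hedged host-side restatement of that rule; the function name histEvenIncRef and the exclusive treatment of upperLevel are assumptions made for this sketch, and the authoritative edge handling is the unchanged histEvenInc helper in this file.

// Hedged host-side restatement of even binning (names and edge handling are
// assumptions of this sketch, not code taken from the file).
inline void histEvenIncRef(int* hist, int value, int binSize,
                           int lowerLevel, int upperLevel)
{
    if (value < lowerLevel || value >= upperLevel)
        return;                                // outside [lowerLevel, upperLevel)
    ++hist[(value - lowerLevel) / binSize];    // even bins of width binSize
}
// Example: lowerLevel = 0, upperLevel = 256, binSize = 8 sends value 200 to bin 25.
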
@@ -236,7 +253,7 @@ namespace hist
         }
     }
 
-    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
+    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, const int offsetX, cudaStream_t stream)
     {
         const dim3 block(32, 8);
         const dim3 grid(divUp(src.rows, block.y));
@@ -245,7 +262,10 @@ namespace hist
 
         const size_t smem_size = binCount * sizeof(int);
 
-        histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
+        if (offsetX)
+            histEven8u<false><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
+        else
+            histEven8u<true><<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel, offsetX);
         cudaSafeCall( cudaGetLastError() );
 
         if (stream == 0)
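
A closing usage sketch (illustrative, not part of the patch): with a non-default stream the host wrappers above skip the cudaDeviceSynchronize() call and run asynchronously, so the caller owns the synchronization point. The histEven call assumes the public wrapper forwards any ROI misalignment as offsetX.

#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaimgproc.hpp>

void asyncHistEvenExample()
{
    cv::cuda::Stream stream;
    cv::cuda::GpuMat src(480, 640, CV_8UC1), hist;
    src.setTo(cv::Scalar::all(128), stream);

    // 32 even bins over [0, 256); enqueued on `stream`, returns immediately.
    cv::cuda::histEven(src, hist, 32, 0, 256, stream);

    stream.waitForCompletion();          // histogram valid after this point
}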