@@ -257,18 +257,15 @@ namespace hist
 namespace hist
 {
-    __constant__ int c_lut[256];
-
     struct EqualizeHist : unary_function<uchar, uchar>
     {
-        float scale;
+        const uchar* lut;
 
-        __host__ EqualizeHist(float _scale) : scale(_scale) {}
+        __host__ EqualizeHist(const uchar* _lut) : lut(_lut) {}
 
 
         __device__ __forceinline__ uchar operator ()(uchar val) const
         {
-            const int lut = c_lut[val];
-            return __float2int_rn(scale * lut);
+            return lut[val];
         }
     };
 }
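With the `__constant__` table and the runtime `__float2int_rn` scaling gone, the per-pixel work in the functor is a single table lookup. A minimal standalone sketch of what that amounts to (hypothetical kernel and names, assuming a contiguous 8-bit image; OpenCV's `device::transform` additionally handles pitched rows and vectorized loads):

#include <cuda_runtime.h>

// Illustrative only: one global-memory LUT fetch per pixel, mirroring the
// body of EqualizeHist::operator() after this change.
__global__ void applyLutKernel(const unsigned char* src, unsigned char* dst,
                               const unsigned char* lut, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        dst[i] = lut[src[i]];
}

Because the LUT now lives in ordinary device memory, no cudaMemcpyToSymbol round-trip is needed before the transform can be enqueued on a stream.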
@@ -283,16 +280,137 @@ namespace cv { namespace cuda { namespace device
 
 namespace hist
 {
-    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const int* lut, cudaStream_t stream)
+    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const uchar* lut, cudaStream_t stream)
     {
-        if (stream == 0)
-            cudaSafeCall( cudaMemcpyToSymbol(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice) );
-        else
-            cudaSafeCall( cudaMemcpyToSymbolAsync(c_lut, lut, 256 * sizeof(int), 0, cudaMemcpyDeviceToDevice, stream) );
-
-        const float scale = 255.0f / (src.cols * src.rows);
-
-        device::transform(src, dst, EqualizeHist(scale), WithOutMask(), stream);
+        device::transform(src, dst, EqualizeHist(lut), WithOutMask(), stream);
+    }
+
+    __global__ void buildLutKernel(int* hist, unsigned char* lut, int size)
+    {
+        __shared__ int warp_smem[8];
+        __shared__ int hist_smem[8][33];
+
+        #define HIST_SMEM_NO_BANK_CONFLICT(idx) hist_smem[(idx) >> 5][(idx) & 31]
+
+        const int tId = threadIdx.x;
+        const int warpId = threadIdx.x / 32;
+        const int laneId = threadIdx.x % 32;
+
+        // Step1 - Find minimum non-zero value in hist and make it zero
+        HIST_SMEM_NO_BANK_CONFLICT(tId) = hist[tId];
+        int nonZeroIdx = HIST_SMEM_NO_BANK_CONFLICT(tId) > 0 ? tId : 256;
+
+        __syncthreads();
+
+        for (int delta = 16; delta > 0; delta /= 2)
+        {
+        #if __CUDACC_VER_MAJOR__ >= 9
+            int shflVal = __shfl_down_sync(0xFFFFFFFF, nonZeroIdx, delta);
+        #else
+            int shflVal = __shfl_down(nonZeroIdx, delta);
+        #endif
+            if (laneId < delta)
+                nonZeroIdx = min(nonZeroIdx, shflVal);
+        }
+
+        if (laneId == 0)
+            warp_smem[warpId] = nonZeroIdx;
+
+        __syncthreads();
+
+        if (tId < 8)
+        {
+            int warpVal = warp_smem[tId];
+            for (int delta = 4; delta > 0; delta /= 2)
+            {
+            #if __CUDACC_VER_MAJOR__ >= 9
+                int shflVal = __shfl_down_sync(0x000000FF, warpVal, delta);
+            #else
+                int shflVal = __shfl_down(warpVal, delta);
+            #endif
+                if (tId < delta)
+                    warpVal = min(warpVal, shflVal);
+            }
+            if (tId == 0)
+            {
+                warp_smem[0] = warpVal; // warpVal - minimum index
+            }
+        }
+
+        __syncthreads();
+
+        const int minNonZeroIdx = warp_smem[0];
+        const int minNonZeroVal = HIST_SMEM_NO_BANK_CONFLICT(minNonZeroIdx);
+        if (minNonZeroVal == size)
+        {
+            // This is a special case: the whole image has the same color
+
+            lut[tId] = 0;
+            if (tId == minNonZeroIdx)
+                lut[tId] = minNonZeroIdx;
+            return;
+        }
+
+        if (tId == 0)
+            HIST_SMEM_NO_BANK_CONFLICT(minNonZeroIdx) = 0;
+
+        __syncthreads();
+
+        // Step2 - Inclusive sum
+        // Algorithm from GPU Gems 3 (A Work-Efficient Parallel Scan)
+        // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda
+
+        // Step2 Phase1 - The Up-Sweep Phase
+        for (int delta = 1; delta < 256; delta *= 2)
+        {
+            if (tId < 128 / delta)
+            {
+                int idx = 255 - 2 * tId * delta;
+                HIST_SMEM_NO_BANK_CONFLICT(idx) += HIST_SMEM_NO_BANK_CONFLICT(idx - delta);
+            }
+            __syncthreads();
+        }
+
+        // Step2 Phase2 - The Down-Sweep Phase
+        if (tId == 0)
+            HIST_SMEM_NO_BANK_CONFLICT(255) = 0;
+
+        for (int delta = 128; delta >= 1; delta /= 2)
+        {
+            if (tId < 128 / delta)
+            {
+                int rootIdx = 255 - tId * delta * 2;
+                int leftIdx = rootIdx - delta;
+                int tmp = HIST_SMEM_NO_BANK_CONFLICT(leftIdx);
+                HIST_SMEM_NO_BANK_CONFLICT(leftIdx) = HIST_SMEM_NO_BANK_CONFLICT(rootIdx);
+                HIST_SMEM_NO_BANK_CONFLICT(rootIdx) += tmp;
+            }
+            __syncthreads();
+        }
+
+        // Step2 Phase3 - Convert exclusive sum to inclusive sum
+        int tmp = HIST_SMEM_NO_BANK_CONFLICT(tId);
+        __syncthreads();
+        if (tId >= 1)
+            HIST_SMEM_NO_BANK_CONFLICT(tId - 1) = tmp;
+        if (tId == 255)
+            HIST_SMEM_NO_BANK_CONFLICT(tId) = tmp + hist[tId];
+        __syncthreads();
+
+        // Step3 - Scale values to build lut
+
+        lut[tId] = saturate_cast<unsigned char>(HIST_SMEM_NO_BANK_CONFLICT(tId) * (255.0f / (size - minNonZeroVal)));
+
+        #undef HIST_SMEM_NO_BANK_CONFLICT
+    }
+
+    void buildLut(PtrStepSzi hist, PtrStepSzb lut, int size, cudaStream_t stream)
+    {
+        buildLutKernel<<<1, 256, 0, stream>>>(hist.data, lut.data, size);
+        cudaSafeCall( cudaGetLastError() );
+
+        if (stream == 0)
+            cudaSafeCall( cudaDeviceSynchronize() );
     }
 }
 
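For reference, the LUT that buildLutKernel produces follows the classic equalization mapping lut[i] = saturate((cdf[i] - cdfMin) * 255 / (size - cdfMin)), where cdfMin is the CDF value at the first non-empty histogram bin; Step1 zeroes that bin before the scan, which is what subtracts cdfMin. A serial C++ sketch of the same computation (hypothetical names, useful for checking the parallel version; rounding approximates saturate_cast):

#include <algorithm>
#include <cstdint>

// Serial reference for buildLutKernel's result (illustrative, not part of the patch).
void buildLutReference(const int hist[256], std::uint8_t lut[256], int size)
{
    int minIdx = 0;                          // first non-empty bin
    while (minIdx < 256 && hist[minIdx] == 0)
        ++minIdx;
    const int minVal = (minIdx < 256) ? hist[minIdx] : 0;

    if (minIdx == 256 || minVal == size)     // empty or single-color image
    {
        std::fill(lut, lut + 256, std::uint8_t(0));
        if (minIdx < 256)
            lut[minIdx] = static_cast<std::uint8_t>(minIdx); // keep that level as-is
        return;
    }

    const float scale = 255.0f / (size - minVal);
    int cdf = 0;                             // inclusive prefix sum of hist
    for (int i = 0; i < 256; ++i)
    {
        cdf += hist[i];
        const float v = std::max(cdf - minVal, 0) * scale;
        lut[i] = static_cast<std::uint8_t>(std::min(v + 0.5f, 255.0f));
    }
}

The kernel obtains the same prefix sum with the up-sweep/down-sweep scan from GPU Gems 3, and pads shared memory to 33 columns (the HIST_SMEM_NO_BANK_CONFLICT macro) so the strided scan accesses avoid shared-memory bank conflicts.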
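Taken together, the two entry points let the whole pipeline stay on the device: histogram, LUT construction, and the per-pixel remap are all enqueued on one stream, with no host round-trip for the LUT. A hypothetical host-side caller (a sketch only, assuming the declarations above plus cv::cuda::calcHist; the real cudaimgproc wrapper manages its buffers differently):

#include <opencv2/cudaimgproc.hpp>
#include <opencv2/core/cuda_stream_accessor.hpp>

// Sketch of chaining the new entry points: histogram -> LUT -> remap.
void equalizeHistSketch(const cv::cuda::GpuMat& src, cv::cuda::GpuMat& dst,
                        cv::cuda::Stream& s)
{
    CV_Assert(src.type() == CV_8UC1);
    dst.create(src.size(), src.type());

    cv::cuda::GpuMat hist(1, 256, CV_32SC1);
    cv::cuda::GpuMat lut(1, 256, CV_8UC1);

    cv::cuda::calcHist(src, hist, s);                        // 256-bin histogram on the device
    cudaStream_t stream = cv::cuda::StreamAccessor::getStream(s);
    hist::buildLut(hist, lut, src.rows * src.cols, stream);  // LUT built entirely on the GPU
    hist::equalizeHist(src, dst, lut.data, stream);          // per-pixel LUT apply
}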