Skip to content

Commit 6630eac

Browse files
committed
Merge pull request opencv#17173 from tomoaki0705:fixOclHogDetect
2 parents 5da4bb7 + 63f5f93 commit 6630eac

File tree

2 files changed

+32
-168
lines changed

2 files changed

+32
-168
lines changed

modules/objdetect/src/hog.cpp

Lines changed: 4 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1216,15 +1216,6 @@ static bool ocl_compute_hists(int nbins, int block_stride_x, int block_stride_y,
12161216
UMat grad, UMat qangle, UMat gauss_w_lut, UMat block_hists, size_t block_hist_size)
12171217
{
12181218
ocl::Kernel k("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc);
1219-
if(k.empty())
1220-
return false;
1221-
bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
1222-
cv::String opts;
1223-
if(is_cpu)
1224-
opts = "-D CPU ";
1225-
else
1226-
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
1227-
k.create("compute_hists_lut_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
12281219
if(k.empty())
12291220
return false;
12301221

@@ -1285,19 +1276,10 @@ static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_
12851276
size_t localThreads[3] = { 1, 1, 1 };
12861277

12871278
int idx = 0;
1288-
bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
1289-
cv::String opts;
12901279
ocl::Kernel k;
12911280
if ( nbins == 9 )
12921281
{
12931282
k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
1294-
if(k.empty())
1295-
return false;
1296-
if(is_cpu)
1297-
opts = "-D CPU ";
1298-
else
1299-
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
1300-
k.create("normalize_hists_36_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
13011283
if(k.empty())
13021284
return false;
13031285

@@ -1309,14 +1291,7 @@ static bool ocl_normalize_hists(int nbins, int block_stride_x, int block_stride_
13091291
}
13101292
else
13111293
{
1312-
k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32");
1313-
if(k.empty())
1314-
return false;
1315-
if(is_cpu)
1316-
opts = "-D CPU ";
1317-
else
1318-
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
1319-
k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
1294+
k.create("normalize_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
13201295
if(k.empty())
13211296
return false;
13221297

@@ -1733,7 +1708,6 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y
17331708
float free_coef, float threshold, UMat& labels, Size descr_size, int block_hist_size)
17341709
{
17351710
int nthreads;
1736-
bool is_cpu = cv::ocl::Device::getDefault().type() == cv::ocl::Device::TYPE_CPU;
17371711
cv::String opts;
17381712

17391713
ocl::Kernel k;
@@ -1742,14 +1716,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y
17421716
{
17431717
case 180:
17441718
nthreads = 180;
1745-
k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32");
1746-
if(k.empty())
1747-
return false;
1748-
if(is_cpu)
1749-
opts = "-D CPU ";
1750-
else
1751-
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
1752-
k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
1719+
k.create("classify_hists_180_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
17531720
if(k.empty())
17541721
return false;
17551722
idx = k.set(idx, descr_size.width);
@@ -1758,14 +1725,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y
17581725

17591726
case 252:
17601727
nthreads = 256;
1761-
k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32");
1762-
if(k.empty())
1763-
return false;
1764-
if(is_cpu)
1765-
opts = "-D CPU ";
1766-
else
1767-
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
1768-
k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
1728+
k.create("classify_hists_252_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
17691729
if(k.empty())
17701730
return false;
17711731
idx = k.set(idx, descr_size.width);
@@ -1774,14 +1734,7 @@ static bool ocl_classify_hists(int win_height, int win_width, int block_stride_y
17741734

17751735
default:
17761736
nthreads = 256;
1777-
k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "-D WAVE_SIZE=32");
1778-
if(k.empty())
1779-
return false;
1780-
if(is_cpu)
1781-
opts = "-D CPU ";
1782-
else
1783-
opts = cv::format("-D WAVE_SIZE=%d", k.preferedWorkGroupSizeMultiple());
1784-
k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, opts);
1737+
k.create("classify_hists_kernel", ocl::objdetect::objdetect_hog_oclsrc, "");
17851738
if(k.empty())
17861739
return false;
17871740
idx = k.set(idx, descr_size.area());

modules/objdetect/src/opencl/objdetect_hog.cl

Lines changed: 28 additions & 117 deletions
Original file line numberDiff line numberDiff line change
@@ -134,9 +134,7 @@ __kernel void compute_hists_lut_kernel(
134134
barrier(CLK_LOCAL_MEM_FENCE);
135135
if (cell_thread_x < 3)
136136
hist_[0] += hist_[3];
137-
#ifdef CPU
138137
barrier(CLK_LOCAL_MEM_FENCE);
139-
#endif
140138
if (cell_thread_x == 0)
141139
final_hist[(cell_x * 2 + cell_y) * cnbins + bin_id] =
142140
hist_[0] + hist_[1] + hist_[2];
@@ -218,7 +216,6 @@ inline float reduce_smem(volatile __local float* smem, int size)
218216
barrier(CLK_LOCAL_MEM_FENCE); }
219217
if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64];
220218
barrier(CLK_LOCAL_MEM_FENCE); }
221-
#ifdef CPU
222219
if (size >= 64) { if (tid < 32) smem[tid] = sum = sum + smem[tid + 32];
223220
barrier(CLK_LOCAL_MEM_FENCE); }
224221
if (size >= 32) { if (tid < 16) smem[tid] = sum = sum + smem[tid + 16];
@@ -231,21 +228,6 @@ inline float reduce_smem(volatile __local float* smem, int size)
231228
barrier(CLK_LOCAL_MEM_FENCE); }
232229
if (size >= 2) { if (tid < 1) smem[tid] = sum = sum + smem[tid + 1];
233230
barrier(CLK_LOCAL_MEM_FENCE); }
234-
#else
235-
if (tid < 32)
236-
{
237-
if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
238-
#if WAVE_SIZE < 32
239-
} barrier(CLK_LOCAL_MEM_FENCE);
240-
if (tid < 16) {
241-
#endif
242-
if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
243-
if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
244-
if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
245-
if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
246-
if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
247-
}
248-
#endif
249231

250232
return sum;
251233
}
@@ -284,6 +266,10 @@ __kernel void normalize_hists_kernel(
284266
hist[0] = elem * scale;
285267
}
286268

269+
#define reduce_with_sync(target, sharedMemory, localMemory, tid, offset) \
270+
if (tid < target) sharedMemory[tid] = localMemory = localMemory + sharedMemory[tid + offset]; \
271+
barrier(CLK_LOCAL_MEM_FENCE);
272+
287273
//---------------------------------------------------------------------
288274
// Linear SVM based classification
289275
// 48x96 window, 9 bins and default parameters
@@ -316,43 +302,16 @@ __kernel void classify_hists_180_kernel(
316302

317303
barrier(CLK_LOCAL_MEM_FENCE);
318304

319-
if (tid < 90) products[tid] = product = product + products[tid + 90];
320-
barrier(CLK_LOCAL_MEM_FENCE);
321-
322-
if (tid < 45) products[tid] = product = product + products[tid + 45];
323-
barrier(CLK_LOCAL_MEM_FENCE);
324-
325-
volatile __local float* smem = products;
326-
#ifdef CPU
327-
if (tid < 13) smem[tid] = product = product + smem[tid + 32];
328-
barrier(CLK_LOCAL_MEM_FENCE);
329-
if (tid < 16) smem[tid] = product = product + smem[tid + 16];
330-
barrier(CLK_LOCAL_MEM_FENCE);
331-
if(tid<8) smem[tid] = product = product + smem[tid + 8];
332-
barrier(CLK_LOCAL_MEM_FENCE);
333-
if(tid<4) smem[tid] = product = product + smem[tid + 4];
334-
barrier(CLK_LOCAL_MEM_FENCE);
335-
if(tid<2) smem[tid] = product = product + smem[tid + 2];
336-
barrier(CLK_LOCAL_MEM_FENCE);
337-
#else
338-
if (tid < 13)
339-
{
340-
smem[tid] = product = product + smem[tid + 32];
341-
}
342-
#if WAVE_SIZE < 32
343-
barrier(CLK_LOCAL_MEM_FENCE);
344-
#endif
345-
if (tid < 16)
346-
{
347-
smem[tid] = product = product + smem[tid + 16];
348-
smem[tid] = product = product + smem[tid + 8];
349-
smem[tid] = product = product + smem[tid + 4];
350-
smem[tid] = product = product + smem[tid + 2];
351-
}
352-
#endif
305+
reduce_with_sync(90, products, product, tid, 90);
306+
reduce_with_sync(45, products, product, tid, 45);
307+
reduce_with_sync(13, products, product, tid, 32); // 13 is not a typo: 45 partial sums remain, so only indices 32..44 (13 values) are folded here
308+
reduce_with_sync(16, products, product, tid, 16);
309+
reduce_with_sync(8, products, product, tid, 8);
310+
reduce_with_sync(4, products, product, tid, 4);
311+
reduce_with_sync(2, products, product, tid, 2);
353312

354313
if (tid == 0){
355-
product = product + smem[tid + 1];
314+
product = product + products[tid + 1];
356315
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
357316
}
358317
}
@@ -389,40 +348,16 @@ __kernel void classify_hists_252_kernel(
389348

390349
barrier(CLK_LOCAL_MEM_FENCE);
391350

392-
if (tid < 128) products[tid] = product = product + products[tid + 128];
393-
barrier(CLK_LOCAL_MEM_FENCE);
351+
reduce_with_sync(128, products, product, tid, 128);
352+
reduce_with_sync(64, products, product, tid, 64);
353+
reduce_with_sync(32, products, product, tid, 32);
354+
reduce_with_sync(16, products, product, tid, 16);
355+
reduce_with_sync(8, products, product, tid, 8);
356+
reduce_with_sync(4, products, product, tid, 4);
357+
reduce_with_sync(2, products, product, tid, 2);
394358

395-
if (tid < 64) products[tid] = product = product + products[tid + 64];
396-
barrier(CLK_LOCAL_MEM_FENCE);
397-
398-
volatile __local float* smem = products;
399-
#ifdef CPU
400-
if(tid<32) smem[tid] = product = product + smem[tid + 32];
401-
barrier(CLK_LOCAL_MEM_FENCE);
402-
if(tid<16) smem[tid] = product = product + smem[tid + 16];
403-
barrier(CLK_LOCAL_MEM_FENCE);
404-
if(tid<8) smem[tid] = product = product + smem[tid + 8];
405-
barrier(CLK_LOCAL_MEM_FENCE);
406-
if(tid<4) smem[tid] = product = product + smem[tid + 4];
407-
barrier(CLK_LOCAL_MEM_FENCE);
408-
if(tid<2) smem[tid] = product = product + smem[tid + 2];
409-
barrier(CLK_LOCAL_MEM_FENCE);
410-
#else
411-
if (tid < 32)
412-
{
413-
smem[tid] = product = product + smem[tid + 32];
414-
#if WAVE_SIZE < 32
415-
} barrier(CLK_LOCAL_MEM_FENCE);
416-
if (tid < 16) {
417-
#endif
418-
smem[tid] = product = product + smem[tid + 16];
419-
smem[tid] = product = product + smem[tid + 8];
420-
smem[tid] = product = product + smem[tid + 4];
421-
smem[tid] = product = product + smem[tid + 2];
422-
}
423-
#endif
424359
if (tid == 0){
425-
product = product + smem[tid + 1];
360+
product = product + products[tid + 1];
426361
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
427362
}
428363
}
@@ -459,40 +394,16 @@ __kernel void classify_hists_kernel(
459394

460395
barrier(CLK_LOCAL_MEM_FENCE);
461396

462-
if (tid < 128) products[tid] = product = product + products[tid + 128];
463-
barrier(CLK_LOCAL_MEM_FENCE);
397+
reduce_with_sync(128, products, product, tid, 128);
398+
reduce_with_sync(64, products, product, tid, 64);
399+
reduce_with_sync(32, products, product, tid, 32);
400+
reduce_with_sync(16, products, product, tid, 16);
401+
reduce_with_sync(8, products, product, tid, 8);
402+
reduce_with_sync(4, products, product, tid, 4);
403+
reduce_with_sync(2, products, product, tid, 2);
464404

465-
if (tid < 64) products[tid] = product = product + products[tid + 64];
466-
barrier(CLK_LOCAL_MEM_FENCE);
467-
468-
volatile __local float* smem = products;
469-
#ifdef CPU
470-
if(tid<32) smem[tid] = product = product + smem[tid + 32];
471-
barrier(CLK_LOCAL_MEM_FENCE);
472-
if(tid<16) smem[tid] = product = product + smem[tid + 16];
473-
barrier(CLK_LOCAL_MEM_FENCE);
474-
if(tid<8) smem[tid] = product = product + smem[tid + 8];
475-
barrier(CLK_LOCAL_MEM_FENCE);
476-
if(tid<4) smem[tid] = product = product + smem[tid + 4];
477-
barrier(CLK_LOCAL_MEM_FENCE);
478-
if(tid<2) smem[tid] = product = product + smem[tid + 2];
479-
barrier(CLK_LOCAL_MEM_FENCE);
480-
#else
481-
if (tid < 32)
482-
{
483-
smem[tid] = product = product + smem[tid + 32];
484-
#if WAVE_SIZE < 32
485-
} barrier(CLK_LOCAL_MEM_FENCE);
486-
if (tid < 16) {
487-
#endif
488-
smem[tid] = product = product + smem[tid + 16];
489-
smem[tid] = product = product + smem[tid + 8];
490-
smem[tid] = product = product + smem[tid + 4];
491-
smem[tid] = product = product + smem[tid + 2];
492-
}
493-
#endif
494405
if (tid == 0){
495-
smem[tid] = product = product + smem[tid + 1];
406+
products[tid] = product = product + products[tid + 1];
496407
labels[gidY * img_win_width + gidX] = (product + free_coef >= threshold);
497408
}
498409
}

0 commit comments

Comments
 (0)