@@ -134,9 +134,7 @@ __kernel void compute_hists_lut_kernel(
134
134
barrier (CLK_LOCAL_MEM_FENCE );
135
135
if (cell_thread_x < 3 )
136
136
hist_ [0 ] += hist_ [3 ];
137
- #ifdef CPU
138
137
barrier (CLK_LOCAL_MEM_FENCE );
139
- #endif
140
138
if (cell_thread_x == 0 )
141
139
final_hist [(cell_x * 2 + cell_y ) * cnbins + bin_id ] =
142
140
hist_ [0 ] + hist_ [1 ] + hist_ [2 ];
@@ -218,7 +216,6 @@ inline float reduce_smem(volatile __local float* smem, int size)
218
216
barrier (CLK_LOCAL_MEM_FENCE ); }
219
217
if (size >= 128 ) { if (tid < 64 ) smem [tid ] = sum = sum + smem [tid + 64 ];
220
218
barrier (CLK_LOCAL_MEM_FENCE ); }
221
- #ifdef CPU
222
219
if (size >= 64 ) { if (tid < 32 ) smem [tid ] = sum = sum + smem [tid + 32 ];
223
220
barrier (CLK_LOCAL_MEM_FENCE ); }
224
221
if (size >= 32 ) { if (tid < 16 ) smem [tid ] = sum = sum + smem [tid + 16 ];
@@ -231,21 +228,6 @@ inline float reduce_smem(volatile __local float* smem, int size)
231
228
barrier (CLK_LOCAL_MEM_FENCE ); }
232
229
if (size >= 2 ) { if (tid < 1 ) smem [tid ] = sum = sum + smem [tid + 1 ];
233
230
barrier (CLK_LOCAL_MEM_FENCE ); }
234
- #else
235
- if (tid < 32 )
236
- {
237
- if (size >= 64 ) smem [tid ] = sum = sum + smem [tid + 32 ];
238
- #if WAVE_SIZE < 32
239
- } barrier (CLK_LOCAL_MEM_FENCE );
240
- if (tid < 16 ) {
241
- #endif
242
- if (size >= 32 ) smem [tid ] = sum = sum + smem [tid + 16 ];
243
- if (size >= 16 ) smem [tid ] = sum = sum + smem [tid + 8 ];
244
- if (size >= 8 ) smem [tid ] = sum = sum + smem [tid + 4 ];
245
- if (size >= 4 ) smem [tid ] = sum = sum + smem [tid + 2 ];
246
- if (size >= 2 ) smem [tid ] = sum = sum + smem [tid + 1 ];
247
- }
248
- #endif
249
231
250
232
return sum ;
251
233
}
@@ -284,6 +266,10 @@ __kernel void normalize_hists_kernel(
284
266
hist [0 ] = elem * scale ;
285
267
}
286
268
269
// One step of a local-memory reduction followed by a work-group barrier.
//
//   target       - exclusive upper bound on tid: only work-items with
//                  tid < target perform the accumulate-and-store
//   sharedMemory - array being reduced; element [tid + offset] is read and
//                  element [tid] is overwritten with the new partial sum
//   localMemory  - per-work-item running sum (must be an lvalue); updated
//                  to the same value that is stored to sharedMemory[tid]
//   tid          - index of the calling work-item within the reduction
//   offset       - distance to the partner element folded into this one
//
// barrier(CLK_LOCAL_MEM_FENCE) is issued unconditionally, i.e. also by the
// work-items that fail the tid < target test.
//
// The body is wrapped in do { } while (0) so the macro expands to a single
// statement: it can be used as the body of an unbraced if/else or loop
// without the barrier escaping the controlling statement. Arguments are
// evaluated more than once, so they must be free of side effects.
#define reduce_with_sync(target, sharedMemory, localMemory, tid, offset) \
    do { \
        if ((tid) < (target)) \
            (sharedMemory)[(tid)] = (localMemory) = \
                (localMemory) + (sharedMemory)[(tid) + (offset)]; \
        barrier(CLK_LOCAL_MEM_FENCE); \
    } while (0)
272
+
287
273
//---------------------------------------------------------------------
288
274
// Linear SVM based classification
289
275
// 48x96 window, 9 bins and default parameters
@@ -316,43 +302,16 @@ __kernel void classify_hists_180_kernel(
316
302
317
303
barrier (CLK_LOCAL_MEM_FENCE );
318
304
319
- if (tid < 90 ) products [tid ] = product = product + products [tid + 90 ];
320
- barrier (CLK_LOCAL_MEM_FENCE );
321
-
322
- if (tid < 45 ) products [tid ] = product = product + products [tid + 45 ];
323
- barrier (CLK_LOCAL_MEM_FENCE );
324
-
325
- volatile __local float * smem = products ;
326
- #ifdef CPU
327
- if (tid < 13 ) smem [tid ] = product = product + smem [tid + 32 ];
328
- barrier (CLK_LOCAL_MEM_FENCE );
329
- if (tid < 16 ) smem [tid ] = product = product + smem [tid + 16 ];
330
- barrier (CLK_LOCAL_MEM_FENCE );
331
- if (tid < 8 ) smem [tid ] = product = product + smem [tid + 8 ];
332
- barrier (CLK_LOCAL_MEM_FENCE );
333
- if (tid < 4 ) smem [tid ] = product = product + smem [tid + 4 ];
334
- barrier (CLK_LOCAL_MEM_FENCE );
335
- if (tid < 2 ) smem [tid ] = product = product + smem [tid + 2 ];
336
- barrier (CLK_LOCAL_MEM_FENCE );
337
- #else
338
- if (tid < 13 )
339
- {
340
- smem [tid ] = product = product + smem [tid + 32 ];
341
- }
342
- #if WAVE_SIZE < 32
343
- barrier (CLK_LOCAL_MEM_FENCE );
344
- #endif
345
- if (tid < 16 )
346
- {
347
- smem [tid ] = product = product + smem [tid + 16 ];
348
- smem [tid ] = product = product + smem [tid + 8 ];
349
- smem [tid ] = product = product + smem [tid + 4 ];
350
- smem [tid ] = product = product + smem [tid + 2 ];
351
- }
352
- #endif
305
+ reduce_with_sync (90 , products , product , tid , 90 );
306
+ reduce_with_sync (45 , products , product , tid , 45 );
307
+ reduce_with_sync (13 , products , product , tid , 32 ); // 13 is not typo
308
+ reduce_with_sync (16 , products , product , tid , 16 );
309
+ reduce_with_sync (8 , products , product , tid , 8 );
310
+ reduce_with_sync (4 , products , product , tid , 4 );
311
+ reduce_with_sync (2 , products , product , tid , 2 );
353
312
354
313
if (tid == 0 ){
355
- product = product + smem [tid + 1 ];
314
+ product = product + products [tid + 1 ];
356
315
labels [gidY * img_win_width + gidX ] = (product + free_coef >= threshold );
357
316
}
358
317
}
@@ -389,40 +348,16 @@ __kernel void classify_hists_252_kernel(
389
348
390
349
barrier (CLK_LOCAL_MEM_FENCE );
391
350
392
- if (tid < 128 ) products [tid ] = product = product + products [tid + 128 ];
393
- barrier (CLK_LOCAL_MEM_FENCE );
351
+ reduce_with_sync (128 , products , product , tid , 128 );
352
+ reduce_with_sync (64 , products , product , tid , 64 );
353
+ reduce_with_sync (32 , products , product , tid , 32 );
354
+ reduce_with_sync (16 , products , product , tid , 16 );
355
+ reduce_with_sync (8 , products , product , tid , 8 );
356
+ reduce_with_sync (4 , products , product , tid , 4 );
357
+ reduce_with_sync (2 , products , product , tid , 2 );
394
358
395
- if (tid < 64 ) products [tid ] = product = product + products [tid + 64 ];
396
- barrier (CLK_LOCAL_MEM_FENCE );
397
-
398
- volatile __local float * smem = products ;
399
- #ifdef CPU
400
- if (tid < 32 ) smem [tid ] = product = product + smem [tid + 32 ];
401
- barrier (CLK_LOCAL_MEM_FENCE );
402
- if (tid < 16 ) smem [tid ] = product = product + smem [tid + 16 ];
403
- barrier (CLK_LOCAL_MEM_FENCE );
404
- if (tid < 8 ) smem [tid ] = product = product + smem [tid + 8 ];
405
- barrier (CLK_LOCAL_MEM_FENCE );
406
- if (tid < 4 ) smem [tid ] = product = product + smem [tid + 4 ];
407
- barrier (CLK_LOCAL_MEM_FENCE );
408
- if (tid < 2 ) smem [tid ] = product = product + smem [tid + 2 ];
409
- barrier (CLK_LOCAL_MEM_FENCE );
410
- #else
411
- if (tid < 32 )
412
- {
413
- smem [tid ] = product = product + smem [tid + 32 ];
414
- #if WAVE_SIZE < 32
415
- } barrier (CLK_LOCAL_MEM_FENCE );
416
- if (tid < 16 ) {
417
- #endif
418
- smem [tid ] = product = product + smem [tid + 16 ];
419
- smem [tid ] = product = product + smem [tid + 8 ];
420
- smem [tid ] = product = product + smem [tid + 4 ];
421
- smem [tid ] = product = product + smem [tid + 2 ];
422
- }
423
- #endif
424
359
if (tid == 0 ){
425
- product = product + smem [tid + 1 ];
360
+ product = product + products [tid + 1 ];
426
361
labels [gidY * img_win_width + gidX ] = (product + free_coef >= threshold );
427
362
}
428
363
}
@@ -459,40 +394,16 @@ __kernel void classify_hists_kernel(
459
394
460
395
barrier (CLK_LOCAL_MEM_FENCE );
461
396
462
- if (tid < 128 ) products [tid ] = product = product + products [tid + 128 ];
463
- barrier (CLK_LOCAL_MEM_FENCE );
397
+ reduce_with_sync (128 , products , product , tid , 128 );
398
+ reduce_with_sync (64 , products , product , tid , 64 );
399
+ reduce_with_sync (32 , products , product , tid , 32 );
400
+ reduce_with_sync (16 , products , product , tid , 16 );
401
+ reduce_with_sync (8 , products , product , tid , 8 );
402
+ reduce_with_sync (4 , products , product , tid , 4 );
403
+ reduce_with_sync (2 , products , product , tid , 2 );
464
404
465
- if (tid < 64 ) products [tid ] = product = product + products [tid + 64 ];
466
- barrier (CLK_LOCAL_MEM_FENCE );
467
-
468
- volatile __local float * smem = products ;
469
- #ifdef CPU
470
- if (tid < 32 ) smem [tid ] = product = product + smem [tid + 32 ];
471
- barrier (CLK_LOCAL_MEM_FENCE );
472
- if (tid < 16 ) smem [tid ] = product = product + smem [tid + 16 ];
473
- barrier (CLK_LOCAL_MEM_FENCE );
474
- if (tid < 8 ) smem [tid ] = product = product + smem [tid + 8 ];
475
- barrier (CLK_LOCAL_MEM_FENCE );
476
- if (tid < 4 ) smem [tid ] = product = product + smem [tid + 4 ];
477
- barrier (CLK_LOCAL_MEM_FENCE );
478
- if (tid < 2 ) smem [tid ] = product = product + smem [tid + 2 ];
479
- barrier (CLK_LOCAL_MEM_FENCE );
480
- #else
481
- if (tid < 32 )
482
- {
483
- smem [tid ] = product = product + smem [tid + 32 ];
484
- #if WAVE_SIZE < 32
485
- } barrier (CLK_LOCAL_MEM_FENCE );
486
- if (tid < 16 ) {
487
- #endif
488
- smem [tid ] = product = product + smem [tid + 16 ];
489
- smem [tid ] = product = product + smem [tid + 8 ];
490
- smem [tid ] = product = product + smem [tid + 4 ];
491
- smem [tid ] = product = product + smem [tid + 2 ];
492
- }
493
- #endif
494
405
if (tid == 0 ){
495
- smem [tid ] = product = product + smem [tid + 1 ];
406
+ products [tid ] = product = product + products [tid + 1 ];
496
407
labels [gidY * img_win_width + gidX ] = (product + free_coef >= threshold );
497
408
}
498
409
}
0 commit comments