Skip to content

Commit 41995b7

Browse files
sovrasovvpisarev
authored andcommitted
KCF speedup (#1374)
* kcf: use the float data type rather than double. In our practice, float is good enough and gives better performance. With this patch, one of my benchmarks gets about a 20% performance gain. Signed-off-by: Zhigang Gong <zhigang.gong@intel.com> * Offload transpose matrix multiplication to ocl. The matrix multiplication in updateProjectMatrix is one of the hotspots. The matrix shape is special: m is very small while n is very large. Neither the clBLAS GEMM implementation nor the in-trunk implementation is efficient for this shape, so I implemented a standalone transpose matrix multiplication kernel here. It gets about a 10% performance gain on an Intel desktop platform or a 20% performance gain on a Braswell platform. In the meantime, the CPU utilization is lower. Signed-off-by: Zhigang Gong <zhigang.gong@intel.com> * Add verification code for kcf ocl transpose mm kernel. Signed-off-by: Zhigang Gong <zhigang.gong@linux.intel.com> * tracking: show FPS in tracker sample * tracking: fix MSVC warnings in KCF * tracking: move OCL kernel initialization to constructor in KCF
1 parent 0058eca commit 41995b7

File tree

7 files changed

+32993
-32837
lines changed

7 files changed

+32993
-32837
lines changed

modules/tracking/include/opencv2/tracking/tracker.hpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1236,12 +1236,12 @@ class CV_EXPORTS_W TrackerKCF : public Tracker
12361236
*/
12371237
void write(FileStorage& /*fs*/) const;
12381238

1239-
double detect_thresh; //!< detection confidence threshold
1240-
double sigma; //!< gaussian kernel bandwidth
1241-
double lambda; //!< regularization
1242-
double interp_factor; //!< linear interpolation factor for adaptation
1243-
double output_sigma_factor; //!< spatial bandwidth (proportional to target)
1244-
double pca_learning_rate; //!< compression learning rate
1239+
float detect_thresh; //!< detection confidence threshold
1240+
float sigma; //!< gaussian kernel bandwidth
1241+
float lambda; //!< regularization
1242+
float interp_factor; //!< linear interpolation factor for adaptation
1243+
float output_sigma_factor; //!< spatial bandwidth (proportional to target)
1244+
float pca_learning_rate; //!< compression learning rate
12451245
bool resize; //!< activate the resize feature to improve the processing speed
12461246
bool split_coeff; //!< split the training coefficients into two matrices
12471247
bool wrap_kernel; //!< wrap around the kernel values

modules/tracking/samples/tracker.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,7 @@ int main( int argc, char** argv ){
117117

118118
bool initialized = false;
119119
int frameCounter = 0;
120+
int64 timeTotal = 0;
120121

121122
for ( ;; )
122123
{
@@ -142,11 +143,14 @@ int main( int argc, char** argv ){
142143
}
143144
else if( initialized )
144145
{
146+
int64 frameTime = getTickCount();
145147
//updates the tracker
146148
if( tracker->update( frame, boundingBox ) )
147149
{
148150
rectangle( image, boundingBox, Scalar( 255, 0, 0 ), 2, 1 );
149151
}
152+
frameTime = getTickCount() - frameTime;
153+
timeTotal += frameTime;
150154
}
151155
imshow( "Tracking API", image );
152156
frameCounter++;
@@ -159,5 +163,8 @@ int main( int argc, char** argv ){
159163
paused = !paused;
160164
}
161165

166+
double s = frameCounter / (timeTotal / getTickFrequency());
167+
printf("FPS: %f\n", s);
168+
162169
return 0;
163170
}

modules/tracking/src/featureColorName.cpp

Lines changed: 32770 additions & 32770 deletions
Large diffs are not rendered by default.

modules/tracking/src/opencl/tmm.cl

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
// This file is part of OpenCV project.
2+
// It is subject to the license terms in the LICENSE file found in the top-level directory
3+
// of this distribution and at http://opencv.org/license.html.
4+
5+
// Copyright (C) 2016, Intel, Inc., all rights reserved.
6+
// Third party copyrights are property of their respective owners.
7+
8+
// Number of float4 slots in the local reduction buffer. The tree reduction
// below starts at LOCAL_SIZE_X / 2, so the kernel assumes it is launched with
// a work-group size of LOCAL_SIZE_X along dimension 0 -- TODO confirm against
// the host-side launch parameters.
#define LOCAL_SIZE_X 64
// Number of consecutive float4 elements each work-item loads per pass.
#define BLOCK_SIZE_X 3

// Transpose matrix multiplication: D = alpha * A * A^T.
//
// A     - row-major matrix with rows of length n (rows are addressed as
//         A[row * n + col]).
// m     - row stride of the output D (D is addressed as D[i * m + j]).
// n     - row length of A in floats. Only the first 4 * (n / 4) elements of
//         each row take part in the product, and rows are read through
//         float4 pointers; presumably the host guarantees n % 4 == 0 and
//         16-byte aligned rows -- verify against the caller.
// alpha - scale factor applied to every dot product.
// D     - output matrix; both D[i][j] and D[j][i] are written.
//
// One work-group computes one output element: group id 1 selects row i and
// group id 0 selects row j of A. Each work-item accumulates a partial dot
// product over BLOCK_SIZE_X float4 elements per pass, the partial sums are
// tree-reduced in local memory, and work-item 0 writes the result.
__kernel void tmm(__global float *A, int m, int n, float alpha, __global float *D)
{
    const int lidX = get_local_id(0);
    const uint lsizeX = get_local_size(0);

    const uint matI = get_group_id(1);
    const uint matJ = get_group_id(0);

    // The result is symmetric, so only groups with matI >= matJ do any work;
    // they also store the mirrored element. The group ids are uniform across
    // the work-group, so either every work-item returns here or none does and
    // the barriers below stay well-formed.
    if (matI < matJ)
        return;

    __local float4 b[LOCAL_SIZE_X];

    // Row length measured in float4 elements.
    const int vecCols = n / 4;

    // Running sum of the reduced partial products; only meaningful in
    // work-item 0, which is the only one that accumulates into it.
    float4 result = 0;

    // Pass counter. It advances identically in every work-item, so it is kept
    // private; no local-memory counter or extra barrier is needed.
    uint cnt = 0;

    do {
        const int global_block_base = (lidX + cnt * lsizeX) * BLOCK_SIZE_X;
        float4 partial = 0;

        // Accumulate this work-item's block. Each element is bounds-checked
        // individually so a partial block at the end of the row never reads
        // past the row (or past the buffer for the last row), and
        // out-of-range slots contribute exactly zero.
        #pragma unroll
        for (int j = 0; j < BLOCK_SIZE_X; j++) {
            const int col = global_block_base + j;
            if (col < vecCols) {
                const float4 pa = *(__global float4*)&A[matI * n + col * 4];
                float4 pb;
                if (matI != matJ)
                    pb = *(__global float4*)&A[matJ * n + col * 4];
                else
                    pb = pa; // diagonal element: avoid loading the same row twice
                partial = fma(pb, pa, partial);
            }
        }

        b[lidX] = partial;
        barrier(CLK_LOCAL_MEM_FENCE);

        // Tree reduction of the per-work-item partial sums into b[0].
        for (int offset = LOCAL_SIZE_X / 2; offset > 0; offset >>= 1) {
            if (lidX < offset)
                b[lidX] += b[lidX + offset];
            barrier(CLK_LOCAL_MEM_FENCE);
        }

        // b[0] is read and rewritten only by work-item 0, and every other
        // slot is rewritten only before the next pass's first barrier, so no
        // additional barrier is required before the next pass.
        if (lidX == 0)
            result += b[0];

        cnt++;
    } while (cnt * BLOCK_SIZE_X * lsizeX < (uint)vecCols);

    if (lidX == 0) {
        const float ret = (result.s0 + result.s1 + result.s2 + result.s3) * alpha;
        D[matI * m + matJ] = ret;
        if (matI != matJ)
            D[matJ * m + matI] = ret;
    }
}

modules/tracking/src/precomp.hpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
#ifndef __OPENCV_PRECOMP_H__
4343
#define __OPENCV_PRECOMP_H__
4444

45+
#include "cvconfig.h"
4546
#include "opencv2/tracking.hpp"
4647
#include "opencv2/core/utility.hpp"
4748
#include "opencv2/core/ocl.hpp"
@@ -50,7 +51,7 @@
5051

5152
namespace cv
5253
{
53-
extern const double ColorNames[][10];
54+
extern const float ColorNames[][10];
5455

5556
namespace tracking {
5657

0 commit comments

Comments
 (0)