
Commit a160e4f

Merge pull request opencv#17647 from jinyup100:add-siamrpnpp
[GSoC] Add siamrpnpp.py

* Updated base branch with siamrpnpp.py
* Addition of Parsers
* Merged to using few ONNX files, Changes to Parsers, Links to Repo
* Deleted whitespace
* Adjusting flake8 error
* Fixes according to review
* Fix according to review
* Addition of OpenVINO backends and Computation target devices
* Fix on backend after review
* Fixes after review
* Remove extra white space
* Removed Repeated Variables
1 parent c7422e4 commit a160e4f

File tree

1 file changed: +397 -0 lines changed

samples/dnn/siamrpnpp.py

Lines changed: 397 additions & 0 deletions
@@ -0,0 +1,397 @@
import argparse
import cv2 as cv
import numpy as np
import os

"""
Link to original paper : https://arxiv.org/abs/1812.11703
Link to original repo : https://github.com/STVIR/pysot

You can download the pre-trained weights of the Tracker Model from https://drive.google.com/file/d/11bwgPFVkps9AH2NOD1zBDdpF_tQghAB-/view?usp=sharing
You can download the target net (target branch of SiamRPN++) from https://drive.google.com/file/d/1dw_Ne3UMcCnFsaD6xkZepwE4GEpqq7U_/view?usp=sharing
You can download the search net (search branch of SiamRPN++) from https://drive.google.com/file/d/1Lt4oE43ZSucJvze3Y-Z87CVDreO-Afwl/view?usp=sharing
You can download the head model (RPN Head) from https://drive.google.com/file/d/1zT1yu12mtj3JQEkkfKFJWiZ71fJ-dQTi/view?usp=sharing
"""

class ModelBuilder():
    """ This class generates the SiamRPN++ Tracker Model by using Imported ONNX Nets
    """
    def __init__(self, target_net, search_net, rpn_head):
        super(ModelBuilder, self).__init__()
        # Build the target branch
        self.target_net = target_net
        # Build the search branch
        self.search_net = search_net
        # Build RPN_Head
        self.rpn_head = rpn_head

    def template(self, z):
        """ Takes the template crop of size (1, 3, 127, 127) as an input to generate the kernel
        """
        self.target_net.setInput(z)
        outNames = self.target_net.getUnconnectedOutLayersNames()
        self.zfs_1, self.zfs_2, self.zfs_3 = self.target_net.forward(outNames)

    def track(self, x):
        """ Takes the search crop of size (1, 3, 255, 255) as an input to generate the classification score and bounding box regression
        """
        self.search_net.setInput(x)
        outNames = self.search_net.getUnconnectedOutLayersNames()
        xfs_1, xfs_2, xfs_3 = self.search_net.forward(outNames)
        self.rpn_head.setInput(np.stack([self.zfs_1, self.zfs_2, self.zfs_3]), 'input_1')
        self.rpn_head.setInput(np.stack([xfs_1, xfs_2, xfs_3]), 'input_2')
        outNames = self.rpn_head.getUnconnectedOutLayersNames()
        cls, loc = self.rpn_head.forward(outNames)
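        # 'cls' holds the raw classification scores (two channels per anchor) and
        # 'loc' the box regression offsets (four channels per anchor); they are
        # decoded later by SiamRPNTracker._convert_score and _convert_bbox.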
        return {'cls': cls, 'loc': loc}

class Anchors:
    """ This class generates anchors.
    """
    def __init__(self, stride, ratios, scales, image_center=0, size=0):
        self.stride = stride
        self.ratios = ratios
        self.scales = scales
        self.image_center = image_center
        self.size = size
        self.anchor_num = len(self.scales) * len(self.ratios)
        self.anchors = self.generate_anchors()

    def generate_anchors(self):
        """
        Generate anchors based on the predefined configuration
        """
        anchors = np.zeros((self.anchor_num, 4), dtype=np.float32)
        size = self.stride**2
        count = 0
        for r in self.ratios:
            ws = int(np.sqrt(size * 1. / r))
            hs = int(ws * r)

            for s in self.scales:
                w = ws * s
                h = hs * s
                anchors[count][:] = [-w * 0.5, -h * 0.5, w * 0.5, h * 0.5][:]
                count += 1
        return anchors
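
# Note: with the configuration used by SiamRPNTracker below (stride 8, ratios
# [0.33, 0.5, 1, 2, 3], scale 8), Anchors.generate_anchors() yields five
# (x1, y1, x2, y2) boxes of roughly equal area centred on the origin; the
# ratio-1 anchor, for example, spans (-32, -32, 32, 32).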

class SiamRPNTracker:
    def __init__(self, model):
        super(SiamRPNTracker, self).__init__()
        self.anchor_stride = 8
        self.anchor_ratios = [0.33, 0.5, 1, 2, 3]
        self.anchor_scales = [8]
        self.track_base_size = 8
        self.track_context_amount = 0.5
        self.track_exemplar_size = 127
        self.track_instance_size = 255
        self.track_lr = 0.4
        self.track_penalty_k = 0.04
        self.track_window_influence = 0.44
        self.score_size = (self.track_instance_size - self.track_exemplar_size) // \
                           self.anchor_stride + 1 + self.track_base_size
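        # With the defaults above this evaluates to (255 - 127) // 8 + 1 + 8 = 25,
        # i.e. a 25 x 25 grid of anchor positions.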
        self.anchor_num = len(self.anchor_ratios) * len(self.anchor_scales)
        hanning = np.hanning(self.score_size)
        window = np.outer(hanning, hanning)
        self.window = np.tile(window.flatten(), self.anchor_num)
        self.anchors = self.generate_anchor(self.score_size)
        self.model = model

    def get_subwindow(self, im, pos, model_sz, original_sz, avg_chans):
        """
        Args:
            im: bgr based input image frame
            pos: position of the center of the frame
            model_sz: exemplar / target image size
            original_sz: original / search image size
            avg_chans: channel average
        Return:
            im_patch: sub_window for the given image input
        """
        if isinstance(pos, float):
            pos = [pos, pos]
        sz = original_sz
        im_h, im_w, im_d = im.shape
        c = (original_sz + 1) / 2
        cx, cy = pos
        context_xmin = np.floor(cx - c + 0.5)
        context_xmax = context_xmin + sz - 1
        context_ymin = np.floor(cy - c + 0.5)
        context_ymax = context_ymin + sz - 1
        left_pad = int(max(0., -context_xmin))
        top_pad = int(max(0., -context_ymin))
        right_pad = int(max(0., context_xmax - im_w + 1))
        bottom_pad = int(max(0., context_ymax - im_h + 1))
        context_xmin += left_pad
        context_xmax += left_pad
        context_ymin += top_pad
        context_ymax += top_pad

        if any([top_pad, bottom_pad, left_pad, right_pad]):
            size = (im_h + top_pad + bottom_pad, im_w + left_pad + right_pad, im_d)
            te_im = np.zeros(size, np.uint8)
            te_im[top_pad:top_pad + im_h, left_pad:left_pad + im_w, :] = im
            if top_pad:
                te_im[0:top_pad, left_pad:left_pad + im_w, :] = avg_chans
            if bottom_pad:
                te_im[im_h + top_pad:, left_pad:left_pad + im_w, :] = avg_chans
            if left_pad:
                te_im[:, 0:left_pad, :] = avg_chans
            if right_pad:
                te_im[:, im_w + left_pad:, :] = avg_chans
            im_patch = te_im[int(context_ymin):int(context_ymax + 1),
                             int(context_xmin):int(context_xmax + 1), :]
        else:
            im_patch = im[int(context_ymin):int(context_ymax + 1),
                          int(context_xmin):int(context_xmax + 1), :]

        if not np.array_equal(model_sz, original_sz):
            im_patch = cv.resize(im_patch, (model_sz, model_sz))
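        # HWC -> CHW, then add a leading batch dimension: the ONNX nets expect
        # NCHW float32 blobs (1 x 3 x model_sz x model_sz for a colour frame).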
        im_patch = im_patch.transpose(2, 0, 1)
        im_patch = im_patch[np.newaxis, :, :, :]
        im_patch = im_patch.astype(np.float32)
        return im_patch

    def generate_anchor(self, score_size):
        """
        Args:
            score_size: size of the score map (number of anchor positions per side)
        Return:
            anchor: anchors for pre-determined values of stride, ratio, and scale
        """
        anchors = Anchors(self.anchor_stride, self.anchor_ratios, self.anchor_scales)
        anchor = anchors.anchors
        x1, y1, x2, y2 = anchor[:, 0], anchor[:, 1], anchor[:, 2], anchor[:, 3]
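        # Convert corner-format anchors (x1, y1, x2, y2) to centre format (cx, cy, w, h).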
        anchor = np.stack([(x1 + x2) * 0.5, (y1 + y2) * 0.5, x2 - x1, y2 - y1], 1)
        total_stride = anchors.stride
        anchor_num = anchors.anchor_num
        anchor = np.tile(anchor, score_size * score_size).reshape((-1, 4))
        ori = - (score_size // 2) * total_stride
        xx, yy = np.meshgrid([ori + total_stride * dx for dx in range(score_size)],
                             [ori + total_stride * dy for dy in range(score_size)])
        xx, yy = np.tile(xx.flatten(), (anchor_num, 1)).flatten(), \
                 np.tile(yy.flatten(), (anchor_num, 1)).flatten()
        anchor[:, 0], anchor[:, 1] = xx.astype(np.float32), yy.astype(np.float32)
        return anchor

    def _convert_bbox(self, delta, anchor):
        """
        Args:
            delta: localisation
            anchor: anchor of pre-determined anchor size
        Return:
            delta: prediction of bounding box
        """
        delta_transpose = np.transpose(delta, (1, 2, 3, 0))
        delta_contig = np.ascontiguousarray(delta_transpose)
        delta = delta_contig.reshape(4, -1)
        delta[0, :] = delta[0, :] * anchor[:, 2] + anchor[:, 0]
        delta[1, :] = delta[1, :] * anchor[:, 3] + anchor[:, 1]
        delta[2, :] = np.exp(delta[2, :]) * anchor[:, 2]
        delta[3, :] = np.exp(delta[3, :]) * anchor[:, 3]
        return delta

    def _softmax(self, x):
        """
        Softmax in the direction of the depth of the layer
        """
        x = x.astype(dtype=np.float32)
        x_max = x.max(axis=1)[:, np.newaxis]
        e_x = np.exp(x - x_max)
        div = np.sum(e_x, axis=1)[:, np.newaxis]
        y = e_x / div
        return y

    def _convert_score(self, score):
        """
        Args:
            score: raw classification score map
        Return:
            score: foreground probability for each anchor
        """
        score_transpose = np.transpose(score, (1, 2, 3, 0))
        score_con = np.ascontiguousarray(score_transpose)
        score_view = score_con.reshape(2, -1)
        score = np.transpose(score_view, (1, 0))
        score = self._softmax(score)
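        # Column 1 is treated as the foreground (object) probability of each anchor.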
        return score[:, 1]

    def _bbox_clip(self, cx, cy, width, height, boundary):
        """
        Adjusting the bounding box
        """
        bbox_h, bbox_w = boundary
        cx = max(0, min(cx, bbox_w))
        cy = max(0, min(cy, bbox_h))
        width = max(10, min(width, bbox_w))
        height = max(10, min(height, bbox_h))
        return cx, cy, width, height

    def init(self, img, bbox):
        """
        Args:
            img(np.ndarray): bgr based input image frame
            bbox: (x, y, w, h): bounding box
        """
        x, y, w, h = bbox
        self.center_pos = np.array([x + (w - 1) / 2, y + (h - 1) / 2])
        self.h = h
        self.w = w
        w_z = self.w + self.track_context_amount * np.add(h, w)
        h_z = self.h + self.track_context_amount * np.add(h, w)
        s_z = round(np.sqrt(w_z * h_z))
        self.channel_average = np.mean(img, axis=(0, 1))
        z_crop = self.get_subwindow(img, self.center_pos, self.track_exemplar_size, s_z, self.channel_average)
        self.model.template(z_crop)

    def track(self, img):
        """
        Args:
            img(np.ndarray): BGR image
        Return:
            bbox(list): [x, y, width, height]
        """
        w_z = self.w + self.track_context_amount * np.add(self.w, self.h)
        h_z = self.h + self.track_context_amount * np.add(self.w, self.h)
        s_z = np.sqrt(w_z * h_z)
        scale_z = self.track_exemplar_size / s_z
        s_x = s_z * (self.track_instance_size / self.track_exemplar_size)
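        # scale_z maps lengths in the original frame to exemplar (127 px) coordinates;
        # s_x is the corresponding search-window size in the original frame.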
        x_crop = self.get_subwindow(img, self.center_pos, self.track_instance_size, round(s_x), self.channel_average)
        outputs = self.model.track(x_crop)
        score = self._convert_score(outputs['cls'])
        pred_bbox = self._convert_bbox(outputs['loc'], self.anchors)

        def change(r):
            return np.maximum(r, 1. / r)

        def sz(w, h):
            pad = (w + h) * 0.5
            return np.sqrt((w + pad) * (h + pad))

        # scale penalty
        s_c = change(sz(pred_bbox[2, :], pred_bbox[3, :]) /
                     (sz(self.w * scale_z, self.h * scale_z)))

        # aspect ratio penalty
        r_c = change((self.w / self.h) /
                     (pred_bbox[2, :] / pred_bbox[3, :]))
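        # Penalise candidates whose scale (s_c) or aspect ratio (r_c) deviates
        # strongly from the previous estimate.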
        penalty = np.exp(-(r_c * s_c - 1) * self.track_penalty_k)
        pscore = penalty * score

        # window penalty
        pscore = pscore * (1 - self.track_window_influence) + \
                 self.window * self.track_window_influence
        best_idx = np.argmax(pscore)
        bbox = pred_bbox[:, best_idx] / scale_z
        lr = penalty[best_idx] * score[best_idx] * self.track_lr

        cpx, cpy = self.center_pos
        x, y, w, h = bbox
        cx = x + cpx
        cy = y + cpy

        # smooth bbox
        width = self.w * (1 - lr) + w * lr
        height = self.h * (1 - lr) + h * lr

        # clip boundary
        cx, cy, width, height = self._bbox_clip(cx, cy, width, height, img.shape[:2])

        # update state
        self.center_pos = np.array([cx, cy])
        self.w = width
        self.h = height
        bbox = [cx - width / 2, cy - height / 2, width, height]
        best_score = score[best_idx]
        return {'bbox': bbox, 'best_score': best_score}

def get_frames(video_name):
    """
    Args:
        Path to input video file (frames are taken from the default camera when it is empty)
    Return:
        Frame
    """
    cap = cv.VideoCapture(video_name if video_name else 0)
    while True:
        ret, frame = cap.read()
        if ret:
            yield frame
        else:
            break

def main():
    """ Sample SiamRPN++ Tracker
    """
    # Computation backends supported by layers
    backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
    # Target Devices for computation
    targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD)

    parser = argparse.ArgumentParser(description='Use this script to run SiamRPN++ Visual Tracker',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--input_video', type=str, help='Path to input video file. Skip this argument to capture frames from a camera.')
    parser.add_argument('--target_net', type=str, default='target_net.onnx', help='Path to part of SiamRPN++ ran on target frame.')
    parser.add_argument('--search_net', type=str, default='search_net.onnx', help='Path to part of SiamRPN++ ran on search frame.')
    parser.add_argument('--rpn_head', type=str, default='rpn_head.onnx', help='Path to RPN Head ONNX model.')
    parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
                        help='Select a computation backend: '
                             "%d: automatically (by default), "
                             "%d: Halide, "
                             "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
                             "%d: OpenCV Implementation" % backends)
    parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
                        help='Select a target device: '
                             "%d: CPU target (by default), "
                             "%d: OpenCL, "
                             "%d: OpenCL FP16, "
                             "%d: Myriad" % targets)
    args, _ = parser.parse_known_args()

    if args.input_video and not os.path.isfile(args.input_video):
        raise OSError("Input video file does not exist")
    if not os.path.isfile(args.target_net):
        raise OSError("Target Net does not exist")
    if not os.path.isfile(args.search_net):
        raise OSError("Search Net does not exist")
    if not os.path.isfile(args.rpn_head):
        raise OSError("RPN Head Net does not exist")

    # Load the Networks
    target_net = cv.dnn.readNetFromONNX(args.target_net)
    target_net.setPreferableBackend(args.backend)
    target_net.setPreferableTarget(args.target)
    search_net = cv.dnn.readNetFromONNX(args.search_net)
    search_net.setPreferableBackend(args.backend)
    search_net.setPreferableTarget(args.target)
    rpn_head = cv.dnn.readNetFromONNX(args.rpn_head)
    rpn_head.setPreferableBackend(args.backend)
    rpn_head.setPreferableTarget(args.target)
    model = ModelBuilder(target_net, search_net, rpn_head)
    tracker = SiamRPNTracker(model)

    first_frame = True
    cv.namedWindow('SiamRPN++ Tracker', cv.WINDOW_AUTOSIZE)
    for frame in get_frames(args.input_video):
        if first_frame:
            try:
                init_rect = cv.selectROI('SiamRPN++ Tracker', frame, False, False)
            except:
                exit()
            tracker.init(frame, init_rect)
            first_frame = False
        else:
            outputs = tracker.track(frame)
            bbox = list(map(int, outputs['bbox']))
            x, y, w, h = bbox
            cv.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 3)
        cv.imshow('SiamRPN++ Tracker', frame)
        key = cv.waitKey(1)
        if key == ord("q"):
            break

if __name__ == '__main__':
    main()
