import argparse
import cv2 as cv
import numpy as np
import os

"""
Link to original paper : https://arxiv.org/abs/1812.11703
Link to original repo : https://github.com/STVIR/pysot

You can download the pre-trained weights of the Tracker Model from https://drive.google.com/file/d/11bwgPFVkps9AH2NOD1zBDdpF_tQghAB-/view?usp=sharing
You can download the target net (target branch of SiamRPN++) from https://drive.google.com/file/d/1dw_Ne3UMcCnFsaD6xkZepwE4GEpqq7U_/view?usp=sharing
You can download the search net (search branch of SiamRPN++) from https://drive.google.com/file/d/1Lt4oE43ZSucJvze3Y-Z87CVDreO-Afwl/view?usp=sharing
You can download the head model (RPN Head) from https://drive.google.com/file/d/1zT1yu12mtj3JQEkkfKFJWiZ71fJ-dQTi/view?usp=sharing
"""

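# A minimal example of how this sample might be invoked (assuming the ONNX files
# above were saved next to this script under the default names used by the
# argparse options in main(); the script file name here is only illustrative):
#
#   python siamrpnpp.py --input_video=path/to/video.mp4 \
#       --target_net=target_net.onnx --search_net=search_net.onnx --rpn_head=rpn_head.onnx
#
# Omit --input_video to grab frames from the default camera instead.
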
class ModelBuilder():
    """ This class generates the SiamRPN++ Tracker Model by using Imported ONNX Nets
    """
    def __init__(self, target_net, search_net, rpn_head):
        super(ModelBuilder, self).__init__()
        # Build the target branch
        self.target_net = target_net
        # Build the search branch
        self.search_net = search_net
        # Build RPN_Head
        self.rpn_head = rpn_head

    def template(self, z):
        """ Takes the template patch of shape (1, 3, 127, 127) as an input to generate the kernel
        """
        self.target_net.setInput(z)
        outNames = self.target_net.getUnconnectedOutLayersNames()
        self.zfs_1, self.zfs_2, self.zfs_3 = self.target_net.forward(outNames)

    def track(self, x):
        """ Takes the search patch of shape (1, 3, 255, 255) as an input to generate the classification score and bounding box regression
        """
        self.search_net.setInput(x)
        outNames = self.search_net.getUnconnectedOutLayersNames()
        xfs_1, xfs_2, xfs_3 = self.search_net.forward(outNames)
        self.rpn_head.setInput(np.stack([self.zfs_1, self.zfs_2, self.zfs_3]), 'input_1')
        self.rpn_head.setInput(np.stack([xfs_1, xfs_2, xfs_3]), 'input_2')
        outNames = self.rpn_head.getUnconnectedOutLayersNames()
        cls, loc = self.rpn_head.forward(outNames)
        return {'cls': cls, 'loc': loc}

class Anchors:
    """ This class generates anchors.
    """
    def __init__(self, stride, ratios, scales, image_center=0, size=0):
        self.stride = stride
        self.ratios = ratios
        self.scales = scales
        self.image_center = image_center
        self.size = size
        self.anchor_num = len(self.scales) * len(self.ratios)
        self.anchors = self.generate_anchors()

    def generate_anchors(self):
        """
        Generate anchors based on the predefined configuration
        """
        anchors = np.zeros((self.anchor_num, 4), dtype=np.float32)
        size = self.stride**2
        count = 0
        for r in self.ratios:
            ws = int(np.sqrt(size * 1. / r))
            hs = int(ws * r)

            for s in self.scales:
                w = ws * s
                h = hs * s
                anchors[count][:] = [-w * 0.5, -h * 0.5, w * 0.5, h * 0.5][:]
                count += 1
        return anchors

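# Worked example of the anchor arithmetic above (a reader aid, not extra configuration):
# with stride=8, ratio=0.5 and scale=8, size = 8*8 = 64,
# ws = int(sqrt(64 / 0.5)) = 11 and hs = int(11 * 0.5) = 5,
# so w = 11*8 = 88, h = 5*8 = 40 and the resulting anchor is
# [-44., -20., 44., 20.] in corner form [x1, y1, x2, y2], centred on the origin.
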
class SiamRPNTracker:
    def __init__(self, model):
        super(SiamRPNTracker, self).__init__()
        self.anchor_stride = 8
        self.anchor_ratios = [0.33, 0.5, 1, 2, 3]
        self.anchor_scales = [8]
        self.track_base_size = 8
        self.track_context_amount = 0.5
        self.track_exemplar_size = 127
        self.track_instance_size = 255
        self.track_lr = 0.4
        self.track_penalty_k = 0.04
        self.track_window_influence = 0.44
        self.score_size = (self.track_instance_size - self.track_exemplar_size) // \
                           self.anchor_stride + 1 + self.track_base_size
        self.anchor_num = len(self.anchor_ratios) * len(self.anchor_scales)
        hanning = np.hanning(self.score_size)
        window = np.outer(hanning, hanning)
        self.window = np.tile(window.flatten(), self.anchor_num)
        self.anchors = self.generate_anchor(self.score_size)
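        # With the values above: score_size = (255 - 127) // 8 + 1 + 8 = 25 and
        # anchor_num = 5 * 1 = 5, so self.window holds 5 * 25 * 25 = 3125 entries
        # and self.anchors has shape (3125, 4) -- one row per anchor per grid cell.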
        self.model = model

    def get_subwindow(self, im, pos, model_sz, original_sz, avg_chans):
        """
        Args:
            im: bgr based input image frame
            pos: position of the center of the frame
            model_sz: exemplar / target image size
            original_sz: original / search image size
            avg_chans: channel average
        Return:
            im_patch: sub-window of shape (1, 3, model_sz, model_sz) for the given image input
        """
        if isinstance(pos, float):
            pos = [pos, pos]
        sz = original_sz
        im_h, im_w, im_d = im.shape
        c = (original_sz + 1) / 2
        cx, cy = pos
        context_xmin = np.floor(cx - c + 0.5)
        context_xmax = context_xmin + sz - 1
        context_ymin = np.floor(cy - c + 0.5)
        context_ymax = context_ymin + sz - 1
        left_pad = int(max(0., -context_xmin))
        top_pad = int(max(0., -context_ymin))
        right_pad = int(max(0., context_xmax - im_w + 1))
        bottom_pad = int(max(0., context_ymax - im_h + 1))
        context_xmin += left_pad
        context_xmax += left_pad
        context_ymin += top_pad
        context_ymax += top_pad

        if any([top_pad, bottom_pad, left_pad, right_pad]):
            size = (im_h + top_pad + bottom_pad, im_w + left_pad + right_pad, im_d)
            te_im = np.zeros(size, np.uint8)
            te_im[top_pad:top_pad + im_h, left_pad:left_pad + im_w, :] = im
            if top_pad:
                te_im[0:top_pad, left_pad:left_pad + im_w, :] = avg_chans
            if bottom_pad:
                te_im[im_h + top_pad:, left_pad:left_pad + im_w, :] = avg_chans
            if left_pad:
                te_im[:, 0:left_pad, :] = avg_chans
            if right_pad:
                te_im[:, im_w + left_pad:, :] = avg_chans
            im_patch = te_im[int(context_ymin):int(context_ymax + 1),
                             int(context_xmin):int(context_xmax + 1), :]
        else:
            im_patch = im[int(context_ymin):int(context_ymax + 1),
                          int(context_xmin):int(context_xmax + 1), :]

        if not np.array_equal(model_sz, original_sz):
            im_patch = cv.resize(im_patch, (model_sz, model_sz))
        im_patch = im_patch.transpose(2, 0, 1)
        im_patch = im_patch[np.newaxis, :, :, :]
        im_patch = im_patch.astype(np.float32)
        return im_patch

    def generate_anchor(self, score_size):
        """
        Args:
            score_size: side length of the score map (number of grid positions per axis)
        Return:
            anchor: anchors for pre-determined values of stride, ratio, and scale
        """
        anchors = Anchors(self.anchor_stride, self.anchor_ratios, self.anchor_scales)
        anchor = anchors.anchors
        x1, y1, x2, y2 = anchor[:, 0], anchor[:, 1], anchor[:, 2], anchor[:, 3]
        anchor = np.stack([(x1 + x2) * 0.5, (y1 + y2) * 0.5, x2 - x1, y2 - y1], 1)
        total_stride = anchors.stride
        anchor_num = anchors.anchor_num
        anchor = np.tile(anchor, score_size * score_size).reshape((-1, 4))
        ori = - (score_size // 2) * total_stride
        xx, yy = np.meshgrid([ori + total_stride * dx for dx in range(score_size)],
                             [ori + total_stride * dy for dy in range(score_size)])
        xx, yy = np.tile(xx.flatten(), (anchor_num, 1)).flatten(), \
                 np.tile(yy.flatten(), (anchor_num, 1)).flatten()
        anchor[:, 0], anchor[:, 1] = xx.astype(np.float32), yy.astype(np.float32)
        return anchor

    def _convert_bbox(self, delta, anchor):
        """
        Args:
            delta: raw localisation (bounding box regression) output of the RPN head
            anchor: anchors in center form (cx, cy, w, h)
        Return:
            delta: predicted bounding boxes decoded against the anchors
        """
        delta_transpose = np.transpose(delta, (1, 2, 3, 0))
        delta_contig = np.ascontiguousarray(delta_transpose)
        delta = delta_contig.reshape(4, -1)
        delta[0, :] = delta[0, :] * anchor[:, 2] + anchor[:, 0]
        delta[1, :] = delta[1, :] * anchor[:, 3] + anchor[:, 1]
        delta[2, :] = np.exp(delta[2, :]) * anchor[:, 2]
        delta[3, :] = np.exp(delta[3, :]) * anchor[:, 3]
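        # Note: the decoded rows are [cx, cy, w, h] in search-crop pixels, with
        # (cx, cy) expressed as offsets from the crop centre (the anchor grid above
        # is centred on the origin); track() divides by scale_z and adds
        # self.center_pos to map the box back into image coordinates.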
        return delta

    def _softmax(self, x):
        """
        Softmax along axis 1 (here, the two-class score dimension)
        """
        x = x.astype(dtype=np.float32)
        x_max = x.max(axis=1)[:, np.newaxis]
        e_x = np.exp(x - x_max)
        div = np.sum(e_x, axis=1)[:, np.newaxis]
        y = e_x / div
        return y

    def _convert_score(self, score):
        """
        Args:
            score: raw classification output of the RPN head
        Return:
            score: softmaxed foreground (object) probability for every anchor
        """
        score_transpose = np.transpose(score, (1, 2, 3, 0))
        score_con = np.ascontiguousarray(score_transpose)
        score_view = score_con.reshape(2, -1)
        score = np.transpose(score_view, (1, 0))
        score = self._softmax(score)
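        # Shape sketch (assuming the usual 25x25 score map with 5 anchors per cell):
        # the raw score arrives as (1, 2*5, 25, 25), is reshaped to (2, 3125) and
        # transposed to (3125, 2); the slice below returns the 3125 foreground
        # probabilities -- one per entry of self.window and row of self.anchors.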
        return score[:, 1]

    def _bbox_clip(self, cx, cy, width, height, boundary):
        """
        Clip the box centre and size to the image boundary
        """
        bbox_h, bbox_w = boundary
        cx = max(0, min(cx, bbox_w))
        cy = max(0, min(cy, bbox_h))
        width = max(10, min(width, bbox_w))
        height = max(10, min(height, bbox_h))
        return cx, cy, width, height

    def init(self, img, bbox):
        """
        Args:
            img(np.ndarray): bgr based input image frame
            bbox: (x, y, w, h): bounding box
        """
        x, y, w, h = bbox
        self.center_pos = np.array([x + (w - 1) / 2, y + (h - 1) / 2])
        self.h = h
        self.w = w
        w_z = self.w + self.track_context_amount * np.add(h, w)
        h_z = self.h + self.track_context_amount * np.add(h, w)
        s_z = round(np.sqrt(w_z * h_z))
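        # e.g. for a 100x50 ROI: w_z = 100 + 0.5*150 = 175, h_z = 50 + 0.5*150 = 125,
        # so s_z = round(sqrt(175 * 125)) = 148 -- the side of the square context
        # region that gets resampled to the 127x127 exemplar.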
        self.channel_average = np.mean(img, axis=(0, 1))
        z_crop = self.get_subwindow(img, self.center_pos, self.track_exemplar_size, s_z, self.channel_average)
        self.model.template(z_crop)

    def track(self, img):
        """
        Args:
            img(np.ndarray): BGR image
        Return:
            bbox(list): [x, y, width, height]
        """
        w_z = self.w + self.track_context_amount * np.add(self.w, self.h)
        h_z = self.h + self.track_context_amount * np.add(self.w, self.h)
        s_z = np.sqrt(w_z * h_z)
        scale_z = self.track_exemplar_size / s_z
        s_x = s_z * (self.track_instance_size / self.track_exemplar_size)
        x_crop = self.get_subwindow(img, self.center_pos, self.track_instance_size, round(s_x), self.channel_average)
        outputs = self.model.track(x_crop)
        score = self._convert_score(outputs['cls'])
        pred_bbox = self._convert_bbox(outputs['loc'], self.anchors)

        def change(r):
            return np.maximum(r, 1. / r)

        def sz(w, h):
            pad = (w + h) * 0.5
            return np.sqrt((w + pad) * (h + pad))

        # scale penalty
        s_c = change(sz(pred_bbox[2, :], pred_bbox[3, :]) /
                     (sz(self.w * scale_z, self.h * scale_z)))

        # aspect ratio penalty
        r_c = change((self.w / self.h) /
                     (pred_bbox[2, :] / pred_bbox[3, :]))
        penalty = np.exp(-(r_c * s_c - 1) * self.track_penalty_k)
        pscore = penalty * score

        # window penalty
        pscore = pscore * (1 - self.track_window_influence) + \
                 self.window * self.track_window_influence
        best_idx = np.argmax(pscore)
        bbox = pred_bbox[:, best_idx] / scale_z
        lr = penalty[best_idx] * score[best_idx] * self.track_lr

        cpx, cpy = self.center_pos
        x, y, w, h = bbox
        cx = x + cpx
        cy = y + cpy

        # smooth bbox
        width = self.w * (1 - lr) + w * lr
        height = self.h * (1 - lr) + h * lr

        # clip boundary
        cx, cy, width, height = self._bbox_clip(cx, cy, width, height, img.shape[:2])

        # update state
        self.center_pos = np.array([cx, cy])
        self.w = width
        self.h = height
        bbox = [cx - width / 2, cy - height / 2, width, height]
        best_score = score[best_idx]
        return {'bbox': bbox, 'best_score': best_score}

def get_frames(video_name):
    """
    Args:
        video_name: path to the input video file; if empty, frames are captured from the default camera
    Return:
        Frames yielded one at a time
    """
    cap = cv.VideoCapture(video_name if video_name else 0)
    while True:
        ret, frame = cap.read()
        if ret:
            yield frame
        else:
            break

def main():
    """ Sample SiamRPN++ Tracker
    """
    # Computation backends supported by layers
    backends = (cv.dnn.DNN_BACKEND_DEFAULT, cv.dnn.DNN_BACKEND_HALIDE, cv.dnn.DNN_BACKEND_INFERENCE_ENGINE, cv.dnn.DNN_BACKEND_OPENCV)
    # Target Devices for computation
    targets = (cv.dnn.DNN_TARGET_CPU, cv.dnn.DNN_TARGET_OPENCL, cv.dnn.DNN_TARGET_OPENCL_FP16, cv.dnn.DNN_TARGET_MYRIAD)

    parser = argparse.ArgumentParser(description='Use this script to run SiamRPN++ Visual Tracker',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--input_video', type=str, help='Path to input video file. Skip this argument to capture frames from a camera.')
    parser.add_argument('--target_net', type=str, default='target_net.onnx', help='Path to part of SiamRPN++ run on the target frame.')
    parser.add_argument('--search_net', type=str, default='search_net.onnx', help='Path to part of SiamRPN++ run on the search frame.')
    parser.add_argument('--rpn_head', type=str, default='rpn_head.onnx', help='Path to RPN Head ONNX model.')
    parser.add_argument('--backend', choices=backends, default=cv.dnn.DNN_BACKEND_DEFAULT, type=int,
                        help='Select a computation backend: '
                             "%d: automatically (by default), "
                             "%d: Halide, "
                             "%d: Intel's Deep Learning Inference Engine (https://software.intel.com/openvino-toolkit), "
                             "%d: OpenCV Implementation" % backends)
    parser.add_argument('--target', choices=targets, default=cv.dnn.DNN_TARGET_CPU, type=int,
                        help='Select a target device: '
                             "%d: CPU target (by default), "
                             "%d: OpenCL, "
                             "%d: OpenCL FP16, "
                             "%d: Myriad" % targets)
    args, _ = parser.parse_known_args()

    if args.input_video and not os.path.isfile(args.input_video):
        raise OSError("Input video file does not exist")
    if not os.path.isfile(args.target_net):
        raise OSError("Target Net does not exist")
    if not os.path.isfile(args.search_net):
        raise OSError("Search Net does not exist")
    if not os.path.isfile(args.rpn_head):
        raise OSError("RPN Head Net does not exist")

    # Load the Networks
    target_net = cv.dnn.readNetFromONNX(args.target_net)
    target_net.setPreferableBackend(args.backend)
    target_net.setPreferableTarget(args.target)
    search_net = cv.dnn.readNetFromONNX(args.search_net)
    search_net.setPreferableBackend(args.backend)
    search_net.setPreferableTarget(args.target)
    rpn_head = cv.dnn.readNetFromONNX(args.rpn_head)
    rpn_head.setPreferableBackend(args.backend)
    rpn_head.setPreferableTarget(args.target)
    model = ModelBuilder(target_net, search_net, rpn_head)
    tracker = SiamRPNTracker(model)

    first_frame = True
    cv.namedWindow('SiamRPN++ Tracker', cv.WINDOW_AUTOSIZE)
    for frame in get_frames(args.input_video):
        if first_frame:
            try:
                init_rect = cv.selectROI('SiamRPN++ Tracker', frame, False, False)
            except:
                exit()
            tracker.init(frame, init_rect)
            first_frame = False
        else:
            outputs = tracker.track(frame)
            bbox = list(map(int, outputs['bbox']))
            x, y, w, h = bbox
            cv.rectangle(frame, (x, y), (x + w, y + h), (0, 255, 0), 3)
            cv.imshow('SiamRPN++ Tracker', frame)
            key = cv.waitKey(1)
            if key == ord("q"):
                break

if __name__ == '__main__':
    main()