# This file is a part of OpenCV project.
# It is a subject to the license terms in the LICENSE file found in the top-level directory
# of this distribution and at http://opencv.org/license.html.
#
# Copyright (C) 2020, Intel Corporation, all rights reserved.
# Third party copyrights are property of their respective owners.
#
# Use this script to get the text graph representation (.pbtxt) of EfficientDet
# deep learning network trained in https://github.com/google/automl.
# Then you can import it with a binary frozen graph (.pb) using readNetFromTensorflow() function.
# See details and examples on the following wiki page: https://github.com/opencv/opencv/wiki/TensorFlow-Object-Detection-API
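#
# A minimal usage sketch (script and model file names below are illustrative):
#     python tf_text_graph_efficientdet.py --input efficientdet-d0_frozen.pb --output efficientdet-d0.pbtxt
# and then, from Python:
#     import cv2 as cv
#     net = cv.dnn.readNetFromTensorflow('efficientdet-d0_frozen.pb', 'efficientdet-d0.pbtxt')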
import argparse
import re
from math import sqrt
from tf_text_graph_common import *


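# Generates anchor box widths and heights for each feature pyramid level from the
# anchor configuration (minimum level, aspect ratio pairs, number of scales per
# octave and anchor scale) used when training EfficientDet in google/automl.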
class AnchorGenerator:
    def __init__(self, min_level, aspect_ratios, num_scales, anchor_scale):
        self.min_level = min_level
        self.aspect_ratios = aspect_ratios
        self.anchor_scale = anchor_scale
        self.scales = [2**(float(s) / num_scales) for s in range(num_scales)]

    def get(self, layer_id):
        widths = []
        heights = []
        for s in self.scales:
            for a in self.aspect_ratios:
                base_anchor_size = 2**(self.min_level + layer_id) * self.anchor_scale
                heights.append(base_anchor_size * s * a[1])
                widths.append(base_anchor_size * s * a[0])
        return widths, heights


def createGraph(modelPath, outputPath, min_level, aspect_ratios, num_scales,
                anchor_scale, num_classes, image_width, image_height):
    print('Min level: %d' % min_level)
    print('Anchor scale: %f' % anchor_scale)
    print('Num scales: %d' % num_scales)
    print('Aspect ratios: %s' % str(aspect_ratios))
    print('Number of classes: %d' % num_classes)
    print('Input image size: %dx%d' % (image_width, image_height))

    # Read the graph.
    _inpNames = ['image_arrays']
    outNames = ['detections']

    writeTextGraph(modelPath, outputPath, outNames)
    graph_def = parseTextGraph(outputPath)

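    # Return names of non-Const nodes that no other node consumes
    # (i.e. the current graph outputs); used later to prune dangling branches.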
    def getUnconnectedNodes():
        unconnected = []
        for node in graph_def.node:
            if node.op == 'Const':
                continue
            unconnected.append(node.name)
            for inp in node.input:
                if inp in unconnected:
                    unconnected.remove(inp)
        return unconnected

    nodesToKeep = ['truediv']  # Keep preprocessing nodes

    removeIdentity(graph_def)

    scopesToKeep = ('image_arrays', 'efficientnet', 'resample_p6', 'resample_p7',
                    'fpn_cells', 'class_net', 'box_net', 'Reshape', 'concat')

    addConstNode('scale_w', [2.0], graph_def)
    addConstNode('scale_h', [2.0], graph_def)
    nodesToKeep += ['scale_w', 'scale_h']

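    # Walk the graph and patch nodes so that the exported text graph maps onto
    # layers supported by OpenCV's TensorFlow importer.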
    for node in graph_def.node:
        # Swap the two inputs of the squeeze-and-excitation multiplication.
        if re.match(r'efficientnet-(.*)/blocks_\d+/se/mul_1', node.name):
            node.input[0], node.input[1] = node.input[1], node.input[0]

        # Turn the Reshape used for nearest-neighbor upsampling into an explicit
        # ResizeNearestNeighbor node with a fixed 2x scale in both dimensions.
        if re.match(r'fpn_cells/cell_\d+/fnode\d+/resample(.*)/nearest_upsampling/Reshape_1$', node.name):
            node.op = 'ResizeNearestNeighbor'
            node.input[1] = 'scale_w'
            node.input.append('scale_h')

            for inpNode in graph_def.node:
                if inpNode.name == node.name[:node.name.rfind('_')]:
                    node.input[0] = inpNode.input[0]

        # Flag box-prediction convolutions whose location outputs are transposed.
        if re.match(r'box_net/box-predict(_\d)*/separable_conv2d$', node.name):
            node.addAttr('loc_pred_transposed', True)

        # Replace RealDiv with Mul by the inverse scale for compatibility.
        if node.op == 'RealDiv':
            for inpNode in graph_def.node:
                if inpNode.name != node.input[1] or 'value' not in inpNode.attr:
                    continue

                tensor = inpNode.attr['value']['tensor'][0]
                if 'float_val' not in tensor:
                    continue
                scale = float(tensor['float_val'][0])

                addConstNode(inpNode.name + '/inv', [1.0 / scale], graph_def)
                nodesToKeep.append(inpNode.name + '/inv')
                node.input[1] = inpNode.name + '/inv'
                node.op = 'Mul'
                break

    def to_remove(name, op):
        if name in nodesToKeep:
            return False
        return op == 'Const' or not name.startswith(scopesToKeep)

    removeUnusedNodesAndAttrs(to_remove, graph_def)

    # Attach the unconnected preprocessing nodes to the input.
    assert graph_def.node[1].name == 'truediv' and graph_def.node[1].op == 'RealDiv'
    graph_def.node[1].input.insert(0, 'image_arrays')
    graph_def.node[2].input.insert(0, 'truediv')

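    # Append a PriorBox layer for each of the five feature levels; each one takes the
    # tensor that feeds the corresponding Reshape node and the network input as inputs.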
    priors_generator = AnchorGenerator(min_level, aspect_ratios, num_scales, anchor_scale)
    priorBoxes = []
    for i in range(5):
        inpName = ''
        for node in graph_def.node:
            if node.name == 'Reshape_%d' % (i * 2 + 1):
                inpName = node.input[0]
                break

        priorBox = NodeDef()
        priorBox.name = 'PriorBox_%d' % i
        priorBox.op = 'PriorBox'
        priorBox.input.append(inpName)
        priorBox.input.append(graph_def.node[0].name)  # image_arrays

        priorBox.addAttr('flip', False)
        priorBox.addAttr('clip', False)

        widths, heights = priors_generator.get(i)

        priorBox.addAttr('width', widths)
        priorBox.addAttr('height', heights)
        priorBox.addAttr('variance', [1.0, 1.0, 1.0, 1.0])

        graph_def.node.extend([priorBox])
        priorBoxes.append(priorBox.name)

    addConstNode('concat/axis_flatten', [-1], graph_def)

    def addConcatNode(name, inputs, axisNodeName):
        concat = NodeDef()
        concat.name = name
        concat.op = 'ConcatV2'
        for inp in inputs:
            concat.input.append(inp)
        concat.input.append(axisNodeName)
        graph_def.node.extend([concat])

    addConcatNode('PriorBox/concat', priorBoxes, 'concat/axis_flatten')

    sigmoid = NodeDef()
    sigmoid.name = 'concat/sigmoid'
    sigmoid.op = 'Sigmoid'
    sigmoid.input.append('concat')
    graph_def.node.extend([sigmoid])

    addFlatten(sigmoid.name, sigmoid.name + '/Flatten', graph_def)
    addFlatten('concat_1', 'concat_1/Flatten', graph_def)

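    # DetectionOutput decodes the box regressions with the prior boxes, applies
    # non-maximum suppression and produces the final detections.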
    detectionOut = NodeDef()
    detectionOut.name = 'detection_out'
    detectionOut.op = 'DetectionOutput'

    detectionOut.input.append('concat_1/Flatten')
    detectionOut.input.append(sigmoid.name + '/Flatten')
    detectionOut.input.append('PriorBox/concat')

    detectionOut.addAttr('num_classes', num_classes)
    detectionOut.addAttr('share_location', True)
    detectionOut.addAttr('background_label_id', num_classes + 1)
    detectionOut.addAttr('nms_threshold', 0.6)
    detectionOut.addAttr('confidence_threshold', 0.2)
    detectionOut.addAttr('top_k', 100)
    detectionOut.addAttr('keep_top_k', 100)
    detectionOut.addAttr('code_type', "CENTER_SIZE")
    graph_def.node.extend([detectionOut])

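    # Fix the input placeholder shape to NHWC with the requested spatial size.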
    graph_def.node[0].attr['shape'] = {
        'shape': {
            'dim': [
                {'size': -1},
                {'size': image_height},
                {'size': image_width},
                {'size': 3}
            ]
        }
    }

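    # Iteratively remove nodes left without consumers after the graph surgery,
    # keeping only the detection output.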
    while True:
        unconnectedNodes = getUnconnectedNodes()
        unconnectedNodes.remove(detectionOut.name)
        if not unconnectedNodes:
            break

        for name in unconnectedNodes:
            for i in range(len(graph_def.node)):
                if graph_def.node[i].name == name:
                    del graph_def.node[i]
                    break

    # Save as text
    graph_def.save(outputPath)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Run this script to get a text graph of '
                                                 'an EfficientDet model trained with https://github.com/google/automl. '
                                                 'Then pass it together with the frozen .pb graph to the '
                                                 'cv::dnn::readNetFromTensorflow function.')
    parser.add_argument('--input', required=True, help='Path to frozen TensorFlow graph.')
    parser.add_argument('--output', required=True, help='Path to output text graph.')
    parser.add_argument('--min_level', default=3, type=int, help='Parameter from training config')
    parser.add_argument('--num_scales', default=3, type=int, help='Parameter from training config')
    parser.add_argument('--anchor_scale', default=4.0, type=float, help='Parameter from training config')
    parser.add_argument('--aspect_ratios', default=[1.0, 1.0, 1.4, 0.7, 0.7, 1.4],
                        nargs='+', type=float, help='Parameter from training config')
    parser.add_argument('--num_classes', default=90, type=int, help='Number of classes to detect')
    parser.add_argument('--width', default=512, type=int, help='Network input width')
    parser.add_argument('--height', default=512, type=int, help='Network input height')
    args = parser.parse_args()

    # Aspect ratios are given as a flat list of (width, height) factor pairs.
    ar = args.aspect_ratios
    assert len(ar) % 2 == 0, 'Aspect ratios must come in pairs'
    ar = list(zip(ar[::2], ar[1::2]))

    createGraph(args.input, args.output, args.min_level, ar, args.num_scales,
                args.anchor_scale, args.num_classes, args.width, args.height)