Merge pull request #1399 from sovrasov:text_detector_dnn

vpisarev · vpisarev · commit 6651fb0b45f5 · 2017-10-31T09:41:07.000Z
diff --git a/modules/text/README.md b/modules/text/README.md
@@ -47,3 +47,12 @@ Notes
 2. Tesseract configure script may fail to detect leptonica, so you may have to edit the configure script - comment off some if's around this message and retain only "then" branch.
 
 3. You are encouraged to search the Net for some better pre-trained classifiers, as well as classifiers for other languages.
+
+
+Text Detection CNN
+=================
+
+Intro
+-----
+
+The text module now has text detection and recognition based on deep CNNs. The text detector is a deep CNN that takes an image which may contain multiple words and outputs a list of Rects with bounding boxes and the probability of text in each of them. The text recognizer provides a probability over a given vocabulary for each of these rects.
diff --git a/modules/text/cmake/FindTesseract.cmake b/modules/text/cmake/FindTesseract.cmake
@@ -5,14 +5,17 @@ endif()
 if(NOT Tesseract_FOUND)
   find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h
     HINTS
+    /usr/include
     /usr/local/include)
 
   find_library(Tesseract_LIBRARY NAMES tesseract
     HINTS
+    /usr/lib
     /usr/local/lib)
 
   find_library(Lept_LIBRARY NAMES lept
     HINTS
+    /usr/lib
     /usr/local/lib)
 
   if(Tesseract_INCLUDE_DIR AND Tesseract_LIBRARY AND Lept_LIBRARY)
diff --git a/modules/text/doc/text.bib b/modules/text/doc/text.bib
@@ -31,4 +31,14 @@ @article{Gomez14
   journal   = {CoRR},
   volume    = {abs/1407.7504},
   year      = {2014},
-}
+}
+@inproceedings{LiaoSBWL17,
+  author    = {Minghui Liao and
+               Baoguang Shi and
+               Xiang Bai and
+               Xinggang Wang and
+               Wenyu Liu},
+  title     = {TextBoxes: {A} Fast Text Detector with a Single Deep Neural Network},
+  booktitle = {AAAI},
+  year      = {2017}
+}
diff --git a/modules/text/include/opencv2/text.hpp b/modules/text/include/opencv2/text.hpp
@@ -41,6 +41,7 @@ the use of this software, even if advised of the possibility of such damage.
 
 #include "opencv2/text/erfilter.hpp"
 #include "opencv2/text/ocr.hpp"
+#include "opencv2/text/textDetector.hpp"
 
 /** @defgroup text Scene Text Detection and Recognition
 
diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp
@@ -44,6 +44,8 @@
 #ifndef __OPENCV_TEXT_OCR_HPP__
 #define __OPENCV_TEXT_OCR_HPP__
 
+#include <opencv2/core.hpp>
+
 #include <vector>
 #include <string>
 
diff --git a/modules/text/include/opencv2/text/textDetector.hpp b/modules/text/include/opencv2/text/textDetector.hpp
@@ -0,0 +1,73 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__
+#define __OPENCV_TEXT_TEXTDETECTOR_HPP__
+
+#include"ocr.hpp"
+
+namespace cv
+{
+namespace text
+{
+
+//! @addtogroup text_detect
+//! @{
+
+/** @brief An abstract class providing an interface for text detection algorithms.
+ */
+class CV_EXPORTS_W TextDetector
+{
+public:
+    /**
+    @brief Method that provides a quick and simple interface to detect text inside an image
+
+    @param inputImage an image to process
+    @param Bbox a vector of Rect that will store the detected word bounding boxes
+    @param confidence a vector of float that will be updated with the confidence the classifier has for each bounding box in Bbox
+    */
+    CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0;
+    // Virtual so that implementations can be destroyed through a TextDetector pointer.
+    virtual ~TextDetector() {}
+};
+
+/** @brief TextDetectorCNN class provides the functionality of text bounding box detection.
+ This class finds bounding boxes of text words in an input image.
+ This class uses OpenCV dnn module to load pre-trained model described in @cite LiaoSBWL17.
+ The original repository with the modified SSD Caffe version: https://github.com/MhLiao/TextBoxes.
+ Model can be downloaded from [DropBox](https://www.dropbox.com/s/g8pjzv2de9gty8g/TextBoxes_icdar13.caffemodel?dl=0).
+ Modified .prototxt file with the model description can be found in `opencv_contrib/modules/text/samples/textbox.prototxt`.
+ */
+class CV_EXPORTS_W TextDetectorCNN : public TextDetector
+{
+public:
+    /**
+    @overload
+
+    @param inputImage an image expected to be a CV_8UC3 of any size
+    @param Bbox a vector of Rect that will store the detected word bounding boxes
+    @param confidence a vector of float that will be updated with the confidence the classifier has for each bounding box in Bbox
+    */
+    CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0;
+
+    /** @brief Creates an instance of the TextDetectorCNN class using the provided parameters.
+
+    @param modelArchFilename the relative or absolute path to the prototxt file describing the classifier's architecture.
+    @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
+    @param detectionSizes a list of sizes for multiscale detection. The values `[(300,300),(700,500),(700,300),(700,700),(1600,1600)]` are
+    recommended in @cite LiaoSBWL17 to achieve the best quality.
+    */
+    static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename,
+                                               std::vector<Size> detectionSizes);
+    /**
+      @overload
+
+      Same as above, but without an explicit detectionSizes list.
+    */
+    CV_WRAP static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename);
+};
+
+//! @}
+}//namespace text
+}//namespace cv
+
+
+#endif // __OPENCV_TEXT_TEXTDETECTOR_HPP__
diff --git a/modules/text/samples/deeptextdetection.py b/modules/text/samples/deeptextdetection.py
@@ -0,0 +1,37 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+import sys
+import os
+import cv2
+import numpy as np
+
+def main():
+    print('\nDeeptextdetection.py')
+    print('       A demo script of text box alogorithm of the paper:')
+    print('       * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network https://arxiv.org/abs/1611.06779\n')
+
+    if (len(sys.argv) < 2):
+        print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n')
+        quit()
+
+    if not os.path.isfile('TextBoxes_icdar13.caffemodel') or not os.path.isfile('textbox.prototxt'):
+        print " Model files not found in current directory. Aborting"
+        print " See the documentation of text::TextDetectorCNN class to get download links."
+        quit()
+
+    img = cv2.imread(str(sys.argv[1]))
+    textSpotter = cv2.text.TextDetectorCNN_create("textbox.prototxt", "TextBoxes_icdar13.caffemodel")
+    rects, outProbs = textSpotter.detect(img);
+    vis = img.copy()
+    thres = 0.6
+
+    for r in range(np.shape(rects)[0]):
+        if outProbs[r] > thres:
+            rect = rects[r]
+            cv2.rectangle(vis, (rect[0],rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), (255, 0, 0), 2)
+
+    cv2.imshow("Text detection result", vis)
+    cv2.waitKey()
+
+if __name__ == "__main__":
+    main()
diff --git a/modules/text/samples/dictnet_demo.cpp b/modules/text/samples/dictnet_demo.cpp
@@ -1,12 +1,3 @@
-/*
- * dictnet_demo.cpp
- *
- * Demonstrates simple use of the holistic word classifier in C++
- *
- * Created on: June 26, 2016
- *     Author: Anguelos Nicolaou <anguelos.nicolaou AT gmail.com>
- */
-
 #include  "opencv2/text.hpp"
 #include  "opencv2/highgui.hpp"
 #include  "opencv2/imgproc.hpp"
diff --git a/modules/text/samples/text_recognition_cnn.cpp b/modules/text/samples/text_recognition_cnn.cpp
@@ -0,0 +1,122 @@
+#include <opencv2/text.hpp>
+#include <opencv2/highgui.hpp>
+#include <opencv2/imgproc.hpp>
+#include <opencv2/dnn.hpp>
+
+#include  <iostream>
+#include  <fstream>
+
+using namespace cv;
+using namespace std;
+
+namespace
+{
+/** Prints the demo description, the command-line usage and hints on where to get the model files.
+ *  @param progFname program name shown in the usage line (main passes argv[0]).
+ */
+void printHelpStr(const string& progFname)
+{
+    cout << "   Demo of text recognition CNN for text detection." << endl
+         << "   Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"<<endl<<endl
+         // main() reads exactly one positional argument, argv[1], as the input image.
+         << "   Usage: " << progFname << " <input_image>" << endl
+         << "   Caffe Model files (textbox.prototxt, TextBoxes_icdar13.caffemodel)"<<endl
+         << "     must be in the current directory. See the documentation of text::TextDetectorCNN class to get download links." << endl
+         << "   Obtaining text recognition Caffe Model files in linux shell:" << endl
+         << "   wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel" << endl
+         << "   wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt" << endl
+         << "   wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt" <<endl << endl;
+}
+
+// Reports whether filename can be opened for reading: true when an input stream
+// constructed on it is in a good state.
+bool fileExists (const string& filename)
+{
+    return ifstream(filename.c_str()).good();
+}
+
+/** Draws the detections selected by indexes onto src.
+ *
+ *  For CV_8UC3 images every selected box is outlined, labelled with its confidence
+ *  (two decimals) and reported on stdout. For any other image type only a plain
+ *  outline is drawn.
+ *
+ *  @param src     image to draw on; Mat copies share pixel data, so the caller's image is modified.
+ *  @param groups  all detected boxes.
+ *  @param probs   confidence of each box, parallel to groups.
+ *  @param indexes positions in groups/probs of the boxes to draw.
+ */
+void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
+{
+    // The image type cannot change while drawing, so test it once.
+    const bool isBgr = (src.type() == CV_8UC3);
+
+    for (size_t i = 0; i < indexes.size(); i++)
+    {
+        // Always go through indexes: i itself is not a position in groups/probs.
+        const Rect currentBox = groups[indexes[i]];
+
+        if (!isBgr)
+        {
+            // Previously this branch drew groups[i], i.e. a box that was not necessarily selected.
+            rectangle(src, currentBox, Scalar( 255 ), 3, 8 );
+            continue;
+        }
+
+        const float confidence = probs[indexes[i]];
+        rectangle(src, currentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
+        String label = format("%.2f", confidence);
+        std::cout << "text box: " << currentBox << " confidence: " << confidence << "\n";
+
+        // Keep the label inside the image when the box touches the top border.
+        int baseLine = 0;
+        Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
+        int yLeftBottom = std::max(currentBox.y, labelSize.height);
+        rectangle(src, Point(currentBox.x, yLeftBottom - labelSize.height),
+                  Point(currentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
+
+        putText(src, label, Point(currentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
+    }
+}
+
+}
+
+/** Demo entry point.
+ *
+ *  Detects text boxes in the image given as argv[1], filters them with non-maximum
+ *  suppression, runs the holistic word recognizer on every remaining box and shows
+ *  one window with the detections and one with the recognized words.
+ *
+ *  @return 1 when the argument, the model files or the image are unusable, 0 otherwise.
+ */
+int main(int argc, const char * argv[])
+{
+    if (argc < 2)
+    {
+        printHelpStr(argv[0]);
+        cout << "Insufiecient parameters. Aborting!" << endl;
+        return 1;
+    }
+
+    const string modelArch = "textbox.prototxt";
+    const string modelWeights = "TextBoxes_icdar13.caffemodel";
+
+    if (!fileExists(modelArch) || !fileExists(modelWeights))
+    {
+        printHelpStr(argv[0]);
+        cout << "Model files not found in the current directory. Aborting!" << endl;
+        return 1;
+    }
+
+    // Check the recognition model up front as well, instead of failing after detection has run.
+    const string recognizerArch = "dictnet_vgg_deploy.prototxt";
+    const string recognizerWeights = "dictnet_vgg.caffemodel";
+    const string recognizerLabels = "dictnet_vgg_labels.txt";
+
+    if (!fileExists(recognizerArch) || !fileExists(recognizerWeights) || !fileExists(recognizerLabels))
+    {
+        printHelpStr(argv[0]);
+        cout << "Text recognition model files not found in the current directory. Aborting!" << endl;
+        return 1;
+    }
+
+    Mat image = imread(String(argv[1]), IMREAD_COLOR);
+    if (image.empty())
+    {
+        // imread reports a missing or unreadable file by returning an empty matrix.
+        cout << "Could not read image: " << argv[1] << ". Aborting!" << endl;
+        return 1;
+    }
+
+    cout << "Starting Text Box Demo" << endl;
+    Ptr<text::TextDetectorCNN> textSpotter =
+            text::TextDetectorCNN::create(modelArch, modelWeights);
+
+    vector<Rect> bbox;
+    vector<float> outProbabillities;
+    textSpotter->detect(image, bbox, outProbabillities);
+
+    // Keep only the boxes that survive non-maximum suppression; indexes refers into bbox.
+    std::vector<int> indexes;
+    cv::dnn::NMSBoxes(bbox, outProbabillities, 0.4f, 0.5f, indexes);
+
+    Mat image_copy = image.clone();
+    textbox_draw(image_copy, bbox, outProbabillities, indexes);
+    imshow("Text detection", image_copy);
+    image_copy = image.clone();
+
+    Ptr<text::OCRHolisticWordRecognizer> wordSpotter =
+            text::OCRHolisticWordRecognizer::create(recognizerArch, recognizerWeights, recognizerLabels);
+
+    const Rect imageArea(0, 0, image.cols, image.rows);
+    for(size_t i = 0; i < indexes.size(); i++)
+    {
+        // Taking a ROI requires the rectangle to lie inside the image. Nothing here guarantees
+        // that for detector output, so clip the box and skip it if nothing is left.
+        const Rect currentBox = bbox[indexes[i]] & imageArea;
+        if (currentBox.width <= 0 || currentBox.height <= 0)
+            continue;
+
+        Mat wordImg;
+        cvtColor(image(currentBox), wordImg, COLOR_BGR2GRAY);
+        string word;
+        vector<float> confs;
+        wordSpotter->run(wordImg, word, NULL, NULL, &confs);
+
+        rectangle(image_copy, currentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
+
+        // Keep the label inside the image when the box touches the top border.
+        int baseLine = 0;
+        Size labelSize = getTextSize(word, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
+        int yLeftBottom = std::max(currentBox.y, labelSize.height);
+        rectangle(image_copy, Point(currentBox.x, yLeftBottom - labelSize.height),
+                  Point(currentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
+
+        putText(image_copy, word, Point(currentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
+    }
+    imshow("Text recognition", image_copy);
+    cout << "Recognition finished. Press any key to exit.\n";
+    waitKey();
+    return 0;
+}
diff --git a/modules/text/samples/textbox.prototxt b/modules/text/samples/textbox.prototxt
diff --git a/modules/text/samples/textbox_demo.cpp b/modules/text/samples/textbox_demo.cpp
diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp
diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp