Skip to content

CNN-based text detector #1399

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 36 commits into from
Oct 31, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
9ae765a
Text detector class and Custom Image processor Class
sghoshcvc Jun 22, 2017
40db962
Add sample script
sghoshcvc Jun 22, 2017
fc9c41b
Minor modification
sghoshcvc Jun 23, 2017
e494efb
Added comments
sghoshcvc Jun 23, 2017
2b8ed12
added instructions to build
sghoshcvc Jul 5, 2017
be395e5
Modified the class heirarchy
sghoshcvc Jul 19, 2017
1bc908b
Added python sample script
sghoshcvc Jul 19, 2017
73ddeab
simple cleaning and added comments
sghoshcvc Jul 19, 2017
9071ca7
Merge branch 'master' into gsoc_textDetect_2017
sghoshcvc Jul 21, 2017
8cf800e
fix a dependency bug
sghoshcvc Jul 21, 2017
a617059
removed Java Wrapper
sghoshcvc Jul 21, 2017
ca2a2ab
Removed white space errors and platform specific warnings
sghoshcvc Jul 21, 2017
b913cac
Fixed Doxygen Warning and error
sghoshcvc Jul 22, 2017
4c9af58
Fixed Text box demo error
sghoshcvc Jul 22, 2017
103fbaf
White Space error in sample python script
sghoshcvc Jul 23, 2017
0e74d63
Modified to handle windows warning
sghoshcvc Jul 23, 2017
111b3be
Modified to silent Clang warnings
sghoshcvc Jul 24, 2017
a2cab07
DNN backend initial commit
sghoshcvc Aug 22, 2017
c697e41
added calculation of output size
sghoshcvc Aug 28, 2017
731637e
Merge branch 'master' into GSOC_text_detect_DNN_backend
sghoshcvc Aug 28, 2017
dc48968
removed blanks, fixed Cmake issue
sghoshcvc Sep 5, 2017
e98f42e
Merge branch 'GSOC_text_detect_DNN_backend' of https://github.com/sgh…
sghoshcvc Sep 5, 2017
af536b1
seperate image pre-processing from ocr code
sghoshcvc Sep 5, 2017
efc864c
removed hard coding height and width
sghoshcvc Sep 15, 2017
887e6e5
removed hard codinginput parameters
sghoshcvc Sep 17, 2017
878258b
modified initializers
sghoshcvc Sep 17, 2017
bf630be
Modified initializers list
sghoshcvc Sep 18, 2017
c33629e
Merge branch 'master' into text_detector_dnn
sovrasov Oct 9, 2017
951e182
text: cleanup dnn text detection part
sovrasov Oct 5, 2017
1306621
text: add prototxt for text detection model
sovrasov Oct 10, 2017
3253fe9
text: impovements in samples and module interface
sovrasov Oct 10, 2017
fb0338f
Merge branch 'master' into text_detector_dnn
sovrasov Oct 11, 2017
9195d2e
text: small adjustments in samples and image preprocessing
sovrasov Oct 11, 2017
7031316
text: add text recognition sample
sovrasov Oct 11, 2017
27961cd
text: fix wrong channel swap in TestDetectorCNN
sovrasov Oct 11, 2017
fd2e37d
text: improve DL-based samples
sovrasov Oct 30, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions modules/text/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,12 @@ Notes
2. Tesseract configure script may fail to detect leptonica, so you may have to edit the configure script - comment off some if's around this message and retain only "then" branch.

3. You are encouraged to search the Net for some better pre-trained classifiers, as well as classifiers for other languages.


Text Detection CNN
=================

Intro
-----

The text module now has text detection and recognition based on deep CNNs. The text detector is a deep CNN that takes an image which may contain multiple words and outputs a list of Rects with bounding boxes and the probability of text in each of them. The text recognizer provides a probability over a given vocabulary for each of these Rects.
3 changes: 3 additions & 0 deletions modules/text/cmake/FindTesseract.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@ endif()
if(NOT Tesseract_FOUND)
find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h
HINTS
/usr/include
/usr/local/include)

find_library(Tesseract_LIBRARY NAMES tesseract
HINTS
/usr/lib
/usr/local/lib)

find_library(Lept_LIBRARY NAMES lept
HINTS
/usr/lib
/usr/local/lib)

if(Tesseract_INCLUDE_DIR AND Tesseract_LIBRARY AND Lept_LIBRARY)
Expand Down
12 changes: 11 additions & 1 deletion modules/text/doc/text.bib
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,14 @@ @article{Gomez14
journal = {CoRR},
volume = {abs/1407.7504},
year = {2014},
}
}
@inproceedings{LiaoSBWL17,
author = {Minghui Liao and
Baoguang Shi and
Xiang Bai and
Xinggang Wang and
Wenyu Liu},
title = {TextBoxes: {A} Fast Text Detector with a Single Deep Neural Network},
booktitle = {AAAI},
year = {2017}
}
1 change: 1 addition & 0 deletions modules/text/include/opencv2/text.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ the use of this software, even if advised of the possibility of such damage.

#include "opencv2/text/erfilter.hpp"
#include "opencv2/text/ocr.hpp"
#include "opencv2/text/textDetector.hpp"

/** @defgroup text Scene Text Detection and Recognition
Expand Down
2 changes: 2 additions & 0 deletions modules/text/include/opencv2/text/ocr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@
#ifndef __OPENCV_TEXT_OCR_HPP__
#define __OPENCV_TEXT_OCR_HPP__

#include <opencv2/core.hpp>

#include <vector>
#include <string>

Expand Down
73 changes: 73 additions & 0 deletions modules/text/include/opencv2/text/textDetector.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__
#define __OPENCV_TEXT_TEXTDETECTOR_HPP__

#include"ocr.hpp"

namespace cv
{
namespace text
{

//! @addtogroup text_detect
//! @{

/** @brief An abstract class providing an interface for text detection algorithms.

Concrete detectors derive from this class and implement detect().
*/
class CV_EXPORTS_W TextDetector
{
public:
/**
@brief Method that provides a quick and simple interface to detect text inside an image.

@param inputImage an image to process
@param Bbox a vector of Rect that will store the detected word bounding boxes
@param confidence a vector of float that will be updated with the confidence the classifier has for each
detected bounding box (the samples shipped with this module read confidence[i] as the score of Bbox[i])
*/
CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0;
// Virtual so that a concrete detector is destroyed correctly through a TextDetector pointer.
virtual ~TextDetector() {}
};

/** @brief TextDetectorCNN class provides the functionality of text bounding box detection.
This class finds bounding boxes of text words given an input image.
This class uses OpenCV dnn module to load pre-trained model described in @cite LiaoSBWL17.
The original repository with the modified SSD Caffe version: https://github.com/MhLiao/TextBoxes.
Model can be downloaded from [DropBox](https://www.dropbox.com/s/g8pjzv2de9gty8g/TextBoxes_icdar13.caffemodel?dl=0).
Modified .prototxt file with the model description can be found in `opencv_contrib/modules/text/samples/textbox.prototxt`.

The class itself is abstract (detect() is pure virtual here); instances are obtained through create().
*/
class CV_EXPORTS_W TextDetectorCNN : public TextDetector
{
public:
/**
@overload

@param inputImage an image expected to be a CV_8UC3 of any size
@param Bbox a vector of Rect that will store the detected word bounding boxes
@param confidence a vector of float that will be updated with the confidence the classifier has for each
detected bounding box
*/
CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0;

/** @brief Creates an instance of the TextDetectorCNN class using the provided parameters.

This overload is not marked CV_WRAP; only the two-argument overload below carries that macro.

@param modelArchFilename the relative or absolute path to the prototxt file describing the classifier's architecture.
@param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
@param detectionSizes a list of sizes for multiscale detection. The values `[(300,300),(700,500),(700,300),(700,700),(1600,1600)]` are
recommended in @cite LiaoSBWL17 to achieve the best quality.
*/
static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename,
std::vector<Size> detectionSizes);
/**
@overload

The detection sizes are not supplied by the caller; which sizes are used is decided by the
implementation, which is not visible in this header.
*/
CV_WRAP static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename);
};

//! @}
}//namespace text
}//namespace cv


#endif // __OPENCV_TEXT_TEXTDETECTOR_HPP__
37 changes: 37 additions & 0 deletions modules/text/samples/deeptextdetection.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
#!/usr/bin/python
import sys
import os
import cv2
import numpy as np

def main():
    """Run the TextBoxes text-detection demo on the image named by argv[1].

    The model files 'textbox.prototxt' and 'TextBoxes_icdar13.caffemodel' must
    be present in the current directory. Every detection whose confidence is
    above 0.6 is drawn on a copy of the image, which is then displayed until a
    key is pressed.

    Exits with status 1 when the image argument is missing, the model files
    are absent, or the image cannot be read.
    """
    print('\nDeeptextdetection.py')
    print(' A demo script of text box alogorithm of the paper:')
    print(' * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network https://arxiv.org/abs/1611.06779\n')

    if len(sys.argv) < 2:
        print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n')
        sys.exit(1)

    if not os.path.isfile('TextBoxes_icdar13.caffemodel') or not os.path.isfile('textbox.prototxt'):
        # print() calls (not Python 2 print statements) so the script also parses on Python 3.
        print(" Model files not found in current directory. Aborting")
        print(" See the documentation of text::TextDetectorCNN class to get download links.")
        sys.exit(1)

    img = cv2.imread(sys.argv[1])
    if img is None:
        # cv2.imread signals an unreadable/missing file by returning None instead of raising.
        print(' (ERROR) Could not read image: ' + sys.argv[1] + '\n')
        sys.exit(1)

    textSpotter = cv2.text.TextDetectorCNN_create("textbox.prototxt", "TextBoxes_icdar13.caffemodel")
    rects, outProbs = textSpotter.detect(img)
    vis = img.copy()
    thres = 0.6

    # rects[i] is scored by outProbs[i]; keep only sufficiently confident boxes.
    for rect, prob in zip(rects, outProbs):
        if prob > thres:
            # rect is (x, y, width, height); cv2.rectangle wants two opposite corners.
            top_left = (rect[0], rect[1])
            bottom_right = (rect[0] + rect[2], rect[1] + rect[3])
            cv2.rectangle(vis, top_left, bottom_right, (255, 0, 0), 2)

    cv2.imshow("Text detection result", vis)
    cv2.waitKey()


if __name__ == "__main__":
    main()
9 changes: 0 additions & 9 deletions modules/text/samples/dictnet_demo.cpp
Original file line number Diff line number Diff line change
@@ -1,12 +1,3 @@
/*
* dictnet_demo.cpp
*
* Demonstrates simple use of the holistic word classifier in C++
*
* Created on: June 26, 2016
* Author: Anguelos Nicolaou <anguelos.nicolaou AT gmail.com>
*/

#include "opencv2/text.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/imgproc.hpp"
Expand Down
122 changes: 122 additions & 0 deletions modules/text/samples/text_recognition_cnn.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#include <opencv2/text.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/dnn.hpp>

#include <iostream>
#include <fstream>

using namespace cv;
using namespace std;

namespace
{
/**
 * Prints the demo description, the command-line usage and the instructions
 * for obtaining the required Caffe model files to standard output.
 *
 * @param progFname name of the executable, echoed in the usage line.
 */
void printHelpStr(const std::string& progFname)
{
    std::cout << " Demo of text recognition CNN for text detection.\n"
              << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015\n\n"
              // The program reads a single positional argument (argv[1], the image to
              // process), so only <input_image> is advertised here.
              << " Usage: " << progFname << " <input_image>\n"
              << " Caffe Model files (textbox.prototxt, TextBoxes_icdar13.caffemodel)\n"
              << " must be in the current directory. See the documentation of text::TextDetectorCNN class to get download links.\n"
              << " Obtaining text recognition Caffe Model files in linux shell:\n"
              << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel\n"
              << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt\n"
              << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt\n"
              << std::endl; // trailing blank line + flush
}

/**
 * Reports whether @p filename can be opened for reading.
 *
 * @param filename path of the file to probe.
 * @return true when an input stream on the file is in a good state right after opening.
 */
bool fileExists (const std::string& filename)
{
    return std::ifstream(filename.c_str()).good();
}

/**
 * Draws the selected detections onto @p src.
 *
 * Only the boxes whose positions are listed in @p indexes are drawn. For
 * CV_8UC3 images each box is outlined, labelled with its confidence and also
 * logged to stdout; for any other image type a plain outline is drawn.
 *
 * @param src     image to draw on. It is a cv::Mat taken by value, so it shares
 *                its pixel buffer with the caller's Mat and the drawing is
 *                visible to the caller.
 * @param groups  all candidate bounding boxes.
 * @param probs   confidence of each candidate, parallel to @p groups.
 * @param indexes positions in @p groups / @p probs of the boxes to draw.
 */
void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
{
    // The image type does not change while drawing, so test it once.
    const bool isColor = (src.type() == CV_8UC3);

    for (size_t i = 0; i < indexes.size(); i++)
    {
        const int boxIdx = indexes[i];
        const Rect currentBox = groups[boxIdx];

        if (!isColor)
        {
            // Index through `indexes` here as well: `i` counts selected boxes,
            // it is not a position in `groups`.
            rectangle(src, currentBox, Scalar( 255 ), 3, 8 );
            continue;
        }

        rectangle(src, currentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
        String label = format("%.2f", probs[boxIdx]);
        std::cout << "text box: " << currentBox << " confidence: " << probs[boxIdx] << "\n";

        int baseLine = 0;
        Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
        // Keep the label inside the image when the box touches the top border.
        int yLeftBottom = std::max(currentBox.y, labelSize.height);
        rectangle(src, Point(currentBox.x, yLeftBottom - labelSize.height),
                  Point(currentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);

        putText(src, label, Point(currentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
    }
}

}

int main(int argc, const char * argv[])
{
if (argc < 2)
{
printHelpStr(argv[0]);
cout << "Insufiecient parameters. Aborting!" << endl;
exit(1);
}

const string modelArch = "textbox.prototxt";
const string moddelWeights = "TextBoxes_icdar13.caffemodel";

if (!fileExists(modelArch) || !fileExists(moddelWeights))
{
printHelpStr(argv[0]);
cout << "Model files not found in the current directory. Aborting!" << endl;
exit(1);
}

Mat image = imread(String(argv[1]), IMREAD_COLOR);

cout << "Starting Text Box Demo" << endl;
Ptr<text::TextDetectorCNN> textSpotter =
text::TextDetectorCNN::create(modelArch, moddelWeights);

vector<Rect> bbox;
vector<float> outProbabillities;
textSpotter->detect(image, bbox, outProbabillities);
std::vector<int> indexes;
cv::dnn::NMSBoxes(bbox, outProbabillities, 0.4f, 0.5f, indexes);

Mat image_copy = image.clone();
textbox_draw(image_copy, bbox, outProbabillities, indexes);
imshow("Text detection", image_copy);
image_copy = image.clone();

Ptr<text::OCRHolisticWordRecognizer> wordSpotter =
text::OCRHolisticWordRecognizer::create("dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt");

for(size_t i = 0; i < indexes.size(); i++)
{
Mat wordImg;
cvtColor(image(bbox[indexes[i]]), wordImg, COLOR_BGR2GRAY);
string word;
vector<float> confs;
wordSpotter->run(wordImg, word, NULL, NULL, &confs);

Rect currrentBox = bbox[indexes[i]];
rectangle(image_copy, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);

int baseLine = 0;
Size labelSize = getTextSize(word, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
int yLeftBottom = std::max(currrentBox.y, labelSize.height);
rectangle(image_copy, Point(currrentBox.x, yLeftBottom - labelSize.height),
Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);

putText(image_copy, word, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);

}
imshow("Text recognition", image_copy);
cout << "Recognition finished. Press any key to exit.\n";
waitKey();
return 0;
}
Loading