Skip to content

Commit 6651fb0

Browse files
committed
Merge pull request #1399 from sovrasov:text_detector_dnn
2 parents e85a802 + fd2e37d commit 6651fb0

File tree

13 files changed

+2063
-10
lines changed

13 files changed

+2063
-10
lines changed

modules/text/README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,12 @@ Notes
4747
2. Tesseract configure script may fail to detect leptonica, so you may have to edit the configure script - comment off some if's around this message and retain only "then" branch.
4848

4949
3. You are encouraged to search the Net for some better pre-trained classifiers, as well as classifiers for other languages.
50+
51+
52+
Text Detection CNN
53+
=================
54+
55+
Intro
56+
-----
57+
58+
The text module now has text detection and recognition based on deep CNNs. The text detector is a deep CNN that takes an image which may contain multiple words and outputs a list of Rects with bounding boxes and the probability of text in each of them. The text recognizer provides a probability over a given vocabulary for each of these Rects.

modules/text/cmake/FindTesseract.cmake

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,17 @@ endif()
55
if(NOT Tesseract_FOUND)
66
find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h
77
HINTS
8+
/usr/include
89
/usr/local/include)
910

1011
find_library(Tesseract_LIBRARY NAMES tesseract
1112
HINTS
13+
/usr/lib
1214
/usr/local/lib)
1315

1416
find_library(Lept_LIBRARY NAMES lept
1517
HINTS
18+
/usr/lib
1619
/usr/local/lib)
1720

1821
if(Tesseract_INCLUDE_DIR AND Tesseract_LIBRARY AND Lept_LIBRARY)

modules/text/doc/text.bib

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,14 @@ @article{Gomez14
3131
journal = {CoRR},
3232
volume = {abs/1407.7504},
3333
year = {2014},
34-
}
34+
}
35+
@inproceedings{LiaoSBWL17,
36+
author = {Minghui Liao and
37+
Baoguang Shi and
38+
Xiang Bai and
39+
Xinggang Wang and
40+
Wenyu Liu},
41+
title = {TextBoxes: {A} Fast Text Detector with a Single Deep Neural Network},
42+
booktitle = {AAAI},
43+
year = {2017}
44+
}

modules/text/include/opencv2/text.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ the use of this software, even if advised of the possibility of such damage.
4141

4242
#include "opencv2/text/erfilter.hpp"
4343
#include "opencv2/text/ocr.hpp"
44+
#include "opencv2/text/textDetector.hpp"
4445

4546
/** @defgroup text Scene Text Detection and Recognition
4647

modules/text/include/opencv2/text/ocr.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
#ifndef __OPENCV_TEXT_OCR_HPP__
4545
#define __OPENCV_TEXT_OCR_HPP__
4646

47+
#include <opencv2/core.hpp>
48+
4749
#include <vector>
4850
#include <string>
4951

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
// This file is part of OpenCV project.
2+
// It is subject to the license terms in the LICENSE file found in the top-level directory
3+
// of this distribution and at http://opencv.org/license.html.
4+
5+
#ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__
6+
#define __OPENCV_TEXT_TEXTDETECTOR_HPP__
7+
8+
#include"ocr.hpp"
9+
10+
namespace cv
11+
{
12+
namespace text
13+
{
14+
15+
//! @addtogroup text_detect
16+
//! @{
17+
18+
/** @brief An abstract class providing interface for text detection algorithms
19+
*/
20+
class CV_EXPORTS_W TextDetector
21+
{
22+
public:
23+
/**
24+
@brief Method that provides a quick and simple interface to detect text inside an image
25+
26+
@param inputImage an image to process
27+
@param Bbox a vector of Rect that will store the detected word bounding boxes
28+
@param confidence a vector of float that will be updated with the confidence the classifier has for each detected bounding box
29+
*/
30+
CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0;
31+
// Virtual destructor: this class is an abstract interface meant to be derived from.
virtual ~TextDetector() {}
32+
};
33+
34+
/** @brief TextDetectorCNN class provides the functionality of text bounding box detection.
35+
This class finds bounding boxes of text words in a given input image.
36+
This class uses OpenCV dnn module to load pre-trained model described in @cite LiaoSBWL17.
37+
The original repository with the modified SSD Caffe version: https://github.com/MhLiao/TextBoxes.
38+
Model can be downloaded from [DropBox](https://www.dropbox.com/s/g8pjzv2de9gty8g/TextBoxes_icdar13.caffemodel?dl=0).
39+
Modified .prototxt file with the model description can be found in `opencv_contrib/modules/text/samples/textbox.prototxt`.
40+
*/
41+
class CV_EXPORTS_W TextDetectorCNN : public TextDetector
42+
{
43+
public:
44+
/**
45+
@overload
46+
47+
@param inputImage an image expected to be a CV_8UC3 of any size
48+
@param Bbox a vector of Rect that will store the detected word bounding boxes
49+
@param confidence a vector of float that will be updated with the confidence the classifier has for each detected bounding box
50+
*/
51+
CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector<Rect>& Bbox, CV_OUT std::vector<float>& confidence) = 0;
52+
53+
/** @brief Creates an instance of the TextDetectorCNN class using the provided parameters.
54+
55+
@param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture.
56+
@param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
57+
@param detectionSizes a list of sizes for multiscale detection. The values `[(300,300),(700,500),(700,300),(700,700),(1600,1600)]` are
58+
recommended in @cite LiaoSBWL17 to achieve the best quality.
59+
*/
60+
static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename,
61+
std::vector<Size> detectionSizes);
62+
/**
63+
@overload
64+
This variant takes no detectionSizes argument; the sizes it uses are chosen by the implementation, which is not visible in this header.
*/
65+
CV_WRAP static Ptr<TextDetectorCNN> create(const String& modelArchFilename, const String& modelWeightsFilename);
66+
};
67+
68+
//! @}
69+
}//namespace text
70+
}//namespace cv
71+
72+
73+
#endif // __OPENCV_TEXT_TEXTDETECTOR_HPP__
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# -*- coding: utf-8 -*-
2+
#!/usr/bin/python
3+
import sys
4+
import os
5+
import cv2
6+
import numpy as np
7+
8+
def main():
    """Run the TextBoxes text-detection demo on the image named by sys.argv[1].

    Prints a short banner, verifies that an image path was supplied and that
    'TextBoxes_icdar13.caffemodel' and 'textbox.prototxt' exist in the current
    directory, runs cv2.text.TextDetectorCNN on the image, draws every box
    whose confidence exceeds 0.6 and shows the result in a window until a key
    is pressed.

    Exits with status 1 when the argument is missing, the model files are not
    found, or the image cannot be read.
    """
    print('\nDeeptextdetection.py')
    print(' A demo script of text box alogorithm of the paper:')
    print(' * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network https://arxiv.org/abs/1611.06779\n')

    if len(sys.argv) < 2:
        print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n')
        sys.exit(1)

    if not os.path.isfile('TextBoxes_icdar13.caffemodel') or not os.path.isfile('textbox.prototxt'):
        # print() with a single argument behaves the same on Python 2 and 3;
        # the former `print "..."` statements were a SyntaxError on Python 3.
        print(" Model files not found in current directory. Aborting")
        print(" See the documentation of text::TextDetectorCNN class to get download links.")
        sys.exit(1)

    image_path = str(sys.argv[1])
    img = cv2.imread(image_path)
    if img is None:
        # cv2.imread signals failure by returning None instead of raising.
        print(' (ERROR) Could not read image: ' + image_path + '\n')
        sys.exit(1)

    textSpotter = cv2.text.TextDetectorCNN_create("textbox.prototxt", "TextBoxes_icdar13.caffemodel")
    rects, outProbs = textSpotter.detect(img)
    vis = img.copy()
    thres = 0.6

    # rects and outProbs are parallel sequences: one confidence per box.
    for rect, prob in zip(rects, outProbs):
        if prob > thres:
            top_left = (rect[0], rect[1])
            bottom_right = (rect[0] + rect[2], rect[1] + rect[3])
            cv2.rectangle(vis, top_left, bottom_right, (255, 0, 0), 2)

    cv2.imshow("Text detection result", vis)
    cv2.waitKey()
36+
if __name__ == "__main__":
37+
main()

modules/text/samples/dictnet_demo.cpp

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,3 @@
1-
/*
2-
* dictnet_demo.cpp
3-
*
4-
* Demonstrates simple use of the holistic word classifier in C++
5-
*
6-
* Created on: June 26, 2016
7-
* Author: Anguelos Nicolaou <anguelos.nicolaou AT gmail.com>
8-
*/
9-
101
#include "opencv2/text.hpp"
112
#include "opencv2/highgui.hpp"
123
#include "opencv2/imgproc.hpp"
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
#include <opencv2/text.hpp>
2+
#include <opencv2/highgui.hpp>
3+
#include <opencv2/imgproc.hpp>
4+
#include <opencv2/dnn.hpp>
5+
6+
#include <iostream>
7+
#include <fstream>
8+
9+
using namespace cv;
10+
using namespace std;
11+
12+
namespace
13+
{
14+
/**
 * Prints the demo description, the command-line usage and the model download
 * hints to standard output.
 *
 * @param progFname name the program was started with (normally argv[0]); it is
 *                  echoed in the usage line.
 */
void printHelpStr(const std::string& progFname)
{
    // The program consumes exactly one positional argument (argv[1], the input
    // image), so the usage line must not advertise an <output_file>.
    std::cout << " Demo of text recognition CNN for text detection." << std::endl
              << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015" << std::endl << std::endl
              << " Usage: " << progFname << " <input_image>" << std::endl
              << " Caffe Model files (textbox.prototxt, TextBoxes_icdar13.caffemodel)" << std::endl
              << " must be in the current directory. See the documentation of text::TextDetectorCNN class to get download links." << std::endl
              << " Obtaining text recognition Caffe Model files in linux shell:" << std::endl
              << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel" << std::endl
              << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt" << std::endl
              << " wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt" << std::endl << std::endl;
}
26+
27+
/**
 * Reports whether the given file can be opened for reading.
 *
 * @param filename path of the file to probe.
 * @return true when an input stream opened on @p filename is in a good state.
 */
bool fileExists (const std::string& filename)
{
    std::ifstream probe(filename.c_str());
    const bool opened = probe.good();
    return opened;
}
32+
33+
/**
 * Draws the detections selected by @p indexes onto @p src.
 *
 * For CV_8UC3 images each selected box is outlined, labelled with its
 * confidence and also logged to standard output. For any other image type
 * only a plain rectangle is drawn.
 *
 * @param src     image to draw on.
 * @param groups  all detected bounding boxes.
 * @param probs   confidences, parallel to @p groups.
 * @param indexes positions in @p groups / @p probs of the boxes to draw.
 */
void textbox_draw(Mat src, std::vector<Rect>& groups, std::vector<float>& probs, std::vector<int>& indexes)
{
    // The image type cannot change while drawing, so test it once.
    const bool isColor = (src.type() == CV_8UC3);

    for (size_t i = 0; i < indexes.size(); i++)
    {
        // indexes[i] selects the detection; i itself is only a position in the
        // selection list. The non-colour branch used groups[i] before, which
        // drew boxes that were not part of the selection.
        const int boxIdx = indexes[i];
        const Rect currentBox = groups[boxIdx];

        if (!isColor)
        {
            rectangle(src, currentBox, Scalar( 255 ), 3, 8 );
            continue;
        }

        rectangle(src, currentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
        String label = format("%.2f", probs[boxIdx]);
        std::cout << "text box: " << currentBox << " confidence: " << probs[boxIdx] << "\n";

        int baseLine = 0;
        Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
        // Keep the label inside the image when the box touches the top edge.
        int yLeftBottom = std::max(currentBox.y, labelSize.height);
        rectangle(src, Point(currentBox.x, yLeftBottom - labelSize.height),
                  Point(currentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);

        putText(src, label, Point(currentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
    }
}
56+
57+
}
58+
59+
int main(int argc, const char * argv[])
60+
{
61+
if (argc < 2)
62+
{
63+
printHelpStr(argv[0]);
64+
cout << "Insufiecient parameters. Aborting!" << endl;
65+
exit(1);
66+
}
67+
68+
const string modelArch = "textbox.prototxt";
69+
const string moddelWeights = "TextBoxes_icdar13.caffemodel";
70+
71+
if (!fileExists(modelArch) || !fileExists(moddelWeights))
72+
{
73+
printHelpStr(argv[0]);
74+
cout << "Model files not found in the current directory. Aborting!" << endl;
75+
exit(1);
76+
}
77+
78+
Mat image = imread(String(argv[1]), IMREAD_COLOR);
79+
80+
cout << "Starting Text Box Demo" << endl;
81+
Ptr<text::TextDetectorCNN> textSpotter =
82+
text::TextDetectorCNN::create(modelArch, moddelWeights);
83+
84+
vector<Rect> bbox;
85+
vector<float> outProbabillities;
86+
textSpotter->detect(image, bbox, outProbabillities);
87+
std::vector<int> indexes;
88+
cv::dnn::NMSBoxes(bbox, outProbabillities, 0.4f, 0.5f, indexes);
89+
90+
Mat image_copy = image.clone();
91+
textbox_draw(image_copy, bbox, outProbabillities, indexes);
92+
imshow("Text detection", image_copy);
93+
image_copy = image.clone();
94+
95+
Ptr<text::OCRHolisticWordRecognizer> wordSpotter =
96+
text::OCRHolisticWordRecognizer::create("dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt");
97+
98+
for(size_t i = 0; i < indexes.size(); i++)
99+
{
100+
Mat wordImg;
101+
cvtColor(image(bbox[indexes[i]]), wordImg, COLOR_BGR2GRAY);
102+
string word;
103+
vector<float> confs;
104+
wordSpotter->run(wordImg, word, NULL, NULL, &confs);
105+
106+
Rect currrentBox = bbox[indexes[i]];
107+
rectangle(image_copy, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA);
108+
109+
int baseLine = 0;
110+
Size labelSize = getTextSize(word, FONT_HERSHEY_PLAIN, 1, 1, &baseLine);
111+
int yLeftBottom = std::max(currrentBox.y, labelSize.height);
112+
rectangle(image_copy, Point(currrentBox.x, yLeftBottom - labelSize.height),
113+
Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED);
114+
115+
putText(image_copy, word, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA);
116+
117+
}
118+
imshow("Text recognition", image_copy);
119+
cout << "Recognition finished. Press any key to exit.\n";
120+
waitKey();
121+
return 0;
122+
}

0 commit comments

Comments
 (0)