Merge pull request #1384 from mshabunin:pr723

vpisarev · vpisarev · commit 68736a2ce5c1 · 2017-10-10T09:54:46.000Z
diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt
@@ -1,5 +1,5 @@
 set(the_description "Text Detection and Recognition")
-ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d OPTIONAL opencv_highgui WRAP python java)
+ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_dnn OPTIONAL opencv_highgui WRAP python java)
 
 if(NOT CMAKE_CROSSCOMPILING OR OPENCV_FIND_TESSERACT)
   set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp
@@ -536,8 +536,66 @@ at each window location.
 
 CV_EXPORTS_W Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClassifierCNN(const String& filename);
 
+
+/** @brief OCRHolisticWordRecognizer class provides the functionality of segmented word spotting.
+ * Given a predefined vocabulary, a DictNet is employed to select the most probable
+ * word given an input image.
+ *
+ * DictNet is described in detail in:
+ * Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015
+ * http://arxiv.org/abs/1412.1842
+ */
+class CV_EXPORTS OCRHolisticWordRecognizer : public BaseOCR
+{
+public:
+    virtual void run(Mat& image,
+                     std::string& output_text,
+                     std::vector<Rect>* component_rects = NULL,
+                     std::vector<std::string>* component_texts = NULL,
+                     std::vector<float>* component_confidences = NULL,
+                     int component_level = OCR_LEVEL_WORD) = 0;
+
+    /** @brief Recognize text using a segmentation based word-spotting/classifier CNN.
+
+    Takes image on input and returns recognized text in the output_text parameter. Optionally
+    provides also the Rects for individual text elements found (e.g. words), and the list of those
+    text elements with their confidence values.
+
+    @param image Input image of type CV_8UC1. The implementation in ocr_holistic.cpp asserts a
+        single-channel CV_8U image, so CV_8UC3 input has to be converted to grayscale first.
+
+    @param mask Not used for recognition; only available for compatibility reasons. Pass a mask of
+        the same size as image: the implementation in ocr_holistic.cpp asserts on a size mismatch.
+
+    @param output_text Output text of the word spotting, always one that exists in the dictionary.
+
+    @param component_rects Not applicable for word spotting; can be NULL. If not NULL, a single element will be put in the vector.
+
+    @param component_texts Not applicable for word spotting; can be NULL. If not NULL, a single element will be put in the vector.
+
+    @param component_confidences Not applicable for word spotting; can be NULL. If not NULL, a single
+        element will be put in the vector.
+
+    @param component_level must be OCR_LEVEL_WORD.
+     */
+    virtual void run(Mat& image,
+                     Mat& mask,
+                     std::string& output_text,
+                     std::vector<Rect>* component_rects = NULL,
+                     std::vector<std::string>* component_texts = NULL,
+                     std::vector<float>* component_confidences = NULL,
+                     int component_level = OCR_LEVEL_WORD) = 0;
+
+    /** @brief Creates an instance of the OCRHolisticWordRecognizer class from a Caffe network (archFilename,
+     *  weightsFilename) and a vocabulary text file (wordsFilename) that holds one word per line. */
+    static Ptr<OCRHolisticWordRecognizer> create(const std::string &archFilename,
+                                                 const std::string &weightsFilename,
+                                                 const std::string &wordsFilename);
+};
+
 //! @}
 
-}
-}
+}} // cv::text::
+
+
 #endif // _OPENCV_TEXT_OCR_HPP_
diff --git a/modules/text/samples/dictnet_demo.cpp b/modules/text/samples/dictnet_demo.cpp
@@ -0,0 +1,61 @@
+/*
+ * dictnet_demo.cpp
+ *
+ * Demonstrates simple use of the holistic word classifier in C++
+ *
+ * Created on: June 26, 2016
+ *     Author: Anguelos Nicolaou <anguelos.nicolaou AT gmail.com>
+ */
+
+#include  "opencv2/text.hpp"
+#include  "opencv2/highgui.hpp"
+#include  "opencv2/imgproc.hpp"
+
+#include  <sstream>
+#include  <iostream>
+
+using namespace std;
+using namespace cv;
+using namespace cv::text;
+
+/** Prints what the demo does, its usage line and how to obtain the Caffe model files. */
+inline void printHelp()
+{
+    cout << "    Demo of wordspotting CNN for text recognition." << endl
+         << "    Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015" << endl << endl;
+    cout << "    Usage: program <input_image>" << endl
+         << "    Caffe Model files  (dictnet_vgg.caffemodel, dictnet_vgg_deploy.prototxt, dictnet_vgg_labels.txt)" << endl
+         << "      must be in the current directory." << endl << endl;
+    // Download instructions for the three model files named above.
+    cout << "    Obtaining Caffe Model files in linux shell:" << endl
+         << "    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg.caffemodel" << endl
+         << "    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_deploy.prototxt" << endl
+         << "    wget http://nicolaou.homouniversalis.org/assets/vgg_text/dictnet_vgg_labels.txt" << endl << endl;
+}
+
+int main(int argc, const char * argv[])
+{
+    // Exactly one argument is expected: the path of the image to recognize.
+    if (argc != 2)
+    {
+        printHelp();
+        exit(1);
+    }
+    const char * const imagePath = argv[1];
+    Mat input = imread(imagePath, IMREAD_GRAYSCALE); // read as a grayscale image
+    cout << "Read image (" << imagePath << "): " << input.size
+         << ", channels: " << input.channels() << ", depth: " << input.depth() << endl;
+    if (input.empty())
+    {
+        printHelp();
+        exit(1);
+    }
+
+    // Relative file names: the model files must be in the current directory.
+    Ptr<OCRHolisticWordRecognizer> recognizer = OCRHolisticWordRecognizer::create(
+        "dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt");
+    std::string bestWord;
+    vector<float> confidences; // the recognizer stores a single confidence value here
+    recognizer->run(input, bestWord, NULL, NULL, &confidences);
+    cout << "Detected word: '" << bestWord << "', confidence: " << confidences[0] << endl;
+}
diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp
@@ -0,0 +1,102 @@
+#include "precomp.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/core.hpp"
+#include "opencv2/dnn.hpp"
+
+#include <fstream>
+
+using namespace std;
+
+namespace cv { namespace text {
+
+/** OCRHolisticWordRecognizer implementation: a dnn::Net classifies the image and the best class index selects the word. */
+class OCRHolisticWordRecognizerImpl : public OCRHolisticWordRecognizer
+{
+private:
+    dnn::Net net;         // network created by dnn::readNetFromCaffe in the constructor
+    vector<string> words; // vocabulary: words[i] is the text returned for class index i
+
+public:
+    /** Loads the network and the vocabulary (one word per line of wordsFilename); fails through
+     *  CV_Error / CV_Assert if the file cannot be opened or its word count is not the class count. */
+    OCRHolisticWordRecognizerImpl(const string &archFilename, const string &weightsFilename, const string &wordsFilename)
+    {
+        net = dnn::readNetFromCaffe(archFilename, weightsFilename);
+        std::ifstream in(wordsFilename.c_str());
+        if (!in)
+            CV_Error(Error::StsError, "Could not read Labels from file");
+        std::string line;
+        while (std::getline(in, line))
+        {
+            if (!line.empty() && line[line.size() - 1] == '\r')
+                line.erase(line.size() - 1); // drop the '\r' that CRLF line endings leave behind
+            words.push_back(line);
+        }
+        CV_Assert(getClassCount() == words.size());
+    }
+
+    /** Recognizes the word in image. Every non-NULL component_* vector receives exactly one element describing the whole image. */
+    void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, int component_level=OCR_LEVEL_WORD)
+    {
+        CV_Assert(component_level==OCR_LEVEL_WORD); //Components not applicable for word spotting
+        double confidence = 0;
+        output_text = classify(image, confidence);
+        if (component_rects != NULL)
+            component_rects->assign(1, Rect(0, 0, image.size().width, image.size().height));
+        if (component_texts != NULL)
+            component_texts->assign(1, output_text);
+        if (component_confidences != NULL)
+            component_confidences->assign(1, float(confidence));
+    }
+
+    /** Same as above; the mask is not used for recognition because the CNN operates on a full image, only its size is checked. */
+    void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL, std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL, int component_level=OCR_LEVEL_WORD)
+    {
+        CV_Assert(mask.cols == image.cols && mask.rows == image.rows);
+        this->run(image, output_text, component_rects, component_texts, component_confidences, component_level);
+    }
+
+protected:
+    /** Size (width 100, height 32) every input image is resized to before it is passed to the network. */
+    Size getPerceptiveField() const { return Size(100, 32); }
+
+    /** Number of classes: second dimension of the "prob" layer output shape for a 1 x 1 x height x width input of perceptive field size. */
+    size_t getClassCount()
+    {
+        int id = net.getLayerId("prob");
+        dnn::MatShape inputShape;
+        inputShape.push_back(1);
+        inputShape.push_back(1);
+        inputShape.push_back(getPerceptiveField().height);
+        inputShape.push_back(getPerceptiveField().width);
+        vector<dnn::MatShape> inShapes, outShapes;
+        net.getLayerShapes(inputShape, id, inShapes, outShapes);
+        CV_Assert(outShapes.size() == 1 && outShapes[0].size() == 4);
+        CV_Assert(outShapes[0][0] == 1 && outShapes[0][2] == 1 && outShapes[0][3] == 1);
+        return outShapes[0][1];
+    }
+
+    /** Classifies a single-channel CV_8U image: returns the word of the class with the maximal "prob" value, which is stored in conf. */
+    string classify(InputArray image, double & conf)
+    {
+        CV_Assert(image.channels() == 1 && image.depth() == CV_8U);
+        Mat resized;
+        resize(image, resized, getPerceptiveField());
+        Mat blob = dnn::blobFromImage(resized);
+        net.setInput(blob, "data");
+        Mat prob = net.forward("prob");
+        // words.size() equals the class count (asserted in the constructor), so getClassCount() need not be re-run for every image.
+        CV_Assert(prob.dims == 4 && !prob.empty() && prob.size[1] == (int)words.size());
+        int idx[4] = {0};
+        minMaxIdx(prob, 0, &conf, 0, idx);
+        CV_Assert(0 <= idx[1] && idx[1] < (int)words.size());
+        return words[idx[1]];
+    }
+};
+
+Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(const string &archFilename, const string &weightsFilename, const string &wordsFilename)
+{ // Factory: the OCRHolisticWordRecognizerImpl constructor loads the network and the word list from the given files.
+    return makePtr<OCRHolisticWordRecognizerImpl>(archFilename, weightsFilename, wordsFilename);
+}
+
+}} // cv::text::
diff --git a/modules/text/text_config.hpp.in b/modules/text/text_config.hpp.in
@@ -4,4 +4,4 @@
 // HAVE OCR Tesseract
 #cmakedefine HAVE_TESSERACT
 
-#endif
+#endif