diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt
index 98a332bd7bf..b58fd41cf1d 100644
--- a/modules/text/CMakeLists.txt
+++ b/modules/text/CMakeLists.txt
@@ -1,24 +1,84 @@
 set(the_description "Text Detection and Recognition")
-ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d OPTIONAL opencv_highgui WRAP python java)
-
-if(NOT CMAKE_CROSSCOMPILING OR OPENCV_FIND_TESSERACT)
-  set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
-  find_package(Tesseract QUIET)
-  if(Tesseract_FOUND)
-    message(STATUS "Tesseract: YES")
-    set(HAVE_TESSERACT 1)
-    ocv_include_directories(${Tesseract_INCLUDE_DIR})
-    ocv_target_link_libraries(${the_module} ${Tesseract_LIBRARIES})
-  else()
-    message(STATUS "Tesseract: NO")
-  endif()
+
+if(POLICY CMP0023)
+  message(STATUS "Explicitly setting policy CMP0023 to OLD")
+  cmake_policy(SET CMP0023 OLD)
+endif(POLICY CMP0023)
+
+# Using cmake scripts and modules shipped with this module
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR})
+
+set(TEXT_DEPS opencv_ml opencv_highgui opencv_imgproc opencv_core opencv_features2d opencv_calib3d)
+
+find_package(Caffe)
+if(Caffe_FOUND)
+  message(STATUS "Caffe: YES")
+  set(HAVE_CAFFE 1)
+else()
+  message(STATUS "Caffe: NO")
+# list(APPEND TEXT_DEPS opencv_dnn)
+endif()
+
+# internal dependencies
+find_package(Protobuf)
+if(Protobuf_FOUND)
+  message(STATUS "Protobuf: YES")
+  set(HAVE_PROTOBUF 1)
+else()
+  message(STATUS "Protobuf: NO")
+endif()
+
+find_package(Glog)
+if(Glog_FOUND)
+  message(STATUS "Glog: YES")
+  set(HAVE_GLOG 1)
+else()
+  message(STATUS "Glog: NO")
+endif()
-endif()
 
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in
-               ${CMAKE_BINARY_DIR}/text_config.hpp @ONLY)
+ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_calib3d OPTIONAL opencv_dnn WRAP python)
+#ocv_define_module(text ${TEXT_DEPS} WRAP python)
+
+find_package(Tesseract)
+if(Tesseract_FOUND)
+  message(STATUS "Tesseract: YES")
+  include_directories(${Tesseract_INCLUDE_DIR})
+  target_link_libraries(opencv_text ${Tesseract_LIBS})
+  add_definitions(-DHAVE_TESSERACT)
+else()
+  message(STATUS "Tesseract: NO")
+endif()
+
+if(HAVE_CAFFE AND HAVE_GLOG AND HAVE_PROTOBUF)
+  include_directories(${Caffe_INCLUDE_DIR})
+  find_package(HDF5 COMPONENTS HL REQUIRED)
+  include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR})
+  list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES})
+  find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem)
+  include_directories(SYSTEM ${Boost_INCLUDE_DIR})
+  include_directories(SYSTEM ${CUDA_INCLUDE_DIR})
+  link_directories(SYSTEM ${CUDA_LIBS})
+  list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES})
+  target_link_libraries(opencv_text atlas blas ${Caffe_LIBS} ${Glog_LIBS} ${Protobuf_LIBS} ${HDF5_LIBRARIES} ${Boost_LIBRARIES})
+  add_definitions(-DHAVE_CAFFE)
+endif() # HAVE_CAFFE
 
-ocv_include_directories(${CMAKE_CURRENT_BINARY_DIR})
+message(STATUS "TEXT CAFFE SEARCH")
+if(NOT (HAVE_CAFFE AND HAVE_opencv_dnn))
+  message(STATUS "TEXT NO CAFFE CONFLICT")
+else()
+  message(STATUS "TEXT CAFFE CONFLICT")
+endif()
 
-ocv_add_testdata(samples/
contrib/text
-    FILES_MATCHING PATTERN "*.xml" PATTERN "*.xml.gz" REGEX "scenetext[0-9]+.jpg"
-)
+if(HAVE_opencv_dnn)
+  message(STATUS "dnn module found")
+  add_definitions(-DHAVE_DNN)
+  set(HAVE_DNN 1)
+else()
+  message(STATUS "dnn module not found")
+endif()
diff --git a/modules/text/FindCaffe.cmake b/modules/text/FindCaffe.cmake
new file mode 100644
index 00000000000..12948f62992
--- /dev/null
+++ b/modules/text/FindCaffe.cmake
@@ -0,0 +1,14 @@
+# Caffe package for CNN Triplet training
+unset(Caffe_FOUND)
+
+find_path(Caffe_INCLUDE_DIR NAMES caffe/caffe.hpp caffe/common.hpp caffe/net.hpp caffe/proto/caffe.pb.h caffe/util/io.hpp caffe/vision_layers.hpp
+          HINTS
+          /usr/local/include)
+
+find_library(Caffe_LIBS NAMES caffe
+             HINTS
+             /usr/local/lib)
+
+if(Caffe_LIBS AND Caffe_INCLUDE_DIR)
+  set(Caffe_FOUND 1)
+endif()
diff --git a/modules/text/FindGlog.cmake b/modules/text/FindGlog.cmake
new file mode 100755
index 00000000000..c30e9f4a6ab
--- /dev/null
+++ b/modules/text/FindGlog.cmake
@@ -0,0 +1,10 @@
+# Required for Caffe
+unset(Glog_FOUND)
+
+find_library(Glog_LIBS NAMES glog
+             HINTS
+             /usr/local/lib)
+
+if(Glog_LIBS)
+  set(Glog_FOUND 1)
+endif()
diff --git a/modules/text/FindProtobuf.cmake b/modules/text/FindProtobuf.cmake
new file mode 100644
index 00000000000..6d0ad56a1f7
--- /dev/null
+++ b/modules/text/FindProtobuf.cmake
@@ -0,0 +1,10 @@
+# Protobuf package required for Caffe
+unset(Protobuf_FOUND)
+
+find_library(Protobuf_LIBS NAMES protobuf
+             HINTS
+             /usr/local/lib)
+
+if(Protobuf_LIBS)
+  set(Protobuf_FOUND 1)
+endif()
diff --git a/modules/text/FindTesseract.cmake b/modules/text/FindTesseract.cmake
new file mode 100644
index 00000000000..01835e61bc7
--- /dev/null
+++ b/modules/text/FindTesseract.cmake
@@ -0,0 +1,22 @@
+# Tesseract OCR
+unset(Tesseract_FOUND)
+
+find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h
+          HINTS
+          /usr/include
+          /usr/local/include)
+
+find_library(Tesseract_LIBRARY NAMES tesseract
+             HINTS
+             /usr/lib
+             /usr/local/lib)
+
+find_library(Lept_LIBRARY NAMES lept
+             HINTS
+             /usr/lib
+             /usr/local/lib)
+
+set(Tesseract_LIBS ${Tesseract_LIBRARY} ${Lept_LIBRARY})
+if(Tesseract_LIBS AND Tesseract_INCLUDE_DIR)
+  set(Tesseract_FOUND 1)
+endif()
diff --git a/modules/text/README.md b/modules/text/README.md
index 488518a28de..fd33980e80e 100644
--- a/modules/text/README.md
+++ b/modules/text/README.md
@@ -47,3 +47,83 @@ Notes
 2. Tesseract configure script may fail to detect leptonica, so you may have to edit the configure script - comment off some if's around this message and retain only "then" branch.
 
 3. You are encouraged to search the Net for some better pre-trained classifiers, as well as classifiers for other languages.
+
+
+Text Detection CNN
+==================
+
+Intro
+-----
+
+The text module now provides text detection and recognition based on deep CNNs. The text detector is a deep CNN that takes an image which may contain multiple words. It outputs a list of Rects, each with a bounding box and a probability that it contains text. The text recognizer provides a probability over a given vocabulary for each of these Rects.
+
+Two backends are supported: 1) Caffe and 2) opencv-dnn.
+
+
+Installation of the Caffe backend
+---------------------------------
+* Please note that a custom Caffe based on the SSD branch is required; the link to this custom Caffe is provided below.
+The Caffe wrapping backend has the same requirements as Caffe itself.
+* Caffe can be built against OpenCV; if the Caffe backend is enabled, a circular dependency arises.
+The simplest solution is to build Caffe without OpenCV support.
+* Only the operating systems supported by Caffe are supported by the backend.
+The scripts in this module were developed on Ubuntu 16.04 and assume such a system.
+Adapting them to other UNIX systems, including OSX, should be easy.
+
+Sample script for building Caffe:
+
+```bash
+#!/bin/bash
+SRCROOT="${HOME}/caffe_inst/"
+mkdir -p "$SRCROOT"
+cd "$SRCROOT"
+git clone https://github.com/sghoshcvc/TextBoxes.git
+cd TextBoxes
+cat Makefile.config.example > Makefile.config
+echo 'USE_OPENCV := 0' >> Makefile.config
+echo 'INCLUDE_DIRS += /usr/include/hdf5/serial/' >> Makefile.config
+echo 'LIBRARY_DIRS += /usr/lib/x86_64-linux-gnu/hdf5/serial/' >> Makefile.config
+
+
+echo "--- /tmp/caffe/include/caffe/net.hpp	2017-05-28 04:55:47.929623902 +0200
++++ caffe/distribute/include/caffe/net.hpp	2017-05-28 04:51:33.437090768 +0200
+@@ -234,6 +234,7 @@
+
+   template <typename T>
+   friend class Net;
++  virtual ~Callback(){}
+  };
+  const vector<Callback*>& before_forward() const { return before_forward_; }
+  void add_before_forward(Callback* value) {
+">/tmp/cleanup_caffe.diff
+
+patch < /tmp/cleanup_caffe.diff
+
+
+make -j 6
+
+make pycaffe
+
+make distribute
+```
+
+Sample script for building OpenCV with the Caffe backend enabled:
+
+```bash
+#!/bin/bash
+cd $OPENCV_BUILD_DIR #You must set this
+CAFFEROOT="${HOME}/caffe_inst/"  #If you used the previous code to compile Caffe in Ubuntu 16.04
+
+cmake -DCaffe_LIBS:FILEPATH="$CAFFEROOT/caffe/distribute/lib/libcaffe.so" -DBUILD_opencv_ts:BOOL="0" -DBUILD_opencv_dnn:BOOL="0" -DBUILD_opencv_dnn_modern:BOOL="0" -DCaffe_INCLUDE_DIR:PATH="$CAFFEROOT/caffe/distribute/include" -DWITH_MATLAB:BOOL="0" -DBUILD_opencv_cudabgsegm:BOOL="0" -DWITH_QT:BOOL="1" -DBUILD_opencv_cudaoptflow:BOOL="0" -DBUILD_opencv_cudastereo:BOOL="0" -DBUILD_opencv_cudafilters:BOOL="0" -DBUILD_opencv_cudev:BOOL="1" -DOPENCV_EXTRA_MODULES_PATH:PATH="$OPENCV_CONTRIB/modules" ./
+```
+
+where $OPENCV_CONTRIB is the root directory containing the opencv_contrib modules.
+
+Installation of the opencv-dnn backend
+--------------------------------------
+
+The opencv-dnn backend does not need any additional libraries.
+
+OpenCV 3.3.0 or later needs to be built with the extra modules (opencv_contrib) for the text module to be available.
diff --git a/modules/text/include/opencv2/text.hpp b/modules/text/include/opencv2/text.hpp
index cea49c69c0f..85b8b741982 100644
--- a/modules/text/include/opencv2/text.hpp
+++ b/modules/text/include/opencv2/text.hpp
@@ -41,6 +41,7 @@ the use of this software, even if advised of the possibility of such damage.
 #include "opencv2/text/erfilter.hpp"
 #include "opencv2/text/ocr.hpp"
+#include "opencv2/text/textDetector.hpp"
 
 /** @defgroup text Scene Text Detection and Recognition
 
@@ -92,7 +93,7 @@ grouping horizontally aligned text, and the method proposed by Lluis Gomez and D
 in @cite Gomez13 @cite Gomez14 for grouping arbitrary oriented text (see erGrouping).
 
 To see the text detector at work, have a look at the textdetection demo:
-
+
 
     @defgroup text_recognize Scene Text Recognition
 @}
diff --git a/modules/text/include/opencv2/text/erfilter.hpp b/modules/text/include/opencv2/text/erfilter.hpp
index c9bac2b3272..2bd1c56a356 100644
--- a/modules/text/include/opencv2/text/erfilter.hpp
+++ b/modules/text/include/opencv2/text/erfilter.hpp
@@ -65,6 +65,7 @@ component tree of the image. :
 */
 struct CV_EXPORTS ERStat
 {
+ public:
     //!
Constructor explicit ERStat(int level = 256, int pixel = 0, int x = 0, int y = 0); diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index 1ba37f03a28..df9c2b4aa59 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -46,6 +46,10 @@ #include #include +#include +#include + + namespace cv { @@ -87,61 +91,100 @@ enum ocr_engine_mode }; //base class BaseOCR declares a common API that would be used in a typical text recognition scenario + class CV_EXPORTS_W BaseOCR { -public: + public: virtual ~BaseOCR() {}; - virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + + virtual void run(Mat& image, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0) = 0; - virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + + virtual void run(Mat& image, Mat& mask, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0) = 0; + + /** @brief Main functionality of the OCR Hierarchy. Subclasses provide + * default parameters for all parameters other than the input image. + */ + virtual String run(InputArray image){ + std::string res; + std::vector component_rects; + std::vector component_confidences; + std::vector component_texts; + Mat inputImage=image.getMat(); + this->run(inputImage,res,&component_rects,&component_texts, + &component_confidences,OCR_LEVEL_WORD); + return res; + } + }; -/** @brief OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++. +/** @brief OCRTesseract class provides an interface with the tesseract-ocr API + * (v3.02.02) in C++. Notice that it is compiled only when tesseract-ocr is correctly installed. @note - - (C++) An example of OCRTesseract recognition combined with scene text detection can be found - at the end_to_end_recognition demo: - - - (C++) Another example of OCRTesseract recognition combined with scene text detection can be - found at the webcam_demo: - + - (C++) An example of OCRTesseract recognition combined with scene text + detection can be found at the end_to_end_recognition demo: + + - (C++) Another example of OCRTesseract recognition combined with scene + text detection can be found at the webcam_demo: + */ class CV_EXPORTS_W OCRTesseract : public BaseOCR { public: /** @brief Recognize text using the tesseract-ocr API. - Takes image on input and returns recognized text in the output_text parameter. Optionally - provides also the Rects for individual text elements found (e.g. words), and the list of those - text elements with their confidence values. + Takes image on input and returns recognized text in the output_text + parameter. Optionally provides also the Rects for individual text elements + found (e.g. words), and the list of those text elements with their + confidence values. @param image Input image CV_8UC1 or CV_8UC3 + @param output_text Output text of the tesseract-ocr. - @param component_rects If provided the method will output a list of Rects for the individual - text elements found (e.g. words or text lines). 
- @param component_texts If provided the method will output a list of text strings for the - recognition of individual text elements found (e.g. words or text lines). - @param component_confidences If provided the method will output a list of confidence values - for the recognition of individual text elements found (e.g. words or text lines). - @param component_level OCR_LEVEL_WORD (by default), or OCR_LEVEL_TEXTLINE. + + @param component_rects If provided the method will output a list of Rects + for the individual text elements found (e.g. words or text lines). + + @param component_texts If provided the method will output a list of text + strings for the recognition of individual text elements found (e.g. words or + text lines). + + @param component_confidences If provided the method will output a list of + confidence values for the recognition of individual text elements found + (e.g. words or text lines). + + @param component_level OCR_LEVEL_WORD (by default), or OCR_LEVEL_TEXT_LINE. + */ - virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + using BaseOCR::run; + virtual void run (Mat& image, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0); - virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=0); + virtual void run (Mat& image, Mat& mask, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, + int component_level=0); // aliases for scripting - CV_WRAP String run(InputArray image, int min_confidence, int component_level=0); + CV_WRAP String run (InputArray image, int min_confidence, + int component_level=0); - CV_WRAP String run(InputArray image, InputArray mask, int min_confidence, int component_level=0); + CV_WRAP String run(InputArray image, InputArray mask, + int min_confidence, int component_level=0); CV_WRAP virtual void setWhiteList(const String& char_whitelist) = 0; @@ -162,6 +205,7 @@ class CV_EXPORTS_W OCRTesseract : public BaseOCR */ CV_WRAP static Ptr create(const char* datapath=NULL, const char* language=NULL, const char* char_whitelist=NULL, int oem=OEM_DEFAULT, int psmode=PSM_AUTO); + }; @@ -181,19 +225,19 @@ enum classifier_type /** @brief OCRHMMDecoder class provides an interface for OCR using Hidden Markov Models. -@note - - (C++) An example on using OCRHMMDecoder recognition combined with scene text detection can - be found at the webcam_demo sample: - + + * @note + * - (C++) An example on using OCRHMMDecoder recognition combined with scene + * text detection can be found at the webcam_demo sample: + * */ -class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR -{ -public: +class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR { + public: /** @brief Callback with the character classifier is made a class. - This way it hides the feature extractor and the classifier itself, so developers can write - their own OCR code. + * This way it hides the feature extractor and the classifier itself, so + * developers can write their own OCR code. 
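+     *
+     * A minimal sketch of a custom callback (a hypothetical SingleClassCallback,
+     * shown only to illustrate the interface; a real implementation would run an
+     * actual character classifier inside eval):
+     * @code
+     *     class SingleClassCallback : public cv::text::OCRHMMDecoder::ClassifierCallback {
+     *     public:
+     *         void eval(cv::InputArray image, std::vector<int>& out_class,
+     *                   std::vector<double>& out_confidence)
+     *         {
+     *             (void)image; // a real classifier would extract features here
+     *             out_class.clear(); out_confidence.clear();
+     *             out_class.push_back(0);        // always report the first class
+     *             out_confidence.push_back(1.0); // with full confidence
+     *         }
+     *     };
+     * @endcode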
The default character classifier and feature extractor can be loaded using the utility function loadOCRHMMClassifierNM and KNN model provided in @@ -202,92 +246,120 @@ class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR class CV_EXPORTS_W ClassifierCallback { public: + virtual ~ClassifierCallback() { } - /** @brief The character classifier must return a (ranked list of) class(es) id('s) + /** @brief The character classifier must return a (ranked list of) + * class(es) id('s) - @param image Input image CV_8UC1 or CV_8UC3 with a single letter. - @param out_class The classifier returns the character class categorical label, or list of - class labels, to which the input image corresponds. - @param out_confidence The classifier returns the probability of the input image - corresponding to each classes in out_class. + * @param image Input image CV_8UC1 or CV_8UC3 with a single letter. + * @param out_class The classifier returns the character class + * categorical label, or list of class labels, to which the input image + * corresponds. + + * @param out_confidence The classifier returns the probability of the + * input image corresponding to each classes in out_class. */ - virtual void eval( InputArray image, std::vector& out_class, std::vector& out_confidence); + virtual void eval (InputArray image, std::vector& out_class, + std::vector& out_confidence); }; -public: /** @brief Recognize text using HMM. - Takes binary image on input and returns recognized text in the output_text parameter. Optionally - provides also the Rects for individual text elements found (e.g. words), and the list of those - text elements with their confidence values. + * Takes binary image on input and returns recognized text in the output_text + * parameter. Optionally provides also the Rects for individual text elements + * found (e.g. words), and the list of those text elements with their + * confidence values. - @param image Input binary image CV_8UC1 with a single text line (or word). + * @param image Input binary image CV_8UC1 with a single text line (or word). - @param output_text Output text. Most likely character sequence found by the HMM decoder. + * @param output_text Output text. Most likely character sequence found by + * the HMM decoder. - @param component_rects If provided the method will output a list of Rects for the individual - text elements found (e.g. words). + * @param component_rects If provided the method will output a list of Rects + * for the individual text elements found (e.g. words). - @param component_texts If provided the method will output a list of text strings for the - recognition of individual text elements found (e.g. words). + * @param component_texts If provided the method will output a list of text + * strings for the recognition of individual text elements found (e.g. words). - @param component_confidences If provided the method will output a list of confidence values - for the recognition of individual text elements found (e.g. words). + * @param component_confidences If provided the method will output a list of + * confidence values for the recognition of individual text elements found + * (e.g. words). - @param component_level Only OCR_LEVEL_WORD is supported. - */ - virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=0); + * @param component_level Only OCR_LEVEL_WORD is supported. 
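+
+     * A minimal usage sketch, assuming a classifier trained for a 26-symbol
+     * vocabulary (the .xml.gz file name is taken from the module sample data and
+     * is an assumption here; the emission matrix is set to identity, as in the
+     * module samples):
+     * @code
+     *     std::string voc = "abcdefghijklmnopqrstuvwxyz";
+     *     std::vector<std::string> lexicon; // words expected in the image
+     *     lexicon.push_back("opencv");
+     *     cv::Mat transition_p;
+     *     cv::text::createOCRHMMTransitionsTable(voc, lexicon, transition_p);
+     *     cv::Mat emission_p = cv::Mat::eye(26, 26, CV_64FC1);
+     *     cv::Ptr<cv::text::OCRHMMDecoder> ocr = cv::text::OCRHMMDecoder::create(
+     *         cv::text::loadOCRHMMClassifierNM("OCRHMM_knn_model_data.xml.gz"),
+     *         voc, transition_p, emission_p);
+     *     std::string text;
+     *     ocr->run(binary_word_image, text); // binary_word_image: a CV_8UC1 cv::Mat
+     * @endcode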
+ */ + using BaseOCR::run; + virtual void run (Mat& image, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, + int component_level=0); /** @brief Recognize text using HMM. - Takes an image and a mask (where each connected component corresponds to a segmented character) - on input and returns recognized text in the output_text parameter. Optionally - provides also the Rects for individual text elements found (e.g. words), and the list of those - text elements with their confidence values. + * Takes an image and a mask (where each connected component corresponds to a + * segmented character) on input and returns recognized text in the + * output_text parameter. Optionally provides also the Rects for individual + * text elements found (e.g. words), and the list of those text elements with + * their confidence values. - @param image Input image CV_8UC1 or CV_8UC3 with a single text line (or word). - @param mask Input binary image CV_8UC1 same size as input image. Each connected component in mask corresponds to a segmented character in the input image. + * @param image Input image CV_8UC1 or CV_8UC3 with a single text line + * (or word). - @param output_text Output text. Most likely character sequence found by the HMM decoder. + * @param mask Input binary image CV_8UC1 same size as input image. Each + * connected component in mask corresponds to a segmented character in the + * input image. - @param component_rects If provided the method will output a list of Rects for the individual - text elements found (e.g. words). + * @param output_text Output text. Most likely character sequence found by + * the HMM decoder. - @param component_texts If provided the method will output a list of text strings for the - recognition of individual text elements found (e.g. words). + * @param component_rects If provided the method will output a list of Rects + * for the individual text elements found (e.g. words). - @param component_confidences If provided the method will output a list of confidence values - for the recognition of individual text elements found (e.g. words). + * @param component_texts If provided the method will output a list of text + * strings for the recognition of individual text elements found (e.g. words). - @param component_level Only OCR_LEVEL_WORD is supported. - */ - virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + * @param component_confidences If provided the method will output a list of + * confidence values for the recognition of individual text elements found + * (e.g. words). + + * @param component_level Only OCR_LEVEL_WORD is supported. + */ + virtual void run(Mat& image, Mat& mask, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0); // aliases for scripting - CV_WRAP String run(InputArray image, int min_confidence, int component_level=0); + CV_WRAP String run(InputArray image, + int min_confidence, + int component_level=0); - CV_WRAP String run(InputArray image, InputArray mask, int min_confidence, int component_level=0); + CV_WRAP String run(InputArray image, + InputArray mask, + int min_confidence, + int component_level=0); - /** @brief Creates an instance of the OCRHMMDecoder class. Initializes HMMDecoder. 
+ /** @brief Creates an instance of the OCRHMMDecoder class. Initializes + * HMMDecoder. - @param classifier The character classifier with built in feature extractor. + * @param classifier The character classifier with built in feature + * extractor. - @param vocabulary The language vocabulary (chars when ascii english text). vocabulary.size() - must be equal to the number of classes of the classifier. + * @param vocabulary The language vocabulary (chars when ascii english text) + * . vocabulary.size() must be equal to the number of classes of the + * classifier. - @param transition_probabilities_table Table with transition probabilities between character - pairs. cols == rows == vocabulary.size(). + * @param transition_probabilities_table Table with transition probabilities + * between character pairs. cols == rows == vocabulary.size(). - @param emission_probabilities_table Table with observation emission probabilities. cols == - rows == vocabulary.size(). + * @param emission_probabilities_table Table with observation emission + * probabilities. cols == rows == vocabulary.size(). - @param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available for the moment - (). + * @param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available + * for the moment (). */ + static Ptr create(const Ptr classifier,// The character classifier with built in feature extractor const std::string& vocabulary, // The language vocabulary (chars when ASCII English text) // size() must be equal to the number of classes @@ -330,9 +402,11 @@ class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR decoder_mode mode; }; -/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object. +/** @brief Allow to implicitly load the default character classifier when + * creating an OCRHMMDecoder object. + + @param filename The XML or YAML file with the classifier model (e.g.OCRHMM_knn_model_data.xml) -@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml) The KNN default classifier is based in the scene text recognition method proposed by Lukás Neumann & Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a @@ -342,11 +416,16 @@ using a KNN model trained with synthetic data of rendered characters with differ types. @deprecated loadOCRHMMClassifier instead + */ +CV_EXPORTS_W Ptr loadOCRHMMClassifierNM ( + const String& filename); -CV_EXPORTS_W Ptr loadOCRHMMClassifierNM(const String& filename); +/** @brief Allow to implicitly load the default character classifier when + * creating an OCRHMMDecoder object. + + @param filename The XML or YAML file with the classifier model (e.g.OCRBeamSearch_CNN_model_data.xml.gz) -/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object. @param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz) @@ -356,8 +435,10 @@ a linear classifier. It is applied to the input image in a sliding window fashio at each window location. @deprecated use loadOCRHMMClassifier instead + */ -CV_EXPORTS_W Ptr loadOCRHMMClassifierCNN(const String& filename); +CV_EXPORTS_W Ptr loadOCRHMMClassifierCNN ( + const String& filename); /** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object. @@ -369,49 +450,64 @@ CV_EXPORTS_W Ptr loadOCRHMMClassifierCNN(cons CV_EXPORTS_W Ptr loadOCRHMMClassifier(const String& filename, int classifier); //! 
@} + /** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon). * * @param vocabulary The language vocabulary (chars when ASCII English text). * * @param lexicon The list of words that are expected to be found in a particular image. - * - * @param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size(). - * - * The function calculate frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods. + + * @param transition_probabilities_table Output table with transition + * probabilities between character pairs. cols == rows == vocabulary.size(). + + * The function calculate frequency statistics of character pairs from the given + * lexicon and fills the output transition_probabilities_table with them. The + * transition_probabilities_table can be used as input in the + * OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods. * @note - * - (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) : - * + * - (C++) An alternative would be to load the default generic language + * transition table provided in the text module samples folder (created + * from ispell 42869 english words list) : + * **/ -CV_EXPORTS void createOCRHMMTransitionsTable(std::string& vocabulary, std::vector& lexicon, OutputArray transition_probabilities_table); - -CV_EXPORTS_W Mat createOCRHMMTransitionsTable(const String& vocabulary, std::vector& lexicon); +CV_EXPORTS void createOCRHMMTransitionsTable ( + std::string& vocabulary, std::vector& lexicon, + OutputArray transition_probabilities_table); +CV_EXPORTS_W Mat createOCRHMMTransitionsTable ( + const String& vocabulary, std::vector& lexicon); /* OCR BeamSearch Decoder */ -/** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam Search algorithm. +/** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam + * Search algorithm. @note - - (C++) An example on using OCRBeamSearchDecoder recognition combined with scene text detection can - be found at the demo sample: - + - (C++) An example on using OCRBeamSearchDecoder recognition combined with + scene text detection can be found at the demo sample: + */ -class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR -{ -public: + + +/* Forward declaration of class that can be used to generate an OCRBeamSearchDecoder::ClassifierCallbac */ +class TextImageClassifier; + +class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{ + + public: /** @brief Callback with the character classifier is made a class. - This way it hides the feature extractor and the classifier itself, so developers can write - their own OCR code. + * This way it hides the feature extractor and the classifier itself, so + * developers can write their own OCR code. - The default character classifier and feature extractor can be loaded using the utility funtion - loadOCRBeamSearchClassifierCNN with all its parameters provided in - . + * The default character classifier and feature extractor can be loaded + * using the utility funtion loadOCRBeamSearchClassifierCNN with all its + * parameters provided in + * . 
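+
+     * A minimal sketch of loading the default classifier and building a decoder
+     * with it (the .xml.gz file name is an assumption based on the sample data
+     * referenced above; vocabulary, transition_p and emission_p are assumed to be
+     * defined as described in OCRBeamSearchDecoder::create):
+     * @code
+     *     cv::Ptr<cv::text::OCRBeamSearchDecoder::ClassifierCallback> cb =
+     *         cv::text::loadOCRBeamSearchClassifierCNN("OCRBeamSearch_CNN_model_data.xml.gz");
+     *     cv::Ptr<cv::text::OCRBeamSearchDecoder> decoder =
+     *         cv::text::OCRBeamSearchDecoder::create(cb, vocabulary, transition_p, emission_p);
+     * @endcode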
*/ - class CV_EXPORTS_W ClassifierCallback - { - public: + class CV_EXPORTS_W ClassifierCallback{ + public: virtual ~ClassifierCallback() { } /** @brief The character classifier must return a (ranked list of) class(es) id('s) @@ -423,8 +519,8 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR */ virtual void eval( InputArray image, std::vector< std::vector >& recognition_probabilities, std::vector& oversegmentation ); - int getWindowSize() {return 0;} - int getStepSize() {return 0;} + virtual int getWindowSize() {return 0;} + virtual int getStepSize() {return 0;} }; public: @@ -449,6 +545,7 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR @param component_level Only OCR_LEVEL_WORD is supported. */ + using BaseOCR::run; virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, std::vector* component_texts=NULL, std::vector* component_confidences=NULL, int component_level=0); @@ -480,6 +577,7 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR @param beam_size Size of the beam in Beam Search algorithm. */ + static Ptr create(const Ptr classifier,// The character classifier with built in feature extractor const std::string& vocabulary, // The language vocabulary (chars when ASCII English text) // size() must be equal to the number of classes @@ -500,10 +598,29 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment) int beam_size = 500); // Size of the beam in Beam Search algorithm + + + + /** @brief Creates an instance of the OCRBeamSearchDecoder class. Initializes HMMDecoder from the specified path. @overload + @param filename path to a character classifier file + + @param vocabulary The language vocabulary (chars when ASCII English text). vocabulary.size() + must be equal to the number of classes of the classifier.. + + @param transition_probabilities_table Table with transition probabilities between character + pairs. cols == rows == vocabulary.size(). + + @param emission_probabilities_table Table with observation emission probabilities. cols == + rows == vocabulary.size(). + + @param mode HMM Decoding algorithm (only Viterbi for the moment) + + @param beam_size Size of the beam in Beam Search algorithm + */ CV_WRAP static Ptr create(const String& filename, // The character classifier file const String& vocabulary, // The language vocabulary (chars when ASCII English text) @@ -514,6 +631,7 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR // cols == rows == vocabulary.size() int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment) int beam_size = 500); + protected: Ptr classifier; @@ -538,6 +656,402 @@ CV_EXPORTS_W Ptr loadOCRBeamSearchClas //! 
@} -} -} + +//Classifiers should provide diferent backends + +enum{ + OCR_HOLISTIC_BACKEND_NONE, //No back end + OCR_HOLISTIC_BACKEND_DNN, // dnn backend opencv_dnn + OCR_HOLISTIC_BACKEND_CAFFE, // caffe based backend + OCR_HOLISTIC_BACKEND_DEFAULT // to store default value based on environment +}; + +class TextImageClassifier; + +/** + * @brief The ImagePreprocessor class + */ +class CV_EXPORTS_W ImagePreprocessor{ +protected: + virtual void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels)=0; + virtual void set_mean_(Mat){} + +public: + virtual ~ImagePreprocessor(){} + + /** @brief this method in provides public acces to the preprocessing with respect to a specific + * classifier + * + * This method's main use would be to use the preprocessor without feeding it to a classifier. + * Determining the exact behavior of a preprocessor is the main motivation for this. + * + * @param input an image without any constraints + * + * @param output in most cases an image of fixed depth size and whitened + * + * @param sz the size to which the image would be resize if the preprocessor resizes inputs + * + * @param outputChannels the number of channels for the output image + */ + CV_WRAP void preprocess(InputArray input,OutputArray output,Size sz,int outputChannels); + + /** @brief this method in provides public acces to set the mean of the input images + * mean can be a mat either of same size of the image or one value per color channel + * A preprocessor can be created without the mean( the pre processor will calculate mean for every image + * in that case + * + + * @param mean which will be subtracted from the images + * + */ + + CV_WRAP void set_mean(Mat mean); + + /** @brief Creates a functor that only resizes and changes the channels of the input + * without further processing. + * + * @return shared pointer to the generated preprocessor + */ + CV_WRAP static Ptr createResizer(); + + /** @brief + * + * @param sigma + * + * @return shared pointer to generated preprocessor + */ + CV_WRAP static Ptr createImageStandarizer(double sigma); + + /** @brief + * + * @return shared pointer to generated preprocessor + */ + CV_WRAP static Ptr createImageMeanSubtractor(InputArray meanImg); + /** @brief + * create a functor with the parameters, parameters can be changes by corresponding set functions + * @return shared pointer to generated preprocessor + */ + + CV_WRAP static PtrcreateImageCustomPreprocessor(double rawval=1.0,String channel_order="BGR"); + + friend class TextImageClassifier; + +}; + +/** @brief Abstract class that implements the classifcation of text images. + * + * The interface is generic enough to describe any image classifier. And allows + * to take advantage of compouting in batches. While word classifiers are the default + * networks, any image classifers should work. + * + */ +class CV_EXPORTS_W TextImageClassifier +{ +protected: + Size inputGeometry_; + Size outputGeometry_; + int channelCount_; + Ptr preprocessor_; + /** @brief all image preprocessing is handled here including whitening etc. + * + * @param input the image to be preprocessed for the classifier. If the depth + * is CV_U8 values should be in [0,255] otherwise values are assumed to be in [0,1] + * + * @param output reference to the image to be fed to the classifier, the preprocessor will + * resize the image to the apropriate size and convert it to the apropriate depth\ + * + * The method preprocess should never be used externally, it is up to classify and classifyBatch + * methods to employ it. 
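+     *
+     * For standalone preprocessing, the public ImagePreprocessor::preprocess can
+     * be used instead; a small sketch (the output size and channel count are
+     * arbitrary example values):
+     * @code
+     *     cv::Ptr<cv::text::ImagePreprocessor> prep =
+     *         cv::text::ImagePreprocessor::createImageStandarizer(113);
+     *     cv::Mat whitened;
+     *     prep->preprocess(input, whitened, cv::Size(100, 32), 1); // input: any image
+     * @endcode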
+ */ + virtual void preprocess(const Mat& input,Mat& output); +public: + virtual ~TextImageClassifier() {} + + /** @brief + */ + CV_WRAP virtual void setPreprocessor(Ptr ptr); + + /** @brief + */ + CV_WRAP Ptr getPreprocessor(); + + /** @brief produces a class confidence row-vector given an image + */ + CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0; + + /** @brief produces a matrix containing class confidence row-vectors given an collection of images + */ + CV_WRAP virtual void classifyBatch(InputArrayOfArrays image, OutputArray classProbabilities) = 0; + + /** @brief simple getter method returning the number of channels each input sample has + */ + CV_WRAP virtual int getInputChannelCount(){return this->channelCount_;} + + /** @brief simple getter method returning the size of the input sample + */ + CV_WRAP virtual Size getInputSize(){return this->inputGeometry_;} + + /** @brief simple getter method returning the size of the oputput row-vector + */ + CV_WRAP virtual int getOutputSize()=0; + /** @brief simple getter method returning the shape of the oputput from caffe + */ + CV_WRAP virtual Size getOutputGeometry()=0; + + /** @brief simple getter method returning the size of the minibatches for this classifier. + * If not applicabe this method should return 1 + */ + CV_WRAP virtual int getMinibatchSize()=0; + + friend class ImagePreprocessor; +}; + + + +class CV_EXPORTS_W DeepCNN:public TextImageClassifier +{ + /** @brief Class that uses a pretrained caffe model for word classification. + * + * This network is described in detail in: + * Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015 + * http://arxiv.org/abs/1412.1842 + */ +public: + virtual ~DeepCNN() {}; + + /** @brief Constructs a DeepCNN object from a caffe pretrained model + * + * @param archFilename is the path to the prototxt file containing the deployment model architecture description. + * + * @param weightsFilename is the path to the pretrained weights of the model in binary fdorm. This file can be + * very large, up to 2GB. + * + * @param preprocessor is a pointer to the instance of a ImagePreprocessor implementing the preprocess_ protecteed method; + * + * @param minibatchSz the maximum number of samples that can processed in parallel. In practice this parameter + * has an effect only when computing in the GPU and should be set with respect to the memory available in the GPU. + * + * @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is + * the only option + */ + CV_WRAP static Ptr create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT); + + /** @brief Constructs a DeepCNN intended to be used for word spotting. + * + * This method loads a pretrained classifier and couples him with a preprocessor that standarises pixels with a + * deviation of 113. The architecture file can be downloaded from: + * + * While the weights can be downloaded from: + * + * The words assigned to the network outputs are available at: + * + * + * @param archFilename is the path to the prototxt file containing the deployment model architecture description. + * When employing OCR_HOLISTIC_BACKEND_CAFFE this is the path to the deploy ".prototxt". + * + * @param weightsFilename is the path to the pretrained weights of the model. When employing + * OCR_HOLISTIC_BACKEND_CAFFE this is the path to the ".caffemodel" file. 
This file can be very large, the
+     * pretrained DictNet uses 2GB.
+     *
+     * @param backEnd integer parameter selecting the computation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is
+     * the only option
+     */
+    CV_WRAP static Ptr<DeepCNN> createDictNet(String archFilename, String weightsFilename, int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT);
+
+};
+
+namespace cnn_config{
+
+/** @brief Runtime backend information
+ *
+ * This function reports the status of the backends compiled into this module.
+ *
+ * @return a list of backends (caffe, opencv-dnn etc.)
+ */
+CV_EXPORTS_W std::vector<std::string> getAvailableBackends();
+
+namespace caffe_backend{
+
+/** @brief Queries the computation device being used by Caffe
+ *
+ * Whether Caffe computes on the GPU or the CPU can only be controlled globally.
+ * This function queries the current state of Caffe.
+ * If the module is built without Caffe, this method throws an exception.
+ *
+ * @return true if Caffe is computing on the GPU, false if Caffe is computing on the CPU
+ */
+CV_EXPORTS_W bool getCaffeGpuMode();
+
+/** @brief Sets the computation device being used by Caffe
+ *
+ * Whether Caffe computes on the GPU or the CPU can only be controlled globally.
+ * This function sets the current state of Caffe.
+ * If the module is built without Caffe, this method throws an exception.
+ *
+ * @param useGpu set to true for Caffe to compute on the GPU, false for Caffe to
+ * compute on the CPU
+ */
+CV_EXPORTS_W void setCaffeGpuMode(bool useGpu);
+
+/** @brief Provides runtime information on whether Caffe support was compiled in.
+ *
+ * The text module API is the same regardless of whether Caffe was available
+ * during compilation. When methods that require Caffe are invoked while Caffe
+ * support is not compiled in, exceptions are thrown. This method allows testing
+ * at runtime whether the text module was built with Caffe.
+ *
+ * @return true if Caffe support for the text module was provided during compilation,
+ * false if Caffe was unavailable.
+ */
+CV_EXPORTS_W bool getCaffeAvailable();
+
+}//caffe_backend
+namespace dnn_backend {
+
+/** @brief Provides runtime information on whether the DNN module was compiled in.
+ *
+ * The text module API is the same regardless of whether the DNN module was
+ * available during compilation. When methods that require the backend are invoked
+ * while no backend support is compiled in, exceptions are thrown. This method
+ * allows testing at runtime whether the text module was built with the dnn backend.
+ *
+ * @return true if opencv_dnn support for the text module was provided during compilation,
+ * false if opencv_dnn was unavailable.
+ */
+CV_EXPORTS_W bool getDNNAvailable();
+
+}//dnn_backend
+}//cnn_config
+
+/** @brief OCRHolisticWordRecognizer class provides the functionality of segmented word spotting.
+ * Given a predefined vocabulary, a TextImageClassifier is employed to select the most probable
+ * word given an input image.
+ *
+ * This class implements the logic of providing transcriptions given a vocabulary and an image
+ * classifier. The classifier can be any TextImageClassifier, but the classifier for which this
+ * class was built is the DictNet.
In order to load it the following files should be downloaded: + + * + * + * + */ +class CV_EXPORTS_W OCRHolisticWordRecognizer : public BaseOCR +{ +public: + virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + int component_level=OCR_LEVEL_WORD)=0; + + /** @brief Recognize text using a segmentation based word-spotting/classifier cnn. + + Takes image on input and returns recognized text in the output_text parameter. Optionally + provides also the Rects for individual text elements found (e.g. words), and the list of those + text elements with their confidence values. + + @param image Input image CV_8UC1 or CV_8UC3 + + @param mask is totally ignored and is only available for compatibillity reasons + + @param output_text Output text of the the word spoting, always one that exists in the dictionary. + + @param component_rects Not applicable for word spotting can be be NULL if not, a single elemnt will + be put in the vector. + + @param component_texts Not applicable for word spotting can be be NULL if not, a single elemnt will + be put in the vector. + + @param component_confidences Not applicable for word spotting can be be NULL if not, a single elemnt will + be put in the vector. + + @param component_level must be OCR_LEVEL_WORD. + */ + + virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + int component_level=OCR_LEVEL_WORD)=0; + + + /** + @brief Method that provides a quick and simple interface to a single word image classifcation + + @param inputImage an image expected to be a CV_U8C1 or CV_U8C3 of any size assumed to contain a single word + + @param transcription an opencv string that will store the detected word transcription + + @param confidence a double that will be updated with the confidence the classifier has for the selected word + */ + CV_WRAP virtual void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)=0; + + /** + @brief Method that provides a quick and simple interface to a multiple word image classifcation taking advantage + the classifiers parallel capabilities. + + @param inputImageList an list of images expected to be a CV_U8C1 or CV_U8C3 each image can be of any size and is assumed + to contain a single word. + + @param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each + input image + + @param confidences a vector of double that will be updated with the confidence the classifier has for each of the + selected words. + */ + CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector& transcriptions,CV_OUT std::vector& confidences)=0; + + + /** + @brief simple getter for the vocabulary employed + */ + CV_WRAP virtual const std::vector& getVocabulary()=0; + + /** @brief simple getter for the preprocessing functor + */ + CV_WRAP virtual Ptr getClassifier()=0; + + /** @brief Creates an instance of the OCRHolisticWordRecognizer class. + + @param classifierPtr an instance of TextImageClassifier, normaly a DeepCNN instance + + @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line + in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize + of the classifier. 
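+
+    A typical end-to-end sketch (the DictNet file names are the ones the class
+    documentation above points to, and are assumptions here):
+    @code
+        cv::Ptr<cv::text::DeepCNN> cnn = cv::text::DeepCNN::createDictNet(
+            "dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel");
+        cv::Ptr<cv::text::OCRHolisticWordRecognizer> ws =
+            cv::text::OCRHolisticWordRecognizer::create(cnn, "dictnet_vgg_labels.txt");
+        cv::String word;
+        double confidence;
+        ws->recogniseImage(wordImage, word, confidence); // wordImage: a cropped word image
+    @endcode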
+ */ + CV_WRAP static Ptr create(Ptr classifierPtr,String vocabularyFilename); + + + /** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DeepCNN classifier. + + @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. + + @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. + + @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line + in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize + of the classifier. + */ + CV_WRAP static Ptr create(String modelArchFilename, String modelWeightsFilename, String vocabularyFilename); + + /** @brief + * + * @param classifierPtr + * + * @param vocabulary + */ + CV_WRAP static Ptr create(Ptr classifierPtr,const std::vector& vocabulary); + + /** @brief + * + * @param modelArchFilename + * + * @param modelWeightsFilename + * + * @param vocabulary + */ + CV_WRAP static Ptr create (String modelArchFilename, String modelWeightsFilename, const std::vector& vocabulary); +}; + + +}//namespace text +}//namespace cv + + #endif // _OPENCV_TEXT_OCR_HPP_ diff --git a/modules/text/include/opencv2/text/textDetector.hpp b/modules/text/include/opencv2/text/textDetector.hpp new file mode 100644 index 00000000000..eda74801449 --- /dev/null +++ b/modules/text/include/opencv2/text/textDetector.hpp @@ -0,0 +1,271 @@ +/*M////////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. 
+// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__ +#define __OPENCV_TEXT_TEXTDETECTOR_HPP__ + +#include +#include +#include +#include +#include"ocr.hpp" + + +namespace cv +{ +namespace text +{ + +//! @addtogroup text_detect +//! @{ + + + +//base class BaseDetector declares a common API that would be used in a typical text +//detection scenario +class CV_EXPORTS_W BaseDetector +{ +public: + virtual ~BaseDetector() {}; + + virtual void run(Mat& image, + std::vector* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=0) = 0; + + virtual void run(Mat& image, Mat& mask, + std::vector* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=0) = 0; + +}; +/** A virtual class for different models of text detection (including CNN based deep models) + */ + +class CV_EXPORTS_W TextRegionDetector +{ +protected: + /** Stores input and output size + */ + //netGeometry inputGeometry_; + //netGeometry outputGeometry_; + Size inputGeometry_; + Size outputGeometry_; + int inputChannelCount_; + int outputChannelCount_; + +public: + virtual ~TextRegionDetector() {} + + /** @brief produces a list of Bounding boxes and an estimate of text-ness confidence of Bounding Boxes + */ + CV_WRAP virtual void detect(InputArray image, OutputArray bboxProb ) = 0; + + + /** @brief simple getter method returning the size (height, width) of the input sample + */ + CV_WRAP virtual Size getInputGeometry(){return this->inputGeometry_;} + + /** @brief simple getter method returning the shape of the oputput + * Any text detector should output a number of text regions alongwith a score of text-ness + * From the shape it can be inferred the number of text regions and number of returned value + * for each region + */ + CV_WRAP virtual Size getOutputGeometry(){return this->outputGeometry_;} + + + +}; + +/** Generic structure of Deep CNN based Text Detectors + * */ +class CV_EXPORTS_W DeepCNNTextDetector : public TextRegionDetector +{ + /** @brief Class that uses a pretrained caffe model for text detection. + * Any text detection should + * This network is described in detail in: + * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network + * https://arxiv.org/abs/1611.06779 + */ +protected: + /** all deep CNN based text detectors have a preprocessor (normally) + */ + Ptr preprocessor_; + /** @brief all image preprocessing is handled here including whitening etc. + * + * @param input the image to be preprocessed for the classifier. If the depth + * is CV_U8 values should be in [0,255] otherwise values are assumed to be in [0,1] + * + * @param output reference to the image to be fed to the classifier, the preprocessor will + * resize the image to the apropriate size and convert it to the apropriate depth\ + * + * The method preprocess should never be used externally, it is up to classify and classifyBatch + * methods to employ it. 
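+     *
+     * Conceptually the preprocessing amounts to the following sketch (a
+     * simplified illustration, not the literal implementation; the real work is
+     * delegated to the attached ImagePreprocessor, and the channel means shown
+     * are assumed example values):
+     * @code
+     *     cv::Mat resized, floating, output;
+     *     cv::resize(input, resized, inputGeometry_);  // fixed network input size
+     *     resized.convertTo(floating, CV_32F);         // convert to network depth
+     *     cv::Scalar meanValue(104, 117, 123);         // assumed per-channel means
+     *     cv::subtract(floating, meanValue, output);   // mean subtraction
+     * @endcode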
+ */ + virtual void preprocess(const Mat& input,Mat& output); +public: + virtual ~DeepCNNTextDetector() {}; + + /** @brief Constructs a DeepCNNTextDetector object from a caffe pretrained model + * + * @param archFilename is the path to the prototxt file containing the deployment model architecture description. + * + * @param weightsFilename is the path to the pretrained weights of the model in binary fdorm. + * + * @param preprocessor is a pointer to the instance of a ImagePreprocessor implementing the preprocess_ protecteed method; + * + * @param minibatchSz the maximum number of samples that can processed in parallel. In practice this parameter + * has an effect only when computing in the GPU and should be set with respect to the memory available in the GPU. + * + * @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is + * the only option + */ + CV_WRAP static Ptr create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT); + + /** @brief Constructs a DeepCNNTextDetector intended to be used for text area detection. + * + * This method loads a pretrained classifier and couples with a preprocessor that preprocess the image with mean subtraction of () + * The architecture and models weights can be downloaded from: + * https://github.com/sghoshcvc/TextBox-Models.git (size is around 100 MB) + + * @param archFilename is the path to the prototxt file containing the deployment model architecture description. + * When employing OCR_HOLISTIC_BACKEND_CAFFE this is the path to the deploy ".prototxt". + * + * @param weightsFilename is the path to the pretrained weights of the model. When employing + * OCR_HOLISTIC_BACKEND_CAFFE this is the path to the ".caffemodel" file. + * + * @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is + * the only option + */ + CV_WRAP static Ptr createTextBoxNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT); + friend class ImagePreprocessor; + +}; + +/** @brief textDetector class provides the functionallity of text bounding box detection. + * A TextRegionDetector is employed to find bounding boxes of text + * words given an input image. + * + * This class implements the logic of providing text bounding boxes in a vector of rects given an TextRegionDetector + * The TextRegionDetector can be any text detector + * + */ + +class CV_EXPORTS_W textDetector : public BaseDetector +{ +public: + virtual void run(Mat& image, std::vector* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=OCR_LEVEL_WORD)=0; + + /** @brief detect text with a cnn, input is one image with (multiple) ocuurance of text. + + Takes image on input and returns recognized text in the output_text parameter. Optionally + provides also the Rects for individual text elements found (e.g. words), and the list of those + text elements with their confidence values. + + @param image Input image CV_8UC1 or CV_8UC3 + + @param mask is totally ignored and is only available for compatibillity reasons + + + @param component_rects a vector of Rects, each rect is one text bounding box. + + + + @param component_confidences A vector of float returns confidence of text bounding boxes + + @param component_level must be OCR_LEVEL_WORD. 
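+
+    A minimal sketch of the intended use (the model file names follow the
+    TextBoxes files referenced in DeepCNNTextDetector::createTextBoxNet and are
+    assumptions here):
+    @code
+        cv::Ptr<cv::text::textDetector> spotter =
+            cv::text::textDetector::create("textbox_deploy.prototxt", "textbox.caffemodel");
+        std::vector<cv::Rect> boxes;
+        std::vector<float> scores;
+        spotter->textDetectInImage(img, boxes, scores); // img: a CV_8UC3 cv::Mat
+    @endcode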
+/** @brief textDetector class provides the functionality of text bounding box detection.
+ * A TextRegionDetector is employed to find bounding boxes of text
+ * words given an input image.
+ *
+ * This class implements the logic of providing text bounding boxes in a vector of rects given a TextRegionDetector.
+ * The TextRegionDetector can be any text detector.
+ *
+ */
+
+class CV_EXPORTS_W textDetector : public BaseDetector
+{
+public:
+    virtual void run(Mat& image, std::vector<Rect>* component_rects=NULL,
+                     std::vector<float>* component_confidences=NULL,
+                     int component_level=OCR_LEVEL_WORD)=0;
+
+    /** @brief Detect text with a CNN; the input is one image with (multiple) occurrences of text.
+
+    Takes an image as input and returns the detected text elements (e.g. words) as rectangles,
+    optionally together with a confidence value for each of them.
+
+    @param image Input image CV_8UC1 or CV_8UC3
+
+    @param mask is totally ignored and is only available for compatibility reasons
+
+    @param component_rects a vector of Rects, where each rect is one text bounding box.
+
+    @param component_confidences a vector of floats returning the confidence of the text bounding boxes
+
+    @param component_level must be OCR_LEVEL_WORD.
+     */
+
+    virtual void run(Mat& image, Mat& mask, std::vector<Rect>* component_rects=NULL,
+                     std::vector<float>* component_confidences=NULL,
+                     int component_level=OCR_LEVEL_WORD)=0;
+
+    /**
+    @brief Method that provides a quick and simple interface to detect text inside an image
+
+    @param inputImage an image expected to be CV_8UC3 of any size
+
+    @param Bbox a vector of Rect that will store the detected word bounding boxes
+
+    @param confidence a vector of float that will be updated with the confidence the classifier has for each selected bounding box
+     */
+    CV_WRAP virtual void textDetectInImage(InputArray inputImage,CV_OUT std::vector<Rect>& Bbox,CV_OUT std::vector<float>& confidence)=0;
+
+    /** @brief Simple getter for the underlying TextRegionDetector
+     */
+    CV_WRAP virtual Ptr<TextRegionDetector> getClassifier()=0;
+
+    /** @brief Creates an instance of the textDetector class.
+
+    @param classifierPtr an instance of TextRegionDetector, normally a DeepCNNTextDetector instance
+     */
+    CV_WRAP static Ptr<textDetector> create(Ptr<TextRegionDetector> classifierPtr);
+
+    /** @brief Creates an instance of the textDetector class and implicitly also a DeepCNNTextDetector classifier.
+
+    @param modelArchFilename the relative or absolute path to the prototxt file describing the classifier's architecture.
+
+    @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form.
+     */
+    CV_WRAP static Ptr<textDetector> create(String modelArchFilename, String modelWeightsFilename);
+
+};
+
+//! @}
+}//namespace text
+}//namespace cv
+
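+/* A minimal end-to-end sketch of the detection API declared above (illustrative only;
+ * it assumes the TextBox model files were downloaded to the working directory and that
+ * the module was built with either the Caffe or the opencv_dnn backend):
+ *
+ * @code
+ * Mat image = imread("scene.jpg"); // hypothetical input image
+ * Ptr<textDetector> spotter = textDetector::create("textbox_deploy.prototxt",
+ *                                                  "textbox.caffemodel");
+ * std::vector<Rect> boxes;
+ * std::vector<float> scores;
+ * spotter->textDetectInImage(image, boxes, scores);
+ * for (size_t i = 0; i < boxes.size(); i++)
+ *     if (scores[i] > 0.6f) // same threshold as the bundled samples
+ *         rectangle(image, boxes[i], Scalar(255, 0, 0), 2);
+ * @endcode
+ */
+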
Aborting" + print " Model files should be downloaded from https://github.com/sghoshcvc/TextBox-Models" + quit() +cv2.text.cnn_config.caffe_backend.setCaffeGpuMode(True); +pathname = os.path.dirname(sys.argv[0]) + + +img = cv2.imread(str(sys.argv[1])) +textSpotter=cv2.text.textDetector_create( + "textbox_deploy.prototxt","textbox.caffemodel") +rects,outProbs = textSpotter.textDetectInImage(img); +# for visualization +vis = img.copy() +# Threshold to select rectangles : All rectangles for which outProbs is more than this threshold will be shown +thres = 0.6 + + + #Visualization +for r in range(0,np.shape(rects)[0]): + if outProbs[r] >thres: + rect = rects[r] + cv2.rectangle(vis, (rect[0],rect[1]), (rect[0]+rect[2],rect[1]+rect[3]), (255, 0, 0), 2) + # cv2.rectangle(vis, (rect[0],rect[1]), (rect[0]+rect[2],rect[1]+rect[3]), (255, 255, 255), 1) + + +#Visualization +cv2.imshow("Text detection result", vis) +cv2.waitKey(0) \ No newline at end of file diff --git a/modules/text/samples/textbox_demo.cpp b/modules/text/samples/textbox_demo.cpp new file mode 100644 index 00000000000..b76658e1b7a --- /dev/null +++ b/modules/text/samples/textbox_demo.cpp @@ -0,0 +1,151 @@ +/* + * dictnet_demo.cpp + * + * Demonstrates simple use of the holistic word classifier in C++ + * + * Created on: June 26, 2016 + * Author: Anguelos Nicolaou + */ + +#include "opencv2/text.hpp" +#include "opencv2/highgui.hpp" +#include "opencv2/imgproc.hpp" + +#include +#include +#include +#include +#include + +void textbox_draw(cv::Mat &src, std::vector &groups,std::vector &probs,std::vector wordList,float thres); +inline std::string getHelpStr(std::string progFname){ + std::stringstream out; + out << " Demo of text detection CNN for text detection." << std::endl; + out << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"< " << std::endl; + out << " Caffe Model files (textbox.caffemodel, textbox_deploy.prototxt)"< &groups,std::vector &probs,std::vector wordList,float thres=0.6) +{ + for (int i=0;i<(int)groups.size(); i++) + { + if(probs[i]>thres) + { + if (src.type() == CV_8UC3) + { + cv::rectangle(src,groups.at(i).tl(),groups.at(i).br(),cv::Scalar( 0, 255, 255 ), 3, 8 ); + cv::putText(src, wordList[i],groups.at(i).tl() , cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar( 0,0,255 )); + } + else + rectangle(src,groups.at(i).tl(),groups.at(i).br(),cv::Scalar( 255 ), 3, 8 ); + } + } +} + + +int main(int argc, const char * argv[]){ + if(!cv::text::cnn_config::caffe_backend::getCaffeAvailable()){ + std::cout<<"The text module was compiled without Caffe which is the only available DeepCNN backend.\nAborting!\n"; + //exit(1); + } + std::vector backends=cv::text::cnn_config::getAvailableBackends(); + std::cout << "The Following backends are available" << "\n"; + for (int i=0;i textSpotter=cv::text::textDetector::create( + "textbox_deploy.prototxt","textbox.caffemodel"); + + //cv::Ptr wordSpotter= + // cv::text::textDetector::create(cnn); + std::cout<<"Created Text Spotter with text Boxes"; + + std::vector bbox; + std::vector outProbabillities; + textSpotter->textDetectInImage(image,bbox,outProbabillities); + // textbox_draw(image, bbox,outProbabillities); + float thres =0.6f; + std::vector imageList; + for(int imageIdx=0;imageIdx<(int)bbox.size();imageIdx++){ + if(outProbabillities[imageIdx]>thres){ + imageList.push_back(image(bbox.at(imageIdx))); + } + + } + // call dict net here for all detected parts + cv::Ptr cnn=cv::text::DeepCNN::createDictNet( + 
"dictnet_vgg_deploy.prototxt","dictnet_vgg.caffemodel",cv::text::OCR_HOLISTIC_BACKEND_DNN); + + cv::Ptr wordSpotter= + cv::text::OCRHolisticWordRecognizer::create(cnn,"dictnet_vgg_labels.txt"); + + std::vector wordList; + std::vector wordProbabillities; + wordSpotter->recogniseImageBatch(imageList,wordList,wordProbabillities); + // write the output in file + std::ofstream out; + out.open(argv[1]); + + + for (int i=0;i<(int)wordList.size(); i++) + { + cv::Point tl_ = bbox.at(i).tl(); + cv::Point br_ = bbox.at(i).br(); + + out< +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cv { namespace text { +//************************************************************************************ +//****************** ImagePreprocessor ******************************************* +//************************************************************************************ + +void ImagePreprocessor::preprocess(InputArray input,OutputArray output,Size sz,int outputChannels){ + Mat inpImg=input.getMat(); + Mat outImg; + this->preprocess_(inpImg,outImg,sz,outputChannels); + outImg.copyTo(output); +} +void ImagePreprocessor::set_mean(Mat mean){ + + + this->set_mean_(mean); + +} + + + +class ResizerPreprocessor: public ImagePreprocessor{ +protected: + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1){ + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U){ + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + } + //void set_mean_(Mat m){} +public: + ResizerPreprocessor(){} + ~ResizerPreprocessor(){} +}; + +class StandarizerPreprocessor: public ImagePreprocessor{ +protected: + double sigma_; + //void set_mean_(Mat M){} + + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + 
tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + + Scalar mean,dev; + meanStdDev(output,mean,dev); + subtract(output,mean[0],output); + divide(output,(dev[0]/sigma_),output); + } +public: + StandarizerPreprocessor(double sigma):sigma_(sigma){} + ~StandarizerPreprocessor(){} + +}; + +class customPreprocessor:public ImagePreprocessor{ +protected: + + double rawval_; + Mat mean_; + String channel_order_; + + void set_mean_(Mat imMean_){ + + imMean_.copyTo(this->mean_); + + + } + + void set_raw_scale(int rawval){ + rawval_ = rawval; + + } + void set_channels(String channel_order){ + channel_order_=channel_order; + } + + + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + tmpInput.convertTo(output,CV_32FC3,1/255.0); + else + tmpInput.convertTo(output,CV_32FC1); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ ==1) + tmpInput.convertTo(output, CV_32FC1); + else + tmpInput.convertTo(output, CV_32FC1,rawval_); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + tmpInput.convertTo(output,CV_32FC3,1/255.0); + else + tmpInput.convertTo(output,CV_32FC1); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ ==1) + tmpInput.convertTo(output, CV_32FC1); + else + tmpInput.convertTo(output, CV_32FC1,rawval_); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + input.convertTo(output,CV_32FC1,1/255.0); + else + input.convertTo(output,CV_32FC1); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ ==1) + input.convertTo(output, CV_32FC1); + else + input.convertTo(output, CV_32FC1,rawval_); + } + }else + { + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + input.convertTo(output,CV_32FC3,1/255.0); + else + input.convertTo(output,CV_32FC3); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ ==1) + input.convertTo(output, CV_32FC3); + else + input.convertTo(output, CV_32FC3,rawval_); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + + if (!this->mean_.empty()){ + + Scalar mean_s(this->mean_.at(0,0),this->mean_.at(0,1),this->mean_.at(0,2)); + subtract(output,mean_s,output); + } + else{ + Scalar mean_s; + mean_s = mean(output); + subtract(output,mean_s,output); + } + + } + +public: + customPreprocessor( double rawval,String channel_order):rawval_(rawval),channel_order_(channel_order){} + ~customPreprocessor(){} + +}; + +class 
MeanSubtractorPreprocessor: public ImagePreprocessor{ +protected: + Mat mean_; + //void set_mean_(Mat m){} + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(this->mean_.cols==outputSize.width && this->mean_.rows ==outputSize.height); + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + subtract(output,this->mean_,output); + } +public: + MeanSubtractorPreprocessor(Mat mean) + { + mean.copyTo(this->mean_); + } + + ~MeanSubtractorPreprocessor(){} +}; + + + +Ptr ImagePreprocessor::createResizer() +{ + return Ptr(new ResizerPreprocessor); +} + +Ptr ImagePreprocessor::createImageStandarizer(double sigma) +{ + return Ptr(new StandarizerPreprocessor(sigma)); +} +Ptr ImagePreprocessor::createImageCustomPreprocessor(double rawval,String channel_order) +{ + + return Ptr(new customPreprocessor(rawval,channel_order)); +} + +Ptr ImagePreprocessor::createImageMeanSubtractor(InputArray meanImg) +{ + Mat tmp=meanImg.getMat(); + return Ptr(new MeanSubtractorPreprocessor(tmp)); +} +} +} diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp new file mode 100644 index 00000000000..035f104f28a --- /dev/null +++ b/modules/text/src/ocr_holistic.cpp @@ -0,0 +1,697 @@ +#include "precomp.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/highgui.hpp" +#include "opencv2/core.hpp" + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifdef HAVE_CAFFE +#include "caffe/caffe.hpp" +#endif + +#ifdef HAVE_DNN +#include "opencv2/dnn.hpp" +#endif + +using namespace cv; +using namespace cv::dnn; +using namespace std; +namespace cv { namespace text { + +//Maybe OpenCV has a routine better suited +inline bool fileExists (String filename) { + std::ifstream f(filename.c_str()); + return f.good(); +} + + + +//************************************************************************************ +//****************** TextImageClassifier ***************************************** +//************************************************************************************ + +void TextImageClassifier::preprocess(const Mat& input,Mat& output) +{ + this->preprocessor_->preprocess_(input,output,this->inputGeometry_,this->channelCount_); +} + +void TextImageClassifier::setPreprocessor(Ptr ptr) +{ + CV_Assert(!ptr.empty()); + 
preprocessor_=ptr; +} + +Ptr TextImageClassifier::getPreprocessor() +{ + return preprocessor_; +} + + +class DeepCNNCaffeImpl: public DeepCNN{ +protected: + void classifyMiniBatch(std::vector inputImageList, Mat outputMat) + { + //Classifies a list of images containing at most minibatchSz_ images + CV_Assert(int(inputImageList.size())<=this->minibatchSz_); + CV_Assert(outputMat.isContinuous()); + + +#ifdef HAVE_CAFFE + net_->input_blobs()[0]->Reshape(inputImageList.size(), this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width); + net_->Reshape(); + float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); + float* inputData=inputBuffer; + + for(size_t imgNum=0;imgNum input_channels; + Mat preprocessed; + // if the image have multiple color channels the input layer should be populated accordingly + for (int channel=0;channel < this->channelCount_;channel++){ + + cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); + input_channels.push_back(netInputWraped); + //input_data += width * height; + inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); + + } + this->preprocess(inputImageList[imgNum],preprocessed); + split(preprocessed, input_channels); + + + } + this->net_->ForwardPrefilled(); + const float* outputNetData=net_->output_blobs()[0]->cpu_data(); + this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); + int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width; + + + //outputMat.resize(this->outputGeometry_.height * this->outputGeometry_.width); + float*outputMatData=(float*)(outputMat.data); + memcpy(outputMatData,outputNetData,sizeof(float)*outputSz*inputImageList.size()); + +#endif + } + +#ifdef HAVE_CAFFE + Ptr > net_; +#endif + //Size inputGeometry_;//=Size(100,32); + int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst + int outputSize_; + //Size outputGeometry_; +public: + DeepCNNCaffeImpl(const DeepCNNCaffeImpl& dn): + minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ + channelCount_=dn.channelCount_; + inputGeometry_=dn.inputGeometry_; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + } + DeepCNNCaffeImpl& operator=(const DeepCNNCaffeImpl &dn) + { +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + this->setPreprocessor(dn.preprocessor_); + this->inputGeometry_=dn.inputGeometry_; + this->channelCount_=dn.channelCount_; + this->minibatchSz_=dn.minibatchSz_; + this->outputSize_=dn.outputSize_; + this->preprocessor_=dn.preprocessor_; + this->outputGeometry_=dn.outputGeometry_; + return *this; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" + } + + DeepCNNCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) + :minibatchSz_(maxMinibatchSz) + { + + CV_Assert(this->minibatchSz_>0); + CV_Assert(fileExists(modelArchFilename)); + CV_Assert(fileExists(modelWeightsFilename)); + CV_Assert(!preprocessor.empty()); + this->setPreprocessor(preprocessor); +#ifdef HAVE_CAFFE + this->net_.reset(new caffe::Net(modelArchFilename, caffe::TEST)); + CV_Assert(net_->num_inputs()==1); + CV_Assert(net_->num_outputs()==1); + CV_Assert(this->net_->input_blobs()[0]->channels()==1 + ||this->net_->input_blobs()[0]->channels()==3); + this->channelCount_=this->net_->input_blobs()[0]->channels(); + + + + 
this->net_->CopyTrainedLayersFrom(modelWeightsFilename); + + caffe::Blob* inputLayer = this->net_->input_blobs()[0]; + + this->inputGeometry_=Size(inputLayer->width(), inputLayer->height()); + this->channelCount_ = inputLayer->channels(); + + inputLayer->Reshape(this->minibatchSz_,this->channelCount_,this->inputGeometry_.height, this->inputGeometry_.width); + net_->Reshape(); + this->outputSize_=net_->output_blobs()[0]->channels(); + this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); + + + + + +#else + CV_Error(Error::StsError,"Caffe not available during compilation!"); +#endif + } + + void classify(InputArray image, OutputArray classProbabilities) + { + std::vector inputImageList; + inputImageList.push_back(image.getMat()); + classifyBatch(inputImageList,classProbabilities); + } + + void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities) + { + std::vector allImageVector; + inputImageList.getMatVector(allImageVector); + size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmentic + + size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmentic + classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F); + Mat outputMat = classProbabilities.getMat(); + for(size_t imgNum=0;imgNum(allImageVector.size()-imgNum,minibatchSize); + std::vector::const_iterator from=std::vector::const_iterator(allImageVector.begin()+imgNum); + std::vector::const_iterator to=std::vector::const_iterator(allImageVector.begin()+rangeEnd); + std::vector minibatchInput(from,to); + classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd))); + + } + + } + + int getOutputSize() + { + return this->outputSize_; + } + Size getOutputGeometry() + { + return this->outputGeometry_; + } + + int getMinibatchSize() + { + return this->minibatchSz_; + } + + int getBackend() + { + return OCR_HOLISTIC_BACKEND_CAFFE; + } +}; + +class DeepCNNOpenCvDNNImpl: public DeepCNN{ +protected: + + void classifyMiniBatch(std::vector inputImageList, Mat outputMat) + { + //Classifies a list of images containing at most minibatchSz_ images + CV_Assert(int(inputImageList.size())<=this->minibatchSz_); + CV_Assert(outputMat.isContinuous()); + +#ifdef HAVE_DNN + + std::vector preProcessedImList; // to store preprocessed images, should it be handled inside preprocessing class? + + Mat preprocessed; + // preprocesses each image in the inputImageList and push to preprocessedImList + for(size_t imgNum=0;imgNumpreprocess(inputImageList[imgNum],preprocessed); + preProcessedImList.push_back(preprocessed); + } + // set input data blob in dnn::net + net_->setInput(blobFromImages(preProcessedImList,1, this->inputGeometry_), "data"); + + float*outputMatData=(float*)(outputMat.data); + //Mat outputNet(inputImageList.size(),this->outputSize_,CV_32FC1,outputMatData) ; + Mat outputNet = this->net_->forward(); + outputNet = outputNet.reshape(1, 1); + + float*outputNetData=(float*)(outputNet.data); + + memcpy(outputMatData,outputNetData,sizeof(float)*this->outputSize_*inputImageList.size()); + +#endif + } + +#ifdef HAVE_DNN + Ptr net_; +#endif + // hard coding input image size. anything in DNN library to get that from prototxt?? 
+ // Size inputGeometry_;//=Size(100,32); + int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst + int outputSize_; + //Size outputGeometry_;//= Size(1,1); + //int channelCount_; + // int inputChannel_ ;//=1; + // int _inputHeight; + //int _inputWidth ; + //int _inputChannel ; +public: + DeepCNNOpenCvDNNImpl(const DeepCNNOpenCvDNNImpl& dn): + minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ + channelCount_=dn.channelCount_; + inputGeometry_=dn.inputGeometry_; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" +#ifdef HAVE_DNN + this->net_=dn.net_; +#endif + } + DeepCNNOpenCvDNNImpl& operator=(const DeepCNNOpenCvDNNImpl &dn) + { +#ifdef HAVE_DNN + this->net_=dn.net_; +#endif + this->setPreprocessor(dn.preprocessor_); + this->inputGeometry_=dn.inputGeometry_; + this->channelCount_=dn.channelCount_; + this->minibatchSz_=dn.minibatchSz_; + this->outputSize_=dn.outputSize_; + this->preprocessor_=dn.preprocessor_; + this->outputGeometry_=dn.outputGeometry_; + return *this; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" + } + + DeepCNNOpenCvDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz,int inputWidth ,int inputHeight ,int inputChannel ) + :minibatchSz_(maxMinibatchSz) + { + + CV_Assert(this->minibatchSz_>0); + CV_Assert(fileExists(modelArchFilename)); + CV_Assert(fileExists(modelWeightsFilename)); + CV_Assert(!preprocessor.empty()); + this->setPreprocessor(preprocessor); +#ifdef HAVE_DNN + + this->net_ = makePtr(readNetFromCaffe(modelArchFilename,modelWeightsFilename)); + + + + if (this->net_.empty()) + { + std::cerr << "Can't load network by using the following files: " << std::endl; + std::cerr << "prototxt: " << modelArchFilename << std::endl; + std::cerr << "caffemodel: " << modelWeightsFilename << std::endl; + //std::cerr << "bvlc_googlenet.caffemodel can be downloaded here:" << std::endl; + //std::cerr << "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel" << std::endl; + exit(-1); + } + + + this->inputGeometry_=Size(inputWidth,inputHeight);// Size(inputLayer->width(), inputLayer->height()); + this->channelCount_ = inputChannel;//inputLayer->channels(); + + //inputLayer->Reshape(this->minibatchSz_,this->channelCount_,this->inputGeometry_.height, this->inputGeometry_.width); + Ptr< Layer > outLayer= net_->getLayer (net_->getLayerId (net_->getLayerNames()[net_->getLayerNames().size()-2])); + //std::vector blobs = outLayer->blobs; + + this->outputSize_=(outLayer->blobs)[1].size[0] ;//net_->output_blobs()[0]->channels(); + //this->outputGeometry_ = Size(1,1);//Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); + + + + + + +#else + CV_Error(Error::StsError,"DNN module not available during compilation!"); +#endif + } + + void classify(InputArray image, OutputArray classProbabilities) + { + std::vector inputImageList; + inputImageList.push_back(image.getMat()); + classifyBatch(inputImageList,classProbabilities); + } + + void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities) + { + std::vector allImageVector; + inputImageList.getMatVector(allImageVector); + size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmentic + + size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmentic + classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F); + 
Mat outputMat = classProbabilities.getMat(); + + for(size_t imgNum=0;imgNum(allImageVector.size()-imgNum,minibatchSize); + std::vector::const_iterator from=std::vector::const_iterator(allImageVector.begin()+imgNum); + std::vector::const_iterator to=std::vector::const_iterator(allImageVector.begin()+rangeEnd); + std::vector minibatchInput(from,to); + classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd))); + + } + + } + + int getOutputSize() + { + return this->outputSize_; + } + Size getOutputGeometry() + { + return this->outputGeometry_; + } + + int getMinibatchSize() + { + return this->minibatchSz_; + } + + int getBackend() + { + return OCR_HOLISTIC_BACKEND_DNN; + } +}; + +Ptr DeepCNN::create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz,int backEnd) +{ + if(preprocessor.empty()) + { + preprocessor=ImagePreprocessor::createResizer(); + } + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_DEFAULT: + +#ifdef HAVE_CAFFE + return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + +#elif defined(HAVE_DNN) + return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,100,32,1)); +#else + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); +#endif + break; + + case OCR_HOLISTIC_BACKEND_CAFFE: + return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + break; + case OCR_HOLISTIC_BACKEND_DNN: + return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,100,32,1)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); + break; + } +} + + +Ptr DeepCNN::createDictNet(String archFilename,String weightsFilename,int backEnd) +{ + Ptr preprocessor=ImagePreprocessor::createImageStandarizer(113); + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_DEFAULT: + +#ifdef HAVE_CAFFE + return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); + +#elif defined(HAVE_DNN) + return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, 100,100,32,1)); +#else + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); +#endif + break; + + case OCR_HOLISTIC_BACKEND_CAFFE: + return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); + break; + case OCR_HOLISTIC_BACKEND_DNN: + return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, 100,100,32,1)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); + break; + } +} + +namespace cnn_config{ +std::vector getAvailableBackends() +{ + std::vector backends; + +#ifdef HAVE_CAFFE + backends.push_back("CAFFE, OCR_HOLISTIC_BACKEND_CAFFE"); // dnn backend opencv_dnn + +#endif +#ifdef HAVE_DNN + backends.push_back("DNN, OCR_HOLISTIC_BACKEND_DNN");// opencv_dnn based backend" +#endif + return backends; + + +} + +namespace caffe_backend{ + +#ifdef HAVE_CAFFE + +bool getCaffeGpuMode() +{ + return caffe::Caffe::mode()==caffe::Caffe::GPU; +} + +void setCaffeGpuMode(bool useGpu) +{ + if(useGpu) + { + caffe::Caffe::set_mode(caffe::Caffe::GPU); + }else + { + caffe::Caffe::set_mode(caffe::Caffe::CPU); + } +} + +bool getCaffeAvailable() +{ + return true; +} +#else + +bool getCaffeGpuMode() +{ + CV_Error(Error::StsError,"Caffe not available during compilation!"); + return 0; +} + +void 
setCaffeGpuMode(bool useGpu) +{ + CV_Error(Error::StsError,"Caffe not available during compilation!"); + CV_Assert(useGpu==1);//Compilation directives force +} + +bool getCaffeAvailable(){ + return 0; +} + +#endif + +}//namespace caffe +namespace dnn_backend{ +#ifdef HAVE_DNN + + +bool getDNNAvailable(){ + return true; +} +#else +bool getDNNAvailable(){ + return 0; +} +#endif +}//namspace dnn_backend +}//namespace cnn_config + +class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{ +private: + struct NetOutput{ + //Auxiliary structure that handles the logic of getting class ids and probabillities from + //the raw outputs of caffe + int wordIdx; + float probabillity; + + static bool sorter(const NetOutput& o1,const NetOutput& o2) + {//used with std::sort to provide the most probable class + return o1.probabillity>o2.probabillity; + } + + static void getOutputs(const float* buffer,int nbOutputs,std::vector& res) + { + res.resize(nbOutputs); + for(int k=0;k tmp; + getOutputs(buffer,nbOutputs,tmp); + classNum=tmp[0].wordIdx; + confidence=tmp[0].probabillity; + + } + }; +protected: + std::vector labels_; + Ptr classifier_; +public: + OCRHolisticWordRecognizerImpl(Ptr classifierPtr,String vocabularyFilename):classifier_(classifierPtr) + { + CV_Assert(fileExists(vocabularyFilename));//this fails for some rason + std::ifstream labelsFile(vocabularyFilename.c_str()); + if(!labelsFile) + { + CV_Error(Error::StsError,"Could not read Labels from file"); + } + std::string line; + while (std::getline(labelsFile, line)) + { + labels_.push_back(std::string(line)); + } + CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size())); + } + + OCRHolisticWordRecognizerImpl(Ptr classifierPtr,const std::vector& vocabulary):classifier_(classifierPtr) + { + this->labels_=vocabulary; + CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size())); + } + + void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence) + { + Mat netOutput; + this->classifier_->classify(inputImage,netOutput); + int classNum; + NetOutput::getClassification((float*)(netOutput.data),this->classifier_->getOutputSize(),classNum,confidence); + transcription=this->labels_[classNum]; + } + + void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector& transcriptionVec,CV_OUT std::vector& confidenceVec) + { + Mat netOutput; + this->classifier_->classifyBatch(inputImageList,netOutput); + + for(int k=0;kclassifier_->getOutputSize(),classNum,confidence); + transcriptionVec.push_back(this->labels_[classNum]); + confidenceVec.push_back(confidence); + } + } + + + void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + int component_level=0) + { + CV_Assert(component_level==OCR_LEVEL_WORD);//Componnents not applicable for word spotting + double confidence; + String transcription; + recogniseImage(image,transcription,confidence); + output_text=transcription.c_str(); + if(component_rects!=NULL) + { + component_rects->resize(1); + (*component_rects)[0]=Rect(0,0,image.size().width,image.size().height); + } + if(component_texts!=NULL) + { + component_texts->resize(1); + (*component_texts)[0]=transcription.c_str(); + } + if(component_confidences!=NULL) + { + component_confidences->resize(1); + (*component_confidences)[0]=float(confidence); + } + } + + void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, + std::vector* 
component_texts=NULL, std::vector<float>* component_confidences=NULL,
+             int component_level=0)
+    {
+        CV_Assert(mask.cols==image.cols && mask.rows==image.rows);//Mask is ignored because the CNN operates on a full image
+        this->run(image,output_text,component_rects,component_texts,component_confidences,component_level);
+    }
+
+    std::vector<String>& getVocabulary()
+    {
+        return this->labels_;
+    }
+
+    Ptr<TextImageClassifier> getClassifier()
+    {
+        return this->classifier_;
+    }
+};
+
+Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(Ptr<TextImageClassifier> classifierPtr,String vocabularyFilename )
+{
+    return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabularyFilename));
+}
+
+Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename, String vocabularyFilename)
+{
+    Ptr<ImagePreprocessor> preprocessor=ImagePreprocessor::createImageStandarizer(113);
+    Ptr<TextImageClassifier> classifierPtr(new DeepCNNCaffeImpl(modelArchFilename,modelWeightsFilename,preprocessor,100));
+    return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabularyFilename));
+}
+
+Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(Ptr<TextImageClassifier> classifierPtr,const std::vector<String>& vocabulary)
+{
+    return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabulary));
+}
+
+Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename,const std::vector<String>& vocabulary){
+    Ptr<ImagePreprocessor> preprocessor=ImagePreprocessor::createImageStandarizer(113);
+    Ptr<TextImageClassifier> classifierPtr(new DeepCNNCaffeImpl(modelArchFilename,modelWeightsFilename,preprocessor,100));
+    return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabulary));
+}
+
+} } //namespace text namespace cv
diff --git a/modules/text/src/precomp.hpp b/modules/text/src/precomp.hpp
index 7ccda150f37..e85e4eb85cb 100644
--- a/modules/text/src/precomp.hpp
+++ b/modules/text/src/precomp.hpp
@@ -45,8 +45,6 @@
 #include "opencv2/text.hpp"
 
-#include "text_config.hpp"
-
 #ifdef HAVE_TESSERACT
 #if !defined(USE_STD_NAMESPACE)
 #define USE_STD_NAMESPACE
diff --git a/modules/text/src/text_detector.cpp b/modules/text/src/text_detector.cpp
new file mode 100644
index 00000000000..949f5f86dc4
--- /dev/null
+++ b/modules/text/src/text_detector.cpp
@@ -0,0 +1,169 @@
+#include "precomp.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/core.hpp"
+
+#include <fstream>
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <string>
+
+//#ifdef HAVE_CAFFE
+//#include "caffe/caffe.hpp"
+//#endif
+
+namespace cv { namespace text {
+
+class textDetectImpl: public textDetector{
+private:
+    struct NetOutput{
+        //Auxiliary structure that handles the logic of getting bounding box and confidences of textness from
+        //the raw outputs of caffe
+        Rect bbox;
+        float probability;
+
+        static void getOutputs(const float* buffer,int nbrTextBoxes,int nCol,std::vector<NetOutput>& res,Size inputShape)
+        {
+            res.resize(nbrTextBoxes);
+            for(int k=0;k<nbrTextBoxes;k++)
+            {
+                float x_min = buffer[k*nCol+3]*inputShape.width;
+                float y_min = buffer[k*nCol+4]*inputShape.height;
+                float x_max = buffer[k*nCol+5]*inputShape.width;
+                float y_max = buffer[k*nCol+6]*inputShape.height;
+                x_max = x_max > inputShape.width?inputShape.width-1:x_max;
+                y_max = y_max > inputShape.height?inputShape.height-1:y_max;
+                float wd = x_max-x_min+1;
+                float ht = y_max-y_min+1;
+
+                res[k].bbox=Rect(int(x_min),int(y_min),int(wd),int(ht));
+                res[k].probability=buffer[k*nCol+2];
+            }
+        }
+    };
+protected:
+    Ptr<TextRegionDetector> classifier_;
+public:
+    textDetectImpl(Ptr<TextRegionDetector> classifierPtr):classifier_(classifierPtr)
+    {
+    }
+
+    void textDetectInImage(InputArray inputImage,CV_OUT std::vector<Rect>& Bbox,CV_OUT std::vector<float>& confidence)
+    {
+        Mat netOutput;
+        // call the detect function of the DeepCNNTextDetector class
+        this->classifier_->detect(inputImage,netOutput);
+        // get the output geometry, i.e. height and width of the output blob from caffe
+        Size outputGeometry = this->classifier_->getOutputGeometry();
+        int nbrTextBoxes = outputGeometry.height;
+        int nCol = outputGeometry.width;
+
+        std::vector<NetOutput> tmp;
+        // the output bounding boxes need to be rescaled by the input height and width
+        Size inputImageShape = Size(inputImage.cols(),inputImage.rows());
+        NetOutput::getOutputs((float*)(netOutput.data),nbrTextBoxes,nCol,tmp,inputImageShape);
+        // put the output in the CV_OUT parameters
+        for (int k=0;k<nbrTextBoxes;k++)
+        {
+            Bbox.push_back(tmp[k].bbox);
+            confidence.push_back(tmp[k].probability);
+        }
+    }
+
+    void run(Mat& image, std::vector<Rect>* component_rects=NULL,
+             std::vector<float>* component_confidences=NULL,
+             int component_level=0)
+    {
+        CV_Assert(component_level==OCR_LEVEL_WORD);//Components not applicable for word spotting
+
+        std::vector<Rect> bbox;
+        std::vector<float> score;
+        textDetectInImage(image,bbox,score);
+
+        if(component_rects!=NULL)
+        {
+            *component_rects = bbox; // copy the detected boxes to the caller's vector
+        }
+
+        if(component_confidences!=NULL)
+        {
+            *component_confidences = score;
+        }
+    }
+
+    void run(Mat& image, Mat& mask, std::vector<Rect>* component_rects=NULL,
+             std::vector<float>* component_confidences=NULL,
+             int component_level=0)
+    {
+        CV_Assert(mask.cols==image.cols && mask.rows==image.rows);//Mask is ignored because the CNN operates on a full image
+        this->run(image,component_rects,component_confidences,component_level);
+    }
+
+    Ptr<TextRegionDetector> getClassifier()
+    {
+        return this->classifier_;
+    }
+};
+
+Ptr<textDetector> textDetector::create(Ptr<TextRegionDetector> classifierPtr)
+{
+    return Ptr<textDetector>(new textDetectImpl(classifierPtr));
+}
+
+Ptr<textDetector> textDetector::create(String modelArchFilename, String modelWeightsFilename)
+{
+    // create a custom preprocessor with rawval
+    Ptr<ImagePreprocessor> preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255);
+    // set the mean for the preprocessor
+    Mat textbox_mean(1,3,CV_8U);
+    textbox_mean.at<uchar>(0,0)=104;
+    textbox_mean.at<uchar>(0,1)=117;
+    textbox_mean.at<uchar>(0,2)=123;
+    preprocessor->set_mean(textbox_mean);
+    // create a pointer to a text box detector (DeepCNNTextDetector)
+    Ptr<TextRegionDetector> classifierPtr(DeepCNNTextDetector::create(modelArchFilename,modelWeightsFilename,preprocessor,1));
+    return Ptr<textDetector>(new textDetectImpl(classifierPtr));
+}
+
+} } //namespace text namespace cv
diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp
new file mode 100644
index 00000000000..5267b390fed
--- /dev/null
+++ b/modules/text/src/text_detectorCNN.cpp
@@ -0,0 +1,453 @@
+#include "precomp.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/core.hpp"
+
+#include <fstream>
+#include <sstream>
+#include <iostream>
+#include <vector>
+#include <string>
+
+#ifdef HAVE_CAFFE
+#include "caffe/caffe.hpp"
+#endif
+
+#ifdef HAVE_DNN
+#include "opencv2/dnn.hpp"
+#endif
+
+using namespace cv::dnn;
+
+#define CV_WARN(message) fprintf(stderr, "warning: %s (%s:%d)\n", message, __FILE__, __LINE__)
+
+namespace cv { namespace text {
+
+inline bool fileExists (String filename) {
+    std::ifstream f(filename.c_str());
+    return f.good();
+}
+
+class DeepCNNTextDetectorCaffeImpl: public DeepCNNTextDetector{
+protected:
+
+    void process_(Mat inputImage, Mat &outputMat)
+    {
+        // do the forward pass and store the output in outputMat
+        CV_Assert(outputMat.isContinuous());
+        if (inputImage.channels() != this->inputChannelCount_)
+            CV_WARN("Number of input channel(s) in the model is not the same as in the input");
+
+#ifdef HAVE_CAFFE
+        net_->input_blobs()[0]->Reshape(1, this->inputChannelCount_,this->inputGeometry_.height,this->inputGeometry_.width);
+        net_->Reshape();
+        float*
inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); + float* inputData=inputBuffer; + + std::vector input_channels; + Mat preprocessed; + // if the image have multiple color channels the input layer should be populated accordingly + for (int channel=0;channel < this->inputChannelCount_;channel++){ + + cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); + input_channels.push_back(netInputWraped); + //input_data += width * height; + inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); + } + this->preprocess(inputImage,preprocessed); + split(preprocessed, input_channels); + + //preprocessed.copyTo(netInputWraped); + + + this->net_->Forward(); + const float* outputNetData=net_->output_blobs()[0]->cpu_data(); + // const float* outputNetData1=net_->output_blobs()[1]->cpu_data(); + + + + + this->outputGeometry_.height = net_->output_blobs()[0]->height(); + this->outputGeometry_.width = net_->output_blobs()[0]->width(); + this->outputChannelCount_ = net_->output_blobs()[0]->channels(); + int outputSz = this->outputChannelCount_ * this->outputGeometry_.height * this->outputGeometry_.width; + outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); + float*outputMatData=(float*)(outputMat.data); + + memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); + + + +#endif + } + + +#ifdef HAVE_CAFFE + Ptr > net_; +#endif + //Size inputGeometry_; + int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst + //int outputSize_; +public: + DeepCNNTextDetectorCaffeImpl(const DeepCNNTextDetectorCaffeImpl& dn): + minibatchSz_(dn.minibatchSz_){ + outputGeometry_=dn.outputGeometry_; + inputGeometry_=dn.inputGeometry_; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + } + DeepCNNTextDetectorCaffeImpl& operator=(const DeepCNNTextDetectorCaffeImpl &dn) + { +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + this->setPreprocessor(dn.preprocessor_); + this->inputGeometry_=dn.inputGeometry_; + this->inputChannelCount_=dn.inputChannelCount_; + this->outputChannelCount_ = dn.outputChannelCount_; + // this->minibatchSz_=dn.minibatchSz_; + //this->outputGeometry_=dn.outputSize_; + this->preprocessor_=dn.preprocessor_; + this->outputGeometry_=dn.outputGeometry_; + return *this; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" + } + + DeepCNNTextDetectorCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) + :minibatchSz_(maxMinibatchSz) + { + + CV_Assert(this->minibatchSz_>0); + CV_Assert(fileExists(modelArchFilename)); + CV_Assert(fileExists(modelWeightsFilename)); + CV_Assert(!preprocessor.empty()); + this->setPreprocessor(preprocessor); +#ifdef HAVE_CAFFE + this->net_.reset(new caffe::Net(modelArchFilename, caffe::TEST)); + CV_Assert(net_->num_inputs()==1); + CV_Assert(net_->num_outputs()==1); + CV_Assert(this->net_->input_blobs()[0]->channels()==1 + ||this->net_->input_blobs()[0]->channels()==3); + // this->channelCount_=this->net_->input_blobs()[0]->channels(); + + + + this->net_->CopyTrainedLayersFrom(modelWeightsFilename); + + caffe::Blob* inputLayer = this->net_->input_blobs()[0]; + + this->inputGeometry_.height = inputLayer->height(); + this->inputGeometry_.width = inputLayer->width(); + this->inputChannelCount_ = inputLayer->channels(); + //this->inputGeometry_.batchSize =1; + + 
inputLayer->Reshape(this->minibatchSz_,this->inputChannelCount_,this->inputGeometry_.height, this->inputGeometry_.width); + net_->Reshape(); + this->outputChannelCount_ = net_->output_blobs()[0]->channels(); + //this->outputGeometry_.batchSize =1; + this->outputGeometry_.height =net_->output_blobs()[0]->height(); + this->outputGeometry_.width = net_->output_blobs()[0]->width(); + +#else + CV_Error(Error::StsError,"Caffe not available during compilation!"); +#endif + } + + + void detect(InputArray image, OutputArray Bbox_prob) + { + Size outSize = Size(this->outputGeometry_.height,outputGeometry_.width); + Bbox_prob.create(outSize,CV_32F); // dummy initialization is it needed + Mat outputMat = Bbox_prob.getMat(); + process_(image.getMat(),outputMat); + //copy back to outputArray + outputMat.copyTo(Bbox_prob); + } + + Size getOutputGeometry() + { + return this->outputGeometry_; + } + Size getinputGeometry() + { + return this->inputGeometry_; + } + + int getMinibatchSize() + { + return this->minibatchSz_; + } + + int getBackend() + { + return OCR_HOLISTIC_BACKEND_CAFFE; + } + void setPreprocessor(Ptr ptr) + { + CV_Assert(!ptr.empty()); + preprocessor_=ptr; + } + + Ptr getPreprocessor() + { + return preprocessor_; + } +}; + + +class DeepCNNTextDetectorDNNImpl: public DeepCNNTextDetector{ +protected: + + + void process_(Mat inputImage, Mat &outputMat) + { + // do forward pass and stores the output in outputMat + CV_Assert(outputMat.isContinuous()); + if (inputImage.channels() != this->inputChannelCount_) + CV_WARN("Number of input channel(s) in the model is not same as input"); + + +#ifdef HAVE_DNN + + Mat preprocessed; + this->preprocess(inputImage,preprocessed); + + net_->setInput(blobFromImage(preprocessed,1, this->inputGeometry_), "data"); + + Mat outputNet = this->net_->forward( ); + + this->outputGeometry_.height = outputNet.size[2]; + this->outputGeometry_.width = outputNet.size[3]; + this->outputChannelCount_ = outputNet.size[1]; + + outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); + float*outputMatData=(float*)(outputMat.data); + float*outputNetData=(float*)(outputNet.data); + int outputSz = this->outputChannelCount_ * this->outputGeometry_.height * this->outputGeometry_.width; + + memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); + + + + +#endif + } + + + +#ifdef HAVE_DNN + Ptr net_; +#endif + //Size inputGeometry_; + int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst + //int outputSize_; + //int inputHeight_; + //int inputWidth_; + //int inputChannel_; +public: + DeepCNNTextDetectorDNNImpl(const DeepCNNTextDetectorDNNImpl& dn): + minibatchSz_(dn.minibatchSz_){ + outputGeometry_=dn.outputGeometry_; + inputGeometry_=dn.inputGeometry_; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" +#ifdef HAVE_DNN + this->net_=dn.net_; +#endif + } + DeepCNNTextDetectorDNNImpl& operator=(const DeepCNNTextDetectorDNNImpl &dn) + { +#ifdef HAVE_DNN + this->net_=dn.net_; +#endif + this->setPreprocessor(dn.preprocessor_); + this->inputGeometry_=dn.inputGeometry_; + this->inputChannelCount_=dn.inputChannelCount_; + this->outputChannelCount_ = dn.outputChannelCount_; + // this->minibatchSz_=dn.minibatchSz_; + //this->outputGeometry_=dn.outputSize_; + this->preprocessor_=dn.preprocessor_; + this->outputGeometry_=dn.outputGeometry_; + return *this; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" + } + + DeepCNNTextDetectorDNNImpl(String 
modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz,int inputHeight=700,int inputWidth =700,int inputChannel =3) + :minibatchSz_(maxMinibatchSz) + { + + CV_Assert(this->minibatchSz_>0); + CV_Assert(fileExists(modelArchFilename)); + CV_Assert(fileExists(modelWeightsFilename)); + CV_Assert(!preprocessor.empty()); + this->setPreprocessor(preprocessor); +#ifdef HAVE_DNN + this->net_ = makePtr(readNetFromCaffe(modelArchFilename,modelWeightsFilename)); + + if (this->net_.empty()) + { + std::cerr << "Can't load network by using the following files: " << std::endl; + std::cerr << "prototxt: " << modelArchFilename << std::endl; + std::cerr << "caffemodel: " << modelWeightsFilename << std::endl; + //std::cerr << "bvlc_googlenet.caffemodel can be downloaded here:" << std::endl; + //std::cerr << "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel" << std::endl; + exit(-1); + } + + this->inputGeometry_.height =inputHeight; + this->inputGeometry_.width = inputWidth ;//inputLayer->width(); + this->inputChannelCount_ = inputChannel ;//inputLayer->channels(); + +#else + CV_Error(Error::StsError,"DNN module not available during compilation!"); +#endif + } + + + void detect(InputArray image, OutputArray Bbox_prob) + { + Size outSize = Size(this->outputGeometry_.height,outputGeometry_.width); + Bbox_prob.create(outSize,CV_32F); // dummy initialization is it needed + Mat outputMat = Bbox_prob.getMat(); + + process_(image.getMat(),outputMat); + //copy back to outputArray + outputMat.copyTo(Bbox_prob); + } + + Size getOutputGeometry() + { + return this->outputGeometry_; + } + Size getinputGeometry() + { + return this->inputGeometry_; + } + + int getMinibatchSize() + { + return this->minibatchSz_; + } + + int getBackend() + { + return OCR_HOLISTIC_BACKEND_DNN; + } + void setPreprocessor(Ptr ptr) + { + CV_Assert(!ptr.empty()); + preprocessor_=ptr; + } + + Ptr getPreprocessor() + { + return preprocessor_; + } +}; + +Ptr DeepCNNTextDetector::create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz,int backEnd) +{ + if(preprocessor.empty()) + { + // create a custom preprocessor with rawval + preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255); + // set the mean for the preprocessor + + Mat textbox_mean(1,3,CV_8U); + textbox_mean.at(0,0)=104; + textbox_mean.at(0,1)=117; + textbox_mean.at(0,2)=123; + preprocessor->set_mean(textbox_mean); + } + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_DEFAULT: + +#ifdef HAVE_CAFFE + return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + +#elif defined(HAVE_DNN) + return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,700,700,3)); +#else + CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); + return Ptr(); +#endif + case OCR_HOLISTIC_BACKEND_CAFFE: + + return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + break; + + case OCR_HOLISTIC_BACKEND_DNN: + return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,700,700,3)); + break; + + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); + return Ptr(); + break; + } + //return Ptr(); + +} + + +Ptr DeepCNNTextDetector::createTextBoxNet(String archFilename,String weightsFilename,int backEnd) +{ + + // create a custom preprocessor with rawval + Ptr 
preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255); + // set the mean for the preprocessor + + Mat textbox_mean(1,3,CV_8U); + textbox_mean.at(0,0)=104; + textbox_mean.at(0,1)=117; + textbox_mean.at(0,2)=123; + preprocessor->set_mean(textbox_mean); + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_DEFAULT: + +#ifdef HAVE_CAFFE + return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 1)); + +#elif defined(HAVE_DNN) + return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 1,700,700,3)); +#else + CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); + return Ptr(); +#endif + break; + case OCR_HOLISTIC_BACKEND_CAFFE: + return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 1)); + break; + case OCR_HOLISTIC_BACKEND_DNN: + return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 1,700,700,3)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); + return Ptr(); + break; + } + //return Ptr(); + +} + +void DeepCNNTextDetector::preprocess(const Mat& input,Mat& output) +{ + Size inputHtWd = Size(this->inputGeometry_.height,this->inputGeometry_.width); + this->preprocessor_->preprocess(input,output,inputHtWd,this->inputChannelCount_); +} + + + +} } //namespace text namespace cv diff --git a/modules/text/text_config.hpp.in b/modules/text/text_config.hpp.in index 30089bd3c55..81e624bab37 100644 --- a/modules/text/text_config.hpp.in +++ b/modules/text/text_config.hpp.in @@ -1,7 +1,4 @@ #ifndef __OPENCV_TEXT_CONFIG_HPP__ #define __OPENCV_TEXT_CONFIG_HPP__ -// HAVE OCR Tesseract -#cmakedefine HAVE_TESSERACT - -#endif \ No newline at end of file +#endif
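
A short usage note (editorial sketch, mirroring the factory code in text_detector.cpp and text_detectorCNN.cpp above): createTextBoxNet() wires a custom preprocessor with raw scale 255 and per-channel mean (104, 117, 123) in front of the network; the same detector can be assembled by hand when different preprocessing is needed:

    // Equivalent of DeepCNNTextDetector::createTextBoxNet(), spelled out
    // (assumes the model files are present in the working directory):
    cv::Ptr<cv::text::ImagePreprocessor> preproc =
            cv::text::ImagePreprocessor::createImageCustomPreprocessor(255);
    cv::Mat mean(1, 3, CV_8U); // BGR channel means used by the TextBoxes model
    mean.at<uchar>(0, 0) = 104;
    mean.at<uchar>(0, 1) = 117;
    mean.at<uchar>(0, 2) = 123;
    preproc->set_mean(mean);
    cv::Ptr<cv::text::DeepCNNTextDetector> detector =
            cv::text::DeepCNNTextDetector::create(
                    "textbox_deploy.prototxt", "textbox.caffemodel", preproc,
                    1 /* minibatch size */, cv::text::OCR_HOLISTIC_BACKEND_DEFAULT);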