From 9ae765a197d411a9016134cda0217a4a512aaabf Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Thu, 22 Jun 2017 18:31:12 +0200 Subject: [PATCH 01/31] Text detector class and Custom Image processor Class --- modules/text/CMakeLists.txt | 85 +- modules/text/FindCaffe.cmake | 14 + modules/text/FindGlog.cmake | 10 + modules/text/FindProtobuf.cmake | 10 + modules/text/FindTesseract.cmake | 24 + modules/text/README.md | 72 ++ modules/text/include/opencv2/text.hpp | 3 +- modules/text/include/opencv2/text/ocr.hpp | 849 +++++++++++++---- .../include/opencv2/text/textDetector.hpp | 235 +++++ modules/text/src/ocr_holistic.cpp | 879 ++++++++++++++++++ modules/text/src/text_detector.cpp | 643 +++++++++++++ modules/text/text_config.hpp.in | 10 +- 12 files changed, 2632 insertions(+), 202 deletions(-) create mode 100644 modules/text/FindCaffe.cmake create mode 100755 modules/text/FindGlog.cmake create mode 100644 modules/text/FindProtobuf.cmake create mode 100644 modules/text/FindTesseract.cmake create mode 100644 modules/text/include/opencv2/text/textDetector.hpp create mode 100644 modules/text/src/ocr_holistic.cpp create mode 100644 modules/text/src/text_detector.cpp diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt index 7ec4d246451..52bd828d905 100644 --- a/modules/text/CMakeLists.txt +++ b/modules/text/CMakeLists.txt @@ -1,24 +1,71 @@ set(the_description "Text Detection and Recognition") -ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d OPTIONAL opencv_highgui WRAP python) - -if(NOT CMAKE_CROSSCOMPILING OR OPENCV_FIND_TESSERACT) - set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) - find_package(Tesseract QUIET) - if(Tesseract_FOUND) - message(STATUS "Tesseract: YES") - set(HAVE_TESSERACT 1) - ocv_include_directories(${Tesseract_INCLUDE_DIR}) - ocv_target_link_libraries(${the_module} ${Tesseract_LIBRARIES}) - else() - message(STATUS "Tesseract: NO") - endif() +# Using cmake scripts and modules 
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}) + +set(TEXT_DEPS opencv_ml opencv_highgui opencv_imgproc opencv_core opencv_features2d opencv_calib3d) + +find_package(Caffe) +if(Caffe_FOUND) + message(STATUS "Caffe: YES") + set(HAVE_CAFFE 1) +else() + message(STATUS "Caffe: NO") +# list(APPEND TEXT_DEPS opencv_dnn) +endif() + +#internal dependencies +find_package(Protobuf) +if(Protobuf_FOUND) + message(STATUS "Protobuf: YES") + set(HAVE_PROTOBUF 1) +else() + message(STATUS "Protobuf: NO") +endif() + +find_package(Glog) +if(Glog_FOUND) + message(STATUS "Glog: YES") + set(HAVE_GLOG 1) +else() + message(STATUS "Glog: NO") +endif() + +ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_calib3d WRAP python) +#ocv_define_module(text ${TEXT_DEPS} WRAP python) + +#set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}) + +find_package(Tesseract) +if(${Tesseract_FOUND}) + message(STATUS "Tesseract: YES") + include_directories(${Tesseract_INCLUDE_DIR}) + target_link_libraries(opencv_text ${Tesseract_LIBS}) + add_definitions(-DHAVE_TESSERACT) +else() + message(STATUS "Tesseract: NO") endif() -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in - ${CMAKE_BINARY_DIR}/text_config.hpp @ONLY) -ocv_include_directories(${CMAKE_CURRENT_BINARY_DIR}) -ocv_add_testdata(samples/ contrib/text - FILES_MATCHING PATTERN "*.xml" PATTERN "*.xml.gz" REGEX "scenetext[0-9]+.jpg" -) + +if(HAVE_CAFFE AND HAVE_GLOG AND HAVE_PROTOBUF) + include_directories(${Caffe_INCLUDE_DIR}) + find_package(HDF5 COMPONENTS HL REQUIRED) + include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES}) + find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem) + include_directories(SYSTEM ${Boost_INCLUDE_DIR}) + include_directories(SYSTEM /usr/local/cuda-8.0/targets/x86_64-linux/include/ usr/local/cuda-8.0/include/ /usr/local/cuda-7.5/targets/x86_64-linux/include/ ) + 
link_directories(SYSTEM /usr/local/cuda-8.0/targets/x86_64-linux/lib/ usr/local/cuda-8.0/lib/ /usr/local/cuda-7.5/targets/x86_64-linux/lib/ /usr/lib/openblas-base/lib /usr/local/cuda-8.0/lib64) + list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES}) + target_link_libraries(opencv_text atlas blas ${Caffe_LIBS} ${Glog_LIBS} ${Protobuf_LIBS} ${HDF5_LIBRARIES} ${Boost_LIBRARIES}) + add_definitions(-DHAVE_CAFFE) +endif() #HAVE_CAFFE + +message(STATUS "TEXT CAFFE SEARCH") +if() + message(STATUS "TEXT NO CAFFE CONFLICT") +else() + message(STATUS "TEXT CAFFE CONFLICT") +endif() + diff --git a/modules/text/FindCaffe.cmake b/modules/text/FindCaffe.cmake new file mode 100644 index 00000000000..12948f62992 --- /dev/null +++ b/modules/text/FindCaffe.cmake @@ -0,0 +1,14 @@ +# Caffe package for CNN Triplet training +unset(Caffe_FOUND) + +find_path(Caffe_INCLUDE_DIR NAMES caffe/caffe.hpp caffe/common.hpp caffe/net.hpp caffe/proto/caffe.pb.h caffe/util/io.hpp caffe/vision_layers.hpp + HINTS + /usr/local/include) + +find_library(Caffe_LIBS NAMES caffe + HINTS + /usr/local/lib) + +if(Caffe_LIBS AND Caffe_INCLUDE_DIR) + set(Caffe_FOUND 1) +endif() diff --git a/modules/text/FindGlog.cmake b/modules/text/FindGlog.cmake new file mode 100755 index 00000000000..c30e9f4a6ab --- /dev/null +++ b/modules/text/FindGlog.cmake @@ -0,0 +1,10 @@ +#Required for Caffe +unset(Glog_FOUND) + +find_library(Glog_LIBS NAMES glog + HINTS + /usr/local/lib) + +if(Glog_LIBS) + set(Glog_FOUND 1) +endif() diff --git a/modules/text/FindProtobuf.cmake b/modules/text/FindProtobuf.cmake new file mode 100644 index 00000000000..6d0ad56a1f7 --- /dev/null +++ b/modules/text/FindProtobuf.cmake @@ -0,0 +1,10 @@ +#Protobuf package required for Caffe +unset(Protobuf_FOUND) + +find_library(Protobuf_LIBS NAMES protobuf + HINTS + /usr/local/lib) + +if(Protobuf_LIBS) + set(Protobuf_FOUND 1) +endif() diff --git a/modules/text/FindTesseract.cmake b/modules/text/FindTesseract.cmake new file mode 100644 index 00000000000..54c4a49297d 
--- /dev/null +++ b/modules/text/FindTesseract.cmake @@ -0,0 +1,24 @@ +# Tesseract OCR +unset(Tesseract_FOUND) + +find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h + HINTS + /usr/include + /usr/local/include) + +find_library(Tesseract_LIBRARY NAMES tesseract + HINTS + /usr/lib + /usr/local/lib) + +find_library(Lept_LIBRARY NAMES lept + HINTS + /usr/lib + /usr/local/lib) + +set(Tesseract_LIBS ${Tesseract_LIBRARY} ${Lept_LIBRARY}) +if(Tesseract_LIBS AND Tesseract_INCLUDE_DIR) + set(Tesseract_FOUND 1) +endif() + + diff --git a/modules/text/README.md b/modules/text/README.md index bbbad11a165..3a3a897f7c3 100644 --- a/modules/text/README.md +++ b/modules/text/README.md @@ -47,3 +47,75 @@ Notes 2. Tesseract configure script may fail to detect leptonica, so you may have to edit the configure script - comment off some if's around this message and retain only "then" branch. 3. You are encouraged to search the Net for some better pre-trained classifiers, as well as classifiers for other languages. + + +Word spotting CNN +================= + +Intro +----- + +A word spotting CNN is a CNN that takes an image assumed to contain a single word and provides a probability over a given vocabulary. +Although other backends will be supported, for the moment only the Caffe backend is supported. + + + + +Installation of Caffe backend +---------------------------- +The caffe wrapping backend has the requirements caffe does. +* Caffe can be built against OpenCV, if the caffe backend is enabled, a circular dependency arises. +The simplest solution is to build caffe without support for OpenCV. +* Only the OS supported by Caffe are supported by the backend. +The scripts describing the module have been developed in ubuntu 16.04 and assume such a system. +Other UNIX systems including OSX should be easy to adapt. 
+ +Sample script for building Caffe + +```bash +#!/bin/bash +SRCROOT="${HOME}/caffe_inst/" +mkdir -p "$SRCROOT" +cd "$SRCROOT" +git clone https://github.com/BVLC/caffe.git +cd caffe +git checkout 91b09280f5233cafc62954c98ce8bc4c204e7475 +git branch 91b09280f5233cafc62954c98ce8bc4c204e7475 +cat Makefile.config.example > Makefile.config +echo 'USE_OPENCV := 0' >> Makefile.config +echo 'INCLUDE_DIRS += /usr/include/hdf5/serial/' >> Makefile.config +echo 'LIBRARY_DIRS += /usr/lib/x86_64-linux-gnu/hdf5/serial/' >> Makefile.config + + +echo "--- /tmp/caffe/include/caffe/net.hpp 2017-05-28 04:55:47.929623902 +0200 ++++ caffe/distribute/include/caffe/net.hpp 2017-05-28 04:51:33.437090768 +0200 +@@ -234,6 +234,7 @@ + + template + friend class Net; ++ virtual ~Callback(){} + }; + const vector& before_forward() const { return before_forward_; } + void add_before_forward(Callback* value) { +">/tmp/cleanup_caffe.diff + +patch < /tmp/cleanup_caffe.diff + + +make -j 6 + +make pycaffe + +make distribute +``` + + +```bash +#!/bin/bash +cd $OPENCV_BUILD_DIR #You must set this +CAFFEROOT="${HOME}/caffe_inst/" #If you used the previous code to compile Caffe in ubuntu 16.04 + +cmake -DCaffe_LIBS:FILEPATH="$CAFFEROOT/caffe/distribute/lib/libcaffe.so" -DBUILD_opencv_ts:BOOL="0" -DBUILD_opencv_dnn:BOOL="0" -DBUILD_opencv_dnn_modern:BOOL="0" -DCaffe_INCLUDE_DIR:PATH="$CAFFEROOT/caffe/distribute/include" -DWITH_MATLAB:BOOL="0" -DBUILD_opencv_cudabgsegm:BOOL="0" -DWITH_QT:BOOL="1" -DBUILD_opencv_cudaoptflow:BOOL="0" -DBUILD_opencv_cudastereo:BOOL="0" -DBUILD_opencv_cudafilters:BOOL="0" -DBUILD_opencv_cudev:BOOL="1" -DOPENCV_EXTRA_MODULES_PATH:PATH="/home/anguelos/work/projects/opencv_gsoc/opencv_contrib/modules" ./ + + +``` diff --git a/modules/text/include/opencv2/text.hpp b/modules/text/include/opencv2/text.hpp index 945194a16b6..c4c2975b8dd 100644 --- a/modules/text/include/opencv2/text.hpp +++ b/modules/text/include/opencv2/text.hpp @@ -41,6 +41,7 @@ the use of this software, even if 
advised of the possibility of such damage. #include "opencv2/text/erfilter.hpp" #include "opencv2/text/ocr.hpp" +#include "opencv2/text/textDetector.hpp" /** @defgroup text Scene Text Detection and Recognition @@ -92,7 +93,7 @@ grouping horizontally aligned text, and the method proposed by Lluis Gomez and D in [Gomez13][Gomez14] for grouping arbitrary oriented text (see erGrouping). To see the text detector at work, have a look at the textdetection demo: - + @defgroup text_recognize Scene Text Recognition @} diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index 1261046cd07..9fc5403fdef 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -46,6 +46,10 @@ #include #include +#include +#include + + namespace cv { @@ -61,82 +65,126 @@ enum OCR_LEVEL_TEXTLINE }; -//base class BaseOCR declares a common API that would be used in a typical text recognition scenario +//base class BaseOCR declares a common API that would be used in a typical text +//recognition scenario class CV_EXPORTS_W BaseOCR { -public: + public: virtual ~BaseOCR() {}; - virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + + virtual void run(Mat& image, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0) = 0; - virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + + virtual void run(Mat& image, Mat& mask, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0) = 0; + + /** @brief Main functionality of the OCR Hierarchy. 
Subclasses provide + * default parameters for all parameters other than the input image. + */ + virtual String run(InputArray image){ + std::string res; + std::vector component_rects; + std::vector component_confidences; + std::vector component_texts; + Mat inputImage=image.getMat(); + this->run(inputImage,res,&component_rects,&component_texts, + &component_confidences,OCR_LEVEL_WORD); + return res; + } + }; -/** @brief OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++. +/** @brief OCRTesseract class provides an interface with the tesseract-ocr API + * (v3.02.02) in C++. Notice that it is compiled only when tesseract-ocr is correctly installed. @note - - (C++) An example of OCRTesseract recognition combined with scene text detection can be found - at the end_to_end_recognition demo: - - - (C++) Another example of OCRTesseract recognition combined with scene text detection can be - found at the webcam_demo: - + - (C++) An example of OCRTesseract recognition combined with scene text + detection can be found at the end_to_end_recognition demo: + + - (C++) Another example of OCRTesseract recognition combined with scene + text detection can be found at the webcam_demo: + */ class CV_EXPORTS_W OCRTesseract : public BaseOCR { public: /** @brief Recognize text using the tesseract-ocr API. - Takes image on input and returns recognized text in the output_text parameter. Optionally - provides also the Rects for individual text elements found (e.g. words), and the list of those - text elements with their confidence values. + Takes image on input and returns recognized text in the output_text + parameter. Optionally provides also the Rects for individual text elements + found (e.g. words), and the list of those text elements with their + confidence values. @param image Input image CV_8UC1 or CV_8UC3 + @param output_text Output text of the tesseract-ocr. 
- @param component_rects If provided the method will output a list of Rects for the individual - text elements found (e.g. words or text lines). - @param component_texts If provided the method will output a list of text strings for the - recognition of individual text elements found (e.g. words or text lines). - @param component_confidences If provided the method will output a list of confidence values - for the recognition of individual text elements found (e.g. words or text lines). + + @param component_rects If provided the method will output a list of Rects + for the individual text elements found (e.g. words or text lines). + + @param component_texts If provided the method will output a list of text + strings for the recognition of individual text elements found (e.g. words or + text lines). + + @param component_confidences If provided the method will output a list of + confidence values for the recognition of individual text elements found + (e.g. words or text lines). + @param component_level OCR_LEVEL_WORD (by default), or OCR_LEVEL_TEXT_LINE. 
*/ - virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + virtual void run (Mat& image, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0); - virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=0); + virtual void run (Mat& image, Mat& mask, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, + int component_level=0); // aliases for scripting - CV_WRAP String run(InputArray image, int min_confidence, int component_level=0); + CV_WRAP String run (InputArray image, int min_confidence, + int component_level=0); - CV_WRAP String run(InputArray image, InputArray mask, int min_confidence, int component_level=0); + CV_WRAP String run(InputArray image, InputArray mask, + int min_confidence, int component_level=0); CV_WRAP virtual void setWhiteList(const String& char_whitelist) = 0; - /** @brief Creates an instance of the OCRTesseract class. Initializes Tesseract. + /** @brief Creates an instance of the OCRTesseract class. Initializes + * Tesseract. + + * @param datapath the name of the parent directory of tessdata ended with + * "/", or NULL to use the system's default directory. + + * @param language an ISO 639-3 code or NULL will default to "eng". + + * @param char_whitelist specifies the list of characters used for + * recognition. NULL defaults to "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ". - @param datapath the name of the parent directory of tessdata ended with "/", or NULL to use the - system's default directory. - @param language an ISO 639-3 code or NULL will default to "eng". 
- @param char_whitelist specifies the list of characters used for recognition. NULL defaults to - "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ". - @param oem tesseract-ocr offers different OCR Engine Modes (OEM), by deffault - tesseract::OEM_DEFAULT is used. See the tesseract-ocr API documentation for other possible - values. - @param psmode tesseract-ocr offers different Page Segmentation Modes (PSM) tesseract::PSM_AUTO - (fully automatic layout analysis) is used. See the tesseract-ocr API documentation for other - possible values. + * @param oem tesseract-ocr offers different OCR Engine Modes (OEM), by + * default tesseract::OEM_DEFAULT is used. See the tesseract-ocr API + * documentation for other possible values. + + * @param psmode tesseract-ocr offers different Page Segmentation Modes + * (PSM) tesseract::PSM_AUTO (fully automatic layout analysis) is used. See + * the tesseract-ocr API documentation for other possible values. */ - CV_WRAP static Ptr create(const char* datapath=NULL, const char* language=NULL, - const char* char_whitelist=NULL, int oem=3, int psmode=3); + CV_WRAP static Ptr create (const char* datapath=NULL, + const char* language=NULL, + const char* char_whitelist=NULL, + int oem=3, int psmode=3); }; @@ -147,134 +195,156 @@ enum decoder_mode OCR_DECODER_VITERBI = 0 // Other algorithms may be added }; -/** @brief OCRHMMDecoder class provides an interface for OCR using Hidden Markov Models. +/** @brief OCRHMMDecoder class provides an interface for OCR using Hidden Markov + * Models. 
-@note - - (C++) An example on using OCRHMMDecoder recognition combined with scene text detection can - be found at the webcam_demo sample: - + * @note + * - (C++) An example on using OCRHMMDecoder recognition combined with scene + * text detection can be found at the webcam_demo sample: + * */ -class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR -{ -public: +class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR { + public: /** @brief Callback with the character classifier is made a class. - This way it hides the feature extractor and the classifier itself, so developers can write - their own OCR code. + * This way it hides the feature extractor and the classifier itself, so + * developers can write their own OCR code. - The default character classifier and feature extractor can be loaded using the utility funtion - loadOCRHMMClassifierNM and KNN model provided in - . - */ - class CV_EXPORTS_W ClassifierCallback - { - public: + * The default character classifier and feature extractor can be loaded using + * the utility funtion loadOCRHMMClassifierNM and KNN model provided in + * . + */ + class CV_EXPORTS_W ClassifierCallback{ + public: virtual ~ClassifierCallback() { } - /** @brief The character classifier must return a (ranked list of) class(es) id('s) + /** @brief The character classifier must return a (ranked list of) + * class(es) id('s) - @param image Input image CV_8UC1 or CV_8UC3 with a single letter. - @param out_class The classifier returns the character class categorical label, or list of - class labels, to which the input image corresponds. - @param out_confidence The classifier returns the probability of the input image - corresponding to each classes in out_class. + * @param image Input image CV_8UC1 or CV_8UC3 with a single letter. + * @param out_class The classifier returns the character class + * categorical label, or list of class labels, to which the input image + * corresponds. 
+ + * @param out_confidence The classifier returns the probability of the + * input image corresponding to each classes in out_class. */ - virtual void eval( InputArray image, std::vector& out_class, std::vector& out_confidence); + virtual void eval (InputArray image, std::vector& out_class, + std::vector& out_confidence); }; -public: /** @brief Recognize text using HMM. - Takes binary image on input and returns recognized text in the output_text parameter. Optionally - provides also the Rects for individual text elements found (e.g. words), and the list of those - text elements with their confidence values. + * Takes binary image on input and returns recognized text in the output_text + * parameter. Optionally provides also the Rects for individual text elements + * found (e.g. words), and the list of those text elements with their + * confidence values. - @param image Input binary image CV_8UC1 with a single text line (or word). + * @param image Input binary image CV_8UC1 with a single text line (or word). - @param output_text Output text. Most likely character sequence found by the HMM decoder. + * @param output_text Output text. Most likely character sequence found by + * the HMM decoder. - @param component_rects If provided the method will output a list of Rects for the individual - text elements found (e.g. words). + * @param component_rects If provided the method will output a list of Rects + * for the individual text elements found (e.g. words). - @param component_texts If provided the method will output a list of text strings for the - recognition of individual text elements found (e.g. words). + * @param component_texts If provided the method will output a list of text + * strings for the recognition of individual text elements found (e.g. words) + * . - @param component_confidences If provided the method will output a list of confidence values - for the recognition of individual text elements found (e.g. words). 
+ * @param component_confidences If provided the method will output a list of + * confidence values for the recognition of individual text elements found + * (e.g. words). - @param component_level Only OCR_LEVEL_WORD is supported. - */ - virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=0); + * @param component_level Only OCR_LEVEL_WORD is supported. + */ + virtual void run (Mat& image, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, + int component_level=0); /** @brief Recognize text using HMM. - Takes an image and a mask (where each connected component corresponds to a segmented character) - on input and returns recognized text in the output_text parameter. Optionally - provides also the Rects for individual text elements found (e.g. words), and the list of those - text elements with their confidence values. + * Takes an image and a mask (where each connected component corresponds to a + * segmented character) on input and returns recognized text in the + * output_text parameter. Optionally provides also the Rects for individual + * text elements found (e.g. words), and the list of those text elements with + * their confidence values. - @param image Input image CV_8UC1 or CV_8UC3 with a single text line (or word). - @param mask Input binary image CV_8UC1 same size as input image. Each connected component in mask corresponds to a segmented character in the input image. + * @param image Input image CV_8UC1 or CV_8UC3 with a single text line + * (or word). - @param output_text Output text. Most likely character sequence found by the HMM decoder. + * @param mask Input binary image CV_8UC1 same size as input image. Each + * connected component in mask corresponds to a segmented character in the + * input image. 
- @param component_rects If provided the method will output a list of Rects for the individual - text elements found (e.g. words). + * @param output_text Output text. Most likely character sequence found by + * the HMM decoder. - @param component_texts If provided the method will output a list of text strings for the - recognition of individual text elements found (e.g. words). + * @param component_rects If provided the method will output a list of Rects + * for the individual text elements found (e.g. words). - @param component_confidences If provided the method will output a list of confidence values - for the recognition of individual text elements found (e.g. words). + * @param component_texts If provided the method will output a list of text + * strings for the recognition of individual text elements found (e.g. words) + * . - @param component_level Only OCR_LEVEL_WORD is supported. - */ - virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + * @param component_confidences If provided the method will output a list of + * confidence values for the recognition of individual text elements found + * (e.g. words). + + * @param component_level Only OCR_LEVEL_WORD is supported. + */ + virtual void run(Mat& image, Mat& mask, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0); // aliases for scripting - CV_WRAP String run(InputArray image, int min_confidence, int component_level=0); + CV_WRAP String run(InputArray image, + int min_confidence, + int component_level=0); - CV_WRAP String run(InputArray image, InputArray mask, int min_confidence, int component_level=0); + CV_WRAP String run(InputArray image, + InputArray mask, + int min_confidence, + int component_level=0); - /** @brief Creates an instance of the OCRHMMDecoder class. 
Initializes HMMDecoder. + /** @brief Creates an instance of the OCRHMMDecoder class. Initializes + * HMMDecoder. - @param classifier The character classifier with built in feature extractor. + * @param classifier The character classifier with built in feature + * extractor. - @param vocabulary The language vocabulary (chars when ascii english text). vocabulary.size() - must be equal to the number of classes of the classifier. + * @param vocabulary The language vocabulary (chars when ascii english text) + * . vocabulary.size() must be equal to the number of classes of the + * classifier. - @param transition_probabilities_table Table with transition probabilities between character - pairs. cols == rows == vocabulary.size(). + * @param transition_probabilities_table Table with transition probabilities + * between character pairs. cols == rows == vocabulary.size(). - @param emission_probabilities_table Table with observation emission probabilities. cols == - rows == vocabulary.size(). + * @param emission_probabilities_table Table with observation emission + * probabilities. cols == rows == vocabulary.size(). - @param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available for the moment - (). + * @param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available + * for the moment (). 
*/ - static Ptr create(const Ptr classifier,// The character classifier with built in feature extractor - const std::string& vocabulary, // The language vocabulary (chars when ascii english text) - // size() must be equal to the number of classes - InputArray transition_probabilities_table, // Table with transition probabilities between character pairs - // cols == rows == vocabulari.size() - InputArray emission_probabilities_table, // Table with observation emission probabilities - // cols == rows == vocabulari.size() - decoder_mode mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment) - - CV_WRAP static Ptr create(const Ptr classifier,// The character classifier with built in feature extractor - const String& vocabulary, // The language vocabulary (chars when ascii english text) - // size() must be equal to the number of classes - InputArray transition_probabilities_table, // Table with transition probabilities between character pairs - // cols == rows == vocabulari.size() - InputArray emission_probabilities_table, // Table with observation emission probabilities - // cols == rows == vocabulari.size() - int mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment) - -protected: + static Ptr create( + const Ptr classifier, // The character classifier with built in feature extractor + const std::string& vocabulary, // The language vocabulary (chars when ascii english text) size() must be equal to the number of classes + InputArray transition_probabilities_table, // Table with transition probabilities between character pairs cols == rows == vocabulari.size() + InputArray emission_probabilities_table, // Table with observation emission probabilities cols == rows == vocabulari.size() + decoder_mode mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment) + + CV_WRAP static Ptr create( + const Ptr classifier, // The character classifier with built in feature extractor + const String& 
vocabulary, // The language vocabulary (chars when ascii english text) size() must be equal to the number of classes + InputArray transition_probabilities_table, // Table with transition probabilities between character pairs cols == rows == vocabulari.size() + InputArray emission_probabilities_table, // Table with observation emission probabilities cols == rows == vocabulari.size() + int mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment) + + protected: Ptr classifier; std::string vocabulary; @@ -283,76 +353,98 @@ class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR decoder_mode mode; }; -/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object. +/** @brief Allow to implicitly load the default character classifier when + * creating an OCRHMMDecoder object. -@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml) + * @param filename The XML or YAML file with the classifier model (e.g. + * OCRHMM_knn_model_data.xml) -The KNN default classifier is based in the scene text recognition method proposed by Lukás Neumann & -Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a -fixed size, while retaining the centroid and aspect ratio, in order to extract a feature vector -based on gradient orientations along the chain-code of its perimeter. Then, the region is classified -using a KNN model trained with synthetic data of rendered characters with different standard font -types. + * The KNN default classifier is based in the scene text recognition method + * proposed by Lukás Neumann & Jiri Matas in [Neumann11b]. Basically, the region + * (contour) in the input image is normalized to a fixed size, while retaining + * the centroid and aspect ratio, in order to extract a feature vector based on + * gradient orientations along the chain-code of its perimeter. 
Then, the region + * is classified using a KNN model trained with synthetic data of rendered + * characters with different standard font types. */ +CV_EXPORTS_W Ptr loadOCRHMMClassifierNM ( + const String& filename); -CV_EXPORTS_W Ptr loadOCRHMMClassifierNM(const String& filename); +/** @brief Allow to implicitly load the default character classifier when + * creating an OCRHMMDecoder object. -/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object. + * @param filename The XML or YAML file with the classifier model (e.g. + * OCRBeamSearch_CNN_model_data.xml.gz) -@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz) - -The CNN default classifier is based in the scene text recognition method proposed by Adam Coates & -Andrew NG in [Coates11a]. The character classifier consists in a Single Layer Convolutional Neural Network and -a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions -at each window location. + * The CNN default classifier is based in the scene text recognition method + * proposed by Adam Coates & Andrew NG in [Coates11a]. The character classifier + * consists in a Single Layer Convolutional Neural Network and a linear + * classifier. It is applied to the input image in a sliding window fashion, + * providing a set of recognitions at each window location. */ -CV_EXPORTS_W Ptr loadOCRHMMClassifierCNN(const String& filename); +CV_EXPORTS_W Ptr loadOCRHMMClassifierCNN ( + const String& filename); //! @} -/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon). - * +/** @brief Utility function to create a tailored language model transitions + * table from a given list of words (lexicon). + * @param vocabulary The language vocabulary (chars when ascii english text). 
- * + * @param lexicon The list of words that are expected to be found in a particular image. - * - * @param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size(). - * - * The function calculate frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods. + + * @param transition_probabilities_table Output table with transition + * probabilities between character pairs. cols == rows == vocabulary.size(). + + * The function calculate frequency statistics of character pairs from the given + * lexicon and fills the output transition_probabilities_table with them. The + * transition_probabilities_table can be used as input in the + * OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods. * @note - * - (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) : - * + * - (C++) An alternative would be to load the default generic language + * transition table provided in the text module samples folder (created + * from ispell 42869 english words list) : + * **/ -CV_EXPORTS void createOCRHMMTransitionsTable(std::string& vocabulary, std::vector& lexicon, OutputArray transition_probabilities_table); - -CV_EXPORTS_W Mat createOCRHMMTransitionsTable(const String& vocabulary, std::vector& lexicon); +CV_EXPORTS void createOCRHMMTransitionsTable ( + std::string& vocabulary, std::vector& lexicon, + OutputArray transition_probabilities_table); +CV_EXPORTS_W Mat createOCRHMMTransitionsTable ( + const String& vocabulary, std::vector& lexicon); /* OCR BeamSearch Decoder */ -/** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam Search algorithm. 
+/** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam + * Search algorithm. @note - - (C++) An example on using OCRBeamSearchDecoder recognition combined with scene text detection can - be found at the demo sample: - + - (C++) An example on using OCRBeamSearchDecoder recognition combined with + scene text detection can be found at the demo sample: + */ -class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR -{ -public: + + +/* Forward declaration of class that can be used to generate an OCRBeamSearchDecoder::ClassifierCallbac */ +class TextImageClassifier; + +class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{ + + public: /** @brief Callback with the character classifier is made a class. - This way it hides the feature extractor and the classifier itself, so developers can write - their own OCR code. + * This way it hides the feature extractor and the classifier itself, so + * developers can write their own OCR code. - The default character classifier and feature extractor can be loaded using the utility funtion - loadOCRBeamSearchClassifierCNN with all its parameters provided in - . + * The default character classifier and feature extractor can be loaded + * using the utility funtion loadOCRBeamSearchClassifierCNN with all its + * parameters provided in + * . 
*/ - class CV_EXPORTS_W ClassifierCallback - { - public: + class CV_EXPORTS_W ClassifierCallback{ + public: virtual ~ClassifierCallback() { } /** @brief The character classifier must return a (ranked list of) class(es) id('s) @@ -364,8 +456,8 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR */ virtual void eval( InputArray image, std::vector< std::vector >& recognition_probabilities, std::vector& oversegmentation ); - int getWindowSize() {return 0;} - int getStepSize() {return 0;} + virtual int getWindowSize() {return 0;} + virtual int getStepSize() {return 0;} }; public: @@ -421,6 +513,7 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR @param beam_size Size of the beam in Beam Search algorithm. */ + static Ptr create(const Ptr classifier,// The character classifier with built in feature extractor const std::string& vocabulary, // The language vocabulary (chars when ascii english text) // size() must be equal to the number of classes @@ -441,6 +534,44 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment) int beam_size = 500); // Size of the beam in Beam Search algorithm + /** @brief This method allows to plug a classifier that is derivative of TextImageClassifier in to + * OCRBeamSearchDecoder as a ClassifierCallback. + + @param classifier A pointer to a TextImageClassifier decendent + + @param alphabet The language alphabet one char per symbol. alphabet.size() must be equal to the number of classes + of the classifier. In future editinons it should be replaced with a vector of strings. + + @param transition_probabilities_table Table with transition probabilities between character + pairs. cols == rows == alphabet.size(). + + @param emission_probabilities_table Table with observation emission probabilities. cols == + rows == alphabet.size(). + + @param windowWidth The width of the windows to which the sliding window will be iterated. 
The height will + be the height of the image. The windows might be resized to fit the classifiers input by the classifiers + preprocessor. + + @param windowStep The step for the sliding window + + @param mode HMM Decoding algorithm (only Viterbi for the moment) + + @param beam_size Size of the beam in Beam Search algorithm + */ +// CV_WRAP static Ptr create(const Ptr classifier, // The character classifier with built in feature extractor +// String alphabet, // The language alphabet one char per symbol +// // size() must be equal to the number of classes +// InputArray transition_probabilities_table, // Table with transition probabilities between character pairs +// // cols == rows == alphabet.size() +// InputArray emission_probabilities_table, // Table with observation emission probabilities +// // cols == rows == alphabet.size() +// int windowWidth, // The width of the windows to which the sliding window will be iterated. +// // The height will be the height of the image. The windows might be resized to +// // fit the classifiers input by the classifiers preprocessor +// int windowStep = 1 , // The step for the sliding window +// int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment) +// int beam_size = 500); // Size of the beam in Beam Search algorithm + protected: Ptr classifier; @@ -465,6 +596,364 @@ CV_EXPORTS_W Ptr loadOCRBeamSearchClas //! 
@} -} -} + +//Classifiers should provide diferent backends +//For the moment only caffe is implemeted +enum{ + OCR_HOLISTIC_BACKEND_NONE, + OCR_HOLISTIC_BACKEND_CAFFE +}; + +class TextImageClassifier; + +/** + * @brief The ImagePreprocessor class + */ +class CV_EXPORTS_W ImagePreprocessor{ +protected: + virtual void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels)=0; + virtual void set_mean_(Mat){} + +public: + virtual ~ImagePreprocessor(){} + + /** @brief this method in provides public acces to the preprocessing with respect to a specific + * classifier + * + * This method's main use would be to use the preprocessor without feeding it to a classifier. + * Determining the exact behavior of a preprocessor is the main motivation for this. + * + * @param input an image without any constraints + * + * @param output in most cases an image of fixed depth size and whitened + * + * @param sz the size to which the image would be resize if the preprocessor resizes inputs + * + * @param outputChannels the number of channels for the output image + */ + CV_WRAP void preprocess(InputArray input,OutputArray output,Size sz,int outputChannels); + + CV_WRAP void set_mean(Mat mean); + + /** @brief Creates a functor that only resizes and changes the channels of the input + * without further processing. + * + * @return shared pointer to the generated preprocessor + */ + CV_WRAP static Ptr createResizer(); + + /** @brief + * + * @param sigma + * + * @return shared pointer to generated preprocessor + */ + CV_WRAP static Ptr createImageStandarizer(double sigma); + + /** @brief + * + * @return shared pointer to generated preprocessor + */ + CV_WRAP static Ptr createImageMeanSubtractor(InputArray meanImg); + + CV_WRAP static PtrcreateImageCustomPreprocessor(double rawval=1.0,String channel_order="BGR"); + + friend class TextImageClassifier; + +}; + +/** @brief Abstract class that implements the classifcation of text images. 
+ * + * The interface is generic enough to describe any image classifier. And allows + * to take advantage of compouting in batches. While word classifiers are the default + * networks, any image classifers should work. + * + */ +class CV_EXPORTS_W TextImageClassifier +{ +protected: + Size inputGeometry_; + Size outputGeometry_; + int channelCount_; + Ptr preprocessor_; + /** @brief all image preprocessing is handled here including whitening etc. + * + * @param input the image to be preprocessed for the classifier. If the depth + * is CV_U8 values should be in [0,255] otherwise values are assumed to be in [0,1] + * + * @param output reference to the image to be fed to the classifier, the preprocessor will + * resize the image to the apropriate size and convert it to the apropriate depth\ + * + * The method preprocess should never be used externally, it is up to classify and classifyBatch + * methods to employ it. + */ + virtual void preprocess(const Mat& input,Mat& output); +public: + virtual ~TextImageClassifier() {} + + /** @brief + */ + CV_WRAP virtual void setPreprocessor(Ptr ptr); + + /** @brief + */ + CV_WRAP Ptr getPreprocessor(); + + /** @brief produces a class confidence row-vector given an image + */ + CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0; + /** @brief produces a list of bounding box given an image + */ + + CV_WRAP virtual void detect(InputArray image, OutputArray classProbabilities) = 0; + + /** @brief produces a matrix containing class confidence row-vectors given an collection of images + */ + CV_WRAP virtual void classifyBatch(InputArrayOfArrays image, OutputArray classProbabilities) = 0; + + /** @brief simple getter method returning the number of channels each input sample has + */ + CV_WRAP virtual int getInputChannelCount(){return this->channelCount_;} + + /** @brief simple getter method returning the size of the input sample + */ + CV_WRAP virtual Size getInputSize(){return this->inputGeometry_;} + + 
/** @brief simple getter method returning the size of the oputput row-vector + */ + CV_WRAP virtual int getOutputSize()=0; + CV_WRAP virtual Size getOutputGeometry()=0; + + /** @brief simple getter method returning the size of the minibatches for this classifier. + * If not applicabe this method should return 1 + */ + CV_WRAP virtual int getMinibatchSize()=0; + + friend class ImagePreprocessor; +}; + + + +class CV_EXPORTS_W DeepCNN:public TextImageClassifier +{ + /** @brief Class that uses a pretrained caffe model for word classification. + * + * This network is described in detail in: + * Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015 + * http://arxiv.org/abs/1412.1842 + */ +public: + virtual ~DeepCNN() {}; + + /** @brief Constructs a DeepCNN object from a caffe pretrained model + * + * @param archFilename is the path to the prototxt file containing the deployment model architecture description. + * + * @param weightsFilename is the path to the pretrained weights of the model in binary fdorm. This file can be + * very large, up to 2GB. + * + * @param preprocessor is a pointer to the instance of a ImagePreprocessor implementing the preprocess_ protecteed method; + * + * @param minibatchSz the maximum number of samples that can processed in parallel. In practice this parameter + * has an effect only when computing in the GPU and should be set with respect to the memory available in the GPU. + * + * @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is + * the only option + */ + CV_WRAP static Ptr create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); + + /** @brief Constructs a DeepCNN intended to be used for word spotting. + * + * This method loads a pretrained classifier and couples him with a preprocessor that standarises pixels with a + * deviation of 113. 
The architecture file can be downloaded from: + * + * While the weights can be downloaded from: + * + * The words assigned to the network outputs are available at: + * + * + * @param archFilename is the path to the prototxt file containing the deployment model architecture description. + * When employing OCR_HOLISTIC_BACKEND_CAFFE this is the path to the deploy ".prototxt". + * + * @param weightsFilename is the path to the pretrained weights of the model. When employing + * OCR_HOLISTIC_BACKEND_CAFFE this is the path to the ".caffemodel" file. This file can be very large, the + * pretrained DictNet uses 2GB. + * + * @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is + * the only option + */ + CV_WRAP static Ptr createDictNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); + +}; + +namespace cnn_config{ +namespace caffe_backend{ + +/** @brief Prompts Caffe on the computation device beeing used + * + * Caffe can only be controlled globally on whether the GPU or the CPU is used has a + * global behavior. This function queries the current state of caffe. + * If the module is built without caffe, this method throws an exception. + * + * @return true if caffe is computing on the GPU, false if caffe is computing on the CPU + */ +CV_EXPORTS_W bool getCaffeGpuMode(); + +/** @brief Sets the computation device beeing used by Caffe + * + * Caffe can only be controlled globally on whether the GPU or the CPU is used has a + * global behavior. This function queries the current state of caffe. + * If the module is built without caffe, this method throws an exception. + * + * @param useGpu set to true for caffe to be computing on the GPU, false if caffe is + * computing on the CPU + */ +CV_EXPORTS_W void setCaffeGpuMode(bool useGpu); + +/** @brief Provides runtime information on whether Caffe support was compiled in. 
+ * + * The text module API is the same regardless of whether CAffe was available or not + * During compilation. When methods that require Caffe are invocked while Caffe support + * is not compiled in, exceptions are thrown. This method allows to test whether the + * text module was built with caffe during runtime. + * + * @return true if Caffe support for the the text module was provided during compilation, + * false if Caffe was unavailable. + */ +CV_EXPORTS_W bool getCaffeAvailable(); + +}//caffe +}//cnn_config + +/** @brief OCRHolisticWordRecognizer class provides the functionallity of segmented wordspotting. + * Given a predefined vocabulary , a TextImageClassifier is employed to select the most probable + * word given an input image. + * + * This class implements the logic of providing transcriptions given a vocabulary and and an image + * classifer. The classifier has to be any TextImageClassifier but the classifier for which this + * class was built is the DictNet. In order to load it the following files should be downloaded: + + * + * + * + */ +class CV_EXPORTS_W OCRHolisticWordRecognizer : public BaseOCR +{ +public: + virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + int component_level=OCR_LEVEL_WORD)=0; + + /** @brief Recognize text using a segmentation based word-spotting/classifier cnn. + + Takes image on input and returns recognized text in the output_text parameter. Optionally + provides also the Rects for individual text elements found (e.g. words), and the list of those + text elements with their confidence values. + + @param image Input image CV_8UC1 or CV_8UC3 + + @param mask is totally ignored and is only available for compatibillity reasons + + @param output_text Output text of the the word spoting, always one that exists in the dictionary. 
+ + @param component_rects Not applicable for word spotting can be be NULL if not, a single elemnt will + be put in the vector. + + @param component_texts Not applicable for word spotting can be be NULL if not, a single elemnt will + be put in the vector. + + @param component_confidences Not applicable for word spotting can be be NULL if not, a single elemnt will + be put in the vector. + + @param component_level must be OCR_LEVEL_WORD. + */ + + virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + int component_level=OCR_LEVEL_WORD)=0; + + + /** + @brief Method that provides a quick and simple interface to a single word image classifcation + + @param inputImage an image expected to be a CV_U8C1 or CV_U8C3 of any size assumed to contain a single word + + @param transcription an opencv string that will store the detected word transcription + + @param confidence a double that will be updated with the confidence the classifier has for the selected word + */ + CV_WRAP virtual void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)=0; + + /** + @brief Method that provides a quick and simple interface to a multiple word image classifcation taking advantage + the classifiers parallel capabilities. + + @param inputImageList an list of images expected to be a CV_U8C1 or CV_U8C3 each image can be of any size and is assumed + to contain a single word. + + @param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each + input image + + @param confidences a vector of double that will be updated with the confidence the classifier has for each of the + selected words. 
+ */ + CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector& transcriptions,CV_OUT std::vector& confidences)=0; + + + /** + @brief simple getter for the vocabulary employed + */ + CV_WRAP virtual const std::vector& getVocabulary()=0; + + /** @brief simple getter for the preprocessing functor + */ + CV_WRAP virtual Ptr getClassifier()=0; + + /** @brief Creates an instance of the OCRHolisticWordRecognizer class. + + @param classifierPtr an instance of TextImageClassifier, normaly a DeepCNN instance + + @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line + in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize + of the classifier. + */ + CV_WRAP static Ptr create(Ptr classifierPtr,String vocabularyFilename); + + + /** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DeepCNN classifier. + + @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. + + @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. + + @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line + in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize + of the classifier. 
+ */ + CV_WRAP static Ptr create(String modelArchFilename, String modelWeightsFilename, String vocabularyFilename); + + /** @brief + * + * @param classifierPtr + * + * @param vocabulary + */ + CV_WRAP static Ptr create(Ptr classifierPtr,const std::vector& vocabulary); + + /** @brief + * + * @param modelArchFilename + * + * @param modelWeightsFilename + * + * @param vocabulary + */ + CV_WRAP static Ptr create (String modelArchFilename, String modelWeightsFilename, const std::vector& vocabulary); +}; + + +}//namespace text +}//namespace cv + + #endif // _OPENCV_TEXT_OCR_HPP_ diff --git a/modules/text/include/opencv2/text/textDetector.hpp b/modules/text/include/opencv2/text/textDetector.hpp new file mode 100644 index 00000000000..262795733d9 --- /dev/null +++ b/modules/text/include/opencv2/text/textDetector.hpp @@ -0,0 +1,235 @@ +/*M////////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__ +#define __OPENCV_TEXT_TEXTDETECTOR_HPP__ + +#include +#include +#include +#include +#include"ocr.hpp" + + +namespace cv +{ +namespace text +{ + +//! @addtogroup text_recognize +//! @{ + + + +//base class BaseDetector declares a common API that would be used in a typical text +//recognition scenario +class CV_EXPORTS_W BaseDetector +{ + public: + virtual ~BaseDetector() {}; + + virtual void run(Mat& image, + std::vector* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=0) = 0; + + virtual void run(Mat& image, Mat& mask, + std::vector* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=0) = 0; + + /** @brief Main functionality of the OCR Hierarchy. 
Subclasses provide + * default parameters for all parameters other than the input image. + */ +// virtual std::vector* run(InputArray image){ +// //std::string res; +// std::vector component_rects; +// std::vector component_confidences; +// //std::vector component_texts; +// Mat inputImage=image.getMat(); +// this->run(inputImage,&component_rects, +// &component_confidences,OCR_LEVEL_WORD); +// return *component_rects; +// } + +}; + + +//Classifiers should provide diferent backends +//For the moment only caffe is implemeted +//enum{ +// OCR_HOLISTIC_BACKEND_NONE, +// OCR_HOLISTIC_BACKEND_CAFFE +//}; + + + + + +/** @brief OCRHolisticWordRecognizer class provides the functionallity of segmented wordspotting. + * Given a predefined vocabulary , a TextImageClassifier is employed to select the most probable + * word given an input image. + * + * This class implements the logic of providing transcriptions given a vocabulary and and an image + * classifer. The classifier has to be any TextImageClassifier but the classifier for which this + * class was built is the DictNet. In order to load it the following files should be downloaded: + + * + * + * + */ +class CV_EXPORTS_W textDetector : public BaseDetector +{ +public: + virtual void run(Mat& image, std::vector* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=OCR_LEVEL_WORD)=0; + + /** @brief Recognize text using a segmentation based word-spotting/classifier cnn. + + Takes image on input and returns recognized text in the output_text parameter. Optionally + provides also the Rects for individual text elements found (e.g. words), and the list of those + text elements with their confidence values. + + @param image Input image CV_8UC1 or CV_8UC3 + + @param mask is totally ignored and is only available for compatibillity reasons + + @param output_text Output text of the the word spoting, always one that exists in the dictionary. 
+ + @param component_rects Not applicable for word spotting can be be NULL if not, a single elemnt will + be put in the vector. + + @param component_texts Not applicable for word spotting can be be NULL if not, a single elemnt will + be put in the vector. + + @param component_confidences Not applicable for word spotting can be be NULL if not, a single elemnt will + be put in the vector. + + @param component_level must be OCR_LEVEL_WORD. + */ + + virtual void run(Mat& image, Mat& mask, std::vector* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=OCR_LEVEL_WORD)=0; + + + /** + @brief Method that provides a quick and simple interface to a single word image classifcation + + @param inputImage an image expected to be a CV_U8C1 or CV_U8C3 of any size + + @param transcription an opencv string that will store the detected word transcription + + @param confidence a double that will be updated with the confidence the classifier has for the selected word + */ + CV_WRAP virtual void textDetectInImage(InputArray inputImage,CV_OUT std::vector& Bbox,CV_OUT std::vector& confidence)=0; + + /** + @brief Method that provides a quick and simple interface to a multiple word image classifcation taking advantage + the classifiers parallel capabilities. + + @param inputImageList an list of images expected to be a CV_U8C1 or CV_U8C3 each image can be of any size and is assumed + to contain a single word. + + @param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each + input image + + @param confidences a vector of double that will be updated with the confidence the classifier has for each of the + selected words. 
+ */ + //CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector& transcriptions,CV_OUT std::vector& confidences)=0; + + + /** @brief simple getter for the preprocessing functor + */ + CV_WRAP virtual Ptr getClassifier()=0; + + /** @brief Creates an instance of the OCRHolisticWordRecognizer class. + + @param classifierPtr an instance of TextImageClassifier, normaly a DeepCNN instance + + @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line + in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize + of the classifier. + */ + CV_WRAP static Ptr create(Ptr classifierPtr); + + + /** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DeepCNN classifier. + + @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. + + @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. + + @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line + in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize + of the classifier. 
+ */ + CV_WRAP static Ptr create(String modelArchFilename, String modelWeightsFilename); + + /** @brief + * + * @param classifierPtr + * + * @param vocabulary + */ + // CV_WRAP static Ptr create(Ptr classifierPtr,const std::vector& vocabulary); + + /** @brief + * + * @param modelArchFilename + * + * @param modelWeightsFilename + * + * @param vocabulary + */ + // CV_WRAP static Ptr create (String modelArchFilename, String modelWeightsFilename, const std::vector& vocabulary); +}; + + +}//namespace text +}//namespace cv + + +#endif // _OPENCV_TEXT_OCR_HPP_ diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp new file mode 100644 index 00000000000..9791e62bbf5 --- /dev/null +++ b/modules/text/src/ocr_holistic.cpp @@ -0,0 +1,879 @@ +#include "precomp.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/highgui.hpp" +#include "opencv2/core.hpp" + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifdef HAVE_CAFFE +#include "caffe/caffe.hpp" +#endif + +namespace cv { namespace text { + +//Maybe OpenCV has a routine better suited +inline bool fileExists (String filename) { + std::ifstream f(filename.c_str()); + return f.good(); +} + +//************************************************************************************ +//****************** ImagePreprocessor ******************************************* +//************************************************************************************ + +void ImagePreprocessor::preprocess(InputArray input,OutputArray output,Size sz,int outputChannels){ + Mat inpImg=input.getMat(); + Mat outImg; + this->preprocess_(inpImg,outImg,sz,outputChannels); + outImg.copyTo(output); +} +void ImagePreprocessor::set_mean(Mat mean){ + + + this->set_mean_(mean); + +} + + +class ResizerPreprocessor: public ImagePreprocessor{ +protected: + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and 
depth conversions in ImageProcessor class + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1){ + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U){ + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + } + //void set_mean_(Mat m){} +public: + ResizerPreprocessor(){} + ~ResizerPreprocessor(){} +}; + +class StandarizerPreprocessor: public ImagePreprocessor{ +protected: + double sigma_; + //void set_mean_(Mat M){} + + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + 
tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + + Scalar mean,dev; + meanStdDev(output,mean,dev); + subtract(output,mean[0],output); + divide(output,(dev[0]/sigma_),output); + } +public: + StandarizerPreprocessor(double sigma):sigma_(sigma){} + ~StandarizerPreprocessor(){} + +}; + +class customPreprocessor:public ImagePreprocessor{ +protected: + + double rawval_; + Mat mean_; + String channel_order_; + + void set_mean_(Mat imMean_){ + + imMean_.copyTo(this->mean_); + + + } + + void set_raw_scale(int rawval){ + rawval_ = rawval; + + } + void set_channels(String channel_order){ + channel_order_=channel_order; + } + + + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + tmpInput.convertTo(output,CV_32FC3,1/255.0); + else + tmpInput.convertTo(output,CV_32FC1); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ 
==1) + tmpInput.convertTo(output, CV_32FC1); + else + tmpInput.convertTo(output, CV_32FC1,rawval_); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + tmpInput.convertTo(output,CV_32FC3,1/255.0); + else + tmpInput.convertTo(output,CV_32FC1); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ ==1) + tmpInput.convertTo(output, CV_32FC1); + else + tmpInput.convertTo(output, CV_32FC1,rawval_); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + input.convertTo(output,CV_32FC1,1/255.0); + else + input.convertTo(output,CV_32FC1); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ ==1) + input.convertTo(output, CV_32FC1); + else + input.convertTo(output, CV_32FC1,rawval_); + } + }else + { + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + input.convertTo(output,CV_32FC3,1/255.0); + else + input.convertTo(output,CV_32FC3); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ ==1) + input.convertTo(output, CV_32FC3); + else + input.convertTo(output, CV_32FC3,rawval_); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + + if (!this->mean_.empty()){ + + Scalar mean_s(this->mean_.at(0,0),this->mean_.at(0,1),this->mean_.at(0,2)); + subtract(output,mean_s,output); + } + else{ + Scalar mean_s; + mean_s = mean(output); + subtract(output,mean_s,output); + } + + } + +public: + customPreprocessor( double rawval,String channel_order):rawval_(rawval),channel_order_(channel_order){} + ~customPreprocessor(){} + +}; + +class MeanSubtractorPreprocessor: public ImagePreprocessor{ +protected: + Mat mean_; + //void set_mean_(Mat m){} + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(this->mean_.cols==outputSize.width && 
this->mean_.rows ==outputSize.height); + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + subtract(output,this->mean_,output); + } +public: + MeanSubtractorPreprocessor(Mat mean) + { + mean.copyTo(this->mean_); + } + + ~MeanSubtractorPreprocessor(){} +}; + + + + + +Ptr ImagePreprocessor::createResizer() +{ + return Ptr(new ResizerPreprocessor); +} + +Ptr ImagePreprocessor::createImageStandarizer(double sigma) +{ + return Ptr(new StandarizerPreprocessor(sigma)); +} +Ptr ImagePreprocessor::createImageCustomPreprocessor(double rawval,String channel_order) +{ + + return Ptr(new customPreprocessor(rawval,channel_order)); +} + +Ptr ImagePreprocessor::createImageMeanSubtractor(InputArray meanImg) +{ + Mat tmp=meanImg.getMat(); + return Ptr(new MeanSubtractorPreprocessor(tmp)); +} + +//************************************************************************************ 
+//****************** TextImageClassifier ***************************************** +//************************************************************************************ + +void TextImageClassifier::preprocess(const Mat& input,Mat& output) +{ + this->preprocessor_->preprocess_(input,output,this->inputGeometry_,this->channelCount_); +} + +void TextImageClassifier::setPreprocessor(Ptr ptr) +{ + CV_Assert(!ptr.empty()); + preprocessor_=ptr; +} + +Ptr TextImageClassifier::getPreprocessor() +{ + return preprocessor_; +} + + +class DeepCNNCaffeImpl: public DeepCNN{ +protected: + void classifyMiniBatch(std::vector inputImageList, Mat outputMat) + { + //Classifies a list of images containing at most minibatchSz_ images + CV_Assert(int(inputImageList.size())<=this->minibatchSz_); + CV_Assert(outputMat.isContinuous()); + +#ifdef HAVE_CAFFE + net_->input_blobs()[0]->Reshape(inputImageList.size(), this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width); + net_->Reshape(); + float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); + float* inputData=inputBuffer; + + for(size_t imgNum=0;imgNum input_channels; + Mat preprocessed; + // if the image have multiple color channels the input layer should be populated accordingly + for (int channel=0;channel < this->channelCount_;channel++){ + + cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); + input_channels.push_back(netInputWraped); + //input_data += width * height; + inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); + } + this->preprocess(inputImageList[imgNum],preprocessed); + split(preprocessed, input_channels); + + } + this->net_->ForwardPrefilled(); + const float* outputNetData=net_->output_blobs()[0]->cpu_data(); + this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); + int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width; + + 
//outputMat.resize(this->outputGeometry_.height * this->outputGeometry_.width); + float*outputMatData=(float*)(outputMat.data); + memcpy(outputMatData,outputNetData,sizeof(float)*outputSz*inputImageList.size()); + +#endif + } + + void process_(Mat inputImage, Mat &outputMat) + { + // do forward pass and stores the output in outputMat + //Process one image + CV_Assert(this->minibatchSz_==1); + //CV_Assert(outputMat.isContinuous()); + +#ifdef HAVE_CAFFE + net_->input_blobs()[0]->Reshape(1, this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width); + net_->Reshape(); + float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); + float* inputData=inputBuffer; + + std::vector input_channels; + Mat preprocessed; + // if the image have multiple color channels the input layer should be populated accordingly + for (int channel=0;channel < this->channelCount_;channel++){ + + cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); + input_channels.push_back(netInputWraped); + //input_data += width * height; + inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); + } + this->preprocess(inputImage,preprocessed); + split(preprocessed, input_channels); + + //preprocessed.copyTo(netInputWraped); + + + this->net_->Forward(); + const float* outputNetData=net_->output_blobs()[0]->cpu_data(); + // const float* outputNetData1=net_->output_blobs()[1]->cpu_data(); + + + + + this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); + int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width; + outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); + float*outputMatData=(float*)(outputMat.data); + + memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); + + + +#endif + } + + + +#ifdef HAVE_CAFFE + Ptr > net_; +#endif + //Size inputGeometry_; + int minibatchSz_;//The existence of the assignment 
operator mandates this to be nonconst + int outputSize_; +public: + DeepCNNCaffeImpl(const DeepCNNCaffeImpl& dn): + minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ + channelCount_=dn.channelCount_; + inputGeometry_=dn.inputGeometry_; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + } + DeepCNNCaffeImpl& operator=(const DeepCNNCaffeImpl &dn) + { +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + this->setPreprocessor(dn.preprocessor_); + this->inputGeometry_=dn.inputGeometry_; + this->channelCount_=dn.channelCount_; + this->minibatchSz_=dn.minibatchSz_; + this->outputSize_=dn.outputSize_; + this->preprocessor_=dn.preprocessor_; + this->outputGeometry_=dn.outputGeometry_; + return *this; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" + } + + DeepCNNCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) + :minibatchSz_(maxMinibatchSz) + { + + CV_Assert(this->minibatchSz_>0); + CV_Assert(fileExists(modelArchFilename)); + CV_Assert(fileExists(modelWeightsFilename)); + CV_Assert(!preprocessor.empty()); + this->setPreprocessor(preprocessor); +#ifdef HAVE_CAFFE + this->net_.reset(new caffe::Net(modelArchFilename, caffe::TEST)); + CV_Assert(net_->num_inputs()==1); + CV_Assert(net_->num_outputs()==1); + CV_Assert(this->net_->input_blobs()[0]->channels()==1 + ||this->net_->input_blobs()[0]->channels()==3); + this->channelCount_=this->net_->input_blobs()[0]->channels(); + + + + this->net_->CopyTrainedLayersFrom(modelWeightsFilename); + + caffe::Blob* inputLayer = this->net_->input_blobs()[0]; + + this->inputGeometry_=Size(inputLayer->width(), inputLayer->height()); + this->channelCount_ = inputLayer->channels(); + + inputLayer->Reshape(this->minibatchSz_,this->channelCount_,this->inputGeometry_.height, this->inputGeometry_.width); + net_->Reshape(); + 
this->outputSize_=net_->output_blobs()[0]->channels(); + this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); + + + + + +#else + CV_Error(Error::StsError,"Caffe not available during compilation!"); +#endif + } + + void classify(InputArray image, OutputArray classProbabilities) + { + std::vector inputImageList; + inputImageList.push_back(image.getMat()); + classifyBatch(inputImageList,classProbabilities); + } + void detect(InputArray image, OutputArray Bbox_prob) + { + + Bbox_prob.create(this->outputGeometry_,CV_32F); // dummy initialization is it needed + Mat outputMat = Bbox_prob.getMat(); + process_(image.getMat(),outputMat); + //copy back to outputArray + outputMat.copyTo(Bbox_prob); + } + + void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities) + { + std::vector allImageVector; + inputImageList.getMatVector(allImageVector); + size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmentic + + size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmentic + classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F); + Mat outputMat = classProbabilities.getMat(); + for(size_t imgNum=0;imgNum(allImageVector.size()-imgNum,minibatchSize); + std::vector::const_iterator from=std::vector::const_iterator(allImageVector.begin()+imgNum); + std::vector::const_iterator to=std::vector::const_iterator(allImageVector.begin()+rangeEnd); + std::vector minibatchInput(from,to); + classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd))); + + } + + } + + int getOutputSize() + { + return this->outputSize_; + } + Size getOutputGeometry() + { + return this->outputGeometry_; + } + + int getMinibatchSize() + { + return this->minibatchSz_; + } + + int getBackend() + { + return OCR_HOLISTIC_BACKEND_CAFFE; + } +}; + + +Ptr DeepCNN::create(String archFilename,String weightsFilename,Ptr 
preprocessor,int minibatchSz,int backEnd) +{ + if(preprocessor.empty()) + { + preprocessor=ImagePreprocessor::createResizer(); + } + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_CAFFE: + return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); + break; + } +} + + +Ptr DeepCNN::createDictNet(String archFilename,String weightsFilename,int backEnd) +{ + Ptr preprocessor=ImagePreprocessor::createImageStandarizer(113); + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_CAFFE: + return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); + break; + } +} + +namespace cnn_config{ +namespace caffe_backend{ + +#ifdef HAVE_CAFFE + +bool getCaffeGpuMode() +{ + return caffe::Caffe::mode()==caffe::Caffe::GPU; +} + +void setCaffeGpuMode(bool useGpu) +{ + if(useGpu) + { + caffe::Caffe::set_mode(caffe::Caffe::GPU); + }else + { + caffe::Caffe::set_mode(caffe::Caffe::CPU); + } +} + +bool getCaffeAvailable() +{ + return true; +} + +#else + +bool getCaffeGpuMode() +{ + CV_Error(Error::StsError,"Caffe not available during compilation!"); + return 0; +} + +void setCaffeGpuMode(bool useGpu) +{ + CV_Error(Error::StsError,"Caffe not available during compilation!"); + CV_Assert(useGpu==1);//Compilation directives force +} + +bool getCaffeAvailable(){ + return 0; +} + +#endif + +}//namespace caffe +}//namespace cnn_config + +class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{ +private: + struct NetOutput{ + //Auxiliary structure that handles the logic of getting class ids and probabillities from + //the raw outputs of caffe + int wordIdx; + float probabillity; + + static bool sorter(const NetOutput& o1,const NetOutput& o2) + {//used with 
std::sort to provide the most probable class + return o1.probabillity>o2.probabillity; + } + + static void getOutputs(const float* buffer,int nbOutputs,std::vector& res) + { + res.resize(nbOutputs); + for(int k=0;k tmp; + getOutputs(buffer,nbOutputs,tmp); + classNum=tmp[0].wordIdx; + confidence=tmp[0].probabillity; + } + }; +protected: + std::vector labels_; + Ptr classifier_; +public: + OCRHolisticWordRecognizerImpl(Ptr classifierPtr,String vocabularyFilename):classifier_(classifierPtr) + { + CV_Assert(fileExists(vocabularyFilename));//this fails for some rason + std::ifstream labelsFile(vocabularyFilename.c_str()); + if(!labelsFile) + { + CV_Error(Error::StsError,"Could not read Labels from file"); + } + std::string line; + while (std::getline(labelsFile, line)) + { + labels_.push_back(std::string(line)); + } + CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size())); + } + + OCRHolisticWordRecognizerImpl(Ptr classifierPtr,const std::vector& vocabulary):classifier_(classifierPtr) + { + this->labels_=vocabulary; + CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size())); + } + + void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence) + { + Mat netOutput; + this->classifier_->classify(inputImage,netOutput); + int classNum; + NetOutput::getClassification((float*)(netOutput.data),this->classifier_->getOutputSize(),classNum,confidence); + transcription=this->labels_[classNum]; + } + + void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector& transcriptionVec,CV_OUT std::vector& confidenceVec) + { + Mat netOutput; + this->classifier_->classifyBatch(inputImageList,netOutput); + for(int k=0;kclassifier_->getOutputSize(),classNum,confidence); + transcriptionVec.push_back(this->labels_[classNum]); + confidenceVec.push_back(confidence); + } + } + + + void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* 
component_confidences=NULL, + int component_level=0) + { + CV_Assert(component_level==OCR_LEVEL_WORD);//Componnents not applicable for word spotting + double confidence; + String transcription; + recogniseImage(image,transcription,confidence); + output_text=transcription.c_str(); + if(component_rects!=NULL) + { + component_rects->resize(1); + (*component_rects)[0]=Rect(0,0,image.size().width,image.size().height); + } + if(component_texts!=NULL) + { + component_texts->resize(1); + (*component_texts)[0]=transcription.c_str(); + } + if(component_confidences!=NULL) + { + component_confidences->resize(1); + (*component_confidences)[0]=float(confidence); + } + } + + void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + int component_level=0) + { + CV_Assert(mask.cols==image.cols && mask.rows== image.rows);//Mask is ignored because the CNN operates on a full image + this->run(image,output_text,component_rects,component_texts,component_confidences,component_level); + } + + std::vector& getVocabulary() + { + return this->labels_; + } + + Ptr getClassifier() + { + return this->classifier_; + } +}; + +Ptr OCRHolisticWordRecognizer::create(Ptr classifierPtr,String vocabularyFilename ) +{ + return Ptr(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabularyFilename)); +} + +Ptr OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename, String vocabularyFilename) +{ + Ptr preprocessor=ImagePreprocessor::createImageStandarizer(113); + Ptr classifierPtr(new DeepCNNCaffeImpl(modelArchFilename,modelWeightsFilename,preprocessor,100)); + return Ptr(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabularyFilename)); +} + +Ptr OCRHolisticWordRecognizer::create(Ptr classifierPtr,const std::vector& vocabulary) +{ + return Ptr(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabulary)); +} + +Ptr OCRHolisticWordRecognizer::create(String 
modelArchFilename, String modelWeightsFilename,const std::vector& vocabulary){ + Ptr preprocessor=ImagePreprocessor::createImageStandarizer(113); + Ptr classifierPtr(new DeepCNNCaffeImpl(modelArchFilename,modelWeightsFilename,preprocessor,100)); + return Ptr(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabulary)); +} + + + + + +} } //namespace text namespace cv diff --git a/modules/text/src/text_detector.cpp b/modules/text/src/text_detector.cpp new file mode 100644 index 00000000000..8f224a70f14 --- /dev/null +++ b/modules/text/src/text_detector.cpp @@ -0,0 +1,643 @@ +#include "precomp.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/core.hpp" + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifdef HAVE_CAFFE +#include "caffe/caffe.hpp" +#endif + +namespace cv { namespace text { + +//Maybe OpenCV has a routine better suited +//inline bool fileExists (String filename) { +// std::ifstream f(filename.c_str()); +// return f.good(); +//} + +//************************************************************************************ +//****************** ImagePreprocessor ******************************************* +//************************************************************************************ + +/*void ImagePreprocessor::preprocess(InputArray input,OutputArray output,Size sz,int outputChannels){ + Mat inpImg=input.getMat(); + Mat outImg; + this->preprocess_(inpImg,outImg,sz,outputChannels); + outImg.copyTo(output); +}*/ + + +/*class ResizerPreprocessor: public ImagePreprocessor{ +protected: + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1){ + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + 
if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U){ + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + } +public: + ResizerPreprocessor(){} + ~ResizerPreprocessor(){} +}; + +class StandarizerPreprocessor: public ImagePreprocessor{ +protected: + double sigma_; + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + 
input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + Scalar dev,mean; + meanStdDev(output,mean,dev); + subtract(output,mean[0],output); + divide(output,(dev[0]/sigma_),output); + } +public: + StandarizerPreprocessor(double sigma):sigma_(sigma){} + ~StandarizerPreprocessor(){} +}; + +class MeanSubtractorPreprocessor: public ImagePreprocessor{ +protected: + Mat mean_; + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(this->mean_.cols==outputSize.width && this->mean_.rows ==outputSize.height); + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC3,1/255.0); + }else + 
{//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + subtract(output,this->mean_,output); + } +public: + MeanSubtractorPreprocessor(Mat mean) + { + mean.copyTo(this->mean_); + } + + ~MeanSubtractorPreprocessor(){} +}; + + +Ptr ImagePreprocessor::createResizer() +{ + return Ptr(new ResizerPreprocessor); +} + +Ptr ImagePreprocessor::createImageStandarizer(double sigma) +{ + return Ptr(new StandarizerPreprocessor(sigma)); +} + +Ptr ImagePreprocessor::createImageMeanSubtractor(InputArray meanImg) +{ + Mat tmp=meanImg.getMat(); + return Ptr(new MeanSubtractorPreprocessor(tmp)); +} + +//************************************************************************************ +//****************** TextImageClassifier ***************************************** +//************************************************************************************ + +void TextImageClassifier::preprocess(const Mat& input,Mat& output) +{ + this->preprocessor_->preprocess_(input,output,this->inputGeometry_,this->channelCount_); +} + +void TextImageClassifier::setPreprocessor(Ptr ptr) +{ + CV_Assert(!ptr.empty()); + preprocessor_=ptr; +} + +Ptr TextImageClassifier::getPreprocessor() +{ + return preprocessor_; +}*/ + +/* +class DeepCNNCaffeImpl: public DeepCNN{ +protected: + void classifyMiniBatch(std::vector inputImageList, Mat outputMat) + { + //Classifies a list of images containing at most minibatchSz_ images + CV_Assert(int(inputImageList.size())<=this->minibatchSz_); + CV_Assert(outputMat.isContinuous()); +#ifdef HAVE_CAFFE + net_->input_blobs()[0]->Reshape(inputImageList.size(), 1,this->inputGeometry_.height,this->inputGeometry_.width); + net_->Reshape(); + float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); + float* inputData=inputBuffer; + for(size_t imgNum=0;imgNuminputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); + 
this->preprocess(inputImageList[imgNum],preprocessed); + preprocessed.copyTo(netInputWraped); + inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); + } + this->net_->ForwardPrefilled(); + const float* outputNetData=net_->output_blobs()[0]->cpu_data(); + float*outputMatData=(float*)(outputMat.data); + memcpy(outputMatData,outputNetData,sizeof(float)*this->outputSize_*inputImageList.size()); +#endif + } + +#ifdef HAVE_CAFFE + Ptr > net_; +#endif + //Size inputGeometry_; + int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst + int outputSize_; +public: + DeepCNNCaffeImpl(const DeepCNNCaffeImpl& dn): + minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ + channelCount_=dn.channelCount_; + inputGeometry_=dn.inputGeometry_; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + } + DeepCNNCaffeImpl& operator=(const DeepCNNCaffeImpl &dn) + { +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + this->setPreprocessor(dn.preprocessor_); + this->inputGeometry_=dn.inputGeometry_; + this->channelCount_=dn.channelCount_; + this->minibatchSz_=dn.minibatchSz_; + this->outputSize_=dn.outputSize_; + this->preprocessor_=dn.preprocessor_; + return *this; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" + } + + DeepCNNCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) + :minibatchSz_(maxMinibatchSz) + { + CV_Assert(this->minibatchSz_>0); + CV_Assert(fileExists(modelArchFilename)); + CV_Assert(fileExists(modelWeightsFilename)); + CV_Assert(!preprocessor.empty()); + this->setPreprocessor(preprocessor); +#ifdef HAVE_CAFFE + this->net_.reset(new caffe::Net(modelArchFilename, caffe::TEST)); + CV_Assert(net_->num_inputs()==1); + CV_Assert(net_->num_outputs()==1); + CV_Assert(this->net_->input_blobs()[0]->channels()==1 + 
||this->net_->input_blobs()[0]->channels()==3); + this->channelCount_=this->net_->input_blobs()[0]->channels(); + this->net_->CopyTrainedLayersFrom(modelWeightsFilename); + caffe::Blob* inputLayer = this->net_->input_blobs()[0]; + this->inputGeometry_=Size(inputLayer->width(), inputLayer->height()); + inputLayer->Reshape(this->minibatchSz_,1,this->inputGeometry_.height, this->inputGeometry_.width); + net_->Reshape(); + this->outputSize_=net_->output_blobs()[0]->channels(); + +#else + CV_Error(Error::StsError,"Caffe not available during compilation!"); +#endif + } + + void classify(InputArray image, OutputArray classProbabilities) + { + std::vector inputImageList; + inputImageList.push_back(image.getMat()); + classifyBatch(inputImageList,classProbabilities); + } + + void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities) + { + std::vector allImageVector; + inputImageList.getMatVector(allImageVector); + size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmentic + size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmentic + classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F); + Mat outputMat = classProbabilities.getMat(); + for(size_t imgNum=0;imgNum(allImageVector.size()-imgNum,minibatchSize); + std::vector::const_iterator from=std::vector::const_iterator(allImageVector.begin()+imgNum); + std::vector::const_iterator to=std::vector::const_iterator(allImageVector.begin()+rangeEnd); + std::vector minibatchInput(from,to); + classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd))); + } + } + + int getOutputSize() + { + return this->outputSize_; + } + + int getMinibatchSize() + { + return this->minibatchSz_; + } + + int getBackend() + { + return OCR_HOLISTIC_BACKEND_CAFFE; + } +}; + + +Ptr DeepCNN::create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz,int backEnd) +{ + 
if(preprocessor.empty()) + { + preprocessor=ImagePreprocessor::createResizer(); + } + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_CAFFE: + return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); + break; + } +} + + +Ptr DeepCNN::createDictNet(String archFilename,String weightsFilename,int backEnd) +{ + Ptr preprocessor=ImagePreprocessor::createImageStandarizer(113); + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_CAFFE: + return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); + break; + } +} + +namespace cnn_config{ +namespace caffe_backend{ + +#ifdef HAVE_CAFFE + +bool getCaffeGpuMode() +{ + return caffe::Caffe::mode()==caffe::Caffe::GPU; +} + +void setCaffeGpuMode(bool useGpu) +{ + if(useGpu) + { + caffe::Caffe::set_mode(caffe::Caffe::GPU); + }else + { + caffe::Caffe::set_mode(caffe::Caffe::CPU); + } +} + +bool getCaffeAvailable() +{ + return true; +} + +#else + +bool getCaffeGpuMode() +{ + CV_Error(Error::StsError,"Caffe not available during compilation!"); + return 0; +} + +void setCaffeGpuMode(bool useGpu) +{ + CV_Error(Error::StsError,"Caffe not available during compilation!"); + CV_Assert(useGpu==1);//Compilation directives force +} + +bool getCaffeAvailable(){ + return 0; +} + +#endif + +}//namespace caffe +}//namespace cnn_config +*/ + +class textDetectImpl: public textDetector{ +private: + struct NetOutput{ + //Auxiliary structure that handles the logic of getting bounding box and confidences of textness from + //the raw outputs of caffe + Rect bbox; + float probability; + +// static bool sorter(const NetOutput& o1,const NetOutput& o2) +// {//used with std::sort to provide the most probable class +// return 
o1.probabillity>o2.probabillity; +// } + + static void getOutputs(const float* buffer,int nbrTextBoxes,int nCol,std::vector& res,Size inputShape) + { + + res.resize(nbrTextBoxes); + for(int k=0;k inputShape.width?inputShape.width-1:x_max; + y_max = y_max > inputShape.height?inputShape.height-1:y_max; + float wd = x_max-x_min+1; + float ht = y_max-y_min+1; + + res[k].bbox=Rect(int(x_min),int(y_min),int(wd),int(ht)); + // printf("%f %f %f %f\n",buffer[k*nCol+3],buffer[k*nCol+4],buffer[k*nCol+5],buffer[k*nCol+6]); + res[k].probability=buffer[k*nCol+2]; + } +// std::sort(res.begin(),res.end(),NetOutput::sorter); + } + +// static void getDetections(const float* buffer,int nbOutputs,int &classNum,double& confidence) +// { +// std::vector tmp; +// getOutputs(buffer,nbOutputs,tmp); +// classNum=tmp[0].wordIdx; +// confidence=tmp[0].probabillity; +// } + }; +protected: + //std::vector labels_; + Ptr classifier_; +public: + textDetectImpl(Ptr classifierPtr):classifier_(classifierPtr) + { + + } + + + + void textDetectInImage(InputArray inputImage,CV_OUT std::vector& Bbox,CV_OUT std::vector& confidence) + { + Mat netOutput; + //std::cout<<"started detect"<classifier_->detect(inputImage,netOutput); + //std::cout<<"After Detect"<classifier_->getOutputGeometry(); + int nbrTextBoxes = OutputGeometry_.height; + int nCol = OutputGeometry_.width; + //std::cout< tmp; + Size inputImageShape = Size(inputImage.cols(),inputImage.rows()); + NetOutput::getOutputs((float*)(netOutput.data),nbrTextBoxes,nCol,tmp,inputImageShape); + //Bbox.resize(nbrTextBoxes); + //confidence.resize(nbrTextBoxes); + for (int k=0;k* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=0) + { + CV_Assert(component_level==OCR_LEVEL_WORD);//Componnents not applicable for word spotting + //double confidence; + //String transcription; + std::vector bbox; + std::vector score; + textDetectInImage(image,bbox,score); + //output_text=transcription.c_str(); + if(component_rects!=NULL) + { 
+ component_rects->resize(bbox.size()); // should be a user behavior + + component_rects = &bbox; + } + + if(component_confidences!=NULL) + { + component_confidences->resize(score.size()); // shoub be a user behavior + + component_confidences = &score; + } + } + + void run(Mat& image, Mat& mask, std::vector* component_rects=NULL, + std::vector* component_confidences=NULL, + int component_level=0) + { + CV_Assert(mask.cols==image.cols && mask.rows== image.rows);//Mask is ignored because the CNN operates on a full image + this->run(image,component_rects,component_confidences,component_level); + } + +// std::vector& getVocabulary() +// { +// return this->labels_; +// } + + Ptr getClassifier() + { + return this->classifier_; + } +}; + +Ptr textDetector::create(Ptr classifierPtr) +{ + return Ptr(new textDetectImpl(classifierPtr)); +} + +Ptr textDetector::create(String modelArchFilename, String modelWeightsFilename) +{ + + + Ptr preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255); + + Mat textbox_mean(1,3,CV_8U); + textbox_mean.at(0,0)=104; + textbox_mean.at(0,1)=117; + textbox_mean.at(0,2)=123; + preprocessor->set_mean(textbox_mean); + + Ptr classifierPtr(DeepCNN::create(modelArchFilename,modelWeightsFilename,preprocessor,1)); + return Ptr(new textDetectImpl(classifierPtr)); +} + + + + + + + +} } //namespace text namespace cv diff --git a/modules/text/text_config.hpp.in b/modules/text/text_config.hpp.in index 30089bd3c55..71b32993acf 100644 --- a/modules/text/text_config.hpp.in +++ b/modules/text/text_config.hpp.in @@ -1,7 +1,13 @@ #ifndef __OPENCV_TEXT_CONFIG_HPP__ #define __OPENCV_TEXT_CONFIG_HPP__ +// HAVE QT5 +//#cmakedefine HAVE_QT5GUI + +// HAVE CAFFE +//#cmakedefine HAVE_CAFFE + // HAVE OCR Tesseract -#cmakedefine HAVE_TESSERACT +//#cmakedefine HAVE_TESSERACT -#endif \ No newline at end of file +#endif From 40db962641ded7f125a0baecddf193968cf6656c Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Thu, 22 Jun 2017 19:19:43 +0200 Subject: [PATCH 02/31] 
Add sample script --- modules/text/samples/textbox_demo.cpp | 146 ++++++++++++++++++++++++++ 1 file changed, 146 insertions(+) create mode 100644 modules/text/samples/textbox_demo.cpp diff --git a/modules/text/samples/textbox_demo.cpp b/modules/text/samples/textbox_demo.cpp new file mode 100644 index 00000000000..a4155893543 --- /dev/null +++ b/modules/text/samples/textbox_demo.cpp @@ -0,0 +1,146 @@ +/* + * dictnet_demo.cpp + * + * Demonstrates simple use of the holistic word classifier in C++ + * + * Created on: June 26, 2016 + * Author: Anguelos Nicolaou + */ + +#include "opencv2/text.hpp" +#include "opencv2/highgui.hpp" +#include "opencv2/imgproc.hpp" + +#include +#include +#include +#include +#include + +inline std::string getHelpStr(std::string progFname){ + std::stringstream out; + out << " Demo of text detection CNN for text detection." << std::endl; + out << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"< " << std::endl; + out << " Caffe Model files (textbox.caffemodel, textbox_deploy.prototxt)"< &groups,std::vector &probs,std::vector wordList,float thres=0.6) +{ + for (int i=0;i<(int)groups.size(); i++) + { + if(probs[i]>thres) + { + if (src.type() == CV_8UC3) + { + cv::rectangle(src,groups.at(i).tl(),groups.at(i).br(),cv::Scalar( 0, 255, 255 ), 3, 8 ); + cv::putText(src, wordList[i],groups.at(i).tl() , cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar( 0,0,255 )); + } + else + rectangle(src,groups.at(i).tl(),groups.at(i).br(),cv::Scalar( 255 ), 3, 8 ); + } + } +} + + +int main(int argc, const char * argv[]){ + if(!cv::text::cnn_config::caffe_backend::getCaffeAvailable()){ + std::cout<<"The text module was compiled without Caffe which is the only available DeepCNN backend.\nAborting!\n"; + exit(1); + } + //set to true if you have a GPU with more than 3GB + cv::text::cnn_config::caffe_backend::setCaffeGpuMode(false); + + if (argc < 3){ + std::cout< textSpotter=cv::text::textDetector::create( + 
"textbox_deploy.prototxt","textbox.caffemodel"); + + //cv::Ptr wordSpotter= + // cv::text::textDetector::create(cnn); + std::cout<<"Created Text Spotter with text Boxes"; + + std::vector bbox; + std::vector outProbabillities; + textSpotter->textDetectInImage(image,bbox,outProbabillities); + // textbox_draw(image, bbox,outProbabillities); + float thres =0.6; + std::vector imageList; + for(int imageIdx=0;imageIdx<(int)bbox.size();imageIdx++){ + if(outProbabillities[imageIdx]>thres){ + imageList.push_back(image(bbox.at(imageIdx))); + } + + } + // call dict net here for all detected parts + cv::Ptr cnn=cv::text::DeepCNN::createDictNet( + "dictnet_vgg_deploy.prototxt","dictnet_vgg.caffemodel"); + + cv::Ptr wordSpotter= + cv::text::OCRHolisticWordRecognizer::create(cnn,"dictnet_vgg_labels.txt"); + + std::vector wordList; + std::vector wordProbabillities; + wordSpotter->recogniseImageBatch(imageList,wordList,wordProbabillities); + // write the output in file + std::ofstream out; + out.open(argv[1]); + + + for (int i=0;i<(int)wordList.size(); i++) + { + cv::Point tl_ = bbox.at(i).tl(); + cv::Point br_ = bbox.at(i).br(); + + out< Date: Fri, 23 Jun 2017 18:36:33 +0200 Subject: [PATCH 03/31] Minor modification --- modules/text/include/opencv2/text/ocr.hpp | 2 + modules/text/src/text_detector.cpp | 503 +--------------------- 2 files changed, 16 insertions(+), 489 deletions(-) diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index 9fc5403fdef..e0afe5ca4d6 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -722,6 +722,8 @@ class CV_EXPORTS_W TextImageClassifier /** @brief simple getter method returning the size of the oputput row-vector */ CV_WRAP virtual int getOutputSize()=0; + /** @brief simple getter method returning the shape of the oputput from caffe + */ CV_WRAP virtual Size getOutputGeometry()=0; /** @brief simple getter method returning the size of the minibatches 
for this classifier. diff --git a/modules/text/src/text_detector.cpp b/modules/text/src/text_detector.cpp index 8f224a70f14..5b18e970861 100644 --- a/modules/text/src/text_detector.cpp +++ b/modules/text/src/text_detector.cpp @@ -22,468 +22,6 @@ namespace cv { namespace text { -//Maybe OpenCV has a routine better suited -//inline bool fileExists (String filename) { -// std::ifstream f(filename.c_str()); -// return f.good(); -//} - -//************************************************************************************ -//****************** ImagePreprocessor ******************************************* -//************************************************************************************ - -/*void ImagePreprocessor::preprocess(InputArray input,OutputArray output,Size sz,int outputChannels){ - Mat inpImg=input.getMat(); - Mat outImg; - this->preprocess_(inpImg,outImg,sz,outputChannels); - outImg.copyTo(output); -}*/ - - -/*class ResizerPreprocessor: public ImagePreprocessor{ -protected: - void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ - //TODO put all the logic of channel and depth conversions in ImageProcessor class - CV_Assert(outputChannels==1 || outputChannels==3); - CV_Assert(input.channels()==1 || input.channels()==3); - if(input.channels()!=outputChannels) - { - Mat tmpInput; - if(outputChannels==1){ - cvtColor(input,tmpInput,COLOR_BGR2GRAY); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC1); - } - }else - { - cvtColor(input,tmpInput,COLOR_GRAY2BGR); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC3); - } - } - }else - { - if(input.channels()==1) - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] 
range - input.convertTo(output, CV_32FC1); - } - }else - { - if(input.depth()==CV_8U){ - input.convertTo(output, CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC3); - } - } - } - if(outputSize.width!=0 && outputSize.height!=0) - { - resize(output,output,outputSize); - } - } -public: - ResizerPreprocessor(){} - ~ResizerPreprocessor(){} -}; - -class StandarizerPreprocessor: public ImagePreprocessor{ -protected: - double sigma_; - void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ - //TODO put all the logic of channel and depth conversions in ImageProcessor class - CV_Assert(outputChannels==1 || outputChannels==3); - CV_Assert(input.channels()==1 || input.channels()==3); - if(input.channels()!=outputChannels) - { - Mat tmpInput; - if(outputChannels==1) - { - cvtColor(input,tmpInput,COLOR_BGR2GRAY); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC1); - } - }else - { - cvtColor(input,tmpInput,COLOR_GRAY2BGR); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC3); - } - } - }else - { - if(input.channels()==1) - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC1); - } - }else - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC3); - } - } - } - if(outputSize.width!=0 && outputSize.height!=0) - { - resize(output,output,outputSize); - } - Scalar dev,mean; - meanStdDev(output,mean,dev); - subtract(output,mean[0],output); - divide(output,(dev[0]/sigma_),output); - } -public: - 
StandarizerPreprocessor(double sigma):sigma_(sigma){} - ~StandarizerPreprocessor(){} -}; - -class MeanSubtractorPreprocessor: public ImagePreprocessor{ -protected: - Mat mean_; - void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ - //TODO put all the logic of channel and depth conversions in ImageProcessor class - CV_Assert(this->mean_.cols==outputSize.width && this->mean_.rows ==outputSize.height); - CV_Assert(outputChannels==1 || outputChannels==3); - CV_Assert(input.channels()==1 || input.channels()==3); - if(input.channels()!=outputChannels) - { - Mat tmpInput; - if(outputChannels==1) - { - cvtColor(input,tmpInput,COLOR_BGR2GRAY); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC1); - } - }else - { - cvtColor(input,tmpInput,COLOR_GRAY2BGR); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC3); - } - } - }else - { - if(input.channels()==1) - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC1); - } - }else - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC3); - } - } - } - if(outputSize.width!=0 && outputSize.height!=0) - { - resize(output,output,outputSize); - } - subtract(output,this->mean_,output); - } -public: - MeanSubtractorPreprocessor(Mat mean) - { - mean.copyTo(this->mean_); - } - - ~MeanSubtractorPreprocessor(){} -}; - - -Ptr ImagePreprocessor::createResizer() -{ - return Ptr(new ResizerPreprocessor); -} - -Ptr ImagePreprocessor::createImageStandarizer(double sigma) -{ - return Ptr(new StandarizerPreprocessor(sigma)); -} - -Ptr 
ImagePreprocessor::createImageMeanSubtractor(InputArray meanImg) -{ - Mat tmp=meanImg.getMat(); - return Ptr(new MeanSubtractorPreprocessor(tmp)); -} - -//************************************************************************************ -//****************** TextImageClassifier ***************************************** -//************************************************************************************ - -void TextImageClassifier::preprocess(const Mat& input,Mat& output) -{ - this->preprocessor_->preprocess_(input,output,this->inputGeometry_,this->channelCount_); -} - -void TextImageClassifier::setPreprocessor(Ptr ptr) -{ - CV_Assert(!ptr.empty()); - preprocessor_=ptr; -} - -Ptr TextImageClassifier::getPreprocessor() -{ - return preprocessor_; -}*/ - -/* -class DeepCNNCaffeImpl: public DeepCNN{ -protected: - void classifyMiniBatch(std::vector inputImageList, Mat outputMat) - { - //Classifies a list of images containing at most minibatchSz_ images - CV_Assert(int(inputImageList.size())<=this->minibatchSz_); - CV_Assert(outputMat.isContinuous()); -#ifdef HAVE_CAFFE - net_->input_blobs()[0]->Reshape(inputImageList.size(), 1,this->inputGeometry_.height,this->inputGeometry_.width); - net_->Reshape(); - float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); - float* inputData=inputBuffer; - for(size_t imgNum=0;imgNuminputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); - this->preprocess(inputImageList[imgNum],preprocessed); - preprocessed.copyTo(netInputWraped); - inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); - } - this->net_->ForwardPrefilled(); - const float* outputNetData=net_->output_blobs()[0]->cpu_data(); - float*outputMatData=(float*)(outputMat.data); - memcpy(outputMatData,outputNetData,sizeof(float)*this->outputSize_*inputImageList.size()); -#endif - } - -#ifdef HAVE_CAFFE - Ptr > net_; -#endif - //Size inputGeometry_; - int minibatchSz_;//The existence of the assignment operator mandates this to be 
nonconst - int outputSize_; -public: - DeepCNNCaffeImpl(const DeepCNNCaffeImpl& dn): - minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ - channelCount_=dn.channelCount_; - inputGeometry_=dn.inputGeometry_; - //Implemented to supress Visual Studio warning "assignment operator could not be generated" -#ifdef HAVE_CAFFE - this->net_=dn.net_; -#endif - } - DeepCNNCaffeImpl& operator=(const DeepCNNCaffeImpl &dn) - { -#ifdef HAVE_CAFFE - this->net_=dn.net_; -#endif - this->setPreprocessor(dn.preprocessor_); - this->inputGeometry_=dn.inputGeometry_; - this->channelCount_=dn.channelCount_; - this->minibatchSz_=dn.minibatchSz_; - this->outputSize_=dn.outputSize_; - this->preprocessor_=dn.preprocessor_; - return *this; - //Implemented to supress Visual Studio warning "assignment operator could not be generated" - } - - DeepCNNCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) - :minibatchSz_(maxMinibatchSz) - { - CV_Assert(this->minibatchSz_>0); - CV_Assert(fileExists(modelArchFilename)); - CV_Assert(fileExists(modelWeightsFilename)); - CV_Assert(!preprocessor.empty()); - this->setPreprocessor(preprocessor); -#ifdef HAVE_CAFFE - this->net_.reset(new caffe::Net(modelArchFilename, caffe::TEST)); - CV_Assert(net_->num_inputs()==1); - CV_Assert(net_->num_outputs()==1); - CV_Assert(this->net_->input_blobs()[0]->channels()==1 - ||this->net_->input_blobs()[0]->channels()==3); - this->channelCount_=this->net_->input_blobs()[0]->channels(); - this->net_->CopyTrainedLayersFrom(modelWeightsFilename); - caffe::Blob* inputLayer = this->net_->input_blobs()[0]; - this->inputGeometry_=Size(inputLayer->width(), inputLayer->height()); - inputLayer->Reshape(this->minibatchSz_,1,this->inputGeometry_.height, this->inputGeometry_.width); - net_->Reshape(); - this->outputSize_=net_->output_blobs()[0]->channels(); - -#else - CV_Error(Error::StsError,"Caffe not available during compilation!"); -#endif - } - - void classify(InputArray 
image, OutputArray classProbabilities) - { - std::vector inputImageList; - inputImageList.push_back(image.getMat()); - classifyBatch(inputImageList,classProbabilities); - } - - void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities) - { - std::vector allImageVector; - inputImageList.getMatVector(allImageVector); - size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmentic - size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmentic - classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F); - Mat outputMat = classProbabilities.getMat(); - for(size_t imgNum=0;imgNum(allImageVector.size()-imgNum,minibatchSize); - std::vector::const_iterator from=std::vector::const_iterator(allImageVector.begin()+imgNum); - std::vector::const_iterator to=std::vector::const_iterator(allImageVector.begin()+rangeEnd); - std::vector minibatchInput(from,to); - classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd))); - } - } - - int getOutputSize() - { - return this->outputSize_; - } - - int getMinibatchSize() - { - return this->minibatchSz_; - } - - int getBackend() - { - return OCR_HOLISTIC_BACKEND_CAFFE; - } -}; - - -Ptr DeepCNN::create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz,int backEnd) -{ - if(preprocessor.empty()) - { - preprocessor=ImagePreprocessor::createResizer(); - } - switch(backEnd){ - case OCR_HOLISTIC_BACKEND_CAFFE: - return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); - break; - case OCR_HOLISTIC_BACKEND_NONE: - default: - CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); - return Ptr(); - break; - } -} - - -Ptr DeepCNN::createDictNet(String archFilename,String weightsFilename,int backEnd) -{ - Ptr preprocessor=ImagePreprocessor::createImageStandarizer(113); - switch(backEnd){ - case OCR_HOLISTIC_BACKEND_CAFFE: - 
return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); - break; - case OCR_HOLISTIC_BACKEND_NONE: - default: - CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); - return Ptr(); - break; - } -} - -namespace cnn_config{ -namespace caffe_backend{ - -#ifdef HAVE_CAFFE - -bool getCaffeGpuMode() -{ - return caffe::Caffe::mode()==caffe::Caffe::GPU; -} - -void setCaffeGpuMode(bool useGpu) -{ - if(useGpu) - { - caffe::Caffe::set_mode(caffe::Caffe::GPU); - }else - { - caffe::Caffe::set_mode(caffe::Caffe::CPU); - } -} - -bool getCaffeAvailable() -{ - return true; -} - -#else - -bool getCaffeGpuMode() -{ - CV_Error(Error::StsError,"Caffe not available during compilation!"); - return 0; -} - -void setCaffeGpuMode(bool useGpu) -{ - CV_Error(Error::StsError,"Caffe not available during compilation!"); - CV_Assert(useGpu==1);//Compilation directives force -} - -bool getCaffeAvailable(){ - return 0; -} - -#endif - -}//namespace caffe -}//namespace cnn_config -*/ class textDetectImpl: public textDetector{ private: @@ -493,10 +31,6 @@ class textDetectImpl: public textDetector{ Rect bbox; float probability; -// static bool sorter(const NetOutput& o1,const NetOutput& o2) -// {//used with std::sort to provide the most probable class -// return o1.probabillity>o2.probabillity; -// } static void getOutputs(const float* buffer,int nbrTextBoxes,int nCol,std::vector& res,Size inputShape) { @@ -516,22 +50,16 @@ class textDetectImpl: public textDetector{ float ht = y_max-y_min+1; res[k].bbox=Rect(int(x_min),int(y_min),int(wd),int(ht)); - // printf("%f %f %f %f\n",buffer[k*nCol+3],buffer[k*nCol+4],buffer[k*nCol+5],buffer[k*nCol+6]); + res[k].probability=buffer[k*nCol+2]; } -// std::sort(res.begin(),res.end(),NetOutput::sorter); + } -// static void getDetections(const float* buffer,int nbOutputs,int &classNum,double& confidence) -// { -// std::vector tmp; -// getOutputs(buffer,nbOutputs,tmp); -// classNum=tmp[0].wordIdx; -// 
confidence=tmp[0].probabillity; -// } + }; protected: - //std::vector labels_; + Ptr classifier_; public: textDetectImpl(Ptr classifierPtr):classifier_(classifierPtr) @@ -544,25 +72,24 @@ class textDetectImpl: public textDetector{ void textDetectInImage(InputArray inputImage,CV_OUT std::vector& Bbox,CV_OUT std::vector& confidence) { Mat netOutput; - //std::cout<<"started detect"<classifier_->detect(inputImage,netOutput); - //std::cout<<"After Detect"<classifier_->getOutputGeometry(); int nbrTextBoxes = OutputGeometry_.height; int nCol = OutputGeometry_.width; - //std::cout< tmp; + // the output bounding box needs to be resized by the input height and width Size inputImageShape = Size(inputImage.cols(),inputImage.rows()); NetOutput::getOutputs((float*)(netOutput.data),nbrTextBoxes,nCol,tmp,inputImageShape); - //Bbox.resize(nbrTextBoxes); - //confidence.resize(nbrTextBoxes); + // put the output in CV_OUT + for (int k=0;krun(image,component_rects,component_confidences,component_level); } -// std::vector& getVocabulary() -// { -// return this->labels_; -// } + Ptr getClassifier() { @@ -621,15 +145,16 @@ Ptr textDetector::create(Ptr classifierPtr) Ptr textDetector::create(String modelArchFilename, String modelWeightsFilename) { - +// create a custom preprocessor with rawval Ptr preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255); +// set the mean for the preprocessor Mat textbox_mean(1,3,CV_8U); textbox_mean.at(0,0)=104; textbox_mean.at(0,1)=117; textbox_mean.at(0,2)=123; preprocessor->set_mean(textbox_mean); - +// create a pointer to text box detector(textDetector) Ptr classifierPtr(DeepCNN::create(modelArchFilename,modelWeightsFilename,preprocessor,1)); return Ptr(new textDetectImpl(classifierPtr)); } From e494efb4b0884c0b68a8de7d7684ee385d8e222e Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Fri, 23 Jun 2017 19:09:17 +0200 Subject: [PATCH 04/31] Added comments --- modules/text/include/opencv2/text/ocr.hpp | 14 +++ 
.../include/opencv2/text/textDetector.hpp | 104 +++--------------- 2 files changed, 28 insertions(+), 90 deletions(-) diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index e0afe5ca4d6..9593a1415fd 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -633,6 +633,16 @@ class CV_EXPORTS_W ImagePreprocessor{ */ CV_WRAP void preprocess(InputArray input,OutputArray output,Size sz,int outputChannels); + /** @brief this method in provides public acces to set the mean of the input images + * mean can be a mat either of same size of the image or one value per color channel + * A preprocessor can be created without the mean( the pre processor will calculate mean for every image + * in that case + * + + * @param mean which will be subtracted from the images + * + */ + CV_WRAP void set_mean(Mat mean); /** @brief Creates a functor that only resizes and changes the channels of the input @@ -655,6 +665,10 @@ class CV_EXPORTS_W ImagePreprocessor{ * @return shared pointer to generated preprocessor */ CV_WRAP static Ptr createImageMeanSubtractor(InputArray meanImg); + /** @brief + * create a functor with the parameters, parameters can be changes by corresponding set functions + * @return shared pointer to generated preprocessor + */ CV_WRAP static PtrcreateImageCustomPreprocessor(double rawval=1.0,String channel_order="BGR"); diff --git a/modules/text/include/opencv2/text/textDetector.hpp b/modules/text/include/opencv2/text/textDetector.hpp index 262795733d9..ea1c7de9d4b 100644 --- a/modules/text/include/opencv2/text/textDetector.hpp +++ b/modules/text/include/opencv2/text/textDetector.hpp @@ -62,7 +62,7 @@ namespace text //base class BaseDetector declares a common API that would be used in a typical text -//recognition scenario +//detection scenario class CV_EXPORTS_W BaseDetector { public: @@ -78,46 +78,7 @@ class CV_EXPORTS_W BaseDetector std::vector* component_confidences=NULL, 
int component_level=0) = 0; - /** @brief Main functionality of the OCR Hierarchy. Subclasses provide - * default parameters for all parameters other than the input image. - */ -// virtual std::vector* run(InputArray image){ -// //std::string res; -// std::vector component_rects; -// std::vector component_confidences; -// //std::vector component_texts; -// Mat inputImage=image.getMat(); -// this->run(inputImage,&component_rects, -// &component_confidences,OCR_LEVEL_WORD); -// return *component_rects; -// } - -}; - - -//Classifiers should provide diferent backends -//For the moment only caffe is implemeted -//enum{ -// OCR_HOLISTIC_BACKEND_NONE, -// OCR_HOLISTIC_BACKEND_CAFFE -//}; - - - - - -/** @brief OCRHolisticWordRecognizer class provides the functionallity of segmented wordspotting. - * Given a predefined vocabulary , a TextImageClassifier is employed to select the most probable - * word given an input image. - * - * This class implements the logic of providing transcriptions given a vocabulary and and an image - * classifer. The classifier has to be any TextImageClassifier but the classifier for which this - * class was built is the DictNet. In order to load it the following files should be downloaded: - * - * - * - */ class CV_EXPORTS_W textDetector : public BaseDetector { public: @@ -125,7 +86,7 @@ class CV_EXPORTS_W textDetector : public BaseDetector std::vector* component_confidences=NULL, int component_level=OCR_LEVEL_WORD)=0; - /** @brief Recognize text using a segmentation based word-spotting/classifier cnn. + /** @brief detect text with a cnn, input is one image with (multiple) ocuurance of text. Takes image on input and returns recognized text in the output_text parameter. Optionally provides also the Rects for individual text elements found (e.g. 
words), and the list of those @@ -135,16 +96,12 @@ class CV_EXPORTS_W textDetector : public BaseDetector @param mask is totally ignored and is only available for compatibillity reasons - @param output_text Output text of the the word spoting, always one that exists in the dictionary. - @param component_rects Not applicable for word spotting can be be NULL if not, a single elemnt will - be put in the vector. + @param component_rects a vector of Rects, each rect is one text bounding box. - @param component_texts Not applicable for word spotting can be be NULL if not, a single elemnt will - be put in the vector. - @param component_confidences Not applicable for word spotting can be be NULL if not, a single elemnt will - be put in the vector. + + @param component_confidences A vector of float returns confidence of text bounding boxes @param component_level must be OCR_LEVEL_WORD. */ @@ -155,76 +112,43 @@ class CV_EXPORTS_W textDetector : public BaseDetector /** - @brief Method that provides a quick and simple interface to a single word image classifcation + @brief Method that provides a quick and simple interface to detect text inside an image - @param inputImage an image expected to be a CV_U8C1 or CV_U8C3 of any size + @param inputImage an image expected to be a CV_U8C3 of any size - @param transcription an opencv string that will store the detected word transcription + @param Bbox a vector of Rect that will store the detected word bounding box - @param confidence a double that will be updated with the confidence the classifier has for the selected word + @param confidence a vector of float that will be updated with the confidence the classifier has for the selected bounding box */ CV_WRAP virtual void textDetectInImage(InputArray inputImage,CV_OUT std::vector& Bbox,CV_OUT std::vector& confidence)=0; - /** - @brief Method that provides a quick and simple interface to a multiple word image classifcation taking advantage - the classifiers parallel capabilities. 
- - @param inputImageList an list of images expected to be a CV_U8C1 or CV_U8C3 each image can be of any size and is assumed - to contain a single word. - @param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each - input image - - @param confidences a vector of double that will be updated with the confidence the classifier has for each of the - selected words. - */ - //CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector& transcriptions,CV_OUT std::vector& confidences)=0; /** @brief simple getter for the preprocessing functor */ CV_WRAP virtual Ptr getClassifier()=0; - /** @brief Creates an instance of the OCRHolisticWordRecognizer class. + /** @brief Creates an instance of the textDetector class. @param classifierPtr an instance of TextImageClassifier, normaly a DeepCNN instance - @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line - in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize - of the classifier. + */ CV_WRAP static Ptr create(Ptr classifierPtr); - /** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DeepCNN classifier. + /** @brief Creates an instance of the textDetector class and implicitly also a DeepCNN classifier. @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. - @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line - in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize - of the classifier. 
+ */ CV_WRAP static Ptr create(String modelArchFilename, String modelWeightsFilename); - /** @brief - * - * @param classifierPtr - * - * @param vocabulary - */ - // CV_WRAP static Ptr create(Ptr classifierPtr,const std::vector& vocabulary); - - /** @brief - * - * @param modelArchFilename - * - * @param modelWeightsFilename - * - * @param vocabulary - */ - // CV_WRAP static Ptr create (String modelArchFilename, String modelWeightsFilename, const std::vector& vocabulary); + }; From 2b8ed124f2eacae9c4c8833382ceea30eee67447 Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Wed, 5 Jul 2017 16:34:55 +0200 Subject: [PATCH 05/31] added instructions to build --- modules/text/README.md | 70 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 70 insertions(+) diff --git a/modules/text/README.md b/modules/text/README.md index 3a3a897f7c3..8d0648cfe59 100644 --- a/modules/text/README.md +++ b/modules/text/README.md @@ -118,4 +118,74 @@ CAFFEROOT="${HOME}/caffe_inst/" #If you used the previous code to compile Caffe cmake -DCaffe_LIBS:FILEPATH="$CAFFEROOT/caffe/distribute/lib/libcaffe.so" -DBUILD_opencv_ts:BOOL="0" -DBUILD_opencv_dnn:BOOL="0" -DBUILD_opencv_dnn_modern:BOOL="0" -DCaffe_INCLUDE_DIR:PATH="$CAFFEROOT/caffe/distribute/include" -DWITH_MATLAB:BOOL="0" -DBUILD_opencv_cudabgsegm:BOOL="0" -DWITH_QT:BOOL="1" -DBUILD_opencv_cudaoptflow:BOOL="0" -DBUILD_opencv_cudastereo:BOOL="0" -DBUILD_opencv_cudafilters:BOOL="0" -DBUILD_opencv_cudev:BOOL="1" -DOPENCV_EXTRA_MODULES_PATH:PATH="/home/anguelos/work/projects/opencv_gsoc/opencv_contrib/modules" ./ +``` + +Text Detection CNN +================= + +Intro +----- + +A text detection CNN is a CNN that takes an image which may contain multiple words. This outputs a list of Rects with bounding boxes and probability of text there. +Although other backends will be supported, for the moment only the Caffe backend is supported. 
+ + + + +Instalation of Caffe backend +---------------------------- +* Please note a custom caffe based on SSD branch is required, the link of the custom caffe is provided below +The caffe wrapping backend has the requirements caffe does. +* Caffe can be built against OpenCV, if the caffe backend is enabled, a circular bependency arises. +The simplest solution is to build caffe without support for OpenCV. +* Only the OS supported by Caffe are supported by the backend. +The scripts describing the module have been developed in ubuntu 16.04 and assume such a system. +Other UNIX systems including OSX should be easy to adapt. + +Sample script for building Caffe + +```bash +#!/bin/bash +SRCROOT="${HOME}/caffe_inst/" +mkdir -p "$SRCROOT" +cd "$SRCROOT" +git clone https://github.com/sghoshcvc/TextBoxes.git +cd TextBoxes +cat Makefile.config.example > Makefile.config +echo 'USE_OPENCV := 0' >> Makefile.config +echo 'INCLUDE_DIRS += /usr/include/hdf5/serial/' >> Makefile.config +echo 'LIBRARY_DIRS += /usr/lib/x86_64-linux-gnu/hdf5/serial/' >> Makefile.config + + +echo "--- /tmp/caffe/include/caffe/net.hpp 2017-05-28 04:55:47.929623902 +0200 ++++ caffe/distribute/include/caffe/net.hpp 2017-05-28 04:51:33.437090768 +0200 +@@ -234,6 +234,7 @@ + + template + friend class Net; ++ virtual ~Callback(){} + }; + const vector& before_forward() const { return before_forward_; } + void add_before_forward(Callback* value) { +">/tmp/cleanup_caffe.diff + +patch < /tmp/cleanup_caffe.diff + + +make -j 6 + +make pycaffe + +make distribute +``` + + +```bash +#!/bin/bash +cd $OPENCV_BUILD_DIR #You must set this +CAFFEROOT="${HOME}/caffe_inst/" #If you used the previous code to compile Caffe in ubuntu 16.04 + +cmake -DCaffe_LIBS:FILEPATH="$CAFFEROOT/caffe/distribute/lib/libcaffe.so" -DBUILD_opencv_ts:BOOL="0" -DBUILD_opencv_dnn:BOOL="0" -DBUILD_opencv_dnn_modern:BOOL="0" -DCaffe_INCLUDE_DIR:PATH="$CAFFEROOT/caffe/distribute/include" -DWITH_MATLAB:BOOL="0" -DBUILD_opencv_cudabgsegm:BOOL="0" 
-DWITH_QT:BOOL="1" -DBUILD_opencv_cudaoptflow:BOOL="0" -DBUILD_opencv_cudastereo:BOOL="0" -DBUILD_opencv_cudafilters:BOOL="0" -DBUILD_opencv_cudev:BOOL="1" -DOPENCV_EXTRA_MODULES_PATH:PATH="/home/anguelos/work/projects/opencv_gsoc/opencv_contrib/modules" ./ + + ``` From be395e59814a32f4ff856e295a076c596de06a6e Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Wed, 19 Jul 2017 16:58:11 +0200 Subject: [PATCH 06/31] Modified the class heirarchy --- modules/text/include/opencv2/text/ocr.hpp | 4 - .../include/opencv2/text/textDetector.hpp | 124 ++++++- modules/text/src/ocr_holistic.cpp | 82 ++--- modules/text/src/text_detector.cpp | 12 +- modules/text/src/text_detectorCNN.cpp | 343 ++++++++++++++++++ 5 files changed, 509 insertions(+), 56 deletions(-) create mode 100644 modules/text/src/text_detectorCNN.cpp diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index 9593a1415fd..bd1c18ffb11 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -716,10 +716,6 @@ class CV_EXPORTS_W TextImageClassifier /** @brief produces a class confidence row-vector given an image */ CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0; - /** @brief produces a list of bounding box given an image - */ - - CV_WRAP virtual void detect(InputArray image, OutputArray classProbabilities) = 0; /** @brief produces a matrix containing class confidence row-vectors given an collection of images */ diff --git a/modules/text/include/opencv2/text/textDetector.hpp b/modules/text/include/opencv2/text/textDetector.hpp index ea1c7de9d4b..efbec6bffa9 100644 --- a/modules/text/include/opencv2/text/textDetector.hpp +++ b/modules/text/include/opencv2/text/textDetector.hpp @@ -65,19 +65,131 @@ namespace text //detection scenario class CV_EXPORTS_W BaseDetector { - public: +public: virtual ~BaseDetector() {}; virtual void run(Mat& image, - std::vector* component_rects=NULL, + std::vector* 
component_rects=NULL, std::vector* component_confidences=NULL, int component_level=0) = 0; virtual void run(Mat& image, Mat& mask, - std::vector* component_rects=NULL, + std::vector* component_rects=NULL, std::vector* component_confidences=NULL, int component_level=0) = 0; +}; +/** A virtual class for different models of text detection (including CNN based deep models) + */ + +class CV_EXPORTS_W TextRegionDetector +{ +protected: + /** Stores input and output size + */ + //netGeometry inputGeometry_; + //netGeometry outputGeometry_; + Size inputGeometry_; + Size outputGeometry_; + int inputChannelCount_; + int outputChannelCount_; + +public: + virtual ~TextRegionDetector() {} + + /** @brief produces a list of Bounding boxes and an estimate of text-ness confidence of Bounding Boxes + */ + CV_WRAP virtual void detect(InputArray image, OutputArray bboxProb ) = 0; + + + /** @brief simple getter method returning the size (height, width) of the input sample + */ + CV_WRAP virtual Size getInputGeometry(){return this->inputGeometry_;} + + /** @brief simple getter method returning the shape of the output + * Any text detector should output a number of text regions along with a score of text-ness + * From the shape it can be inferred the number of text regions and number of returned values + * for each region + */ + CV_WRAP virtual Size getOutputGeometry(){return this->outputGeometry_;} + + + +}; + +/** Generic structure of Deep CNN based Text Detectors + * */ +class CV_EXPORTS_W DeepCNNTextDetector : public TextRegionDetector +{ + /** @brief Class that uses a pretrained caffe model for text detection.
+ * Any text detection should + * This network is described in detail in: + * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network + * https://arxiv.org/abs/1611.06779 + */ +protected: + /** all deep CNN based text detectors have a preprocessor (normally) + */ + Ptr preprocessor_; + /** @brief all image preprocessing is handled here including whitening etc. + * + * @param input the image to be preprocessed for the classifier. If the depth + * is CV_U8 values should be in [0,255] otherwise values are assumed to be in [0,1] + * + * @param output reference to the image to be fed to the classifier, the preprocessor will + * resize the image to the appropriate size and convert it to the appropriate depth\ + * + * The method preprocess should never be used externally, it is up to classify and classifyBatch + * methods to employ it. + */ + virtual void preprocess(const Mat& input,Mat& output); +public: + virtual ~DeepCNNTextDetector() {}; + + /** @brief Constructs a DeepCNNTextDetector object from a caffe pretrained model + * + * @param archFilename is the path to the prototxt file containing the deployment model architecture description. + * + * @param weightsFilename is the path to the pretrained weights of the model in binary form. + * + * @param preprocessor is a pointer to the instance of an ImagePreprocessor implementing the preprocess_ protected method; + * + * @param minibatchSz the maximum number of samples that can be processed in parallel. In practice this parameter + * has an effect only when computing in the GPU and should be set with respect to the memory available in the GPU. + * + * @param backEnd integer parameter selecting the computation framework.
For now OCR_HOLISTIC_BACKEND_CAFFE is + * the only option + */ + CV_WRAP static Ptr create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); + + /** @brief Constructs a DeepCNNTextDetector intended to be used for text area detection. + * + * This method loads a pretrained classifier and couples with a preprocessor that preprocesses the image with mean subtraction of () + * The architecture and model weights can be downloaded from: + * https://github.com/sghoshcvc/TextBox-Models.git (size is around 100 MB) + + * @param archFilename is the path to the prototxt file containing the deployment model architecture description. + * When employing OCR_HOLISTIC_BACKEND_CAFFE this is the path to the deploy ".prototxt". + * + * @param weightsFilename is the path to the pretrained weights of the model. When employing + * OCR_HOLISTIC_BACKEND_CAFFE this is the path to the ".caffemodel" file. + * + * @param backEnd integer parameter selecting the computation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is + * the only option + */ + CV_WRAP static Ptr createTextBoxNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); + friend class ImagePreprocessor; + +}; + +/** @brief textDetector class provides the functionality of text bounding box detection. + * A TextRegionDetector is employed to find bounding boxes of text + * words given an input image.
+ * + * This class implements the logic of providing text bounding boxes in a vector of rects given an TextRegionDetector + * The TextRegionDetector can be any text detector + * + */ class CV_EXPORTS_W textDetector : public BaseDetector { @@ -125,9 +237,9 @@ class CV_EXPORTS_W textDetector : public BaseDetector - /** @brief simple getter for the preprocessing functor + /** @brief simple getter for the preprocessing functor */ - CV_WRAP virtual Ptr getClassifier()=0; + CV_WRAP virtual Ptr getClassifier()=0; /** @brief Creates an instance of the textDetector class. @@ -135,7 +247,7 @@ class CV_EXPORTS_W textDetector : public BaseDetector */ - CV_WRAP static Ptr create(Ptr classifierPtr); + CV_WRAP static Ptr create(Ptr classifierPtr); /** @brief Creates an instance of the textDetector class and implicitly also a DeepCNN classifier. diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp index 9791e62bbf5..ae73b04dc86 100644 --- a/modules/text/src/ocr_holistic.cpp +++ b/modules/text/src/ocr_holistic.cpp @@ -459,53 +459,53 @@ class DeepCNNCaffeImpl: public DeepCNN{ #endif } - void process_(Mat inputImage, Mat &outputMat) - { - // do forward pass and stores the output in outputMat - //Process one image - CV_Assert(this->minibatchSz_==1); - //CV_Assert(outputMat.isContinuous()); +// void process_(Mat inputImage, Mat &outputMat) +// { +// // do forward pass and stores the output in outputMat +// //Process one image +// CV_Assert(this->minibatchSz_==1); +// //CV_Assert(outputMat.isContinuous()); -#ifdef HAVE_CAFFE - net_->input_blobs()[0]->Reshape(1, this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width); - net_->Reshape(); - float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); - float* inputData=inputBuffer; +//#ifdef HAVE_CAFFE +// net_->input_blobs()[0]->Reshape(1, this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width); +// net_->Reshape(); +// float* 
inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); +// float* inputData=inputBuffer; - std::vector input_channels; - Mat preprocessed; - // if the image have multiple color channels the input layer should be populated accordingly - for (int channel=0;channel < this->channelCount_;channel++){ +// std::vector input_channels; +// Mat preprocessed; +// // if the image have multiple color channels the input layer should be populated accordingly +// for (int channel=0;channel < this->channelCount_;channel++){ - cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); - input_channels.push_back(netInputWraped); - //input_data += width * height; - inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); - } - this->preprocess(inputImage,preprocessed); - split(preprocessed, input_channels); +// cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); +// input_channels.push_back(netInputWraped); +// //input_data += width * height; +// inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); +// } +// this->preprocess(inputImage,preprocessed); +// split(preprocessed, input_channels); - //preprocessed.copyTo(netInputWraped); +// //preprocessed.copyTo(netInputWraped); - this->net_->Forward(); - const float* outputNetData=net_->output_blobs()[0]->cpu_data(); - // const float* outputNetData1=net_->output_blobs()[1]->cpu_data(); +// this->net_->Forward(); +// const float* outputNetData=net_->output_blobs()[0]->cpu_data(); +// // const float* outputNetData1=net_->output_blobs()[1]->cpu_data(); - this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); - int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width; - outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); - float*outputMatData=(float*)(outputMat.data); +// this->outputGeometry_ = 
Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); +// int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width; +// outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); +// float*outputMatData=(float*)(outputMat.data); - memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); +// memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); -#endif - } +//#endif +// } @@ -587,15 +587,15 @@ class DeepCNNCaffeImpl: public DeepCNN{ inputImageList.push_back(image.getMat()); classifyBatch(inputImageList,classProbabilities); } - void detect(InputArray image, OutputArray Bbox_prob) - { +// void detect(InputArray image, OutputArray Bbox_prob) +// { - Bbox_prob.create(this->outputGeometry_,CV_32F); // dummy initialization is it needed - Mat outputMat = Bbox_prob.getMat(); - process_(image.getMat(),outputMat); - //copy back to outputArray - outputMat.copyTo(Bbox_prob); - } +// Bbox_prob.create(this->outputGeometry_,CV_32F); // dummy initialization is it needed +// Mat outputMat = Bbox_prob.getMat(); +// process_(image.getMat(),outputMat); +// //copy back to outputArray +// outputMat.copyTo(Bbox_prob); +// } void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities) { diff --git a/modules/text/src/text_detector.cpp b/modules/text/src/text_detector.cpp index 5b18e970861..1b979c253bf 100644 --- a/modules/text/src/text_detector.cpp +++ b/modules/text/src/text_detector.cpp @@ -23,6 +23,8 @@ namespace cv { namespace text { + + class textDetectImpl: public textDetector{ private: struct NetOutput{ @@ -60,9 +62,9 @@ class textDetectImpl: public textDetector{ }; protected: - Ptr classifier_; + Ptr classifier_; public: - textDetectImpl(Ptr classifierPtr):classifier_(classifierPtr) + textDetectImpl(Ptr classifierPtr):classifier_(classifierPtr) { } @@ -131,13 +133,13 @@ class textDetectImpl: public textDetector{ - Ptr getClassifier() + Ptr getClassifier() { return 
this->classifier_; } }; -Ptr textDetector::create(Ptr classifierPtr) +Ptr textDetector::create(Ptr classifierPtr) { return Ptr(new textDetectImpl(classifierPtr)); } @@ -155,7 +157,7 @@ Ptr textDetector::create(String modelArchFilename, String modelWei textbox_mean.at(0,2)=123; preprocessor->set_mean(textbox_mean); // create a pointer to text box detector(textDetector) - Ptr classifierPtr(DeepCNN::create(modelArchFilename,modelWeightsFilename,preprocessor,1)); + Ptr classifierPtr(DeepCNNTextDetector::create(modelArchFilename,modelWeightsFilename,preprocessor,1)); return Ptr(new textDetectImpl(classifierPtr)); } diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp new file mode 100644 index 00000000000..b48e97e7cd2 --- /dev/null +++ b/modules/text/src/text_detectorCNN.cpp @@ -0,0 +1,343 @@ +#include "precomp.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/core.hpp" + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifdef HAVE_CAFFE +#include "caffe/caffe.hpp" +#endif +namespace cv { namespace text { + +inline bool fileExists (String filename) { + std::ifstream f(filename.c_str()); + return f.good(); +} + +//************************************************************************************ +//****************** TextImageClassifier ***************************************** +//************************************************************************************ + +//void TextImageClassifier::preprocess(const Mat& input,Mat& output) +//{ +// this->preprocessor_->preprocess_(input,output,this->inputGeometry_,this->channelCount_); +//} + +//void TextImageClassifier::setPreprocessor(Ptr ptr) +//{ +// CV_Assert(!ptr.empty()); +// preprocessor_=ptr; +//} + +//Ptr TextImageClassifier::getPreprocessor() +//{ +// return preprocessor_; +//} + + +class DeepCNNTextDetectorCaffeImpl: public DeepCNNTextDetector{ +protected: + + + void process_(Mat inputImage, Mat &outputMat) + 
{ + // do forward pass and stores the output in outputMat + //Process one image + // CV_Assert(this->outputGeometry_.batchSize==1); + //CV_Assert(outputMat.isContinuous()); + +#ifdef HAVE_CAFFE + net_->input_blobs()[0]->Reshape(1, this->inputChannelCount_,this->inputGeometry_.height,this->inputGeometry_.width); + net_->Reshape(); + float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); + float* inputData=inputBuffer; + + std::vector input_channels; + Mat preprocessed; + // if the image have multiple color channels the input layer should be populated accordingly + for (int channel=0;channel < this->inputChannelCount_;channel++){ + + cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); + input_channels.push_back(netInputWraped); + //input_data += width * height; + inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); + } + this->preprocess(inputImage,preprocessed); + split(preprocessed, input_channels); + + //preprocessed.copyTo(netInputWraped); + + + this->net_->Forward(); + const float* outputNetData=net_->output_blobs()[0]->cpu_data(); + // const float* outputNetData1=net_->output_blobs()[1]->cpu_data(); + + + + + this->outputGeometry_.height = net_->output_blobs()[0]->height(); + this->outputGeometry_.width = net_->output_blobs()[0]->width(); + this->outputChannelCount_ = net_->output_blobs()[0]->channels(); + int outputSz = this->outputChannelCount_ * this->outputGeometry_.height * this->outputGeometry_.width; + outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); + float*outputMatData=(float*)(outputMat.data); + + memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); + + + +#endif + } + + + +#ifdef HAVE_CAFFE + Ptr > net_; +#endif + //Size inputGeometry_; + int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst + //int outputSize_; +public: + DeepCNNTextDetectorCaffeImpl(const DeepCNNTextDetectorCaffeImpl& dn): + 
minibatchSz_(dn.minibatchSz_){ + outputGeometry_=dn.outputGeometry_; + inputGeometry_=dn.inputGeometry_; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + } + DeepCNNTextDetectorCaffeImpl& operator=(const DeepCNNTextDetectorCaffeImpl &dn) + { +#ifdef HAVE_CAFFE + this->net_=dn.net_; +#endif + this->setPreprocessor(dn.preprocessor_); + this->inputGeometry_=dn.inputGeometry_; + this->inputChannelCount_=dn.inputChannelCount_; + this->outputChannelCount_ = dn.outputChannelCount_; + // this->minibatchSz_=dn.minibatchSz_; + //this->outputGeometry_=dn.outputSize_; + this->preprocessor_=dn.preprocessor_; + this->outputGeometry_=dn.outputGeometry_; + return *this; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" + } + + DeepCNNTextDetectorCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) + :minibatchSz_(maxMinibatchSz) + { + + CV_Assert(this->minibatchSz_>0); + CV_Assert(fileExists(modelArchFilename)); + CV_Assert(fileExists(modelWeightsFilename)); + CV_Assert(!preprocessor.empty()); + this->setPreprocessor(preprocessor); +#ifdef HAVE_CAFFE + this->net_.reset(new caffe::Net(modelArchFilename, caffe::TEST)); + CV_Assert(net_->num_inputs()==1); + CV_Assert(net_->num_outputs()==1); + CV_Assert(this->net_->input_blobs()[0]->channels()==1 + ||this->net_->input_blobs()[0]->channels()==3); + // this->channelCount_=this->net_->input_blobs()[0]->channels(); + + + + this->net_->CopyTrainedLayersFrom(modelWeightsFilename); + + caffe::Blob* inputLayer = this->net_->input_blobs()[0]; + + this->inputGeometry_.height = inputLayer->height(); + this->inputGeometry_.width = inputLayer->width(); + this->inputChannelCount_ = inputLayer->channels(); + //this->inputGeometry_.batchSize =1; + + inputLayer->Reshape(this->minibatchSz_,this->inputChannelCount_,this->inputGeometry_.height, 
this->inputGeometry_.width); + net_->Reshape(); + this->outputChannelCount_ = net_->output_blobs()[0]->channels(); + //this->outputGeometry_.batchSize =1; + this->outputGeometry_.height =net_->output_blobs()[0]->height(); + this->outputGeometry_.width = net_->output_blobs()[0]->width(); + + + + + +#else + CV_Error(Error::StsError,"Caffe not available during compilation!"); +#endif + } + + + void detect(InputArray image, OutputArray Bbox_prob) + { + Size outSize = Size(this->outputGeometry_.height,outputGeometry_.width); + Bbox_prob.create(outSize,CV_32F); // dummy initialization is it needed + Mat outputMat = Bbox_prob.getMat(); + process_(image.getMat(),outputMat); + //copy back to outputArray + outputMat.copyTo(Bbox_prob); + } + + + + //int getOutputSize() + //{ + // return this->outputSize_; + //} + Size getOutputGeometry() + { + return this->outputGeometry_; + } + Size getinputGeometry() + { + return this->inputGeometry_; + } + + int getMinibatchSize() + { + return this->minibatchSz_; + } + + int getBackend() + { + return OCR_HOLISTIC_BACKEND_CAFFE; + } + void setPreprocessor(Ptr ptr) + { + CV_Assert(!ptr.empty()); + preprocessor_=ptr; + } + + Ptr getPreprocessor() + { + return preprocessor_; + } +}; + + +Ptr DeepCNNTextDetector::create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz,int backEnd) +{ + if(preprocessor.empty()) + { + // create a custom preprocessor with rawval + Ptr preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255); + // set the mean for the preprocessor + + Mat textbox_mean(1,3,CV_8U); + textbox_mean.at(0,0)=104; + textbox_mean.at(0,1)=117; + textbox_mean.at(0,2)=123; + preprocessor->set_mean(textbox_mean); + } + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_CAFFE: + + return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return 
Ptr(); + break; + } + return Ptr(); + +} + + +Ptr DeepCNNTextDetector::createTextBoxNet(String archFilename,String weightsFilename,int backEnd) +{ + + // create a custom preprocessor with rawval + Ptr preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255); + // set the mean for the preprocessor + + Mat textbox_mean(1,3,CV_8U); + textbox_mean.at(0,0)=104; + textbox_mean.at(0,1)=117; + textbox_mean.at(0,2)=123; + preprocessor->set_mean(textbox_mean); + switch(backEnd){ + case OCR_HOLISTIC_BACKEND_CAFFE: + return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); + break; + case OCR_HOLISTIC_BACKEND_NONE: + default: + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); + break; + } + return Ptr(); + +} + +void DeepCNNTextDetector::preprocess(const Mat& input,Mat& output) +{ + Size inputHtWd = Size(this->inputGeometry_.height,this->inputGeometry_.width); + this->preprocessor_->preprocess(input,output,inputHtWd,this->inputChannelCount_); +} + +//namespace cnn_config{ +//namespace caffe_backend{ + +//#ifdef HAVE_CAFFE + +//bool getCaffeGpuMode() +//{ +// return caffe::Caffe::mode()==caffe::Caffe::GPU; +//} + +//void setCaffeGpuMode(bool useGpu) +//{ +// if(useGpu) +// { +// caffe::Caffe::set_mode(caffe::Caffe::GPU); +// }else +// { +// caffe::Caffe::set_mode(caffe::Caffe::CPU); +// } +//} + +//bool getCaffeAvailable() +//{ +// return true; +//} + +//#else + +//bool getCaffeGpuMode() +//{ +// CV_Error(Error::StsError,"Caffe not available during compilation!"); +// return 0; +//} + +//void setCaffeGpuMode(bool useGpu) +//{ +// CV_Error(Error::StsError,"Caffe not available during compilation!"); +// CV_Assert(useGpu==1);//Compilation directives force +//} + +//bool getCaffeAvailable(){ +// return 0; +//} + +//#endif + +//}//namespace caffe +//}//namespace cnn_config + +} } //namespace text namespace cv + From 1bc908bdbd0b5f95c729d81d24b4862d9ff40f3e Mon Sep 17 00:00:00 2001 From: sghoshcvc 
Date: Wed, 19 Jul 2017 18:57:16 +0200 Subject: [PATCH 07/31] Added python sample script --- modules/text/samples/deeptextdetection.py | 59 +++++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 modules/text/samples/deeptextdetection.py diff --git a/modules/text/samples/deeptextdetection.py b/modules/text/samples/deeptextdetection.py new file mode 100644 index 00000000000..e2f67a3f113 --- /dev/null +++ b/modules/text/samples/deeptextdetection.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- +""" +Created on Wed Jul 19 17:54:00 2017 + +@author: sgnosh +""" + +#!/usr/bin/python + +import sys +import os + +import cv2 +import numpy as np + +print('\nDeeptextdetection.py') +print(' A demo script of text box alogorithm of the paper:') +print(' * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network https://arxiv.org/abs/1611.06779\n') + + +if (len(sys.argv) < 2): + print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n') + quit() +#if not cv2.text.cnn_config.caffe_backend.getCaffeAvailable(): +# print"The text module was compiled without Caffe which is the only available DeepCNN backend.\nAborting!\n" +# +# quit() +# check model and architecture file existance +if not os.path.isfile('textbox.caffemodel') or not os.path.isfile('textbox_deploy.prototxt'): + print " Model files not found in current directory. 
Aborting" + print " Model files should be downloaded from https://github.com/sghoshcvc/TextBox-Models" + + quit() + +cv2.text.cnn_config.caffe_backend.setCaffeGpuMode(True); +pathname = os.path.dirname(sys.argv[0]) + + +img = cv2.imread(str(sys.argv[1])) +textSpotter=cv2.text.textDetector_create( + "textbox_deploy.prototxt","textbox.caffemodel") +rects,outProbs = textSpotter.textDetectInImage(img); +# for visualization +vis = img.copy() +# Threshold to select rectangles : All rectangles for which outProbs is more than this threshold will be shown +thres = 0.6 + + + #Visualization +for r in range(0,np.shape(rects)[0]): + if outProbs[r] >thres: + rect = rects[r] + cv2.rectangle(vis, (rect[0],rect[1]), (rect[0]+rect[2],rect[1]+rect[3]), (255, 0, 0), 2) + # cv2.rectangle(vis, (rect[0],rect[1]), (rect[0]+rect[2],rect[1]+rect[3]), (255, 255, 255), 1) + + +#Visualization +cv2.imshow("Text detection result", vis) +cv2.waitKey(0) \ No newline at end of file From 73ddeab66f1d7c92458c0f60bfce23bea6eb13a4 Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Wed, 19 Jul 2017 19:01:30 +0200 Subject: [PATCH 08/31] simple cleaning and added comments --- .../text/include/opencv2/text/erfilter.hpp | 1 + modules/text/src/ocr_holistic.cpp | 59 -------------- modules/text/src/text_detector.cpp | 7 +- modules/text/src/text_detectorCNN.cpp | 80 +------------------ 4 files changed, 6 insertions(+), 141 deletions(-) diff --git a/modules/text/include/opencv2/text/erfilter.hpp b/modules/text/include/opencv2/text/erfilter.hpp index af983c6c168..84d72d2a0a4 100644 --- a/modules/text/include/opencv2/text/erfilter.hpp +++ b/modules/text/include/opencv2/text/erfilter.hpp @@ -65,6 +65,7 @@ component tree of the image. : */ struct CV_EXPORTS ERStat { + public: //! 
Constructor explicit ERStat(int level = 256, int pixel = 0, int x = 0, int y = 0); diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp index ae73b04dc86..670d1a2154f 100644 --- a/modules/text/src/ocr_holistic.cpp +++ b/modules/text/src/ocr_holistic.cpp @@ -459,56 +459,6 @@ class DeepCNNCaffeImpl: public DeepCNN{ #endif } -// void process_(Mat inputImage, Mat &outputMat) -// { -// // do forward pass and stores the output in outputMat -// //Process one image -// CV_Assert(this->minibatchSz_==1); -// //CV_Assert(outputMat.isContinuous()); - -//#ifdef HAVE_CAFFE -// net_->input_blobs()[0]->Reshape(1, this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width); -// net_->Reshape(); -// float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); -// float* inputData=inputBuffer; - -// std::vector input_channels; -// Mat preprocessed; -// // if the image have multiple color channels the input layer should be populated accordingly -// for (int channel=0;channel < this->channelCount_;channel++){ - -// cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); -// input_channels.push_back(netInputWraped); -// //input_data += width * height; -// inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); -// } -// this->preprocess(inputImage,preprocessed); -// split(preprocessed, input_channels); - -// //preprocessed.copyTo(netInputWraped); - - -// this->net_->Forward(); -// const float* outputNetData=net_->output_blobs()[0]->cpu_data(); -// // const float* outputNetData1=net_->output_blobs()[1]->cpu_data(); - - - - -// this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); -// int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width; -// outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); -// float*outputMatData=(float*)(outputMat.data); - -// 
memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); - - - -//#endif -// } - - - #ifdef HAVE_CAFFE Ptr > net_; #endif @@ -587,15 +537,6 @@ class DeepCNNCaffeImpl: public DeepCNN{ inputImageList.push_back(image.getMat()); classifyBatch(inputImageList,classProbabilities); } -// void detect(InputArray image, OutputArray Bbox_prob) -// { - -// Bbox_prob.create(this->outputGeometry_,CV_32F); // dummy initialization is it needed -// Mat outputMat = Bbox_prob.getMat(); -// process_(image.getMat(),outputMat); -// //copy back to outputArray -// outputMat.copyTo(Bbox_prob); -// } void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities) { diff --git a/modules/text/src/text_detector.cpp b/modules/text/src/text_detector.cpp index 1b979c253bf..9b6d4f966a4 100644 --- a/modules/text/src/text_detector.cpp +++ b/modules/text/src/text_detector.cpp @@ -74,7 +74,7 @@ class textDetectImpl: public textDetector{ void textDetectInImage(InputArray inputImage,CV_OUT std::vector& Bbox,CV_OUT std::vector& confidence) { Mat netOutput; - // call the detect function of deepCNN class + // call the detect function of deepTextCNN class this->classifier_->detect(inputImage,netOutput); // get the output geometry i.e height and width of output blob from caffe Size OutputGeometry_ = this->classifier_->getOutputGeometry(); @@ -102,12 +102,11 @@ class textDetectImpl: public textDetector{ int component_level=0) { CV_Assert(component_level==OCR_LEVEL_WORD);//Componnents not applicable for word spotting - //double confidence; - //String transcription; + std::vector bbox; std::vector score; textDetectInImage(image,bbox,score); - //output_text=transcription.c_str(); + if(component_rects!=NULL) { component_rects->resize(bbox.size()); // should be a user behavior diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp index b48e97e7cd2..14cdaeb3887 100644 --- a/modules/text/src/text_detectorCNN.cpp +++ b/modules/text/src/text_detectorCNN.cpp 
@@ -26,27 +26,6 @@ inline bool fileExists (String filename) { return f.good(); } -//************************************************************************************ -//****************** TextImageClassifier ***************************************** -//************************************************************************************ - -//void TextImageClassifier::preprocess(const Mat& input,Mat& output) -//{ -// this->preprocessor_->preprocess_(input,output,this->inputGeometry_,this->channelCount_); -//} - -//void TextImageClassifier::setPreprocessor(Ptr ptr) -//{ -// CV_Assert(!ptr.empty()); -// preprocessor_=ptr; -//} - -//Ptr TextImageClassifier::getPreprocessor() -//{ -// return preprocessor_; -//} - - class DeepCNNTextDetectorCaffeImpl: public DeepCNNTextDetector{ protected: @@ -54,9 +33,7 @@ class DeepCNNTextDetectorCaffeImpl: public DeepCNNTextDetector{ void process_(Mat inputImage, Mat &outputMat) { // do forward pass and stores the output in outputMat - //Process one image - // CV_Assert(this->outputGeometry_.batchSize==1); - //CV_Assert(outputMat.isContinuous()); + #ifdef HAVE_CAFFE net_->input_blobs()[0]->Reshape(1, this->inputChannelCount_,this->inputGeometry_.height,this->inputGeometry_.width); @@ -191,12 +168,6 @@ class DeepCNNTextDetectorCaffeImpl: public DeepCNNTextDetector{ outputMat.copyTo(Bbox_prob); } - - - //int getOutputSize() - //{ - // return this->outputSize_; - //} Size getOutputGeometry() { return this->outputGeometry_; @@ -290,54 +261,7 @@ void DeepCNNTextDetector::preprocess(const Mat& input,Mat& output) this->preprocessor_->preprocess(input,output,inputHtWd,this->inputChannelCount_); } -//namespace cnn_config{ -//namespace caffe_backend{ - -//#ifdef HAVE_CAFFE - -//bool getCaffeGpuMode() -//{ -// return caffe::Caffe::mode()==caffe::Caffe::GPU; -//} - -//void setCaffeGpuMode(bool useGpu) -//{ -// if(useGpu) -// { -// caffe::Caffe::set_mode(caffe::Caffe::GPU); -// }else -// { -// caffe::Caffe::set_mode(caffe::Caffe::CPU); -// } 
-//} - -//bool getCaffeAvailable() -//{ -// return true; -//} - -//#else - -//bool getCaffeGpuMode() -//{ -// CV_Error(Error::StsError,"Caffe not available during compilation!"); -// return 0; -//} - -//void setCaffeGpuMode(bool useGpu) -//{ -// CV_Error(Error::StsError,"Caffe not available during compilation!"); -// CV_Assert(useGpu==1);//Compilation directives force -//} - -//bool getCaffeAvailable(){ -// return 0; -//} - -//#endif - -//}//namespace caffe -//}//namespace cnn_config + } } //namespace text namespace cv From 8cf800e650522e6f78070aa4224880c334181a16 Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Fri, 21 Jul 2017 03:09:06 +0200 Subject: [PATCH 09/31] fix a dependency bug --- modules/text/src/precomp.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/text/src/precomp.hpp b/modules/text/src/precomp.hpp index 94f05d8cc99..c7371db1e79 100644 --- a/modules/text/src/precomp.hpp +++ b/modules/text/src/precomp.hpp @@ -45,7 +45,7 @@ #include "opencv2/text.hpp" -#include "text_config.hpp" +//#include "text_config.hpp" #ifdef HAVE_TESSERACT #include From a617059f24bef66c160606b6952ebd7154a31d7f Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Fri, 21 Jul 2017 13:11:58 +0200 Subject: [PATCH 10/31] removed Java Wrapper --- modules/text/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt index 861848f704a..a8a32326f52 100644 --- a/modules/text/CMakeLists.txt +++ b/modules/text/CMakeLists.txt @@ -31,7 +31,7 @@ else() message(STATUS "Glog: NO") endif() -ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_calib3d WRAP python java) +ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_calib3d WRAP python) #ocv_define_module(text ${TEXT_DEPS} WRAP python) #set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}) From ca2a2abed0bdb56d796b144505591a083a40a6a3 Mon Sep 17 00:00:00 2001 
From: sghoshcvc Date: Sat, 22 Jul 2017 00:24:17 +0200 Subject: [PATCH 11/31] Removed white space errors and platform specific warnings --- modules/text/CMakeLists.txt | 1 - modules/text/FindTesseract.cmake | 4 +-- modules/text/README.md | 8 ++--- modules/text/include/opencv2/text/ocr.hpp | 36 ++++++++++------------- modules/text/samples/deeptextdetection.py | 3 +- modules/text/samples/textbox_demo.cpp | 4 +-- modules/text/src/text_detectorCNN.cpp | 9 ++++-- 7 files changed, 31 insertions(+), 34 deletions(-) diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt index a8a32326f52..5d5a52b4ad6 100644 --- a/modules/text/CMakeLists.txt +++ b/modules/text/CMakeLists.txt @@ -67,4 +67,3 @@ if() else() message(STATUS "TEXT CAFFE CONFLICT") endif() - diff --git a/modules/text/FindTesseract.cmake b/modules/text/FindTesseract.cmake index 54c4a49297d..4622ece142e 100644 --- a/modules/text/FindTesseract.cmake +++ b/modules/text/FindTesseract.cmake @@ -19,6 +19,4 @@ find_library(Lept_LIBRARY NAMES lept set(Tesseract_LIBS ${Tesseract_LIBRARY} ${Lept_LIBRARY}) if(Tesseract_LIBS AND Tesseract_INCLUDE_DIR) set(Tesseract_FOUND 1) -endif() - - +endif() diff --git a/modules/text/README.md b/modules/text/README.md index a82bef20f06..2caf58a1e17 100644 --- a/modules/text/README.md +++ b/modules/text/README.md @@ -66,7 +66,7 @@ Instalation of Caffe backend The caffe wrapping backend has the requirements caffe does. * Caffe can be built against OpenCV, if the caffe backend is enabled, a circular bependency arises. The simplest solution is to build caffe without support for OpenCV. -* Only the OS supported by Caffe are supported by the backend. +* Only the OS supported by Caffe are supported by the backend. The scripts describing the module have been developed in ubuntu 16.04 and assume such a system. Other UNIX systems including OSX should be easy to adapt. 
@@ -90,7 +90,7 @@ echo 'LIBRARY_DIRS += /usr/lib/x86_64-linux-gnu/hdf5/serial/' >> Makefile.config echo "--- /tmp/caffe/include/caffe/net.hpp 2017-05-28 04:55:47.929623902 +0200 +++ caffe/distribute/include/caffe/net.hpp 2017-05-28 04:51:33.437090768 +0200 @@ -234,6 +234,7 @@ - + template friend class Net; + virtual ~Callback(){} @@ -138,7 +138,7 @@ Instalation of Caffe backend The caffe wrapping backend has the requirements caffe does. * Caffe can be built against OpenCV, if the caffe backend is enabled, a circular bependency arises. The simplest solution is to build caffe without support for OpenCV. -* Only the OS supported by Caffe are supported by the backend. +* Only the OS supported by Caffe are supported by the backend. The scripts describing the module have been developed in ubuntu 16.04 and assume such a system. Other UNIX systems including OSX should be easy to adapt. @@ -160,7 +160,7 @@ echo 'LIBRARY_DIRS += /usr/lib/x86_64-linux-gnu/hdf5/serial/' >> Makefile.config echo "--- /tmp/caffe/include/caffe/net.hpp 2017-05-28 04:55:47.929623902 +0200 +++ caffe/distribute/include/caffe/net.hpp 2017-05-28 04:51:33.437090768 +0200 @@ -234,6 +234,7 @@ - + template friend class Net; + virtual ~Callback(){} diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index 8030fcb63e9..e01a16f7275 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -91,7 +91,7 @@ enum ocr_engine_mode }; //base class BaseOCR declares a common API that would be used in a typical text recognition scenario - + class CV_EXPORTS_W BaseOCR { public: @@ -188,7 +188,7 @@ class CV_EXPORTS_W OCRTesseract : public BaseOCR /** @brief Creates an instance of the OCRTesseract class. Initializes Tesseract. - + @param datapath the name of the parent directory of tessdata ended with "/", or NULL to use the system's default directory. @param language an ISO 639-3 code or NULL will default to "eng". 
@@ -277,8 +277,7 @@ class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR { * for the individual text elements found (e.g. words). * @param component_texts If provided the method will output a list of text - * strings for the recognition of individual text elements found (e.g. words) - * . + * strings for the recognition of individual text elements found (e.g. words). * @param component_confidences If provided the method will output a list of * confidence values for the recognition of individual text elements found @@ -314,8 +313,7 @@ class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR { * for the individual text elements found (e.g. words). * @param component_texts If provided the method will output a list of text - * strings for the recognition of individual text elements found (e.g. words) - * . + * strings for the recognition of individual text elements found (e.g. words). * @param component_confidences If provided the method will output a list of * confidence values for the recognition of individual text elements found @@ -596,34 +594,32 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{ int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment) int beam_size = 500); // Size of the beam in Beam Search algorithm - /** @brief This method allows to plug a classifier that is derivative of TextImageClassifier in to - * OCRBeamSearchDecoder as a ClassifierCallback. - @param classifier A pointer to a TextImageClassifier decendent - @param alphabet The language alphabet one char per symbol. alphabet.size() must be equal to the number of classes - of the classifier. In future editinons it should be replaced with a vector of strings. + + + /** @brief Creates an instance of the OCRBeamSearchDecoder class. Initializes HMMDecoder from the specified path. + + @overload + + @param filename path to a character classifier file + + @param vocabulary The language vocabulary (chars when ASCII English text). 
vocabulary.size() + must be equal to the number of classes of the classifier.. @param transition_probabilities_table Table with transition probabilities between character - pairs. cols == rows == alphabet.size(). + pairs. cols == rows == vocabulary.size(). @param emission_probabilities_table Table with observation emission probabilities. cols == - rows == alphabet.size(). + rows == vocabulary.size(). @param windowWidth The width of the windows to which the sliding window will be iterated. The height will be the height of the image. The windows might be resized to fit the classifiers input by the classifiers preprocessor. - @param windowStep The step for the sliding window - @param mode HMM Decoding algorithm (only Viterbi for the moment) @param beam_size Size of the beam in Beam Search algorithm - */ - - /** @brief Creates an instance of the OCRBeamSearchDecoder class. Initializes HMMDecoder from the specified path. - - @overload */ CV_WRAP static Ptr create(const String& filename, // The character classifier file diff --git a/modules/text/samples/deeptextdetection.py b/modules/text/samples/deeptextdetection.py index e2f67a3f113..8bc7a642255 100644 --- a/modules/text/samples/deeptextdetection.py +++ b/modules/text/samples/deeptextdetection.py @@ -25,11 +25,10 @@ # print"The text module was compiled without Caffe which is the only available DeepCNN backend.\nAborting!\n" # # quit() -# check model and architecture file existance +# check model and architecture file existance if not os.path.isfile('textbox.caffemodel') or not os.path.isfile('textbox_deploy.prototxt'): print " Model files not found in current directory. 
Aborting" print " Model files should be downloaded from https://github.com/sghoshcvc/TextBox-Models" - quit() cv2.text.cnn_config.caffe_backend.setCaffeGpuMode(True); diff --git a/modules/text/samples/textbox_demo.cpp b/modules/text/samples/textbox_demo.cpp index a4155893543..e36015831cf 100644 --- a/modules/text/samples/textbox_demo.cpp +++ b/modules/text/samples/textbox_demo.cpp @@ -61,7 +61,7 @@ int main(int argc, const char * argv[]){ exit(1); } //set to true if you have a GPU with more than 3GB - cv::text::cnn_config::caffe_backend::setCaffeGpuMode(false); + cv::text::cnn_config::caffe_backend::setCaffeGpuMode(true); if (argc < 3){ std::cout< outProbabillities; textSpotter->textDetectInImage(image,bbox,outProbabillities); // textbox_draw(image, bbox,outProbabillities); - float thres =0.6; + float thres =0.6f; std::vector imageList; for(int imageIdx=0;imageIdx<(int)bbox.size();imageIdx++){ if(outProbabillities[imageIdx]>thres){ diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp index 14cdaeb3887..cf3a0c8baa0 100644 --- a/modules/text/src/text_detectorCNN.cpp +++ b/modules/text/src/text_detectorCNN.cpp @@ -19,6 +19,9 @@ #ifdef HAVE_CAFFE #include "caffe/caffe.hpp" #endif + +#define CV_WARN(message) fprintf(stderr, "warning: %s (%s:%d)\n", message, __FILE__, __LINE__) + namespace cv { namespace text { inline bool fileExists (String filename) { @@ -33,6 +36,9 @@ class DeepCNNTextDetectorCaffeImpl: public DeepCNNTextDetector{ void process_(Mat inputImage, Mat &outputMat) { // do forward pass and stores the output in outputMat + CV_Assert(outputMat.isContinuous()); + if (inputImage.channels() != this->inputChannelCount_) + CV_WARN("Number of input channel(s) in the model is not same as input"); #ifdef HAVE_CAFFE @@ -204,7 +210,7 @@ Ptr DeepCNNTextDetector::create(String archFilename,String if(preprocessor.empty()) { // create a custom preprocessor with rawval - Ptr 
preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255); + preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255); // set the mean for the preprocessor Mat textbox_mean(1,3,CV_8U); @@ -264,4 +270,3 @@ void DeepCNNTextDetector::preprocess(const Mat& input,Mat& output) } } //namespace text namespace cv - From b913cac1df768f615b31c8bb70a87217d08cba53 Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Sat, 22 Jul 2017 19:13:41 +0200 Subject: [PATCH 12/31] Fixed Doxygen Warning and error --- modules/text/FindTesseract.cmake | 2 +- modules/text/include/opencv2/text/ocr.hpp | 4 ---- modules/text/samples/deeptextdetection.py | 4 ++-- modules/text/samples/textbox_demo.cpp | 4 +--- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/modules/text/FindTesseract.cmake b/modules/text/FindTesseract.cmake index 4622ece142e..01835e61bc7 100644 --- a/modules/text/FindTesseract.cmake +++ b/modules/text/FindTesseract.cmake @@ -19,4 +19,4 @@ find_library(Lept_LIBRARY NAMES lept set(Tesseract_LIBS ${Tesseract_LIBRARY} ${Lept_LIBRARY}) if(Tesseract_LIBS AND Tesseract_INCLUDE_DIR) set(Tesseract_FOUND 1) -endif() +endif() diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index e01a16f7275..258273f710e 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -613,10 +613,6 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{ @param emission_probabilities_table Table with observation emission probabilities. cols == rows == vocabulary.size(). - @param windowWidth The width of the windows to which the sliding window will be iterated. The height will - be the height of the image. The windows might be resized to fit the classifiers input by the classifiers - preprocessor. 
- @param mode HMM Decoding algorithm (only Viterbi for the moment) @param beam_size Size of the beam in Beam Search algorithm diff --git a/modules/text/samples/deeptextdetection.py b/modules/text/samples/deeptextdetection.py index 8bc7a642255..060fbacacab 100644 --- a/modules/text/samples/deeptextdetection.py +++ b/modules/text/samples/deeptextdetection.py @@ -25,12 +25,12 @@ # print"The text module was compiled without Caffe which is the only available DeepCNN backend.\nAborting!\n" # # quit() -# check model and architecture file existance +# check model and architecture file existance if not os.path.isfile('textbox.caffemodel') or not os.path.isfile('textbox_deploy.prototxt'): print " Model files not found in current directory. Aborting" print " Model files should be downloaded from https://github.com/sghoshcvc/TextBox-Models" quit() - + cv2.text.cnn_config.caffe_backend.setCaffeGpuMode(True); pathname = os.path.dirname(sys.argv[0]) diff --git a/modules/text/samples/textbox_demo.cpp b/modules/text/samples/textbox_demo.cpp index e36015831cf..49d9b6a792a 100644 --- a/modules/text/samples/textbox_demo.cpp +++ b/modules/text/samples/textbox_demo.cpp @@ -17,6 +17,7 @@ #include #include +void textbox_draw(cv::Mat &src, std::vector &groups,std::vector &probs,std::vector wordList,float thres=0.6); inline std::string getHelpStr(std::string progFname){ std::stringstream out; out << " Demo of text detection CNN for text detection." << std::endl; @@ -140,7 +141,4 @@ int main(int argc, const char * argv[]){ std::cout << "Press any key to exit." 
<< std::endl << std::endl; if ((cv::waitKey()&0xff) == ' ') return 0; - - } - From 4c9af581335e867e8494a2f7958c89c1f18fe73b Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Sat, 22 Jul 2017 19:24:39 +0200 Subject: [PATCH 13/31] Fixed Text box demo error --- modules/text/samples/textbox_demo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/text/samples/textbox_demo.cpp b/modules/text/samples/textbox_demo.cpp index 49d9b6a792a..8dbf2469264 100644 --- a/modules/text/samples/textbox_demo.cpp +++ b/modules/text/samples/textbox_demo.cpp @@ -17,7 +17,7 @@ #include #include -void textbox_draw(cv::Mat &src, std::vector &groups,std::vector &probs,std::vector wordList,float thres=0.6); +void textbox_draw(cv::Mat &src, std::vector &groups,std::vector &probs,std::vector wordList,float thres); inline std::string getHelpStr(std::string progFname){ std::stringstream out; out << " Demo of text detection CNN for text detection." << std::endl; From 103fbaf4f2933f99cbb92a3cd7be0b3d1ad819c5 Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Mon, 24 Jul 2017 01:11:01 +0200 Subject: [PATCH 14/31] White Space error in sample python script --- modules/text/samples/deeptextdetection.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/text/samples/deeptextdetection.py b/modules/text/samples/deeptextdetection.py index 060fbacacab..2e8395b60f1 100644 --- a/modules/text/samples/deeptextdetection.py +++ b/modules/text/samples/deeptextdetection.py @@ -30,7 +30,6 @@ print " Model files not found in current directory. 
Aborting" print " Model files should be downloaded from https://github.com/sghoshcvc/TextBox-Models" quit() - cv2.text.cnn_config.caffe_backend.setCaffeGpuMode(True); pathname = os.path.dirname(sys.argv[0]) From 0e74d63d2f894731aec7b7644be8ad042801e979 Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Mon, 24 Jul 2017 01:16:15 +0200 Subject: [PATCH 15/31] Modified to handle windows warning --- modules/text/src/text_detectorCNN.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp index cf3a0c8baa0..3865e186c7b 100644 --- a/modules/text/src/text_detectorCNN.cpp +++ b/modules/text/src/text_detectorCNN.cpp @@ -230,7 +230,7 @@ Ptr DeepCNNTextDetector::create(String archFilename,String return Ptr(); break; } - return Ptr(); + //return Ptr(); } @@ -257,7 +257,7 @@ Ptr DeepCNNTextDetector::createTextBoxNet(String archFilena return Ptr(); break; } - return Ptr(); + //return Ptr(); } From 111b3bed7d50f6e39ffd912cca1b761d8c21009a Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Mon, 24 Jul 2017 13:29:45 +0200 Subject: [PATCH 16/31] Modified to silent Clang warnings --- modules/text/include/opencv2/text/ocr.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index 258273f710e..15db8de8231 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -165,6 +165,7 @@ class CV_EXPORTS_W OCRTesseract : public BaseOCR @param component_level OCR_LEVEL_WORD (by default), or OCR_LEVEL_TEXT_LINE. */ + using BaseOCR::run; virtual void run (Mat& image, std::string& output_text, std::vector* component_rects=NULL, std::vector* component_texts=NULL, @@ -285,6 +286,7 @@ class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR { * @param component_level Only OCR_LEVEL_WORD is supported. 
*/ + using BaseOCR::run; virtual void run (Mat& image, std::string& output_text, std::vector* component_rects=NULL, std::vector* component_texts=NULL, @@ -542,6 +544,7 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{ @param component_level Only OCR_LEVEL_WORD is supported. */ + using BaseOCR::run; virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, std::vector* component_texts=NULL, std::vector* component_confidences=NULL, int component_level=0); From a2cab07193689e4f3552e0c12a0256da030bf22f Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Tue, 22 Aug 2017 11:12:33 +0200 Subject: [PATCH 17/31] DNN backend initial commit --- modules/text/CMakeLists.txt | 10 +- modules/text/include/opencv2/text/ocr.hpp | 11 +- .../include/opencv2/text/textDetector.hpp | 4 +- modules/text/samples/textbox_demo.cpp | 3 +- modules/text/src/ocr_holistic.cpp | 238 ++++++++++++++++ modules/text/src/text_detectorCNN.cpp | 255 +++++++++++++++++- 6 files changed, 511 insertions(+), 10 deletions(-) diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt index 5d5a52b4ad6..f9649ca336f 100644 --- a/modules/text/CMakeLists.txt +++ b/modules/text/CMakeLists.txt @@ -31,7 +31,7 @@ else() message(STATUS "Glog: NO") endif() -ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_calib3d WRAP python) +ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_calib3d OPTIONAL opencv_dnn WRAP python) #ocv_define_module(text ${TEXT_DEPS} WRAP python) #set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}) @@ -67,3 +67,11 @@ if() else() message(STATUS "TEXT CAFFE CONFLICT") endif() + +if(HAVE_opencv_dnn) + message(STATUS "dnn module found") + add_definitions(-DHAVE_DNN) + set(HAVE_DNN 1) +else() + message(STATUS "dnn module not found") +endif() diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index 15db8de8231..3c739093559 
100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -658,9 +658,12 @@ CV_EXPORTS_W Ptr loadOCRBeamSearchClas //Classifiers should provide diferent backends //For the moment only caffe is implemeted + enum{ - OCR_HOLISTIC_BACKEND_NONE, - OCR_HOLISTIC_BACKEND_CAFFE + OCR_HOLISTIC_BACKEND_NONE, //No back end + OCR_HOLISTIC_BACKEND_DNN, // dnn backend opencv_dnn + OCR_HOLISTIC_BACKEND_CAFFE, // caffe based backend + OCR_HOLISTIC_BACKEND_DEFAULT // to store default value based on environment }; class TextImageClassifier; @@ -831,7 +834,7 @@ class CV_EXPORTS_W DeepCNN:public TextImageClassifier * @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is * the only option */ - CV_WRAP static Ptr create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); + CV_WRAP static Ptr create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT); /** @brief Constructs a DeepCNN intended to be used for word spotting. * @@ -853,7 +856,7 @@ class CV_EXPORTS_W DeepCNN:public TextImageClassifier * @param backEnd integer parameter selecting the coputation framework. 
For now OCR_HOLISTIC_BACKEND_CAFFE is * the only option */ - CV_WRAP static Ptr createDictNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); + CV_WRAP static Ptr createDictNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT); }; diff --git a/modules/text/include/opencv2/text/textDetector.hpp b/modules/text/include/opencv2/text/textDetector.hpp index efbec6bffa9..ad1b53deed3 100644 --- a/modules/text/include/opencv2/text/textDetector.hpp +++ b/modules/text/include/opencv2/text/textDetector.hpp @@ -160,7 +160,7 @@ class CV_EXPORTS_W DeepCNNTextDetector : public TextRegionDetector * @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is * the only option */ - CV_WRAP static Ptr create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); + CV_WRAP static Ptr create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT); /** @brief Constructs a DeepCNNTextDetector intended to be used for text area detection. * @@ -177,7 +177,7 @@ class CV_EXPORTS_W DeepCNNTextDetector : public TextRegionDetector * @param backEnd integer parameter selecting the coputation framework. 
For now OCR_HOLISTIC_BACKEND_CAFFE is * the only option */ - CV_WRAP static Ptr createTextBoxNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); + CV_WRAP static Ptr createTextBoxNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT); friend class ImagePreprocessor; }; diff --git a/modules/text/samples/textbox_demo.cpp b/modules/text/samples/textbox_demo.cpp index 8dbf2469264..75a18a31552 100644 --- a/modules/text/samples/textbox_demo.cpp +++ b/modules/text/samples/textbox_demo.cpp @@ -59,9 +59,10 @@ void textbox_draw(cv::Mat &src, std::vector &groups,std::vector inputImageList, Mat outputMat) + { + //Classifies a list of images containing at most minibatchSz_ images + CV_Assert(int(inputImageList.size())<=this->minibatchSz_); + CV_Assert(outputMat.isContinuous()); + +#ifdef HAVE_DNN + + std::vector preProcessedImList; // to store preprocessed images, should it be handled inside preprocessing class? + + Mat preprocessed; + // preprocesses each image in the inputImageList and push to preprocessedImList + for(size_t imgNum=0;imgNumpreprocess(inputImageList[imgNum],preprocessed); + preProcessedImList.push_back(preprocessed); + } + // set input data blob in dnn::net + net_->setInput(blobFromImages(preProcessedImList,1, Size(100, 32)), "data"); + + float*outputMatData=(float*)(outputMat.data); + //Mat outputNet(inputImageList.size(),this->outputSize_,CV_32FC1,outputMatData) ; + Mat outputNet = this->net_->forward(); + outputNet = outputNet.reshape(1, 1); + + float*outputNetData=(float*)(outputNet.data); + + memcpy(outputMatData,outputNetData,sizeof(float)*this->outputSize_*inputImageList.size()); + +#endif + } + +#ifdef HAVE_DNN + Ptr net_; +#endif + //Size inputGeometry_; + int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst + int outputSize_; +public: + DeepCNNOpenCvDNNImpl(const DeepCNNOpenCvDNNImpl& dn): + 
minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ + channelCount_=dn.channelCount_; + inputGeometry_=dn.inputGeometry_; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" +#ifdef HAVE_DNN + this->net_=dn.net_; +#endif + } + DeepCNNOpenCvDNNImpl& operator=(const DeepCNNOpenCvDNNImpl &dn) + { +#ifdef HAVE_DNN + this->net_=dn.net_; +#endif + this->setPreprocessor(dn.preprocessor_); + this->inputGeometry_=dn.inputGeometry_; + this->channelCount_=dn.channelCount_; + this->minibatchSz_=dn.minibatchSz_; + this->outputSize_=dn.outputSize_; + this->preprocessor_=dn.preprocessor_; + this->outputGeometry_=dn.outputGeometry_; + return *this; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" + } + + DeepCNNOpenCvDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) + :minibatchSz_(maxMinibatchSz) + { + + CV_Assert(this->minibatchSz_>0); + CV_Assert(fileExists(modelArchFilename)); + CV_Assert(fileExists(modelWeightsFilename)); + CV_Assert(!preprocessor.empty()); + this->setPreprocessor(preprocessor); +#ifdef HAVE_DNN + + this->net_ = makePtr(readNetFromCaffe(modelArchFilename,modelWeightsFilename)); + + + + if (this->net_.empty()) + { + std::cerr << "Can't load network by using the following files: " << std::endl; + std::cerr << "prototxt: " << modelArchFilename << std::endl; + std::cerr << "caffemodel: " << modelWeightsFilename << std::endl; + //std::cerr << "bvlc_googlenet.caffemodel can be downloaded here:" << std::endl; + //std::cerr << "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel" << std::endl; + exit(-1); + } +// find a wa to check the followings in cv::dnn ??? 
+// CV_Assert(net_->num_inputs()==1); +// CV_Assert(net_->num_outputs()==1); +// CV_Assert(this->net_->input_blobs()[0]->channels()==1 +// ||this->net_->input_blobs()[0]->channels()==3); +// this->channelCount_=this->net_->input_blobs()[0]->channels(); + + + + //this->net_->CopyTrainedLayersFrom(modelWeightsFilename); + + //caffe::Blob* inputLayer = this->net_->input_blobs()[0]; + //inputLayerId = net_->getLayerId('data'); + + // inputLayerShape = net_->getLayerShapes(const MatShape& netInputShape, + // inputLayerId, + // std::vector* inLayerShapes, + // std::vector* outLayerShapes) const; + // should not be hard coded ideally + + this->inputGeometry_=Size(100,32);// Size(inputLayer->width(), inputLayer->height()); + this->channelCount_ = 1;//inputLayer->channels(); + + //inputLayer->Reshape(this->minibatchSz_,this->channelCount_,this->inputGeometry_.height, this->inputGeometry_.width); + //net_->Reshape(); + this->outputSize_=88172 ;//net_->output_blobs()[0]->channels(); + this->outputGeometry_ = Size(1,1);//Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); + + + + + + +#else + CV_Error(Error::StsError,"DNN module not available during compilation!"); +#endif + } + + void classify(InputArray image, OutputArray classProbabilities) + { + std::vector inputImageList; + inputImageList.push_back(image.getMat()); + classifyBatch(inputImageList,classProbabilities); + } + + void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities) + { + std::vector allImageVector; + inputImageList.getMatVector(allImageVector); + size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmentic + + size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmentic + classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F); + Mat outputMat = classProbabilities.getMat(); + printf("ekhane"); + for(size_t 
imgNum=0;imgNum(allImageVector.size()-imgNum,minibatchSize); + std::vector::const_iterator from=std::vector::const_iterator(allImageVector.begin()+imgNum); + std::vector::const_iterator to=std::vector::const_iterator(allImageVector.begin()+rangeEnd); + std::vector minibatchInput(from,to); + classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd))); + + } + + } + + int getOutputSize() + { + return this->outputSize_; + } + Size getOutputGeometry() + { + return this->outputGeometry_; + } + + int getMinibatchSize() + { + return this->minibatchSz_; + } + + int getBackend() + { + return OCR_HOLISTIC_BACKEND_DNN; + } +}; Ptr DeepCNN::create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz,int backEnd) { @@ -587,9 +772,25 @@ Ptr DeepCNN::create(String archFilename,String weightsFilename,Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + +#elif defined(HAVE_DNN) + return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); +#else + CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); +#endif + break; + case OCR_HOLISTIC_BACKEND_CAFFE: return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); break; + case OCR_HOLISTIC_BACKEND_DNN: + return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + break; case OCR_HOLISTIC_BACKEND_NONE: default: CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); @@ -603,9 +804,25 @@ Ptr DeepCNN::createDictNet(String archFilename,String weightsFilename,i { Ptr preprocessor=ImagePreprocessor::createImageStandarizer(113); switch(backEnd){ + case OCR_HOLISTIC_BACKEND_DEFAULT: + +#ifdef HAVE_CAFFE + return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); + +#elif defined(HAVE_DNN) + return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, 100)); +#else + 
CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + return Ptr(); +#endif + break; + case OCR_HOLISTIC_BACKEND_CAFFE: return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); break; + case OCR_HOLISTIC_BACKEND_DNN: + return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, 100)); + break; case OCR_HOLISTIC_BACKEND_NONE: default: CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); @@ -639,6 +856,27 @@ bool getCaffeAvailable() { return true; } +#elif defined(HAVE_DNN) + +bool getCaffeGpuMode() +{ + CV_Error(Error::StsError,"Caffe not available during compilation!"); + return 0; +} + +void setCaffeGpuMode(bool useGpu) +{ + CV_Error(Error::StsError,"Caffe not available during compilation!"); + CV_Assert(useGpu==1);//Compilation directives force +} + +bool getCaffeAvailable(){ + return 0; +} +bool getDNNAvailable(){ + return true; +} + #else diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp index 3865e186c7b..a2c583c7f10 100644 --- a/modules/text/src/text_detectorCNN.cpp +++ b/modules/text/src/text_detectorCNN.cpp @@ -20,6 +20,12 @@ #include "caffe/caffe.hpp" #endif +#ifdef HAVE_DNN +#include "opencv2/dnn.hpp" +#endif + +using namespace cv::dnn; + #define CV_WARN(message) fprintf(stderr, "warning: %s (%s:%d)\n", message, __FILE__, __LINE__) namespace cv { namespace text { @@ -205,6 +211,220 @@ class DeepCNNTextDetectorCaffeImpl: public DeepCNNTextDetector{ }; +class DeepCNNTextDetectorDNNImpl: public DeepCNNTextDetector{ +protected: + + + void process_(Mat inputImage, Mat &outputMat) + { + // do forward pass and stores the output in outputMat + CV_Assert(outputMat.isContinuous()); + if (inputImage.channels() != this->inputChannelCount_) + CV_WARN("Number of input channel(s) in the model is not same as input"); + + +#ifdef HAVE_DNN + + //std::vector preProcessedImList; // to store preprocessed images, should it be handled inside preprocessing 
class? + + Mat preprocessed; + this->preprocess(inputImage,preprocessed); + printf("After preprocess"); + // preprocesses each image in the inputImageList and push to preprocessedImList +// for(size_t imgNum=0;imgNumpreprocess(inputImageList[imgNum],preprocessed); +// preProcessedImList.push_back(preprocessed); +// } + // set input data blob in dnn::net + //Mat temp =blobFromImage(preprocessed,1, Size(700, 700)); + //printf("%d %d %d ",temp.size[1],temp.size[2],temp.size[3]); + net_->setInput(blobFromImage(preprocessed,1, Size(700, 700)), "data"); + printf("Input layer"); + + + //Mat outputNet(inputImageList.size(),this->outputSize_,CV_32FC1,outputMatData) ; + Mat outputNet = this->net_->forward( );//"mbox_priorbox"); + printf("After forward"); + //outputNet = outputNet.reshape(1, 1); + this->outputGeometry_.height = outputNet.size[2]; + this->outputGeometry_.width = outputNet.size[3]; + this->outputChannelCount_ = outputNet.size[1]; + printf("%d %d %d ",outputNet.size[1],outputNet.size[2],outputNet.size[3]); + outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); + float*outputMatData=(float*)(outputMat.data); + float*outputNetData=(float*)(outputNet.data); + int outputSz = this->outputChannelCount_ * this->outputGeometry_.height * this->outputGeometry_.width; + + memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); +// net_->input_blobs()[0]->Reshape(1, this->inputChannelCount_,this->inputGeometry_.height,this->inputGeometry_.width); +// net_->Reshape(); +// float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); +// float* inputData=inputBuffer; + +// std::vector input_channels; +// Mat preprocessed; +// // if the image have multiple color channels the input layer should be populated accordingly +// for (int channel=0;channel < this->inputChannelCount_;channel++){ + +// cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); +// input_channels.push_back(netInputWraped); 
+// //input_data += width * height; +// inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); +// } +// this->preprocess(inputImage,preprocessed); +// split(preprocessed, input_channels); + +// //preprocessed.copyTo(netInputWraped); + + +// this->net_->Forward(); +// const float* outputNetData=net_->output_blobs()[0]->cpu_data(); +// // const float* outputNetData1=net_->output_blobs()[1]->cpu_data(); + + + + +// this->outputGeometry_.height = net_->output_blobs()[0]->height(); +// this->outputGeometry_.width = net_->output_blobs()[0]->width(); +// this->outputChannelCount_ = net_->output_blobs()[0]->channels(); +// int outputSz = this->outputChannelCount_ * this->outputGeometry_.height * this->outputGeometry_.width; +// outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); +// float*outputMatData=(float*)(outputMat.data); + +// memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); + + + +#endif + } + + + +#ifdef HAVE_DNN + Ptr net_; +#endif + //Size inputGeometry_; + int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst + //int outputSize_; +public: + DeepCNNTextDetectorDNNImpl(const DeepCNNTextDetectorDNNImpl& dn): + minibatchSz_(dn.minibatchSz_){ + outputGeometry_=dn.outputGeometry_; + inputGeometry_=dn.inputGeometry_; + //Implemented to supress Visual Studio warning "assignment operator could not be generated" +#ifdef HAVE_DNN + this->net_=dn.net_; +#endif + } + DeepCNNTextDetectorDNNImpl& operator=(const DeepCNNTextDetectorDNNImpl &dn) + { +#ifdef HAVE_DNN + this->net_=dn.net_; +#endif + this->setPreprocessor(dn.preprocessor_); + this->inputGeometry_=dn.inputGeometry_; + this->inputChannelCount_=dn.inputChannelCount_; + this->outputChannelCount_ = dn.outputChannelCount_; + // this->minibatchSz_=dn.minibatchSz_; + //this->outputGeometry_=dn.outputSize_; + this->preprocessor_=dn.preprocessor_; + this->outputGeometry_=dn.outputGeometry_; + return *this; + //Implemented to 
supress Visual Studio warning "assignment operator could not be generated" + } + + DeepCNNTextDetectorDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) + :minibatchSz_(maxMinibatchSz) + { + + CV_Assert(this->minibatchSz_>0); + CV_Assert(fileExists(modelArchFilename)); + CV_Assert(fileExists(modelWeightsFilename)); + CV_Assert(!preprocessor.empty()); + this->setPreprocessor(preprocessor); +#ifdef HAVE_DNN + this->net_ = makePtr(readNetFromCaffe(modelArchFilename,modelWeightsFilename)); + + if (this->net_.empty()) + { + std::cerr << "Can't load network by using the following files: " << std::endl; + std::cerr << "prototxt: " << modelArchFilename << std::endl; + std::cerr << "caffemodel: " << modelWeightsFilename << std::endl; + //std::cerr << "bvlc_googlenet.caffemodel can be downloaded here:" << std::endl; + //std::cerr << "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel" << std::endl; + exit(-1); + } +// this->net_.reset(new caffe::Net(modelArchFilename, caffe::TEST)); +// CV_Assert(net_->num_inputs()==1); +// CV_Assert(net_->num_outputs()==1); +// CV_Assert(this->net_->input_blobs()[0]->channels()==1 +// ||this->net_->input_blobs()[0]->channels()==3); +// // this->channelCount_=this->net_->input_blobs()[0]->channels(); + + +// this->inputGeometry_.height = inputLayer->height(); +// this->inputGeometry_.width = inputLayer->width(); +// this->inputChannelCount_ = inputLayer->channels(); +// //this->inputGeometry_.batchSize =1; + +// inputLayer->Reshape(this->minibatchSz_,this->inputChannelCount_,this->inputGeometry_.height, this->inputGeometry_.width); +// net_->Reshape(); +// this->outputChannelCount_ = net_->output_blobs()[0]->channels(); +// //this->outputGeometry_.batchSize =1; +// this->outputGeometry_.height =net_->output_blobs()[0]->height(); +// this->outputGeometry_.width = net_->output_blobs()[0]->width(); + this->inputGeometry_.height =700; + this->inputGeometry_.width = 700 
;//inputLayer->width(); + this->inputChannelCount_ = 3 ;//inputLayer->channels(); + +#else + CV_Error(Error::StsError,"DNN module not available during compilation!"); +#endif + } + + + void detect(InputArray image, OutputArray Bbox_prob) + { + Size outSize = Size(this->outputGeometry_.height,outputGeometry_.width); + Bbox_prob.create(outSize,CV_32F); // dummy initialization is it needed + Mat outputMat = Bbox_prob.getMat(); + printf("calling"); + process_(image.getMat(),outputMat); + //copy back to outputArray + outputMat.copyTo(Bbox_prob); + } + + Size getOutputGeometry() + { + return this->outputGeometry_; + } + Size getinputGeometry() + { + return this->inputGeometry_; + } + + int getMinibatchSize() + { + return this->minibatchSz_; + } + + int getBackend() + { + return OCR_HOLISTIC_BACKEND_DNN; + } + void setPreprocessor(Ptr ptr) + { + CV_Assert(!ptr.empty()); + preprocessor_=ptr; + } + + Ptr getPreprocessor() + { + return preprocessor_; + } +}; + Ptr DeepCNNTextDetector::create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz,int backEnd) { if(preprocessor.empty()) @@ -220,13 +440,29 @@ Ptr DeepCNNTextDetector::create(String archFilename,String preprocessor->set_mean(textbox_mean); } switch(backEnd){ + case OCR_HOLISTIC_BACKEND_DEFAULT: + +#ifdef HAVE_CAFFE + return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + +#elif defined(HAVE_DNN) + return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); +#else + CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); + return Ptr(); +#endif case OCR_HOLISTIC_BACKEND_CAFFE: return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); break; + + case OCR_HOLISTIC_BACKEND_DNN: + return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + break; + case OCR_HOLISTIC_BACKEND_NONE: default: - 
CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); return Ptr(); break; } @@ -248,12 +484,27 @@ Ptr DeepCNNTextDetector::createTextBoxNet(String archFilena textbox_mean.at(0,2)=123; preprocessor->set_mean(textbox_mean); switch(backEnd){ + case OCR_HOLISTIC_BACKEND_DEFAULT: + +#ifdef HAVE_CAFFE + return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); + +#elif defined(HAVE_DNN) + return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 100)); +#else + CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); + return Ptr(); +#endif + break; case OCR_HOLISTIC_BACKEND_CAFFE: return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); break; + case OCR_HOLISTIC_BACKEND_DNN: + return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 100)); + break; case OCR_HOLISTIC_BACKEND_NONE: default: - CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); + CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); return Ptr(); break; } From c697e41b8d8415084971e5e8dc1f73d2867eab37 Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Mon, 28 Aug 2017 19:25:58 +0200 Subject: [PATCH 18/31] added calculation of output size --- modules/text/include/opencv2/text/ocr.hpp | 24 +++++ .../include/opencv2/text/textDetector.hpp | 4 +- modules/text/samples/textbox_demo.cpp | 10 +- modules/text/src/ocr_holistic.cpp | 98 +++++++++-------- modules/text/src/text_detector.cpp | 6 +- modules/text/src/text_detectorCNN.cpp | 101 ++++-------------- 6 files changed, 107 insertions(+), 136 deletions(-) diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index 3c739093559..14dfc092456 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ 
b/modules/text/include/opencv2/text/ocr.hpp @@ -861,6 +861,15 @@ class CV_EXPORTS_W DeepCNN:public TextImageClassifier }; namespace cnn_config{ + +/** @brief runtime backend information + * + * this function finds the status of backends compiled with this module + * + * @return a list of backends (caffe,opencv-dnn etc.) + * */ +CV_EXPORTS_W std::vector getAvailableBackends(); + namespace caffe_backend{ /** @brief Prompts Caffe on the computation device beeing used @@ -897,6 +906,21 @@ CV_EXPORTS_W void setCaffeGpuMode(bool useGpu); CV_EXPORTS_W bool getCaffeAvailable(); }//caffe +namespace dnn_backend { + +/** @brief Provides runtime information on whether DNN module was compiled in. + * + * The text module API is the same regardless of whether DNN module was available or not + * During compilation. When methods that require backend are invocked while no backend support + * is compiled, exceptions are thrown. This method allows to test whether the + * text module was built with dnn_backend during runtime. + * + * @return true if opencv_dnn support for the the text module was provided during compilation, + * false if opencv_dnn was unavailable. + */ +CV_EXPORTS_W bool getDNNAvailable(); + +}//dnn_backend }//cnn_config /** @brief OCRHolisticWordRecognizer class provides the functionallity of segmented wordspotting. diff --git a/modules/text/include/opencv2/text/textDetector.hpp b/modules/text/include/opencv2/text/textDetector.hpp index ad1b53deed3..eda74801449 100644 --- a/modules/text/include/opencv2/text/textDetector.hpp +++ b/modules/text/include/opencv2/text/textDetector.hpp @@ -56,7 +56,7 @@ namespace cv namespace text { -//! @addtogroup text_recognize +//! @addtogroup text_detect //! @{ @@ -263,7 +263,7 @@ class CV_EXPORTS_W textDetector : public BaseDetector }; - +//! 
@} }//namespace text }//namespace cv diff --git a/modules/text/samples/textbox_demo.cpp b/modules/text/samples/textbox_demo.cpp index 75a18a31552..b76658e1b7a 100644 --- a/modules/text/samples/textbox_demo.cpp +++ b/modules/text/samples/textbox_demo.cpp @@ -61,6 +61,12 @@ int main(int argc, const char * argv[]){ std::cout<<"The text module was compiled without Caffe which is the only available DeepCNN backend.\nAborting!\n"; //exit(1); } + std::vector backends=cv::text::cnn_config::getAvailableBackends(); + std::cout << "The Following backends are available" << "\n"; + for (int i=0;i cnn=cv::text::DeepCNN::createDictNet( - "dictnet_vgg_deploy.prototxt","dictnet_vgg.caffemodel"); + "dictnet_vgg_deploy.prototxt","dictnet_vgg.caffemodel",cv::text::OCR_HOLISTIC_BACKEND_DNN); cv::Ptr wordSpotter= cv::text::OCRHolisticWordRecognizer::create(cnn,"dictnet_vgg_labels.txt"); @@ -130,7 +136,7 @@ int main(int argc, const char * argv[]){ cv::Point tl_ = bbox.at(i).tl(); cv::Point br_ = bbox.at(i).br(); - out<minibatchSz_); CV_Assert(outputMat.isContinuous()); + #ifdef HAVE_CAFFE net_->input_blobs()[0]->Reshape(inputImageList.size(), this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width); net_->Reshape(); @@ -450,16 +452,19 @@ class DeepCNNCaffeImpl: public DeepCNN{ input_channels.push_back(netInputWraped); //input_data += width * height; inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); + } this->preprocess(inputImageList[imgNum],preprocessed); split(preprocessed, input_channels); + } this->net_->ForwardPrefilled(); const float* outputNetData=net_->output_blobs()[0]->cpu_data(); this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width; + //outputMat.resize(this->outputGeometry_.height * this->outputGeometry_.width); float*outputMatData=(float*)(outputMat.data); 
memcpy(outputMatData,outputNetData,sizeof(float)*outputSz*inputImageList.size()); @@ -470,9 +475,10 @@ class DeepCNNCaffeImpl: public DeepCNN{ #ifdef HAVE_CAFFE Ptr > net_; #endif - //Size inputGeometry_; + //Size inputGeometry_;//=Size(100,32); int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst int outputSize_; + //Size outputGeometry_; public: DeepCNNCaffeImpl(const DeepCNNCaffeImpl& dn): minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ @@ -608,7 +614,7 @@ class DeepCNNOpenCvDNNImpl: public DeepCNN{ preProcessedImList.push_back(preprocessed); } // set input data blob in dnn::net - net_->setInput(blobFromImages(preProcessedImList,1, Size(100, 32)), "data"); + net_->setInput(blobFromImages(preProcessedImList,1, this->inputGeometry_), "data"); float*outputMatData=(float*)(outputMat.data); //Mat outputNet(inputImageList.size(),this->outputSize_,CV_32FC1,outputMatData) ; @@ -625,9 +631,16 @@ class DeepCNNOpenCvDNNImpl: public DeepCNN{ #ifdef HAVE_DNN Ptr net_; #endif - //Size inputGeometry_; + // hard coding input image size. anything in DNN library to get that from prototxt?? + // Size inputGeometry_;//=Size(100,32); int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst int outputSize_; + //Size outputGeometry_;//= Size(1,1); + //int channelCount_; + // int inputChannel_ ;//=1; + const int _inputHeight =32; + const int _inputWidth =100; + const int _inputChannel =1; public: DeepCNNOpenCvDNNImpl(const DeepCNNOpenCvDNNImpl& dn): minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ @@ -678,33 +691,17 @@ class DeepCNNOpenCvDNNImpl: public DeepCNN{ //std::cerr << "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel" << std::endl; exit(-1); } -// find a wa to check the followings in cv::dnn ??? 
-// CV_Assert(net_->num_inputs()==1); -// CV_Assert(net_->num_outputs()==1); -// CV_Assert(this->net_->input_blobs()[0]->channels()==1 -// ||this->net_->input_blobs()[0]->channels()==3); -// this->channelCount_=this->net_->input_blobs()[0]->channels(); - - - //this->net_->CopyTrainedLayersFrom(modelWeightsFilename); - //caffe::Blob* inputLayer = this->net_->input_blobs()[0]; - //inputLayerId = net_->getLayerId('data'); - - // inputLayerShape = net_->getLayerShapes(const MatShape& netInputShape, - // inputLayerId, - // std::vector* inLayerShapes, - // std::vector* outLayerShapes) const; - // should not be hard coded ideally - - this->inputGeometry_=Size(100,32);// Size(inputLayer->width(), inputLayer->height()); - this->channelCount_ = 1;//inputLayer->channels(); + this->inputGeometry_=Size(_inputWidth,_inputHeight);// Size(inputLayer->width(), inputLayer->height()); + this->channelCount_ = _inputChannel;//inputLayer->channels(); //inputLayer->Reshape(this->minibatchSz_,this->channelCount_,this->inputGeometry_.height, this->inputGeometry_.width); - //net_->Reshape(); - this->outputSize_=88172 ;//net_->output_blobs()[0]->channels(); - this->outputGeometry_ = Size(1,1);//Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); + Ptr< Layer > outLayer= net_->getLayer (net_->getLayerId (net_->getLayerNames()[net_->getLayerNames().size()-2])); + //std::vector blobs = outLayer->blobs; + + this->outputSize_=(outLayer->blobs)[1].size[0] ;//net_->output_blobs()[0]->channels(); + //this->outputGeometry_ = Size(1,1);//Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); @@ -732,7 +729,7 @@ class DeepCNNOpenCvDNNImpl: public DeepCNN{ size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmentic classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F); Mat outputMat = classProbabilities.getMat(); - printf("ekhane"); + for(size_t 
imgNum=0;imgNum(allImageVector.size()-imgNum,minibatchSize); @@ -832,6 +829,22 @@ Ptr DeepCNN::createDictNet(String archFilename,String weightsFilename,i } namespace cnn_config{ +std::vector getAvailableBackends() +{ + std::vector backends; + +#ifdef HAVE_CAFFE + backends.push_back("CAFFE, OCR_HOLISTIC_BACKEND_CAFFE"); // dnn backend opencv_dnn + +#endif +#ifdef HAVE_DNN + backends.push_back("DNN, OCR_HOLISTIC_BACKEND_DNN");// opencv_dnn based backend" +#endif + return backends; + + +} + namespace caffe_backend{ #ifdef HAVE_CAFFE @@ -856,7 +869,7 @@ bool getCaffeAvailable() { return true; } -#elif defined(HAVE_DNN) +#else bool getCaffeGpuMode() { @@ -873,32 +886,23 @@ void setCaffeGpuMode(bool useGpu) bool getCaffeAvailable(){ return 0; } -bool getDNNAvailable(){ - return true; -} +#endif -#else +}//namespace caffe +namespace dnn_backend{ +#ifdef HAVE_DNN -bool getCaffeGpuMode() -{ - CV_Error(Error::StsError,"Caffe not available during compilation!"); - return 0; -} -void setCaffeGpuMode(bool useGpu) -{ - CV_Error(Error::StsError,"Caffe not available during compilation!"); - CV_Assert(useGpu==1);//Compilation directives force +bool getDNNAvailable(){ + return true; } - -bool getCaffeAvailable(){ +#else +bool getDNNAvailable(){ return 0; } - #endif - -}//namespace caffe +}//namspace dnn_backend }//namespace cnn_config class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{ @@ -931,6 +935,7 @@ class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{ getOutputs(buffer,nbOutputs,tmp); classNum=tmp[0].wordIdx; confidence=tmp[0].probabillity; + } }; protected: @@ -972,6 +977,7 @@ class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{ { Mat netOutput; this->classifier_->classifyBatch(inputImageList,netOutput); + for(int k=0;k -#ifdef HAVE_CAFFE -#include "caffe/caffe.hpp" -#endif +//#ifdef HAVE_CAFFE +//#include "caffe/caffe.hpp" +//#endif namespace cv { namespace text { diff --git a/modules/text/src/text_detectorCNN.cpp 
b/modules/text/src/text_detectorCNN.cpp index a2c583c7f10..90d6fd9b8ee 100644 --- a/modules/text/src/text_detectorCNN.cpp +++ b/modules/text/src/text_detectorCNN.cpp @@ -225,75 +225,25 @@ class DeepCNNTextDetectorDNNImpl: public DeepCNNTextDetector{ #ifdef HAVE_DNN - //std::vector preProcessedImList; // to store preprocessed images, should it be handled inside preprocessing class? - Mat preprocessed; this->preprocess(inputImage,preprocessed); - printf("After preprocess"); - // preprocesses each image in the inputImageList and push to preprocessedImList -// for(size_t imgNum=0;imgNumpreprocess(inputImageList[imgNum],preprocessed); -// preProcessedImList.push_back(preprocessed); -// } - // set input data blob in dnn::net - //Mat temp =blobFromImage(preprocessed,1, Size(700, 700)); - //printf("%d %d %d ",temp.size[1],temp.size[2],temp.size[3]); - net_->setInput(blobFromImage(preprocessed,1, Size(700, 700)), "data"); - printf("Input layer"); - - - //Mat outputNet(inputImageList.size(),this->outputSize_,CV_32FC1,outputMatData) ; - Mat outputNet = this->net_->forward( );//"mbox_priorbox"); - printf("After forward"); - //outputNet = outputNet.reshape(1, 1); + + net_->setInput(blobFromImage(preprocessed,1, this->inputGeometry_), "data"); + + Mat outputNet = this->net_->forward( ); + this->outputGeometry_.height = outputNet.size[2]; this->outputGeometry_.width = outputNet.size[3]; this->outputChannelCount_ = outputNet.size[1]; - printf("%d %d %d ",outputNet.size[1],outputNet.size[2],outputNet.size[3]); + outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); float*outputMatData=(float*)(outputMat.data); float*outputNetData=(float*)(outputNet.data); int outputSz = this->outputChannelCount_ * this->outputGeometry_.height * this->outputGeometry_.width; memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); -// net_->input_blobs()[0]->Reshape(1, this->inputChannelCount_,this->inputGeometry_.height,this->inputGeometry_.width); -// 
net_->Reshape(); -// float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); -// float* inputData=inputBuffer; - -// std::vector input_channels; -// Mat preprocessed; -// // if the image have multiple color channels the input layer should be populated accordingly -// for (int channel=0;channel < this->inputChannelCount_;channel++){ - -// cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); -// input_channels.push_back(netInputWraped); -// //input_data += width * height; -// inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); -// } -// this->preprocess(inputImage,preprocessed); -// split(preprocessed, input_channels); - -// //preprocessed.copyTo(netInputWraped); -// this->net_->Forward(); -// const float* outputNetData=net_->output_blobs()[0]->cpu_data(); -// // const float* outputNetData1=net_->output_blobs()[1]->cpu_data(); - - - - -// this->outputGeometry_.height = net_->output_blobs()[0]->height(); -// this->outputGeometry_.width = net_->output_blobs()[0]->width(); -// this->outputChannelCount_ = net_->output_blobs()[0]->channels(); -// int outputSz = this->outputChannelCount_ * this->outputGeometry_.height * this->outputGeometry_.width; -// outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); -// float*outputMatData=(float*)(outputMat.data); - -// memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); - #endif @@ -307,6 +257,9 @@ class DeepCNNTextDetectorDNNImpl: public DeepCNNTextDetector{ //Size inputGeometry_; int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst //int outputSize_; + const int _inputHeight =700; + const int _inputWidth =700; + const int _inputChannel =3; public: DeepCNNTextDetectorDNNImpl(const DeepCNNTextDetectorDNNImpl& dn): minibatchSz_(dn.minibatchSz_){ @@ -355,28 +308,10 @@ class DeepCNNTextDetectorDNNImpl: public DeepCNNTextDetector{ //std::cerr << 
"http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel" << std::endl; exit(-1); } -// this->net_.reset(new caffe::Net(modelArchFilename, caffe::TEST)); -// CV_Assert(net_->num_inputs()==1); -// CV_Assert(net_->num_outputs()==1); -// CV_Assert(this->net_->input_blobs()[0]->channels()==1 -// ||this->net_->input_blobs()[0]->channels()==3); -// // this->channelCount_=this->net_->input_blobs()[0]->channels(); - - -// this->inputGeometry_.height = inputLayer->height(); -// this->inputGeometry_.width = inputLayer->width(); -// this->inputChannelCount_ = inputLayer->channels(); -// //this->inputGeometry_.batchSize =1; - -// inputLayer->Reshape(this->minibatchSz_,this->inputChannelCount_,this->inputGeometry_.height, this->inputGeometry_.width); -// net_->Reshape(); -// this->outputChannelCount_ = net_->output_blobs()[0]->channels(); -// //this->outputGeometry_.batchSize =1; -// this->outputGeometry_.height =net_->output_blobs()[0]->height(); -// this->outputGeometry_.width = net_->output_blobs()[0]->width(); - this->inputGeometry_.height =700; - this->inputGeometry_.width = 700 ;//inputLayer->width(); - this->inputChannelCount_ = 3 ;//inputLayer->channels(); + + this->inputGeometry_.height =_inputHeight; + this->inputGeometry_.width = _inputWidth ;//inputLayer->width(); + this->inputChannelCount_ = _inputChannel ;//inputLayer->channels(); #else CV_Error(Error::StsError,"DNN module not available during compilation!"); @@ -389,7 +324,7 @@ class DeepCNNTextDetectorDNNImpl: public DeepCNNTextDetector{ Size outSize = Size(this->outputGeometry_.height,outputGeometry_.width); Bbox_prob.create(outSize,CV_32F); // dummy initialization is it needed Mat outputMat = Bbox_prob.getMat(); - printf("calling"); + process_(image.getMat(),outputMat); //copy back to outputArray outputMat.copyTo(Bbox_prob); @@ -487,20 +422,20 @@ Ptr DeepCNNTextDetector::createTextBoxNet(String archFilena case OCR_HOLISTIC_BACKEND_DEFAULT: #ifdef HAVE_CAFFE - return Ptr(new 
DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); + return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 1)); #elif defined(HAVE_DNN) - return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 100)); + return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 1)); #else CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); return Ptr(); #endif break; case OCR_HOLISTIC_BACKEND_CAFFE: - return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); + return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 1)); break; case OCR_HOLISTIC_BACKEND_DNN: - return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 100)); + return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 1)); break; case OCR_HOLISTIC_BACKEND_NONE: default: From dc48968f1cdcce55643b1df08b6b2d878f18978b Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Tue, 5 Sep 2017 06:16:50 +0200 Subject: [PATCH 19/31] removed blanks, fixed Cmake issue --- modules/text/CMakeLists.txt | 5 ++ modules/text/README.md | 84 +++-------------------- modules/text/include/opencv2/text/ocr.hpp | 1 - modules/text/src/precomp.hpp | 2 - modules/text/src/text_detectorCNN.cpp | 5 -- modules/text/text_config.hpp.in | 9 --- 6 files changed, 16 insertions(+), 90 deletions(-) diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt index f9649ca336f..18173db830b 100644 --- a/modules/text/CMakeLists.txt +++ b/modules/text/CMakeLists.txt @@ -1,5 +1,10 @@ set(the_description "Text Detection and Recognition") +if(POLICY CMP0023) + message(STATUS "Explicitly setting policy CMP0023 to OLD") + cmake_policy(SET CMP0023 OLD) +endif(POLICY CMP0023) + # Using cmake scripts and modules list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}) diff --git 
a/modules/text/README.md b/modules/text/README.md index 2caf58a1e17..fd33980e80e 100644 --- a/modules/text/README.md +++ b/modules/text/README.md @@ -49,20 +49,22 @@ Notes 3. You are encouraged to search the Net for some better pre-trained classifiers, as well as classifiers for other languages. -Word spotting CNN +Text Detection CNN ================= Intro ----- -A word spotting CNN is a CNN that takes an image assumed to contain a single word and provides a probabillity over a given vocabulary. -Although other backends will be supported, for the moment only the Caffe backend is supported. +The text module now have a text detection and recognition using deep CNN. The text detector deep CNN that takes an image which may contain multiple words. This outputs a list of Rects with bounding boxes and probability of text there. The text recognizer provides a probabillity over a given vocabulary for each of these rects. + +Two backends are supported 1) caffe 2) opencv-dnn Instalation of Caffe backend ---------------------------- +* Please note a custom caffe based on SSD branch is required, the link of the custom caffe is provided below The caffe wrapping backend has the requirements caffe does. * Caffe can be built against OpenCV, if the caffe backend is enabled, a circular bependency arises. The simplest solution is to build caffe without support for OpenCV. 
@@ -77,10 +79,8 @@ Sample script for building Caffe SRCROOT="${HOME}/caffe_inst/" mkdir -p "$SRCROOT" cd "$SRCROOT" -git clone https://github.com/BVLC/caffe.git -cd caffe -git checkout 91b09280f5233cafc62954c98ce8bc4c204e7475 -git branch 91b09280f5233cafc62954c98ce8bc4c204e7475 +git clone https://github.com/sghoshcvc/TextBoxes.git +cd TextBoxes cat Makefile.config.example > Makefile.config echo 'USE_OPENCV := 0' >> Makefile.config echo 'INCLUDE_DIRS += /usr/include/hdf5/serial/' >> Makefile.config @@ -115,77 +115,15 @@ make distribute cd $OPENCV_BUILD_DIR #You must set this CAFFEROOT="${HOME}/caffe_inst/" #If you used the previous code to compile Caffe in ubuntu 16.04 -cmake -DCaffe_LIBS:FILEPATH="$CAFFEROOT/caffe/distribute/lib/libcaffe.so" -DBUILD_opencv_ts:BOOL="0" -DBUILD_opencv_dnn:BOOL="0" -DBUILD_opencv_dnn_modern:BOOL="0" -DCaffe_INCLUDE_DIR:PATH="$CAFFEROOT/caffe/distribute/include" -DWITH_MATLAB:BOOL="0" -DBUILD_opencv_cudabgsegm:BOOL="0" -DWITH_QT:BOOL="1" -DBUILD_opencv_cudaoptflow:BOOL="0" -DBUILD_opencv_cudastereo:BOOL="0" -DBUILD_opencv_cudafilters:BOOL="0" -DBUILD_opencv_cudev:BOOL="1" -DOPENCV_EXTRA_MODULES_PATH:PATH="/home/anguelos/work/projects/opencv_gsoc/opencv_contrib/modules" ./ +cmake -DCaffe_LIBS:FILEPATH="$CAFFEROOT/caffe/distribute/lib/libcaffe.so" -DBUILD_opencv_ts:BOOL="0" -DBUILD_opencv_dnn:BOOL="0" -DBUILD_opencv_dnn_modern:BOOL="0" -DCaffe_INCLUDE_DIR:PATH="$CAFFEROOT/caffe/distribute/include" -DWITH_MATLAB:BOOL="0" -DBUILD_opencv_cudabgsegm:BOOL="0" -DWITH_QT:BOOL="1" -DBUILD_opencv_cudaoptflow:BOOL="0" -DBUILD_opencv_cudastereo:BOOL="0" -DBUILD_opencv_cudafilters:BOOL="0" -DBUILD_opencv_cudev:BOOL="1" -DOPENCV_EXTRA_MODULES_PATH:PATH="$OPENCV_CONTRIB/modules" ./ ``` - -Text Detection CNN -================= - -Intro ------ - -A text detection CNN is a CNN that takes an image which may contain multiple words. This outputs a list of Rects with bounding boxes and probability of text there. 
-Although other backends will be supported, for the moment only the Caffe backend is supported. - - - +where $OPECV_CONTRIB is the root directory containing opencv_contrib module Instalation of Caffe backend ---------------------------- -* Please note a custom caffe based on SSD branch is required, the link of the custom caffe is provided below -The caffe wrapping backend has the requirements caffe does. -* Caffe can be built against OpenCV, if the caffe backend is enabled, a circular bependency arises. -The simplest solution is to build caffe without support for OpenCV. -* Only the OS supported by Caffe are supported by the backend. -The scripts describing the module have been developed in ubuntu 16.04 and assume such a system. -Other UNIX systems including OSX should be easy to adapt. -Sample script for building Caffe +Use of opencv-dnn does not need any additional library. -```bash -#!/bin/bash -SRCROOT="${HOME}/caffe_inst/" -mkdir -p "$SRCROOT" -cd "$SRCROOT" -git clone https://github.com/sghoshcvc/TextBoxes.git -cd TextBoxes -cat Makefile.config.example > Makefile.config -echo 'USE_OPENCV := 0' >> Makefile.config -echo 'INCLUDE_DIRS += /usr/include/hdf5/serial/' >> Makefile.config -echo 'LIBRARY_DIRS += /usr/lib/x86_64-linux-gnu/hdf5/serial/' >> Makefile.config - - -echo "--- /tmp/caffe/include/caffe/net.hpp 2017-05-28 04:55:47.929623902 +0200 -+++ caffe/distribute/include/caffe/net.hpp 2017-05-28 04:51:33.437090768 +0200 -@@ -234,6 +234,7 @@ - - template - friend class Net; -+ virtual ~Callback(){} - }; - const vector& before_forward() const { return before_forward_; } - void add_before_forward(Callback* value) { -">/tmp/cleanup_caffe.diff - -patch < /tmp/cleanup_caffe.diff - - -make -j 6 - -make pycaffe - -make distribute -``` - - -```bash -#!/bin/bash -cd $OPENCV_BUILD_DIR #You must set this -CAFFEROOT="${HOME}/caffe_inst/" #If you used the previous code to compile Caffe in ubuntu 16.04 - -cmake 
-DCaffe_LIBS:FILEPATH="$CAFFEROOT/caffe/distribute/lib/libcaffe.so" -DBUILD_opencv_ts:BOOL="0" -DBUILD_opencv_dnn:BOOL="0" -DBUILD_opencv_dnn_modern:BOOL="0" -DCaffe_INCLUDE_DIR:PATH="$CAFFEROOT/caffe/distribute/include" -DWITH_MATLAB:BOOL="0" -DBUILD_opencv_cudabgsegm:BOOL="0" -DWITH_QT:BOOL="1" -DBUILD_opencv_cudaoptflow:BOOL="0" -DBUILD_opencv_cudastereo:BOOL="0" -DBUILD_opencv_cudafilters:BOOL="0" -DBUILD_opencv_cudev:BOOL="1" -DOPENCV_EXTRA_MODULES_PATH:PATH="/home/anguelos/work/projects/opencv_gsoc/opencv_contrib/modules" ./ - - -``` +The recent opencv-3.3.0 needs to be build with extra modules to use text module. diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index 14dfc092456..b77a3e1321b 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -657,7 +657,6 @@ CV_EXPORTS_W Ptr loadOCRBeamSearchClas //Classifiers should provide diferent backends -//For the moment only caffe is implemeted enum{ OCR_HOLISTIC_BACKEND_NONE, //No back end diff --git a/modules/text/src/precomp.hpp b/modules/text/src/precomp.hpp index c7371db1e79..72a23a9b34a 100644 --- a/modules/text/src/precomp.hpp +++ b/modules/text/src/precomp.hpp @@ -45,8 +45,6 @@ #include "opencv2/text.hpp" -//#include "text_config.hpp" - #ifdef HAVE_TESSERACT #include #include diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp index 90d6fd9b8ee..a8d04db3722 100644 --- a/modules/text/src/text_detectorCNN.cpp +++ b/modules/text/src/text_detectorCNN.cpp @@ -91,7 +91,6 @@ class DeepCNNTextDetectorCaffeImpl: public DeepCNNTextDetector{ } - #ifdef HAVE_CAFFE Ptr > net_; #endif @@ -160,10 +159,6 @@ class DeepCNNTextDetectorCaffeImpl: public DeepCNNTextDetector{ this->outputGeometry_.height =net_->output_blobs()[0]->height(); this->outputGeometry_.width = net_->output_blobs()[0]->width(); - - - - #else CV_Error(Error::StsError,"Caffe not available during compilation!"); #endif 
diff --git a/modules/text/text_config.hpp.in b/modules/text/text_config.hpp.in index 71b32993acf..81e624bab37 100644 --- a/modules/text/text_config.hpp.in +++ b/modules/text/text_config.hpp.in @@ -1,13 +1,4 @@ #ifndef __OPENCV_TEXT_CONFIG_HPP__ #define __OPENCV_TEXT_CONFIG_HPP__ -// HAVE QT5 -//#cmakedefine HAVE_QT5GUI - -// HAVE CAFFE -//#cmakedefine HAVE_CAFFE - -// HAVE OCR Tesseract -//#cmakedefine HAVE_TESSERACT - #endif From af536b13530b34b552592e5b9cb31bdd8a941157 Mon Sep 17 00:00:00 2001 From: sghoshcvc Date: Tue, 5 Sep 2017 07:02:34 +0200 Subject: [PATCH 20/31] seperate image pre-processing from ocr code --- modules/text/src/image_preprocessor.cpp | 387 ++++++++++++++++++++++++ modules/text/src/ocr_holistic.cpp | 367 ---------------------- 2 files changed, 387 insertions(+), 367 deletions(-) create mode 100644 modules/text/src/image_preprocessor.cpp diff --git a/modules/text/src/image_preprocessor.cpp b/modules/text/src/image_preprocessor.cpp new file mode 100644 index 00000000000..3a65a210863 --- /dev/null +++ b/modules/text/src/image_preprocessor.cpp @@ -0,0 +1,387 @@ +#include "precomp.hpp" +#include "opencv2/imgproc.hpp" +#include "opencv2/highgui.hpp" +#include "opencv2/core.hpp" + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cv { namespace text { +//************************************************************************************ +//****************** ImagePreprocessor ******************************************* +//************************************************************************************ + +void ImagePreprocessor::preprocess(InputArray input,OutputArray output,Size sz,int outputChannels){ + Mat inpImg=input.getMat(); + Mat outImg; + this->preprocess_(inpImg,outImg,sz,outputChannels); + outImg.copyTo(output); +} +void ImagePreprocessor::set_mean(Mat mean){ + + + this->set_mean_(mean); + +} + + + +class ResizerPreprocessor: public ImagePreprocessor{ +protected: + void 
preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1){ + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U){ + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + } + //void set_mean_(Mat m){} +public: + ResizerPreprocessor(){} + ~ResizerPreprocessor(){} +}; + +class StandarizerPreprocessor: public ImagePreprocessor{ +protected: + double sigma_; + //void set_mean_(Mat M){} + + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + + //TODO put all the logic of channel and depth conversions in ImageProcessor class + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + 
tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + + Scalar mean,dev; + meanStdDev(output,mean,dev); + subtract(output,mean[0],output); + divide(output,(dev[0]/sigma_),output); + } +public: + StandarizerPreprocessor(double sigma):sigma_(sigma){} + ~StandarizerPreprocessor(){} + +}; + +class customPreprocessor:public ImagePreprocessor{ +protected: + + double rawval_; + Mat mean_; + String channel_order_; + + void set_mean_(Mat imMean_){ + + imMean_.copyTo(this->mean_); + + + } + + void set_raw_scale(int rawval){ + rawval_ = rawval; + + } + void set_channels(String channel_order){ + channel_order_=channel_order; + } + + + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of channel and depth conversions in ImageProcessor class + + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + tmpInput.convertTo(output,CV_32FC3,1/255.0); + else + 
tmpInput.convertTo(output,CV_32FC1); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ ==1) + tmpInput.convertTo(output, CV_32FC1); + else + tmpInput.convertTo(output, CV_32FC1,rawval_); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + tmpInput.convertTo(output,CV_32FC3,1/255.0); + else + tmpInput.convertTo(output,CV_32FC1); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ ==1) + tmpInput.convertTo(output, CV_32FC1); + else + tmpInput.convertTo(output, CV_32FC1,rawval_); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + input.convertTo(output,CV_32FC1,1/255.0); + else + input.convertTo(output,CV_32FC1); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ ==1) + input.convertTo(output, CV_32FC1); + else + input.convertTo(output, CV_32FC1,rawval_); + } + }else + { + if(input.depth()==CV_8U) + { + if (rawval_ == 1) + input.convertTo(output,CV_32FC3,1/255.0); + else + input.convertTo(output,CV_32FC3); + }else + {//Assuming values are at the desired [0,1] range + if (rawval_ ==1) + input.convertTo(output, CV_32FC3); + else + input.convertTo(output, CV_32FC3,rawval_); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + + if (!this->mean_.empty()){ + + Scalar mean_s(this->mean_.at(0,0),this->mean_.at(0,1),this->mean_.at(0,2)); + subtract(output,mean_s,output); + } + else{ + Scalar mean_s; + mean_s = mean(output); + subtract(output,mean_s,output); + } + + } + +public: + customPreprocessor( double rawval,String channel_order):rawval_(rawval),channel_order_(channel_order){} + ~customPreprocessor(){} + +}; + +class MeanSubtractorPreprocessor: public ImagePreprocessor{ +protected: + Mat mean_; + //void set_mean_(Mat m){} + void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ + //TODO put all the logic of 
channel and depth conversions in ImageProcessor class + CV_Assert(this->mean_.cols==outputSize.width && this->mean_.rows ==outputSize.height); + CV_Assert(outputChannels==1 || outputChannels==3); + CV_Assert(input.channels()==1 || input.channels()==3); + if(input.channels()!=outputChannels) + { + Mat tmpInput; + if(outputChannels==1) + { + cvtColor(input,tmpInput,COLOR_BGR2GRAY); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC1); + } + }else + { + cvtColor(input,tmpInput,COLOR_GRAY2BGR); + if(input.depth()==CV_8U) + { + tmpInput.convertTo(output,CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + tmpInput.convertTo(output, CV_32FC3); + } + } + }else + { + if(input.channels()==1) + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC1,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC1); + } + }else + { + if(input.depth()==CV_8U) + { + input.convertTo(output, CV_32FC3,1/255.0); + }else + {//Assuming values are at the desired [0,1] range + input.convertTo(output, CV_32FC3); + } + } + } + if(outputSize.width!=0 && outputSize.height!=0) + { + resize(output,output,outputSize); + } + subtract(output,this->mean_,output); + } +public: + MeanSubtractorPreprocessor(Mat mean) + { + mean.copyTo(this->mean_); + } + + ~MeanSubtractorPreprocessor(){} +}; + + + +Ptr ImagePreprocessor::createResizer() +{ + return Ptr(new ResizerPreprocessor); +} + +Ptr ImagePreprocessor::createImageStandarizer(double sigma) +{ + return Ptr(new StandarizerPreprocessor(sigma)); +} +Ptr ImagePreprocessor::createImageCustomPreprocessor(double rawval,String channel_order) +{ + + return Ptr(new customPreprocessor(rawval,channel_order)); +} + +Ptr ImagePreprocessor::createImageMeanSubtractor(InputArray meanImg) +{ + Mat tmp=meanImg.getMat(); + return Ptr(new 
MeanSubtractorPreprocessor(tmp)); +} +} +} diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp index f41fb7eb1c7..157637c2b2d 100644 --- a/modules/text/src/ocr_holistic.cpp +++ b/modules/text/src/ocr_holistic.cpp @@ -36,374 +36,7 @@ inline bool fileExists (String filename) { return f.good(); } -//************************************************************************************ -//****************** ImagePreprocessor ******************************************* -//************************************************************************************ - -void ImagePreprocessor::preprocess(InputArray input,OutputArray output,Size sz,int outputChannels){ - Mat inpImg=input.getMat(); - Mat outImg; - this->preprocess_(inpImg,outImg,sz,outputChannels); - outImg.copyTo(output); -} -void ImagePreprocessor::set_mean(Mat mean){ - - - this->set_mean_(mean); - -} - - - -class ResizerPreprocessor: public ImagePreprocessor{ -protected: - void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ - //TODO put all the logic of channel and depth conversions in ImageProcessor class - CV_Assert(outputChannels==1 || outputChannels==3); - CV_Assert(input.channels()==1 || input.channels()==3); - if(input.channels()!=outputChannels) - { - Mat tmpInput; - if(outputChannels==1){ - cvtColor(input,tmpInput,COLOR_BGR2GRAY); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC1); - } - }else - { - cvtColor(input,tmpInput,COLOR_GRAY2BGR); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC3); - } - } - }else - { - if(input.channels()==1) - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, 
CV_32FC1); - } - }else - { - if(input.depth()==CV_8U){ - input.convertTo(output, CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC3); - } - } - } - if(outputSize.width!=0 && outputSize.height!=0) - { - resize(output,output,outputSize); - } - } - //void set_mean_(Mat m){} -public: - ResizerPreprocessor(){} - ~ResizerPreprocessor(){} -}; - -class StandarizerPreprocessor: public ImagePreprocessor{ -protected: - double sigma_; - //void set_mean_(Mat M){} - - void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ - - //TODO put all the logic of channel and depth conversions in ImageProcessor class - CV_Assert(outputChannels==1 || outputChannels==3); - CV_Assert(input.channels()==1 || input.channels()==3); - if(input.channels()!=outputChannels) - { - Mat tmpInput; - if(outputChannels==1) - { - cvtColor(input,tmpInput,COLOR_BGR2GRAY); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC1); - } - }else - { - cvtColor(input,tmpInput,COLOR_GRAY2BGR); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC3); - } - } - }else - { - if(input.channels()==1) - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC1); - } - }else - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC3); - } - } - } - if(outputSize.width!=0 && outputSize.height!=0) - { - resize(output,output,outputSize); - } - - Scalar mean,dev; - meanStdDev(output,mean,dev); - subtract(output,mean[0],output); - divide(output,(dev[0]/sigma_),output); - } 
-public: - StandarizerPreprocessor(double sigma):sigma_(sigma){} - ~StandarizerPreprocessor(){} - -}; - -class customPreprocessor:public ImagePreprocessor{ -protected: - - double rawval_; - Mat mean_; - String channel_order_; - - void set_mean_(Mat imMean_){ - - imMean_.copyTo(this->mean_); - - - } - - void set_raw_scale(int rawval){ - rawval_ = rawval; - - } - void set_channels(String channel_order){ - channel_order_=channel_order; - } - - void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ - //TODO put all the logic of channel and depth conversions in ImageProcessor class - - CV_Assert(outputChannels==1 || outputChannels==3); - CV_Assert(input.channels()==1 || input.channels()==3); - if(input.channels()!=outputChannels) - { - Mat tmpInput; - if(outputChannels==1) - { - cvtColor(input,tmpInput,COLOR_BGR2GRAY); - if(input.depth()==CV_8U) - { - if (rawval_ == 1) - tmpInput.convertTo(output,CV_32FC3,1/255.0); - else - tmpInput.convertTo(output,CV_32FC1); - }else - {//Assuming values are at the desired [0,1] range - if (rawval_ ==1) - tmpInput.convertTo(output, CV_32FC1); - else - tmpInput.convertTo(output, CV_32FC1,rawval_); - } - }else - { - cvtColor(input,tmpInput,COLOR_GRAY2BGR); - if(input.depth()==CV_8U) - { - if (rawval_ == 1) - tmpInput.convertTo(output,CV_32FC3,1/255.0); - else - tmpInput.convertTo(output,CV_32FC1); - }else - {//Assuming values are at the desired [0,1] range - if (rawval_ ==1) - tmpInput.convertTo(output, CV_32FC1); - else - tmpInput.convertTo(output, CV_32FC1,rawval_); - } - } - }else - { - if(input.channels()==1) - { - if(input.depth()==CV_8U) - { - if (rawval_ == 1) - input.convertTo(output,CV_32FC1,1/255.0); - else - input.convertTo(output,CV_32FC1); - }else - {//Assuming values are at the desired [0,1] range - if (rawval_ ==1) - input.convertTo(output, CV_32FC1); - else - input.convertTo(output, CV_32FC1,rawval_); - } - }else - { - if(input.depth()==CV_8U) - { - if (rawval_ == 1) - 
input.convertTo(output,CV_32FC3,1/255.0); - else - input.convertTo(output,CV_32FC3); - }else - {//Assuming values are at the desired [0,1] range - if (rawval_ ==1) - input.convertTo(output, CV_32FC3); - else - input.convertTo(output, CV_32FC3,rawval_); - } - } - } - if(outputSize.width!=0 && outputSize.height!=0) - { - resize(output,output,outputSize); - } - - if (!this->mean_.empty()){ - - Scalar mean_s(this->mean_.at(0,0),this->mean_.at(0,1),this->mean_.at(0,2)); - subtract(output,mean_s,output); - } - else{ - Scalar mean_s; - mean_s = mean(output); - subtract(output,mean_s,output); - } - - } - -public: - customPreprocessor( double rawval,String channel_order):rawval_(rawval),channel_order_(channel_order){} - ~customPreprocessor(){} - -}; - -class MeanSubtractorPreprocessor: public ImagePreprocessor{ -protected: - Mat mean_; - //void set_mean_(Mat m){} - void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ - //TODO put all the logic of channel and depth conversions in ImageProcessor class - CV_Assert(this->mean_.cols==outputSize.width && this->mean_.rows ==outputSize.height); - CV_Assert(outputChannels==1 || outputChannels==3); - CV_Assert(input.channels()==1 || input.channels()==3); - if(input.channels()!=outputChannels) - { - Mat tmpInput; - if(outputChannels==1) - { - cvtColor(input,tmpInput,COLOR_BGR2GRAY); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC1); - } - }else - { - cvtColor(input,tmpInput,COLOR_GRAY2BGR); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC3); - } - } - }else - { - if(input.channels()==1) - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, 
CV_32FC1); - } - }else - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC3); - } - } - } - if(outputSize.width!=0 && outputSize.height!=0) - { - resize(output,output,outputSize); - } - subtract(output,this->mean_,output); - } -public: - MeanSubtractorPreprocessor(Mat mean) - { - mean.copyTo(this->mean_); - } - - ~MeanSubtractorPreprocessor(){} -}; - - - - - -Ptr ImagePreprocessor::createResizer() -{ - return Ptr(new ResizerPreprocessor); -} - -Ptr ImagePreprocessor::createImageStandarizer(double sigma) -{ - return Ptr(new StandarizerPreprocessor(sigma)); -} -Ptr ImagePreprocessor::createImageCustomPreprocessor(double rawval,String channel_order) -{ - - return Ptr(new customPreprocessor(rawval,channel_order)); -} - -Ptr ImagePreprocessor::createImageMeanSubtractor(InputArray meanImg) -{ - Mat tmp=meanImg.getMat(); - return Ptr(new MeanSubtractorPreprocessor(tmp)); -} //************************************************************************************ //****************** TextImageClassifier ***************************************** From efc864c5fe68bc526aa57e9245af3489c3358c2a Mon Sep 17 00:00:00 2001 From: Suman Ghosh Date: Fri, 15 Sep 2017 21:00:26 +0200 Subject: [PATCH 21/31] removed hard coding height and width --- modules/text/CMakeLists.txt | 6 ++++-- modules/text/src/ocr_holistic.cpp | 14 +++++++------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt index 18173db830b..b58fd41cf1d 100644 --- a/modules/text/CMakeLists.txt +++ b/modules/text/CMakeLists.txt @@ -59,8 +59,10 @@ if(HAVE_CAFFE AND HAVE_GLOG AND HAVE_PROTOBUF) list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES}) find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem) include_directories(SYSTEM ${Boost_INCLUDE_DIR}) - include_directories(SYSTEM /usr/local/cuda-8.0/targets/x86_64-linux/include/ 
usr/local/cuda-8.0/include/ /usr/local/cuda-7.5/targets/x86_64-linux/include/ ) - link_directories(SYSTEM /usr/local/cuda-8.0/targets/x86_64-linux/lib/ usr/local/cuda-8.0/lib/ /usr/local/cuda-7.5/targets/x86_64-linux/lib/ /usr/lib/openblas-base/lib /usr/local/cuda-8.0/lib64) + include_directories(SYSTEM ${CUDA_INCLUDE_DIR}) + link_directories(SYSTEM ${CUDA_LIBS}) + # include_directories(SYSTEM /usr/local/cuda-8.0/targets/x86_64-linux/include/ usr/local/cuda-8.0/include/ /usr/local/cuda-7.5/targets/x86_64-linux/include/ ) + #link_directories(SYSTEM /usr/local/cuda-8.0/targets/x86_64-linux/lib/ usr/local/cuda-8.0/lib/ /usr/local/cuda-7.5/targets/x86_64-linux/lib/ /usr/lib/openblas-base/lib /usr/local/cuda-8.0/lib64) list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES}) target_link_libraries(opencv_text atlas blas ${Caffe_LIBS} ${Glog_LIBS} ${Protobuf_LIBS} ${HDF5_LIBRARIES} ${Boost_LIBRARIES}) add_definitions(-DHAVE_CAFFE) diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp index 157637c2b2d..cd24f3a9616 100644 --- a/modules/text/src/ocr_holistic.cpp +++ b/modules/text/src/ocr_holistic.cpp @@ -271,9 +271,9 @@ class DeepCNNOpenCvDNNImpl: public DeepCNN{ //Size outputGeometry_;//= Size(1,1); //int channelCount_; // int inputChannel_ ;//=1; - const int _inputHeight =32; - const int _inputWidth =100; - const int _inputChannel =1; + //const int _inputHeight =32; + //const int _inputWidth =100; + //const int _inputChannel =1; public: DeepCNNOpenCvDNNImpl(const DeepCNNOpenCvDNNImpl& dn): minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ @@ -300,8 +300,8 @@ class DeepCNNOpenCvDNNImpl: public DeepCNN{ //Implemented to supress Visual Studio warning "assignment operator could not be generated" } - DeepCNNOpenCvDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) - :minibatchSz_(maxMinibatchSz) + DeepCNNOpenCvDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int 
maxMinibatchSz,int inputWidth =100,int inputHeight = 32) + :minibatchSz_(maxMinibatchSz),_inputWidth(inputWidth),_inputHeight(inputHeight) { CV_Assert(this->minibatchSz_>0); @@ -612,13 +612,13 @@ class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{ this->classifier_->classifyBatch(inputImageList,netOutput); for(int k=0;kclassifier_->getOutputSize(),classNum,confidence); transcriptionVec.push_back(this->labels_[classNum]); confidenceVec.push_back(confidence); - } + }https://www.google.es/?gfe_rd=cr&dcr=0&ei=4fq7We8Bk9jyB8zPp5AL } From 887e6e5ed6c8967a3ac2a61d7e106022ba99fcf4 Mon Sep 17 00:00:00 2001 From: Suman Ghosh Date: Sun, 17 Sep 2017 20:57:52 +0200 Subject: [PATCH 22/31] removed hard codinginput parameters --- modules/text/src/ocr_holistic.cpp | 14 +++++++------- modules/text/src/text_detectorCNN.cpp | 16 ++++++++-------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp index cd24f3a9616..8e0bae0073e 100644 --- a/modules/text/src/ocr_holistic.cpp +++ b/modules/text/src/ocr_holistic.cpp @@ -271,9 +271,9 @@ class DeepCNNOpenCvDNNImpl: public DeepCNN{ //Size outputGeometry_;//= Size(1,1); //int channelCount_; // int inputChannel_ ;//=1; - //const int _inputHeight =32; - //const int _inputWidth =100; - //const int _inputChannel =1; + int _inputHeight; + int _inputWidth ; + int _inputChannel ; public: DeepCNNOpenCvDNNImpl(const DeepCNNOpenCvDNNImpl& dn): minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ @@ -300,8 +300,8 @@ class DeepCNNOpenCvDNNImpl: public DeepCNN{ //Implemented to supress Visual Studio warning "assignment operator could not be generated" } - DeepCNNOpenCvDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz,int inputWidth =100,int inputHeight = 32) - :minibatchSz_(maxMinibatchSz),_inputWidth(inputWidth),_inputHeight(inputHeight) + DeepCNNOpenCvDNNImpl(String modelArchFilename, String 
modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz,int inputWidth =100,int inputHeight = 32,int inputChannel =1) + :minibatchSz_(maxMinibatchSz),_inputWidth(inputWidth),_inputHeight(inputHeight),_inputChannel(inputChannel) { CV_Assert(this->minibatchSz_>0); @@ -612,13 +612,13 @@ class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{ this->classifier_->classifyBatch(inputImageList,netOutput); for(int k=0;kclassifier_->getOutputSize(),classNum,confidence); transcriptionVec.push_back(this->labels_[classNum]); confidenceVec.push_back(confidence); - }https://www.google.es/?gfe_rd=cr&dcr=0&ei=4fq7We8Bk9jyB8zPp5AL + } } diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp index a8d04db3722..9b2e61ac6f4 100644 --- a/modules/text/src/text_detectorCNN.cpp +++ b/modules/text/src/text_detectorCNN.cpp @@ -252,9 +252,9 @@ class DeepCNNTextDetectorDNNImpl: public DeepCNNTextDetector{ //Size inputGeometry_; int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst //int outputSize_; - const int _inputHeight =700; - const int _inputWidth =700; - const int _inputChannel =3; + int inputHeight_; + int inputWidth_; + int inputChannel_; public: DeepCNNTextDetectorDNNImpl(const DeepCNNTextDetectorDNNImpl& dn): minibatchSz_(dn.minibatchSz_){ @@ -282,8 +282,8 @@ class DeepCNNTextDetectorDNNImpl: public DeepCNNTextDetector{ //Implemented to supress Visual Studio warning "assignment operator could not be generated" } - DeepCNNTextDetectorDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) - :minibatchSz_(maxMinibatchSz) + DeepCNNTextDetectorDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz,int inputHeight=700,int inputWidth =700,int inputChannel =3) + :minibatchSz_(maxMinibatchSz),inputHeight_(inputHeight),inputWidth_(inputWidth),inputChannel_(inputChannel) { CV_Assert(this->minibatchSz_>0); @@ -304,9 
+304,9 @@ class DeepCNNTextDetectorDNNImpl: public DeepCNNTextDetector{ exit(-1); } - this->inputGeometry_.height =_inputHeight; - this->inputGeometry_.width = _inputWidth ;//inputLayer->width(); - this->inputChannelCount_ = _inputChannel ;//inputLayer->channels(); + this->inputGeometry_.height =inputHeight_; + this->inputGeometry_.width = inputWidth_ ;//inputLayer->width(); + this->inputChannelCount_ = inputChannel_ ;//inputLayer->channels(); #else CV_Error(Error::StsError,"DNN module not available during compilation!"); From 878258bc13f724071968e3a6cbb89d1c6fe63b7f Mon Sep 17 00:00:00 2001 From: Suman Ghosh Date: Sun, 17 Sep 2017 21:56:08 +0200 Subject: [PATCH 23/31] modified initializers --- modules/text/src/ocr_holistic.cpp | 22 +++++++++++----------- modules/text/src/text_detectorCNN.cpp | 8 ++++---- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp index 8e0bae0073e..035f104f28a 100644 --- a/modules/text/src/ocr_holistic.cpp +++ b/modules/text/src/ocr_holistic.cpp @@ -271,9 +271,9 @@ class DeepCNNOpenCvDNNImpl: public DeepCNN{ //Size outputGeometry_;//= Size(1,1); //int channelCount_; // int inputChannel_ ;//=1; - int _inputHeight; - int _inputWidth ; - int _inputChannel ; + // int _inputHeight; + //int _inputWidth ; + //int _inputChannel ; public: DeepCNNOpenCvDNNImpl(const DeepCNNOpenCvDNNImpl& dn): minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ @@ -300,8 +300,8 @@ class DeepCNNOpenCvDNNImpl: public DeepCNN{ //Implemented to supress Visual Studio warning "assignment operator could not be generated" } - DeepCNNOpenCvDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz,int inputWidth =100,int inputHeight = 32,int inputChannel =1) - :minibatchSz_(maxMinibatchSz),_inputWidth(inputWidth),_inputHeight(inputHeight),_inputChannel(inputChannel) + DeepCNNOpenCvDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr 
preprocessor, int maxMinibatchSz,int inputWidth ,int inputHeight ,int inputChannel ) + :minibatchSz_(maxMinibatchSz) { CV_Assert(this->minibatchSz_>0); @@ -326,8 +326,8 @@ class DeepCNNOpenCvDNNImpl: public DeepCNN{ } - this->inputGeometry_=Size(_inputWidth,_inputHeight);// Size(inputLayer->width(), inputLayer->height()); - this->channelCount_ = _inputChannel;//inputLayer->channels(); + this->inputGeometry_=Size(inputWidth,inputHeight);// Size(inputLayer->width(), inputLayer->height()); + this->channelCount_ = inputChannel;//inputLayer->channels(); //inputLayer->Reshape(this->minibatchSz_,this->channelCount_,this->inputGeometry_.height, this->inputGeometry_.width); Ptr< Layer > outLayer= net_->getLayer (net_->getLayerId (net_->getLayerNames()[net_->getLayerNames().size()-2])); @@ -408,7 +408,7 @@ Ptr DeepCNN::create(String archFilename,String weightsFilename,Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); #elif defined(HAVE_DNN) - return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,100,32,1)); #else CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); return Ptr(); @@ -419,7 +419,7 @@ Ptr DeepCNN::create(String archFilename,String weightsFilename,Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); break; case OCR_HOLISTIC_BACKEND_DNN: - return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,100,32,1)); break; case OCR_HOLISTIC_BACKEND_NONE: default: @@ -440,7 +440,7 @@ Ptr DeepCNN::createDictNet(String archFilename,String weightsFilename,i return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); #elif defined(HAVE_DNN) - return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, 
weightsFilename,preprocessor, 100)); + return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, 100,100,32,1)); #else CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); return Ptr(); @@ -451,7 +451,7 @@ Ptr DeepCNN::createDictNet(String archFilename,String weightsFilename,i return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); break; case OCR_HOLISTIC_BACKEND_DNN: - return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, 100)); + return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, 100,100,32,1)); break; case OCR_HOLISTIC_BACKEND_NONE: default: diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp index 9b2e61ac6f4..87f132850ae 100644 --- a/modules/text/src/text_detectorCNN.cpp +++ b/modules/text/src/text_detectorCNN.cpp @@ -376,7 +376,7 @@ Ptr DeepCNNTextDetector::create(String archFilename,String return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); #elif defined(HAVE_DNN) - return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,700,700,3)); #else CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); return Ptr(); @@ -387,7 +387,7 @@ Ptr DeepCNNTextDetector::create(String archFilename,String break; case OCR_HOLISTIC_BACKEND_DNN: - return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); + return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,700,700,3)); break; case OCR_HOLISTIC_BACKEND_NONE: @@ -420,7 +420,7 @@ Ptr DeepCNNTextDetector::createTextBoxNet(String archFilena return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 1)); #elif defined(HAVE_DNN) - return Ptr(new 
DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 1)); + return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 1,700,700,3)); #else CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); return Ptr(); @@ -430,7 +430,7 @@ Ptr DeepCNNTextDetector::createTextBoxNet(String archFilena return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 1)); break; case OCR_HOLISTIC_BACKEND_DNN: - return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 1)); + return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 1,700,700,3)); break; case OCR_HOLISTIC_BACKEND_NONE: default: From bf630bef4ee22d35eae18eac2487ffe368be71e7 Mon Sep 17 00:00:00 2001 From: Suman Ghosh Date: Mon, 18 Sep 2017 12:16:25 +0200 Subject: [PATCH 24/31] Modified initializers list --- modules/text/src/text_detectorCNN.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp index 87f132850ae..5267b390fed 100644 --- a/modules/text/src/text_detectorCNN.cpp +++ b/modules/text/src/text_detectorCNN.cpp @@ -252,9 +252,9 @@ class DeepCNNTextDetectorDNNImpl: public DeepCNNTextDetector{ //Size inputGeometry_; int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst //int outputSize_; - int inputHeight_; - int inputWidth_; - int inputChannel_; + //int inputHeight_; + //int inputWidth_; + //int inputChannel_; public: DeepCNNTextDetectorDNNImpl(const DeepCNNTextDetectorDNNImpl& dn): minibatchSz_(dn.minibatchSz_){ @@ -283,7 +283,7 @@ class DeepCNNTextDetectorDNNImpl: public DeepCNNTextDetector{ } DeepCNNTextDetectorDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz,int inputHeight=700,int inputWidth =700,int inputChannel =3) - 
:minibatchSz_(maxMinibatchSz),inputHeight_(inputHeight),inputWidth_(inputWidth),inputChannel_(inputChannel) + :minibatchSz_(maxMinibatchSz) { CV_Assert(this->minibatchSz_>0); @@ -304,9 +304,9 @@ class DeepCNNTextDetectorDNNImpl: public DeepCNNTextDetector{ exit(-1); } - this->inputGeometry_.height =inputHeight_; - this->inputGeometry_.width = inputWidth_ ;//inputLayer->width(); - this->inputChannelCount_ = inputChannel_ ;//inputLayer->channels(); + this->inputGeometry_.height =inputHeight; + this->inputGeometry_.width = inputWidth ;//inputLayer->width(); + this->inputChannelCount_ = inputChannel ;//inputLayer->channels(); #else CV_Error(Error::StsError,"DNN module not available during compilation!"); From 951e18272dcf13ecede1e5c3b7d9f2b2b0e3c456 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Thu, 5 Oct 2017 16:42:30 +0300 Subject: [PATCH 25/31] text: cleanup dnn text detection part --- modules/text/CMakeLists.txt | 98 +-- modules/text/FindCaffe.cmake | 14 - modules/text/FindGlog.cmake | 10 - modules/text/FindProtobuf.cmake | 10 - modules/text/FindTesseract.cmake | 22 - modules/text/README.md | 71 -- modules/text/cmake/FindTesseract.cmake | 3 + modules/text/include/opencv2/text.hpp | 2 +- .../text/include/opencv2/text/erfilter.hpp | 1 - modules/text/include/opencv2/text/ocr.hpp | 764 +++--------------- .../include/opencv2/text/textDetector.hpp | 248 +----- modules/text/samples/deeptextdetection.py | 68 +- modules/text/samples/textbox_demo.cpp | 157 ++-- modules/text/src/image_preprocessor.cpp | 387 --------- modules/text/src/ocr_holistic.cpp | 697 ---------------- modules/text/src/precomp.hpp | 2 + modules/text/src/text_detector.cpp | 169 ---- modules/text/src/text_detectorCNN.cpp | 480 ++--------- modules/text/text_config.hpp.in | 3 + 19 files changed, 308 insertions(+), 2898 deletions(-) delete mode 100644 modules/text/FindCaffe.cmake delete mode 100755 modules/text/FindGlog.cmake delete mode 100644 modules/text/FindProtobuf.cmake delete mode 100644 
modules/text/FindTesseract.cmake delete mode 100644 modules/text/src/image_preprocessor.cpp delete mode 100644 modules/text/src/ocr_holistic.cpp delete mode 100644 modules/text/src/text_detector.cpp diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt index b58fd41cf1d..5d0f89f0da0 100644 --- a/modules/text/CMakeLists.txt +++ b/modules/text/CMakeLists.txt @@ -1,84 +1,24 @@ set(the_description "Text Detection and Recognition") - -if(POLICY CMP0023) - message(STATUS "Explicitly setting policy CMP0023 to OLD") - cmake_policy(SET CMP0023 OLD) -endif(POLICY CMP0023) - -# Using cmake scripts and modules -list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}) - -set(TEXT_DEPS opencv_ml opencv_highgui opencv_imgproc opencv_core opencv_features2d opencv_calib3d) - -find_package(Caffe) -if(Caffe_FOUND) - message(STATUS "Caffe: YES") - set(HAVE_CAFFE 1) -else() - message(STATUS "Caffe: NO") -# list(APPEND TEXT_DEPS opencv_dnn) -endif() - -#internal dependencies -find_package(Protobuf) -if(Protobuf_FOUND) - message(STATUS "Protobuf: YES") - set(HAVE_PROTOBUF 1) -else() - message(STATUS "Protobuf: NO") -endif() - -find_package(Glog) -if(Glog_FOUND) - message(STATUS "Glog: YES") - set(HAVE_GLOG 1) -else() - message(STATUS "Glog: NO") -endif() - -ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_calib3d OPTIONAL opencv_dnn WRAP python) -#ocv_define_module(text ${TEXT_DEPS} WRAP python) - -#set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}) - -find_package(Tesseract) -if(${Tesseract_FOUND}) - message(STATUS "Tesseract: YES") - include_directories(${Tesseract_INCLUDE_DIR}) - target_link_libraries(opencv_text ${Tesseract_LIBS}) - add_definitions(-DHAVE_TESSERACT) -else() - message(STATUS "Tesseract: NO") +ocv_define_module(text opencv_ml opencv_imgproc opencv_core opencv_features2d opencv_dnn OPTIONAL opencv_highgui WRAP python java) + +if(NOT CMAKE_CROSSCOMPILING OR OPENCV_FIND_TESSERACT) + 
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) + find_package(Tesseract QUIET) + if(Tesseract_FOUND) + message(STATUS "Tesseract: YES") + set(HAVE_TESSERACT 1) + ocv_include_directories(${Tesseract_INCLUDE_DIR}) + ocv_target_link_libraries(${the_module} ${Tesseract_LIBRARIES}) + else() + message(STATUS "Tesseract: NO") endif() +endif() +configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in + ${CMAKE_BINARY_DIR}/text_config.hpp @ONLY) -if(HAVE_CAFFE AND HAVE_GLOG AND HAVE_PROTOBUF) - include_directories(${Caffe_INCLUDE_DIR}) - find_package(HDF5 COMPONENTS HL REQUIRED) - include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR}) - list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES}) - find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem) - include_directories(SYSTEM ${Boost_INCLUDE_DIR}) - include_directories(SYSTEM ${CUDA_INCLUDE_DIR}) - link_directories(SYSTEM ${CUDA_LIBS}) - # include_directories(SYSTEM /usr/local/cuda-8.0/targets/x86_64-linux/include/ usr/local/cuda-8.0/include/ /usr/local/cuda-7.5/targets/x86_64-linux/include/ ) - #link_directories(SYSTEM /usr/local/cuda-8.0/targets/x86_64-linux/lib/ usr/local/cuda-8.0/lib/ /usr/local/cuda-7.5/targets/x86_64-linux/lib/ /usr/lib/openblas-base/lib /usr/local/cuda-8.0/lib64) - list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES}) - target_link_libraries(opencv_text atlas blas ${Caffe_LIBS} ${Glog_LIBS} ${Protobuf_LIBS} ${HDF5_LIBRARIES} ${Boost_LIBRARIES}) - add_definitions(-DHAVE_CAFFE) -endif() #HAVE_CAFFE - -message(STATUS "TEXT CAFFE SEARCH") -if() - message(STATUS "TEXT NO CAFFE CONFLICT") -else() - message(STATUS "TEXT CAFFE CONFLICT") -endif() +ocv_include_directories(${CMAKE_CURRENT_BINARY_DIR}) -if(HAVE_opencv_dnn) - message(STATUS "dnn module found") - add_definitions(-DHAVE_DNN) - set(HAVE_DNN 1) -else() - message(STATUS "dnn module not found") -endif() +ocv_add_testdata(samples/ contrib/text + FILES_MATCHING PATTERN "*.xml" PATTERN "*.xml.gz" 
REGEX "scenetext[0-9]+.jpg" +) diff --git a/modules/text/FindCaffe.cmake b/modules/text/FindCaffe.cmake deleted file mode 100644 index 12948f62992..00000000000 --- a/modules/text/FindCaffe.cmake +++ /dev/null @@ -1,14 +0,0 @@ -# Caffe package for CNN Triplet training -unset(Caffe_FOUND) - -find_path(Caffe_INCLUDE_DIR NAMES caffe/caffe.hpp caffe/common.hpp caffe/net.hpp caffe/proto/caffe.pb.h caffe/util/io.hpp caffe/vision_layers.hpp - HINTS - /usr/local/include) - -find_library(Caffe_LIBS NAMES caffe - HINTS - /usr/local/lib) - -if(Caffe_LIBS AND Caffe_INCLUDE_DIR) - set(Caffe_FOUND 1) -endif() diff --git a/modules/text/FindGlog.cmake b/modules/text/FindGlog.cmake deleted file mode 100755 index c30e9f4a6ab..00000000000 --- a/modules/text/FindGlog.cmake +++ /dev/null @@ -1,10 +0,0 @@ -#Required for Caffe -unset(Glog_FOUND) - -find_library(Glog_LIBS NAMES glog - HINTS - /usr/local/lib) - -if(Glog_LIBS) - set(Glog_FOUND 1) -endif() diff --git a/modules/text/FindProtobuf.cmake b/modules/text/FindProtobuf.cmake deleted file mode 100644 index 6d0ad56a1f7..00000000000 --- a/modules/text/FindProtobuf.cmake +++ /dev/null @@ -1,10 +0,0 @@ -#Protobuf package required for Caffe -unset(Protobuf_FOUND) - -find_library(Protobuf_LIBS NAMES protobuf - HINTS - /usr/local/lib) - -if(Protobuf_LIBS) - set(Protobuf_FOUND 1) -endif() diff --git a/modules/text/FindTesseract.cmake b/modules/text/FindTesseract.cmake deleted file mode 100644 index 01835e61bc7..00000000000 --- a/modules/text/FindTesseract.cmake +++ /dev/null @@ -1,22 +0,0 @@ -# Tesseract OCR -unset(Tesseract_FOUND) - -find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h - HINTS - /usr/include - /usr/local/include) - -find_library(Tesseract_LIBRARY NAMES tesseract - HINTS - /usr/lib - /usr/local/lib) - -find_library(Lept_LIBRARY NAMES lept - HINTS - /usr/lib - /usr/local/lib) - -set(Tesseract_LIBS ${Tesseract_LIBRARY} ${Lept_LIBRARY}) -if(Tesseract_LIBS AND Tesseract_INCLUDE_DIR) - set(Tesseract_FOUND 1) -endif() diff --git 
a/modules/text/README.md b/modules/text/README.md index fd33980e80e..b6955fd9847 100644 --- a/modules/text/README.md +++ b/modules/text/README.md @@ -56,74 +56,3 @@ Intro ----- The text module now have a text detection and recognition using deep CNN. The text detector deep CNN that takes an image which may contain multiple words. This outputs a list of Rects with bounding boxes and probability of text there. The text recognizer provides a probabillity over a given vocabulary for each of these rects. - -Two backends are supported 1) caffe 2) opencv-dnn - - - - -Instalation of Caffe backend ----------------------------- -* Please note a custom caffe based on SSD branch is required, the link of the custom caffe is provided below -The caffe wrapping backend has the requirements caffe does. -* Caffe can be built against OpenCV, if the caffe backend is enabled, a circular bependency arises. -The simplest solution is to build caffe without support for OpenCV. -* Only the OS supported by Caffe are supported by the backend. -The scripts describing the module have been developed in ubuntu 16.04 and assume such a system. -Other UNIX systems including OSX should be easy to adapt. 
- -Sample script for building Caffe - -```bash -#!/bin/bash -SRCROOT="${HOME}/caffe_inst/" -mkdir -p "$SRCROOT" -cd "$SRCROOT" -git clone https://github.com/sghoshcvc/TextBoxes.git -cd TextBoxes -cat Makefile.config.example > Makefile.config -echo 'USE_OPENCV := 0' >> Makefile.config -echo 'INCLUDE_DIRS += /usr/include/hdf5/serial/' >> Makefile.config -echo 'LIBRARY_DIRS += /usr/lib/x86_64-linux-gnu/hdf5/serial/' >> Makefile.config - - -echo "--- /tmp/caffe/include/caffe/net.hpp 2017-05-28 04:55:47.929623902 +0200 -+++ caffe/distribute/include/caffe/net.hpp 2017-05-28 04:51:33.437090768 +0200 -@@ -234,6 +234,7 @@ - - template - friend class Net; -+ virtual ~Callback(){} - }; - const vector& before_forward() const { return before_forward_; } - void add_before_forward(Callback* value) { -">/tmp/cleanup_caffe.diff - -patch < /tmp/cleanup_caffe.diff - - -make -j 6 - -make pycaffe - -make distribute -``` - - -```bash -#!/bin/bash -cd $OPENCV_BUILD_DIR #You must set this -CAFFEROOT="${HOME}/caffe_inst/" #If you used the previous code to compile Caffe in ubuntu 16.04 - -cmake -DCaffe_LIBS:FILEPATH="$CAFFEROOT/caffe/distribute/lib/libcaffe.so" -DBUILD_opencv_ts:BOOL="0" -DBUILD_opencv_dnn:BOOL="0" -DBUILD_opencv_dnn_modern:BOOL="0" -DCaffe_INCLUDE_DIR:PATH="$CAFFEROOT/caffe/distribute/include" -DWITH_MATLAB:BOOL="0" -DBUILD_opencv_cudabgsegm:BOOL="0" -DWITH_QT:BOOL="1" -DBUILD_opencv_cudaoptflow:BOOL="0" -DBUILD_opencv_cudastereo:BOOL="0" -DBUILD_opencv_cudafilters:BOOL="0" -DBUILD_opencv_cudev:BOOL="1" -DOPENCV_EXTRA_MODULES_PATH:PATH="$OPENCV_CONTRIB/modules" ./ - - -``` -where $OPECV_CONTRIB is the root directory containing opencv_contrib module - -Instalation of Caffe backend ----------------------------- - -Use of opencv-dnn does not need any additional library. - -The recent opencv-3.3.0 needs to be build with extra modules to use text module. 
diff --git a/modules/text/cmake/FindTesseract.cmake b/modules/text/cmake/FindTesseract.cmake index 2a5d868f91f..5bdbe243616 100644 --- a/modules/text/cmake/FindTesseract.cmake +++ b/modules/text/cmake/FindTesseract.cmake @@ -5,14 +5,17 @@ endif() if(NOT Tesseract_FOUND) find_path(Tesseract_INCLUDE_DIR tesseract/baseapi.h HINTS + /usr/include /usr/local/include) find_library(Tesseract_LIBRARY NAMES tesseract HINTS + /usr/lib /usr/local/lib) find_library(Lept_LIBRARY NAMES lept HINTS + /usr/lib /usr/local/lib) if(Tesseract_INCLUDE_DIR AND Tesseract_LIBRARY AND Lept_LIBRARY) diff --git a/modules/text/include/opencv2/text.hpp b/modules/text/include/opencv2/text.hpp index 85b8b741982..c06c889838c 100644 --- a/modules/text/include/opencv2/text.hpp +++ b/modules/text/include/opencv2/text.hpp @@ -93,7 +93,7 @@ grouping horizontally aligned text, and the method proposed by Lluis Gomez and D in @cite Gomez13 @cite Gomez14 for grouping arbitrary oriented text (see erGrouping). To see the text detector at work, have a look at the textdetection demo: - + @defgroup text_recognize Scene Text Recognition @} diff --git a/modules/text/include/opencv2/text/erfilter.hpp b/modules/text/include/opencv2/text/erfilter.hpp index 2bd1c56a356..c9bac2b3272 100644 --- a/modules/text/include/opencv2/text/erfilter.hpp +++ b/modules/text/include/opencv2/text/erfilter.hpp @@ -65,7 +65,6 @@ component tree of the image. : */ struct CV_EXPORTS ERStat { - public: //! 
Constructor explicit ERStat(int level = 256, int pixel = 0, int x = 0, int y = 0); diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index df9c2b4aa59..22c98448cf1 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -44,12 +44,10 @@ #ifndef __OPENCV_TEXT_OCR_HPP__ #define __OPENCV_TEXT_OCR_HPP__ +#include + #include #include -#include -#include - - namespace cv { @@ -91,100 +89,61 @@ enum ocr_engine_mode }; //base class BaseOCR declares a common API that would be used in a typical text recognition scenario - class CV_EXPORTS_W BaseOCR { - public: +public: virtual ~BaseOCR() {}; - - virtual void run(Mat& image, std::string& output_text, - std::vector* component_rects=NULL, - std::vector* component_texts=NULL, - std::vector* component_confidences=NULL, + virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, int component_level=0) = 0; - - virtual void run(Mat& image, Mat& mask, std::string& output_text, - std::vector* component_rects=NULL, - std::vector* component_texts=NULL, - std::vector* component_confidences=NULL, + virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, int component_level=0) = 0; - - /** @brief Main functionality of the OCR Hierarchy. Subclasses provide - * default parameters for all parameters other than the input image. 
- */ - virtual String run(InputArray image){ - std::string res; - std::vector component_rects; - std::vector component_confidences; - std::vector component_texts; - Mat inputImage=image.getMat(); - this->run(inputImage,res,&component_rects,&component_texts, - &component_confidences,OCR_LEVEL_WORD); - return res; - } - }; -/** @brief OCRTesseract class provides an interface with the tesseract-ocr API - * (v3.02.02) in C++. +/** @brief OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++. Notice that it is compiled only when tesseract-ocr is correctly installed. @note - - (C++) An example of OCRTesseract recognition combined with scene text - detection can be found at the end_to_end_recognition demo: - - - (C++) Another example of OCRTesseract recognition combined with scene - text detection can be found at the webcam_demo: - + - (C++) An example of OCRTesseract recognition combined with scene text detection can be found + at the end_to_end_recognition demo: + + - (C++) Another example of OCRTesseract recognition combined with scene text detection can be + found at the webcam_demo: + */ class CV_EXPORTS_W OCRTesseract : public BaseOCR { public: /** @brief Recognize text using the tesseract-ocr API. - Takes image on input and returns recognized text in the output_text - parameter. Optionally provides also the Rects for individual text elements - found (e.g. words), and the list of those text elements with their - confidence values. + Takes image on input and returns recognized text in the output_text parameter. Optionally + provides also the Rects for individual text elements found (e.g. words), and the list of those + text elements with their confidence values. @param image Input image CV_8UC1 or CV_8UC3 - @param output_text Output text of the tesseract-ocr. - - @param component_rects If provided the method will output a list of Rects - for the individual text elements found (e.g. words or text lines). 
- - @param component_texts If provided the method will output a list of text - strings for the recognition of individual text elements found (e.g. words or - text lines). - - @param component_confidences If provided the method will output a list of - confidence values for the recognition of individual text elements found - (e.g. words or text lines). - - @param component_level OCR_LEVEL_WORD (by default), or OCR_LEVEL_TEXT_LINE. - + @param component_rects If provided the method will output a list of Rects for the individual + text elements found (e.g. words or text lines). + @param component_texts If provided the method will output a list of text strings for the + recognition of individual text elements found (e.g. words or text lines). + @param component_confidences If provided the method will output a list of confidence values + for the recognition of individual text elements found (e.g. words or text lines). + @param component_level OCR_LEVEL_WORD (by default), or OCR_LEVEL_TEXTLINE. 
*/ - using BaseOCR::run; - virtual void run (Mat& image, std::string& output_text, - std::vector* component_rects=NULL, - std::vector* component_texts=NULL, - std::vector* component_confidences=NULL, + virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, int component_level=0); - virtual void run (Mat& image, Mat& mask, std::string& output_text, - std::vector* component_rects=NULL, - std::vector* component_texts=NULL, - std::vector* component_confidences=NULL, - int component_level=0); + virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + int component_level=0); // aliases for scripting - CV_WRAP String run (InputArray image, int min_confidence, - int component_level=0); + CV_WRAP String run(InputArray image, int min_confidence, int component_level=0); - CV_WRAP String run(InputArray image, InputArray mask, - int min_confidence, int component_level=0); + CV_WRAP String run(InputArray image, InputArray mask, int min_confidence, int component_level=0); CV_WRAP virtual void setWhiteList(const String& char_whitelist) = 0; @@ -205,7 +164,6 @@ class CV_EXPORTS_W OCRTesseract : public BaseOCR */ CV_WRAP static Ptr create(const char* datapath=NULL, const char* language=NULL, const char* char_whitelist=NULL, int oem=OEM_DEFAULT, int psmode=PSM_AUTO); - }; @@ -225,19 +183,19 @@ enum classifier_type /** @brief OCRHMMDecoder class provides an interface for OCR using Hidden Markov Models. 
- - * @note - * - (C++) An example on using OCRHMMDecoder recognition combined with scene - * text detection can be found at the webcam_demo sample: - * +@note + - (C++) An example on using OCRHMMDecoder recognition combined with scene text detection can + be found at the webcam_demo sample: + */ -class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR { - public: +class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR +{ +public: /** @brief Callback with the character classifier is made a class. - * This way it hides the feature extractor and the classifier itself, so - * developers can write their own OCR code. + This way it hides the feature extractor and the classifier itself, so developers can write + their own OCR code. The default character classifier and feature extractor can be loaded using the utility function loadOCRHMMClassifierNM and KNN model provided in @@ -246,120 +204,92 @@ class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR { class CV_EXPORTS_W ClassifierCallback { public: - virtual ~ClassifierCallback() { } - /** @brief The character classifier must return a (ranked list of) - * class(es) id('s) - - * @param image Input image CV_8UC1 or CV_8UC3 with a single letter. - * @param out_class The classifier returns the character class - * categorical label, or list of class labels, to which the input image - * corresponds. + /** @brief The character classifier must return a (ranked list of) class(es) id('s) - * @param out_confidence The classifier returns the probability of the - * input image corresponding to each classes in out_class. + @param image Input image CV_8UC1 or CV_8UC3 with a single letter. + @param out_class The classifier returns the character class categorical label, or list of + class labels, to which the input image corresponds. + @param out_confidence The classifier returns the probability of the input image + corresponding to each classes in out_class. 
*/ - virtual void eval (InputArray image, std::vector& out_class, - std::vector& out_confidence); + virtual void eval( InputArray image, std::vector& out_class, std::vector& out_confidence); }; +public: /** @brief Recognize text using HMM. - * Takes binary image on input and returns recognized text in the output_text - * parameter. Optionally provides also the Rects for individual text elements - * found (e.g. words), and the list of those text elements with their - * confidence values. + Takes binary image on input and returns recognized text in the output_text parameter. Optionally + provides also the Rects for individual text elements found (e.g. words), and the list of those + text elements with their confidence values. - * @param image Input binary image CV_8UC1 with a single text line (or word). + @param image Input binary image CV_8UC1 with a single text line (or word). - * @param output_text Output text. Most likely character sequence found by - * the HMM decoder. + @param output_text Output text. Most likely character sequence found by the HMM decoder. - * @param component_rects If provided the method will output a list of Rects - * for the individual text elements found (e.g. words). + @param component_rects If provided the method will output a list of Rects for the individual + text elements found (e.g. words). - * @param component_texts If provided the method will output a list of text - * strings for the recognition of individual text elements found (e.g. words). + @param component_texts If provided the method will output a list of text strings for the + recognition of individual text elements found (e.g. words). - * @param component_confidences If provided the method will output a list of - * confidence values for the recognition of individual text elements found - * (e.g. words). + @param component_confidences If provided the method will output a list of confidence values + for the recognition of individual text elements found (e.g. words). 
- * @param component_level Only OCR_LEVEL_WORD is supported. - */ - using BaseOCR::run; - virtual void run (Mat& image, std::string& output_text, - std::vector* component_rects=NULL, - std::vector* component_texts=NULL, - std::vector* component_confidences=NULL, - int component_level=0); + @param component_level Only OCR_LEVEL_WORD is supported. + */ + virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + int component_level=0); /** @brief Recognize text using HMM. - * Takes an image and a mask (where each connected component corresponds to a - * segmented character) on input and returns recognized text in the - * output_text parameter. Optionally provides also the Rects for individual - * text elements found (e.g. words), and the list of those text elements with - * their confidence values. - - * @param image Input image CV_8UC1 or CV_8UC3 with a single text line - * (or word). + Takes an image and a mask (where each connected component corresponds to a segmented character) + on input and returns recognized text in the output_text parameter. Optionally + provides also the Rects for individual text elements found (e.g. words), and the list of those + text elements with their confidence values. - * @param mask Input binary image CV_8UC1 same size as input image. Each - * connected component in mask corresponds to a segmented character in the - * input image. + @param image Input image CV_8UC1 or CV_8UC3 with a single text line (or word). + @param mask Input binary image CV_8UC1 same size as input image. Each connected component in mask corresponds to a segmented character in the input image. - * @param output_text Output text. Most likely character sequence found by - * the HMM decoder. + @param output_text Output text. Most likely character sequence found by the HMM decoder. 
- * @param component_rects If provided the method will output a list of Rects - * for the individual text elements found (e.g. words). + @param component_rects If provided the method will output a list of Rects for the individual + text elements found (e.g. words). - * @param component_texts If provided the method will output a list of text - * strings for the recognition of individual text elements found (e.g. words). + @param component_texts If provided the method will output a list of text strings for the + recognition of individual text elements found (e.g. words). - * @param component_confidences If provided the method will output a list of - * confidence values for the recognition of individual text elements found - * (e.g. words). + @param component_confidences If provided the method will output a list of confidence values + for the recognition of individual text elements found (e.g. words). - * @param component_level Only OCR_LEVEL_WORD is supported. - */ - virtual void run(Mat& image, Mat& mask, std::string& output_text, - std::vector* component_rects=NULL, - std::vector* component_texts=NULL, - std::vector* component_confidences=NULL, + @param component_level Only OCR_LEVEL_WORD is supported. + */ + virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, int component_level=0); // aliases for scripting - CV_WRAP String run(InputArray image, - int min_confidence, - int component_level=0); + CV_WRAP String run(InputArray image, int min_confidence, int component_level=0); - CV_WRAP String run(InputArray image, - InputArray mask, - int min_confidence, - int component_level=0); + CV_WRAP String run(InputArray image, InputArray mask, int min_confidence, int component_level=0); - /** @brief Creates an instance of the OCRHMMDecoder class. Initializes - * HMMDecoder. + /** @brief Creates an instance of the OCRHMMDecoder class. 
Initializes HMMDecoder. - * @param classifier The character classifier with built in feature - * extractor. + @param classifier The character classifier with built in feature extractor. - * @param vocabulary The language vocabulary (chars when ascii english text) - * . vocabulary.size() must be equal to the number of classes of the - * classifier. + @param vocabulary The language vocabulary (chars when ascii english text). vocabulary.size() + must be equal to the number of classes of the classifier. - * @param transition_probabilities_table Table with transition probabilities - * between character pairs. cols == rows == vocabulary.size(). + @param transition_probabilities_table Table with transition probabilities between character + pairs. cols == rows == vocabulary.size(). - * @param emission_probabilities_table Table with observation emission - * probabilities. cols == rows == vocabulary.size(). + @param emission_probabilities_table Table with observation emission probabilities. cols == + rows == vocabulary.size(). - * @param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available - * for the moment (). + @param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available for the moment + (). */ - static Ptr create(const Ptr classifier,// The character classifier with built in feature extractor const std::string& vocabulary, // The language vocabulary (chars when ASCII English text) // size() must be equal to the number of classes @@ -402,11 +332,9 @@ class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR { decoder_mode mode; }; -/** @brief Allow to implicitly load the default character classifier when - * creating an OCRHMMDecoder object. - - @param filename The XML or YAML file with the classifier model (e.g.OCRHMM_knn_model_data.xml) +/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object. +@param filename The XML or YAML file with the classifier model (e.g. 
OCRHMM_knn_model_data.xml) The KNN default classifier is based in the scene text recognition method proposed by Lukás Neumann & Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a @@ -416,16 +344,11 @@ using a KNN model trained with synthetic data of rendered characters with differ types. @deprecated loadOCRHMMClassifier instead - */ -CV_EXPORTS_W Ptr loadOCRHMMClassifierNM ( - const String& filename); -/** @brief Allow to implicitly load the default character classifier when - * creating an OCRHMMDecoder object. - - @param filename The XML or YAML file with the classifier model (e.g.OCRBeamSearch_CNN_model_data.xml.gz) +CV_EXPORTS_W Ptr loadOCRHMMClassifierNM(const String& filename); +/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object. @param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz) @@ -435,10 +358,8 @@ a linear classifier. It is applied to the input image in a sliding window fashio at each window location. @deprecated use loadOCRHMMClassifier instead - */ -CV_EXPORTS_W Ptr loadOCRHMMClassifierCNN ( - const String& filename); +CV_EXPORTS_W Ptr loadOCRHMMClassifierCNN(const String& filename); /** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object. @@ -450,64 +371,49 @@ CV_EXPORTS_W Ptr loadOCRHMMClassifierCNN ( CV_EXPORTS_W Ptr loadOCRHMMClassifier(const String& filename, int classifier); //! @} - /** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon). * * @param vocabulary The language vocabulary (chars when ASCII English text). * * @param lexicon The list of words that are expected to be found in a particular image. - - * @param transition_probabilities_table Output table with transition - * probabilities between character pairs. cols == rows == vocabulary.size(). 
- - * The function calculate frequency statistics of character pairs from the given - * lexicon and fills the output transition_probabilities_table with them. The - * transition_probabilities_table can be used as input in the - * OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods. + * + * @param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size(). + * + * The function calculate frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods. * @note - * - (C++) An alternative would be to load the default generic language - * transition table provided in the text module samples folder (created - * from ispell 42869 english words list) : - * + * - (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) : + * **/ -CV_EXPORTS void createOCRHMMTransitionsTable ( - std::string& vocabulary, std::vector& lexicon, - OutputArray transition_probabilities_table); +CV_EXPORTS void createOCRHMMTransitionsTable(std::string& vocabulary, std::vector& lexicon, OutputArray transition_probabilities_table); + +CV_EXPORTS_W Mat createOCRHMMTransitionsTable(const String& vocabulary, std::vector& lexicon); -CV_EXPORTS_W Mat createOCRHMMTransitionsTable ( - const String& vocabulary, std::vector& lexicon); /* OCR BeamSearch Decoder */ -/** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam - * Search algorithm. +/** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam Search algorithm. 
@note - - (C++) An example on using OCRBeamSearchDecoder recognition combined with - scene text detection can be found at the demo sample: - + - (C++) An example on using OCRBeamSearchDecoder recognition combined with scene text detection can + be found at the demo sample: + */ - - -/* Forward declaration of class that can be used to generate an OCRBeamSearchDecoder::ClassifierCallbac */ -class TextImageClassifier; - -class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{ - - public: +class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR +{ +public: /** @brief Callback with the character classifier is made a class. - * This way it hides the feature extractor and the classifier itself, so - * developers can write their own OCR code. + This way it hides the feature extractor and the classifier itself, so developers can write + their own OCR code. - * The default character classifier and feature extractor can be loaded - * using the utility funtion loadOCRBeamSearchClassifierCNN with all its - * parameters provided in - * . + The default character classifier and feature extractor can be loaded using the utility funtion + loadOCRBeamSearchClassifierCNN with all its parameters provided in + . */ - class CV_EXPORTS_W ClassifierCallback{ - public: + class CV_EXPORTS_W ClassifierCallback + { + public: virtual ~ClassifierCallback() { } /** @brief The character classifier must return a (ranked list of) class(es) id('s) @@ -519,8 +425,8 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{ */ virtual void eval( InputArray image, std::vector< std::vector >& recognition_probabilities, std::vector& oversegmentation ); - virtual int getWindowSize() {return 0;} - virtual int getStepSize() {return 0;} + int getWindowSize() {return 0;} + int getStepSize() {return 0;} }; public: @@ -545,7 +451,6 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{ @param component_level Only OCR_LEVEL_WORD is supported. 
*/ - using BaseOCR::run; virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, std::vector* component_texts=NULL, std::vector* component_confidences=NULL, int component_level=0); @@ -577,7 +482,6 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{ @param beam_size Size of the beam in Beam Search algorithm. */ - static Ptr create(const Ptr classifier,// The character classifier with built in feature extractor const std::string& vocabulary, // The language vocabulary (chars when ASCII English text) // size() must be equal to the number of classes @@ -598,29 +502,10 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{ int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment) int beam_size = 500); // Size of the beam in Beam Search algorithm - - - - /** @brief Creates an instance of the OCRBeamSearchDecoder class. Initializes HMMDecoder from the specified path. @overload - @param filename path to a character classifier file - - @param vocabulary The language vocabulary (chars when ASCII English text). vocabulary.size() - must be equal to the number of classes of the classifier.. - - @param transition_probabilities_table Table with transition probabilities between character - pairs. cols == rows == vocabulary.size(). - - @param emission_probabilities_table Table with observation emission probabilities. cols == - rows == vocabulary.size(). 
- - @param mode HMM Decoding algorithm (only Viterbi for the moment) - - @param beam_size Size of the beam in Beam Search algorithm - */ CV_WRAP static Ptr create(const String& filename, // The character classifier file const String& vocabulary, // The language vocabulary (chars when ASCII English text) @@ -631,7 +516,6 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{ // cols == rows == vocabulary.size() int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment) int beam_size = 500); - protected: Ptr classifier; @@ -656,402 +540,6 @@ CV_EXPORTS_W Ptr loadOCRBeamSearchClas //! @} - -//Classifiers should provide diferent backends - -enum{ - OCR_HOLISTIC_BACKEND_NONE, //No back end - OCR_HOLISTIC_BACKEND_DNN, // dnn backend opencv_dnn - OCR_HOLISTIC_BACKEND_CAFFE, // caffe based backend - OCR_HOLISTIC_BACKEND_DEFAULT // to store default value based on environment -}; - -class TextImageClassifier; - -/** - * @brief The ImagePreprocessor class - */ -class CV_EXPORTS_W ImagePreprocessor{ -protected: - virtual void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels)=0; - virtual void set_mean_(Mat){} - -public: - virtual ~ImagePreprocessor(){} - - /** @brief this method in provides public acces to the preprocessing with respect to a specific - * classifier - * - * This method's main use would be to use the preprocessor without feeding it to a classifier. - * Determining the exact behavior of a preprocessor is the main motivation for this. 
- * - * @param input an image without any constraints - * - * @param output in most cases an image of fixed depth size and whitened - * - * @param sz the size to which the image would be resize if the preprocessor resizes inputs - * - * @param outputChannels the number of channels for the output image - */ - CV_WRAP void preprocess(InputArray input,OutputArray output,Size sz,int outputChannels); - - /** @brief this method in provides public acces to set the mean of the input images - * mean can be a mat either of same size of the image or one value per color channel - * A preprocessor can be created without the mean( the pre processor will calculate mean for every image - * in that case - * - - * @param mean which will be subtracted from the images - * - */ - - CV_WRAP void set_mean(Mat mean); - - /** @brief Creates a functor that only resizes and changes the channels of the input - * without further processing. - * - * @return shared pointer to the generated preprocessor - */ - CV_WRAP static Ptr createResizer(); - - /** @brief - * - * @param sigma - * - * @return shared pointer to generated preprocessor - */ - CV_WRAP static Ptr createImageStandarizer(double sigma); - - /** @brief - * - * @return shared pointer to generated preprocessor - */ - CV_WRAP static Ptr createImageMeanSubtractor(InputArray meanImg); - /** @brief - * create a functor with the parameters, parameters can be changes by corresponding set functions - * @return shared pointer to generated preprocessor - */ - - CV_WRAP static PtrcreateImageCustomPreprocessor(double rawval=1.0,String channel_order="BGR"); - - friend class TextImageClassifier; - -}; - -/** @brief Abstract class that implements the classifcation of text images. - * - * The interface is generic enough to describe any image classifier. And allows - * to take advantage of compouting in batches. While word classifiers are the default - * networks, any image classifers should work. 
- * - */ -class CV_EXPORTS_W TextImageClassifier -{ -protected: - Size inputGeometry_; - Size outputGeometry_; - int channelCount_; - Ptr preprocessor_; - /** @brief all image preprocessing is handled here including whitening etc. - * - * @param input the image to be preprocessed for the classifier. If the depth - * is CV_U8 values should be in [0,255] otherwise values are assumed to be in [0,1] - * - * @param output reference to the image to be fed to the classifier, the preprocessor will - * resize the image to the apropriate size and convert it to the apropriate depth\ - * - * The method preprocess should never be used externally, it is up to classify and classifyBatch - * methods to employ it. - */ - virtual void preprocess(const Mat& input,Mat& output); -public: - virtual ~TextImageClassifier() {} - - /** @brief - */ - CV_WRAP virtual void setPreprocessor(Ptr ptr); - - /** @brief - */ - CV_WRAP Ptr getPreprocessor(); - - /** @brief produces a class confidence row-vector given an image - */ - CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0; - - /** @brief produces a matrix containing class confidence row-vectors given an collection of images - */ - CV_WRAP virtual void classifyBatch(InputArrayOfArrays image, OutputArray classProbabilities) = 0; - - /** @brief simple getter method returning the number of channels each input sample has - */ - CV_WRAP virtual int getInputChannelCount(){return this->channelCount_;} - - /** @brief simple getter method returning the size of the input sample - */ - CV_WRAP virtual Size getInputSize(){return this->inputGeometry_;} - - /** @brief simple getter method returning the size of the oputput row-vector - */ - CV_WRAP virtual int getOutputSize()=0; - /** @brief simple getter method returning the shape of the oputput from caffe - */ - CV_WRAP virtual Size getOutputGeometry()=0; - - /** @brief simple getter method returning the size of the minibatches for this classifier. 
- * If not applicabe this method should return 1 - */ - CV_WRAP virtual int getMinibatchSize()=0; - - friend class ImagePreprocessor; -}; - - - -class CV_EXPORTS_W DeepCNN:public TextImageClassifier -{ - /** @brief Class that uses a pretrained caffe model for word classification. - * - * This network is described in detail in: - * Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015 - * http://arxiv.org/abs/1412.1842 - */ -public: - virtual ~DeepCNN() {}; - - /** @brief Constructs a DeepCNN object from a caffe pretrained model - * - * @param archFilename is the path to the prototxt file containing the deployment model architecture description. - * - * @param weightsFilename is the path to the pretrained weights of the model in binary fdorm. This file can be - * very large, up to 2GB. - * - * @param preprocessor is a pointer to the instance of a ImagePreprocessor implementing the preprocess_ protecteed method; - * - * @param minibatchSz the maximum number of samples that can processed in parallel. In practice this parameter - * has an effect only when computing in the GPU and should be set with respect to the memory available in the GPU. - * - * @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is - * the only option - */ - CV_WRAP static Ptr create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT); - - /** @brief Constructs a DeepCNN intended to be used for word spotting. - * - * This method loads a pretrained classifier and couples him with a preprocessor that standarises pixels with a - * deviation of 113. The architecture file can be downloaded from: - * - * While the weights can be downloaded from: - * - * The words assigned to the network outputs are available at: - * - * - * @param archFilename is the path to the prototxt file containing the deployment model architecture description. 
- * When employing OCR_HOLISTIC_BACKEND_CAFFE this is the path to the deploy ".prototxt". - * - * @param weightsFilename is the path to the pretrained weights of the model. When employing - * OCR_HOLISTIC_BACKEND_CAFFE this is the path to the ".caffemodel" file. This file can be very large, the - * pretrained DictNet uses 2GB. - * - * @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is - * the only option - */ - CV_WRAP static Ptr createDictNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT); - -}; - -namespace cnn_config{ - -/** @brief runtime backend information - * - * this function finds the status of backends compiled with this module - * - * @return a list of backends (caffe,opencv-dnn etc.) - * */ -CV_EXPORTS_W std::vector getAvailableBackends(); - -namespace caffe_backend{ - -/** @brief Prompts Caffe on the computation device beeing used - * - * Caffe can only be controlled globally on whether the GPU or the CPU is used has a - * global behavior. This function queries the current state of caffe. - * If the module is built without caffe, this method throws an exception. - * - * @return true if caffe is computing on the GPU, false if caffe is computing on the CPU - */ -CV_EXPORTS_W bool getCaffeGpuMode(); - -/** @brief Sets the computation device beeing used by Caffe - * - * Caffe can only be controlled globally on whether the GPU or the CPU is used has a - * global behavior. This function queries the current state of caffe. - * If the module is built without caffe, this method throws an exception. - * - * @param useGpu set to true for caffe to be computing on the GPU, false if caffe is - * computing on the CPU - */ -CV_EXPORTS_W void setCaffeGpuMode(bool useGpu); - -/** @brief Provides runtime information on whether Caffe support was compiled in. - * - * The text module API is the same regardless of whether CAffe was available or not - * During compilation. 
When methods that require Caffe are invocked while Caffe support - * is not compiled in, exceptions are thrown. This method allows to test whether the - * text module was built with caffe during runtime. - * - * @return true if Caffe support for the the text module was provided during compilation, - * false if Caffe was unavailable. - */ -CV_EXPORTS_W bool getCaffeAvailable(); - -}//caffe -namespace dnn_backend { - -/** @brief Provides runtime information on whether DNN module was compiled in. - * - * The text module API is the same regardless of whether DNN module was available or not - * During compilation. When methods that require backend are invocked while no backend support - * is compiled, exceptions are thrown. This method allows to test whether the - * text module was built with dnn_backend during runtime. - * - * @return true if opencv_dnn support for the the text module was provided during compilation, - * false if opencv_dnn was unavailable. - */ -CV_EXPORTS_W bool getDNNAvailable(); - -}//dnn_backend -}//cnn_config - -/** @brief OCRHolisticWordRecognizer class provides the functionallity of segmented wordspotting. - * Given a predefined vocabulary , a TextImageClassifier is employed to select the most probable - * word given an input image. - * - * This class implements the logic of providing transcriptions given a vocabulary and and an image - * classifer. The classifier has to be any TextImageClassifier but the classifier for which this - * class was built is the DictNet. In order to load it the following files should be downloaded: - - * - * - * - */ -class CV_EXPORTS_W OCRHolisticWordRecognizer : public BaseOCR -{ -public: - virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=OCR_LEVEL_WORD)=0; - - /** @brief Recognize text using a segmentation based word-spotting/classifier cnn. 
- - Takes image on input and returns recognized text in the output_text parameter. Optionally - provides also the Rects for individual text elements found (e.g. words), and the list of those - text elements with their confidence values. - - @param image Input image CV_8UC1 or CV_8UC3 - - @param mask is totally ignored and is only available for compatibillity reasons - - @param output_text Output text of the the word spoting, always one that exists in the dictionary. - - @param component_rects Not applicable for word spotting can be be NULL if not, a single elemnt will - be put in the vector. - - @param component_texts Not applicable for word spotting can be be NULL if not, a single elemnt will - be put in the vector. - - @param component_confidences Not applicable for word spotting can be be NULL if not, a single elemnt will - be put in the vector. - - @param component_level must be OCR_LEVEL_WORD. - */ - - virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=OCR_LEVEL_WORD)=0; - - - /** - @brief Method that provides a quick and simple interface to a single word image classifcation - - @param inputImage an image expected to be a CV_U8C1 or CV_U8C3 of any size assumed to contain a single word - - @param transcription an opencv string that will store the detected word transcription - - @param confidence a double that will be updated with the confidence the classifier has for the selected word - */ - CV_WRAP virtual void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)=0; - - /** - @brief Method that provides a quick and simple interface to a multiple word image classifcation taking advantage - the classifiers parallel capabilities. 
- - @param inputImageList an list of images expected to be a CV_U8C1 or CV_U8C3 each image can be of any size and is assumed - to contain a single word. - - @param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each - input image - - @param confidences a vector of double that will be updated with the confidence the classifier has for each of the - selected words. - */ - CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector& transcriptions,CV_OUT std::vector& confidences)=0; - - - /** - @brief simple getter for the vocabulary employed - */ - CV_WRAP virtual const std::vector& getVocabulary()=0; - - /** @brief simple getter for the preprocessing functor - */ - CV_WRAP virtual Ptr getClassifier()=0; - - /** @brief Creates an instance of the OCRHolisticWordRecognizer class. - - @param classifierPtr an instance of TextImageClassifier, normaly a DeepCNN instance - - @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line - in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize - of the classifier. - */ - CV_WRAP static Ptr create(Ptr classifierPtr,String vocabularyFilename); - - - /** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DeepCNN classifier. - - @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. - - @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. - - @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line - in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize - of the classifier. 
- */ - CV_WRAP static Ptr create(String modelArchFilename, String modelWeightsFilename, String vocabularyFilename); - - /** @brief - * - * @param classifierPtr - * - * @param vocabulary - */ - CV_WRAP static Ptr create(Ptr classifierPtr,const std::vector& vocabulary); - - /** @brief - * - * @param modelArchFilename - * - * @param modelWeightsFilename - * - * @param vocabulary - */ - CV_WRAP static Ptr create (String modelArchFilename, String modelWeightsFilename, const std::vector& vocabulary); -}; - - -}//namespace text -}//namespace cv - - +} +} #endif // _OPENCV_TEXT_OCR_HPP_ diff --git a/modules/text/include/opencv2/text/textDetector.hpp b/modules/text/include/opencv2/text/textDetector.hpp index eda74801449..0e51df39f4c 100644 --- a/modules/text/include/opencv2/text/textDetector.hpp +++ b/modules/text/include/opencv2/text/textDetector.hpp @@ -1,56 +1,12 @@ -/*M////////////////////////////////////////////////////////////////////////////////////////// -// -// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. -// -// By downloading, copying, installing or using the software you agree to this license. -// If you do not agree to this license, do not download, install, -// copy or use the software. -// -// -// License Agreement -// For Open Source Computer Vision Library -// -// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. -// Copyright (C) 2009, Willow Garage Inc., all rights reserved. -// Copyright (C) 2013, OpenCV Foundation, all rights reserved. -// Third party copyrights are property of their respective owners. -// -// Redistribution and use in source and binary forms, with or without modification, -// are permitted provided that the following conditions are met: -// -// * Redistribution's of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. 
-// -// * Redistribution's in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// * The name of the copyright holders may not be used to endorse or promote products -// derived from this software without specific prior written permission. -// -// This software is provided by the copyright holders and contributors "as is" and -// any express or implied warranties, including, but not limited to, the implied -// warranties of merchantability and fitness for a particular purpose are disclaimed. -// In no event shall the Intel Corporation or contributors be liable for any direct, -// indirect, incidental, special, exemplary, or consequential damages -// (including, but not limited to, procurement of substitute goods or services; -// loss of use, data, or profits; or business interruption) however caused -// and on any theory of liability, whether in contract, strict liability, -// or tort (including negligence or otherwise) arising in any way out of -// the use of this software, even if advised of the possibility of such damage. -// -//M*/ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. #ifndef __OPENCV_TEXT_TEXTDETECTOR_HPP__ #define __OPENCV_TEXT_TEXTDETECTOR_HPP__ -#include -#include -#include -#include #include"ocr.hpp" - namespace cv { namespace text @@ -59,208 +15,44 @@ namespace text //! @addtogroup text_detect //! 
@{ - - -//base class BaseDetector declares a common API that would be used in a typical text -//detection scenario -class CV_EXPORTS_W BaseDetector -{ -public: - virtual ~BaseDetector() {}; - - virtual void run(Mat& image, - std::vector* component_rects=NULL, - std::vector* component_confidences=NULL, - int component_level=0) = 0; - - virtual void run(Mat& image, Mat& mask, - std::vector* component_rects=NULL, - std::vector* component_confidences=NULL, - int component_level=0) = 0; - -}; -/** A virtual class for different models of text detection (including CNN based deep models) +/** @brief An abstract class providing interface for text detection algorithms */ - -class CV_EXPORTS_W TextRegionDetector +class CV_EXPORTS_W TextDetector { -protected: - /** Stores input and output size - */ - //netGeometry inputGeometry_; - //netGeometry outputGeometry_; - Size inputGeometry_; - Size outputGeometry_; - int inputChannelCount_; - int outputChannelCount_; - public: - virtual ~TextRegionDetector() {} - - /** @brief produces a list of Bounding boxes and an estimate of text-ness confidence of Bounding Boxes - */ - CV_WRAP virtual void detect(InputArray image, OutputArray bboxProb ) = 0; - - - /** @brief simple getter method returning the size (height, width) of the input sample - */ - CV_WRAP virtual Size getInputGeometry(){return this->inputGeometry_;} - - /** @brief simple getter method returning the shape of the oputput - * Any text detector should output a number of text regions alongwith a score of text-ness - * From the shape it can be inferred the number of text regions and number of returned value - * for each region - */ - CV_WRAP virtual Size getOutputGeometry(){return this->outputGeometry_;} - - - -}; - -/** Generic structure of Deep CNN based Text Detectors - * */ -class CV_EXPORTS_W DeepCNNTextDetector : public TextRegionDetector -{ - /** @brief Class that uses a pretrained caffe model for text detection. 
- * Any text detection should - * This network is described in detail in: - * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network - * https://arxiv.org/abs/1611.06779 - */ -protected: - /** all deep CNN based text detectors have a preprocessor (normally) - */ - Ptr preprocessor_; - /** @brief all image preprocessing is handled here including whitening etc. - * - * @param input the image to be preprocessed for the classifier. If the depth - * is CV_U8 values should be in [0,255] otherwise values are assumed to be in [0,1] - * - * @param output reference to the image to be fed to the classifier, the preprocessor will - * resize the image to the apropriate size and convert it to the apropriate depth\ - * - * The method preprocess should never be used externally, it is up to classify and classifyBatch - * methods to employ it. - */ - virtual void preprocess(const Mat& input,Mat& output); -public: - virtual ~DeepCNNTextDetector() {}; - - /** @brief Constructs a DeepCNNTextDetector object from a caffe pretrained model - * - * @param archFilename is the path to the prototxt file containing the deployment model architecture description. - * - * @param weightsFilename is the path to the pretrained weights of the model in binary fdorm. - * - * @param preprocessor is a pointer to the instance of a ImagePreprocessor implementing the preprocess_ protecteed method; - * - * @param minibatchSz the maximum number of samples that can processed in parallel. In practice this parameter - * has an effect only when computing in the GPU and should be set with respect to the memory available in the GPU. - * - * @param backEnd integer parameter selecting the coputation framework. 
For now OCR_HOLISTIC_BACKEND_CAFFE is - * the only option - */ - CV_WRAP static Ptr create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT); - - /** @brief Constructs a DeepCNNTextDetector intended to be used for text area detection. - * - * This method loads a pretrained classifier and couples with a preprocessor that preprocess the image with mean subtraction of () - * The architecture and models weights can be downloaded from: - * https://github.com/sghoshcvc/TextBox-Models.git (size is around 100 MB) - - * @param archFilename is the path to the prototxt file containing the deployment model architecture description. - * When employing OCR_HOLISTIC_BACKEND_CAFFE this is the path to the deploy ".prototxt". - * - * @param weightsFilename is the path to the pretrained weights of the model. When employing - * OCR_HOLISTIC_BACKEND_CAFFE this is the path to the ".caffemodel" file. - * - * @param backEnd integer parameter selecting the coputation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is - * the only option - */ - CV_WRAP static Ptr createTextBoxNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_DEFAULT); - friend class ImagePreprocessor; + /** + @brief Method that provides a quick and simple interface to detect text inside an image + @param inputImage an image to process + @param Bbox a vector of Rect that will store the detected word bounding box + @param confidence a vector of float that will be updated with the confidence the classifier has for the selected bounding box + */ + virtual void textDetectInImage(InputArray inputImage, CV_OUT std::vector& Bbox, CV_OUT std::vector& confidence) = 0; + virtual ~TextDetector() {} }; -/** @brief textDetector class provides the functionallity of text bounding box detection. - * A TextRegionDetector is employed to find bounding boxes of text - * words given an input image. 
- * - * This class implements the logic of providing text bounding boxes in a vector of rects given an TextRegionDetector - * The TextRegionDetector can be any text detector - * +/** @brief TextDetectorCNN class provides the functionallity of text bounding box detection. + * A TextDetectorCNN is employed to find bounding boxes of text words given an input image. */ - -class CV_EXPORTS_W textDetector : public BaseDetector +class CV_EXPORTS_W TextDetectorCNN : public TextDetector { public: - virtual void run(Mat& image, std::vector* component_rects=NULL, - std::vector* component_confidences=NULL, - int component_level=OCR_LEVEL_WORD)=0; - - /** @brief detect text with a cnn, input is one image with (multiple) ocuurance of text. - - Takes image on input and returns recognized text in the output_text parameter. Optionally - provides also the Rects for individual text elements found (e.g. words), and the list of those - text elements with their confidence values. - - @param image Input image CV_8UC1 or CV_8UC3 - - @param mask is totally ignored and is only available for compatibillity reasons - - - @param component_rects a vector of Rects, each rect is one text bounding box. - - - - @param component_confidences A vector of float returns confidence of text bounding boxes - - @param component_level must be OCR_LEVEL_WORD. 
- */ - - virtual void run(Mat& image, Mat& mask, std::vector* component_rects=NULL, - std::vector* component_confidences=NULL, - int component_level=OCR_LEVEL_WORD)=0; - - /** - @brief Method that provides a quick and simple interface to detect text inside an image + @overload @param inputImage an image expected to be a CV_U8C3 of any size - @param Bbox a vector of Rect that will store the detected word bounding box - @param confidence a vector of float that will be updated with the confidence the classifier has for the selected bounding box */ - CV_WRAP virtual void textDetectInImage(InputArray inputImage,CV_OUT std::vector& Bbox,CV_OUT std::vector& confidence)=0; - - - - - /** @brief simple getter for the preprocessing functor - */ - CV_WRAP virtual Ptr getClassifier()=0; - - /** @brief Creates an instance of the textDetector class. - - @param classifierPtr an instance of TextImageClassifier, normaly a DeepCNN instance - - - */ - CV_WRAP static Ptr create(Ptr classifierPtr); - + CV_WRAP virtual void textDetectInImage(InputArray inputImage, CV_OUT std::vector& Bbox, CV_OUT std::vector& confidence) = 0; /** @brief Creates an instance of the textDetector class and implicitly also a DeepCNN classifier. @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. - @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. - - + @param detectMultiscale if true, multiple scales of the input image will be used as network input */ - CV_WRAP static Ptr create(String modelArchFilename, String modelWeightsFilename); - - + CV_WRAP static Ptr create(const String& modelArchFilename, const String& modelWeightsFilename, bool detectMultiscale = false); }; //! 
@} diff --git a/modules/text/samples/deeptextdetection.py b/modules/text/samples/deeptextdetection.py index 2e8395b60f1..09dcb24927d 100644 --- a/modules/text/samples/deeptextdetection.py +++ b/modules/text/samples/deeptextdetection.py @@ -1,57 +1,37 @@ # -*- coding: utf-8 -*- -""" -Created on Wed Jul 19 17:54:00 2017 - -@author: sgnosh -""" - #!/usr/bin/python - import sys import os - import cv2 import numpy as np -print('\nDeeptextdetection.py') -print(' A demo script of text box alogorithm of the paper:') -print(' * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network https://arxiv.org/abs/1611.06779\n') - - -if (len(sys.argv) < 2): - print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n') - quit() -#if not cv2.text.cnn_config.caffe_backend.getCaffeAvailable(): -# print"The text module was compiled without Caffe which is the only available DeepCNN backend.\nAborting!\n" -# -# quit() -# check model and architecture file existance -if not os.path.isfile('textbox.caffemodel') or not os.path.isfile('textbox_deploy.prototxt'): - print " Model files not found in current directory. 
Aborting" - print " Model files should be downloaded from https://github.com/sghoshcvc/TextBox-Models" - quit() -cv2.text.cnn_config.caffe_backend.setCaffeGpuMode(True); -pathname = os.path.dirname(sys.argv[0]) +def main(): + print('\nDeeptextdetection.py') + print(' A demo script of text box alogorithm of the paper:') + print(' * Minghui Liao et al.: TextBoxes: A Fast Text Detector with a Single Deep Neural Network https://arxiv.org/abs/1611.06779\n') + if (len(sys.argv) < 2): + print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n') + quit() -img = cv2.imread(str(sys.argv[1])) -textSpotter=cv2.text.textDetector_create( - "textbox_deploy.prototxt","textbox.caffemodel") -rects,outProbs = textSpotter.textDetectInImage(img); -# for visualization -vis = img.copy() -# Threshold to select rectangles : All rectangles for which outProbs is more than this threshold will be shown -thres = 0.6 + if not os.path.isfile('textbox.caffemodel') or not os.path.isfile('textbox_deploy.prototxt'): + print " Model files not found in current directory. 
Aborting" + print " Model files should be downloaded from https://github.com/sghoshcvc/TextBox-Models" + quit() + img = cv2.imread(str(sys.argv[1])) + textSpotter = cv2.text.TextDetectorCNN_create("textbox_deploy.prototxt","textbox.caffemodel") + rects, outProbs = textSpotter.textDetectInImage(img); + vis = img.copy() + thres = 0.6 - #Visualization -for r in range(0,np.shape(rects)[0]): - if outProbs[r] >thres: - rect = rects[r] - cv2.rectangle(vis, (rect[0],rect[1]), (rect[0]+rect[2],rect[1]+rect[3]), (255, 0, 0), 2) - # cv2.rectangle(vis, (rect[0],rect[1]), (rect[0]+rect[2],rect[1]+rect[3]), (255, 255, 255), 1) + for r in range(np.shape(rects)[0]): + if outProbs[r] > thres: + rect = rects[r] + cv2.rectangle(vis, (rect[0],rect[1]), (rect[0] + rect[2], rect[1] + rect[3]), (255, 0, 0), 2) + cv2.imshow("Text detection result", vis) + cv2.waitKey() -#Visualization -cv2.imshow("Text detection result", vis) -cv2.waitKey(0) \ No newline at end of file +if __name__ == "__main__": + main() diff --git a/modules/text/samples/textbox_demo.cpp b/modules/text/samples/textbox_demo.cpp index b76658e1b7a..9975c394730 100644 --- a/modules/text/samples/textbox_demo.cpp +++ b/modules/text/samples/textbox_demo.cpp @@ -1,151 +1,86 @@ -/* - * dictnet_demo.cpp - * - * Demonstrates simple use of the holistic word classifier in C++ - * - * Created on: June 26, 2016 - * Author: Anguelos Nicolaou - */ - -#include "opencv2/text.hpp" -#include "opencv2/highgui.hpp" -#include "opencv2/imgproc.hpp" +#include +#include +#include #include -#include #include -#include #include -void textbox_draw(cv::Mat &src, std::vector &groups,std::vector &probs,std::vector wordList,float thres); -inline std::string getHelpStr(std::string progFname){ - std::stringstream out; - out << " Demo of text detection CNN for text detection." 
<< std::endl; - out << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"< " << std::endl; - out << " Caffe Model files (textbox.caffemodel, textbox_deploy.prototxt)"< " << std::endl + << " Caffe Model files (textbox.caffemodel, textbox_deploy.prototxt)"< &groups,std::vector &probs,std::vector wordList,float thres=0.6) + +void textbox_draw(Mat src, std::vector& groups, std::vector& probs, float thres) { - for (int i=0;i<(int)groups.size(); i++) + for (size_t i = 0; i < groups.size(); i++) { - if(probs[i]>thres) + if(probs[i] > thres) { if (src.type() == CV_8UC3) { - cv::rectangle(src,groups.at(i).tl(),groups.at(i).br(),cv::Scalar( 0, 255, 255 ), 3, 8 ); - cv::putText(src, wordList[i],groups.at(i).tl() , cv::FONT_HERSHEY_PLAIN, 1, cv::Scalar( 0,0,255 )); + rectangle(src, groups[i], Scalar( 0, 255, 255 ), 2, LINE_AA); + String label = format("%.2f", probs[i]); + std::cout << "text box: " << groups[i] << " confidence: " << probs[i] << "\n"; + putText(src, label, groups.at(i).tl(), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,255 ), 1, LINE_AA); } else - rectangle(src,groups.at(i).tl(),groups.at(i).br(),cv::Scalar( 255 ), 3, 8 ); + rectangle(src, groups[i], Scalar( 255 ), 3, 8 ); } } } +} -int main(int argc, const char * argv[]){ - if(!cv::text::cnn_config::caffe_backend::getCaffeAvailable()){ - std::cout<<"The text module was compiled without Caffe which is the only available DeepCNN backend.\nAborting!\n"; - //exit(1); - } - std::vector backends=cv::text::cnn_config::getAvailableBackends(); - std::cout << "The Following backends are available" << "\n"; - for (int i=0;i textSpotter=cv::text::textDetector::create( - "textbox_deploy.prototxt","textbox.caffemodel"); + std::cout << "Starting Text Box Demo" << std::endl; + Ptr textSpotter = + text::TextDetectorCNN::create("textbox_deploy.prototxt","textbox.caffemodel", false); - //cv::Ptr wordSpotter= - // cv::text::textDetector::create(cnn); - std::cout<<"Created Text Spotter with text 
Boxes"; - - std::vector bbox; + std::vector bbox; std::vector outProbabillities; - textSpotter->textDetectInImage(image,bbox,outProbabillities); - // textbox_draw(image, bbox,outProbabillities); - float thres =0.6f; - std::vector imageList; - for(int imageIdx=0;imageIdx<(int)bbox.size();imageIdx++){ - if(outProbabillities[imageIdx]>thres){ - imageList.push_back(image(bbox.at(imageIdx))); - } - - } - // call dict net here for all detected parts - cv::Ptr cnn=cv::text::DeepCNN::createDictNet( - "dictnet_vgg_deploy.prototxt","dictnet_vgg.caffemodel",cv::text::OCR_HOLISTIC_BACKEND_DNN); - - cv::Ptr wordSpotter= - cv::text::OCRHolisticWordRecognizer::create(cnn,"dictnet_vgg_labels.txt"); - - std::vector wordList; - std::vector wordProbabillities; - wordSpotter->recogniseImageBatch(imageList,wordList,wordProbabillities); - // write the output in file - std::ofstream out; - out.open(argv[1]); - - - for (int i=0;i<(int)wordList.size(); i++) - { - cv::Point tl_ = bbox.at(i).tl(); - cv::Point br_ = bbox.at(i).br(); - - out<textDetectInImage(image, bbox, outProbabillities); + textbox_draw(image, bbox, outProbabillities, 0.5f); - cv::imshow("TextBox Demo",image); + imshow("TextBox Demo",image); std::cout << "Done!" << std::endl << std::endl; std::cout << "Press any key to exit." 
<< std::endl << std::endl; - if ((cv::waitKey()&0xff) == ' ') - return 0; + waitKey(); + return 0; } diff --git a/modules/text/src/image_preprocessor.cpp b/modules/text/src/image_preprocessor.cpp deleted file mode 100644 index 3a65a210863..00000000000 --- a/modules/text/src/image_preprocessor.cpp +++ /dev/null @@ -1,387 +0,0 @@ -#include "precomp.hpp" -#include "opencv2/imgproc.hpp" -#include "opencv2/highgui.hpp" -#include "opencv2/core.hpp" - - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cv { namespace text { -//************************************************************************************ -//****************** ImagePreprocessor ******************************************* -//************************************************************************************ - -void ImagePreprocessor::preprocess(InputArray input,OutputArray output,Size sz,int outputChannels){ - Mat inpImg=input.getMat(); - Mat outImg; - this->preprocess_(inpImg,outImg,sz,outputChannels); - outImg.copyTo(output); -} -void ImagePreprocessor::set_mean(Mat mean){ - - - this->set_mean_(mean); - -} - - - -class ResizerPreprocessor: public ImagePreprocessor{ -protected: - void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ - //TODO put all the logic of channel and depth conversions in ImageProcessor class - CV_Assert(outputChannels==1 || outputChannels==3); - CV_Assert(input.channels()==1 || input.channels()==3); - if(input.channels()!=outputChannels) - { - Mat tmpInput; - if(outputChannels==1){ - cvtColor(input,tmpInput,COLOR_BGR2GRAY); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC1); - } - }else - { - cvtColor(input,tmpInput,COLOR_GRAY2BGR); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired 
[0,1] range - tmpInput.convertTo(output, CV_32FC3); - } - } - }else - { - if(input.channels()==1) - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC1); - } - }else - { - if(input.depth()==CV_8U){ - input.convertTo(output, CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC3); - } - } - } - if(outputSize.width!=0 && outputSize.height!=0) - { - resize(output,output,outputSize); - } - } - //void set_mean_(Mat m){} -public: - ResizerPreprocessor(){} - ~ResizerPreprocessor(){} -}; - -class StandarizerPreprocessor: public ImagePreprocessor{ -protected: - double sigma_; - //void set_mean_(Mat M){} - - void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ - - //TODO put all the logic of channel and depth conversions in ImageProcessor class - CV_Assert(outputChannels==1 || outputChannels==3); - CV_Assert(input.channels()==1 || input.channels()==3); - if(input.channels()!=outputChannels) - { - Mat tmpInput; - if(outputChannels==1) - { - cvtColor(input,tmpInput,COLOR_BGR2GRAY); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC1); - } - }else - { - cvtColor(input,tmpInput,COLOR_GRAY2BGR); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC3); - } - } - }else - { - if(input.channels()==1) - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC1); - } - }else - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - 
input.convertTo(output, CV_32FC3); - } - } - } - if(outputSize.width!=0 && outputSize.height!=0) - { - resize(output,output,outputSize); - } - - Scalar mean,dev; - meanStdDev(output,mean,dev); - subtract(output,mean[0],output); - divide(output,(dev[0]/sigma_),output); - } -public: - StandarizerPreprocessor(double sigma):sigma_(sigma){} - ~StandarizerPreprocessor(){} - -}; - -class customPreprocessor:public ImagePreprocessor{ -protected: - - double rawval_; - Mat mean_; - String channel_order_; - - void set_mean_(Mat imMean_){ - - imMean_.copyTo(this->mean_); - - - } - - void set_raw_scale(int rawval){ - rawval_ = rawval; - - } - void set_channels(String channel_order){ - channel_order_=channel_order; - } - - - void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ - //TODO put all the logic of channel and depth conversions in ImageProcessor class - - CV_Assert(outputChannels==1 || outputChannels==3); - CV_Assert(input.channels()==1 || input.channels()==3); - if(input.channels()!=outputChannels) - { - Mat tmpInput; - if(outputChannels==1) - { - cvtColor(input,tmpInput,COLOR_BGR2GRAY); - if(input.depth()==CV_8U) - { - if (rawval_ == 1) - tmpInput.convertTo(output,CV_32FC3,1/255.0); - else - tmpInput.convertTo(output,CV_32FC1); - }else - {//Assuming values are at the desired [0,1] range - if (rawval_ ==1) - tmpInput.convertTo(output, CV_32FC1); - else - tmpInput.convertTo(output, CV_32FC1,rawval_); - } - }else - { - cvtColor(input,tmpInput,COLOR_GRAY2BGR); - if(input.depth()==CV_8U) - { - if (rawval_ == 1) - tmpInput.convertTo(output,CV_32FC3,1/255.0); - else - tmpInput.convertTo(output,CV_32FC1); - }else - {//Assuming values are at the desired [0,1] range - if (rawval_ ==1) - tmpInput.convertTo(output, CV_32FC1); - else - tmpInput.convertTo(output, CV_32FC1,rawval_); - } - } - }else - { - if(input.channels()==1) - { - if(input.depth()==CV_8U) - { - if (rawval_ == 1) - input.convertTo(output,CV_32FC1,1/255.0); - else - 
input.convertTo(output,CV_32FC1); - }else - {//Assuming values are at the desired [0,1] range - if (rawval_ ==1) - input.convertTo(output, CV_32FC1); - else - input.convertTo(output, CV_32FC1,rawval_); - } - }else - { - if(input.depth()==CV_8U) - { - if (rawval_ == 1) - input.convertTo(output,CV_32FC3,1/255.0); - else - input.convertTo(output,CV_32FC3); - }else - {//Assuming values are at the desired [0,1] range - if (rawval_ ==1) - input.convertTo(output, CV_32FC3); - else - input.convertTo(output, CV_32FC3,rawval_); - } - } - } - if(outputSize.width!=0 && outputSize.height!=0) - { - resize(output,output,outputSize); - } - - if (!this->mean_.empty()){ - - Scalar mean_s(this->mean_.at(0,0),this->mean_.at(0,1),this->mean_.at(0,2)); - subtract(output,mean_s,output); - } - else{ - Scalar mean_s; - mean_s = mean(output); - subtract(output,mean_s,output); - } - - } - -public: - customPreprocessor( double rawval,String channel_order):rawval_(rawval),channel_order_(channel_order){} - ~customPreprocessor(){} - -}; - -class MeanSubtractorPreprocessor: public ImagePreprocessor{ -protected: - Mat mean_; - //void set_mean_(Mat m){} - void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){ - //TODO put all the logic of channel and depth conversions in ImageProcessor class - CV_Assert(this->mean_.cols==outputSize.width && this->mean_.rows ==outputSize.height); - CV_Assert(outputChannels==1 || outputChannels==3); - CV_Assert(input.channels()==1 || input.channels()==3); - if(input.channels()!=outputChannels) - { - Mat tmpInput; - if(outputChannels==1) - { - cvtColor(input,tmpInput,COLOR_BGR2GRAY); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - tmpInput.convertTo(output, CV_32FC1); - } - }else - { - cvtColor(input,tmpInput,COLOR_GRAY2BGR); - if(input.depth()==CV_8U) - { - tmpInput.convertTo(output,CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired 
[0,1] range - tmpInput.convertTo(output, CV_32FC3); - } - } - }else - { - if(input.channels()==1) - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC1,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC1); - } - }else - { - if(input.depth()==CV_8U) - { - input.convertTo(output, CV_32FC3,1/255.0); - }else - {//Assuming values are at the desired [0,1] range - input.convertTo(output, CV_32FC3); - } - } - } - if(outputSize.width!=0 && outputSize.height!=0) - { - resize(output,output,outputSize); - } - subtract(output,this->mean_,output); - } -public: - MeanSubtractorPreprocessor(Mat mean) - { - mean.copyTo(this->mean_); - } - - ~MeanSubtractorPreprocessor(){} -}; - - - -Ptr ImagePreprocessor::createResizer() -{ - return Ptr(new ResizerPreprocessor); -} - -Ptr ImagePreprocessor::createImageStandarizer(double sigma) -{ - return Ptr(new StandarizerPreprocessor(sigma)); -} -Ptr ImagePreprocessor::createImageCustomPreprocessor(double rawval,String channel_order) -{ - - return Ptr(new customPreprocessor(rawval,channel_order)); -} - -Ptr ImagePreprocessor::createImageMeanSubtractor(InputArray meanImg) -{ - Mat tmp=meanImg.getMat(); - return Ptr(new MeanSubtractorPreprocessor(tmp)); -} -} -} diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp deleted file mode 100644 index 035f104f28a..00000000000 --- a/modules/text/src/ocr_holistic.cpp +++ /dev/null @@ -1,697 +0,0 @@ -#include "precomp.hpp" -#include "opencv2/imgproc.hpp" -#include "opencv2/highgui.hpp" -#include "opencv2/core.hpp" - - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -#ifdef HAVE_CAFFE -#include "caffe/caffe.hpp" -#endif - -#ifdef HAVE_DNN -#include "opencv2/dnn.hpp" -#endif - -using namespace cv; -using namespace cv::dnn; -using namespace std; -namespace cv { namespace text { - -//Maybe OpenCV has a routine better suited -inline bool fileExists (String 
filename) { - std::ifstream f(filename.c_str()); - return f.good(); -} - - - -//************************************************************************************ -//****************** TextImageClassifier ***************************************** -//************************************************************************************ - -void TextImageClassifier::preprocess(const Mat& input,Mat& output) -{ - this->preprocessor_->preprocess_(input,output,this->inputGeometry_,this->channelCount_); -} - -void TextImageClassifier::setPreprocessor(Ptr ptr) -{ - CV_Assert(!ptr.empty()); - preprocessor_=ptr; -} - -Ptr TextImageClassifier::getPreprocessor() -{ - return preprocessor_; -} - - -class DeepCNNCaffeImpl: public DeepCNN{ -protected: - void classifyMiniBatch(std::vector inputImageList, Mat outputMat) - { - //Classifies a list of images containing at most minibatchSz_ images - CV_Assert(int(inputImageList.size())<=this->minibatchSz_); - CV_Assert(outputMat.isContinuous()); - - -#ifdef HAVE_CAFFE - net_->input_blobs()[0]->Reshape(inputImageList.size(), this->channelCount_,this->inputGeometry_.height,this->inputGeometry_.width); - net_->Reshape(); - float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); - float* inputData=inputBuffer; - - for(size_t imgNum=0;imgNum input_channels; - Mat preprocessed; - // if the image have multiple color channels the input layer should be populated accordingly - for (int channel=0;channel < this->channelCount_;channel++){ - - cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); - input_channels.push_back(netInputWraped); - //input_data += width * height; - inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); - - } - this->preprocess(inputImageList[imgNum],preprocessed); - split(preprocessed, input_channels); - - - } - this->net_->ForwardPrefilled(); - const float* outputNetData=net_->output_blobs()[0]->cpu_data(); - this->outputGeometry_ = 
Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); - int outputSz = this->outputSize_ * this->outputGeometry_.height * this->outputGeometry_.width; - - - //outputMat.resize(this->outputGeometry_.height * this->outputGeometry_.width); - float*outputMatData=(float*)(outputMat.data); - memcpy(outputMatData,outputNetData,sizeof(float)*outputSz*inputImageList.size()); - -#endif - } - -#ifdef HAVE_CAFFE - Ptr > net_; -#endif - //Size inputGeometry_;//=Size(100,32); - int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst - int outputSize_; - //Size outputGeometry_; -public: - DeepCNNCaffeImpl(const DeepCNNCaffeImpl& dn): - minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ - channelCount_=dn.channelCount_; - inputGeometry_=dn.inputGeometry_; - //Implemented to supress Visual Studio warning "assignment operator could not be generated" -#ifdef HAVE_CAFFE - this->net_=dn.net_; -#endif - } - DeepCNNCaffeImpl& operator=(const DeepCNNCaffeImpl &dn) - { -#ifdef HAVE_CAFFE - this->net_=dn.net_; -#endif - this->setPreprocessor(dn.preprocessor_); - this->inputGeometry_=dn.inputGeometry_; - this->channelCount_=dn.channelCount_; - this->minibatchSz_=dn.minibatchSz_; - this->outputSize_=dn.outputSize_; - this->preprocessor_=dn.preprocessor_; - this->outputGeometry_=dn.outputGeometry_; - return *this; - //Implemented to supress Visual Studio warning "assignment operator could not be generated" - } - - DeepCNNCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) - :minibatchSz_(maxMinibatchSz) - { - - CV_Assert(this->minibatchSz_>0); - CV_Assert(fileExists(modelArchFilename)); - CV_Assert(fileExists(modelWeightsFilename)); - CV_Assert(!preprocessor.empty()); - this->setPreprocessor(preprocessor); -#ifdef HAVE_CAFFE - this->net_.reset(new caffe::Net(modelArchFilename, caffe::TEST)); - CV_Assert(net_->num_inputs()==1); - CV_Assert(net_->num_outputs()==1); - 
CV_Assert(this->net_->input_blobs()[0]->channels()==1 - ||this->net_->input_blobs()[0]->channels()==3); - this->channelCount_=this->net_->input_blobs()[0]->channels(); - - - - this->net_->CopyTrainedLayersFrom(modelWeightsFilename); - - caffe::Blob* inputLayer = this->net_->input_blobs()[0]; - - this->inputGeometry_=Size(inputLayer->width(), inputLayer->height()); - this->channelCount_ = inputLayer->channels(); - - inputLayer->Reshape(this->minibatchSz_,this->channelCount_,this->inputGeometry_.height, this->inputGeometry_.width); - net_->Reshape(); - this->outputSize_=net_->output_blobs()[0]->channels(); - this->outputGeometry_ = Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); - - - - - -#else - CV_Error(Error::StsError,"Caffe not available during compilation!"); -#endif - } - - void classify(InputArray image, OutputArray classProbabilities) - { - std::vector inputImageList; - inputImageList.push_back(image.getMat()); - classifyBatch(inputImageList,classProbabilities); - } - - void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities) - { - std::vector allImageVector; - inputImageList.getMatVector(allImageVector); - size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmentic - - size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmentic - classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F); - Mat outputMat = classProbabilities.getMat(); - for(size_t imgNum=0;imgNum(allImageVector.size()-imgNum,minibatchSize); - std::vector::const_iterator from=std::vector::const_iterator(allImageVector.begin()+imgNum); - std::vector::const_iterator to=std::vector::const_iterator(allImageVector.begin()+rangeEnd); - std::vector minibatchInput(from,to); - classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd))); - - } - - } - - int getOutputSize() - { - return this->outputSize_; - } - Size 
getOutputGeometry() - { - return this->outputGeometry_; - } - - int getMinibatchSize() - { - return this->minibatchSz_; - } - - int getBackend() - { - return OCR_HOLISTIC_BACKEND_CAFFE; - } -}; - -class DeepCNNOpenCvDNNImpl: public DeepCNN{ -protected: - - void classifyMiniBatch(std::vector inputImageList, Mat outputMat) - { - //Classifies a list of images containing at most minibatchSz_ images - CV_Assert(int(inputImageList.size())<=this->minibatchSz_); - CV_Assert(outputMat.isContinuous()); - -#ifdef HAVE_DNN - - std::vector preProcessedImList; // to store preprocessed images, should it be handled inside preprocessing class? - - Mat preprocessed; - // preprocesses each image in the inputImageList and push to preprocessedImList - for(size_t imgNum=0;imgNumpreprocess(inputImageList[imgNum],preprocessed); - preProcessedImList.push_back(preprocessed); - } - // set input data blob in dnn::net - net_->setInput(blobFromImages(preProcessedImList,1, this->inputGeometry_), "data"); - - float*outputMatData=(float*)(outputMat.data); - //Mat outputNet(inputImageList.size(),this->outputSize_,CV_32FC1,outputMatData) ; - Mat outputNet = this->net_->forward(); - outputNet = outputNet.reshape(1, 1); - - float*outputNetData=(float*)(outputNet.data); - - memcpy(outputMatData,outputNetData,sizeof(float)*this->outputSize_*inputImageList.size()); - -#endif - } - -#ifdef HAVE_DNN - Ptr net_; -#endif - // hard coding input image size. anything in DNN library to get that from prototxt?? 
- // Size inputGeometry_;//=Size(100,32); - int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst - int outputSize_; - //Size outputGeometry_;//= Size(1,1); - //int channelCount_; - // int inputChannel_ ;//=1; - // int _inputHeight; - //int _inputWidth ; - //int _inputChannel ; -public: - DeepCNNOpenCvDNNImpl(const DeepCNNOpenCvDNNImpl& dn): - minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){ - channelCount_=dn.channelCount_; - inputGeometry_=dn.inputGeometry_; - //Implemented to supress Visual Studio warning "assignment operator could not be generated" -#ifdef HAVE_DNN - this->net_=dn.net_; -#endif - } - DeepCNNOpenCvDNNImpl& operator=(const DeepCNNOpenCvDNNImpl &dn) - { -#ifdef HAVE_DNN - this->net_=dn.net_; -#endif - this->setPreprocessor(dn.preprocessor_); - this->inputGeometry_=dn.inputGeometry_; - this->channelCount_=dn.channelCount_; - this->minibatchSz_=dn.minibatchSz_; - this->outputSize_=dn.outputSize_; - this->preprocessor_=dn.preprocessor_; - this->outputGeometry_=dn.outputGeometry_; - return *this; - //Implemented to supress Visual Studio warning "assignment operator could not be generated" - } - - DeepCNNOpenCvDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz,int inputWidth ,int inputHeight ,int inputChannel ) - :minibatchSz_(maxMinibatchSz) - { - - CV_Assert(this->minibatchSz_>0); - CV_Assert(fileExists(modelArchFilename)); - CV_Assert(fileExists(modelWeightsFilename)); - CV_Assert(!preprocessor.empty()); - this->setPreprocessor(preprocessor); -#ifdef HAVE_DNN - - this->net_ = makePtr(readNetFromCaffe(modelArchFilename,modelWeightsFilename)); - - - - if (this->net_.empty()) - { - std::cerr << "Can't load network by using the following files: " << std::endl; - std::cerr << "prototxt: " << modelArchFilename << std::endl; - std::cerr << "caffemodel: " << modelWeightsFilename << std::endl; - //std::cerr << "bvlc_googlenet.caffemodel can be downloaded here:" << 
std::endl; - //std::cerr << "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel" << std::endl; - exit(-1); - } - - - this->inputGeometry_=Size(inputWidth,inputHeight);// Size(inputLayer->width(), inputLayer->height()); - this->channelCount_ = inputChannel;//inputLayer->channels(); - - //inputLayer->Reshape(this->minibatchSz_,this->channelCount_,this->inputGeometry_.height, this->inputGeometry_.width); - Ptr< Layer > outLayer= net_->getLayer (net_->getLayerId (net_->getLayerNames()[net_->getLayerNames().size()-2])); - //std::vector blobs = outLayer->blobs; - - this->outputSize_=(outLayer->blobs)[1].size[0] ;//net_->output_blobs()[0]->channels(); - //this->outputGeometry_ = Size(1,1);//Size(net_->output_blobs()[0]->width(),net_->output_blobs()[0]->height()); - - - - - - -#else - CV_Error(Error::StsError,"DNN module not available during compilation!"); -#endif - } - - void classify(InputArray image, OutputArray classProbabilities) - { - std::vector inputImageList; - inputImageList.push_back(image.getMat()); - classifyBatch(inputImageList,classProbabilities); - } - - void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities) - { - std::vector allImageVector; - inputImageList.getMatVector(allImageVector); - size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmentic - - size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmentic - classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F); - Mat outputMat = classProbabilities.getMat(); - - for(size_t imgNum=0;imgNum(allImageVector.size()-imgNum,minibatchSize); - std::vector::const_iterator from=std::vector::const_iterator(allImageVector.begin()+imgNum); - std::vector::const_iterator to=std::vector::const_iterator(allImageVector.begin()+rangeEnd); - std::vector minibatchInput(from,to); - classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd))); - - } - - } 
- - int getOutputSize() - { - return this->outputSize_; - } - Size getOutputGeometry() - { - return this->outputGeometry_; - } - - int getMinibatchSize() - { - return this->minibatchSz_; - } - - int getBackend() - { - return OCR_HOLISTIC_BACKEND_DNN; - } -}; - -Ptr DeepCNN::create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz,int backEnd) -{ - if(preprocessor.empty()) - { - preprocessor=ImagePreprocessor::createResizer(); - } - switch(backEnd){ - case OCR_HOLISTIC_BACKEND_DEFAULT: - -#ifdef HAVE_CAFFE - return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); - -#elif defined(HAVE_DNN) - return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,100,32,1)); -#else - CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); - return Ptr(); -#endif - break; - - case OCR_HOLISTIC_BACKEND_CAFFE: - return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); - break; - case OCR_HOLISTIC_BACKEND_DNN: - return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,100,32,1)); - break; - case OCR_HOLISTIC_BACKEND_NONE: - default: - CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); - return Ptr(); - break; - } -} - - -Ptr DeepCNN::createDictNet(String archFilename,String weightsFilename,int backEnd) -{ - Ptr preprocessor=ImagePreprocessor::createImageStandarizer(113); - switch(backEnd){ - case OCR_HOLISTIC_BACKEND_DEFAULT: - -#ifdef HAVE_CAFFE - return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); - -#elif defined(HAVE_DNN) - return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, 100,100,32,1)); -#else - CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); - return Ptr(); -#endif - break; - - case OCR_HOLISTIC_BACKEND_CAFFE: - return Ptr(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100)); - break; - 
case OCR_HOLISTIC_BACKEND_DNN: - return Ptr(new DeepCNNOpenCvDNNImpl(archFilename, weightsFilename,preprocessor, 100,100,32,1)); - break; - case OCR_HOLISTIC_BACKEND_NONE: - default: - CV_Error(Error::StsError,"DeepCNN::create backend not implemented"); - return Ptr(); - break; - } -} - -namespace cnn_config{ -std::vector getAvailableBackends() -{ - std::vector backends; - -#ifdef HAVE_CAFFE - backends.push_back("CAFFE, OCR_HOLISTIC_BACKEND_CAFFE"); // dnn backend opencv_dnn - -#endif -#ifdef HAVE_DNN - backends.push_back("DNN, OCR_HOLISTIC_BACKEND_DNN");// opencv_dnn based backend" -#endif - return backends; - - -} - -namespace caffe_backend{ - -#ifdef HAVE_CAFFE - -bool getCaffeGpuMode() -{ - return caffe::Caffe::mode()==caffe::Caffe::GPU; -} - -void setCaffeGpuMode(bool useGpu) -{ - if(useGpu) - { - caffe::Caffe::set_mode(caffe::Caffe::GPU); - }else - { - caffe::Caffe::set_mode(caffe::Caffe::CPU); - } -} - -bool getCaffeAvailable() -{ - return true; -} -#else - -bool getCaffeGpuMode() -{ - CV_Error(Error::StsError,"Caffe not available during compilation!"); - return 0; -} - -void setCaffeGpuMode(bool useGpu) -{ - CV_Error(Error::StsError,"Caffe not available during compilation!"); - CV_Assert(useGpu==1);//Compilation directives force -} - -bool getCaffeAvailable(){ - return 0; -} - -#endif - -}//namespace caffe -namespace dnn_backend{ -#ifdef HAVE_DNN - - -bool getDNNAvailable(){ - return true; -} -#else -bool getDNNAvailable(){ - return 0; -} -#endif -}//namspace dnn_backend -}//namespace cnn_config - -class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{ -private: - struct NetOutput{ - //Auxiliary structure that handles the logic of getting class ids and probabillities from - //the raw outputs of caffe - int wordIdx; - float probabillity; - - static bool sorter(const NetOutput& o1,const NetOutput& o2) - {//used with std::sort to provide the most probable class - return o1.probabillity>o2.probabillity; - } - - static void getOutputs(const 
float* buffer,int nbOutputs,std::vector& res) - { - res.resize(nbOutputs); - for(int k=0;k tmp; - getOutputs(buffer,nbOutputs,tmp); - classNum=tmp[0].wordIdx; - confidence=tmp[0].probabillity; - - } - }; -protected: - std::vector labels_; - Ptr classifier_; -public: - OCRHolisticWordRecognizerImpl(Ptr classifierPtr,String vocabularyFilename):classifier_(classifierPtr) - { - CV_Assert(fileExists(vocabularyFilename));//this fails for some rason - std::ifstream labelsFile(vocabularyFilename.c_str()); - if(!labelsFile) - { - CV_Error(Error::StsError,"Could not read Labels from file"); - } - std::string line; - while (std::getline(labelsFile, line)) - { - labels_.push_back(std::string(line)); - } - CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size())); - } - - OCRHolisticWordRecognizerImpl(Ptr classifierPtr,const std::vector& vocabulary):classifier_(classifierPtr) - { - this->labels_=vocabulary; - CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size())); - } - - void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence) - { - Mat netOutput; - this->classifier_->classify(inputImage,netOutput); - int classNum; - NetOutput::getClassification((float*)(netOutput.data),this->classifier_->getOutputSize(),classNum,confidence); - transcription=this->labels_[classNum]; - } - - void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector& transcriptionVec,CV_OUT std::vector& confidenceVec) - { - Mat netOutput; - this->classifier_->classifyBatch(inputImageList,netOutput); - - for(int k=0;kclassifier_->getOutputSize(),classNum,confidence); - transcriptionVec.push_back(this->labels_[classNum]); - confidenceVec.push_back(confidence); - } - } - - - void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=0) - { - CV_Assert(component_level==OCR_LEVEL_WORD);//Componnents not 
applicable for word spotting - double confidence; - String transcription; - recogniseImage(image,transcription,confidence); - output_text=transcription.c_str(); - if(component_rects!=NULL) - { - component_rects->resize(1); - (*component_rects)[0]=Rect(0,0,image.size().width,image.size().height); - } - if(component_texts!=NULL) - { - component_texts->resize(1); - (*component_texts)[0]=transcription.c_str(); - } - if(component_confidences!=NULL) - { - component_confidences->resize(1); - (*component_confidences)[0]=float(confidence); - } - } - - void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=0) - { - CV_Assert(mask.cols==image.cols && mask.rows== image.rows);//Mask is ignored because the CNN operates on a full image - this->run(image,output_text,component_rects,component_texts,component_confidences,component_level); - } - - std::vector& getVocabulary() - { - return this->labels_; - } - - Ptr getClassifier() - { - return this->classifier_; - } -}; - -Ptr OCRHolisticWordRecognizer::create(Ptr classifierPtr,String vocabularyFilename ) -{ - return Ptr(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabularyFilename)); -} - -Ptr OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename, String vocabularyFilename) -{ - Ptr preprocessor=ImagePreprocessor::createImageStandarizer(113); - Ptr classifierPtr(new DeepCNNCaffeImpl(modelArchFilename,modelWeightsFilename,preprocessor,100)); - return Ptr(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabularyFilename)); -} - -Ptr OCRHolisticWordRecognizer::create(Ptr classifierPtr,const std::vector& vocabulary) -{ - return Ptr(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabulary)); -} - -Ptr OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename,const std::vector& vocabulary){ - Ptr 
preprocessor=ImagePreprocessor::createImageStandarizer(113); - Ptr classifierPtr(new DeepCNNCaffeImpl(modelArchFilename,modelWeightsFilename,preprocessor,100)); - return Ptr(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabulary)); -} - - - - - -} } //namespace text namespace cv diff --git a/modules/text/src/precomp.hpp b/modules/text/src/precomp.hpp index e85e4eb85cb..7ccda150f37 100644 --- a/modules/text/src/precomp.hpp +++ b/modules/text/src/precomp.hpp @@ -45,6 +45,8 @@ #include "opencv2/text.hpp" +#include "text_config.hpp" + #ifdef HAVE_TESSERACT #if !defined(USE_STD_NAMESPACE) #define USE_STD_NAMESPACE diff --git a/modules/text/src/text_detector.cpp b/modules/text/src/text_detector.cpp deleted file mode 100644 index 949f5f86dc4..00000000000 --- a/modules/text/src/text_detector.cpp +++ /dev/null @@ -1,169 +0,0 @@ -#include "precomp.hpp" -#include "opencv2/imgproc.hpp" -#include "opencv2/core.hpp" - - - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -//#ifdef HAVE_CAFFE -//#include "caffe/caffe.hpp" -//#endif - -namespace cv { namespace text { - - - - -class textDetectImpl: public textDetector{ -private: - struct NetOutput{ - //Auxiliary structure that handles the logic of getting bounding box and confidences of textness from - //the raw outputs of caffe - Rect bbox; - float probability; - - - static void getOutputs(const float* buffer,int nbrTextBoxes,int nCol,std::vector& res,Size inputShape) - { - - res.resize(nbrTextBoxes); - for(int k=0;k inputShape.width?inputShape.width-1:x_max; - y_max = y_max > inputShape.height?inputShape.height-1:y_max; - float wd = x_max-x_min+1; - float ht = y_max-y_min+1; - - res[k].bbox=Rect(int(x_min),int(y_min),int(wd),int(ht)); - - res[k].probability=buffer[k*nCol+2]; - } - - } - - - }; -protected: - - Ptr classifier_; -public: - textDetectImpl(Ptr classifierPtr):classifier_(classifierPtr) - { - - } - - - - void textDetectInImage(InputArray inputImage,CV_OUT 
std::vector& Bbox,CV_OUT std::vector& confidence) - { - Mat netOutput; - // call the detect function of deepTextCNN class - this->classifier_->detect(inputImage,netOutput); - // get the output geometry i.e height and width of output blob from caffe - Size OutputGeometry_ = this->classifier_->getOutputGeometry(); - int nbrTextBoxes = OutputGeometry_.height; - int nCol = OutputGeometry_.width; - - std::vector tmp; - // the output bounding box needs to be resized by the input height and width - Size inputImageShape = Size(inputImage.cols(),inputImage.rows()); - NetOutput::getOutputs((float*)(netOutput.data),nbrTextBoxes,nCol,tmp,inputImageShape); - // put the output in CV_OUT - - for (int k=0;k* component_rects=NULL, - std::vector* component_confidences=NULL, - int component_level=0) - { - CV_Assert(component_level==OCR_LEVEL_WORD);//Componnents not applicable for word spotting - - std::vector bbox; - std::vector score; - textDetectInImage(image,bbox,score); - - if(component_rects!=NULL) - { - component_rects->resize(bbox.size()); // should be a user behavior - - component_rects = &bbox; - } - - if(component_confidences!=NULL) - { - component_confidences->resize(score.size()); // shoub be a user behavior - - component_confidences = &score; - } - } - - void run(Mat& image, Mat& mask, std::vector* component_rects=NULL, - std::vector* component_confidences=NULL, - int component_level=0) - { - CV_Assert(mask.cols==image.cols && mask.rows== image.rows);//Mask is ignored because the CNN operates on a full image - this->run(image,component_rects,component_confidences,component_level); - } - - - - Ptr getClassifier() - { - return this->classifier_; - } -}; - -Ptr textDetector::create(Ptr classifierPtr) -{ - return Ptr(new textDetectImpl(classifierPtr)); -} - -Ptr textDetector::create(String modelArchFilename, String modelWeightsFilename) -{ - -// create a custom preprocessor with rawval - Ptr preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255); -// set the mean 
for the preprocessor - - Mat textbox_mean(1,3,CV_8U); - textbox_mean.at(0,0)=104; - textbox_mean.at(0,1)=117; - textbox_mean.at(0,2)=123; - preprocessor->set_mean(textbox_mean); -// create a pointer to text box detector(textDetector) - Ptr classifierPtr(DeepCNNTextDetector::create(modelArchFilename,modelWeightsFilename,preprocessor,1)); - return Ptr(new textDetectImpl(classifierPtr)); -} - - - - - - - -} } //namespace text namespace cv diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp index 5267b390fed..1c3933fda47 100644 --- a/modules/text/src/text_detectorCNN.cpp +++ b/modules/text/src/text_detectorCNN.cpp @@ -1,453 +1,101 @@ +// This file is part of OpenCV project. +// It is subject to the license terms in the LICENSE file found in the top-level directory +// of this distribution and at http://opencv.org/license.html. + #include "precomp.hpp" #include "opencv2/imgproc.hpp" #include "opencv2/core.hpp" - - -#include #include -#include -#include #include -#include -#include -#include -#include -#include - - -#ifdef HAVE_CAFFE -#include "caffe/caffe.hpp" -#endif -#ifdef HAVE_DNN #include "opencv2/dnn.hpp" -#endif using namespace cv::dnn; -#define CV_WARN(message) fprintf(stderr, "warning: %s (%s:%d)\n", message, __FILE__, __LINE__) - -namespace cv { namespace text { - -inline bool fileExists (String filename) { - std::ifstream f(filename.c_str()); - return f.good(); -} - -class DeepCNNTextDetectorCaffeImpl: public DeepCNNTextDetector{ -protected: - - - void process_(Mat inputImage, Mat &outputMat) - { - // do forward pass and stores the output in outputMat - CV_Assert(outputMat.isContinuous()); - if (inputImage.channels() != this->inputChannelCount_) - CV_WARN("Number of input channel(s) in the model is not same as input"); - - -#ifdef HAVE_CAFFE - net_->input_blobs()[0]->Reshape(1, this->inputChannelCount_,this->inputGeometry_.height,this->inputGeometry_.width); - net_->Reshape(); - float* 
inputBuffer=net_->input_blobs()[0]->mutable_cpu_data(); - float* inputData=inputBuffer; - - std::vector input_channels; - Mat preprocessed; - // if the image have multiple color channels the input layer should be populated accordingly - for (int channel=0;channel < this->inputChannelCount_;channel++){ - - cv::Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData); - input_channels.push_back(netInputWraped); - //input_data += width * height; - inputData+=(this->inputGeometry_.height*this->inputGeometry_.width); - } - this->preprocess(inputImage,preprocessed); - split(preprocessed, input_channels); - - //preprocessed.copyTo(netInputWraped); - - - this->net_->Forward(); - const float* outputNetData=net_->output_blobs()[0]->cpu_data(); - // const float* outputNetData1=net_->output_blobs()[1]->cpu_data(); - - - - - this->outputGeometry_.height = net_->output_blobs()[0]->height(); - this->outputGeometry_.width = net_->output_blobs()[0]->width(); - this->outputChannelCount_ = net_->output_blobs()[0]->channels(); - int outputSz = this->outputChannelCount_ * this->outputGeometry_.height * this->outputGeometry_.width; - outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); - float*outputMatData=(float*)(outputMat.data); - - memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); - - - -#endif - } - - -#ifdef HAVE_CAFFE - Ptr > net_; -#endif - //Size inputGeometry_; - int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst - //int outputSize_; -public: - DeepCNNTextDetectorCaffeImpl(const DeepCNNTextDetectorCaffeImpl& dn): - minibatchSz_(dn.minibatchSz_){ - outputGeometry_=dn.outputGeometry_; - inputGeometry_=dn.inputGeometry_; - //Implemented to supress Visual Studio warning "assignment operator could not be generated" -#ifdef HAVE_CAFFE - this->net_=dn.net_; -#endif - } - DeepCNNTextDetectorCaffeImpl& operator=(const DeepCNNTextDetectorCaffeImpl &dn) - { -#ifdef 
HAVE_CAFFE - this->net_=dn.net_; -#endif - this->setPreprocessor(dn.preprocessor_); - this->inputGeometry_=dn.inputGeometry_; - this->inputChannelCount_=dn.inputChannelCount_; - this->outputChannelCount_ = dn.outputChannelCount_; - // this->minibatchSz_=dn.minibatchSz_; - //this->outputGeometry_=dn.outputSize_; - this->preprocessor_=dn.preprocessor_; - this->outputGeometry_=dn.outputGeometry_; - return *this; - //Implemented to supress Visual Studio warning "assignment operator could not be generated" - } - - DeepCNNTextDetectorCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz) - :minibatchSz_(maxMinibatchSz) - { - - CV_Assert(this->minibatchSz_>0); - CV_Assert(fileExists(modelArchFilename)); - CV_Assert(fileExists(modelWeightsFilename)); - CV_Assert(!preprocessor.empty()); - this->setPreprocessor(preprocessor); -#ifdef HAVE_CAFFE - this->net_.reset(new caffe::Net(modelArchFilename, caffe::TEST)); - CV_Assert(net_->num_inputs()==1); - CV_Assert(net_->num_outputs()==1); - CV_Assert(this->net_->input_blobs()[0]->channels()==1 - ||this->net_->input_blobs()[0]->channels()==3); - // this->channelCount_=this->net_->input_blobs()[0]->channels(); - - - - this->net_->CopyTrainedLayersFrom(modelWeightsFilename); - - caffe::Blob* inputLayer = this->net_->input_blobs()[0]; - - this->inputGeometry_.height = inputLayer->height(); - this->inputGeometry_.width = inputLayer->width(); - this->inputChannelCount_ = inputLayer->channels(); - //this->inputGeometry_.batchSize =1; - - inputLayer->Reshape(this->minibatchSz_,this->inputChannelCount_,this->inputGeometry_.height, this->inputGeometry_.width); - net_->Reshape(); - this->outputChannelCount_ = net_->output_blobs()[0]->channels(); - //this->outputGeometry_.batchSize =1; - this->outputGeometry_.height =net_->output_blobs()[0]->height(); - this->outputGeometry_.width = net_->output_blobs()[0]->width(); - -#else - CV_Error(Error::StsError,"Caffe not available during compilation!"); 
-#endif - } - - - void detect(InputArray image, OutputArray Bbox_prob) - { - Size outSize = Size(this->outputGeometry_.height,outputGeometry_.width); - Bbox_prob.create(outSize,CV_32F); // dummy initialization is it needed - Mat outputMat = Bbox_prob.getMat(); - process_(image.getMat(),outputMat); - //copy back to outputArray - outputMat.copyTo(Bbox_prob); - } - - Size getOutputGeometry() - { - return this->outputGeometry_; - } - Size getinputGeometry() - { - return this->inputGeometry_; - } - - int getMinibatchSize() - { - return this->minibatchSz_; - } - - int getBackend() - { - return OCR_HOLISTIC_BACKEND_CAFFE; - } - void setPreprocessor(Ptr ptr) - { - CV_Assert(!ptr.empty()); - preprocessor_=ptr; - } - - Ptr getPreprocessor() - { - return preprocessor_; - } -}; - +namespace cv +{ +namespace text +{ -class DeepCNNTextDetectorDNNImpl: public DeepCNNTextDetector{ +class TextDetectorCNNImpl : public TextDetectorCNN +{ protected: + Net net_; + std::vector sizes_; + int inputChannelCount_; + bool detectMultiscale_; - void process_(Mat inputImage, Mat &outputMat) + void getOutputs(const float* buffer,int nbrTextBoxes,int nCol, + std::vector& Bbox, std::vector& confidence, Size inputShape) { - // do forward pass and stores the output in outputMat - CV_Assert(outputMat.isContinuous()); - if (inputImage.channels() != this->inputChannelCount_) - CV_WARN("Number of input channel(s) in the model is not same as input"); - - -#ifdef HAVE_DNN - - Mat preprocessed; - this->preprocess(inputImage,preprocessed); - - net_->setInput(blobFromImage(preprocessed,1, this->inputGeometry_), "data"); - - Mat outputNet = this->net_->forward( ); - - this->outputGeometry_.height = outputNet.size[2]; - this->outputGeometry_.width = outputNet.size[3]; - this->outputChannelCount_ = outputNet.size[1]; + for(int k = 0; k < nbrTextBoxes; k++) + { + float x_min = buffer[k*nCol + 3]*inputShape.width; + float y_min = buffer[k*nCol + 4]*inputShape.height; - 
outputMat.create(this->outputGeometry_.height , this->outputGeometry_.width,CV_32FC1); - float*outputMatData=(float*)(outputMat.data); - float*outputNetData=(float*)(outputNet.data); - int outputSz = this->outputChannelCount_ * this->outputGeometry_.height * this->outputGeometry_.width; + float x_max = buffer[k*nCol + 5]*inputShape.width; + float y_max = buffer[k*nCol + 6]*inputShape.height; - memcpy(outputMatData,outputNetData,sizeof(float)*outputSz); + CV_Assert(x_min < x_max, y_min < y_max); + x_min = std::max(0.f, x_min); + y_min = std::max(0.f, y_min); + x_max = std::min(inputShape.width - 1.f, x_max); + y_max = std::min(inputShape.height - 1.f, y_max); + int wd = cvRound(x_max - x_min); + int ht = cvRound(y_max - y_min); -#endif + Bbox.push_back(Rect(cvRound(x_min), cvRound(y_min), wd, ht)); + confidence.push_back(buffer[k*nCol + 2]); + } } - - -#ifdef HAVE_DNN - Ptr net_; -#endif - //Size inputGeometry_; - int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst - //int outputSize_; - //int inputHeight_; - //int inputWidth_; - //int inputChannel_; public: - DeepCNNTextDetectorDNNImpl(const DeepCNNTextDetectorDNNImpl& dn): - minibatchSz_(dn.minibatchSz_){ - outputGeometry_=dn.outputGeometry_; - inputGeometry_=dn.inputGeometry_; - //Implemented to supress Visual Studio warning "assignment operator could not be generated" -#ifdef HAVE_DNN - this->net_=dn.net_; -#endif - } - DeepCNNTextDetectorDNNImpl& operator=(const DeepCNNTextDetectorDNNImpl &dn) - { -#ifdef HAVE_DNN - this->net_=dn.net_; -#endif - this->setPreprocessor(dn.preprocessor_); - this->inputGeometry_=dn.inputGeometry_; - this->inputChannelCount_=dn.inputChannelCount_; - this->outputChannelCount_ = dn.outputChannelCount_; - // this->minibatchSz_=dn.minibatchSz_; - //this->outputGeometry_=dn.outputSize_; - this->preprocessor_=dn.preprocessor_; - this->outputGeometry_=dn.outputGeometry_; - return *this; - //Implemented to supress Visual Studio warning "assignment 
operator could not be generated" - } - - DeepCNNTextDetectorDNNImpl(String modelArchFilename, String modelWeightsFilename,Ptr preprocessor, int maxMinibatchSz,int inputHeight=700,int inputWidth =700,int inputChannel =3) - :minibatchSz_(maxMinibatchSz) + TextDetectorCNNImpl(const String& modelArchFilename, const String& modelWeightsFilename, bool detectMultiscale) : + detectMultiscale_(detectMultiscale) { + net_ = readNetFromCaffe(modelArchFilename, modelWeightsFilename); + CV_Assert(!net_.empty()); + inputChannelCount_ = 3; + sizes_.push_back(Size(700, 700)); - CV_Assert(this->minibatchSz_>0); - CV_Assert(fileExists(modelArchFilename)); - CV_Assert(fileExists(modelWeightsFilename)); - CV_Assert(!preprocessor.empty()); - this->setPreprocessor(preprocessor); -#ifdef HAVE_DNN - this->net_ = makePtr(readNetFromCaffe(modelArchFilename,modelWeightsFilename)); - - if (this->net_.empty()) + if(detectMultiscale_) { - std::cerr << "Can't load network by using the following files: " << std::endl; - std::cerr << "prototxt: " << modelArchFilename << std::endl; - std::cerr << "caffemodel: " << modelWeightsFilename << std::endl; - //std::cerr << "bvlc_googlenet.caffemodel can be downloaded here:" << std::endl; - //std::cerr << "http://dl.caffe.berkeleyvision.org/bvlc_googlenet.caffemodel" << std::endl; - exit(-1); + sizes_.push_back(Size(300, 300)); + sizes_.push_back(Size(700,500)); + sizes_.push_back(Size(700,300)); + sizes_.push_back(Size(1600,1600)); } - - this->inputGeometry_.height =inputHeight; - this->inputGeometry_.width = inputWidth ;//inputLayer->width(); - this->inputChannelCount_ = inputChannel ;//inputLayer->channels(); - -#else - CV_Error(Error::StsError,"DNN module not available during compilation!"); -#endif } - - void detect(InputArray image, OutputArray Bbox_prob) + void textDetectInImage(InputArray inputImage_, std::vector& Bbox, std::vector& confidence) { - Size outSize = Size(this->outputGeometry_.height,outputGeometry_.width); - 
Bbox_prob.create(outSize,CV_32F); // dummy initialization is it needed - Mat outputMat = Bbox_prob.getMat(); + CV_Assert(inputImage_.channels() == inputChannelCount_); + Mat inputImage = inputImage_.getMat().clone(); + Bbox.resize(0); + confidence.resize(0); - process_(image.getMat(),outputMat); - //copy back to outputArray - outputMat.copyTo(Bbox_prob); - } - - Size getOutputGeometry() - { - return this->outputGeometry_; - } - Size getinputGeometry() - { - return this->inputGeometry_; - } - - int getMinibatchSize() - { - return this->minibatchSz_; - } - - int getBackend() - { - return OCR_HOLISTIC_BACKEND_DNN; - } - void setPreprocessor(Ptr ptr) - { - CV_Assert(!ptr.empty()); - preprocessor_=ptr; - } - - Ptr getPreprocessor() - { - return preprocessor_; - } + for(size_t i = 0; i < sizes_.size(); i++) + { + Size inputGeometry = sizes_[i]; + net_.setInput(blobFromImage(inputImage, 1, inputGeometry, Scalar(123, 117, 104)), "data"); + Mat outputNet = net_.forward(); + int nbrTextBoxes = outputNet.size[2]; + int nCol = outputNet.size[3]; + int outputChannelCount = outputNet.size[1]; + CV_Assert(outputChannelCount == 1); + getOutputs((float*)(outputNet.data), nbrTextBoxes, nCol, Bbox, confidence, inputImage.size()); + } + } }; -Ptr DeepCNNTextDetector::create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz,int backEnd) +Ptr TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename, bool detectMultiscale) { - if(preprocessor.empty()) - { - // create a custom preprocessor with rawval - preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255); - // set the mean for the preprocessor - - Mat textbox_mean(1,3,CV_8U); - textbox_mean.at(0,0)=104; - textbox_mean.at(0,1)=117; - textbox_mean.at(0,2)=123; - preprocessor->set_mean(textbox_mean); - } - switch(backEnd){ - case OCR_HOLISTIC_BACKEND_DEFAULT: - -#ifdef HAVE_CAFFE - return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, 
weightsFilename,preprocessor, minibatchSz)); - -#elif defined(HAVE_DNN) - return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,700,700,3)); -#else - CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); - return Ptr(); -#endif - case OCR_HOLISTIC_BACKEND_CAFFE: - - return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz)); - break; - - case OCR_HOLISTIC_BACKEND_DNN: - return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, minibatchSz,700,700,3)); - break; - - case OCR_HOLISTIC_BACKEND_NONE: - default: - CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); - return Ptr(); - break; - } - //return Ptr(); - + return makePtr(modelArchFilename, modelWeightsFilename, detectMultiscale); } - - -Ptr DeepCNNTextDetector::createTextBoxNet(String archFilename,String weightsFilename,int backEnd) -{ - - // create a custom preprocessor with rawval - Ptr preprocessor=ImagePreprocessor::createImageCustomPreprocessor(255); - // set the mean for the preprocessor - - Mat textbox_mean(1,3,CV_8U); - textbox_mean.at(0,0)=104; - textbox_mean.at(0,1)=117; - textbox_mean.at(0,2)=123; - preprocessor->set_mean(textbox_mean); - switch(backEnd){ - case OCR_HOLISTIC_BACKEND_DEFAULT: - -#ifdef HAVE_CAFFE - return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 1)); - -#elif defined(HAVE_DNN) - return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 1,700,700,3)); -#else - CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); - return Ptr(); -#endif - break; - case OCR_HOLISTIC_BACKEND_CAFFE: - return Ptr(new DeepCNNTextDetectorCaffeImpl(archFilename, weightsFilename,preprocessor, 1)); - break; - case OCR_HOLISTIC_BACKEND_DNN: - return Ptr(new DeepCNNTextDetectorDNNImpl(archFilename, weightsFilename,preprocessor, 1,700,700,3)); - break; - 
case OCR_HOLISTIC_BACKEND_NONE: - default: - CV_Error(Error::StsError,"DeepCNNTextDetector::create backend not implemented"); - return Ptr(); - break; - } - //return Ptr(); - -} - -void DeepCNNTextDetector::preprocess(const Mat& input,Mat& output) -{ - Size inputHtWd = Size(this->inputGeometry_.height,this->inputGeometry_.width); - this->preprocessor_->preprocess(input,output,inputHtWd,this->inputChannelCount_); -} - - - -} } //namespace text namespace cv +} //namespace text +} //namespace cv diff --git a/modules/text/text_config.hpp.in b/modules/text/text_config.hpp.in index 81e624bab37..ec5120a4160 100644 --- a/modules/text/text_config.hpp.in +++ b/modules/text/text_config.hpp.in @@ -1,4 +1,7 @@ #ifndef __OPENCV_TEXT_CONFIG_HPP__ #define __OPENCV_TEXT_CONFIG_HPP__ +// HAVE OCR Tesseract +#cmakedefine HAVE_TESSERACT + #endif From 1306621f3d17f695565ff5cc39ecee953ca93ee0 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Tue, 10 Oct 2017 15:29:20 +0300 Subject: [PATCH 26/31] text: add prototxt for text detection model --- modules/text/samples/textbox.prototxt | 1605 +++++++++++++++++++++++++ 1 file changed, 1605 insertions(+) create mode 100644 modules/text/samples/textbox.prototxt diff --git a/modules/text/samples/textbox.prototxt b/modules/text/samples/textbox.prototxt new file mode 100644 index 00000000000..6e8cb688ef4 --- /dev/null +++ b/modules/text/samples/textbox.prototxt @@ -0,0 +1,1605 @@ +name: "VGG_text_longer_conv_300x300_deploy" +input: "data" +input_shape { + dim: 1 + dim: 3 + dim: 700 + dim: 700 +} +layer { + name: "conv1_1" + type: "Convolution" + bottom: "data" + top: "conv1_1" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1_1" + type: "ReLU" + bottom: "conv1_1" + top: "conv1_1" +} +layer { + name: "conv1_2" + type: 
"Convolution" + bottom: "conv1_1" + top: "conv1_2" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + convolution_param { + num_output: 64 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu1_2" + type: "ReLU" + bottom: "conv1_2" + top: "conv1_2" +} +layer { + name: "pool1" + type: "Pooling" + bottom: "conv1_2" + top: "pool1" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv2_1" + type: "Convolution" + bottom: "pool1" + top: "conv2_1" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu2_1" + type: "ReLU" + bottom: "conv2_1" + top: "conv2_1" +} +layer { + name: "conv2_2" + type: "Convolution" + bottom: "conv2_1" + top: "conv2_2" + param { + lr_mult: 0 + decay_mult: 0 + } + param { + lr_mult: 0 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu2_2" + type: "ReLU" + bottom: "conv2_2" + top: "conv2_2" +} +layer { + name: "pool2" + type: "Pooling" + bottom: "conv2_2" + top: "pool2" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv3_1" + type: "Convolution" + bottom: "pool2" + top: "conv3_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3_1" + type: "ReLU" + bottom: "conv3_1" + top: "conv3_1" +} +layer { + name: "conv3_2" + type: "Convolution" + bottom: "conv3_1" + top: "conv3_2" + 
param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3_2" + type: "ReLU" + bottom: "conv3_2" + top: "conv3_2" +} +layer { + name: "conv3_3" + type: "Convolution" + bottom: "conv3_2" + top: "conv3_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu3_3" + type: "ReLU" + bottom: "conv3_3" + top: "conv3_3" +} +layer { + name: "pool3" + type: "Pooling" + bottom: "conv3_3" + top: "pool3" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv4_1" + type: "Convolution" + bottom: "pool3" + top: "conv4_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu4_1" + type: "ReLU" + bottom: "conv4_1" + top: "conv4_1" +} +layer { + name: "conv4_2" + type: "Convolution" + bottom: "conv4_1" + top: "conv4_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu4_2" + type: "ReLU" + bottom: "conv4_2" + top: "conv4_2" +} +layer { + name: "conv4_3" + type: "Convolution" + bottom: "conv4_2" + top: "conv4_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + 
bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu4_3" + type: "ReLU" + bottom: "conv4_3" + top: "conv4_3" +} +layer { + name: "pool4" + type: "Pooling" + bottom: "conv4_3" + top: "pool4" + pooling_param { + pool: MAX + kernel_size: 2 + stride: 2 + } +} +layer { + name: "conv5_1" + type: "Convolution" + bottom: "pool4" + top: "conv5_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu5_1" + type: "ReLU" + bottom: "conv5_1" + top: "conv5_1" +} +layer { + name: "conv5_2" + type: "Convolution" + bottom: "conv5_1" + top: "conv5_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu5_2" + type: "ReLU" + bottom: "conv5_2" + top: "conv5_2" +} +layer { + name: "conv5_3" + type: "Convolution" + bottom: "conv5_2" + top: "conv5_3" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu5_3" + type: "ReLU" + bottom: "conv5_3" + top: "conv5_3" +} +layer { + name: "pool5" + type: "Pooling" + bottom: "conv5_3" + top: "pool5" + pooling_param { + pool: MAX + kernel_size: 3 + stride: 1 + pad: 1 + } +} +layer { + name: "fc6" + type: "Convolution" + bottom: "pool5" + top: "fc6" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 1024 + pad: 6 + kernel_size: 3 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + 
dilation: 6 + } +} +layer { + name: "relu6" + type: "ReLU" + bottom: "fc6" + top: "fc6" +} +layer { + name: "fc7" + type: "Convolution" + bottom: "fc6" + top: "fc7" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 1024 + kernel_size: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "relu7" + type: "ReLU" + bottom: "fc7" + top: "fc7" +} +layer { + name: "conv6_1" + type: "Convolution" + bottom: "fc7" + top: "conv6_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_1_relu" + type: "ReLU" + bottom: "conv6_1" + top: "conv6_1" +} +layer { + name: "conv6_2" + type: "Convolution" + bottom: "conv6_1" + top: "conv6_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 512 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv6_2_relu" + type: "ReLU" + bottom: "conv6_2" + top: "conv6_2" +} +layer { + name: "conv7_1" + type: "Convolution" + bottom: "conv6_2" + top: "conv7_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_1_relu" + type: "ReLU" + bottom: "conv7_1" + top: "conv7_1" +} +layer { + name: "conv7_2" + type: "Convolution" + bottom: "conv7_1" + top: "conv7_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + 
kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv7_2_relu" + type: "ReLU" + bottom: "conv7_2" + top: "conv7_2" +} +layer { + name: "conv8_1" + type: "Convolution" + bottom: "conv7_2" + top: "conv8_1" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 128 + pad: 0 + kernel_size: 1 + stride: 1 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_1_relu" + type: "ReLU" + bottom: "conv8_1" + top: "conv8_1" +} +layer { + name: "conv8_2" + type: "Convolution" + bottom: "conv8_1" + top: "conv8_2" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 256 + pad: 1 + kernel_size: 3 + stride: 2 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + } +} +layer { + name: "conv8_2_relu" + type: "ReLU" + bottom: "conv8_2" + top: "conv8_2" +} +layer { + name: "pool6" + type: "Pooling" + bottom: "conv8_2" + top: "pool6" + pooling_param { + pool: AVE + global_pooling: true + } +} +layer { + name: "conv4_3_norm" + type: "Normalize" + bottom: "conv4_3" + top: "conv4_3_norm" + norm_param { + across_spatial: false + scale_filler { + type: "constant" + value: 20 + } + channel_shared: false + } +} +layer { + name: "conv4_3_norm_mbox_loc" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 48 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv4_3_norm_mbox_loc_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_loc" + top: "conv4_3_norm_mbox_loc_perm" + permute_param { 
+ order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_loc_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_loc_perm" + top: "conv4_3_norm_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf" + type: "Convolution" + bottom: "conv4_3_norm" + top: "conv4_3_norm_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 24 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf_perm" + type: "Permute" + bottom: "conv4_3_norm_mbox_conf" + top: "conv4_3_norm_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv4_3_norm_mbox_conf_flat" + type: "Flatten" + bottom: "conv4_3_norm_mbox_conf_perm" + top: "conv4_3_norm_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv4_3_norm_mbox_priorbox" + type: "PriorBox" + bottom: "conv4_3_norm" + bottom: "data" + top: "conv4_3_norm_mbox_priorbox" + prior_box_param { + min_size: 30.0 + aspect_ratio: 2 + aspect_ratio: 3 + aspect_ratio: 5 + aspect_ratio: 7 + aspect_ratio: 10 + flip: false + clip: true + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + } +} +layer { + name: "fc7_mbox_loc" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 56 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "fc7_mbox_loc_perm" + type: "Permute" + bottom: "fc7_mbox_loc" + top: "fc7_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: 
"fc7_mbox_loc_flat" + type: "Flatten" + bottom: "fc7_mbox_loc_perm" + top: "fc7_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_conf" + type: "Convolution" + bottom: "fc7" + top: "fc7_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 28 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "fc7_mbox_conf_perm" + type: "Permute" + bottom: "fc7_mbox_conf" + top: "fc7_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "fc7_mbox_conf_flat" + type: "Flatten" + bottom: "fc7_mbox_conf_perm" + top: "fc7_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "fc7_mbox_priorbox" + type: "PriorBox" + bottom: "fc7" + bottom: "data" + top: "fc7_mbox_priorbox" + prior_box_param { + min_size: 60.0 + max_size: 114.0 + aspect_ratio: 2 + aspect_ratio: 3 + aspect_ratio: 5 + aspect_ratio: 7 + aspect_ratio: 10 + flip: false + clip: true + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + } +} +layer { + name: "conv6_2_mbox_loc" + type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 56 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv6_2_mbox_loc_perm" + type: "Permute" + bottom: "conv6_2_mbox_loc" + top: "conv6_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv6_2_mbox_loc_perm" + top: "conv6_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_conf" 
+ type: "Convolution" + bottom: "conv6_2" + top: "conv6_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 28 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv6_2_mbox_conf_perm" + type: "Permute" + bottom: "conv6_2_mbox_conf" + top: "conv6_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv6_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv6_2_mbox_conf_perm" + top: "conv6_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv6_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv6_2" + bottom: "data" + top: "conv6_2_mbox_priorbox" + prior_box_param { + min_size: 114.0 + max_size: 168.0 + aspect_ratio: 2 + aspect_ratio: 3 + aspect_ratio: 5 + aspect_ratio: 7 + aspect_ratio: 10 + flip: false + clip: true + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + } +} +layer { + name: "conv7_2_mbox_loc" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 56 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv7_2_mbox_loc_perm" + type: "Permute" + bottom: "conv7_2_mbox_loc" + top: "conv7_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv7_2_mbox_loc_perm" + top: "conv7_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_conf" + type: "Convolution" + bottom: "conv7_2" + top: "conv7_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + 
param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 28 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv7_2_mbox_conf_perm" + type: "Permute" + bottom: "conv7_2_mbox_conf" + top: "conv7_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv7_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv7_2_mbox_conf_perm" + top: "conv7_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv7_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv7_2" + bottom: "data" + top: "conv7_2_mbox_priorbox" + prior_box_param { + min_size: 168.0 + max_size: 222.0 + aspect_ratio: 2 + aspect_ratio: 3 + aspect_ratio: 5 + aspect_ratio: 7 + aspect_ratio: 10 + flip: false + clip: true + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + } +} +layer { + name: "conv8_2_mbox_loc" + type: "Convolution" + bottom: "conv8_2" + top: "conv8_2_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 56 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv8_2_mbox_loc_perm" + type: "Permute" + bottom: "conv8_2_mbox_loc" + top: "conv8_2_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_loc_flat" + type: "Flatten" + bottom: "conv8_2_mbox_loc_perm" + top: "conv8_2_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_conf" + type: "Convolution" + bottom: "conv8_2" + top: "conv8_2_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 28 + weight_filler { + type: "xavier" + } 
+ bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "conv8_2_mbox_conf_perm" + type: "Permute" + bottom: "conv8_2_mbox_conf" + top: "conv8_2_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "conv8_2_mbox_conf_flat" + type: "Flatten" + bottom: "conv8_2_mbox_conf_perm" + top: "conv8_2_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "conv8_2_mbox_priorbox" + type: "PriorBox" + bottom: "conv8_2" + bottom: "data" + top: "conv8_2_mbox_priorbox" + prior_box_param { + min_size: 222.0 + max_size: 276.0 + aspect_ratio: 2 + aspect_ratio: 3 + aspect_ratio: 5 + aspect_ratio: 7 + aspect_ratio: 10 + flip: false + clip: true + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + } +} +layer { + name: "pool6_mbox_loc" + type: "Convolution" + bottom: "pool6" + top: "pool6_mbox_loc" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 56 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer { + name: "pool6_mbox_loc_perm" + type: "Permute" + bottom: "pool6_mbox_loc" + top: "pool6_mbox_loc_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "pool6_mbox_loc_flat" + type: "Flatten" + bottom: "pool6_mbox_loc_perm" + top: "pool6_mbox_loc_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "pool6_mbox_conf" + type: "Convolution" + bottom: "pool6" + top: "pool6_mbox_conf" + param { + lr_mult: 1 + decay_mult: 1 + } + param { + lr_mult: 2 + decay_mult: 0 + } + convolution_param { + num_output: 28 + weight_filler { + type: "xavier" + } + bias_filler { + type: "constant" + value: 0 + } + pad_h: 0 + pad_w: 2 + kernel_h: 1 + kernel_w: 5 + stride_h: 1 + stride_w: 1 + } +} +layer 
{ + name: "pool6_mbox_conf_perm" + type: "Permute" + bottom: "pool6_mbox_conf" + top: "pool6_mbox_conf_perm" + permute_param { + order: 0 + order: 2 + order: 3 + order: 1 + } +} +layer { + name: "pool6_mbox_conf_flat" + type: "Flatten" + bottom: "pool6_mbox_conf_perm" + top: "pool6_mbox_conf_flat" + flatten_param { + axis: 1 + } +} +layer { + name: "pool6_mbox_priorbox" + type: "PriorBox" + bottom: "pool6" + bottom: "data" + top: "pool6_mbox_priorbox" + prior_box_param { + min_size: 276.0 + max_size: 330.0 + aspect_ratio: 2 + aspect_ratio: 3 + aspect_ratio: 5 + aspect_ratio: 7 + aspect_ratio: 10 + flip: false + clip: true + variance: 0.1 + variance: 0.1 + variance: 0.2 + variance: 0.2 + } +} +layer { + name: "mbox_loc" + type: "Concat" + bottom: "conv4_3_norm_mbox_loc_flat" + bottom: "fc7_mbox_loc_flat" + bottom: "conv6_2_mbox_loc_flat" + bottom: "conv7_2_mbox_loc_flat" + bottom: "conv8_2_mbox_loc_flat" + bottom: "pool6_mbox_loc_flat" + top: "mbox_loc" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_conf" + type: "Concat" + bottom: "conv4_3_norm_mbox_conf_flat" + bottom: "fc7_mbox_conf_flat" + bottom: "conv6_2_mbox_conf_flat" + bottom: "conv7_2_mbox_conf_flat" + bottom: "conv8_2_mbox_conf_flat" + bottom: "pool6_mbox_conf_flat" + top: "mbox_conf" + concat_param { + axis: 1 + } +} +layer { + name: "mbox_priorbox" + type: "Concat" + bottom: "conv4_3_norm_mbox_priorbox" + bottom: "fc7_mbox_priorbox" + bottom: "conv6_2_mbox_priorbox" + bottom: "conv7_2_mbox_priorbox" + bottom: "conv8_2_mbox_priorbox" + bottom: "pool6_mbox_priorbox" + top: "mbox_priorbox" + concat_param { + axis: 2 + } +} +layer { + name: "mbox_conf_reshape" + type: "Reshape" + bottom: "mbox_conf" + top: "mbox_conf_reshape" + reshape_param { + shape { + dim: 0 + dim: -1 + dim: 2 + } + } +} +layer { + name: "mbox_conf_softmax" + type: "Softmax" + bottom: "mbox_conf_reshape" + top: "mbox_conf_softmax" + softmax_param { + axis: 2 + } +} +layer { + name: "mbox_conf_flatten" + type: "Flatten" + 
bottom: "mbox_conf_softmax" + top: "mbox_conf_flatten" + flatten_param { + axis: 1 + } +} +layer { + name: "detection_out" + type: "DetectionOutput" + bottom: "mbox_loc" + bottom: "mbox_conf_flatten" + bottom: "mbox_priorbox" + top: "detection_out" + include { + phase: TEST + } + detection_output_param { + num_classes: 2 + share_location: true + background_label_id: 0 + nms_param { + nms_threshold: 0.45 + top_k: 400 + } + code_type: CENTER_SIZE + keep_top_k: 200 + confidence_threshold: 0.01 + } +} From 3253fe9f7ef4abe0ffa7f1eb0a800c23e2c26978 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Tue, 10 Oct 2017 16:08:35 +0300 Subject: [PATCH 27/31] text: impovements in samples and module interface --- modules/text/doc/text.bib | 12 +++++++++++- .../text/include/opencv2/text/textDetector.hpp | 12 ++++++++---- modules/text/samples/deeptextdetection.py | 8 ++++---- modules/text/samples/textbox.prototxt | 6 ++++++ modules/text/samples/textbox_demo.cpp | 17 +++++++++-------- modules/text/src/text_detectorCNN.cpp | 2 +- 6 files changed, 39 insertions(+), 18 deletions(-) diff --git a/modules/text/doc/text.bib b/modules/text/doc/text.bib index 64a8f4a197a..d2ed9f9b6d8 100644 --- a/modules/text/doc/text.bib +++ b/modules/text/doc/text.bib @@ -31,4 +31,14 @@ @article{Gomez14 journal = {CoRR}, volume = {abs/1407.7504}, year = {2014}, -} \ No newline at end of file +} +@inproceedings{LiaoSBWL17, + author = {Minghui Liao and + Baoguang Shi and + Xiang Bai and + Xinggang Wang and + Wenyu Liu}, + title = {TextBoxes: {A} Fast Text Detector with a Single Deep Neural Network}, + booktitle = {AAAI}, + year = {2017} +} diff --git a/modules/text/include/opencv2/text/textDetector.hpp b/modules/text/include/opencv2/text/textDetector.hpp index 0e51df39f4c..9c780ae31e4 100644 --- a/modules/text/include/opencv2/text/textDetector.hpp +++ b/modules/text/include/opencv2/text/textDetector.hpp @@ -27,12 +27,16 @@ class CV_EXPORTS_W TextDetector @param Bbox a vector of Rect that will store the 
detected word bounding box @param confidence a vector of float that will be updated with the confidence the classifier has for the selected bounding box */ - virtual void textDetectInImage(InputArray inputImage, CV_OUT std::vector& Bbox, CV_OUT std::vector& confidence) = 0; + CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector& Bbox, CV_OUT std::vector& confidence) = 0; virtual ~TextDetector() {} }; /** @brief TextDetectorCNN class provides the functionallity of text bounding box detection. - * A TextDetectorCNN is employed to find bounding boxes of text words given an input image. + This class is representing to find bounding boxes of text words given an input image. + This class uses OpenCV dnn module to load pre-trained model described in @cite LiaoSBWL17. + The original repository with the modified SSD Caffe version: https://github.com/MhLiao/TextBoxes. + Model can be downloaded from [DropBox](https://www.dropbox.com/s/g8pjzv2de9gty8g/TextBoxes_icdar13.caffemodel?dl=0). + Modified .prototxt file with the model description can be found in `opencv_contrib/modules/text/samples/textbox.prototxt`. */ class CV_EXPORTS_W TextDetectorCNN : public TextDetector { @@ -44,9 +48,9 @@ class CV_EXPORTS_W TextDetectorCNN : public TextDetector @param Bbox a vector of Rect that will store the detected word bounding box @param confidence a vector of float that will be updated with the confidence the classifier has for the selected bounding box */ - CV_WRAP virtual void textDetectInImage(InputArray inputImage, CV_OUT std::vector& Bbox, CV_OUT std::vector& confidence) = 0; + CV_WRAP virtual void detect(InputArray inputImage, CV_OUT std::vector& Bbox, CV_OUT std::vector& confidence) = 0; - /** @brief Creates an instance of the textDetector class and implicitly also a DeepCNN classifier. + /** @brief Creates an instance of the TextDetectorCNN class using the provided parameters. 
@param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. diff --git a/modules/text/samples/deeptextdetection.py b/modules/text/samples/deeptextdetection.py index 09dcb24927d..256a28e9eba 100644 --- a/modules/text/samples/deeptextdetection.py +++ b/modules/text/samples/deeptextdetection.py @@ -14,14 +14,14 @@ def main(): print(' (ERROR) You must call this script with an argument (path_to_image_to_be_processed)\n') quit() - if not os.path.isfile('textbox.caffemodel') or not os.path.isfile('textbox_deploy.prototxt'): + if not os.path.isfile('TextBoxes_icdar13.caffemodel') or not os.path.isfile('textbox.prototxt'): print " Model files not found in current directory. Aborting" - print " Model files should be downloaded from https://github.com/sghoshcvc/TextBox-Models" + print " See the documentation of text::TextDetectorCNN class to get download links." 
quit() img = cv2.imread(str(sys.argv[1])) - textSpotter = cv2.text.TextDetectorCNN_create("textbox_deploy.prototxt","textbox.caffemodel") - rects, outProbs = textSpotter.textDetectInImage(img); + textSpotter = cv2.text.TextDetectorCNN_create("textbox.prototxt", "TextBoxes_icdar13.caffemodel") + rects, outProbs = textSpotter.detect(img); vis = img.copy() thres = 0.6 diff --git a/modules/text/samples/textbox.prototxt b/modules/text/samples/textbox.prototxt index 6e8cb688ef4..bb80198281d 100644 --- a/modules/text/samples/textbox.prototxt +++ b/modules/text/samples/textbox.prototxt @@ -885,6 +885,7 @@ layer { variance: 0.1 variance: 0.2 variance: 0.2 + additional_y_offset: true } } layer { @@ -1009,6 +1010,7 @@ layer { variance: 0.1 variance: 0.2 variance: 0.2 + additional_y_offset: true } } layer { @@ -1133,6 +1135,7 @@ layer { variance: 0.1 variance: 0.2 variance: 0.2 + additional_y_offset: true } } layer { @@ -1257,6 +1260,7 @@ layer { variance: 0.1 variance: 0.2 variance: 0.2 + additional_y_offset: true } } layer { @@ -1381,6 +1385,7 @@ layer { variance: 0.1 variance: 0.2 variance: 0.2 + additional_y_offset: true } } layer { @@ -1505,6 +1510,7 @@ layer { variance: 0.1 variance: 0.2 variance: 0.2 + additional_y_offset: true } } layer { diff --git a/modules/text/samples/textbox_demo.cpp b/modules/text/samples/textbox_demo.cpp index 9975c394730..f3c292836a5 100644 --- a/modules/text/samples/textbox_demo.cpp +++ b/modules/text/samples/textbox_demo.cpp @@ -10,15 +10,14 @@ using namespace cv; namespace { -std::string getHelpStr(std::string progFname) +std::string getHelpStr(const std::string& progFname) { std::stringstream out; out << " Demo of text detection CNN for text detection." 
<< std::endl << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"< " << std::endl - << " Caffe Model files (textbox.caffemodel, textbox_deploy.prototxt)"< textSpotter = - text::TextDetectorCNN::create("textbox_deploy.prototxt","textbox.caffemodel", false); + text::TextDetectorCNN::create(modelArch, moddelWeights, false); std::vector bbox; std::vector outProbabillities; - textSpotter->textDetectInImage(image, bbox, outProbabillities); + textSpotter->detect(image, bbox, outProbabillities); textbox_draw(image, bbox, outProbabillities, 0.5f); diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp index 1c3933fda47..cd624985fcd 100644 --- a/modules/text/src/text_detectorCNN.cpp +++ b/modules/text/src/text_detectorCNN.cpp @@ -72,7 +72,7 @@ class TextDetectorCNNImpl : public TextDetectorCNN } } - void textDetectInImage(InputArray inputImage_, std::vector& Bbox, std::vector& confidence) + void detect(InputArray inputImage_, std::vector& Bbox, std::vector& confidence) { CV_Assert(inputImage_.channels() == inputChannelCount_); Mat inputImage = inputImage_.getMat().clone(); From 9195d2e6140acecb0312d0ccf04f8cbb98a22a87 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Wed, 11 Oct 2017 14:47:52 +0300 Subject: [PATCH 28/31] text: small adjustments in samples and image preprocessing --- modules/text/samples/dictnet_demo.cpp | 9 --------- modules/text/samples/textbox_demo.cpp | 4 ++-- modules/text/src/ocr_holistic.cpp | 4 ++++ modules/text/src/text_detectorCNN.cpp | 9 +++++---- 4 files changed, 11 insertions(+), 15 deletions(-) diff --git a/modules/text/samples/dictnet_demo.cpp b/modules/text/samples/dictnet_demo.cpp index 277a1c9be35..f70f2c17592 100644 --- a/modules/text/samples/dictnet_demo.cpp +++ b/modules/text/samples/dictnet_demo.cpp @@ -1,12 +1,3 @@ -/* - * dictnet_demo.cpp - * - * Demonstrates simple use of the holistic word classifier in C++ - * - * Created on: June 26, 2016 - * 
Author: Anguelos Nicolaou - */ - #include "opencv2/text.hpp" #include "opencv2/highgui.hpp" #include "opencv2/imgproc.hpp" diff --git a/modules/text/samples/textbox_demo.cpp b/modules/text/samples/textbox_demo.cpp index f3c292836a5..e6412f9f569 100644 --- a/modules/text/samples/textbox_demo.cpp +++ b/modules/text/samples/textbox_demo.cpp @@ -14,14 +14,14 @@ std::string getHelpStr(const std::string& progFname) { std::stringstream out; out << " Demo of text detection CNN for text detection." << std::endl - << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"< " << std::endl << " Caffe Model files (textbox.prototxt, TextBoxes_icdar13.caffemodel)"< #include -#include "opencv2/dnn.hpp" - using namespace cv::dnn; namespace cv @@ -75,20 +74,22 @@ class TextDetectorCNNImpl : public TextDetectorCNN void detect(InputArray inputImage_, std::vector& Bbox, std::vector& confidence) { CV_Assert(inputImage_.channels() == inputChannelCount_); - Mat inputImage = inputImage_.getMat().clone(); + Size inputSize = inputImage_.getMat().size(); Bbox.resize(0); confidence.resize(0); for(size_t i = 0; i < sizes_.size(); i++) { Size inputGeometry = sizes_[i]; + Mat inputImage = inputImage_.getMat().clone(); + resize(inputImage, inputImage, inputGeometry); net_.setInput(blobFromImage(inputImage, 1, inputGeometry, Scalar(123, 117, 104)), "data"); Mat outputNet = net_.forward(); int nbrTextBoxes = outputNet.size[2]; int nCol = outputNet.size[3]; int outputChannelCount = outputNet.size[1]; CV_Assert(outputChannelCount == 1); - getOutputs((float*)(outputNet.data), nbrTextBoxes, nCol, Bbox, confidence, inputImage.size()); + getOutputs((float*)(outputNet.data), nbrTextBoxes, nCol, Bbox, confidence, inputSize); } } }; From 7031316cb7f4700cf720ce2969020de0e399e685 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Wed, 11 Oct 2017 14:48:35 +0300 Subject: [PATCH 29/31] text: add text recognition sample --- modules/text/samples/text_recognition_cnn.cpp 
| 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 modules/text/samples/text_recognition_cnn.cpp diff --git a/modules/text/samples/text_recognition_cnn.cpp b/modules/text/samples/text_recognition_cnn.cpp new file mode 100644 index 00000000000..f0269a7d9fc --- /dev/null +++ b/modules/text/samples/text_recognition_cnn.cpp @@ -0,0 +1,109 @@ +#include +#include +#include + +#include +#include + +using namespace cv; +using namespace std; + +namespace +{ +void printHelpStr(const string& progFname) +{ + cout << " Demo of text recognition CNN for text detection." << endl + << " Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015"< " << endl + << " Caffe Model files (textbox.prototxt, TextBoxes_icdar13.caffemodel)"<& groups, vector& probs, float thres) +{ + for (size_t i = 0; i < groups.size(); i++) + { + if(probs[i] > thres) + { + if (src.type() == CV_8UC3) + { + rectangle(src, groups[i], Scalar( 0, 255, 255 ), 2, LINE_AA); + String label = format("%.2f", probs[i]); + cout << "text box: " << groups[i] << " confidence: " << probs[i] << "\n"; + putText(src, label, groups.at(i).tl(), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,255 ), 1, LINE_AA); + } + else + rectangle(src, groups[i], Scalar( 255 ), 3, 8 ); + } + } +} + +} + +int main(int argc, const char * argv[]) +{ + if (argc < 2) + { + printHelpStr(argv[0]); + cout << "Insufiecient parameters. Aborting!" << endl; + exit(1); + } + + const string modelArch = "textbox.prototxt"; + const string moddelWeights = "TextBoxes_icdar13.caffemodel"; + + if (!fileExists(modelArch) || !fileExists(moddelWeights)) + { + printHelpStr(argv[0]); + cout << "Model files not found in the current directory. Aborting!" 
<< endl; + exit(1); + } + + Mat image = imread(String(argv[1]), IMREAD_COLOR); + + cout << "Starting Text Box Demo" << endl; + Ptr textSpotter = + text::TextDetectorCNN::create(modelArch, moddelWeights, false); + + vector bbox; + vector outProbabillities; + textSpotter->detect(image, bbox, outProbabillities); + + float prob_threshold = 0.6f; + Mat image_copy = image.clone(); + textbox_draw(image_copy, bbox, outProbabillities, prob_threshold); + imshow("Text detection", image_copy); + image_copy = image.clone(); + + Ptr wordSpotter = + text::OCRHolisticWordRecognizer::create("dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt"); + + for(size_t i = 0; i < bbox.size(); i++) + { + if(outProbabillities[i] > prob_threshold) + { + Mat wordImg; + cvtColor(image(bbox[i]), wordImg, COLOR_BGR2GRAY); + string word; + vector confs; + wordSpotter->run(wordImg, word, NULL, NULL, &confs); + rectangle(image_copy, bbox[i], Scalar(0, 255, 255), 1, LINE_AA); + putText(image_copy, word, bbox[i].tl(), FONT_HERSHEY_PLAIN, 1, Scalar(0, 0, 255), 1, LINE_AA); + } + } + imshow("Text recognition", image_copy); + cout << "Recognition finished. 
Press any key to exit.\n"; + waitKey(); + return 0; +} + From 27961cd8ccc043ccd20ca54d89859710f14a8559 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Wed, 11 Oct 2017 16:34:06 +0300 Subject: [PATCH 30/31] text: fix wrong channel swap in TestDetectorCNN --- modules/text/samples/text_recognition_cnn.cpp | 1 - modules/text/src/text_detectorCNN.cpp | 8 +++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/modules/text/samples/text_recognition_cnn.cpp b/modules/text/samples/text_recognition_cnn.cpp index f0269a7d9fc..d7a95398bff 100644 --- a/modules/text/samples/text_recognition_cnn.cpp +++ b/modules/text/samples/text_recognition_cnn.cpp @@ -106,4 +106,3 @@ int main(int argc, const char * argv[]) waitKey(); return 0; } - diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp index 23a84f01597..e74594bac0b 100644 --- a/modules/text/src/text_detectorCNN.cpp +++ b/modules/text/src/text_detectorCNN.cpp @@ -74,22 +74,20 @@ class TextDetectorCNNImpl : public TextDetectorCNN void detect(InputArray inputImage_, std::vector& Bbox, std::vector& confidence) { CV_Assert(inputImage_.channels() == inputChannelCount_); - Size inputSize = inputImage_.getMat().size(); + Mat inputImage = inputImage_.getMat(); Bbox.resize(0); confidence.resize(0); for(size_t i = 0; i < sizes_.size(); i++) { Size inputGeometry = sizes_[i]; - Mat inputImage = inputImage_.getMat().clone(); - resize(inputImage, inputImage, inputGeometry); - net_.setInput(blobFromImage(inputImage, 1, inputGeometry, Scalar(123, 117, 104)), "data"); + net_.setInput(blobFromImage(inputImage, 1, inputGeometry, Scalar(123, 117, 104), false, false), "data"); Mat outputNet = net_.forward(); int nbrTextBoxes = outputNet.size[2]; int nCol = outputNet.size[3]; int outputChannelCount = outputNet.size[1]; CV_Assert(outputChannelCount == 1); - getOutputs((float*)(outputNet.data), nbrTextBoxes, nCol, Bbox, confidence, inputSize); + getOutputs((float*)(outputNet.data), 
nbrTextBoxes, nCol, Bbox, confidence, inputImage.size()); } } }; From fd2e37da56e945f741ee7296ef8745473a9f7b64 Mon Sep 17 00:00:00 2001 From: Vladislav Sovrasov Date: Mon, 30 Oct 2017 15:33:12 +0300 Subject: [PATCH 31/31] text: improve DL-based samples --- .../include/opencv2/text/textDetector.hpp | 10 ++- modules/text/samples/text_recognition_cnn.cpp | 66 +++++++++++-------- modules/text/samples/textbox_demo.cpp | 39 ++++++----- modules/text/src/text_detectorCNN.cpp | 24 +++---- 4 files changed, 81 insertions(+), 58 deletions(-) diff --git a/modules/text/include/opencv2/text/textDetector.hpp b/modules/text/include/opencv2/text/textDetector.hpp index 9c780ae31e4..fdb92fdfbd0 100644 --- a/modules/text/include/opencv2/text/textDetector.hpp +++ b/modules/text/include/opencv2/text/textDetector.hpp @@ -54,9 +54,15 @@ class CV_EXPORTS_W TextDetectorCNN : public TextDetector @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. - @param detectMultiscale if true, multiple scales of the input image will be used as network input + @param detectionSizes a list of sizes for multiscale detection. The values`[(300,300),(700,500),(700,300),(700,700),(1600,1600)]` are + recommended in @cite LiaoSBWL17 to achieve the best quality. */ - CV_WRAP static Ptr create(const String& modelArchFilename, const String& modelWeightsFilename, bool detectMultiscale = false); + static Ptr create(const String& modelArchFilename, const String& modelWeightsFilename, + std::vector detectionSizes); + /** + @overload + */ + CV_WRAP static Ptr create(const String& modelArchFilename, const String& modelWeightsFilename); }; //! 
@} diff --git a/modules/text/samples/text_recognition_cnn.cpp b/modules/text/samples/text_recognition_cnn.cpp index d7a95398bff..84df57d297d 100644 --- a/modules/text/samples/text_recognition_cnn.cpp +++ b/modules/text/samples/text_recognition_cnn.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -29,22 +30,27 @@ bool fileExists (const string& filename) return f.good(); } -void textbox_draw(Mat src, vector& groups, vector& probs, float thres) +void textbox_draw(Mat src, std::vector& groups, std::vector& probs, std::vector& indexes) { - for (size_t i = 0; i < groups.size(); i++) + for (size_t i = 0; i < indexes.size(); i++) { - if(probs[i] > thres) + if (src.type() == CV_8UC3) { - if (src.type() == CV_8UC3) - { - rectangle(src, groups[i], Scalar( 0, 255, 255 ), 2, LINE_AA); - String label = format("%.2f", probs[i]); - cout << "text box: " << groups[i] << " confidence: " << probs[i] << "\n"; - putText(src, label, groups.at(i).tl(), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,255 ), 1, LINE_AA); - } - else - rectangle(src, groups[i], Scalar( 255 ), 3, 8 ); + Rect currrentBox = groups[indexes[i]]; + rectangle(src, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA); + String label = format("%.2f", probs[indexes[i]]); + std::cout << "text box: " << currrentBox << " confidence: " << probs[indexes[i]] << "\n"; + + int baseLine = 0; + Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine); + int yLeftBottom = std::max(currrentBox.y, labelSize.height); + rectangle(src, Point(currrentBox.x, yLeftBottom - labelSize.height), + Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED); + + putText(src, label, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA); } + else + rectangle(src, groups[i], Scalar( 255 ), 3, 8 ); } } @@ -73,33 +79,41 @@ int main(int argc, const char * argv[]) cout << "Starting Text Box Demo" << endl; Ptr textSpotter = - 
text::TextDetectorCNN::create(modelArch, moddelWeights, false); + text::TextDetectorCNN::create(modelArch, moddelWeights); vector bbox; vector outProbabillities; textSpotter->detect(image, bbox, outProbabillities); + std::vector indexes; + cv::dnn::NMSBoxes(bbox, outProbabillities, 0.4f, 0.5f, indexes); - float prob_threshold = 0.6f; Mat image_copy = image.clone(); - textbox_draw(image_copy, bbox, outProbabillities, prob_threshold); + textbox_draw(image_copy, bbox, outProbabillities, indexes); imshow("Text detection", image_copy); image_copy = image.clone(); Ptr wordSpotter = text::OCRHolisticWordRecognizer::create("dictnet_vgg_deploy.prototxt", "dictnet_vgg.caffemodel", "dictnet_vgg_labels.txt"); - for(size_t i = 0; i < bbox.size(); i++) + for(size_t i = 0; i < indexes.size(); i++) { - if(outProbabillities[i] > prob_threshold) - { - Mat wordImg; - cvtColor(image(bbox[i]), wordImg, COLOR_BGR2GRAY); - string word; - vector confs; - wordSpotter->run(wordImg, word, NULL, NULL, &confs); - rectangle(image_copy, bbox[i], Scalar(0, 255, 255), 1, LINE_AA); - putText(image_copy, word, bbox[i].tl(), FONT_HERSHEY_PLAIN, 1, Scalar(0, 0, 255), 1, LINE_AA); - } + Mat wordImg; + cvtColor(image(bbox[indexes[i]]), wordImg, COLOR_BGR2GRAY); + string word; + vector confs; + wordSpotter->run(wordImg, word, NULL, NULL, &confs); + + Rect currrentBox = bbox[indexes[i]]; + rectangle(image_copy, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA); + + int baseLine = 0; + Size labelSize = getTextSize(word, FONT_HERSHEY_PLAIN, 1, 1, &baseLine); + int yLeftBottom = std::max(currrentBox.y, labelSize.height); + rectangle(image_copy, Point(currrentBox.x, yLeftBottom - labelSize.height), + Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED); + + putText(image_copy, word, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA); + } imshow("Text recognition", image_copy); cout << "Recognition finished. 
Press any key to exit.\n"; diff --git a/modules/text/samples/textbox_demo.cpp b/modules/text/samples/textbox_demo.cpp index e6412f9f569..1cf9a9aabf4 100644 --- a/modules/text/samples/textbox_demo.cpp +++ b/modules/text/samples/textbox_demo.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -27,22 +28,27 @@ bool fileExists (const std::string& filename) return f.good(); } -void textbox_draw(Mat src, std::vector& groups, std::vector& probs, float thres) +void textbox_draw(Mat src, std::vector& groups, std::vector& probs, std::vector& indexes) { - for (size_t i = 0; i < groups.size(); i++) + for (size_t i = 0; i < indexes.size(); i++) { - if(probs[i] > thres) + if (src.type() == CV_8UC3) { - if (src.type() == CV_8UC3) - { - rectangle(src, groups[i], Scalar( 0, 255, 255 ), 2, LINE_AA); - String label = format("%.2f", probs[i]); - std::cout << "text box: " << groups[i] << " confidence: " << probs[i] << "\n"; - putText(src, label, groups.at(i).tl(), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,255 ), 1, LINE_AA); - } - else - rectangle(src, groups[i], Scalar( 255 ), 3, 8 ); + Rect currrentBox = groups[indexes[i]]; + rectangle(src, currrentBox, Scalar( 0, 255, 255 ), 2, LINE_AA); + String label = format("%.2f", probs[indexes[i]]); + std::cout << "text box: " << currrentBox << " confidence: " << probs[indexes[i]] << "\n"; + + int baseLine = 0; + Size labelSize = getTextSize(label, FONT_HERSHEY_PLAIN, 1, 1, &baseLine); + int yLeftBottom = std::max(currrentBox.y, labelSize.height); + rectangle(src, Point(currrentBox.x, yLeftBottom - labelSize.height), + Point(currrentBox.x + labelSize.width, yLeftBottom + baseLine), Scalar( 255, 255, 255 ), FILLED); + + putText(src, label, Point(currrentBox.x, yLeftBottom), FONT_HERSHEY_PLAIN, 1, Scalar( 0,0,0 ), 1, LINE_AA); } + else + rectangle(src, groups[i], Scalar( 255 ), 3, 8 ); } } @@ -62,7 +68,7 @@ int main(int argc, const char * argv[]) if (!fileExists(modelArch) || !fileExists(moddelWeights)) { - std::cout< 
textSpotter = - text::TextDetectorCNN::create(modelArch, moddelWeights, false); + text::TextDetectorCNN::create(modelArch, moddelWeights); std::vector bbox; std::vector outProbabillities; textSpotter->detect(image, bbox, outProbabillities); - textbox_draw(image, bbox, outProbabillities, 0.5f); + std::vector indexes; + cv::dnn::NMSBoxes(bbox, outProbabillities, 0.3f, 0.4f, indexes); + + textbox_draw(image, bbox, outProbabillities, indexes); imshow("TextBox Demo",image); std::cout << "Done!" << std::endl << std::endl; diff --git a/modules/text/src/text_detectorCNN.cpp b/modules/text/src/text_detectorCNN.cpp index e74594bac0b..84f769b4251 100644 --- a/modules/text/src/text_detectorCNN.cpp +++ b/modules/text/src/text_detectorCNN.cpp @@ -23,8 +23,6 @@ class TextDetectorCNNImpl : public TextDetectorCNN Net net_; std::vector sizes_; int inputChannelCount_; - bool detectMultiscale_; - void getOutputs(const float* buffer,int nbrTextBoxes,int nCol, std::vector& Bbox, std::vector& confidence, Size inputShape) @@ -54,21 +52,12 @@ class TextDetectorCNNImpl : public TextDetectorCNN } public: - TextDetectorCNNImpl(const String& modelArchFilename, const String& modelWeightsFilename, bool detectMultiscale) : - detectMultiscale_(detectMultiscale) + TextDetectorCNNImpl(const String& modelArchFilename, const String& modelWeightsFilename, std::vector detectionSizes) : + sizes_(detectionSizes) { net_ = readNetFromCaffe(modelArchFilename, modelWeightsFilename); CV_Assert(!net_.empty()); inputChannelCount_ = 3; - sizes_.push_back(Size(700, 700)); - - if(detectMultiscale_) - { - sizes_.push_back(Size(300, 300)); - sizes_.push_back(Size(700,500)); - sizes_.push_back(Size(700,300)); - sizes_.push_back(Size(1600,1600)); - } } void detect(InputArray inputImage_, std::vector& Bbox, std::vector& confidence) @@ -92,9 +81,14 @@ class TextDetectorCNNImpl : public TextDetectorCNN } }; -Ptr TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename, bool 
detectMultiscale) +Ptr TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename, std::vector detectionSizes) +{ + return makePtr(modelArchFilename, modelWeightsFilename, detectionSizes); +} + +Ptr TextDetectorCNN::create(const String &modelArchFilename, const String &modelWeightsFilename) { - return makePtr(modelArchFilename, modelWeightsFilename, detectMultiscale); + return create(modelArchFilename, modelWeightsFilename, std::vector(1, Size(300, 300))); } } //namespace text } //namespace cv