diff --git a/modules/text/CMakeLists.txt b/modules/text/CMakeLists.txt index 173e87f0677..6653c7a9d65 100644 --- a/modules/text/CMakeLists.txt +++ b/modules/text/CMakeLists.txt @@ -1,25 +1,65 @@ set(the_description "Text Detection and Recognition") -ocv_define_module(text opencv_ml opencv_highgui opencv_imgproc opencv_core opencv_features2d WRAP python) +ocv_define_module(text opencv_ml opencv_highgui opencv_imgproc opencv_core opencv_features2d opencv_calib3d WRAP python) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}) find_package(Tesseract) -if(Tesseract_FOUND) - message(STATUS "Tesseract: YES") - set(HAVE_TESSERACT 1) +if(${Tesseract_FOUND}) + message(STATUS "Tesseract: YES") + include_directories(${Tesseract_INCLUDE_DIR}) + target_link_libraries(opencv_text ${Tesseract_LIBS}) + add_definitions(-DHAVE_TESSERACT) else() - message(STATUS "Tesseract: NO") + message(STATUS "Tesseract: NO") endif() -configure_file(${CMAKE_CURRENT_SOURCE_DIR}/text_config.hpp.in - ${CMAKE_BINARY_DIR}/text_config.hpp @ONLY) +find_package(Protobuf) +if(Protobuf_FOUND) + message(STATUS "Protobuf: YES") + set(HAVE_PROTOBUF 1) +else() + message(STATUS "Protobuf: NO") +endif() -include_directories(${CMAKE_CURRENT_BINARY_DIR}) +find_package(Glog) +if(Glog_FOUND) + message(STATUS "Glog: YES") + set(HAVE_GLOG 1) +else() + message(STATUS "Glog: NO") +endif() -if(${Tesseract_FOUND}) -include_directories(${Tesseract_INCLUDE_DIR}) +find_package(Caffe) +if(Caffe_FOUND) + message(STATUS "Caffe: YES") + set(HAVE_CAFFE 1) +else() + message(STATUS "Caffe: NO") endif() -if(${Tesseract_FOUND}) - target_link_libraries(opencv_text ${Tesseract_LIBS}) + +if(HAVE_CAFFE AND HAVE_GLOG AND HAVE_PROTOBUF) + include_directories(${Caffe_INCLUDE_DIR}) + find_package(HDF5 COMPONENTS HL REQUIRED) + include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR}) + list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES}) + find_package(Boost 1.46 REQUIRED COMPONENTS system thread filesystem) + include_directories(SYSTEM ${Boost_INCLUDE_DIR}) + include_directories(SYSTEM /usr/local/cuda-7.5/targets/x86_64-linux/include/) + list(APPEND Caffe_LINKER_LIBS ${Boost_LIBRARIES}) + target_link_libraries(opencv_text ${Caffe_LIBS} ${Glog_LIBS} ${Protobuf_LIBS} ${HDF5_LIBRARIES} ${Boost_LIBRARIES}) + add_definitions(-DHAVE_CAFFE) +endif() #HAVE_CAFFE + +find_package(Qt5Gui) +if(Qt5Gui_FOUND) + message(STATUS "text module found Qt5Gui: YES") + add_definitions(-DHAVE_QT5GUI) + foreach(dt5_dep Gui) + add_definitions(${Qt5${dt5_dep}_DEFINITIONS}) + include_directories(${Qt5${dt5_dep}_INCLUDE_DIRS}) + target_link_libraries(opencv_text ${Qt5${dt5_dep}_LIBRARIES}) + endforeach() +else() + message(STATUS "text module found Qt5Gui: NO") endif() diff --git a/modules/text/FindCaffe.cmake b/modules/text/FindCaffe.cmake new file mode 100644 index 00000000000..12948f62992 --- /dev/null +++ b/modules/text/FindCaffe.cmake @@ -0,0 +1,14 @@ +# Caffe package for CNN Triplet training +unset(Caffe_FOUND) + +find_path(Caffe_INCLUDE_DIR NAMES caffe/caffe.hpp caffe/common.hpp caffe/net.hpp caffe/proto/caffe.pb.h caffe/util/io.hpp caffe/vision_layers.hpp + HINTS + /usr/local/include) + +find_library(Caffe_LIBS NAMES caffe + HINTS + /usr/local/lib) + +if(Caffe_LIBS AND Caffe_INCLUDE_DIR) + set(Caffe_FOUND 1) +endif() diff --git a/modules/text/FindGlog.cmake b/modules/text/FindGlog.cmake new file mode 100755 index
00000000000..c30e9f4a6ab --- /dev/null +++ b/modules/text/FindGlog.cmake @@ -0,0 +1,10 @@ +#Required for Caffe +unset(Glog_FOUND) + +find_library(Glog_LIBS NAMES glog + HINTS + /usr/local/lib) + +if(Glog_LIBS) + set(Glog_FOUND 1) +endif() diff --git a/modules/text/FindProtobuf.cmake b/modules/text/FindProtobuf.cmake new file mode 100755 index 00000000000..6d0ad56a1f7 --- /dev/null +++ b/modules/text/FindProtobuf.cmake @@ -0,0 +1,10 @@ +#Protobuf package required for Caffe +unset(Protobuf_FOUND) + +find_library(Protobuf_LIBS NAMES protobuf + HINTS + /usr/local/lib) + +if(Protobuf_LIBS) + set(Protobuf_FOUND 1) +endif() diff --git a/modules/text/doc/DeepCNN_classdiagram.pdf b/modules/text/doc/DeepCNN_classdiagram.pdf new file mode 100644 index 00000000000..ac94773299d Binary files /dev/null and b/modules/text/doc/DeepCNN_classdiagram.pdf differ diff --git a/modules/text/include/opencv2/text/ocr.hpp b/modules/text/include/opencv2/text/ocr.hpp index 651934b0cb0..109b6671e8d 100644 --- a/modules/text/include/opencv2/text/ocr.hpp +++ b/modules/text/include/opencv2/text/ocr.hpp @@ -46,6 +46,10 @@ #include #include +#include +#include + + namespace cv { @@ -61,29 +65,52 @@ enum OCR_LEVEL_TEXTLINE }; -//base class BaseOCR declares a common API that would be used in a typical text recognition scenario +//base class BaseOCR declares a common API that would be used in a typical text +//recognition scenario class CV_EXPORTS_W BaseOCR { -public: + public: virtual ~BaseOCR() {}; - virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + + virtual void run(Mat& image, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0) = 0; - virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + + virtual void run(Mat& image, Mat& mask, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0) = 0; + + /** @brief Main functionality of the OCR Hierarchy. Subclasses provide + * default parameters for all parameters other than the input image. + */ + virtual String run(InputArray image){ + std::string res; + std::vector component_rects; + std::vector component_confidences; + std::vector component_texts; + Mat inputImage=image.getMat(); + this->run(inputImage,res,&component_rects,&component_texts, + &component_confidences,OCR_LEVEL_WORD); + return res; + } + }; -/** @brief OCRTesseract class provides an interface with the tesseract-ocr API (v3.02.02) in C++. +/** @brief OCRTesseract class provides an interface with the tesseract-ocr API + * (v3.02.02) in C++. Notice that it is compiled only when tesseract-ocr is correctly installed. 
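For orientation, here is a minimal usage sketch of the OCRTesseract interface declared below; the image file name is a placeholder, and tessdata is assumed to be installed in the system default location:

    #include "opencv2/text.hpp"
    #include "opencv2/imgcodecs.hpp"

    // Recognize a single image with the default Tesseract parameters.
    cv::Mat image = cv::imread("scene_word.png");  // hypothetical input image
    cv::Ptr<cv::text::OCRTesseract> ocr = cv::text::OCRTesseract::create();
    std::string recognized;
    ocr->run(image, recognized);  // the optional component outputs are left NULL here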
@note - - (C++) An example of OCRTesseract recognition combined with scene text detection can be found - at the end_to_end_recognition demo: + - (C++) An example of OCRTesseract recognition combined with scene text + detection can be found at the end_to_end_recognition demo: - - (C++) Another example of OCRTesseract recognition combined with scene text detection can be - found at the webcam_demo: + - (C++) Another example of OCRTesseract recognition combined with scene + text detection can be found at the webcam_demo: */ class CV_EXPORTS_W OCRTesseract : public BaseOCR @@ -91,52 +118,73 @@ class CV_EXPORTS_W OCRTesseract : public BaseOCR public: /** @brief Recognize text using the tesseract-ocr API. - Takes image on input and returns recognized text in the output_text parameter. Optionally - provides also the Rects for individual text elements found (e.g. words), and the list of those - text elements with their confidence values. + Takes image on input and returns recognized text in the output_text + parameter. Optionally provides also the Rects for individual text elements + found (e.g. words), and the list of those text elements with their + confidence values. @param image Input image CV_8UC1 or CV_8UC3 + @param output_text Output text of the tesseract-ocr. - @param component_rects If provided the method will output a list of Rects for the individual - text elements found (e.g. words or text lines). - @param component_texts If provided the method will output a list of text strings for the - recognition of individual text elements found (e.g. words or text lines). - @param component_confidences If provided the method will output a list of confidence values - for the recognition of individual text elements found (e.g. words or text lines). + + @param component_rects If provided the method will output a list of Rects + for the individual text elements found (e.g. words or text lines). + + @param component_texts If provided the method will output a list of text + strings for the recognition of individual text elements found (e.g. words or + text lines). + + @param component_confidences If provided the method will output a list of + confidence values for the recognition of individual text elements found + (e.g. words or text lines). + @param component_level OCR_LEVEL_WORD (by default), or OCR_LEVEL_TEXT_LINE. 
*/ - virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + virtual void run (Mat& image, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0); - virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=0); + virtual void run (Mat& image, Mat& mask, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, + int component_level=0); // aliases for scripting - CV_WRAP String run(InputArray image, int min_confidence, int component_level=0); + CV_WRAP String run (InputArray image, int min_confidence, + int component_level=0); - CV_WRAP String run(InputArray image, InputArray mask, int min_confidence, int component_level=0); + CV_WRAP String run(InputArray image, InputArray mask, + int min_confidence, int component_level=0); CV_WRAP virtual void setWhiteList(const String& char_whitelist) = 0; - /** @brief Creates an instance of the OCRTesseract class. Initializes Tesseract. + /** @brief Creates an instance of the OCRTesseract class. Initializes + * Tesseract. + + * @param datapath the name of the parent directory of tessdata ended with + * "/", or NULL to use the system's default directory. + + * @param language an ISO 639-3 code or NULL will default to "eng". - @param datapath the name of the parent directory of tessdata ended with "/", or NULL to use the - system's default directory. - @param language an ISO 639-3 code or NULL will default to "eng". - @param char_whitelist specifies the list of characters used for recognition. NULL defaults to - "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ". - @param oem tesseract-ocr offers different OCR Engine Modes (OEM), by deffault - tesseract::OEM_DEFAULT is used. See the tesseract-ocr API documentation for other possible - values. - @param psmode tesseract-ocr offers different Page Segmentation Modes (PSM) tesseract::PSM_AUTO - (fully automatic layout analysis) is used. See the tesseract-ocr API documentation for other - possible values. + * @param char_whitelist specifies the list of characters used for + * recognition. NULL defaults to "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ". + + * @param oem tesseract-ocr offers different OCR Engine Modes (OEM), by + * default tesseract::OEM_DEFAULT is used. See the tesseract-ocr API + * documentation for other possible values. + + * @param psmode tesseract-ocr offers different Page Segmentation Modes + * (PSM) tesseract::PSM_AUTO (fully automatic layout analysis) is used. See + * the tesseract-ocr API documentation for other possible values. */ - CV_WRAP static Ptr create(const char* datapath=NULL, const char* language=NULL, - const char* char_whitelist=NULL, int oem=3, int psmode=3); + CV_WRAP static Ptr create (const char* datapath=NULL, + const char* language=NULL, + const char* char_whitelist=NULL, + int oem=3, int psmode=3); }; @@ -147,134 +195,156 @@ enum decoder_mode OCR_DECODER_VITERBI = 0 // Other algorithms may be added }; -/** @brief OCRHMMDecoder class provides an interface for OCR using Hidden Markov Models. +/** @brief OCRHMMDecoder class provides an interface for OCR using Hidden Markov + * Models. 
-@note - - (C++) An example on using OCRHMMDecoder recognition combined with scene text detection can - be found at the webcam_demo sample: - + * @note + * - (C++) An example on using OCRHMMDecoder recognition combined with scene + * text detection can be found at the webcam_demo sample: + * */ -class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR -{ -public: +class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR { + public: /** @brief Callback with the character classifier is made a class. - This way it hides the feature extractor and the classifier itself, so developers can write - their own OCR code. + * This way it hides the feature extractor and the classifier itself, so + * developers can write their own OCR code. - The default character classifier and feature extractor can be loaded using the utility funtion - loadOCRHMMClassifierNM and KNN model provided in - . - */ - class CV_EXPORTS_W ClassifierCallback - { - public: + * The default character classifier and feature extractor can be loaded using + * the utility funtion loadOCRHMMClassifierNM and KNN model provided in + * . + */ + class CV_EXPORTS_W ClassifierCallback{ + public: virtual ~ClassifierCallback() { } - /** @brief The character classifier must return a (ranked list of) class(es) id('s) + /** @brief The character classifier must return a (ranked list of) + * class(es) id('s) - @param image Input image CV_8UC1 or CV_8UC3 with a single letter. - @param out_class The classifier returns the character class categorical label, or list of - class labels, to which the input image corresponds. - @param out_confidence The classifier returns the probability of the input image - corresponding to each classes in out_class. + * @param image Input image CV_8UC1 or CV_8UC3 with a single letter. + * @param out_class The classifier returns the character class + * categorical label, or list of class labels, to which the input image + * corresponds. + + * @param out_confidence The classifier returns the probability of the + * input image corresponding to each classes in out_class. */ - virtual void eval( InputArray image, std::vector& out_class, std::vector& out_confidence); + virtual void eval (InputArray image, std::vector& out_class, + std::vector& out_confidence); }; -public: /** @brief Recognize text using HMM. - Takes binary image on input and returns recognized text in the output_text parameter. Optionally - provides also the Rects for individual text elements found (e.g. words), and the list of those - text elements with their confidence values. + * Takes binary image on input and returns recognized text in the output_text + * parameter. Optionally provides also the Rects for individual text elements + * found (e.g. words), and the list of those text elements with their + * confidence values. - @param image Input binary image CV_8UC1 with a single text line (or word). + * @param image Input binary image CV_8UC1 with a single text line (or word). - @param output_text Output text. Most likely character sequence found by the HMM decoder. + * @param output_text Output text. Most likely character sequence found by + * the HMM decoder. - @param component_rects If provided the method will output a list of Rects for the individual - text elements found (e.g. words). + * @param component_rects If provided the method will output a list of Rects + * for the individual text elements found (e.g. words). - @param component_texts If provided the method will output a list of text strings for the - recognition of individual text elements found (e.g. words). 
+ * @param component_texts If provided the method will output a list of text + * strings for the recognition of individual text elements found (e.g. words) + * . - @param component_confidences If provided the method will output a list of confidence values - for the recognition of individual text elements found (e.g. words). + * @param component_confidences If provided the method will output a list of + * confidence values for the recognition of individual text elements found + * (e.g. words). - @param component_level Only OCR_LEVEL_WORD is supported. - */ - virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, - int component_level=0); + * @param component_level Only OCR_LEVEL_WORD is supported. + */ + virtual void run (Mat& image, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, + int component_level=0); /** @brief Recognize text using HMM. - Takes an image and a mask (where each connected component corresponds to a segmented character) - on input and returns recognized text in the output_text parameter. Optionally - provides also the Rects for individual text elements found (e.g. words), and the list of those - text elements with their confidence values. + * Takes an image and a mask (where each connected component corresponds to a + * segmented character) on input and returns recognized text in the + * output_text parameter. Optionally provides also the Rects for individual + * text elements found (e.g. words), and the list of those text elements with + * their confidence values. - @param image Input image CV_8UC1 or CV_8UC3 with a single text line (or word). - @param mask Input binary image CV_8UC1 same size as input image. Each connected component in mask corresponds to a segmented character in the input image. + * @param image Input image CV_8UC1 or CV_8UC3 with a single text line + * (or word). - @param output_text Output text. Most likely character sequence found by the HMM decoder. + * @param mask Input binary image CV_8UC1 same size as input image. Each + * connected component in mask corresponds to a segmented character in the + * input image. - @param component_rects If provided the method will output a list of Rects for the individual - text elements found (e.g. words). + * @param output_text Output text. Most likely character sequence found by + * the HMM decoder. - @param component_texts If provided the method will output a list of text strings for the - recognition of individual text elements found (e.g. words). + * @param component_rects If provided the method will output a list of Rects + * for the individual text elements found (e.g. words). - @param component_confidences If provided the method will output a list of confidence values - for the recognition of individual text elements found (e.g. words). + * @param component_texts If provided the method will output a list of text + * strings for the recognition of individual text elements found (e.g. words) + * . - @param component_level Only OCR_LEVEL_WORD is supported. - */ - virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, - std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + * @param component_confidences If provided the method will output a list of + * confidence values for the recognition of individual text elements found + * (e.g. 
words). + + * @param component_level Only OCR_LEVEL_WORD is supported. + */ + virtual void run(Mat& image, Mat& mask, std::string& output_text, + std::vector* component_rects=NULL, + std::vector* component_texts=NULL, + std::vector* component_confidences=NULL, int component_level=0); // aliases for scripting - CV_WRAP String run(InputArray image, int min_confidence, int component_level=0); + CV_WRAP String run(InputArray image, + int min_confidence, + int component_level=0); - CV_WRAP String run(InputArray image, InputArray mask, int min_confidence, int component_level=0); + CV_WRAP String run(InputArray image, + InputArray mask, + int min_confidence, + int component_level=0); - /** @brief Creates an instance of the OCRHMMDecoder class. Initializes HMMDecoder. + /** @brief Creates an instance of the OCRHMMDecoder class. Initializes + * HMMDecoder. - @param classifier The character classifier with built in feature extractor. + * @param classifier The character classifier with built in feature + * extractor. - @param vocabulary The language vocabulary (chars when ascii english text). vocabulary.size() - must be equal to the number of classes of the classifier. + * @param vocabulary The language vocabulary (chars when ascii english text) + * . vocabulary.size() must be equal to the number of classes of the + * classifier. - @param transition_probabilities_table Table with transition probabilities between character - pairs. cols == rows == vocabulary.size(). + * @param transition_probabilities_table Table with transition probabilities + * between character pairs. cols == rows == vocabulary.size(). - @param emission_probabilities_table Table with observation emission probabilities. cols == - rows == vocabulary.size(). + * @param emission_probabilities_table Table with observation emission + * probabilities. cols == rows == vocabulary.size(). - @param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available for the moment - (). + * @param mode HMM Decoding algorithm. Only OCR_DECODER_VITERBI is available + * for the moment (). 
*/ - static Ptr create(const Ptr classifier,// The character classifier with built in feature extractor - const std::string& vocabulary, // The language vocabulary (chars when ascii english text) - // size() must be equal to the number of classes - InputArray transition_probabilities_table, // Table with transition probabilities between character pairs - // cols == rows == vocabulari.size() - InputArray emission_probabilities_table, // Table with observation emission probabilities - // cols == rows == vocabulari.size() - decoder_mode mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment) - - CV_WRAP static Ptr create(const Ptr classifier,// The character classifier with built in feature extractor - const String& vocabulary, // The language vocabulary (chars when ascii english text) - // size() must be equal to the number of classes - InputArray transition_probabilities_table, // Table with transition probabilities between character pairs - // cols == rows == vocabulari.size() - InputArray emission_probabilities_table, // Table with observation emission probabilities - // cols == rows == vocabulari.size() - int mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment) - -protected: + static Ptr create( + const Ptr classifier, // The character classifier with built in feature extractor + const std::string& vocabulary, // The language vocabulary (chars when ascii english text) size() must be equal to the number of classes + InputArray transition_probabilities_table, // Table with transition probabilities between character pairs cols == rows == vocabulari.size() + InputArray emission_probabilities_table, // Table with observation emission probabilities cols == rows == vocabulari.size() + decoder_mode mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment) + + CV_WRAP static Ptr create( + const Ptr classifier, // The character classifier with built in feature extractor + const String& vocabulary, // The language vocabulary (chars when ascii english text) size() must be equal to the number of classes + InputArray transition_probabilities_table, // Table with transition probabilities between character pairs cols == rows == vocabulari.size() + InputArray emission_probabilities_table, // Table with observation emission probabilities cols == rows == vocabulari.size() + int mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment) + + protected: Ptr classifier; std::string vocabulary; @@ -283,76 +353,98 @@ class CV_EXPORTS_W OCRHMMDecoder : public BaseOCR decoder_mode mode; }; -/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object. +/** @brief Allow to implicitly load the default character classifier when + * creating an OCRHMMDecoder object. -@param filename The XML or YAML file with the classifier model (e.g. OCRHMM_knn_model_data.xml) + * @param filename The XML or YAML file with the classifier model (e.g. + * OCRHMM_knn_model_data.xml) -The KNN default classifier is based in the scene text recognition method proposed by Lukás Neumann & -Jiri Matas in [Neumann11b]. Basically, the region (contour) in the input image is normalized to a -fixed size, while retaining the centroid and aspect ratio, in order to extract a feature vector -based on gradient orientations along the chain-code of its perimeter. Then, the region is classified -using a KNN model trained with synthetic data of rendered characters with different standard font -types. 
+ * The KNN default classifier is based on the scene text recognition method + * proposed by Lukás Neumann & Jiri Matas in [Neumann11b]. Basically, the region + * (contour) in the input image is normalized to a fixed size, while retaining + * the centroid and aspect ratio, in order to extract a feature vector based on + * gradient orientations along the chain-code of its perimeter. Then, the region + * is classified using a KNN model trained with synthetic data of rendered + * characters with different standard font types. */ +CV_EXPORTS_W Ptr loadOCRHMMClassifierNM ( + const String& filename); -CV_EXPORTS_W Ptr loadOCRHMMClassifierNM(const String& filename); - -/** @brief Allow to implicitly load the default character classifier when creating an OCRHMMDecoder object. +/** @brief Allow to implicitly load the default character classifier when + * creating an OCRHMMDecoder object. -@param filename The XML or YAML file with the classifier model (e.g. OCRBeamSearch_CNN_model_data.xml.gz) + * @param filename The XML or YAML file with the classifier model (e.g. + * OCRBeamSearch_CNN_model_data.xml.gz) -The CNN default classifier is based in the scene text recognition method proposed by Adam Coates & -Andrew NG in [Coates11a]. The character classifier consists in a Single Layer Convolutional Neural Network and -a linear classifier. It is applied to the input image in a sliding window fashion, providing a set of recognitions -at each window location. + * The CNN default classifier is based on the scene text recognition method + * proposed by Adam Coates & Andrew Ng in [Coates11a]. The character classifier + * consists of a Single Layer Convolutional Neural Network and a linear + * classifier. It is applied to the input image in a sliding window fashion, + * providing a set of recognitions at each window location. */ -CV_EXPORTS_W Ptr loadOCRHMMClassifierCNN(const String& filename); +CV_EXPORTS_W Ptr loadOCRHMMClassifierCNN ( + const String& filename); //! @} -/** @brief Utility function to create a tailored language model transitions table from a given list of words (lexicon). - * +/** @brief Utility function to create a tailored language model transitions + * table from a given list of words (lexicon). + * @param vocabulary The language vocabulary (chars when ascii english text). - * + * @param lexicon The list of words that are expected to be found in a particular image. - * - * @param transition_probabilities_table Output table with transition probabilities between character pairs. cols == rows == vocabulary.size(). - * - * The function calculate frequency statistics of character pairs from the given lexicon and fills the output transition_probabilities_table with them. The transition_probabilities_table can be used as input in the OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods. + + * @param transition_probabilities_table Output table with transition + * probabilities between character pairs. cols == rows == vocabulary.size(). + + * The function calculates frequency statistics of character pairs from the given + * lexicon and fills the output transition_probabilities_table with them. The + * transition_probabilities_table can be used as input in the + * OCRHMMDecoder::create() and OCRBeamSearchDecoder::create() methods.
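A minimal sketch of how this utility feeds the decoder factories; the model file name and the 26-letter vocabulary are illustrative only, and a real vocabulary must match the classifier's class count:

    // Build a lexicon-tailored transition table and plug it into an HMM decoder.
    std::string voc = "abcdefghijklmnopqrstuvwxyz";
    std::vector<std::string> lexicon;
    lexicon.push_back("hello");
    lexicon.push_back("world");
    cv::Mat transition_p;
    cv::text::createOCRHMMTransitionsTable(voc, lexicon, transition_p);
    // transition_p is now voc.size() x voc.size(); pair it with an emission table.
    cv::Mat emission_p = cv::Mat::eye((int)voc.size(), (int)voc.size(), CV_64FC1);
    cv::Ptr<cv::text::OCRHMMDecoder> ocr = cv::text::OCRHMMDecoder::create(
        cv::text::loadOCRHMMClassifierNM("OCRHMM_knn_model_data.xml.gz"),
        voc, transition_p, emission_p);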
* @note - * - (C++) An alternative would be to load the default generic language transition table provided in the text module samples folder (created from ispell 42869 english words list) : + * - (C++) An alternative would be to load the default generic language + * transition table provided in the text module samples folder (created + * from ispell 42869 english words list) : * **/ -CV_EXPORTS void createOCRHMMTransitionsTable(std::string& vocabulary, std::vector& lexicon, OutputArray transition_probabilities_table); - -CV_EXPORTS_W Mat createOCRHMMTransitionsTable(const String& vocabulary, std::vector& lexicon); +CV_EXPORTS void createOCRHMMTransitionsTable ( + std::string& vocabulary, std::vector& lexicon, + OutputArray transition_probabilities_table); +CV_EXPORTS_W Mat createOCRHMMTransitionsTable ( + const String& vocabulary, std::vector& lexicon); /* OCR BeamSearch Decoder */ -/** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam Search algorithm. +/** @brief OCRBeamSearchDecoder class provides an interface for OCR using Beam + * Search algorithm. @note - - (C++) An example on using OCRBeamSearchDecoder recognition combined with scene text detection can - be found at the demo sample: + - (C++) An example on using OCRBeamSearchDecoder recognition combined with + scene text detection can be found at the demo sample: */ -class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR -{ -public: + + +/* Forward declaration of class that can be used to generate an OCRBeamSearchDecoder::ClassifierCallbac */ +class TextImageClassifier; + +class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR{ + + public: /** @brief Callback with the character classifier is made a class. - This way it hides the feature extractor and the classifier itself, so developers can write - their own OCR code. + * This way it hides the feature extractor and the classifier itself, so + * developers can write their own OCR code. - The default character classifier and feature extractor can be loaded using the utility funtion - loadOCRBeamSearchClassifierCNN with all its parameters provided in - . + * The default character classifier and feature extractor can be loaded + * using the utility funtion loadOCRBeamSearchClassifierCNN with all its + * parameters provided in + * . */ - class CV_EXPORTS_W ClassifierCallback - { - public: + class CV_EXPORTS_W ClassifierCallback{ + public: virtual ~ClassifierCallback() { } /** @brief The character classifier must return a (ranked list of) class(es) id('s) @@ -364,8 +456,8 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR */ virtual void eval( InputArray image, std::vector< std::vector >& recognition_probabilities, std::vector& oversegmentation ); - int getWindowSize() {return 0;} - int getStepSize() {return 0;} + virtual int getWindowSize() {return 0;} + virtual int getStepSize() {return 0;} }; public: @@ -421,6 +513,7 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR @param beam_size Size of the beam in Beam Search algorithm. 
*/ + static Ptr create(const Ptr classifier,// The character classifier with built in feature extractor const std::string& vocabulary, // The language vocabulary (chars when ascii english text) // size() must be equal to the number of classes InputArray transition_probabilities_table, // Table with transition probabilities between character pairs // cols == rows == vocabulari.size() InputArray emission_probabilities_table, // Table with observation emission probabilities // cols == rows == vocabulari.size() decoder_mode mode = OCR_DECODER_VITERBI); // HMM Decoding algorithm (only Viterbi for the moment) @@ -441,6 +534,44 @@ class CV_EXPORTS_W OCRBeamSearchDecoder : public BaseOCR int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment) int beam_size = 500); // Size of the beam in Beam Search algorithm + /** @brief This method allows plugging a classifier derived from TextImageClassifier into + * OCRBeamSearchDecoder as a ClassifierCallback. + + @param classifier A pointer to a TextImageClassifier descendant + + @param alphabet The language alphabet, one char per symbol. alphabet.size() must be equal to the number of classes + of the classifier. In future editions it should be replaced with a vector of strings. + + @param transition_probabilities_table Table with transition probabilities between character + pairs. cols == rows == alphabet.size(). + + @param emission_probabilities_table Table with observation emission probabilities. cols == + rows == alphabet.size(). + + @param windowWidth The width of the sliding window. The height will + be the height of the image. The windows might be resized to fit the classifier's input by the classifier's + preprocessor. + + @param windowStep The step for the sliding window + + @param mode HMM Decoding algorithm (only Viterbi for the moment) + + @param beam_size Size of the beam in Beam Search algorithm + */ + CV_WRAP static Ptr create(const Ptr classifier, // The character classifier with built in feature extractor String alphabet, // The language alphabet, one char per symbol // size() must be equal to the number of classes InputArray transition_probabilities_table, // Table with transition probabilities between character pairs // cols == rows == alphabet.size() InputArray emission_probabilities_table, // Table with observation emission probabilities // cols == rows == alphabet.size() int windowWidth, // The width of the sliding window. // The height will be the height of the image. The windows might be resized to // fit the classifier's input by the classifier's preprocessor int windowStep = 1, // The step for the sliding window int mode = OCR_DECODER_VITERBI, // HMM Decoding algorithm (only Viterbi for the moment) int beam_size = 500); // Size of the beam in Beam Search algorithm + protected: Ptr classifier; @@ -465,6 +596,349 @@ CV_EXPORTS_W Ptr loadOCRBeamSearchClas //! @} - -} -} + +//Classifiers should provide different backends +//For the moment only Caffe is implemented +enum{ + OCR_HOLISTIC_BACKEND_NONE, + OCR_HOLISTIC_BACKEND_CAFFE +}; + +class TextImageClassifier; + +/** + * @brief The ImagePreprocessor class + */ +class CV_EXPORTS_W ImagePreprocessor{ +protected: + virtual void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels)=0; + +public: + virtual ~ImagePreprocessor(){} + + /** @brief this method provides public access to the preprocessing with respect to a specific + * classifier + * + * This method's main use would be to use the preprocessor without feeding it to a classifier. + * Determining the exact behavior of a preprocessor is the main motivation for this.
+ * + * @param input an image without any constraints + * + * @param output in most cases an image of fixed depth and size, whitened + * + * @param sz the size to which the image would be resized if the preprocessor resizes inputs + * + * @param outputChannels the number of channels for the output image + */ + CV_WRAP void preprocess(InputArray input,OutputArray output,Size sz,int outputChannels); + + /** @brief Creates a functor that only resizes and changes the channels of the input + * without further processing. + * + * @return shared pointer to the generated preprocessor + */ + CV_WRAP static Ptr createResizer(); + + /** @brief Creates a functor that standardizes the pixel values of the input image + * + * @param sigma the standard deviation to which the pixel values are normalized + * + * @return shared pointer to generated preprocessor + */ + CV_WRAP static Ptr createImageStandarizer(double sigma); + + /** @brief Creates a functor that subtracts a given mean image from the input + * + * @param meanImg the mean image to be subtracted + * + * @return shared pointer to generated preprocessor + */ + CV_WRAP static Ptr createImageMeanSubtractor(InputArray meanImg); + + friend class TextImageClassifier; + +}; + +/** @brief Abstract class that implements the classification of text images. + * + * The interface is generic enough to describe any image classifier, and allows + * taking advantage of computing in batches. While word classifiers are the default + * networks, any image classifiers should work. + * + */ +class CV_EXPORTS_W TextImageClassifier +{ +protected: + Size inputGeometry_; + int channelCount_; + Ptr preprocessor_; + /** @brief all image preprocessing is handled here including whitening etc. + * + * @param input the image to be preprocessed for the classifier. If the depth + * is CV_8U, values should be in [0,255]; otherwise values are assumed to be in [0,1] + * + * @param output reference to the image to be fed to the classifier, the preprocessor will + * resize the image to the appropriate size and convert it to the appropriate depth. + * + * The method preprocess should never be used externally, it is up to classify and classifyBatch + * methods to employ it. + */ + virtual void preprocess(const Mat& input,Mat& output); +public: + virtual ~TextImageClassifier() {} + + /** @brief sets the preprocessor applied to inputs before classification + */ + CV_WRAP virtual void setPreprocessor(Ptr ptr); + + /** @brief returns the preprocessor currently in use + */ + CV_WRAP Ptr getPreprocessor(); + + /** @brief produces a class confidence row-vector given an image + */ + CV_WRAP virtual void classify(InputArray image, OutputArray classProbabilities) = 0; + + /** @brief produces a matrix containing class confidence row-vectors given a collection of images + */ + CV_WRAP virtual void classifyBatch(InputArrayOfArrays image, OutputArray classProbabilities) = 0; + + /** @brief simple getter method returning the number of channels each input sample has + */ + CV_WRAP virtual int getInputChannelCount(){return this->channelCount_;} + + /** @brief simple getter method returning the size of the input sample + */ + CV_WRAP virtual Size getInputSize(){return this->inputGeometry_;} + + /** @brief simple getter method returning the size of the output row-vector + */ + CV_WRAP virtual int getOutputSize()=0; + + /** @brief simple getter method returning the size of the minibatches for this classifier. + * If not applicable this method should return 1 + */ + CV_WRAP virtual int getMinibatchSize()=0; + + friend class ImagePreprocessor; +}; + + + +class CV_EXPORTS_W DeepCNN:public TextImageClassifier +{ + /** @brief Class that uses a pretrained Caffe model for word classification.
+ * + * This network is described in detail in: + * Max Jaderberg et al.: Reading Text in the Wild with Convolutional Neural Networks, IJCV 2015 + * http://arxiv.org/abs/1412.1842 + */ +public: + virtual ~DeepCNN() {}; + + /** @brief Constructs a DeepCNN object from a pretrained Caffe model + * + * @param archFilename is the path to the prototxt file containing the deployment model architecture description. + * + * @param weightsFilename is the path to the pretrained weights of the model in binary form. This file can be + * very large, up to 2GB. + * + * @param preprocessor is a pointer to the instance of an ImagePreprocessor implementing the protected preprocess_ method. + * + * @param minibatchSz the maximum number of samples that can be processed in parallel. In practice this parameter + * has an effect only when computing on the GPU and should be set with respect to the memory available in the GPU. + * + * @param backEnd integer parameter selecting the computation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is + * the only option. + */ + CV_WRAP static Ptr create(String archFilename,String weightsFilename,Ptr preprocessor,int minibatchSz=100,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); + + /** @brief Constructs a DeepCNN intended to be used for word spotting. + * + * This method loads a pretrained classifier and couples it with a preprocessor that standardizes pixels with a + * deviation of 113. The architecture file can be downloaded from: + * + * While the weights can be downloaded from: + * + * The words assigned to the network outputs are available at: + * + * + * @param archFilename is the path to the prototxt file containing the deployment model architecture description. + * When employing OCR_HOLISTIC_BACKEND_CAFFE this is the path to the deploy ".prototxt". + * + * @param weightsFilename is the path to the pretrained weights of the model. When employing + * OCR_HOLISTIC_BACKEND_CAFFE this is the path to the ".caffemodel" file. This file can be very large, the + * pretrained DictNet uses 2GB. + * + * @param backEnd integer parameter selecting the computation framework. For now OCR_HOLISTIC_BACKEND_CAFFE is + * the only option. + */ + CV_WRAP static Ptr createDictNet(String archFilename,String weightsFilename,int backEnd=OCR_HOLISTIC_BACKEND_CAFFE); + +}; + + +/** @brief Queries the computation device being used by Caffe + * + * Whether Caffe uses the GPU or the CPU can only be controlled globally. + * This function queries the current state of Caffe. + * If the module is built without Caffe, this method throws an exception. + * + * @return true if Caffe is computing on the GPU, false if Caffe is computing on the CPU + */ +CV_EXPORTS_W bool getCaffeGpuMode(); + +/** @brief Sets the computation device being used by Caffe + * + * Whether Caffe uses the GPU or the CPU can only be controlled globally. + * This function sets that global state. + * If the module is built without Caffe, this method throws an exception. + * + * @param useGpu set to true for Caffe to compute on the GPU, false for Caffe to + * compute on the CPU + */ +CV_EXPORTS_W void setCaffeGpuMode(bool useGpu); + +/** @brief Provides runtime information on whether Caffe support was compiled in. + * + * The text module API is the same regardless of whether Caffe was available or not + * during compilation. When methods that require Caffe are invoked while Caffe support + * is not compiled in, exceptions are thrown.
This method allows testing whether the + * text module was built with Caffe at runtime. + * + * @return true if Caffe support for the text module was provided during compilation, + * false if Caffe was unavailable. + */ +CV_EXPORTS_W bool getCaffeAvailable(); + + +/** @brief OCRHolisticWordRecognizer class provides the functionality of segmented wordspotting. + * Given a predefined vocabulary, a TextImageClassifier is employed to select the most probable + * word given an input image. + * + * This class implements the logic of providing transcriptions given a vocabulary and an image + * classifier. The classifier has to be any TextImageClassifier but the classifier for which this + * class was built is the DictNet. In order to load it the following files should be downloaded: + + * + * + * + */ +class CV_EXPORTS_W OCRHolisticWordRecognizer : public BaseOCR +{ +public: + virtual void run(Mat& image, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + int component_level=OCR_LEVEL_WORD)=0; + + /** @brief Recognize text using a segmentation-based word-spotting/classifier CNN. + + Takes image on input and returns recognized text in the output_text parameter. Optionally + provides also the Rects for individual text elements found (e.g. words), and the list of those + text elements with their confidence values. + + @param image Input image CV_8UC1 or CV_8UC3 + + @param mask is totally ignored and is only available for compatibility reasons + + @param output_text Output text of the word spotting, always one that exists in the dictionary. + + @param component_rects Not applicable for word spotting; can be NULL. If provided, a single element will + be put in the vector. + + @param component_texts Not applicable for word spotting; can be NULL. If provided, a single element will + be put in the vector. + + @param component_confidences Not applicable for word spotting; can be NULL. If provided, a single element will + be put in the vector. + + @param component_level must be OCR_LEVEL_WORD. + */ + + virtual void run(Mat& image, Mat& mask, std::string& output_text, std::vector* component_rects=NULL, + std::vector* component_texts=NULL, std::vector* component_confidences=NULL, + int component_level=OCR_LEVEL_WORD)=0; + + + /** + @brief Method that provides a quick and simple interface to a single word image classification + + @param inputImage an image expected to be CV_8UC1 or CV_8UC3 of any size, assumed to contain a single word + + @param transcription an opencv string that will store the detected word transcription + + @param confidence a double that will be updated with the confidence the classifier has for the selected word + */ + CV_WRAP virtual void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)=0; + + /** + @brief Method that provides a quick and simple interface to a multiple word image classification taking advantage + of the classifier's parallel capabilities. + + @param inputImageList a list of images expected to be CV_8UC1 or CV_8UC3; each image can be of any size and is assumed + to contain a single word. + + @param transcriptions a vector of opencv strings that will store the detected word transcriptions, one for each + input image + + @param confidences a vector of doubles that will be updated with the confidence the classifier has for each of the + selected words.
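A minimal word-spotting sketch against this interface; all three file names below are hypothetical stand-ins for the DictNet downloads mentioned in the class documentation:

    // Load the DictNet-based word spotter and classify one cropped word image.
    cv::Ptr<cv::text::OCRHolisticWordRecognizer> wordSpotter =
        cv::text::OCRHolisticWordRecognizer::create(
            "dictnet_vgg_deploy.prototxt",  // assumed architecture file
            "dictnet_vgg.caffemodel",       // assumed pretrained weights (large file)
            "dictnet_vgg_labels.txt");      // assumed vocabulary, one word per line
    cv::Mat word = cv::imread("word.png");  // hypothetical cropped word image
    cv::String transcription;
    double confidence;
    wordSpotter->recogniseImage(word, transcription, confidence);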
+ */ + CV_WRAP virtual void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector& transcriptions,CV_OUT std::vector& confidences)=0; + + + /** + @brief simple getter for the vocabulary employed + */ + CV_WRAP virtual const std::vector& getVocabulary()=0; + + /** @brief simple getter for the preprocessing functor + */ + CV_WRAP virtual Ptr getClassifier()=0; + + /** @brief Creates an instance of the OCRHolisticWordRecognizer class. + + @param classifierPtr an instance of TextImageClassifier, normaly a DeepCNN instance + + @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line + in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize + of the classifier. + */ + CV_WRAP static Ptr create(Ptr classifierPtr,String vocabularyFilename); + + + /** @brief Creates an instance of the OCRHolisticWordRecognizer class and implicitly also a DeepCNN classifier. + + @param modelArchFilename the relative or absolute path to the prototxt file describing the classifiers architecture. + + @param modelWeightsFilename the relative or absolute path to the file containing the pretrained weights of the model in caffe-binary form. + + @param vocabularyFilename the relative or absolute path to the file containing all words in the vocabulary. Each text line + in the file is assumed to be a single word. The number of words in the vocabulary must be exactly the same as the outputSize + of the classifier. + */ + CV_WRAP static Ptr create(String modelArchFilename, String modelWeightsFilename, String vocabularyFilename); + + /** @brief + * + * @param classifierPtr + * + * @param vocabulary + */ + CV_WRAP static Ptr create(Ptr classifierPtr,const std::vector& vocabulary); + + /** @brief + * + * @param modelArchFilename + * + * @param modelWeightsFilename + * + * @param vocabulary + */ + CV_WRAP static Ptr create (String modelArchFilename, String modelWeightsFilename, const std::vector& vocabulary); +}; + + +}//namespace text +}//namespace cv + + #endif // _OPENCV_TEXT_OCR_HPP_ diff --git a/modules/text/include/opencv2/text/text_synthesizer.hpp b/modules/text/include/opencv2/text/text_synthesizer.hpp new file mode 100644 index 00000000000..ce898e84639 --- /dev/null +++ b/modules/text/include/opencv2/text/text_synthesizer.hpp @@ -0,0 +1,378 @@ +/*M////////////////////////////////////////////////////////////////////////////////////////// +// +// IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. +// +// By downloading, copying, installing or using the software you agree to this license. +// If you do not agree to this license, do not download, install, +// copy or use the software. +// +// +// License Agreement +// For Open Source Computer Vision Library +// +// Copyright (C) 2000-2008, Intel Corporation, all rights reserved. +// Copyright (C) 2009, Willow Garage Inc., all rights reserved. +// Copyright (C) 2013, OpenCV Foundation, all rights reserved. +// Third party copyrights are property of their respective owners. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// +// * Redistribution's of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. 
+// +// * Redistribution's in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// * The name of the copyright holders may not be used to endorse or promote products +// derived from this software without specific prior written permission. +// +// This software is provided by the copyright holders and contributors "as is" and +// any express or implied warranties, including, but not limited to, the implied +// warranties of merchantability and fitness for a particular purpose are disclaimed. +// In no event shall the Intel Corporation or contributors be liable for any direct, +// indirect, incidental, special, exemplary, or consequential damages +// (including, but not limited to, procurement of substitute goods or services; +// loss of use, data, or profits; or business interruption) however caused +// and on any theory of liability, whether in contract, strict liability, +// or tort (including negligence or otherwise) arising in any way out of +// the use of this software, even if advised of the possibility of such damage. +// +//M*/ + +#ifndef TEXT_SYNTHESIZER_HPP +#define TEXT_SYNTHESIZER_HPP + + + +namespace cv +{ +namespace text +{ + +enum{ + //based on QFontDatabase::WritingSystem + //Qt is the sole backend + CV_TEXT_SYNTHESIZER_SCRIPT_ANY, + CV_TEXT_SYNTHESIZER_SCRIPT_LATIN, + CV_TEXT_SYNTHESIZER_SCRIPT_GREEK, + CV_TEXT_SYNTHESIZER_SCRIPT_CYRILLIC, + CV_TEXT_SYNTHESIZER_SCRIPT_ARMENIAN, + CV_TEXT_SYNTHESIZER_SCRIPT_ARABIC, + CV_TEXT_SYNTHESIZER_SCRIPT_HEBREW, + CV_TEXT_SYNTHESIZER_SCRIPT_SYRIAC, + CV_TEXT_SYNTHESIZER_SCRIPT_THAANA, + CV_TEXT_SYNTHESIZER_SCRIPT_DEVANAGARI, + CV_TEXT_SYNTHESIZER_SCRIPT_BENGALI, + CV_TEXT_SYNTHESIZER_SCRIPT_GURMUKHI, + CV_TEXT_SYNTHESIZER_SCRIPT_GUJARATI, + CV_TEXT_SYNTHESIZER_SCRIPT_ORIYA, + CV_TEXT_SYNTHESIZER_SCRIPT_TAMIL, + CV_TEXT_SYNTHESIZER_SCRIPT_TELUGU, + CV_TEXT_SYNTHESIZER_SCRIPT_KANNADA, + CV_TEXT_SYNTHESIZER_SCRIPT_MALAYALAM, + CV_TEXT_SYNTHESIZER_SCRIPT_SINHALA, + CV_TEXT_SYNTHESIZER_SCRIPT_THAI, + CV_TEXT_SYNTHESIZER_SCRIPT_LAO, + CV_TEXT_SYNTHESIZER_SCRIPT_TIBETAN, + CV_TEXT_SYNTHESIZER_SCRIPT_MYANMAR, + CV_TEXT_SYNTHESIZER_SCRIPT_GEORGIAN, + CV_TEXT_SYNTHESIZER_SCRIPT_KHMER, + CV_TEXT_SYNTHESIZER_SCRIPT_CHINESE_SIMPLIFIED, + CV_TEXT_SYNTHESIZER_SCRIPT_CHINESE_TRADITIONAL, + CV_TEXT_SYNTHESIZER_SCRIPT_JAPANESE, + CV_TEXT_SYNTHESIZER_SCRIPT_KOREAM, + CV_TEXT_SYNTHESIZER_SCRIPT_VIETNAMESE +}; + +/** @brief class that renders synthetic text images for training a CNN on + * word spotting + * + * This functionallity is based on "Synthetic Data and Artificial Neural + * Networks for Natural Scene Text Recognition" by Max Jaderberg. 
+ * available at + * + * @note + * - (Python) a demo generating some samples in Greek can be found in: + * + */ +class CV_EXPORTS_W TextSynthesizer{ + protected: + int resHeight_; + int maxResWidth_; + + double underlineProbabillity_; + double italicProbabillity_; + double boldProbabillity_; + double maxPerspectiveDistortion_; + + double shadowProbabillity_; + double maxShadowOpacity_; + int maxShadowSize_; + int maxShadowHoffset_; + int maxShadowVoffset_; + + double borderProbabillity_; + int maxBorderSize_; + + double curvingProbabillity_; + double maxHeightDistortionPercentage_; + double maxCurveArch_; + + double finalBlendAlpha_; + double finalBlendProb_; + + double compressionNoiseProb_; + TextSynthesizer(int maxSampleWidth,int sampleHeight); + public: + CV_WRAP int getMaxSampleWidth () const {return maxResWidth_;} + CV_WRAP int getSampleHeight () const {return resHeight_;} + + CV_WRAP double getUnderlineProbabillity () const {return underlineProbabillity_;} + CV_WRAP double getItalicProballity () const {return italicProbabillity_;} + CV_WRAP double getBoldProbabillity () const {return boldProbabillity_;} + CV_WRAP double getMaxPerspectiveDistortion () const {return maxPerspectiveDistortion_;} + + CV_WRAP double getShadowProbabillity () const {return shadowProbabillity_;} + CV_WRAP double getMaxShadowOpacity () const {return maxShadowOpacity_;} + CV_WRAP int getMaxShadowSize () const {return maxShadowSize_;} + CV_WRAP int getMaxShadowHoffset () const {return maxShadowHoffset_;} + CV_WRAP int getMaxShadowVoffset () const {return maxShadowVoffset_;} + + CV_WRAP double getBorderProbabillity () const {return borderProbabillity_;} + CV_WRAP int getMaxBorderSize () const {return maxBorderSize_;} + + CV_WRAP double getCurvingProbabillity () const {return curvingProbabillity_;} + CV_WRAP double getMaxHeightDistortionPercentage () const {return maxHeightDistortionPercentage_;} + CV_WRAP double getMaxCurveArch () const {return maxCurveArch_;} + CV_WRAP double getBlendAlpha () const {return finalBlendAlpha_;} + CV_WRAP double getBlendProb () const {return finalBlendProb_;} + CV_WRAP double getCompressionNoiseProb () const {return compressionNoiseProb_;} + + /** + * @param v the probabillity the text will be generated with an underlined font + */ + CV_WRAP void setUnderlineProbabillity (double v) {CV_Assert(v >= 0 && v <= 1); underlineProbabillity_ = v;} + + /** + * @param v the probabillity the text will be generated with italic font instead of regular + */ + CV_WRAP void setItalicProballity (double v) {CV_Assert(v >= 0 && v <= 1); italicProbabillity_ = v;} + + /** + * @param v the probabillity the text will be generated with italic font instead of regular + */ + CV_WRAP void setBoldProbabillity (double v) {CV_Assert(v >= 0 && v <= 1);boldProbabillity_ = v;} + + /** Perspective deformation is performed by calculating a homgraphy on a square whose edges + * have moved randomly inside it. + + * @param v the percentage of the side of a ractangle each point is allowed moving + */ + CV_WRAP void setMaxPerspectiveDistortion (double v) {CV_Assert(v >= 0 && v < 50); maxPerspectiveDistortion_ = v;} + + /** + * @param v the probabillity a shadow will apear under the text. 
+ */ + CV_WRAP void setShadowProbabillity (double v) {CV_Assert(v >= 0 && v <= 1); shadowProbabillity_ = v;} + + /** + * @param v the alpha value of the text shadow will be sampled uniformly between 0 and v + */ + CV_WRAP void setMaxShadowOpacity (double v) {CV_Assert(v >= 0 && v <= 1);maxShadowOpacity_ = v;} + + /** + * @param v the maximum size of the shadow in pixels. + */ + CV_WRAP void setMaxShadowSize (int v) {maxShadowSize_ = v;} + + /** + * @param v the maximum number of pixels the shadow can be horizontaly off-center. + */ + CV_WRAP void setMaxShadowHoffset (int v) {maxShadowHoffset_ = v;} + + /** + * @param v the maximum number of pixels the shadow can be vertically off-center. + */ + CV_WRAP void setMaxShadowVoffset (int v) {maxShadowVoffset_ = v;} + + /** + * @param v the probabillity of a border apearing around the text as oposed to shadows, + * borders are always opaque and centered. + */ + CV_WRAP void setBorderProbabillity (double v) {CV_Assert(v >= 0 && v <= 1); borderProbabillity_ = v;} + + /** + * @param v the size in pixels used for border before geometric distortions. + */ + CV_WRAP void setMaxBorderSize (int v) {maxBorderSize_ = v;} + + /** + * @param v the probabillity the text will be curved. + */ + CV_WRAP void setCurvingProbabillity (double v) {CV_Assert(v >= 0 && v <= 1);curvingProbabillity_ = v;} + + /** + * @param v the maximum effect curving will have as a percentage of the samples height + */ + CV_WRAP void setMaxHeightDistortionPercentage (double v) {CV_Assert(v >= 0 && v <= 100);maxHeightDistortionPercentage_ = v;} + + /** + * @param v the arch in radians whose cosine will curve the text + */ + CV_WRAP void setMaxCurveArch (double v) {maxCurveArch_ = v;} + + /** + * @param v the maximum alpha used when blending text to the background with opacity + */ + CV_WRAP void setBlendAlpha (double v) {CV_Assert(v >= 0 && v <= 1); finalBlendAlpha_ = v;} + + /** + * @param v the probability the text will be blended with the background with alpha blending. + */ + CV_WRAP void setBlendProb (double v) {CV_Assert(v >= 0 && v <= 1); finalBlendProb_ = v;} + + /** + * @param v the probability the sample will be distorted by compression artifacts + */ + CV_WRAP void setCompressionNoiseProb (double v) {CV_Assert(v >= 0 && v <= 1); compressionNoiseProb_ = v;} + + + /** @brief adds ttf fonts to the Font Database system + * + * Note: for the moment adding non system fonts in X11 systems is not an option. + * + * Fonts should be added to the system if the are to be used with the syntheciser + * + * @param fntList a list of TTF files to be incorporated in to the system. + */ + CV_WRAP virtual void addFontFiles (const std::vector& fntList) = 0; + + /** @brief retrieves the font family names that are beeing used by the text + * synthesizer + * + * @return a list of strings with the names from which fonts are sampled. + */ + CV_WRAP virtual std::vector listAvailableFonts () const = 0; + + /** @brief updates retrieves the font family names that are randomly sampled + * + * This function indirectly allows you to define arbitrary font occurence + * probabilities. Since fonts are uniformly sampled from this list if a font + * is repeated, its occurence probabillity doubles. + * + * @param fntList a list of strings with the family names from which fonts + * are sampled. Only font families available in the system can be added. 
+     */
+    CV_WRAP virtual void modifyAvailableFonts (std::vector<String>& fntList) = 0;
+
+    /** @brief appends an image to the collection of images from which
+     * backgrounds are sampled.
+     *
+     * This function indirectly allows you to define arbitrary occurrence
+     * probabilities: since background images are uniformly sampled from this
+     * list, repeating an image doubles its occurrence probability.
+     *
+     * @param image an image to be inserted. It should be an 8UC3 matrix that
+     * is at least as big as the generated samples.
+     */
+    CV_WRAP virtual void addBgSampleImage (const Mat& image) = 0;
+
+    /** @brief provides the data from which text colors are sampled
+     *
+     * @param clusters an 8UC3 matrix with three columns and N rows
+     */
+    CV_WRAP virtual void getColorClusters (CV_OUT Mat& clusters) const = 0;
+
+    /** @brief defines the data from which text colors are sampled.
+     *
+     * Text has three color parameters; in order to sample a joint distribution
+     * instead of three independent ones, colors are uniformly sampled as color
+     * triplets from a fixed collection.
+     * This function indirectly allows you to define arbitrary occurrence
+     * probabilities for every triplet by repeating it or by populating the
+     * collection with more samples.
+     *
+     * @param clusters a matrix that must be 8UC3, must have 3 columns and any
+     * number of rows. Text color is the first column, border color is the
+     * second column, and shadow color is the third column.
+     */
+    CV_WRAP virtual void setColorClusters (Mat clusters) = 0;
+
+    /** @brief provides a randomly selected background patch exactly as it is
+     * provided to the text synthesizer
+     *
+     * @param sample a result variable containing an 8UC3 matrix.
+     */
+    CV_WRAP virtual void generateBgSample (CV_OUT Mat& sample) = 0;
+
+    /** @brief provides the randomly rendered text with border and shadow.
+     *
+     * @param caption the string which will be rendered. Multilingual strings in
+     * UTF-8 are supported, but some fonts might not render them. The synthesizer
+     * should be created with a specific script so that only fonts guaranteed to
+     * render that script are used.
+     *
+     * @param sample an out variable containing a 32FC3 matrix with the rendered
+     * text, including border and shadow.
+     *
+     * @param sampleMask a result parameter containing the alpha values, which
+     * are useful for overlaying the text sample on other images.
+     */
+    CV_WRAP virtual void generateTxtSample (String caption, CV_OUT Mat& sample, CV_OUT Mat& sampleMask) = 0;
+
+
+    /** @brief generates a random text sample given a string
+     *
+     * This is the principal function of the text synthesizer.
+     *
+     * @param caption the transcription to be written.
+     *
+     * @param sample the resulting text sample.
+     */
+    CV_WRAP virtual void generateSample (String caption, CV_OUT Mat& sample) = 0;
+
+    /** @brief returns the name of the script being used
+     *
+     * @return a string with the name of the script
+     */
+    CV_WRAP virtual String getScriptName () = 0;
+
+    /** @brief returns the random seed used by the synthesizer
+     *
+     * @param res a 1 x 8 uint8 matrix receiving the state of the random number
+     * generator.
+     */
+    CV_WRAP virtual void getRandomSeed (OutputArray res) const = 0;
+
+    /** @brief sets the random seed used by the synthesizer
+     *
+     * @param state a 1 x 8 matrix of uint8 containing the random state as
+     * returned by getRandomSeed().
+     */
+    CV_WRAP virtual void setRandomSeed (Mat state) = 0;
+
+    /** @brief public constructor for a synthesizer
+     *
+     * This constructor assigns only immutable properties of the synthesizer.
+     *
+     * @param sampleHeight the height of final samples in pixels
+     *
+     * @param maxWidth the maximum width of a sample. Any text requiring more
+     * width to be rendered will be ignored.
+     *
+     * @param script an enumeration used to constrain the available fonts to
+     * those able to render strings in that script.
+     */
+    CV_WRAP static Ptr<TextSynthesizer> create (int sampleHeight = 50,
+                                                int maxWidth = 600,
+                                                int script = CV_TEXT_SYNTHESIZER_SCRIPT_ANY);
+
+    virtual ~TextSynthesizer () {}
+};
+
+
+}//text
+}//cv
+
+#endif // TEXT_SYNTHESIZER_HPP
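Since the header above only declares the interface, a brief usage sketch may help. The snippet below is illustrative only (it is not part of the patch) and assumes the module was built with Qt5 so that system fonts are available:

    #include "opencv2/text/text_synthesizer.hpp"
    #include "opencv2/highgui.hpp"

    int main(){
        // create() is the factory declared above: 50px high samples, 600px max width
        cv::Ptr<cv::text::TextSynthesizer> synth =
                cv::text::TextSynthesizer::create(50, 600, CV_TEXT_SYNTHESIZER_SCRIPT_LATIN);
        synth->setCurvingProbabillity(0.2);   // note the identifier spellings used by the API
        synth->setShadowProbabillity(0.5);
        cv::Mat sample;                       // generateSample() outputs a 32FC3 matrix in [0,1]
        synth->generateSample("OpenCV", sample);
        sample.convertTo(sample, CV_8UC3, 255);
        cv::imwrite("sample.png", sample);
        return 0;
    }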
" << std::endl; + out << " Caffe Model files (dictnet_vgg.caffemodel, dictnet_vgg_deploy.prototxt, dictnet_vgg_labels.txt)"< imageList; + for(int imageIdx=2;imageIdx cnn=cv::text::DeepCNN::createDictNet( + "dictnet_vgg_deploy.prototxt","dictnet_vgg.caffemodel"); + + cv::Ptr wordSpotter= + cv::text::OCRHolisticWordRecognizer::create(cnn,"dictnet_vgg_labels.txt"); + + std::vector wordList; + std::vector outProbabillities; + wordSpotter->recogniseImageBatch(imageList,wordList,outProbabillities); + + std::ofstream out; + out.open(argv[1]); + for(int imgIdx=0;imgIdx #include @@ -189,7 +190,6 @@ class OCRBeamSearchDecoderImpl : public OCRBeamSearchDecoder ~OCRBeamSearchDecoderImpl() { } - void run( Mat& src, Mat& mask, string& out_sequence, @@ -211,7 +211,6 @@ class OCRBeamSearchDecoderImpl : public OCRBeamSearchDecoder vector* component_confidences, int component_level) { - CV_Assert( (src.type() == CV_8UC1) || (src.type() == CV_8UC3) ); CV_Assert( (src.cols > 0) && (src.rows > 0) ); CV_Assert( component_level == OCR_LEVEL_WORD ); @@ -228,15 +227,16 @@ class OCRBeamSearchDecoderImpl : public OCRBeamSearchDecoder cvtColor(src,src,COLOR_RGB2GRAY); } - // TODO if input is a text line (not a word) we may need to split into words here! // do sliding window classification along a croped word image classifier->eval(src, recognition_probabilities, oversegmentation); // if the number of oversegmentation points found is less than 2 we can not do nothing!! - if (oversegmentation.size() < 2) return; - + if (oversegmentation.size() < 2){ + out_sequence="###";//TODO find the output class transcription for the single window + return; + } //NMS of recognitions double last_best_p = 0; @@ -338,9 +338,17 @@ class OCRBeamSearchDecoderImpl : public OCRBeamSearchDecoder double lp = score_segmentation( beam[0].segmentation, out_sequence ); // fill other (dummy) output parameters - component_rects->push_back(Rect(0,0,src.cols,src.rows)); - component_texts->push_back(out_sequence); - component_confidences->push_back((float)exp(lp)); + if(component_rects!=NULL){ + component_rects->push_back(Rect(0,0,src.cols,src.rows)); + } + + if(component_texts!=NULL){ + component_texts->push_back(out_sequence); + } + + if(component_confidences!=NULL){ + component_confidences->push_back((float)exp(lp)); + } return; } @@ -777,10 +785,116 @@ double OCRBeamSearchClassifierCNN::eval_feature(Mat& feature, double* prob_estim } Ptr loadOCRBeamSearchClassifierCNN(const String& filename) - { return makePtr(std::string(filename)); } + +/* This class is used to bridge the gap between TextImageClassifier and + * OCRBeamSearchDecoder::ClassifierCallback. In practice it implements the logic + * of invocking a TextImageClassifier in a sliding window. Eventually this functionality + * should be moved inside OCRBeamSearchDecoder. The class has no footprint in public API. + * The method could also provide compatibillitywith the OCRHMMDecoder::ClassifierCallback + * but a letter segmenter will be needed. 
diff --git a/modules/text/src/ocr_beamsearch_decoder.cpp b/modules/text/src/ocr_beamsearch_decoder.cpp
--- a/modules/text/src/ocr_beamsearch_decoder.cpp
+++ b/modules/text/src/ocr_beamsearch_decoder.cpp
@@ ... @@
 #include
@@ -189,7 +190,6 @@ class OCRBeamSearchDecoderImpl : public OCRBeamSearchDecoder
     ~OCRBeamSearchDecoderImpl()
     {
     }
-
     void run( Mat& src,
               Mat& mask,
               string& out_sequence,
@@ -211,7 +211,6 @@ class OCRBeamSearchDecoderImpl : public OCRBeamSearchDecoder
               vector<float>* component_confidences,
               int component_level)
     {
-
         CV_Assert( (src.type() == CV_8UC1) || (src.type() == CV_8UC3) );
         CV_Assert( (src.cols > 0) && (src.rows > 0) );
         CV_Assert( component_level == OCR_LEVEL_WORD );
@@ -228,15 +227,16 @@ class OCRBeamSearchDecoderImpl : public OCRBeamSearchDecoder
             cvtColor(src,src,COLOR_RGB2GRAY);
         }
-
         // TODO if input is a text line (not a word) we may need to split into words here!
 
         // do sliding window classification along a cropped word image
         classifier->eval(src, recognition_probabilities, oversegmentation);
 
         // if the number of oversegmentation points found is less than 2 we cannot do anything!!
-        if (oversegmentation.size() < 2) return;
-
+        if (oversegmentation.size() < 2){
+            out_sequence="###"; //TODO find the output class transcription for the single window
+            return;
+        }
 
         //NMS of recognitions
         double last_best_p = 0;
@@ -338,9 +338,17 @@ class OCRBeamSearchDecoderImpl : public OCRBeamSearchDecoder
         double lp = score_segmentation( beam[0].segmentation, out_sequence );
 
         // fill other (dummy) output parameters
-        component_rects->push_back(Rect(0,0,src.cols,src.rows));
-        component_texts->push_back(out_sequence);
-        component_confidences->push_back((float)exp(lp));
+        if(component_rects!=NULL){
+            component_rects->push_back(Rect(0,0,src.cols,src.rows));
+        }
+
+        if(component_texts!=NULL){
+            component_texts->push_back(out_sequence);
+        }
+
+        if(component_confidences!=NULL){
+            component_confidences->push_back((float)exp(lp));
+        }
 
         return;
     }
@@ -777,10 +785,116 @@ double OCRBeamSearchClassifierCNN::eval_feature(Mat& feature, double* prob_estim
 }
 
 Ptr<OCRBeamSearchDecoder::ClassifierCallback> loadOCRBeamSearchClassifierCNN(const String& filename)
-
 {
     return makePtr<OCRBeamSearchClassifierCNN>(std::string(filename));
 }
+
+/* This class is used to bridge the gap between TextImageClassifier and
+ * OCRBeamSearchDecoder::ClassifierCallback. In practice it implements the logic
+ * of invoking a TextImageClassifier over a sliding window. Eventually this functionality
+ * should be moved inside OCRBeamSearchDecoder. The class has no footprint in the public API.
+ * The class could also provide compatibility with the OCRHMMDecoder::ClassifierCallback,
+ * but a letter segmenter would be needed.
+ */
+class TextImageClassifierBeamSearchCallback: public OCRBeamSearchDecoder::ClassifierCallback
+    //, public OCRHMMDecoder::ClassifierCallback // A letter segmenter will be needed
+{
+    //TODO: once opencv supports "enable_shared_from_this" this class should be removed from
+    //the public API (ocr.hpp) and a single method added in TextImageClassifier returning a
+    //Ptr<OCRBeamSearchDecoder::ClassifierCallback>
+protected:
+    int stepSize_;
+    int windowWidth_;
+    Ptr<TextImageClassifier> classifier_;
+public:
+    virtual ~TextImageClassifierBeamSearchCallback() { }
+
+    TextImageClassifierBeamSearchCallback(Ptr<TextImageClassifier> classifier,int stepSize,int windowWidth)
+        :stepSize_(stepSize),windowWidth_(windowWidth),classifier_(classifier)
+    {
+        if(windowWidth_<=0)
+        {
+            windowWidth_=classifier_->getInputSize().width;
+        }
+    }
+
+    virtual void eval( InputArray _img, std::vector< std::vector<double> >& recognitionProbabilities, std::vector<int>& oversegmentation )
+    {
+
+        if (!recognitionProbabilities.empty())
+        {
+            for (size_t i=0; i<recognitionProbabilities.size(); i++)
+            {
+                recognitionProbabilities[i].clear();
+            }
+        }
+        oversegmentation.clear();
+
+        Mat img=_img.getMat();
+        Mat windowProbabilities;
+        std::vector<Mat> windowList;
+        int counter=0;
+
+        for(int x=0;x+windowWidth_<=img.cols;x+=stepSize_)
+        {
+            windowList.push_back(img.colRange(x,x+windowWidth_));
+            oversegmentation.push_back(counter++);
+        }
+
+        this->classifier_->classifyBatch(windowList,windowProbabilities);
+        recognitionProbabilities.resize(windowProbabilities.rows);
+        for(int windowNum=0;windowNum<windowProbabilities.rows;windowNum++)
+        {
+            recognitionProbabilities[windowNum].resize(windowProbabilities.cols-2);
+            switch(windowProbabilities.depth())
+            {
+            case CV_64F:
+                for(int clNum=2;clNum<windowProbabilities.cols;clNum++)
+                {
+                    recognitionProbabilities[windowNum][clNum-2]=double(windowProbabilities.at<double>(windowNum,clNum));//+.02;
+                }
+                break;
+            case CV_32F:
+                for(int clNum=2;clNum<windowProbabilities.cols;clNum++)
+                {
+                    recognitionProbabilities[windowNum][clNum-2]=double(windowProbabilities.at<float>(windowNum,clNum));//+.02;
+                }
+                break;
+            default:
+                CV_Error(Error::StsError,"The network outputs should be either float or double!");
+            }
+        }
+
+    }
+
+    virtual int getWindowSize(){return windowWidth_;}
+
+    virtual int getStepSize(){return stepSize_;}
+
+    static Ptr<TextImageClassifierBeamSearchCallback> create(Ptr<TextImageClassifier> classifier,int stepSize=8,int windowWidth=-1);
+};
+
+
+Ptr<OCRBeamSearchDecoder> OCRBeamSearchDecoder::create(const Ptr<TextImageClassifier> classifier,
+                                                       String alphabet,
+                                                       InputArray transitionProbabilitiesTable,
+                                                       InputArray emissionProbabilitiesTable,
+                                                       int windowWidth,
+                                                       int windowStep,
+                                                       int mode,
+                                                       int beamSize){
+    Ptr<OCRBeamSearchDecoder::ClassifierCallback> callback=
+            Ptr<OCRBeamSearchDecoder::ClassifierCallback>(new
+                    TextImageClassifierBeamSearchCallback(classifier,windowStep,windowWidth));
+
+    return Ptr<OCRBeamSearchDecoder>(new OCRBeamSearchDecoderImpl(callback,
+                                                                  alphabet,
+                                                                  transitionProbabilitiesTable,
+                                                                  emissionProbabilitiesTable,
+                                                                  decoder_mode(mode),
+                                                                  beamSize)
+                                     );
+}
+
+
+
 }
 }
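The create() overload just added lets any TextImageClassifier drive the beam-search decoder through the sliding-window callback above. A hypothetical end-to-end invocation (illustrative only; the character-level model file names are placeholders, not files shipped with the module, and the window geometry is an assumption):

    #include "opencv2/text.hpp"
    #include "opencv2/highgui.hpp"

    int main(){
        using namespace cv::text;
        Ptr<TextImageClassifier> charClassifier = DeepCNN::create(
                "char_net_deploy.prototxt", "char_net.caffemodel",
                ImagePreprocessor::createResizer(), 100, OCR_HOLISTIC_BACKEND_CAFFE);
        cv::String alphabet = "##0123456789AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz";
        cv::Mat transition_p = cv::Mat::eye(62, 62, CV_64FC1); // language model, as in the samples
        cv::Mat emission_p   = cv::Mat::eye(62, 62, CV_64FC1);
        Ptr<OCRBeamSearchDecoder> decoder = OCRBeamSearchDecoder::create(
                charClassifier, alphabet, transition_p, emission_p,
                32 /*windowWidth*/, 8 /*windowStep*/, OCR_DECODER_VITERBI, 50 /*beamSize*/);
        cv::Mat word = cv::imread("word.png", cv::IMREAD_GRAYSCALE);
        std::string transcription;
        decoder->run(word, transcription);
        return 0;
    }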
diff --git a/modules/text/src/ocr_holistic.cpp b/modules/text/src/ocr_holistic.cpp
new file mode 100644
index 00000000000..3f92fa0eb83
--- /dev/null
+++ b/modules/text/src/ocr_holistic.cpp
@@ -0,0 +1,636 @@
+#include "precomp.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/core.hpp"
+
+
+#include <algorithm>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+
+#ifdef HAVE_CAFFE
+#include "caffe/caffe.hpp"
+#endif
+
+namespace cv { namespace text {
+
+//Maybe OpenCV has a routine better suited
+inline bool fileExists (String filename) {
+    std::ifstream f(filename.c_str());
+    return f.good();
+}
+
+//************************************************************************************
+//****************** ImagePreprocessor *******************************************
+//************************************************************************************
+
+void ImagePreprocessor::preprocess(InputArray input,OutputArray output,Size sz,int outputChannels){
+    Mat inpImg=input.getMat();
+    Mat outImg;
+    this->preprocess_(inpImg,outImg,sz,outputChannels);
+    outImg.copyTo(output);
+}
+
+
+class ResizerPreprocessor: public ImagePreprocessor{
+protected:
+    void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){
+        //TODO put all the logic of channel and depth conversions in ImageProcessor class
+        CV_Assert(outputChannels==1 || outputChannels==3);
+        CV_Assert(input.channels()==1 || input.channels()==3);
+        if(input.channels()!=outputChannels)
+        {
+            Mat tmpInput;
+            if(outputChannels==1){
+                cvtColor(input,tmpInput,COLOR_BGR2GRAY);
+                if(input.depth()==CV_8U)
+                {
+                    tmpInput.convertTo(output,CV_32FC1,1/255.0);
+                }else
+                {//Assuming values are at the desired [0,1] range
+                    tmpInput.convertTo(output, CV_32FC1);
+                }
+            }else
+            {
+                cvtColor(input,tmpInput,COLOR_GRAY2BGR);
+                if(input.depth()==CV_8U)
+                {
+                    tmpInput.convertTo(output,CV_32FC3,1/255.0);
+                }else
+                {//Assuming values are at the desired [0,1] range
+                    tmpInput.convertTo(output, CV_32FC3);
+                }
+            }
+        }else
+        {
+            if(input.channels()==1)
+            {
+                if(input.depth()==CV_8U)
+                {
+                    input.convertTo(output, CV_32FC1,1/255.0);
+                }else
+                {//Assuming values are at the desired [0,1] range
+                    input.convertTo(output, CV_32FC1);
+                }
+            }else
+            {
+                if(input.depth()==CV_8U){
+                    input.convertTo(output, CV_32FC3,1/255.0);
+                }else
+                {//Assuming values are at the desired [0,1] range
+                    input.convertTo(output, CV_32FC3);
+                }
+            }
+        }
+        if(outputSize.width!=0 && outputSize.height!=0)
+        {
+            resize(output,output,outputSize);
+        }
+    }
+public:
+    ResizerPreprocessor(){}
+    ~ResizerPreprocessor(){}
+};
+
+class StandarizerPreprocessor: public ImagePreprocessor{
+protected:
+    double sigma_;
+    void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){
+        //TODO put all the logic of channel and depth conversions in ImageProcessor class
+        CV_Assert(outputChannels==1 || outputChannels==3);
+        CV_Assert(input.channels()==1 || input.channels()==3);
+        if(input.channels()!=outputChannels)
+        {
+            Mat tmpInput;
+            if(outputChannels==1)
+            {
+                cvtColor(input,tmpInput,COLOR_BGR2GRAY);
+                if(input.depth()==CV_8U)
+                {
+                    tmpInput.convertTo(output,CV_32FC1,1/255.0);
+                }else
+                {//Assuming values are at the desired [0,1] range
+                    tmpInput.convertTo(output, CV_32FC1);
+                }
+            }else
+            {
+                cvtColor(input,tmpInput,COLOR_GRAY2BGR);
+                if(input.depth()==CV_8U)
+                {
+                    tmpInput.convertTo(output,CV_32FC3,1/255.0);
+                }else
+                {//Assuming values are at the desired [0,1] range
+                    tmpInput.convertTo(output, CV_32FC3);
+                }
+            }
+        }else
+        {
+            if(input.channels()==1)
+            {
+                if(input.depth()==CV_8U)
+                {
+                    input.convertTo(output, CV_32FC1,1/255.0);
+                }else
+                {//Assuming values are at the desired [0,1] range
+                    input.convertTo(output, CV_32FC1);
+                }
+            }else
+            {
+                if(input.depth()==CV_8U)
+                {
+                    input.convertTo(output, CV_32FC3,1/255.0);
+                }else
+                {//Assuming values are at the desired [0,1] range
+                    input.convertTo(output, CV_32FC3);
+                }
+            }
+        }
+        if(outputSize.width!=0 && outputSize.height!=0)
+        {
+            resize(output,output,outputSize);
+        }
+        Scalar dev,mean;
+        meanStdDev(output,mean,dev);
+        subtract(output,mean[0],output);
+        divide(output,(dev[0]/sigma_),output);
+    }
+public:
+    StandarizerPreprocessor(double sigma):sigma_(sigma){}
+    ~StandarizerPreprocessor(){}
+};
+class MeanSubtractorPreprocessor: public ImagePreprocessor{
+protected:
+    Mat mean_;
+    void preprocess_(const Mat& input,Mat& output,Size outputSize,int outputChannels){
+        //TODO put all the logic of channel and depth conversions in ImageProcessor class
+        CV_Assert(this->mean_.cols==outputSize.width && this->mean_.rows ==outputSize.height);
+        CV_Assert(outputChannels==1 || outputChannels==3);
+        CV_Assert(input.channels()==1 || input.channels()==3);
+        if(input.channels()!=outputChannels)
+        {
+            Mat tmpInput;
+            if(outputChannels==1)
+            {
+                cvtColor(input,tmpInput,COLOR_BGR2GRAY);
+                if(input.depth()==CV_8U)
+                {
+                    tmpInput.convertTo(output,CV_32FC1,1/255.0);
+                }else
+                {//Assuming values are at the desired [0,1] range
+                    tmpInput.convertTo(output, CV_32FC1);
+                }
+            }else
+            {
+                cvtColor(input,tmpInput,COLOR_GRAY2BGR);
+                if(input.depth()==CV_8U)
+                {
+                    tmpInput.convertTo(output,CV_32FC3,1/255.0);
+                }else
+                {//Assuming values are at the desired [0,1] range
+                    tmpInput.convertTo(output, CV_32FC3);
+                }
+            }
+        }else
+        {
+            if(input.channels()==1)
+            {
+                if(input.depth()==CV_8U)
+                {
+                    input.convertTo(output, CV_32FC1,1/255.0);
+                }else
+                {//Assuming values are at the desired [0,1] range
+                    input.convertTo(output, CV_32FC1);
+                }
+            }else
+            {
+                if(input.depth()==CV_8U)
+                {
+                    input.convertTo(output, CV_32FC3,1/255.0);
+                }else
+                {//Assuming values are at the desired [0,1] range
+                    input.convertTo(output, CV_32FC3);
+                }
+            }
+        }
+        if(outputSize.width!=0 && outputSize.height!=0)
+        {
+            resize(output,output,outputSize);
+        }
+        subtract(output,this->mean_,output);
+    }
+public:
+    MeanSubtractorPreprocessor(Mat mean)
+    {
+        mean.copyTo(this->mean_);
+    }
+
+    ~MeanSubtractorPreprocessor(){}
+};
+
+
+Ptr<ImagePreprocessor> ImagePreprocessor::createResizer()
+{
+    return Ptr<ImagePreprocessor>(new ResizerPreprocessor);
+}
+
+Ptr<ImagePreprocessor> ImagePreprocessor::createImageStandarizer(double sigma)
+{
+    return Ptr<ImagePreprocessor>(new StandarizerPreprocessor(sigma));
+}
+
+Ptr<ImagePreprocessor> ImagePreprocessor::createImageMeanSubtractor(InputArray meanImg)
+{
+    Mat tmp=meanImg.getMat();
+    return Ptr<ImagePreprocessor>(new MeanSubtractorPreprocessor(tmp));
+}
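A quick sketch of how these factories are meant to be combined with a classifier (illustrative only; the mean-image file name is a hypothetical placeholder):

    // plain resize to the network input geometry
    cv::Ptr<cv::text::ImagePreprocessor> resizer  = cv::text::ImagePreprocessor::createResizer();
    // zero-mean standardization; 113 is the sigma that createDictNet() below relies on
    cv::Ptr<cv::text::ImagePreprocessor> standard = cv::text::ImagePreprocessor::createImageStandarizer(113);
    // subtract a per-pixel mean image of the training set
    cv::Mat meanImg = cv::imread("mean_image.png");
    cv::Ptr<cv::text::ImagePreprocessor> demean   = cv::text::ImagePreprocessor::createImageMeanSubtractor(meanImg);

Any of these can later be swapped onto a classifier with TextImageClassifier::setPreprocessor(), defined next.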
+//************************************************************************************
+//****************** TextImageClassifier *****************************************
+//************************************************************************************
+
+void TextImageClassifier::preprocess(const Mat& input,Mat& output)
+{
+    this->preprocessor_->preprocess_(input,output,this->inputGeometry_,this->channelCount_);
+}
+
+void TextImageClassifier::setPreprocessor(Ptr<ImagePreprocessor> ptr)
+{
+    CV_Assert(!ptr.empty());
+    preprocessor_=ptr;
+}
+
+Ptr<ImagePreprocessor> TextImageClassifier::getPreprocessor()
+{
+    return preprocessor_;
+}
+
+
+class DeepCNNCaffeImpl: public DeepCNN{
+protected:
+    void classifyMiniBatch(std::vector<Mat> inputImageList, Mat outputMat)
+    {
+        //Classifies a list of images containing at most minibatchSz_ images
+        CV_Assert(int(inputImageList.size())<=this->minibatchSz_);
+        CV_Assert(outputMat.isContinuous());
+#ifdef HAVE_CAFFE
+        net_->input_blobs()[0]->Reshape(inputImageList.size(), 1,this->inputGeometry_.height,this->inputGeometry_.width);
+        net_->Reshape();
+        float* inputBuffer=net_->input_blobs()[0]->mutable_cpu_data();
+        float* inputData=inputBuffer;
+        for(size_t imgNum=0;imgNum<inputImageList.size();imgNum++)
+        {
+            Mat preprocessed;
+            Mat netInputWraped(this->inputGeometry_.height, this->inputGeometry_.width, CV_32FC1, inputData);
+            this->preprocess(inputImageList[imgNum],preprocessed);
+            preprocessed.copyTo(netInputWraped);
+            inputData+=(this->inputGeometry_.height*this->inputGeometry_.width);
+        }
+        this->net_->ForwardPrefilled();
+        const float* outputNetData=net_->output_blobs()[0]->cpu_data();
+        float* outputMatData=(float*)(outputMat.data);
+        memcpy(outputMatData,outputNetData,sizeof(float)*this->outputSize_*inputImageList.size());
+#endif
+    }
+
+#ifdef HAVE_CAFFE
+    Ptr<caffe::Net<float> > net_;
+#endif
+    //Size inputGeometry_;
+    int minibatchSz_;//The existence of the assignment operator mandates this to be nonconst
+    int outputSize_;
+public:
+    DeepCNNCaffeImpl(const DeepCNNCaffeImpl& dn):
+        minibatchSz_(dn.minibatchSz_),outputSize_(dn.outputSize_){
+        channelCount_=dn.channelCount_;
+        inputGeometry_=dn.inputGeometry_;
+        //Implemented to suppress Visual Studio warning "assignment operator could not be generated"
+#ifdef HAVE_CAFFE
+        this->net_=dn.net_;
+#endif
+    }
+
+    DeepCNNCaffeImpl& operator=(const DeepCNNCaffeImpl &dn)
+    {
+#ifdef HAVE_CAFFE
+        this->net_=dn.net_;
+#endif
+        this->setPreprocessor(dn.preprocessor_);
+        this->inputGeometry_=dn.inputGeometry_;
+        this->channelCount_=dn.channelCount_;
+        this->minibatchSz_=dn.minibatchSz_;
+        this->outputSize_=dn.outputSize_;
+        this->preprocessor_=dn.preprocessor_;
+        return *this;
+        //Implemented to suppress Visual Studio warning "assignment operator could not be generated"
+    }
+
+    DeepCNNCaffeImpl(String modelArchFilename, String modelWeightsFilename,Ptr<ImagePreprocessor> preprocessor, int maxMinibatchSz)
+        :minibatchSz_(maxMinibatchSz)
+    {
+        CV_Assert(this->minibatchSz_>0);
+        CV_Assert(fileExists(modelArchFilename));
+        CV_Assert(fileExists(modelWeightsFilename));
+        CV_Assert(!preprocessor.empty());
+        this->setPreprocessor(preprocessor);
+#ifdef HAVE_CAFFE
+        this->net_.reset(new caffe::Net<float>(modelArchFilename, caffe::TEST));
+        CV_Assert(net_->num_inputs()==1);
+        CV_Assert(net_->num_outputs()==1);
+        CV_Assert(this->net_->input_blobs()[0]->channels()==1
+                  ||this->net_->input_blobs()[0]->channels()==3);
+        this->channelCount_=this->net_->input_blobs()[0]->channels();
+        this->net_->CopyTrainedLayersFrom(modelWeightsFilename);
+        caffe::Blob<float>* inputLayer = this->net_->input_blobs()[0];
+        this->inputGeometry_=Size(inputLayer->width(), inputLayer->height());
+        inputLayer->Reshape(this->minibatchSz_,1,this->inputGeometry_.height, this->inputGeometry_.width);
+        net_->Reshape();
+        this->outputSize_=net_->output_blobs()[0]->channels();
+
+#else
+        CV_Error(Error::StsError,"Caffe not available during compilation!");
+#endif
+    }
+
+    void classify(InputArray image, OutputArray classProbabilities)
+    {
+        std::vector<Mat> inputImageList;
+        inputImageList.push_back(image.getMat());
+        classifyBatch(inputImageList,classProbabilities);
+    }
+
+    void classifyBatch(InputArrayOfArrays inputImageList, OutputArray classProbabilities)
+    {
+        std::vector<Mat> allImageVector;
+        inputImageList.getMatVector(allImageVector);
+        size_t outputSize=size_t(this->outputSize_);//temporary variable to avoid int to size_t arithmetic
+        size_t minibatchSize=size_t(this->minibatchSz_);//temporary variable to avoid int to size_t arithmetic
+        classProbabilities.create(Size(int(outputSize),int(allImageVector.size())),CV_32F);
+        Mat outputMat = classProbabilities.getMat();
+        for(size_t imgNum=0;imgNum<allImageVector.size();imgNum+=minibatchSize)
+        {
+            size_t rangeEnd=imgNum+std::min<size_t>(allImageVector.size()-imgNum,minibatchSize);
+            std::vector<Mat>::const_iterator from=std::vector<Mat>::const_iterator(allImageVector.begin()+imgNum);
+            std::vector<Mat>::const_iterator to=std::vector<Mat>::const_iterator(allImageVector.begin()+rangeEnd);
+            std::vector<Mat> minibatchInput(from,to);
+            classifyMiniBatch(minibatchInput,outputMat.rowRange(int(imgNum),int(rangeEnd)));
+        }
+    }
+
+    int getOutputSize()
+    {
+        return this->outputSize_;
+    }
+
+    int getMinibatchSize()
+    {
+        return this->minibatchSz_;
+    }
+
+    int getBackend()
+    {
+        return OCR_HOLISTIC_BACKEND_CAFFE;
+    }
+};
+
+
+Ptr<DeepCNN> DeepCNN::create(String archFilename,String weightsFilename,Ptr<ImagePreprocessor> preprocessor,int minibatchSz,int backEnd)
+{
+    if(preprocessor.empty())
+    {
+        preprocessor=ImagePreprocessor::createResizer();
+    }
+    switch(backEnd){
+    case OCR_HOLISTIC_BACKEND_CAFFE:
+        return Ptr<DeepCNN>(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, minibatchSz));
+        break;
+    case OCR_HOLISTIC_BACKEND_NONE:
+    default:
+        CV_Error(Error::StsError,"DeepCNN::create backend not implemented");
+        return Ptr<DeepCNN>();
+        break;
+    }
+}
+Ptr<DeepCNN> DeepCNN::createDictNet(String archFilename,String weightsFilename,int backEnd)
+{
+    Ptr<ImagePreprocessor> preprocessor=ImagePreprocessor::createImageStandarizer(113);
+    switch(backEnd){
+    case OCR_HOLISTIC_BACKEND_CAFFE:
+        return Ptr<DeepCNN>(new DeepCNNCaffeImpl(archFilename, weightsFilename,preprocessor, 100));
+        break;
+    case OCR_HOLISTIC_BACKEND_NONE:
+    default:
+        CV_Error(Error::StsError,"DeepCNN::create backend not implemented");
+        return Ptr<DeepCNN>();
+        break;
+    }
+}
+
+#ifdef HAVE_CAFFE
+
+bool getCaffeGpuMode()
+{
+    return caffe::Caffe::mode()==caffe::Caffe::GPU;
+}
+
+void setCaffeGpuMode(bool useGpu)
+{
+    if(useGpu)
+    {
+        caffe::Caffe::set_mode(caffe::Caffe::GPU);
+    }else
+    {
+        caffe::Caffe::set_mode(caffe::Caffe::CPU);
+    }
+}
+
+bool getCaffeAvailable()
+{
+    return true;
+}
+
+#else
+
+bool getCaffeGpuMode()
+{
+    CV_Error(Error::StsError,"Caffe not available during compilation!");
+    return 0;
+}
+
+void setCaffeGpuMode(bool useGpu)
+{
+    CV_Error(Error::StsError,"Caffe not available during compilation!");
+    CV_Assert(useGpu==1);//suppresses the unused parameter warning; unreachable after CV_Error
+}
+
+bool getCaffeAvailable(){
+    return 0;
+}
+
+#endif
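The helpers above give callers runtime control over the Caffe backend; a minimal fragment (illustrative only):

    if (cv::text::getCaffeAvailable()) {
        cv::text::setCaffeGpuMode(true);   // switch back with setCaffeGpuMode(false)
        CV_Assert(cv::text::getCaffeGpuMode());
    }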
+
+class OCRHolisticWordRecognizerImpl: public OCRHolisticWordRecognizer{
+private:
+    struct NetOutput{
+        //Auxiliary structure that handles the logic of getting class ids and probabillities from
+        //the raw outputs of caffe
+        int wordIdx;
+        float probabillity;
+
+        static bool sorter(const NetOutput& o1,const NetOutput& o2)
+        {//used with std::sort to provide the most probable class
+            return o1.probabillity>o2.probabillity;
+        }
+
+        static void getOutputs(const float* buffer,int nbOutputs,std::vector<NetOutput>& res)
+        {
+            res.resize(nbOutputs);
+            for(int k=0;k<nbOutputs;k++)
+            {
+                res[k].wordIdx=k;
+                res[k].probabillity=buffer[k];
+            }
+            std::sort(res.begin(),res.end(),NetOutput::sorter);
+        }
+
+        static void getClassification(const float* buffer,int nbOutputs,int& classNum,double& confidence)
+        {
+            std::vector<NetOutput> tmp;
+            getOutputs(buffer,nbOutputs,tmp);
+            classNum=tmp[0].wordIdx;
+            confidence=tmp[0].probabillity;
+        }
+    };
+protected:
+    std::vector<String> labels_;
+    Ptr<TextImageClassifier> classifier_;
+public:
+    OCRHolisticWordRecognizerImpl(Ptr<TextImageClassifier> classifierPtr,String vocabularyFilename):classifier_(classifierPtr)
+    {
+        CV_Assert(fileExists(vocabularyFilename));//this fails for some reason
+        std::ifstream labelsFile(vocabularyFilename.c_str());
+        if(!labelsFile)
+        {
+            CV_Error(Error::StsError,"Could not read Labels from file");
+        }
+        std::string line;
+        while (std::getline(labelsFile, line))
+        {
+            labels_.push_back(std::string(line));
+        }
+        CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size()));
+    }
+
+    OCRHolisticWordRecognizerImpl(Ptr<TextImageClassifier> classifierPtr,const std::vector<String>& vocabulary):classifier_(classifierPtr)
+    {
+        this->labels_=vocabulary;
+        CV_Assert(this->classifier_->getOutputSize()==int(this->labels_.size()));
+    }
+
+    void recogniseImage(InputArray inputImage,CV_OUT String& transcription,CV_OUT double& confidence)
+    {
+        Mat netOutput;
+        this->classifier_->classify(inputImage,netOutput);
+        int classNum;
+        NetOutput::getClassification((float*)(netOutput.data),this->classifier_->getOutputSize(),classNum,confidence);
+        transcription=this->labels_[classNum];
+    }
+
+    void recogniseImageBatch(InputArrayOfArrays inputImageList,CV_OUT std::vector<String>& transcriptionVec,CV_OUT std::vector<double>& confidenceVec)
+    {
+        Mat netOutput;
+        this->classifier_->classifyBatch(inputImageList,netOutput);
+        for(int k=0;k<netOutput.rows;k++)
+        {
+            int classNum;
+            double confidence;
+            NetOutput::getClassification((float*)(netOutput.row(k).data),this->classifier_->getOutputSize(),classNum,confidence);
+            transcriptionVec.push_back(this->labels_[classNum]);
+            confidenceVec.push_back(confidence);
+        }
+    }
+
+
+    void run(Mat& image, std::string& output_text, std::vector<Rect>* component_rects=NULL,
+             std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
+             int component_level=0)
+    {
+        CV_Assert(component_level==OCR_LEVEL_WORD);//Components are not applicable for word spotting
+        double confidence;
+        String transcription;
+        recogniseImage(image,transcription,confidence);
+        output_text=transcription.c_str();
+        if(component_rects!=NULL)
+        {
+            component_rects->resize(1);
+            (*component_rects)[0]=Rect(0,0,image.size().width,image.size().height);
+        }
+        if(component_texts!=NULL)
+        {
+            component_texts->resize(1);
+            (*component_texts)[0]=transcription.c_str();
+        }
+        if(component_confidences!=NULL)
+        {
+            component_confidences->resize(1);
+            (*component_confidences)[0]=float(confidence);
+        }
+    }
+
+    void run(Mat& image, Mat& mask, std::string& output_text, std::vector<Rect>* component_rects=NULL,
+             std::vector<std::string>* component_texts=NULL, std::vector<float>* component_confidences=NULL,
+             int component_level=0)
+    {
+        CV_Assert(mask.cols==image.cols && mask.rows==image.rows);//Mask is ignored because the CNN operates on a full image
+        this->run(image,output_text,component_rects,component_texts,component_confidences,component_level);
+    }
+
+    std::vector<String>& getVocabulary()
+    {
+        return this->labels_;
+    }
+
+    Ptr<TextImageClassifier> getClassifier()
+    {
+        return this->classifier_;
+    }
+};
+
+Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(Ptr<TextImageClassifier> classifierPtr,String vocabularyFilename )
+{
+    return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabularyFilename));
+}
+
+Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename, String vocabularyFilename)
+{
+    Ptr<ImagePreprocessor> preprocessor=ImagePreprocessor::createImageStandarizer(113);
+    Ptr<TextImageClassifier> classifierPtr(new DeepCNNCaffeImpl(modelArchFilename,modelWeightsFilename,preprocessor,100));
+    return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabularyFilename));
+}
+
+Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(Ptr<TextImageClassifier> classifierPtr,const std::vector<String>& vocabulary)
+{
+    return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabulary));
+}
+
+Ptr<OCRHolisticWordRecognizer> OCRHolisticWordRecognizer::create(String modelArchFilename, String modelWeightsFilename,const std::vector<String>& vocabulary){
+    Ptr<ImagePreprocessor> preprocessor=ImagePreprocessor::createImageStandarizer(113);
+    Ptr<TextImageClassifier> classifierPtr(new DeepCNNCaffeImpl(modelArchFilename,modelWeightsFilename,preprocessor,100));
+    return Ptr<OCRHolisticWordRecognizer>(new OCRHolisticWordRecognizerImpl(classifierPtr,vocabulary));
+}
+
+
+
+
+
+} } //namespace text namespace cv
diff --git a/modules/text/src/text_synthesizer.cpp b/modules/text/src/text_synthesizer.cpp
new file mode 100644
index 00000000000..413a422b833
--- /dev/null
+++ b/modules/text/src/text_synthesizer.cpp
@@ -0,0 +1,769 @@
+#include "precomp.hpp"
+#include "opencv2/imgproc.hpp"
+#include "opencv2/core.hpp"
+#include "opencv2/highgui.hpp"
+#include "opencv2/calib3d.hpp"
+
+#include "opencv2/text/text_synthesizer.hpp"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+#include <ctime>
+#include <map>
+#include <string>
+#include <vector>
+
+#ifdef HAVE_QT5GUI
+#include <QFont>
+#include <QFontDatabase>
+#include <QFontMetrics>
+#include <QImage>
+#include <QPainter>
+#endif
+
+
+//TODO FIND appropriate
+#define CV_IMWRITE_JPEG_QUALITY 1
+#define CV_LOAD_IMAGE_COLOR 1
+namespace cv{
+namespace text{
+
+namespace {
+//Unnamed namespace with auxiliary classes and functions used for quick computation
+template <typename T> T min_ (T v1, T v2) {
+    return (v1 < v2) * v1 + (v1 >= v2) * v2;
+}
+
+template <typename T> T max_(T v1, T v2) {
+    return (v1 > v2) * v1 + (v1 <= v2) * v2;
+}
+
+template <typename P, typename BL, typename BL_A> void blendRGBA(Mat& out, const Mat &in1, const Mat& in2){
+    CV_Assert (out.cols == in1.cols && out.cols == in2.cols);
+    CV_Assert (out.rows == in1.rows && out.rows == in2.rows);
+    CV_Assert (out.channels() == 4 && in1.channels() == 4 && in2.channels() == 4);
+    int lineWidth=out.cols * 4;
+    BL blend;
+    BL_A blendA;
+    for(int y = 0; y < out.rows; y++){
+        const P* in1B = in1.ptr<P>(y);

+        const P* in1G = in1.ptr<P>(y) + 1;
+        const P* in1R = in1.ptr<P>(y) + 2;
+        const P* in1A = in1.ptr<P>(y) + 3;
+
+        const P* in2B = in2.ptr<P>(y);
+        const P* in2G = in2.ptr<P>(y) + 1;
+        const P* in2R = in2.ptr<P>(y) + 2;
+        const P* in2A = in2.ptr<P>(y) + 3;
+
+        P* outB = out.ptr<P>(y);
+        P* outG = out.ptr<P>(y) + 1;
+        P* outR = out.ptr<P>(y) + 2;
+        P* outA = out.ptr<P>(y) + 3;
+
+        for(int x = 0; x < lineWidth; x += 4){
+            outB[x] = blend(in1B + x, in1A + x, in2B + x, in2A + x);
+            outG[x] = blend(in1G + x, in1A + x, in2G + x, in2A + x);
+            outR[x] = blend(in1R + x, in1A + x, in2R + x, in2A + x);
+            outA[x] = blendA(in1A[x], in2A[x]);
+        }
+    }
+}
+
+#ifdef HAVE_QT5GUI
+std::map<int,int> initQt2CvScriptCodeMap () ;
+std::map<int,int> initQt2CvScriptCodeMap () {
+    std::map<int,int> res;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_ANY] = QFontDatabase::Any;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_LATIN] = QFontDatabase::Latin;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_GREEK] = QFontDatabase::Greek;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_CYRILLIC] = QFontDatabase::Cyrillic;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_ARMENIAN] = QFontDatabase::Armenian;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_ARABIC] = QFontDatabase::Arabic;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_HEBREW] = QFontDatabase::Hebrew;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_SYRIAC] = QFontDatabase::Syriac;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_THAANA] = QFontDatabase::Thaana;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_DEVANAGARI] = QFontDatabase::Devanagari;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_BENGALI] = QFontDatabase::Bengali;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_GURMUKHI] = QFontDatabase::Gurmukhi;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_GUJARATI] = QFontDatabase::Gujarati;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_ORIYA] = QFontDatabase::Oriya;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_TAMIL] = QFontDatabase::Tamil;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_TELUGU] = QFontDatabase::Telugu;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_KANNADA] = QFontDatabase::Kannada;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_MALAYALAM] = QFontDatabase::Malayalam;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_SINHALA] = QFontDatabase::Sinhala;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_THAI] = QFontDatabase::Thai;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_LAO] = QFontDatabase::Lao;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_TIBETAN] = QFontDatabase::Tibetan;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_MYANMAR] = QFontDatabase::Myanmar;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_GEORGIAN] = QFontDatabase::Georgian;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_KHMER] = QFontDatabase::Khmer;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_CHINESE_SIMPLIFIED] = QFontDatabase::SimplifiedChinese;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_CHINESE_TRADITIONAL] = QFontDatabase::TraditionalChinese;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_JAPANESE] = QFontDatabase::Japanese;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_KOREAM] = QFontDatabase::Korean;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_VIETNAMESE] = QFontDatabase::Vietnamese;
+    return res;
+}
+
+
+int getQt2CvScriptCode(int cvScriptCode);
+int getQt2CvScriptCode(int cvScriptCode){
+    static std::map<int,int> m(initQt2CvScriptCodeMap());
+    if(m.find(cvScriptCode)!=m.end()){
+        return m[cvScriptCode];
+    }else{
+        CV_Error(Error::StsError,"Unknown script_code");
+        return 0;
+    }
+}
+#endif //HAVE_QT5GUI
+
+
+std::map<int,String> initScriptCode2StringMap();
+std::map<int,String> initScriptCode2StringMap(){
+    std::map<int,String> res;
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_ANY]="Any";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_LATIN]="Latin";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_GREEK]="Greek";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_CYRILLIC]="Cyrillic";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_ARMENIAN]="Armenian";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_ARABIC]="Arabic";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_HEBREW]="Hebrew";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_SYRIAC]="Syriac";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_THAANA]="Thaana";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_DEVANAGARI]="Devanagari";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_BENGALI]="Bengali";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_GURMUKHI]="Gurmukhi";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_GUJARATI]="Gujarati";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_ORIYA]="Oriya";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_TAMIL]="Tamil";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_TELUGU]="Telugu";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_KANNADA]="Kannada";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_MALAYALAM]="Malayalam";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_SINHALA]="Sinhala";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_THAI]="Thai";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_LAO]="Lao";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_TIBETAN]="Tibetan";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_MYANMAR]="Myanmar";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_GEORGIAN]="Georgian";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_KHMER]="Khmer";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_CHINESE_SIMPLIFIED]="SimplifiedChinese";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_CHINESE_TRADITIONAL]="TraditionalChinese";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_JAPANESE]="Japanese";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_KOREAM]="Korean";
+    res[CV_TEXT_SYNTHESIZER_SCRIPT_VIETNAMESE]="Vietnamese";
+    return res;
+}
+
+
+String getCvScriptCode2String(int cvScriptCode);
+String getCvScriptCode2String(int cvScriptCode){
+    static std::map<int,String> m(initScriptCode2StringMap());
+    if(m.find(cvScriptCode)!=m.end()){
+        return m[cvScriptCode];
+    }else{
+        CV_Error(Error::StsError,"Unknown script_code");
+        return "Error";
+    }
+}
+
+
+}//unnamed namespace
+void blendWeighted(Mat& out,Mat& top,Mat& bottom,float topMask,float bottomMask);
+void blendWeighted(Mat& out,Mat& top,Mat& bottom,float topMask,float bottomMask){
+    if(out.channels( )==3 && top.channels( )==3 && bottom.channels( )==3 ){
+        for(int y=0;y<out.rows;y++){
+            float* outR=out.ptr<float>(y);
+            float* outG=out.ptr<float>(y)+1;
+            float* outB=out.ptr<float>(y)+2;
+
+            float* topR=top.ptr<float>(y);
+            float* topG=top.ptr<float>(y)+1;
+            float* topB=top.ptr<float>(y)+2;
+
+            float* bottomR=bottom.ptr<float>(y);
+            float* bottomG=bottom.ptr<float>(y)+1;
+            float* bottomB=bottom.ptr<float>(y)+2;
+
+            for(int x=0;x<out.cols;x++){
+                outR[x*3]=topR[x*3]*topMask+bottomR[x*3]*bottomMask;
+                outG[x*3]=topG[x*3]*topMask+bottomG[x*3]*bottomMask;
+                outB[x*3]=topB[x*3]*topMask+bottomB[x*3]*bottomMask;
+            }
+        }
+    }else{
+        for(int y=0;y<out.rows;y++){
+            float* outG=out.ptr<float>(y);
+            float* topG=top.ptr<float>(y);
+            float* bottomG=bottom.ptr<float>(y);
+            for(int x=0;x<out.cols;x++){
+                outG[x]=topG[x]*topMask+bottomG[x]*bottomMask;
+            }
+        }
+    }
+}
+
+void blendWeighted(Mat& out,Mat& top,Mat& bottom,Mat& topMask_,Mat& bottomMask_);
+void blendWeighted(Mat& out,Mat& top,Mat& bottom,Mat& topMask_,Mat& bottomMask_){
+    for(int y=0;y<out.rows;y++){
+        float* outR=out.ptr<float>(y);
+        float* outG=out.ptr<float>(y)+1;
+        float* outB=out.ptr<float>(y)+2;
+
+        float* topR=top.ptr<float>(y);
+        float* topG=top.ptr<float>(y)+1;
+        float* topB=top.ptr<float>(y)+2;
+
+        float* bottomR=bottom.ptr<float>(y);
+        float* bottomG=bottom.ptr<float>(y)+1;
+        float* bottomB=bottom.ptr<float>(y)+2;
+
+        float* topMask=topMask_.ptr<float>(y);
+        float* bottomMask=bottomMask_.ptr<float>(y);
+
+        for(int x=0;x<out.cols;x++){
+            outR[x*3]=topR[x*3]*topMask[x]+bottomR[x*3]*bottomMask[x];
+            outG[x*3]=topG[x*3]*topMask[x]+bottomG[x*3]*bottomMask[x];
+            outB[x*3]=topB[x*3]*topMask[x]+bottomB[x*3]*bottomMask[x];
+        }
+    }
+}
+
+void blendOverlay(Mat& out,Mat& top,Mat& bottom,Mat& topMask);
+void blendOverlay(Mat& out,Mat& top,Mat& bottom,Mat& topMask){
+    for(int y=0;y<out.rows;y++){
+        float* outR=out.ptr<float>(y);
+        float* outG=out.ptr<float>(y)+1;
+        float* outB=out.ptr<float>(y)+2;
+
+        float* topR=top.ptr<float>(y);
+        float* topG=top.ptr<float>(y)+1;
+        float* topB=top.ptr<float>(y)+2;
+
+        float* bottomR=bottom.ptr<float>(y);
+        float* bottomG=bottom.ptr<float>(y)+1;
+        float* bottomB=bottom.ptr<float>(y)+2;
+
+        float* mask=topMask.ptr<float>(y);
+
+        for(int x=0;x<out.cols;x++){
+            outR[x*3]=topR[x*3]*mask[x]+bottomR[x*3]*(1-mask[x]);
+            outG[x*3]=topG[x*3]*mask[x]+bottomG[x*3]*(1-mask[x]);
+            outB[x*3]=topB[x*3]*mask[x]+bottomB[x*3]*(1-mask[x]);
+        }
+    }
+}
+
+void blendOverlay(Mat& out,Scalar_<float> topCol,Scalar_<float> bottomCol,Mat& topMask);
+void blendOverlay(Mat& out,Scalar_<float> topCol,Scalar_<float> bottomCol,Mat& topMask){
+    for(int y=0;y<out.rows;y++){
+        float* outR=out.ptr<float>(y);
+        float* outG=out.ptr<float>(y)+1;
+        float* outB=out.ptr<float>(y)+2;
+
+        float* mask=topMask.ptr<float>(y);
+
+        for(int x=0;x<out.cols;x++){
+            outR[x*3]=topCol[0]*mask[x]+bottomCol[0]*(1-mask[x]);
+            outG[x*3]=topCol[1]*mask[x]+bottomCol[1]*(1-mask[x]);
+            outB[x*3]=topCol[2]*mask[x]+bottomCol[2]*(1-mask[x]);
+        }
+    }
+}
+
+
+class TextSynthesizerQtImpl: public TextSynthesizer{
+ protected:
+    bool rndProbUnder(double v){
+        return (this->rng_.next()%10000)<(10000*v);
+    }
+
+    void updateFontNameList(std::vector<String>& fntList){
+#ifdef HAVE_QT5GUI
+        fntList.clear();
+        QStringList lst=this->fntDb_->families(QFontDatabase::WritingSystem(getQt2CvScriptCode(this->script_)));
+        for(int k=0;k<lst.size();k++){
+            fntList.push_back(String(lst[k].toUtf8().constData()));
+        }
+#endif
+    }
+
+    void modifyAvailableFonts(std::vector<String>& fntList){
+        std::vector<String> dbList;
+        this->updateFontNameList(dbList);
+        for(size_t k =0;k<fntList.size();k++){
+            CV_Assert(std::find(dbList.begin(),dbList.end(),fntList[k])!=dbList.end());
+        }
+        this->availableFonts_=fntList;
+    }
+
+#ifdef HAVE_QT5GUI
+    QFont generateFont(){
+        CV_Assert(this->availableFonts_.size());
+        QFont fnt(this->availableFonts_[rng_.next() % this->availableFonts_.size()].c_str());
+        fnt.setPixelSize(this->resHeight_-2*this->txtPad_);
+        if(this->rndProbUnder(this->underlineProbabillity_)){
+            fnt.setUnderline(true);
+        }else{
+            fnt.setUnderline(false);
+        }
+        if(this->rndProbUnder(this->boldProbabillity_)){
+            fnt.setBold(true);
+        }else{
+            fnt.setBold(false);
+        }
+        if(this->rndProbUnder(this->italicProbabillity_)){
+            fnt.setItalic(true);
+        }else{
+            fnt.setItalic(false);
+        }
+        return fnt;
+    }
+#endif
+
+    void generateTxtPatch(Mat& output,Mat& outputMask,String caption){
+        const int maxTxtWidth=this->maxResWidth_;
+        Mat textImg;
+        textImg =cv::Mat(this->resHeight_,maxTxtWidth,CV_8UC3,Scalar_<float>(0,0,0));
+#ifdef HAVE_QT5GUI
+        QImage qimg((unsigned char*)(textImg.data), textImg.cols, textImg.rows, textImg.step, QImage::Format_RGB888);
+        QPainter qp(&qimg);
+        qp.setPen(QColor(255,255,255));
+        QFont fnt=this->generateFont();
+        QFontMetrics fntMtr(fnt,qp.device());
+        QRect bbox=fntMtr.tightBoundingRect(caption.c_str());
+        qp.setFont(fnt);
+        qp.drawText(QPoint(txtPad_,txtPad_+ bbox.height()), caption.c_str());
+        qp.end();
+        textImg=textImg.colRange(0,min( bbox.width()+2*txtPad_,maxTxtWidth-1));
+#else
+        int fontFace = FONT_HERSHEY_SCRIPT_SIMPLEX;
+        double fontScale = 1;
+        int thickness = 2;
+        int baseline = 0;
+        Size textSize = getTextSize(caption, fontFace, fontScale, thickness, &baseline);
+        putText(textImg, caption, Point(this->txtPad_,this->resHeight_-this->txtPad_),
+                FONT_HERSHEY_SCRIPT_SIMPLEX, fontScale, Scalar_<float>(255,255,255), thickness, 8);
+        textImg=textImg.colRange(0,min( textSize.width+2*txtPad_,maxTxtWidth-1));
+        //TODO Warn without throwing an exception
+#endif
+        Mat textGrayImg;
+        cvtColor(textImg,textGrayImg,COLOR_RGB2GRAY);
+        //Obtaining color triplet
+        int colorTriplet=this->rng_.next()%this->colorClusters_.rows;
+        uchar* cVal=this->colorClusters_.ptr(colorTriplet);
+        Scalar_<float> fgText(float(cVal[0]/255.0),float(cVal[1]/255.0),float(cVal[2]/255.0));
+        Scalar_<float> fgBorder(float(cVal[3]/255.0),float(cVal[4]/255.0),float(cVal[5]/255.0));
+        Scalar_<float> fgShadow(float(cVal[6]/255.0),float(cVal[7]/255.0),float(cVal[8]/255.0));
+
+        Mat floatTxt;Mat floatBorder;Mat floatShadow;
+        textGrayImg.convertTo(floatTxt, CV_32FC1, 1.0/255.0);
+
+        //Sampling a uniform distribution for the sizes
+        int borderSize=(this->rng_.next()%this->maxBorderSize_)*this->rndProbUnder(this->borderProbabillity_);
+        int shadowSize=(this->rng_.next()%this->maxShadowSize_)*this->rndProbUnder(this->shadowProbabillity_);
+        int voffset=(this->rng_.next()%(shadowSize*2+1))-shadowSize;
+        int hoffset=(this->rng_.next()%(shadowSize*2+1))-shadowSize;
+        float shadowOpacity=float(((this->rng_.next()%10000)*maxShadowOpacity_)/10000.0);
+
+        //generating shadows
+        generateDilation(floatBorder,floatTxt,borderSize,0,0);
+        generateDilation(floatShadow,floatBorder,shadowSize,voffset,hoffset);
+
+        Mat floatBordered=Mat(floatTxt.rows,floatTxt.cols,CV_32FC3);
+        Mat floatShadowed=Mat(floatTxt.rows,floatTxt.cols,CV_32FC3);
+        Mat floatMixed=Mat(floatTxt.rows,floatTxt.cols,CV_32FC3);
+        Mat floatMask=Mat(floatTxt.rows,floatTxt.cols,CV_32FC1);
+
+        blendOverlay(floatBordered,fgText,fgBorder, floatTxt);
+        blendOverlay(floatShadowed,fgShadow,fgShadow, floatTxt);
+        blendOverlay(floatMixed,floatBordered,floatShadowed, floatBorder);
+        blendWeighted(floatMask,floatShadow,floatBorder, shadowOpacity,1-shadowOpacity);
+        floatMixed.copyTo(output);floatMask.copyTo(outputMask);
+    }
+
+    String getScriptName() {
+        return getCvScriptCode2String(this->script_);
+    }
+    void generateDilation(Mat& outputImg,
+                          const Mat& inputImg,int dilationSize, int horizOffset,int vertOffset){
+        //erosion is defined as a negative dilation size
+        if (dilationSize==0) {
+            inputImg.copyTo(outputImg);
+        } else {
+            if (dilationSize > 0) {
+                if(horizOffset==0 && vertOffset==0){
+                    dilate(inputImg,outputImg,Mat(),Point(-1, -1),dilationSize);
+                }else{
+                    Mat tmpMat;
+                    dilate(inputImg,tmpMat,Mat(),Point(-1, -1),dilationSize);
+                    outputImg=Mat(inputImg.rows,inputImg.cols,inputImg.type(),Scalar(0));
+                    int validWidth=inputImg.cols-abs(horizOffset);
+                    int validHeight=inputImg.rows-abs(vertOffset);
+                    tmpMat(Rect(max(0,-horizOffset),max(0,-vertOffset), validWidth,validHeight)).
+                            copyTo(outputImg(Rect(max(0,horizOffset),max(0,vertOffset), validWidth,validHeight)));
+                }
+            }else{
+                if(horizOffset==0 && vertOffset==0){
+                    erode(inputImg,outputImg,Mat(),Point(-1, -1),-dilationSize);
+                }else{
+                    Mat tmpMat;
+                    erode(inputImg,tmpMat,Mat(),Point(-1, -1),-dilationSize);
+                    outputImg=Mat(inputImg.rows,inputImg.cols,inputImg.type(),Scalar(0));
+                    int validWidth=inputImg.cols-abs(horizOffset);
+                    int validHeight=inputImg.rows-abs(vertOffset);
+                    tmpMat(Rect(max(0,-horizOffset),max(0,-vertOffset), validWidth,validHeight)).
+                            copyTo(outputImg(Rect(max(0,horizOffset),max(0,vertOffset), validWidth,validHeight)));
+                }
+            }
+        }
+    }
+
+    void randomlyDistortPerspective(const Mat& inputImg,Mat& outputImg){
+        int N=int(this->maxPerspectiveDistortion_);
+        if(N>0){
+            float xa=this->rng_.next()%N;
+            float xb=this->rng_.next()%N;
+            float xc=this->rng_.next()%N;
+            float xd=this->rng_.next()%N;
+
+            float ya=this->rng_.next()%N;
+            float yb=this->rng_.next()%N;
+            float yc=this->rng_.next()%N;
+            float yd=this->rng_.next()%N;
+
+            float left=min_(xa,xd);
+            float top=min_(ya,yb);
+            float right=100-min_(xb,xc);
+            float bottom=100-min_(yc,yd);
+
+            float horizCoef;
+            float vertCoef;
+
+            if(right-left<=0 || bottom-top<=0){
+                outputImg=inputImg;
+                return;
+            }
+            //map percentage coordinates to pixel coordinates
+            horizCoef=inputImg.cols/100.0f;
+            vertCoef=inputImg.rows/100.0f;
+
+            //xb,xc are measured from the right edge; yc,yd from the bottom
+            xb=100-xb; xc=100-xc;
+            yc=100-yc; yd=100-yd;
+
+            std::vector<Point2f> src(4);std::vector<Point2f> dst(4);
+            src[0]=Point2f(0,0);src[1]=Point2f(100,0);src[2]=Point2f(100,100);src[3]=Point2f(0,100);
+            dst[0]=Point2f(xa,ya);
+            dst[1]=Point2f(xb,yb);
+            dst[2]=Point2f(xc,yc);
+            dst[3]=Point2f(xd,yd);
+            for(int k=0;k<4;k++){
+                src[k].x*=horizCoef; src[k].y*=vertCoef;
+                dst[k].x*=horizCoef; dst[k].y*=vertCoef;
+            }
+            Mat h=findHomography(src,dst);
+            warpPerspective(inputImg,outputImg,h,inputImg.size());
+        }else{
+            outputImg=inputImg;
+        }
+    }
+
+    void addCurveDeformation(const Mat& inputImg,Mat& outputImg){
+        if (this->rndProbUnder(this->curvingProbabillity_)){
+            Mat X=Mat(inputImg.rows,inputImg.cols,CV_32FC1);
+            Mat Y=Mat(inputImg.rows,inputImg.cols,CV_32FC1);
+            int xAdd=-int(this->rng_.next()%inputImg.cols);
+            float xMult=(this->rng_.next()%10000)*float(maxCurveArch_)/10000;
+            int sign=(this->rng_.next()%2)?-1:1;
+            for(int y=0;y<inputImg.rows;y++){
+                float* xRow=X.ptr<float>(y);
+                float* yRow=Y.ptr<float>(y);
+                for(int x=0;x<inputImg.cols;x++){
+                    xRow[x]=float(x);
+                    yRow[x]=y+sign*(this->maxHeightDistortionPercentage_/100.0f)*this->resHeight_*cos(xMult*(x+xAdd));
+                }
+            }
+            remap(inputImg,outputImg,X,Y,INTER_LINEAR);
+        }else{
+            outputImg=inputImg;
+        }
+    }
+
+    void addCompressionArtifacts(Mat& img){
+        if(this->rndProbUnder(this->compressionNoiseProb_)){
+            std::vector<uchar> buffer;
+            std::vector<int> parameters;
+            parameters.push_back(CV_IMWRITE_JPEG_QUALITY);
+            parameters.push_back(this->rng_.next() % 100);
+            Mat ucharImg;
+            img.convertTo(ucharImg,CV_8UC3,255);
+            imencode(".jpg",ucharImg,buffer,parameters);
+            ucharImg=imdecode(buffer,CV_LOAD_IMAGE_COLOR);
+            ucharImg.convertTo(img,CV_32FC3,1.0/255);
+        }
+    }
+
+    void initColorClusters(){
+        this->colorClusters_=Mat(4,3,CV_8UC3,Scalar(32,32,32));
+
+        this->colorClusters_.at<Vec3b>(0, 0)=Vec3b(192,32,32);
+        this->colorClusters_.at<Vec3b>(0, 1)=Vec3b(192,255,32);
+        this->colorClusters_.at<Vec3b>(0, 2)=Vec3b(0,32,32);
+
+        this->colorClusters_.at<Vec3b>(1, 0)=Vec3b(0,32,192);
+        this->colorClusters_.at<Vec3b>(1, 1)=Vec3b(0,255,32);
+        this->colorClusters_.at<Vec3b>(1, 2)=Vec3b(0,0,64);
+
+        this->colorClusters_.at<Vec3b>(2, 0)=Vec3b(128,128,128);
+        this->colorClusters_.at<Vec3b>(2, 1)=Vec3b(255,255,255);
+        this->colorClusters_.at<Vec3b>(2, 2)=Vec3b(0,0,0);
+
+        this->colorClusters_.at<Vec3b>(3, 0)=Vec3b(255,255,255);
+        this->colorClusters_.at<Vec3b>(3, 1)=Vec3b(128,128,128);
+        this->colorClusters_.at<Vec3b>(3, 2)=Vec3b(0,0,0);
+    }
+
+    RNG rng_;//Random number generator used for all distributions
+    int txtPad_;
+#ifdef HAVE_QT5GUI
+    Ptr<QFontDatabase> fntDb_;
+#endif
+    std::vector<String> availableFonts_;
+    std::vector<String> availableBgSampleFiles_;
+    std::vector<Mat> availableBgSampleImages_;
+    Mat colorClusters_;
+    int script_;
+ public:
+    TextSynthesizerQtImpl(int script,
+                          int maxSampleWidth = 400,
+                          int sampleHeight = 50,
+                          uint64 rndState = 0)
+        : TextSynthesizer(maxSampleWidth, sampleHeight)
+        , rng_(rndState != 0 ? rndState:std::time(NULL))
+        , txtPad_(10) {
+#ifdef HAVE_QT5GUI
+        CV_Assert(initQt2CvScriptCodeMap().count(script));//making sure script is a valid script code
+#endif
+        this->script_=script;
+        //QT needs to be initialised. Highgui does this
+        namedWindow("__w");
+        waitKey(1);
+        destroyWindow("__w");
+#ifdef HAVE_QT5GUI
+        this->fntDb_ = Ptr<QFontDatabase>(new QFontDatabase());
+#endif
+        this->updateFontNameList(this->availableFonts_);
+        this->initColorClusters();
+    }
+
+    void getRandomSeed (OutputArray res) const {
+        Mat tmpMat(1,8,CV_8UC1);
+        tmpMat.ptr<uint64>(0)[0] = this->rng_.state;
+        tmpMat.copyTo(res);
+    }
+
+    void setRandomSeed (Mat state) {
+        CV_Assert (state.rows == 1 && state.cols == 8);
+        CV_Assert (state.depth() == CV_8U && state.channels() == 1);
+        this->rng_.state=state.ptr<uint64>(0)[0];
+    }
+
+    void generateBgSample(CV_OUT Mat& sample){
+        if(this->availableBgSampleImages_.size()!=0){
+            Mat& img=availableBgSampleImages_[this->rng_.next()%availableBgSampleImages_.size()];
+            int left=this->rng_.next()%(img.cols-maxResWidth_);
+            int top=this->rng_.next()%(img.rows-resHeight_);
+            img.colRange(Range(left,left+maxResWidth_)).rowRange(Range(top,top+resHeight_)).copyTo(sample);
+        }else{
+            if(this->availableBgSampleFiles_.size()==0){
+                Mat res(this->resHeight_,this->maxResWidth_,CV_8UC3);
+                this->rng_.fill(res,RNG::UNIFORM,0,256);
+                res.copyTo(sample);
+            }else{
+                Mat img;
+                img=imread(this->availableBgSampleFiles_[this->rng_.next()%availableBgSampleFiles_.size()].c_str(),IMREAD_COLOR);
+                CV_Assert(img.data != NULL);
+                CV_Assert(img.cols>maxResWidth_ && img.rows> resHeight_);
+                int left=this->rng_.next()%(img.cols-maxResWidth_);
+                int top=this->rng_.next()%(img.rows-resHeight_);
+                img.colRange(Range(left,left+maxResWidth_)).rowRange(Range(top,top+resHeight_)).copyTo(sample);
+            }
+        }
+        if(sample.channels()==4){
+            Mat rgb;
+            cvtColor(sample,rgb,COLOR_RGBA2RGB);
+            sample=rgb;
+        }
+        if(sample.channels()==1){
+            Mat rgb;
+            cvtColor(sample,rgb,COLOR_GRAY2RGB);
+            sample=rgb;
+        }
+    }
+
+    void generateTxtSample(String caption,CV_OUT Mat& sample,CV_OUT Mat& sampleMask){
+        generateTxtPatch(sample,sampleMask,caption);
+    }
+
+    void generateSample(String caption,CV_OUT Mat & sample){
+        Mat txtSample;
+        Mat txtCurved;
+        Mat txtDistorted;
+        Mat bgSample;
+        Mat bgResized;
+        Mat txtMask;
+        Mat txtMerged;
+        Mat floatBg;
+        std::vector<Mat> txtChannels;
+        generateTxtPatch(txtSample,txtMask,caption);
+
+        split(txtSample,txtChannels);
+        txtChannels.push_back(txtMask);
+        merge(txtChannels,txtMerged);
+        addCurveDeformation(txtMerged,txtCurved);
+        randomlyDistortPerspective(txtCurved,txtDistorted);
+        split(txtDistorted,txtChannels);
+        txtMask=txtChannels[3];
+        txtChannels.pop_back();
+        merge(txtChannels,txtSample);
+
+        generateBgSample(bgSample);
+        bgSample.convertTo(floatBg, CV_32FC3, 1.0/255.0);
+        bgResized=floatBg.colRange(0,txtSample.cols);
+        sample=Mat(txtDistorted.rows,txtDistorted.cols,CV_32FC3);
+
+        blendOverlay(sample,txtSample,bgResized,txtMask);
+        float blendAlpha=float(this->finalBlendAlpha_*(this->rng_.next()%1000)/1000.0);
+        if(this->rndProbUnder(this->finalBlendProb_)){
+            blendWeighted(sample,sample,bgResized,1-blendAlpha,blendAlpha);
+        }
+        addCompressionArtifacts(sample);
+    }
+
+    void getColorClusters(CV_OUT Mat& clusters) const {
+        this->colorClusters_.copyTo(clusters);
+    }
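Because the cluster layout is easy to get wrong, here is an illustrative fragment (not part of the patch) building a two-triplet collection for the setColorClusters() method that follows; each row holds the (text, border, shadow) colors of one triplet, and synth is a handle as created via TextSynthesizer::create():

    cv::Mat clusters(2, 3, CV_8UC3);
    clusters.at<cv::Vec3b>(0, 0) = cv::Vec3b(255,255,255); // text
    clusters.at<cv::Vec3b>(0, 1) = cv::Vec3b(0,0,0);       // border
    clusters.at<cv::Vec3b>(0, 2) = cv::Vec3b(64,64,64);    // shadow
    clusters.at<cv::Vec3b>(1, 0) = cv::Vec3b(0,0,0);
    clusters.at<cv::Vec3b>(1, 1) = cv::Vec3b(255,255,255);
    clusters.at<cv::Vec3b>(1, 2) = cv::Vec3b(128,128,128);
    synth->setColorClusters(clusters);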
+
+    void setColorClusters(Mat clusters){
+        CV_Assert(clusters.type()==CV_8UC3);
+        CV_Assert(clusters.cols==3);
+        clusters.copyTo(this->colorClusters_);
+    }
+
+    std::vector<String> listAvailableFonts() const {
+        std::vector<String> res;
+        res=this->availableFonts_;
+        return res;
+    }
+
+    virtual void addBgSampleImage(const Mat& inImg){
+        CV_Assert(inImg.cols>maxResWidth_ && inImg.rows> resHeight_);
+        Mat img;
+        switch(inImg.type()){
+        case CV_8UC1: {
+            cvtColor(inImg, img, COLOR_GRAY2RGBA);
+            break;
+        }
+        case CV_8UC3: {
+            cvtColor(inImg, img, COLOR_RGB2RGBA);
+            break;
+        }
+        case CV_8UC4: {
+            inImg.copyTo(img);
+            break;
+        }
+        default:{
+            CV_Error(Error::StsError,
+                     "Only uchar images of 1, 3, or 4 channels are accepted");
+        }
+        }
+        this->availableBgSampleImages_.push_back(img);
+    }
+
+    void addFontFiles(const std::vector<String>& fntList){
+#ifdef HAVE_QT5GUI
+        for(size_t n=0;n<fntList.size();n++){
+            int addFontSucces=this->fntDb_->addApplicationFont(fntList[n].c_str());
+            if(addFontSucces==-1){
+                CV_Error(Error::StsError,"Failed to load ttf font. QT5 currently doesn't support this under X11");
+            }
+        }
+        this->updateFontNameList(this->availableFonts_);
+#else
+        CV_Assert(fntList.size()>0);//to suppress compilation warning
+        CV_Error(Error::StsError,"QT5 not available, TextSynthesiser is not fully functional.");
+#endif
+    }
+
+    std::vector<String> listBgSampleFiles(){
+        std::vector<String> res(this->availableBgSampleFiles_.size());
+        std::copy(this->availableBgSampleFiles_.begin(),this->availableBgSampleFiles_.end(),res.begin());
+        return res;
+    }
+};
+
+Ptr<TextSynthesizer> TextSynthesizer::create(int sampleHeight, int maxWidth, int script){
+    Ptr<TextSynthesizer> res(new TextSynthesizerQtImpl(script, maxWidth,sampleHeight));
+    return res;
+}
+
+} //namespace text
+} //namespace cv
diff --git a/modules/text/text_config.hpp.in b/modules/text/text_config.hpp.in
index 30089bd3c55..71b32993acf 100644
--- a/modules/text/text_config.hpp.in
+++ b/modules/text/text_config.hpp.in
@@ -1,7 +1,13 @@
 #ifndef __OPENCV_TEXT_CONFIG_HPP__
 #define __OPENCV_TEXT_CONFIG_HPP__
 
+// HAVE QT5
+//#cmakedefine HAVE_QT5GUI
+
+// HAVE CAFFE
+//#cmakedefine HAVE_CAFFE
+
 // HAVE OCR Tesseract
-#cmakedefine HAVE_TESSERACT
+//#cmakedefine HAVE_TESSERACT
 
-#endif
\ No newline at end of file
+#endif