diff --git a/README.md b/README.md index 2ecda66..43c2a5c 100644 --- a/README.md +++ b/README.md @@ -113,121 +113,106 @@ cuDNN下载地址:https://developer.nvidia.com/rdp/form/cudnn-download-survey ## 2. 配置化 -1. config.yaml - System Config - ```yaml - # - requirement.txt - GPU: tensorflow-gpu, CPU: tensorflow - # - If you use the GPU version, you need to install some additional applications. - # TrainRegex and TestRegex: Default matching apple_20181010121212.jpg file. - # - The Default is .*?(?=_.*\.) - # TrainsPath and TestPath: The local absolute path of your training and testing set. - # TestSetNum: This is an optional parameter that is used when you want to extract some of the test set - # - from the training set when you are not preparing the test set separately. - System: - DeviceUsage: 0.5 - TrainRegex: '.*?(?=_)' - TestRegex: '.*?(?=_)' - TestSetNum: 300 - - # CNNNetwork: [CNN5, DenseNet] - # RecurrentNetwork: [BLSTM, LSTM] - # - The recommended configuration is CNN5+BLSTM / DenseNet+BLSTM - # HiddenNum: [64, 128, 256] - # - This parameter indicates the number of nodes used to remember and store past states. - NeuralNet: - CNNNetwork: CNN5 - RecurrentNetwork: BLSTM - HiddenNum: 64 - KeepProb: 0.99 - - # SavedEpochs: A Session.run() execution is called a Epochs, - # - Used to save traininsed to calculate accuracy, Default value is 100. - # TestNum: The number of samples for each test batch. - # - A test for every saved steps. - # CompileAcc: When the accuracy reaches the set threshold, - # - the model will be compiled together each time it is archived. - # - Available for specific usage scenarios. - # EndAcc: Finish the training when the accuracy reaches [EndAcc*100]%. - # EndEpochs: Finish the training when the epoch is greater than the defined epoch. - # PreprocessCollapseRepe ated: If True, then a preprocessing step runs - # - before loss calculation, wherein repeated labels passed to the loss - # - are merged into single labels. This is useful if the training labels come - # - from, e.g., forced alignments and therefore have unnecessary repetitions. - # CTCMergeRepeated: If False, then deep within the CTC calculation, - # - repeated non-blank labels will not be merged and are interpreted - # - as individual labels. This is a simplified (non-standard) version of CTC. - Trains: - SavedSteps: 100 - ValidationSteps: 500 - EndAcc: 0.98 - EndCost: 1 - EndEpochs: 2 - BatchSize: 64 - TestBatchSize: 300 - LearningRate: 0.01 - DecayRate: 0.98 - DecaySteps: 100000 - PreprocessCollapseRepeated: False - CTCMergeRepeated: True - CTCBeamWidth: 5 - CTCTopPaths: 1 - - ``` - - There are several common examples of TrainRegex: - i. apple_20181010121212.jpg - - ``` - .*?(?=_.*\.) - ``` - - ii apple.png - - ``` - .*?(?=\.) - ``` - -2. model.yaml - Model Config +1. model.yaml - Model Config ```yaml - # Sites: A bindable parameter used to select a model. - # - If this parameter is defined, - # - it can be identified by using the model_site parameter - # - to identify a model that is inconsistent with the actual size of the current model. - # ModelName: Corresponding to the model file in the model directory, - # - such as YourModelName.pb, fill in YourModelName here. - # ModelType: This parameter is also used to locate the model. - # - The difference from the sites is that if there is no corresponding site, - # - the size will be used to assign the model. 
- # - If a model of the corresponding size and corresponding to the ModelType is not found,
- # - the model belonging to the category is preferentially selected.
- # CharSet: Provides a default optional built-in solution:
- # - [ALPHANUMERIC, ALPHANUMERIC_LOWER, ALPHANUMERIC_UPPER,
- # -- NUMERIC, ALPHABET_LOWER, ALPHABET_UPPER, ALPHABET]
- # - Or you can use your own customized character set like: ['a', '1', '2'].
- # CharExclude: CharExclude should be a list, like: ['a', '1', '2']
- # - which is convenient for users to freely combine character sets.
- # - If you don't want to manually define the character set manually,
- # - you can choose a built-in character set
- # - and set the characters to be excluded by CharExclude parameter.
- Model:
-   Sites: []
-   ModelName: YourModelName
-   ModelType: 150x50
-   CharSet: ALPHANUMERIC_LOWER
-   CharExclude: []
-   CharReplace: {}
-   ImageWidth: 150
-   ImageHeight: 50
-
- # Binaryzation: [-1: Off, >0 and < 255: On].
- # Smoothing: [-1: Off, >0: On].
- # Blur: [-1: Off, >0: On].
- Pretreatment:
-   Binaryzation: -1
-   Smoothing: -1
-   Blur: -1
-   Resize: [150, 50]
+ # - requirement.txt - GPU: tensorflow-gpu, CPU: tensorflow
+ # - If you use the GPU version, you need to install some additional applications.
+ System:
+   DeviceUsage: 0.7
+
+ # ModelName: Corresponding to the model file in the model directory,
+ # - such as YourModelName.pb, fill in YourModelName here.
+ # CharSet: Provides a default optional built-in solution:
+ # - [ALPHANUMERIC, ALPHANUMERIC_LOWER, ALPHANUMERIC_UPPER,
+ # -- NUMERIC, ALPHABET_LOWER, ALPHABET_UPPER, ALPHABET, ALPHANUMERIC_LOWER_MIX_CHINESE_3500]
+ # - Or you can use your own customized character set like: ['a', '1', '2'].
+ # CharMaxLength: Maximum length of characters, used for label padding.
+ # CharExclude: CharExclude should be a list, like: ['a', '1', '2'],
+ # - which is convenient for users to freely combine character sets.
+ # - If you don't want to define the character set manually,
+ # - you can choose a built-in character set
+ # - and set the characters to be excluded by the CharExclude parameter.
+ Model:
+   Sites: [
+     'YourModelName'
+   ]
+   ModelName: YourModelName
+   ModelType: 150x50
+   CharSet: ALPHANUMERIC_LOWER
+   CharExclude: []
+   CharReplace: {}
+   ImageWidth: 150
+   ImageHeight: 50
+
+ # Binaryzation: [-1: Off, >0 and < 255: On].
+ # Smoothing: [-1: Off, >0: On].
+ # Blur: [-1: Off, >0: On].
+ # Resize: [WIDTH, HEIGHT]
+ # - If the image size is too small, the training effect will be poor and you need to zoom in.
+ # ReplaceTransparent: [True, False]
+ # - True: Convert transparent images in RGBA format to opaque RGB format,
+ # - False: Keep the original image
+ Pretreatment:
+   Binaryzation: -1
+   Smoothing: -1
+   Blur: -1
+   Resize: [150, 50]
+   ReplaceTransparent: True
+
+ # CNNNetwork: [CNN5, ResNet, DenseNet]
+ # RecurrentNetwork: [BLSTM, LSTM, SRU, BSRU, GRU]
+ # - The recommended configuration is CNN5+BLSTM / ResNet+BLSTM
+ # HiddenNum: [64, 128, 256]
+ # - This parameter indicates the number of nodes used to remember and store past states.
+ # Optimizer: Loss function algorithm for calculating gradient.
+ # - [AdaBound, Adam, Momentum]
+ NeuralNet:
+   CNNNetwork: CNN5
+   RecurrentNetwork: BLSTM
+   HiddenNum: 64
+   KeepProb: 0.98
+   Optimizer: AdaBound
+   PreprocessCollapseRepeated: False
+   CTCMergeRepeated: True
+   CTCBeamWidth: 1
+   CTCTopPaths: 1
+
+ # TrainsPath and TestPath: The local absolute path of your training and testing set.
+ # DatasetPath: Samples under this path are packaged into the TFRecords format.
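+ # - (Editor's note: this path is consumed by make_dataset.py, which shuffles the samples
+ # -- and writes the *_trains.tfrecords / *_test.tfrecords files referenced by TrainsPath/TestPath.)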
+ # TrainRegex and TestRegex: Default matching apple_20181010121212.jpg file.
+ # - The Default is .*?(?=_.*\.)
+ # TestSetNum: Optional. The number of samples split off from the training set to form
+ # - the test set when an automatic allocation strategy is used (TestPath not set).
+ # SavedSteps: A Session.run() execution is called a Step,
+ # - Used to save training progress, Default value is 100.
+ # ValidationSteps: Used to calculate accuracy, Default value is 500.
+ # EndAcc: Finish the training when the accuracy reaches [EndAcc*100]% and other conditions.
+ # EndCost: Finish the training when the cost reaches EndCost and other conditions.
+ # EndEpochs: Finish the training when the epoch is greater than the defined epoch and other conditions.
+ # BatchSize: Number of samples selected for one training step.
+ # TestBatchSize: Number of samples selected for one validation step.
+ # LearningRate: Recommended value [0.01: MomentumOptimizer/AdamOptimizer, 0.001: AdaBoundOptimizer]
+ Trains:
+   TrainsPath: './dataset/mnist-CNN5BLSTM-H64-28x28_trains.tfrecords'
+   TestPath: './dataset/mnist-CNN5BLSTM-H64-28x28_test.tfrecords'
+   DatasetPath: [
+     "D:/***"
+   ]
+   TrainRegex: '.*?(?=_)'
+   TestSetNum: 300
+   SavedSteps: 100
+   ValidationSteps: 500
+   EndAcc: 0.95
+   EndCost: 0.1
+   EndEpochs: 2
+   BatchSize: 128
+   TestBatchSize: 300
+   LearningRate: 0.001
+   DecayRate: 0.98
+   DecaySteps: 10000
   ```

 # 工具集
@@ -235,10 +220,7 @@ cuDNN下载地址:https://developer.nvidia.com/rdp/form/cudnn-download-survey

 1. 预处理预览工具,只支持为打包的训练集查看
    ```python -m tools.preview```

-2. 新手指南 (只支持字符集推荐,我觉得是个鸡肋各位请忽略)
-   ```python -m tools.navigator```
-
-3. PyInstaller 一键打包(训练的话支持不好,部署的打包效果不错)
+2. PyInstaller one-click packaging (poorly suited to training, but works well for packaging a deployment)

   ```
   pip install pyinstaller
@@ -249,6 +231,9 @@ cuDNN下载地址:https://developer.nvidia.com/rdp/form/cudnn-download-survey

 1. 命令行或终端运行:```python trains.py```
 2. 使用 PyCharm 运行,右键 Run
+3. **For beginners**: edit the configuration in tutorial.py with an IDE and run it; it bundles recommended configuration, sample packaging, and training in one script.
+
+

 # 开源许可

diff --git a/config.py b/config.py
index 9863550..e90dda0 100644
--- a/config.py
+++ b/config.py
@@ -5,11 +5,10 @@
 import os
 import platform
 import re
-from enum import Enum, unique
-
 import yaml

 from character import *
+from constants import *
 from exception import exception, ConfigException

 # Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
@@ -20,30 +19,10 @@

 IGNORE_FILES = ['.DS_Store']


-class RunMode(object):
-    Test = 'test'
-    Trains = 'trains'
-    Predict = 'predict'
-
-
-@unique
-class CNNNetwork(Enum):
-    CNN5 = 'CNN5'
-    ResNet = 'ResNet'
-
-
-@unique
-class RecurrentNetwork(Enum):
-    LSTM = 'LSTM'
-    BLSTM = 'BLSTM'
-    SRU = 'SRU'
-    BSRU = 'BSRU'
-    GRU = 'GRU'
-
-
 NETWORK_MAP = {
     'CNN5': CNNNetwork.CNN5,
     'ResNet': CNNNetwork.ResNet,
+    'DenseNet': CNNNetwork.DenseNet,
     'LSTM': RecurrentNetwork.LSTM,
     'BLSTM': RecurrentNetwork.BLSTM,
     'SRU': RecurrentNetwork.SRU,
@@ -51,16 +30,21 @@ class RecurrentNetwork(Enum):
     'GRU': RecurrentNetwork.GRU,
 }

-TFRECORDS_NAME_MAP = {
-    RunMode.Trains: 'trains',
-    RunMode.Test: 'test'
+
+OPTIMIZER_MAP = {
+    'AdaBound': Optimizer.AdaBound,
+    'Adam': Optimizer.Adam,
+    'Momentum': Optimizer.Momentum,
+    'SGD': Optimizer.SGD,
+    'AdaGrad': Optimizer.AdaGrad,
+    'RMSProp': Optimizer.RMSProp
 }

 PLATFORM = platform.system()
-SYS_CONFIG_DEMO_NAME = 'config_demo.yaml'
+# SYS_CONFIG_DEMO_NAME = 'config_demo.yaml'
 MODEL_CONFIG_DEMO_NAME = 'model_demo.yaml'
-SYS_CONFIG_NAME = 'config.yaml'
+# SYS_CONFIG_NAME = 'config.yaml'
 MODEL_CONFIG_NAME = 'model.yaml'
 MODEL_PATH = os.path.join(PROJECT_PATH, 'model')
 OUTPUT_PATH = os.path.join(PROJECT_PATH, 'out')
@@ -68,19 +52,19 @@ class RecurrentNetwork(Enum):

 PATH_SPLIT = "\\" if PLATFORM == "Windows" else "/"

-SYS_CONFIG_PATH = os.path.join(PROJECT_PATH, SYS_CONFIG_NAME)
-SYS_CONFIG_PATH = SYS_CONFIG_PATH if os.path.exists(SYS_CONFIG_PATH) else os.path.join("../", SYS_CONFIG_NAME)
+# SYS_CONFIG_PATH = os.path.join(PROJECT_PATH, SYS_CONFIG_NAME)
+# SYS_CONFIG_PATH = SYS_CONFIG_PATH if os.path.exists(SYS_CONFIG_PATH) else os.path.join("../", SYS_CONFIG_NAME)
 MODEL_CONFIG_PATH = os.path.join(PROJECT_PATH, MODEL_CONFIG_NAME)
 MODEL_CONFIG_PATH = MODEL_CONFIG_PATH if os.path.exists(MODEL_CONFIG_PATH) else os.path.join("../", MODEL_CONFIG_NAME)

-with open(SYS_CONFIG_PATH, 'r', encoding="utf-8") as sys_fp:
-    sys_stream = sys_fp.read()
-    cf_system = yaml.load(sys_stream)
+# with open(SYS_CONFIG_PATH, 'r', encoding="utf-8") as sys_fp:
+#     sys_stream = sys_fp.read()
+#     cf_system = yaml.load(sys_stream, Loader=yaml.SafeLoader)

 with open(MODEL_CONFIG_PATH, 'r', encoding="utf-8") as sys_fp:
     sys_stream = sys_fp.read()
-    cf_model = yaml.load(sys_stream)
+    cf_model = yaml.load(sys_stream, Loader=yaml.SafeLoader)


 def char_set(_type):
@@ -115,15 +99,31 @@

 IMAGE_WIDTH = cf_model['Model'].get('ImageWidth')
 IMAGE_CHANNEL = cf_model['Model'].get('ImageChannel')
 IMAGE_CHANNEL = IMAGE_CHANNEL if IMAGE_CHANNEL else 1
+MULTI_SHAPE = False
+

 """NEURAL NETWORK"""
-NEU_CNN = cf_system['NeuralNet'].get('CNNNetwork')
+NEU_CNN = cf_model['NeuralNet'].get('CNNNetwork')
 NEU_CNN = NEU_CNN if NEU_CNN else 'CNN5'
-NEU_RECURRENT = cf_system['NeuralNet'].get('RecurrentNetwork')
+NEU_RECURRENT = cf_model['NeuralNet'].get('RecurrentNetwork')
 NEU_RECURRENT = NEU_RECURRENT if NEU_RECURRENT else 'BLSTM'
-NUM_HIDDEN = cf_system['NeuralNet'].get('HiddenNum')
-OUTPUT_KEEP_PROB = cf_system['NeuralNet'].get('KeepProb')
+NUM_HIDDEN = 
cf_model['NeuralNet'].get('HiddenNum') +OUTPUT_KEEP_PROB = cf_model['NeuralNet'].get('KeepProb') LSTM_LAYER_NUM = 2 +NEU_OPTIMIZER = cf_model['NeuralNet'].get('Optimizer') +NEU_OPTIMIZER = NEU_OPTIMIZER if NEU_OPTIMIZER else 'AdaBound' +PREPROCESS_COLLAPSE_REPEATED = cf_model['NeuralNet'].get('PreprocessCollapseRepeated') +PREPROCESS_COLLAPSE_REPEATED = PREPROCESS_COLLAPSE_REPEATED if PREPROCESS_COLLAPSE_REPEATED is not None else False +CTC_MERGE_REPEATED = cf_model['NeuralNet'].get('CTCMergeRepeated') +CTC_MERGE_REPEATED = CTC_MERGE_REPEATED if CTC_MERGE_REPEATED is not None else True +CTC_BEAM_WIDTH = cf_model['NeuralNet'].get('CTCBeamWidth') +CTC_BEAM_WIDTH = CTC_BEAM_WIDTH if CTC_BEAM_WIDTH is not None else 1 +CTC_TOP_PATHS = cf_model['NeuralNet'].get('CTCTopPaths') +CTC_TOP_PATHS = CTC_TOP_PATHS if CTC_TOP_PATHS is not None else 1 +CTC_LOSS_TIME_MAJOR = True +WARP_CTC = cf_model['NeuralNet'].get('WarpCTC') +WARP_CTC = WARP_CTC if WARP_CTC is not None else False + LEAKINESS = 0.01 NUM_CLASSES = CHAR_SET_LEN + 2 @@ -134,25 +134,23 @@ def char_set(_type): SAVE_CHECKPOINT = os.path.join(MODEL_PATH, CHECKPOINT_TAG) """SYSTEM""" -GPU_USAGE = cf_system['System'].get('DeviceUsage') +GPU_USAGE = cf_model['System'].get('DeviceUsage') """PATH & LABEL""" TRAIN_PATH_IN_MODEL = cf_model.get('Trains') -if TRAIN_PATH_IN_MODEL: - TRAINS_PATH = cf_model['Trains'].get('TrainsPath') - TEST_PATH = cf_model['Trains'].get('TestPath') -else: - TRAINS_PATH = cf_system['System'].get('TrainsPath') - TEST_PATH = cf_system['System'].get('TestPath') -TRAINS_REGEX = cf_system['System'].get('TrainRegex') +TRAINS_PATH = cf_model['Trains'].get('TrainsPath') +TEST_PATH = cf_model['Trains'].get('TestPath') +DATASET_PATH = cf_model['Trains'].get('DatasetPath') + +TRAINS_REGEX = cf_model['Trains'].get('TrainRegex') TRAINS_REGEX = TRAINS_REGEX if TRAINS_REGEX else ".*?(?=_)" -TEST_REGEX = cf_system['System'].get('TestRegex') +TEST_REGEX = cf_model['Trains'].get('TestRegex') TEST_REGEX = TEST_REGEX if TEST_REGEX else (TRAINS_REGEX if TRAINS_REGEX else ".*?(?=_)") -TEST_SET_NUM = cf_system['System'].get('TestSetNum') +TEST_SET_NUM = cf_model['Trains'].get('TestSetNum') TEST_SET_NUM = TEST_SET_NUM if TEST_SET_NUM else 1000 HAS_TEST_SET = TEST_PATH and (os.path.exists(TEST_PATH) if isinstance(TEST_PATH, str) else True) @@ -161,27 +159,22 @@ def char_set(_type): TRAINS_USE_TFRECORDS = isinstance(TRAINS_PATH, str) and TRAINS_PATH.endswith("tfrecords") """TRAINS""" -TRAINS_SAVE_STEPS = cf_system['Trains'].get('SavedSteps') -TRAINS_VALIDATION_STEPS = cf_system['Trains'].get('ValidationSteps') -TRAINS_END_ACC = cf_system['Trains'].get('EndAcc') -TRAINS_END_COST = cf_system['Trains'].get('EndCost') +TRAINS_SAVE_STEPS = cf_model['Trains'].get('SavedSteps') +TRAINS_VALIDATION_STEPS = cf_model['Trains'].get('ValidationSteps') +TRAINS_END_ACC = cf_model['Trains'].get('EndAcc') +TRAINS_END_COST = cf_model['Trains'].get('EndCost') TRAINS_END_COST = TRAINS_END_COST if TRAINS_END_COST else 1 -TRAINS_END_EPOCHS = cf_system['Trains'].get('EndEpochs') -TRAINS_LEARNING_RATE = cf_system['Trains'].get('LearningRate') -DECAY_RATE = cf_system['Trains'].get('DecayRate') -DECAY_STEPS = cf_system['Trains'].get('DecaySteps') -BATCH_SIZE = cf_system['Trains'].get('BatchSize') -TEST_BATCH_SIZE = cf_system['Trains'].get('TestBatchSize') -TEST_BATCH_SIZE = TEST_BATCH_SIZE if TEST_BATCH_SIZE else 200 +TRAINS_END_EPOCHS = cf_model['Trains'].get('EndEpochs') +TRAINS_LEARNING_RATE = cf_model['Trains'].get('LearningRate') +DECAY_RATE = 
cf_model['Trains'].get('DecayRate') +DECAY_RATE = DECAY_RATE if DECAY_RATE else 0.98 +DECAY_STEPS = cf_model['Trains'].get('DecaySteps') +DECAY_STEPS = DECAY_STEPS if DECAY_STEPS else 10000 +BATCH_SIZE = cf_model['Trains'].get('BatchSize') +BATCH_SIZE = BATCH_SIZE if BATCH_SIZE else 64 +TEST_BATCH_SIZE = cf_model['Trains'].get('TestBatchSize') +TEST_BATCH_SIZE = TEST_BATCH_SIZE if TEST_BATCH_SIZE else 300 MOMENTUM = 0.9 -PREPROCESS_COLLAPSE_REPEATED = cf_system['Trains'].get('PreprocessCollapseRepeated') -PREPROCESS_COLLAPSE_REPEATED = PREPROCESS_COLLAPSE_REPEATED if PREPROCESS_COLLAPSE_REPEATED is not None else False -CTC_MERGE_REPEATED = cf_system['Trains'].get('CTCMergeRepeated') -CTC_MERGE_REPEATED = CTC_MERGE_REPEATED if CTC_MERGE_REPEATED is not None else True -CTC_BEAM_WIDTH = cf_system['Trains'].get('CTCBeamWidth') -CTC_BEAM_WIDTH = CTC_BEAM_WIDTH if CTC_BEAM_WIDTH is not None else 1 -CTC_TOP_PATHS = cf_system['Trains'].get('CTCTopPaths') -CTC_TOP_PATHS = CTC_TOP_PATHS if CTC_TOP_PATHS is not None else 1 """PRETREATMENT""" BINARYZATION = cf_model['Pretreatment'].get('Binaryzation') @@ -213,15 +206,15 @@ def init(): if not os.path.exists(OUTPUT_PATH): os.makedirs(OUTPUT_PATH) - if not os.path.exists(SYS_CONFIG_PATH): - exception( - 'Configuration File "{}" No Found. ' - 'If it is used for the first time, please copy one from {} as {}'.format( - SYS_CONFIG_NAME, - SYS_CONFIG_DEMO_NAME, - SYS_CONFIG_NAME - ), ConfigException.SYS_CONFIG_PATH_NOT_EXIST - ) + # if not os.path.exists(SYS_CONFIG_PATH): + # exception( + # 'Configuration File "{}" No Found. ' + # 'If it is used for the first time, please copy one from {} as {}'.format( + # SYS_CONFIG_NAME, + # SYS_CONFIG_DEMO_NAME, + # SYS_CONFIG_NAME + # ), ConfigException.SYS_CONFIG_PATH_NOT_EXIST + # ) if not os.path.exists(MODEL_CONFIG_PATH): exception( @@ -248,7 +241,7 @@ def init(): f.write(checkpoint) -if '../' not in SYS_CONFIG_PATH: +if '../' not in MODEL_CONFIG_PATH: print('Loading Configuration...') print('---------------------------------------------------------------------------------') print("PROJECT_PATH", PROJECT_PATH) @@ -259,6 +252,6 @@ def init(): print('IMAGE_WIDTH: {}, IMAGE_HEIGHT: {}'.format( IMAGE_WIDTH, IMAGE_HEIGHT) ) - print('NEURAL NETWORK: {}'.format(cf_system['NeuralNet'])) + print('NEURAL NETWORK: {}'.format(cf_model['NeuralNet'])) - print('---------------------------------------------------------------------------------') + print('---------------------------------------------------------------------------------') \ No newline at end of file diff --git a/config_demo.yaml b/config_demo.yaml deleted file mode 100644 index 6d7080d..0000000 --- a/config_demo.yaml +++ /dev/null @@ -1,50 +0,0 @@ -# - requirement.txt - GPU: tensorflow-gpu, CPU: tensorflow -# - If you use the GPU version, you need to install some additional applications. -# TrainRegex and TestRegex: Default matching apple_20181010121212.jpg file. -# - The Default is .*?(?=_.*\.) -# TrainsPath and TestPath: The local absolute path of your training and testing set. -# TestSetNum: This is an optional parameter that is used when you want to extract some of the test set -# - from the training set when you are not preparing the test set separately. 
-System: - DeviceUsage: 0.7 - TrainRegex: '.*?(?=_)' - TestRegex: '.*?(?=_)' - TrainsPath: './dataset/mnist-CNN5BLSTM-H64-28x28_trains.tfrecords' - TestPath: './dataset/mnist-CNN5BLSTM-H64-28x28_test.tfrecords' - TestSetNum: 300 - -# CNNNetwork: [CNN5, ResNet] -# RecurrentNetwork: [BLSTM, LSTM, SRU, BSRU, GRU] -# - The recommended configuration is CNN5+BLSTM / ResNet+BLSTM -# HiddenNum: [64, 128, 256] -# - This parameter indicates the number of nodes used to remember and store past states. -NeuralNet: - CNNNetwork: CNN5 - RecurrentNetwork: BLSTM - HiddenNum: 64 - KeepProb: 0.98 - -# SavedSteps: A Session.run() execution is called a Epochs, -# - Used to save training progress, Default value is 100. -# ValidationSteps: Used to calculate accuracy, Default value is 100. -# TestNum: The number of samples for each test batch. -# - A test for every saved steps. -# EndAcc: Finish the training when the accuracy reaches [EndAcc*100]%. -# EndEpochs: Finish the training when the epoch is greater than the defined epoch. -Trains: - SavedSteps: 100 - ValidationSteps: 500 - EndAcc: 0.975 - EndCost: 1 - EndEpochs: 1 - BatchSize: 64 - TestBatchSize: 300 - LearningRate: 0.01 - DecayRate: 0.98 - DecaySteps: 10000 - PreprocessCollapseRepeated: False - CTCMergeRepeated: True - CTCBeamWidth: 5 - CTCTopPaths: 1 - - diff --git a/constants.py b/constants.py new file mode 100644 index 0000000..5e6aaa0 --- /dev/null +++ b/constants.py @@ -0,0 +1,54 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Author: kerlomz +from enum import Enum, unique + + +@unique +class RunMode(Enum): + Test = 'test' + Trains = 'trains' + Predict = 'predict' + + +@unique +class CNNNetwork(Enum): + CNN5 = 'CNN5' + ResNet = 'ResNet' + DenseNet = 'DenseNet' + + +@unique +class RecurrentNetwork(Enum): + LSTM = 'LSTM' + BLSTM = 'BLSTM' + SRU = 'SRU' + BSRU = 'BSRU' + GRU = 'GRU' + + +@unique +class Optimizer(Enum): + AdaBound = 'AdaBound' + Adam = 'Adam' + Momentum = 'Momentum' + SGD = 'SGD' + AdaGrad = 'AdaGrad' + RMSProp = 'RMSProp' + + +@unique +class SimpleCharset(Enum): + NUMERIC = 'NUMBER' + ALPHANUMERIC = 'ALPHANUMERIC' + ALPHANUMERIC_LOWER = 'ALPHANUMERIC_LOWER' + ALPHANUMERIC_UPPER = 'ALPHANUMERIC_UPPER' + ALPHABET_LOWER = 'ALPHABET_LOWER' + ALPHABET_UPPER = 'ALPHABET_UPPER' + ALPHABET = 'ALPHABET' + ARITHMETIC = 'ARITHMETIC' + ALPHANUMERIC_LOWER_MIX_ARITHMETIC = 'ALPHANUMERIC_LOWER_MIX_ARITHMETIC' + FLOAT = 'FLOAT' + CHINESE_3500 = 'CHINESE_3500' + ALPHANUMERIC_LOWER_MIX_CHINESE_3500 = 'ALPHANUMERIC_LOWER_MIX_CHINESE_3500' + diff --git a/framework.py b/framework.py index 4509ffd..a76b3a1 100644 --- a/framework.py +++ b/framework.py @@ -2,16 +2,18 @@ # -*- coding:utf-8 -*- # Author: kerlomz import sys - import tensorflow as tf +from importlib import import_module from distutils.version import StrictVersion from config import * -from network.CNN5 import CNN5 +from network.CNN import CNN5 from network.GRU import GRU from network.LSTM import LSTM, BLSTM from network.ResNet import ResNet50 +from network.DenseNet import DenseNet from network.SRU import SRU, BSRU from network.utils import NetworkUtils +from optimizer.AdaBound import AdaBoundOptimizer class GraphOCR(object): @@ -21,7 +23,7 @@ def __init__(self, mode, cnn: CNNNetwork, recurrent: RecurrentNetwork): self.utils = NetworkUtils(mode) self.network = cnn self.recurrent = recurrent - self.inputs = tf.placeholder(tf.float32, [None, RESIZE[0], RESIZE[1], IMAGE_CHANNEL], name='input') + self.inputs = tf.placeholder(tf.float32, [None, None, RESIZE[1], IMAGE_CHANNEL], name='input') 
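+        # Editor's note: the width dimension is left as None here (it was RESIZE[0] before
+        # this change), so one compiled graph can accept variable-width inputs; only the
+        # height RESIZE[1] and the channel count stay fixed.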
self.labels = tf.sparse_placeholder(tf.int32, name='labels') self.seq_len = None self.merged_summary = None @@ -32,18 +34,25 @@ def build_graph(self): self.merged_summary = tf.summary.merge_all() def _build_model(self): + if self.network == CNNNetwork.CNN5: x = CNN5(inputs=self.inputs, utils=self.utils).build() elif self.network == CNNNetwork.ResNet: x = ResNet50(inputs=self.inputs, utils=self.utils).build() + elif self.network == CNNNetwork.DenseNet: + x = DenseNet(inputs=self.inputs, utils=self.utils).build() + else: - print('This cnn neural network is not supported at this time.') + tf.logging.error('This cnn neural network is not supported at this time.') sys.exit(-1) - shape_list = x.get_shape().as_list() - self.seq_len = tf.fill([tf.shape(x)[0]], shape_list[1], name="seq_len") + # time_major = True: [max_time_step, batch_size, num_classes] + # time_major = False: [batch_size, max_time_step, num_classes] + tf.logging.info("CNN Output: {}".format(x.get_shape())) + + self.seq_len = tf.fill([tf.shape(x)[0]], tf.shape(x)[1], name="seq_len") if self.recurrent == RecurrentNetwork.LSTM: recurrent_network_builder = LSTM(self.utils, x, self.seq_len) @@ -56,7 +65,7 @@ def _build_model(self): elif self.recurrent == RecurrentNetwork.BSRU: recurrent_network_builder = BSRU(self.utils, x, self.seq_len) else: - print('This recurrent neural network is not supported at this time.') + tf.logging.error('This recurrent neural network is not supported at this time.') sys.exit(-1) outputs = recurrent_network_builder.build() @@ -69,10 +78,10 @@ def _build_model(self): name='weight', shape=[outputs.get_shape()[1] if self.network == CNNNetwork.ResNet else NUM_HIDDEN * 2, NUM_CLASSES], dtype=tf.float32, - initializer=tf.truncated_normal_initializer(stddev=0.1), + initializer=tf.contrib.layers.xavier_initializer(), + # initializer=tf.truncated_normal_initializer(stddev=0.1), + # initializer=tf.glorot_normal_initializer(), # initializer=tf.glorot_uniform_initializer(), - # initializer=tf.contrib.layers.xavier_initializer(), - # initializer=tf.truncated_normal([NUM_HIDDEN, NUM_CLASSES], stddev=0.1), ) biases_out = tf.get_variable( name='biases', @@ -91,20 +100,27 @@ def _build_model(self): def _build_train_op(self): self.global_step = tf.train.get_or_create_global_step() # ctc loss function, using forward and backward algorithms and maximum likelihood. 
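+        # Editor's note: both CTC branches below consume self.predict in time-major layout,
+        # [max_time_step, batch_size, num_classes], which is why CTC_LOSS_TIME_MAJOR is True.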
- - self.loss = tf.nn.ctc_loss( - labels=self.labels, - inputs=self.predict, - sequence_length=self.seq_len, - ctc_merge_repeated=CTC_MERGE_REPEATED, - preprocess_collapse_repeated=PREPROCESS_COLLAPSE_REPEATED, - ignore_longer_outputs_than_inputs=False, - time_major=True - ) + if WARP_CTC: + import_module('warpctc_tensorflow') + with tf.get_default_graph()._kernel_label_map({"CTCLoss": "WarpCTC"}): + self.loss = tf.nn.ctc_loss( + inputs=self.predict, + labels=self.labels, + sequence_length=self.seq_len + ) + else: + self.loss = tf.nn.ctc_loss( + labels=self.labels, + inputs=self.predict, + sequence_length=self.seq_len, + ctc_merge_repeated=CTC_MERGE_REPEATED, + preprocess_collapse_repeated=PREPROCESS_COLLAPSE_REPEATED, + ignore_longer_outputs_than_inputs=False, + time_major=CTC_LOSS_TIME_MAJOR + ) self.cost = tf.reduce_mean(self.loss) tf.summary.scalar('cost', self.cost) - self.lrn_rate = tf.train.exponential_decay( TRAINS_LEARNING_RATE, self.global_step, @@ -114,18 +130,59 @@ def _build_train_op(self): ) tf.summary.scalar('learning_rate', self.lrn_rate) - self.optimizer = tf.train.MomentumOptimizer( - learning_rate=self.lrn_rate, - use_nesterov=True, - momentum=MOMENTUM, - ).minimize( - self.cost, - global_step=self.global_step - ) - + update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) + # print(update_ops) # Storing adjusted smoothed mean and smoothed variance operations - train_ops = [self.optimizer] + self.utils.extra_train_ops - self.train_op = tf.group(*train_ops) + with tf.control_dependencies(update_ops): + if OPTIMIZER_MAP[NEU_OPTIMIZER] == Optimizer.AdaBound: + self.train_op = AdaBoundOptimizer( + learning_rate=self.lrn_rate, + final_lr=0.1, + beta1=0.9, + beta2=0.999, + amsbound=True + ).minimize( + loss=self.cost, + global_step=self.global_step + ) + elif OPTIMIZER_MAP[NEU_OPTIMIZER] == Optimizer.Adam: + self.train_op = tf.train.AdamOptimizer( + learning_rate=self.lrn_rate + ).minimize( + self.cost, + global_step=self.global_step + ) + elif OPTIMIZER_MAP[NEU_OPTIMIZER] == Optimizer.Momentum: + self.train_op = tf.train.MomentumOptimizer( + learning_rate=self.lrn_rate, + use_nesterov=True, + momentum=MOMENTUM, + ).minimize( + self.cost, + global_step=self.global_step + ) + elif OPTIMIZER_MAP[NEU_OPTIMIZER] == Optimizer.SGD: + self.train_op = tf.train.GradientDescentOptimizer( + learning_rate=self.lrn_rate, + ).minimize( + self.cost, + global_step=self.global_step + ) + elif OPTIMIZER_MAP[NEU_OPTIMIZER] == Optimizer.AdaGrad: + self.train_op = tf.train.AdagradOptimizer( + learning_rate=self.lrn_rate, + ).minimize( + self.cost, + global_step=self.global_step + ) + elif OPTIMIZER_MAP[NEU_OPTIMIZER] == Optimizer.RMSProp: + self.train_op = tf.train.RMSPropOptimizer( + learning_rate=self.lrn_rate, + decay=DECAY_RATE, + ).minimize( + self.cost, + global_step=self.global_step + ) # Option 2: tf.contrib.ctc.ctc_beam_search_decoder # (it's slower but you'll get better results) @@ -137,13 +194,18 @@ def _build_train_op(self): # Find the optimal path self.decoded, self.log_prob = tf.nn.ctc_beam_search_decoder( - self.predict, - self.seq_len, + inputs=self.predict, + sequence_length=self.seq_len, merge_repeated=False, beam_width=CTC_BEAM_WIDTH, top_paths=CTC_TOP_PATHS, ) + if StrictVersion(tf.__version__) >= StrictVersion('1.12.0'): self.dense_decoded = tf.sparse.to_dense(self.decoded[0], default_value=-1, name="dense_decoded") else: self.dense_decoded = tf.sparse_tensor_to_dense(self.decoded[0], default_value=-1, name="dense_decoded") + + +if __name__ == '__main__': + 
GraphOCR(RunMode.Predict, CNNNetwork.CNN5, RecurrentNetwork.BLSTM).build_graph()
diff --git a/make_dataset.py b/make_dataset.py
index c3dcb6a..bc2617c 100644
--- a/make_dataset.py
+++ b/make_dataset.py
@@ -5,40 +5,33 @@
 import random
 import tensorflow as tf
 from config import *
-
-REGEX_MAP = {
-    RunMode.Trains: TRAINS_REGEX,
-    RunMode.Test: TEST_REGEX
-}
+from constants import RunMode

 _RANDOM_SEED = 0
+label_max_length = 0
+
+TFRECORDS_TYPE = [
+    RunMode.Trains,
+    RunMode.Test
+]

 if not os.path.exists(TFRECORDS_DIR):
     os.makedirs(TFRECORDS_DIR)


 def _image(path):
-
     with open(path, "rb") as f:
         return f.read()


 def _dataset_exists(dataset_dir):
-    for split_name in TFRECORDS_NAME_MAP.values():
-        output_filename = os.path.join(dataset_dir, "{}_{}.tfrecords".format(TARGET_MODEL, split_name))
+    for split_name in TFRECORDS_TYPE:
+        output_filename = os.path.join(dataset_dir, "{}_{}.tfrecords".format(TARGET_MODEL, split_name.value))
         if not tf.gfile.Exists(output_filename):
             return False
     return True


-def _get_all_files(dataset_dir):
-    file_list = []
-    for filename in os.listdir(dataset_dir):
-        path = os.path.join(dataset_dir, filename)
-        file_list.append(path)
-    return file_list
-
-
 def bytes_feature(values):
     return tf.train.Feature(bytes_list=tf.train.BytesList(value=[values]))

@@ -51,17 +44,20 @@ def image_to_tfrecords(image_data, label):


 def _convert_dataset(file_list, mode):
-
-    output_filename = os.path.join(TFRECORDS_DIR, "{}_{}.tfrecords".format(TARGET_MODEL, TFRECORDS_NAME_MAP[mode]))
+    output_filename = os.path.join(TFRECORDS_DIR, "{}_{}.tfrecords".format(TARGET_MODEL, mode.value))
     with tf.python_io.TFRecordWriter(output_filename) as writer:
         for i, file_name in enumerate(file_list):
             try:
                 sys.stdout.write('\r>> Converting image %d/%d ' % (i + 1, len(file_list)))
                 sys.stdout.flush()
                 image_data = _image(file_name)
-                labels = re.search(REGEX_MAP[mode], file_name.split(PATH_SPLIT)[-1]).group()
-                labels = labels.encode('utf-8')
-
+                labels = re.search(TRAINS_REGEX, file_name.split(PATH_SPLIT)[-1])
+                if labels:
+                    labels = labels.group()
+                else:
+                    raise NameError('invalid filename {}'.format(file_name))
+                labels = labels.split('/')[-1]
+                labels = labels.encode('utf-8')
                 example = image_to_tfrecords(image_data, labels)
                 writer.write(example.SerializeToString())

@@ -73,29 +69,28 @@
     sys.stdout.flush()


-def run():
+def make_dataset():
+    dataset_path = DATASET_PATH
     if _dataset_exists(TFRECORDS_DIR):
         print('Exists!')
     else:
-        if isinstance(TRAINS_PATH, list):
+        if not DATASET_PATH and isinstance(TRAINS_PATH, str) and not TRAINS_PATH.endswith("tfrecords"):
+            dataset_path = TRAINS_PATH
+        elif not DATASET_PATH and isinstance(TRAINS_PATH, str) and TRAINS_PATH.endswith("tfrecords"):
+            print('DATASET_PATH is not configured!')
+            exit(-1)
+
+        if isinstance(dataset_path, list):
             origin_dataset = []
-            for trains_path in TRAINS_PATH:
+            for trains_path in dataset_path:
                 origin_dataset += [os.path.join(trains_path, trains) for trains in os.listdir(trains_path)]
         else:
-            origin_dataset = [os.path.join(TRAINS_PATH, trains) for trains in os.listdir(TRAINS_PATH)]
-        if HAS_TEST_SET:
-            trains_dataset = origin_dataset
-            if isinstance(TEST_PATH, list):
-                test_dataset = []
-                for test_path in TEST_PATH:
-                    test_dataset += [os.path.join(test_path, test) for test in os.listdir(test_path)]
-            else:
-                test_dataset = [os.path.join(TEST_PATH, test) for test in os.listdir(TEST_PATH)]
-        else:
-            random.seed(_RANDOM_SEED)
-            random.shuffle(origin_dataset)
-            test_dataset = 
origin_dataset[:TEST_SET_NUM]
-            trains_dataset = origin_dataset[TEST_SET_NUM:]
+            origin_dataset = [os.path.join(dataset_path, trains) for trains in os.listdir(dataset_path)]
+
+        random.seed(_RANDOM_SEED)
+        random.shuffle(origin_dataset)
+        test_dataset = origin_dataset[:TEST_SET_NUM]
+        trains_dataset = origin_dataset[TEST_SET_NUM:]

     _convert_dataset(test_dataset, mode=RunMode.Test)
     _convert_dataset(trains_dataset, mode=RunMode.Trains)
@@ -103,4 +98,4 @@

 if __name__ == '__main__':
-    run()
+    make_dataset()
diff --git a/model_demo.yaml b/model_demo.yaml
index fb93134..353e49d 100644
--- a/model_demo.yaml
+++ b/model_demo.yaml
@@ -1,25 +1,24 @@
-# Sites: A bindable parameter used to select a model.
-# - If this parameter is defined,
-# - it can be identified by using the model_site parameter
-# - to identify a model that is inconsistent with the actual size of the current model.
+# - requirement.txt - GPU: tensorflow-gpu, CPU: tensorflow
+# - If you use the GPU version, you need to install some additional applications.
+System:
+  DeviceUsage: 0.7
+
 # ModelName: Corresponding to the model file in the model directory,
 # - such as YourModelName.pb, fill in YourModelName here.
-# ModelType: This parameter is also used to locate the model.
-# - The difference from the sites is that if there is no corresponding site,
-# - the size will be used to assign the model.
-# - If a model of the corresponding size and corresponding to the ModelType is not found,
-# - the model belonging to the category is preferentially selected.
 # CharSet: Provides a default optional built-in solution:
 # - [ALPHANUMERIC, ALPHANUMERIC_LOWER, ALPHANUMERIC_UPPER,
-# -- NUMERIC, ALPHABET_LOWER, ALPHABET_UPPER, ALPHABET]
+# -- NUMERIC, ALPHABET_LOWER, ALPHABET_UPPER, ALPHABET, ALPHANUMERIC_LOWER_MIX_CHINESE_3500]
 # - Or you can use your own customized character set like: ['a', '1', '2'].
+# CharMaxLength: Maximum length of characters, used for label padding.
 # CharExclude: CharExclude should be a list, like: ['a', '1', '2']
 # - which is convenient for users to freely combine character sets.
 # - If you don't want to manually define the character set manually,
 # - you can choose a built-in character set
 # - and set the characters to be excluded by CharExclude parameter.
 Model:
-  Sites: []
+  Sites: [
+    'YourModelName'
+  ]
   ModelName: YourModelName
   ModelType: 150x50
   CharSet: ALPHANUMERIC_LOWER
@@ -27,15 +26,74 @@ Model:
   CharReplace: {}
   ImageWidth: 150
   ImageHeight: 50
-  ImageChannel: 1
-  Version: 1.0

 # Binaryzation: [-1: Off, >0 and < 255: On].
 # Smoothing: [-1: Off, >0: On].
 # Blur: [-1: Off, >0: On].
 # Resize: [WIDTH, HEIGHT]
 # - If the image size is too small, the training effect will be poor and you need to zoom in.
+# ReplaceTransparent: [True, False]
+# - True: Convert transparent images in RGBA format to opaque RGB format,
+# - False: Keep the original image
 Pretreatment:
   Binaryzation: -1
   Smoothing: -1
-  Blur: -1
\ No newline at end of file
+  Blur: -1
+  Resize: [150, 50]
+  ReplaceTransparent: True
+
+# CNNNetwork: [CNN5, ResNet, DenseNet]
+# RecurrentNetwork: [BLSTM, LSTM, SRU, BSRU, GRU]
+# - The recommended configuration is CNN5+BLSTM / ResNet+BLSTM
+# HiddenNum: [64, 128, 256]
+# - This parameter indicates the number of nodes used to remember and store past states.
+# Optimizer: Loss function algorithm for calculating gradient.
+# - [AdaBound, Adam, Momentum]
+NeuralNet:
+  CNNNetwork: CNN5
+  RecurrentNetwork: BLSTM
+  HiddenNum: 64
+  KeepProb: 0.98
+  Optimizer: AdaBound
+  PreprocessCollapseRepeated: False
+  CTCMergeRepeated: True
+  CTCBeamWidth: 1
+  CTCTopPaths: 1
+  WarpCTC: False
+
+# TrainsPath and TestPath: The local absolute path of your training and testing set.
+# DatasetPath: Samples under this path are packaged into the TFRecords format.
+# TrainRegex and TestRegex: Default matching apple_20181010121212.jpg file.
+# - The Default is .*?(?=_.*\.)
+# TestSetNum: Optional. The number of samples split off from the training set to form
+# - the test set when an automatic allocation strategy is used (TestPath not set).
+# SavedSteps: A Session.run() execution is called a Step,
+# - Used to save training progress, Default value is 100.
+# ValidationSteps: Used to calculate accuracy, Default value is 500.
+# EndAcc: Finish the training when the accuracy reaches [EndAcc*100]% and other conditions.
+# EndCost: Finish the training when the cost reaches EndCost and other conditions.
+# EndEpochs: Finish the training when the epoch is greater than the defined epoch and other conditions.
+# BatchSize: Number of samples selected for one training step.
+# TestBatchSize: Number of samples selected for one validation step.
+# LearningRate: Recommended value [0.01: MomentumOptimizer/AdamOptimizer, 0.001: AdaBoundOptimizer]
+Trains:
+  TrainsPath: './dataset/mnist-CNN5BLSTM-H64-28x28_trains.tfrecords'
+  TestPath: './dataset/mnist-CNN5BLSTM-H64-28x28_test.tfrecords'
+  DatasetPath: [
+    "D:/***"
+  ]
+  TrainRegex: '.*?(?=_)'
+  TestSetNum: 300
+  SavedSteps: 100
+  ValidationSteps: 500
+  EndAcc: 0.95
+  EndCost: 0.1
+  EndEpochs: 2
+  BatchSize: 128
+  TestBatchSize: 300
+  LearningRate: 0.001
+  DecayRate: 0.98
+  DecaySteps: 10000
+
+
diff --git a/network/CNN.py b/network/CNN.py
new file mode 100644
index 0000000..b59e648
--- /dev/null
+++ b/network/CNN.py
@@ -0,0 +1,32 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Author: kerlomz
+import tensorflow as tf
+from network.utils import NetworkUtils
+from config import IMAGE_CHANNEL
+
+
+class CNN5(object):
+
+    def __init__(self, inputs: tf.Tensor, utils: NetworkUtils):
+        self.inputs = inputs
+        self.utils = utils
+        # (in_channels, out_channels)
+        self.filters = [(IMAGE_CHANNEL, 32), (32, 64), (64, 128), (128, 128), (128, 64)]
+        # (conv2d_strides, max_pool_strides)
+        self.strides = [(1, 1), (1, 2), (1, 2), (1, 2), (1, 2)]
+        self.filter_size = [7, 5, 3, 3, 3]
+
+    def build(self):
+        with tf.variable_scope('cnn'):
+            x = self.inputs
+            x = self.utils.cnn_layers(
+                inputs=x,
+                filter_size=self.filter_size,
+                filters=self.filters,
+                strides=self.strides
+            )
+
+            shape_list = x.get_shape().as_list()
+            x = tf.reshape(x, [tf.shape(x)[0], -1, shape_list[2] * shape_list[3]])
+            return x
diff --git a/network/CNN5.py b/network/CNN5.py
deleted file mode 100644
index 018d52a..0000000
--- a/network/CNN5.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding:utf-8 -*-
-# Author: kerlomz
-import tensorflow as tf
-from network.utils import NetworkUtils
-from config import IMAGE_CHANNEL
-
-
-class CNN5(object):
-
-    def __init__(self, inputs: tf.Tensor, utils: NetworkUtils):
-        self.inputs = inputs
-        self.utils = utils
-        self.filters = [32, 64, 128, 128, 64]
-        self.strides = [1, 2]
-
-    def build(self):
-        with tf.variable_scope('cnn'):
-            with tf.variable_scope('unit-1'):
-                x = 
self.utils.conv2d(self.inputs, 'cnn-1', 7, IMAGE_CHANNEL, self.filters[0], self.strides[0]) - x = self.utils.batch_norm('bn1', x) - x = self.utils.leaky_relu(x, 0.01) - x = self.utils.max_pool(x, 2, self.strides[0]) - - with tf.variable_scope('unit-2'): - x = self.utils.conv2d(x, 'cnn-2', 5, self.filters[0], self.filters[1], self.strides[0]) - x = self.utils.batch_norm('bn2', x) - x = self.utils.leaky_relu(x, 0.01) - x = self.utils.max_pool(x, 2, self.strides[1]) - - with tf.variable_scope('unit-3'): - x = self.utils.conv2d(x, 'cnn-3', 3, self.filters[1], self.filters[2], self.strides[0]) - x = self.utils.batch_norm('bn3', x) - x = self.utils.leaky_relu(x, 0.01) - x = self.utils.max_pool(x, 2, self.strides[1]) - - with tf.variable_scope('unit-4'): - x = self.utils.conv2d(x, 'cnn-4', 3, self.filters[2], self.filters[3], self.strides[0]) - x = self.utils.batch_norm('bn4', x) - x = self.utils.leaky_relu(x, 0.01) - x = self.utils.max_pool(x, 2, self.strides[1]) - - with tf.variable_scope('unit-5'): - x = self.utils.conv2d(x, 'cnn-5', 3, self.filters[3], self.filters[4], self.strides[0]) - x = self.utils.batch_norm('bn5', x) - x = self.utils.leaky_relu(x, 0.01) - x = self.utils.max_pool(x, 2, self.strides[1]) - - shape_list = x.get_shape().as_list() - x = tf.reshape(x, [-1, shape_list[1], shape_list[2] * shape_list[3]]) - return x diff --git a/network/DenseNet.py b/network/DenseNet.py new file mode 100644 index 0000000..b81e55f --- /dev/null +++ b/network/DenseNet.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Author: kerlomz +import tensorflow as tf +from network.utils import NetworkUtils + + +class DenseNet(object): + + def __init__(self, inputs: tf.Tensor, utils: NetworkUtils): + self.inputs = inputs + self.utils = utils + self.nb_filter = 12 + self.strides = (2, 2) + self.kernel_size = 5 + self.padding = "SAME" + + def build(self): + with tf.variable_scope('DenseNet'): + x = tf.layers.conv2d( + inputs=self.inputs, + filters=self.nb_filter, + kernel_size=self.kernel_size, + strides=self.strides, + padding=self.padding, + use_bias=False + ) + x, nb_filter = self.utils.dense_block(x, 8, 8, self.nb_filter) + x, nb_filter = self.utils.transition_block(x, 128, pool_type=2) + x, nb_filter = self.utils.dense_block(x, 8, 8, nb_filter) + x, nb_filter = self.utils.transition_block(x, 128, pool_type=3) + x, nb_filter = self.utils.dense_block(x, 8, 8, nb_filter) + + shape_list = x.get_shape().as_list() + x = tf.reshape(x, [tf.shape(x)[0], -1, shape_list[2] * shape_list[3]]) + return x diff --git a/network/ResNet.py b/network/ResNet.py index 8216498..77b3f79 100644 --- a/network/ResNet.py +++ b/network/ResNet.py @@ -29,8 +29,8 @@ def build(self): ) a1 = self.utils.batch_norm(x=a1, name='bn_conv1') - a1 = tf.nn.relu(a1) - # a1 = self._leaky_relu(a1) + # a1 = tf.nn.relu(a1) + a1 = self.utils.leaky_relu(a1) a1 = tf.nn.max_pool(a1, ksize=(1, 3, 3, 1), strides=(1, 2, 2, 1), padding='VALID') @@ -59,5 +59,5 @@ def build(self): x = self.utils.identity_block(a5, 3, [512, 512, 2048], stage=5, block='c') shape_list = x.get_shape().as_list() - x = tf.reshape(x, [-1, shape_list[1] * shape_list[2], shape_list[3]]) + x = tf.reshape(x, [tf.shape(x)[0], tf.shape(x)[1] * shape_list[2], shape_list[3]]) return x diff --git a/network/utils.py b/network/utils.py index 37d6525..0dc7872 100644 --- a/network/utils.py +++ b/network/utils.py @@ -1,9 +1,9 @@ #!/usr/bin/env python3 # -*- coding:utf-8 -*- # Author: kerlomz +import math import tensorflow as tf from config import * -from 
tensorflow.python.training import moving_averages class NetworkUtils(object): @@ -18,15 +18,43 @@ def zero_padding(x, pad=(3, 3)): return tf.pad(x, padding, 'CONSTANT') @staticmethod - def conv2d(x, name, filter_size, in_channels, out_channels, strides, padding='SAME'): + def msra_initializer(kl, dl): + """ MSRA weight initializer + (https://arxiv.org/pdf/1502.01852.pdf) + Keyword arguments: + kl -- kernel size + dl -- filter numbers + """ + + stddev = math.sqrt(2. / (kl ** 2 * dl)) + return tf.truncated_normal_initializer(stddev=stddev) + + def cnn_layers(self, inputs, filter_size, filters, strides): + x = inputs + for i in range(len(filter_size)): + with tf.variable_scope('unit-{}'.format(i + 1)): + x = self.conv2d( + x=x, + name='cnn-{}'.format(i + 1), + filter_size=filter_size[i], + in_channels=filters[i][0], + out_channels=filters[i][1], + strides=strides[i][0] + ) + x = self.batch_norm('bn{}'.format(i + 1), x) + x = self.leaky_relu(x, 0.01) + x = self.max_pool(x, 2, strides[i][1]) + return x + + def conv2d(self, x, name, filter_size, in_channels, out_channels, strides, padding='SAME'): # n = filter_size * filter_size * out_channels with tf.variable_scope(name): kernel = tf.get_variable( name='DW', shape=[filter_size, filter_size, in_channels, out_channels], dtype=tf.float32, - initializer=tf.contrib.layers.xavier_initializer() - # initializer=tf.random_normal_initializer(stddev=np.sqrt(2.0 / n)) + # initializer=tf.contrib.layers.xavier_initializer(), + initializer=self.msra_initializer(filter_size, in_channels), ) b = tf.get_variable( @@ -200,10 +228,14 @@ def leaky_relu(x, leakiness=0.0): @staticmethod def max_pool(x, ksize, strides): + if isinstance(ksize, int): + ksize = [ksize, ksize] + if isinstance(strides, int): + strides = [strides, strides] return tf.nn.max_pool( x, - ksize=[1, ksize, ksize, 1], - strides=[1, strides, strides, 1], + ksize=[1, ksize[0], ksize[1], 1], + strides=[1, strides[0], strides[1], 1], padding='SAME', name='max_pool' ) @@ -239,63 +271,48 @@ def stacked_bidirectional_rnn(rnn, num_units, num_layers, inputs, seq_lengths): return _inputs def batch_norm(self, name, x): - with tf.variable_scope(name): - params_shape = [x.get_shape()[-1]] - # offset - beta = tf.get_variable( - 'beta', - params_shape, - tf.float32, - initializer=tf.constant_initializer(0.0, tf.float32) - ) - # scale - gamma = tf.get_variable( - 'gamma', - params_shape, - tf.float32, - initializer=tf.constant_initializer(1.0, tf.float32) - ) + return tf.layers.batch_normalization(x, training=self.mode == RunMode.Trains, fused=True, name=name) - if self.mode == RunMode.Trains: - # Calculate the mean and standard deviation for each channel. - mean, variance = tf.nn.moments(x, [0, 1, 2], name='moments') - # New or build batch average, standard deviation used in the test phase. 
- moving_mean = tf.get_variable( - 'moving_mean', - params_shape, tf.float32, - initializer=tf.constant_initializer(0.0, tf.float32), - trainable=False - ) - moving_variance = tf.get_variable( - 'moving_variance', - params_shape, tf.float32, - initializer=tf.constant_initializer(1.0, tf.float32), - trainable=False - ) - # Add update operation for batch mean and standard deviation (sliding average) - # moving_mean = moving_mean * decay + mean * (1 - decay) - # moving_variance = moving_variance * decay + variance * (1 - decay) - self.extra_train_ops.append(moving_averages.assign_moving_average(moving_mean, mean, 0.9)) - self.extra_train_ops.append(moving_averages.assign_moving_average(moving_variance, variance, 0.9)) - else: - # Obtain the batch mean and standard deviation accumulated during training. - mean = tf.get_variable( - 'moving_mean', - params_shape, tf.float32, - initializer=tf.constant_initializer(0.0, tf.float32), - trainable=False - ) - variance = tf.get_variable( - 'moving_variance', - params_shape, tf.float32, - initializer=tf.constant_initializer(1.0, tf.float32), - trainable=False - ) - # Add to histogram summary. - tf.summary.histogram(mean.op.name, mean) - tf.summary.histogram(variance.op.name, variance) - - # BN Layer:((x-mean)/var)*gamma+beta - y = tf.nn.batch_normalization(x, mean, variance, beta, gamma, 0.001) - y.set_shape(x.get_shape()) - return y + def conv_block(self, x, growth_rate, dropout_rate=None): + _x = self.batch_norm(name=None, x=x) + _x = self.leaky_relu(_x) + + _x = tf.layers.conv2d( + inputs=_x, + filters=growth_rate, + kernel_size=3, + strides=(1, 1), + padding='SAME', + kernel_initializer=self.msra_initializer(3, growth_rate) + ) + if dropout_rate is not None: + _x = tf.nn.dropout(_x, dropout_rate) + return _x + + def dense_block(self, x, nb_layers, growth_rate, nb_filter, dropout_rate=0.2): + for i in range(nb_layers): + cb = self.conv_block(x, growth_rate, dropout_rate) + x = tf.concat([x, cb], 3) + nb_filter += growth_rate + return x, nb_filter + + def transition_block(self, x, filters, dropout_kp=None, pool_type=1): + _x = self.batch_norm(name=None, x=x) + _x = self.leaky_relu(_x) + _x = tf.layers.conv2d( + inputs=_x, + filters=filters, + kernel_size=1, + strides=(1, 1), + padding='SAME', + kernel_initializer=self.msra_initializer(3, filters) + ) + if dropout_kp is not None: + _x = tf.nn.dropout(_x, dropout_kp) + if pool_type == 2: + _x = tf.nn.avg_pool(_x, [1, 2, 2, 1], [1, 2, 2, 1], "VALID") + elif pool_type == 1: + _x = tf.nn.avg_pool(_x, [1, 2, 2, 1], [1, 2, 1, 1], "SAME") + elif pool_type == 3: + _x = tf.nn.avg_pool(_x, [1, 2, 2, 1], [1, 1, 2, 1], "SAME") + return _x, filters diff --git a/optimizer/AdaBound.py b/optimizer/AdaBound.py new file mode 100644 index 0000000..4cee26b --- /dev/null +++ b/optimizer/AdaBound.py @@ -0,0 +1,264 @@ +#!/usr/bin/env python3 +# -*- coding:utf-8 -*- +# Author: kerlomz +import tensorflow as tf +from distutils.version import StrictVersion +from tensorflow.python.eager import context +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.training import optimizer +from tensorflow.python.ops.clip_ops import clip_by_value + +"""Implements AdaBound algorithm. + It has been proposed in `Adaptive Gradient Methods with Dynamic Bound of Learning Rate`_. 
+ Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): Adam learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + final_lr (float, optional): final (SGD) learning rate (default: 0.1) + gamma (float, optional): convergence speed of the bound functions (default: 1e-3) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsbound (boolean, optional): whether to use the AMSBound variant of this algorithm + .. Adaptive Gradient Methods with Dynamic Bound of Learning Rate: + https://openreview.net/forum?id=Bkg3g2R9FX + """ + + +class AdaBoundOptimizer(optimizer.Optimizer): + def __init__(self, learning_rate=0.001, final_lr=0.1, beta1=0.9, beta2=0.999, + gamma=1e-3, epsilon=1e-8, amsbound=False, + use_locking=False, name="AdaBound"): + super(AdaBoundOptimizer, self).__init__(use_locking, name) + self._lr = learning_rate + self._final_lr = final_lr + self._beta1 = beta1 + self._beta2 = beta2 + self._epsilon = epsilon + + self._gamma = gamma + self._amsbound = amsbound + + self._lr_t = None + self._beta1_t = None + self._beta2_t = None + self._epsilon_t = None + + def _create_slots(self, var_list): + first_var = min(var_list, key=lambda x: x.name) + if StrictVersion(tf.__version__) >= StrictVersion('1.10.0'): + graph = None if context.executing_eagerly() else ops.get_default_graph() + else: + graph = ops.get_default_graph() + create_new = self._get_non_slot_variable("beta1_power", graph) is None + if not create_new and context.in_graph_mode(): + create_new = (self._get_non_slot_variable("beta1_power", graph).graph is not first_var.graph) + + if create_new: + self._create_non_slot_variable(initial_value=self._beta1, + name="beta1_power", + colocate_with=first_var) + self._create_non_slot_variable(initial_value=self._beta2, + name="beta2_power", + colocate_with=first_var) + self._create_non_slot_variable(initial_value=self._gamma, + name="gamma_multi", + colocate_with=first_var) + # Create slots for the first and second moments. 
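+        # Editor's note: "m" and "v" mirror Adam's first and second moment estimates;
+        # "vhat" tracks the running maximum of "v" used by the AMSBound variant (amsbound=True).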
+ for v in var_list : + self._zeros_slot(v, "m", self._name) + self._zeros_slot(v, "v", self._name) + self._zeros_slot(v, "vhat", self._name) + + def _prepare(self): + self._lr_t = ops.convert_to_tensor(self._lr) + self._base_lr_t = ops.convert_to_tensor(self._lr) + self._beta1_t = ops.convert_to_tensor(self._beta1) + self._beta2_t = ops.convert_to_tensor(self._beta2) + self._epsilon_t = ops.convert_to_tensor(self._epsilon) + self._gamma_t = ops.convert_to_tensor(self._gamma) + + def _apply_dense(self, grad, var): + if StrictVersion(tf.__version__) >= StrictVersion('1.10.0'): + graph = None if context.executing_eagerly() else ops.get_default_graph() + else: + graph = ops.get_default_graph() + beta1_power = math_ops.cast(self._get_non_slot_variable("beta1_power", graph=graph), var.dtype.base_dtype) + beta2_power = math_ops.cast(self._get_non_slot_variable("beta2_power", graph=graph), var.dtype.base_dtype) + lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) + base_lr_t = math_ops.cast(self._base_lr_t, var.dtype.base_dtype) + beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) + beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) + epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) + gamma_multi = math_ops.cast(self._get_non_slot_variable("gamma_multi", graph=graph), var.dtype.base_dtype) + + step_size = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) + final_lr = self._final_lr * lr_t / base_lr_t + lower_bound = final_lr * (1. - 1. / (gamma_multi + 1.)) + upper_bound = final_lr * (1. + 1. / (gamma_multi)) + + # m_t = beta1 * m + (1 - beta1) * g_t + m = self.get_slot(var, "m") + m_scaled_g_values = grad * (1 - beta1_t) + m_t = state_ops.assign(m, beta1_t * m + m_scaled_g_values, use_locking=self._use_locking) + + # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) + v = self.get_slot(var, "v") + v_scaled_g_values = (grad * grad) * (1 - beta2_t) + v_t = state_ops.assign(v, beta2_t * v + v_scaled_g_values, use_locking=self._use_locking) + + # amsgrad + vhat = self.get_slot(var, "vhat") + if self._amsbound : + vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat)) + v_sqrt = math_ops.sqrt(vhat_t) + else: + vhat_t = state_ops.assign(vhat, vhat) + v_sqrt = math_ops.sqrt(v_t) + + # Compute the bounds + step_size_bound = step_size / (v_sqrt + epsilon_t) + bounded_lr = m_t * clip_by_value(step_size_bound, lower_bound, upper_bound) + + var_update = state_ops.assign_sub(var, bounded_lr, use_locking=self._use_locking) + return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t]) + + def _resource_apply_dense(self, grad, var): + if StrictVersion(tf.__version__) >= StrictVersion('1.10.0'): + graph = None if context.executing_eagerly() else ops.get_default_graph() + else: + graph = ops.get_default_graph() + beta1_power = math_ops.cast(self._get_non_slot_variable("beta1_power", graph=graph), grad.dtype.base_dtype) + beta2_power = math_ops.cast(self._get_non_slot_variable("beta2_power", graph=graph), grad.dtype.base_dtype) + lr_t = math_ops.cast(self._lr_t, grad.dtype.base_dtype) + base_lr_t = math_ops.cast(self._base_lr_t, var.dtype.base_dtype) + beta1_t = math_ops.cast(self._beta1_t, grad.dtype.base_dtype) + beta2_t = math_ops.cast(self._beta2_t, grad.dtype.base_dtype) + epsilon_t = math_ops.cast(self._epsilon_t, grad.dtype.base_dtype) + gamma_multi = math_ops.cast(self._get_non_slot_variable("gamma_multi", graph=graph), var.dtype.base_dtype) + + step_size = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) + final_lr = self._final_lr * lr_t / 
base_lr_t + lower_bound = final_lr * (1. - 1. / (gamma_multi + 1.)) + upper_bound = final_lr * (1. + 1. / (gamma_multi)) + + # m_t = beta1 * m + (1 - beta1) * g_t + m = self.get_slot(var, "m") + m_scaled_g_values = grad * (1 - beta1_t) + m_t = state_ops.assign(m, beta1_t * m + m_scaled_g_values, use_locking=self._use_locking) + + # v_t = beta2 * v + (1 - beta2) * (g_t * g_t) + v = self.get_slot(var, "v") + v_scaled_g_values = (grad * grad) * (1 - beta2_t) + v_t = state_ops.assign(v, beta2_t * v + v_scaled_g_values, use_locking=self._use_locking) + + # amsgrad + vhat = self.get_slot(var, "vhat") + if self._amsbound: + vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat)) + v_sqrt = math_ops.sqrt(vhat_t) + else: + vhat_t = state_ops.assign(vhat, vhat) + v_sqrt = math_ops.sqrt(v_t) + + # Compute the bounds + step_size_bound = step_size / (v_sqrt + epsilon_t) + bounded_lr = m_t * clip_by_value(step_size_bound, lower_bound, upper_bound) + + var_update = state_ops.assign_sub(var, bounded_lr, use_locking=self._use_locking) + + return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t]) + + def _apply_sparse_shared(self, grad, var, indices, scatter_add): + if StrictVersion(tf.__version__) >= StrictVersion('1.10.0'): + graph = None if context.executing_eagerly() else ops.get_default_graph() + else: + graph = ops.get_default_graph() + beta1_power = math_ops.cast(self._get_non_slot_variable("beta1_power", graph=graph), var.dtype.base_dtype) + beta2_power = math_ops.cast(self._get_non_slot_variable("beta2_power", graph=graph), var.dtype.base_dtype) + lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype) + base_lr_t = math_ops.cast(self._base_lr_t, var.dtype.base_dtype) + beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype) + beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype) + epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype) + gamma_t = math_ops.cast(self._gamma_t, var.dtype.base_dtype) + + step_size = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power)) + final_lr = self._final_lr * lr_t / base_lr_t + lower_bound = final_lr * (1. - 1. / (gamma_t + 1.)) + upper_bound = final_lr * (1. + 1. 
+    def _apply_sparse(self, grad, var):
+        return self._apply_sparse_shared(
+            grad.values, var, grad.indices,
+            lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
+                x, i, v, use_locking=self._use_locking))
+
+    def _resource_scatter_add(self, x, i, v):
+        with ops.control_dependencies(
+                [resource_variable_ops.resource_scatter_add(x, i, v)]):
+            return x.value()
+
+    def _resource_apply_sparse(self, grad, var, indices):
+        return self._apply_sparse_shared(
+            grad, var, indices, self._resource_scatter_add)
+
+    def _finish(self, update_ops, name_scope):
+        # Update the power accumulators.
+        with ops.control_dependencies(update_ops):
+            if StrictVersion(tf.__version__) >= StrictVersion('1.10.0'):
+                graph = None if context.executing_eagerly() else ops.get_default_graph()
+            else:
+                graph = ops.get_default_graph()
+            beta1_power = self._get_non_slot_variable("beta1_power", graph=graph)
+            beta2_power = self._get_non_slot_variable("beta2_power", graph=graph)
+            gamma_multi = self._get_non_slot_variable("gamma_multi", graph=graph)
+            with ops.colocate_with(beta1_power):
+                update_beta1 = beta1_power.assign(
+                    beta1_power * self._beta1_t,
+                    use_locking=self._use_locking)
+                update_beta2 = beta2_power.assign(
+                    beta2_power * self._beta2_t,
+                    use_locking=self._use_locking)
+                update_gamma = gamma_multi.assign(
+                    gamma_multi + self._gamma_t,
+                    use_locking=self._use_locking)
+        return control_flow_ops.group(*update_ops + [update_beta1, update_beta2, update_gamma], name=name_scope)
diff --git a/optimizer/__init__.py b/optimizer/__init__.py
new file mode 100644
index 0000000..6c85277
--- /dev/null
+++ b/optimizer/__init__.py
@@ -0,0 +1,3 @@
+#!/usr/bin/env python3
+# -*- coding:utf-8 -*-
+# Author: kerlomz
\ No newline at end of file
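For context, a usage sketch of the optimizer this file defines. The import path and keyword names here are assumptions inferred from the fields the methods reference (`self._lr`, `self._final_lr`, `self._beta1`, `self._beta2`, `self._gamma`, `self._amsbound`); the constructor itself is not part of this hunk:

```python
import tensorflow as tf
from optimizer.AdaBound import AdaBoundOptimizer  # hypothetical import path

w = tf.Variable([1.0, 2.0])
loss = tf.reduce_sum(tf.square(w))
global_step = tf.train.get_or_create_global_step()

# Keyword names below are assumed, not confirmed by this diff.
train_op = AdaBoundOptimizer(
    learning_rate=0.001, final_lr=0.1,
    beta1=0.9, beta2=0.999, gamma=1e-3, amsbound=True
).minimize(loss, global_step=global_step)
```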
diff --git a/predict_testing.py b/predict_testing.py
index e9c6c08..b49c097 100644
--- a/predict_testing.py
+++ b/predict_testing.py
@@ -6,8 +6,11 @@
 import numpy as np
 import PIL.Image as PIL_Image
 import tensorflow as tf
+from importlib import import_module
 from config import *
+from constants import RunMode
 from pretreatment import preprocessing
+from framework import GraphOCR
 
 
 def get_image_batch(img_bytes):
@@ -28,7 +31,12 @@ def load_image(image_bytes):
     im = np.array(pil_image)
     im = preprocessing(im, BINARYZATION, SMOOTH, BLUR).astype(np.float32)
-    im = cv2.resize(im, (RESIZE[0], RESIZE[1]))
+    if RESIZE[0] == -1:
+        ratio = RESIZE[1] / size[1]
+        resize_width = int(ratio * size[0])
+        im = cv2.resize(im, (resize_width, RESIZE[1]))
+    else:
+        im = cv2.resize(im, (RESIZE[0], RESIZE[1]))
     im = im.swapaxes(0, 1)
     return (im[:, :, np.newaxis] if IMAGE_CHANNEL == 1 else im[:, :]) / 255.
@@ -58,6 +66,8 @@ def predict_func(image_batch, _sess, dense_decoded, op_input):
 if __name__ == '__main__':
+    if WARP_CTC:
+        import_module('warpctc_tensorflow')
     graph = tf.Graph()
     tf_checkpoint = tf.train.latest_checkpoint(MODEL_PATH)
     sess = tf.Session(
@@ -66,17 +76,29 @@ def predict_func(image_batch, _sess, dense_decoded, op_input):
         # allow_soft_placement=True,
         # log_device_placement=True,
         gpu_options=tf.GPUOptions(
+            allocator_type='BFC',
             # allow_growth=True,  # it will cause fragmentation.
-            per_process_gpu_memory_fraction=0.1
+            per_process_gpu_memory_fraction=0.01
         ))
     )
     graph_def = graph.as_graph_def()
     with graph.as_default():
         sess.run(tf.global_variables_initializer())
+        # with tf.gfile.GFile(COMPILE_MODEL_PATH.replace('.pb', '_{}.pb'.format(int(0.95 * 10000))), "rb") as f:
+        #     graph_def_file = f.read()
+        # graph_def.ParseFromString(graph_def_file)
+        # print('{}.meta'.format(tf_checkpoint))
+        model = GraphOCR(
+            RunMode.Predict,
+            NETWORK_MAP[NEU_CNN],
+            NETWORK_MAP[NEU_RECURRENT]
+        )
+        model.build_graph()
+        saver = tf.train.Saver(tf.global_variables())
+
+        saver.restore(sess, tf.train.latest_checkpoint(MODEL_PATH))
         _ = tf.import_graph_def(graph_def, name="")
-        saver = tf.train.import_meta_graph('{}.meta'.format(tf_checkpoint))
-        saver.restore(sess, tf_checkpoint)
 
     dense_decoded_op = sess.graph.get_tensor_by_name("dense_decoded:0")
     x_op = sess.graph.get_tensor_by_name('input:0')
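The `RESIZE[0] == -1` branch above keeps the aspect ratio and fixes only the height. A self-contained sketch of that resize rule, with `target_height` standing in for `RESIZE[1]`:

```python
import cv2
import numpy as np

def resize_keep_ratio(im: np.ndarray, target_height: int) -> np.ndarray:
    """Scale the image so its height matches target_height; width follows proportionally."""
    height, width = im.shape[:2]
    ratio = target_height / height
    resize_width = int(ratio * width)
    return cv2.resize(im, (resize_width, target_height))

im = np.zeros((50, 180), dtype=np.float32)   # e.g. a 180x50 grayscale captcha
print(resize_keep_ratio(im, 50).shape)        # -> (50, 180), unchanged here
```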
diff --git a/trains.py b/trains.py
index e742e22..d172e24 100644
--- a/trains.py
+++ b/trains.py
@@ -3,7 +3,6 @@
 # Author: kerlomz
 import time
 import random
-import logging
 import numpy as np
 import tensorflow as tf
 import framework
@@ -13,9 +12,7 @@
 from PIL import ImageFile
 
 ImageFile.LOAD_TRUNCATED_IMAGES = True
-
-logger = logging.getLogger('Training for OCR using {}+{}+CTC'.format(NEU_CNN, NEU_RECURRENT))
-logger.setLevel(logging.INFO)
+tf.logging.set_verbosity(tf.logging.INFO)
 
 
 def compile_graph(acc):
@@ -30,7 +27,8 @@ def compile_graph(acc):
     )
     model.build_graph()
     input_graph_def = sess.graph.as_graph_def()
-    saver = tf.train.Saver()
+    saver = tf.train.Saver(var_list=tf.global_variables())
+    tf.logging.info(tf.train.latest_checkpoint(MODEL_PATH))
     saver.restore(sess, tf.train.latest_checkpoint(MODEL_PATH))
 
     output_graph_def = convert_variables_to_constants(
@@ -40,7 +38,7 @@
     )
 
     last_compile_model_path = COMPILE_MODEL_PATH.replace('.pb', '_{}.pb'.format(int(acc * 10000)))
-    with tf.gfile.FastGFile(last_compile_model_path, mode='wb') as gf:
+    with tf.gfile.GFile(last_compile_model_path, mode='wb') as gf:
         gf.write(output_graph_def.SerializeToString())
     generate_config(acc)
@@ -50,11 +48,11 @@ def train_process(mode=RunMode.Trains):
     model = framework.GraphOCR(mode, NETWORK_MAP[NEU_CNN], NETWORK_MAP[NEU_RECURRENT])
     model.build_graph()
 
-    print('Loading Trains DataSet...')
+    tf.logging.info('Loading Trains DataSet...')
     train_feeder = utils.DataIterator(mode=RunMode.Trains)
     if TRAINS_USE_TFRECORDS:
         train_feeder.read_sample_from_tfrecords(TRAINS_PATH)
-        print('Loading Test DataSet...')
+        tf.logging.info('Loading Test DataSet...')
         test_feeder = utils.DataIterator(mode=RunMode.Test)
         test_feeder.read_sample_from_tfrecords(TEST_PATH)
     else:
@@ -64,7 +62,7 @@
             origin_list += [os.path.join(trains_path, trains) for trains in os.listdir(trains_path)]
         else:
             origin_list = [os.path.join(TRAINS_PATH, trains) for trains in os.listdir(TRAINS_PATH)]
-        random.shuffle(origin_list)
+        np.random.shuffle(origin_list)
         if not HAS_TEST_SET:
             test_list = origin_list[:TEST_SET_NUM]
             trains_list = origin_list[TEST_SET_NUM:]
@@ -75,15 +73,15 @@
                 test_list += [os.path.join(test_path, test) for test in os.listdir(test_path)]
             else:
                 test_list = [os.path.join(TEST_PATH, test) for test in os.listdir(TEST_PATH)]
-            random.shuffle(test_list)
+            np.random.shuffle(test_list)
             trains_list = origin_list
         train_feeder.read_sample_from_files(trains_list)
-        print('Loading Test DataSet...')
+        tf.logging.info('Loading Test DataSet...')
         test_feeder = utils.DataIterator(mode=RunMode.Test)
         test_feeder.read_sample_from_files(test_list)
 
-    print('Total {} Trains DataSets'.format(train_feeder.size))
-    print('Total {} Test DataSets'.format(test_feeder.size))
+    tf.logging.info('Total {} Trains DataSets'.format(train_feeder.size))
+    tf.logging.info('Total {} Test DataSets'.format(test_feeder.size))
     if test_feeder.size >= train_feeder.size:
         exception("The number of training sets cannot be less than the test set.", )
@@ -97,10 +95,11 @@
     num_batches_per_epoch = int(num_train_samples / BATCH_SIZE)
 
     config = tf.ConfigProto(
-        allow_soft_placement=True,
+        # allow_soft_placement=True,
        log_device_placement=False,
        gpu_options=tf.GPUOptions(
-            # allow_growth=True,  # it will cause fragmentation.
+            allocator_type='BFC',
+            allow_growth=True,  # it will cause fragmentation.
            per_process_gpu_memory_fraction=GPU_USAGE)
    )
    accuracy = 0
@@ -116,14 +115,12 @@
            saver.restore(sess, tf.train.latest_checkpoint(MODEL_PATH))
        except ValueError:
            pass
-
-        print('Start training...')
+        tf.logging.info('Start training...')
 
        while 1:
            shuffle_trains_idx = np.random.permutation(num_train_samples)
-            train_cost = 0
            start_time = time.time()
-            _avg_train_cost = 0
+            last_train_avg_cost = 0
            for cur_batch in range(num_batches_per_epoch):
                batch_time = time.time()
                index_list = [
@@ -131,32 +128,39 @@
                    shuffle_trains_idx[i % num_train_samples] for i in
                    range(cur_batch * BATCH_SIZE, (cur_batch + 1) * BATCH_SIZE)
                ]
                if TRAINS_USE_TFRECORDS:
-                    batch_inputs, batch_seq_len, batch_labels = train_feeder.generate_batch_by_tfrecords(sess)
+                    classified_batch = train_feeder.generate_batch_by_tfrecords(sess)
                else:
-                    batch_inputs, batch_seq_len, batch_labels = train_feeder.generate_batch_by_files(index_list)
-
-                feed = {
-                    model.inputs: batch_inputs,
-                    model.labels: batch_labels,
-                }
-
-                summary_str, batch_cost, step, _ = sess.run(
-                    [model.merged_summary, model.cost, model.global_step, model.train_op],
-                    feed_dict=feed
-                )
-                train_cost += batch_cost * BATCH_SIZE
-                avg_train_cost = train_cost / ((cur_batch + 1) * BATCH_SIZE)
-
-                train_writer.add_summary(summary_str, step)
-
-                if step % 100 == 0 and step != 0:
-                    print('Step: {} Time: {:.3f}, Cost = {:.5f}'.format(step, time.time() - batch_time, avg_train_cost))
-
-                if step % TRAINS_SAVE_STEPS == 0 and step != 0:
-                    saver.save(sess, SAVE_MODEL, global_step=step)
-                    logger.info('save checkpoint at step {0}', format(step))
+                    classified_batch = train_feeder.generate_batch_by_files(index_list)
+                step = 0
+                class_num = len(classified_batch)
+                avg_cost = 0
+                for index, (shape, batch) in enumerate(classified_batch.items()):
+                    batch_inputs, batch_seq_len, batch_labels = batch
+                    feed = {
+                        model.inputs: batch_inputs,
+                        model.labels: batch_labels,
+                    }
-                if step % TRAINS_VALIDATION_STEPS == 0 and step != 0:
+                    summary_str, batch_cost, step, _ = sess.run(
+                        [model.merged_summary, model.cost, model.global_step, model.train_op],
+                        feed_dict=feed
+                    )
+                    avg_cost += batch_cost
+                    last_train_avg_cost = avg_cost / class_num
+                    train_writer.add_summary(summary_str, step)
+                    if step % 100 == index and step not in range(class_num):
+                        tf.logging.info('Step: {} Time: {:.3f} sec/batch, Cost = {:.5f}, {}-BatchSize: {}'.format(
+                            step,
+                            time.time() - batch_time,
+                            batch_cost,
+                            shape,
+                            len(batch_inputs)
+                        ))
+                    if step % TRAINS_SAVE_STEPS == index and index == (class_num - 1) and step not in range(class_num):
+                        saver.save(sess, SAVE_MODEL, global_step=step)
+                        # tf.logging.info('save checkpoint at step {0}'.format(step))
+
+                if step % TRAINS_VALIDATION_STEPS == (class_num - 1) and step not in range(class_num):
                    shuffle_test_idx = np.random.permutation(num_test_samples)
                    batch_time = time.time()
                    index_test = [
@@ -164,34 +168,44 @@
                        range(cur_batch * TEST_BATCH_SIZE, (cur_batch + 1) * TEST_BATCH_SIZE)
                    ]
                    if TRAINS_USE_TFRECORDS:
-                        test_inputs, batch_seq_len, test_labels = test_feeder.generate_batch_by_tfrecords(sess)
+                        classified_batch = test_feeder.generate_batch_by_tfrecords(sess)
                    else:
-                        test_inputs, batch_seq_len, test_labels = test_feeder.generate_batch_by_files(index_test)
-
-                    val_feed = {
-                        model.inputs: test_inputs,
-                        model.labels: test_labels
-                    }
-                    dense_decoded, lr = sess.run(
-                        [model.dense_decoded, model.lrn_rate],
-                        feed_dict=val_feed
-                    )
+                        classified_batch = test_feeder.generate_batch_by_files(index_test)
+
+                    all_dense_decoded = []
+                    lr = 0
+
+                    for index, (shape, batch) in enumerate(classified_batch.items()):
+                        test_inputs, batch_seq_len, test_labels = batch
+                        val_feed = {
+                            model.inputs: test_inputs,
+                            model.labels: test_labels
+                        }
+                        dense_decoded, sub_lr = sess.run(
+                            [model.dense_decoded, model.lrn_rate],
+                            feed_dict=val_feed
+                        )
+                        all_dense_decoded += dense_decoded.tolist()
+                        lr += sub_lr
 
                    accuracy = utils.accuracy_calculation(
-                        test_feeder.labels(None if TRAINS_USE_TFRECORDS else index_test),
-                        dense_decoded,
+                        test_feeder.labels,
+                        all_dense_decoded,
                        ignore_value=[0, -1],
                    )
                    log = "Epoch: {}, Step: {}, Accuracy = {:.4f}, Cost = {:.5f}, " \
-                          "Time = {:.3f}, LearningRate: {}"
-                    print(log.format(
-                        epoch_count, step, accuracy, avg_train_cost, time.time() - batch_time, lr
+                          "Time = {:.3f} sec/batch, LearningRate: {}"
+                    tf.logging.info(log.format(
+                        epoch_count,
+                        step,
+                        accuracy,
+                        last_train_avg_cost, time.time() - batch_time, lr / len(classified_batch)
                    ))
-                    _avg_train_cost = avg_train_cost
-                    if accuracy >= TRAINS_END_ACC and epoch_count >= TRAINS_END_EPOCHS and avg_train_cost <= TRAINS_END_COST:
+
+                    if accuracy >= TRAINS_END_ACC and epoch_count >= TRAINS_END_EPOCHS and last_train_avg_cost <= TRAINS_END_COST:
                        break
-            if accuracy >= TRAINS_END_ACC and epoch_count >= TRAINS_END_EPOCHS and _avg_train_cost <= TRAINS_END_COST:
+            if accuracy >= TRAINS_END_ACC and epoch_count >= TRAINS_END_EPOCHS and last_train_avg_cost <= TRAINS_END_COST:
                compile_graph(accuracy)
-                print('Total Time: {}'.format(time.time() - start_time))
+                tf.logging.info('Total Time: {} sec.'.format(time.time() - start_time))
                break
            epoch_count += 1
 
@@ -207,7 +221,7 @@ def generate_config(acc):
 
 def main(_):
    init()
    train_process()
-    print('Training completed.')
+    tf.logging.info('Training completed.')
    pass
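`compile_graph` above freezes the trained session into a single `.pb`: `convert_variables_to_constants` bakes the current variable values into the GraphDef, which is then serialized. A minimal sketch of that step, assuming `dense_decoded` as the output node name (it is the tensor name used for prediction elsewhere in this project) and an illustrative output path:

```python
import tensorflow as tf
from tensorflow.python.framework.graph_util import convert_variables_to_constants

def freeze(sess: tf.Session, output_path: str):
    """Bake variables into constants and write a frozen GraphDef to disk."""
    frozen = convert_variables_to_constants(
        sess,
        sess.graph.as_graph_def(),
        output_node_names=['dense_decoded']
    )
    with tf.gfile.GFile(output_path, mode='wb') as gf:
        gf.write(frozen.SerializeToString())

# freeze(sess, 'model/YourModelName_9500.pb')  # illustrative path
```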
diff --git a/tutorial.py b/tutorial.py
index ac8ae95..fe74166 100644
--- a/tutorial.py
+++ b/tutorial.py
@@ -3,29 +3,44 @@
 # Author: kerlomz
 import os
 import json
-import PIL.Image as pil_image
+import PIL.Image as pilImage
+from constants import *
 
+# - [ALPHANUMERIC, ALPHANUMERIC_LOWER, ALPHANUMERIC_UPPER,
+# -- NUMERIC, ALPHABET_LOWER, ALPHABET_UPPER, ALPHABET, ALPHANUMERIC_LOWER_MIX_CHINESE_3500]
+charset = SimpleCharset.ALPHANUMERIC_LOWER
 
-class RecurrentNetwork:
-    LSTM = 'LSTM'
-    BLSTM = 'BLSTM'
-    SRU = 'SRU'
-    BSRU = 'BSRU'
-
+cnn_network = CNNNetwork.CNN5
+recurrent_network = RecurrentNetwork.BLSTM
+optimizer = Optimizer.AdaBound
 
-charset = "ALPHANUMERIC_LOWER"
-network = RecurrentNetwork.BLSTM
 trains_path = [
     r"D:\TrainSet\***",
 ]
+test_num = 500
+hidden_num = 64
+beam_width = 1
+learning_rate = None
+
+name_prefix = None
+name_suffix = None
+name_prefix = name_prefix if name_prefix else "tutorial"
+name_suffix = '-' + str(name_suffix) if name_suffix else ''
+
 
 model = """
+# - requirement.txt  -  GPU: tensorflow-gpu, CPU: tensorflow
+# - If you use the GPU version, you need to install some additional applications.
+System:
+  DeviceUsage: 0.7
+
 # ModelName: Corresponding to the model file in the model directory,
 # - such as YourModelName.pb, fill in YourModelName here.
 # CharSet: Provides a default optional built-in solution:
 # - [ALPHANUMERIC, ALPHANUMERIC_LOWER, ALPHANUMERIC_UPPER,
 # -- NUMERIC, ALPHABET_LOWER, ALPHABET_UPPER, ALPHABET, ALPHANUMERIC_LOWER_MIX_CHINESE_3500]
 # - Or you can use your own customized character set like: ['a', '1', '2'].
+# CharMaxLength: Maximum length of characters, used for label padding.
 # CharExclude: CharExclude should be a list, like: ['a', '1', '2']
 # - which is convenient for users to freely combine character sets.
 # - If you don't want to manually define the character set manually,
@@ -47,46 +62,142 @@
 # Blur: [-1: Off, >0: On].
 # Resize: [WIDTH, HEIGHT]
 # - If the image size is too small, the training effect will be poor and you need to zoom in.
+# ReplaceTransparent: [True, False]
+# - True: Convert transparent images in RGBA format to opaque RGB format,
+# - False: Keep the original image
 Pretreatment:
   Binaryzation: -1
   Smoothing: -1
   Blur: -1
   Resize: @resize
+  ReplaceTransparent: True
 
-Trains:
-#  TrainsPath: './dataset/@model_name_trains.tfrecords'
-#  TestPath: './dataset/@model_name_test.tfrecords'
-  TrainsPath: @trains_path
+# CNNNetwork: [CNN5, ResNet, DenseNet]
+# RecurrentNetwork: [BLSTM, LSTM, SRU, BSRU, GRU]
+# - The recommended configuration is CNN5+BLSTM / ResNet+BLSTM
+# HiddenNum: [64, 128, 256]
+# - This parameter indicates the number of nodes used to remember and store past states.
+# Optimizer: Loss function algorithm for calculating gradient.
+# - [AdaBound, Adam, Momentum, SGD, AdaGrad, RMSProp]
+NeuralNet:
+  CNNNetwork: @cnn_network
+  RecurrentNetwork: @recurrent_network
+  HiddenNum: @hidden_num
+  KeepProb: 0.98
+  Optimizer: @optimizer
+  PreprocessCollapseRepeated: False
+  CTCMergeRepeated: True
+  CTCBeamWidth: @beam_width
+  CTCTopPaths: 1
+  WarpCTC: False
 
+# TrainsPath and TestPath: The local absolute path of your training and testing set.
+# DatasetPath: Package a sample of the TFRecords format from this path.
+# TrainRegex and TestRegex: Default matching apple_20181010121212.jpg file.
+# - The Default is .*?(?=_.*\.)
+# TestSetNum: This is an optional parameter that is used when you want to extract some of the test set
+# - from the training set when you are not preparing the test set separately.
+# SavedSteps: A Session.run() execution is called a Step,
+# - Used to save training progress, Default value is 100.
+# ValidationSteps: Used to calculate accuracy, Default value is 500.
+# TestSetNum: The number of test sets, if an automatic allocation strategy is used (TestPath not set).
+# EndAcc: Finish the training when the accuracy reaches [EndAcc*100]% and other conditions.
+# EndCost: Finish the training when the cost reaches EndCost and other conditions.
+# EndEpochs: Finish the training when the epoch is greater than the defined epoch and other conditions.
+# BatchSize: Number of samples selected for one training step.
+# TestBatchSize: Number of samples selected for one validation step.
+# LearningRate: Recommended value[0.01: MomentumOptimizer/AdamOptimizer, 0.001: AdaBoundOptimizer]
+Trains:
+  TrainsPath: './dataset/@model_name_trains.tfrecords'
+  TestPath: './dataset/@model_name_test.tfrecords'
+  DatasetPath: @trains_path
+  TrainRegex: '.*?(?=_)'
+  TestSetNum: @test_num
+  SavedSteps: 100
+  ValidationSteps: 500
+  EndAcc: 0.95
+  EndCost: 0.1
+  EndEpochs: 2
+  BatchSize: 128
+  TestBatchSize: 300
+  LearningRate: @learning_rate
+  DecayRate: 0.98
+  DecaySteps: 10000
 """
 
-# - [ALPHANUMERIC, ALPHANUMERIC_LOWER, ALPHANUMERIC_UPPER,
-# -- NUMERIC, ALPHABET_LOWER, ALPHABET_UPPER, ALPHABET, ALPHANUMERIC_LOWER_MIX_CHINESE_3500]
-
 trains_path = [i.replace("\\", "/") for i in trains_path]
 file_name = os.listdir(trains_path[0])[0]
-size = pil_image.open(os.path.join(trains_path[0], file_name)).size
+size = pilImage.open(os.path.join(trains_path[0], file_name)).size
 width = size[0]
 height = size[1]
+
 size_str = "{}x{}".format(width, height)
-if width > 180 or width < 120:
+if width > 160 or width < 120:
     r_height = int(height * 150 / width)
 else:
     r_height = height
 resize = "[{}, {}]".format(width if r_height == height else 150, r_height)
-model_name = 'sell-mix-CNN5{}-{}'.format(network, size_str)
-trains_path = json.dumps(trains_path, ensure_ascii=False).replace("]", " ]")
-result = model.replace("@trains_path", trains_path).replace("@model_name", model_name).replace("@resize", resize).replace("@size_str", size_str).replace("@width", str(width)).replace("@height", str(height)).replace("@charset", charset)
+
+model_name = '{}-mix-{}{}-{}-H{}{}'.format(
+    name_prefix,
+    cnn_network.value,
+    recurrent_network.value,
+    size_str,
+    hidden_num,
+    name_suffix
+)
+trains_path = json.dumps(trains_path, ensure_ascii=False, indent=2).replace('\n', '\n  ')
+
+BEST_LEARNING_RATE = {
+    Optimizer.AdaBound: 0.001,
+    Optimizer.Momentum: 0.01,
+    Optimizer.Adam: 0.01,
+    Optimizer.SGD: 0.01,
+    Optimizer.RMSProp: 0.01,
+    Optimizer.AdaGrad: 0.01,
+}
+
+learning_rate = BEST_LEARNING_RATE[optimizer] if not learning_rate else learning_rate
+
+
+result = model.replace(
+    "@trains_path", trains_path
+).replace(
+    "@model_name", model_name
+).replace(
+    "@resize", resize
+).replace(
+    "@size_str", size_str
+).replace(
+    "@width", str(width)
+).replace(
+    "@height", str(height)
+).replace(
+    "@charset", str(charset.value) if isinstance(charset, SimpleCharset) else str(charset)
+).replace(
+    "@test_num", str(test_num)
+).replace(
+    "@optimizer", str(optimizer.value)
+).replace(
+    "@hidden_num", str(hidden_num)
+).replace(
+    "@cnn_network", str(cnn_network.value)
+).replace(
+    "@recurrent_network", str(recurrent_network.value)
+).replace(
+    "@beam_width", str(beam_width)
+).replace(
+    "@learning_rate", str(learning_rate)
+)
 print(result)
+
 with open("model.yaml".format(size_str), "w", encoding="utf8") as f:
     f.write(result)
 
-from make_dataset import run
+from make_dataset import make_dataset
 from trains import main
-run()
-with open("model.yaml".format(size_str), "w") as f:
-    f.write("\n".join(result.split("\n")[:-3]).replace("#  TrainsPath", "  TrainsPath").replace("#  TestPath", "  TestPath"))
+make_dataset()
 main(None)
\ No newline at end of file
diff --git a/utils.py b/utils.py
index 537b075..c1b9f36 100644
--- a/utils.py
+++ b/utils.py
@@ -4,10 +4,12 @@
 import io
 import PIL.Image
 import cv2
+import random
 import numpy as np
 import tensorflow as tf
-
+from tensorflow import keras
 from config import *
+from constants import RunMode
 from pretreatment import preprocessing
 
 PATH_MAP = {
@@ -15,11 +17,6 @@
     RunMode.Test: TEST_PATH
 }
 
-REGEX_MAP = {
-    RunMode.Trains: TRAINS_REGEX,
-    RunMode.Test: TEST_REGEX
-}
-
 
 def encode_maps():
     return {char: i for i, char in enumerate(GEN_CHAR_SET, 0)}
@@ -38,7 +35,10 @@ def __init__(self, mode: RunMode):
         self.next_element = None
         self.image_path = []
         self.label_list = []
+        self._label_list = []
         self._size = 0
+        self.max_length = 0
+        self.is_first = True
 
     @staticmethod
     def _encoder(code):
@@ -49,10 +49,10 @@ def _encoder(code):
             if not k or not v:
                 break
             code.replace(k, v)
-        code = code.lower() if 'LOWER' in CHAR_SET or not CASE_SENSITIVE else code
+        code = code.lower() if 'LOWER' in CHAR_SET else code
         code = code.upper() if 'UPPER' in CHAR_SET else code
         try:
-            return [SPACE_INDEX if code == SPACE_TOKEN else encode_maps()[c] for c in list(code)]
+            return [encode_maps()[c] for c in list(code)]
         except KeyError as e:
             exception(
                 'The sample label {} contains invalid charset: {}.'.format(
@@ -64,16 +64,16 @@ def read_sample_from_files(self, data_set=None):
         if data_set:
             self.image_path = data_set
             try:
-                self.label_list = [
-                    self._encoder(re.search(REGEX_MAP[self.mode], i.split(PATH_SPLIT)[-1]).group()) for i in data_set
+                self._label_list = [
+                    self._encoder(re.search(TRAINS_REGEX, i.split(PATH_SPLIT)[-1]).group()) for i in data_set
                 ]
             except AttributeError as e:
                 regex_not_found = "group" in e.args[0]
                 if regex_not_found:
                     exception(
                         "Configured {} is '{}', it may be wrong and unable to get label properly.".format(
-                            "TrainRegex" if self.mode == RunMode.Trains else "TestRegex",
-                            TRAINS_REGEX if self.mode == RunMode.Trains else TEST_REGEX
+                            "TrainRegex",
+                            TRAINS_REGEX
                         ),
                         ConfigException.GET_LABEL_REGEX_ERROR
                     )
@@ -86,13 +86,13 @@
                 self.image_path.append(image_name)
                 # Get the label from the file name based on the regular expression.
                 code = re.search(
-                    REGEX_MAP[self.mode], image_name.split(PATH_SPLIT)[-1]
+                    TRAINS_REGEX, image_name.split(PATH_SPLIT)[-1]
                 )
                 if not code:
                     exception(
                         "Configured {} is '{}', it may be wrong and unable to get label properly.".format(
-                            "TrainRegex" if self.mode == RunMode.Trains else "TestRegex",
-                            TRAINS_REGEX if self.mode == RunMode.Trains else TEST_REGEX
+                            "TrainRegex",
+                            TRAINS_REGEX
                         ),
                         ConfigException.GET_LABEL_REGEX_ERROR
                     )
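The tutorial renders `model.yaml` by chaining `str.replace` over `@placeholder` tokens. A compact alternative sketch (not the project's code) keeps the substitutions in one dict and applies them in a loop:

```python
# Self-contained illustration with dummy values; the real tutorial substitutes
# many more placeholders (@charset, @learning_rate, ...).
template = "ModelName: @model_name\nResize: @resize\n"
substitutions = {
    "@model_name": "tutorial-mix-CNN5BLSTM-150x50-H64",
    "@resize": "[150, 50]",
}

result = template
for placeholder, value in substitutions.items():
    result = result.replace(placeholder, value)
print(result)
```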
@@ -100,8 +100,8 @@
             # The manual verification code platform is not case sensitive,
             # - it will affect the accuracy of the training set.
             # Here is a case conversion based on the selected character set.
-            self.label_list.append(self._encoder(code))
-        self._size = len(self.label_list)
+            self._label_list.append(self._encoder(code))
+        self._size = len(self._label_list)
 
     @staticmethod
     def parse_example(serial_example):
@@ -124,8 +124,13 @@ def read_sample_from_tfrecords(self, path):
         min_after_dequeue = 1000
         batch = BATCH_SIZE if self.mode == RunMode.Trains else TEST_BATCH_SIZE
 
-        dataset_train = tf.data.TFRecordDataset(path).map(self.parse_example)
-        dataset_train = dataset_train.shuffle(min_after_dequeue).batch(batch).repeat()
+        dataset_train = tf.data.TFRecordDataset(
+            filenames=path,
+            # num_parallel_reads=20
+        ).map(self.parse_example)
+        dataset_train = dataset_train.shuffle(
+            min_after_dequeue
+        ).batch(batch).repeat()
         iterator = dataset_train.make_one_shot_iterator()
         self.next_element = iterator.get_next()
 
@@ -133,14 +138,12 @@
     def size(self):
         return self._size
 
-    def labels(self, index):
-        if (TRAINS_USE_TFRECORDS and self.mode == RunMode.Trains) or (TEST_USE_TFRECORDS and self.mode == RunMode.Test):
-            return self.label_list
-        else:
-            return [self.label_list[i] for i in index]
+    @property
+    def labels(self):
+        return self.label_list
 
     @staticmethod
-    def _image(path_or_bytes):
+    def _image(path_or_bytes, is_random=False):
 
         # im = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
         # The OpenCV cannot handle gif format images, it will return None.
@@ -160,7 +163,16 @@
         im = np.array(pil_image)
         im = preprocessing(im, BINARYZATION, SMOOTH, BLUR).astype(np.float32)
-        im = cv2.resize(im, (RESIZE[0], RESIZE[1]))
+
+        if RESIZE[0] == -1:
+            random_ratio = random.choice([2.5, 3, 3.5, 3.2, 2.7, 2.75])
+            ratio = RESIZE[1] / size[1]
+            random_width = int(random_ratio * RESIZE[1])
+            resize_width = int(ratio * size[0])
+            resize_width = random_width if is_random else resize_width
+            im = cv2.resize(im, (resize_width, RESIZE[1]))
+        else:
+            im = cv2.resize(im, (RESIZE[0], RESIZE[1]))
         im = im.swapaxes(0, 1)
         return np.array((im[:, :, np.newaxis] if IMAGE_CHANNEL == 1 else im[:, :]) / 255.)
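When widths vary (`RESIZE[0] == -1`), batches are made rectangular by zero-padding along the width axis with `keras.preprocessing.sequence.pad_sequences`, as the batching methods below do. A small demonstration with dummy images of shape (width, height, channel), the layout `_image` returns after its axis swap:

```python
import numpy as np
from tensorflow import keras

# Three images with the same height/channel but different widths.
images = [np.ones((w, 50, 1), dtype='float32') for w in (120, 150, 180)]
batch = keras.preprocessing.sequence.pad_sequences(
    sequences=images,
    maxlen=None,        # pad to the widest image in the batch
    dtype='float32',
    padding='post',     # zeros are appended after the real pixels
    truncating='post',
    value=0
)
print(batch.shape)  # -> (3, 180, 50, 1)
```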
@@ -169,64 +181,190 @@ def _get_input_lens(sequences):
         lengths = np.asarray([len(_) for _ in sequences], dtype=np.int64)
         return sequences, lengths
 
-    def generate_batch_by_files(self, index=None):
-        if index:
-            image_batch = [self._image(self.image_path[i]) for i in index]
-            label_batch = [self.label_list[i] for i in index]
+    def generate_batch_by_files(self, image_index=None):
+        batch = {}
+        image_batch = []
+        label_batch = []
+
+        if image_index:
+            # if len(image_index) == TEST_BATCH_SIZE:
+            #     ii = image_index[0]
+            #     ll = self._label_list[ii]
+            #     ll = "".join([GEN_CHAR_SET[_] for _ in ll])
+            #     import shutil
+            #     shutil.copy(self.image_path[ii], "image/{}.png".format(ll))
+            for i, index in enumerate(image_index):
+                try:
+                    is_training = len(image_index) == BATCH_SIZE
+                    is_random = bool(random.getrandbits(1))
+
+                    image_array = self._image(self.image_path[index], is_random=is_training and is_random)
+                    label_array = self._label_list[index]
+                    if MULTI_SHAPE:
+                        image_shape = "{}x{}".format(image_array.shape[0], image_array.shape[1])
+                        if image_shape in batch:
+                            batch[image_shape].append((image_array, label_array))
+                        else:
+                            batch[image_shape] = [(image_array, label_array)]
+                    else:
+                        image_batch.append(image_array)
+                        label_batch.append(label_array)
+                except OSError:
+                    continue
+        # else:
+        #     for i, path in enumerate(self.image_path):
+        #         try:
+        #             if i == 0:
+        #                 import shutil
+        #                 print('----')
+        #
+        #                 shutil.copy(self.image_path[path], "{}.png".format(self._label_list[path]))
+        #             is_random = bool(random.getrandbits(1))
+        #             image_array = self._image(self.image_path[path], is_random=is_random)
+        #             label_array = self._label_list[path]
+        #             if MULTI_SHAPE:
+        #                 image_shape = "{}x{}".format(image_array.shape[0], image_array.shape[1])
+        #                 if image_shape in batch:
+        #                     batch[image_shape].append((image_array, label_array))
+        #                 else:
+        #                     batch[image_shape] = [(image_array, label_array)]
+        #             else:
+        #                 image_batch.append(image_array)
+        #                 label_batch.append(label_array)
+        #         except OSError:
+        #             continue
+
+        if MULTI_SHAPE:
+            self.label_list = sum([v for k, v in batch.items()], [])
+            self.label_list = [i[1] for i in self.label_list]
+            return self.classified_generate_batch(batch)
         else:
-            image_batch = [self._image(i) for i in self.image_path]
-            label_batch = self.label_list
-        return self._generate_batch(image_batch, label_batch)
-
-    def _generate_batch(self, image_batch, label_batch):
+            if RESIZE[0] == -1:
+                image_batch = keras.preprocessing.sequence.pad_sequences(
+                    sequences=image_batch,
+                    maxlen=None,
+                    dtype='float32',
+                    padding='post',
+                    truncating='post',
+                    value=0
+                )
+                # image_batch = self.padding(image_batch)
+            self.label_list = label_batch
+            return self.padded_generate_batch(image_batch, label_batch)
+
+    def padded_generate_batch(self, image_batch, label_batch):
+        classified_batch = {}
         batch_inputs, batch_seq_len = self._get_input_lens(np.array(image_batch))
         batch_labels = sparse_tuple_from_label(label_batch)
-        self._label_batch = batch_labels
-        return batch_inputs, batch_seq_len, batch_labels
+        classified_batch['{}x{}'.format(RESIZE[0], RESIZE[1])] = [batch_inputs, batch_seq_len, batch_labels]
+        return classified_batch
+
+    def classified_generate_batch(self, batch):
+        classified_batch = {}
+        for shape, v in batch.items():
+            batch_inputs, batch_seq_len = self._get_input_lens(np.array([i[0] for i in v]))
+            batch_labels = sparse_tuple_from_label([i[1] for i in v])
+            if shape in classified_batch:
+                classified_batch[shape].append([batch_inputs, batch_seq_len, batch_labels])
+            else:
+                classified_batch[shape] = [batch_inputs, batch_seq_len, batch_labels]
+        return classified_batch
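`classified_generate_batch` relies on samples having been grouped by their `WxH` shape key so each bucket stacks into a dense tensor without cross-shape padding. A sketch of that bucketing with `collections.defaultdict`:

```python
from collections import defaultdict
import numpy as np

def bucket_by_shape(samples):
    """Group (image, label) pairs by the image's 'WxH' shape key."""
    buckets = defaultdict(list)
    for image, label in samples:
        key = "{}x{}".format(image.shape[0], image.shape[1])
        buckets[key].append((image, label))
    return buckets

samples = [(np.zeros((120, 50)), [1, 2]), (np.zeros((150, 50)), [3])]
print({k: len(v) for k, v in bucket_by_shape(samples).items()})
# -> {'120x50': 1, '150x50': 1}
```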
+
+    @staticmethod
+    def padding(image_batch):
+
+        max_width = max([np.shape(_)[0] for _ in image_batch])
+        padded_image_batch = []
+        for image in image_batch:
+            output_img = np.zeros([max_width, RESIZE[1], IMAGE_CHANNEL])
+            output_img[0: np.shape(image)[0]] = image
+            padded_image_batch.append(output_img)
+        return padded_image_batch
 
     def generate_batch_by_tfrecords(self, sess):
+
         _image, _label = sess.run(self.next_element)
+        batch = {}
+        image_batch = []
+        label_batch = []
 
-        image_batch, label_batch = [], []
-        for (i1, i2) in zip(_image, _label):
+        for index, (i1, i2) in enumerate(zip(_image, _label)):
             try:
-                image_batch.append(self._image(i1))
-                label_batch.append(self._encoder(i2))
+                is_random = bool(random.getrandbits(1))
+                random_and_training = is_random and self.mode == RunMode.Trains
+                image_array = self._image(i1, is_random=random_and_training)
+                label_array = self._encoder(i2)
+                if MULTI_SHAPE:
+                    image_shape = "{}x{}".format(image_array.shape[0], image_array.shape[1])
+                    if image_shape in batch:
+                        batch[image_shape].append((image_array, label_array))
+                    else:
+                        batch[image_shape] = [(image_array, label_array)]
+                else:
+                    image_batch.append(image_array)
+                    label_batch.append(label_array)
             except OSError:
                 continue
-        self.label_list = label_batch
-        return self._generate_batch(image_batch, label_batch)
+
+        if MULTI_SHAPE:
+            self.label_list = sum([v for k, v in batch.items()], [])
+            self.label_list = [i[1] for i in self.label_list]
+            return self.classified_generate_batch(batch)
+        else:
+            if RESIZE[0] == -1:
+                # image_batch = self.padding(image_batch)
+                image_batch = keras.preprocessing.sequence.pad_sequences(
+                    sequences=image_batch,
+                    maxlen=None,
+                    dtype='float32',
+                    padding='post',
+                    truncating='post',
+                    value=0
+                )
+            self.label_list = label_batch
+            return self.padded_generate_batch(image_batch, label_batch)
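The validation metric that follows, `accuracy_calculation`, is exact-match at the sequence level: a sample only counts as correct if the whole decoded label equals the ground truth after dropping blank/padding ids. A condensed sketch:

```python
def sequence_accuracy(originals, decoded, ignore=(0, -1)):
    """Fraction of samples whose full decoded label matches exactly."""
    hits = sum(
        1 for o, d in zip(originals, decoded)
        if o == [j for j in d if j not in ignore]
    )
    return hits / len(originals)

print(sequence_accuracy([[1, 2]], [[1, 0, 2, -1]]))  # -> 1.0
```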
"decode": "".join([GEN_CHAR_SET[_] for _ in decoded_label]) + }) + tf.logging.error(error_sample) return count * 1.0 / len(original_seq) @@ -241,4 +379,3 @@ def sparse_tuple_from_label(sequences, dtype=np.int32): values = np.asarray(values, dtype=dtype) shape = np.asarray([len(sequences), np.asarray(indices).max(0)[1] + 1], dtype=np.int64) return indices, values, shape -