-
Couldn't load subscription status.
- Fork 60
Add RecResizeNormImg in Rec Transform to manage padding and norm in resize, add yaml of crnn for server version [WIP] #428
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 8 commits
85a3394
575e050
2b8eb1d
a691bd8
84bd673
dfe5313
95c5b52
fbf174f
3c9ac0c
57da9e2
804af55
81cd5ca
e1aca38
4b0d840
4f87096
ba0ea01
a1ff7fd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,150 @@ | ||
| system: | ||
| mode: 0 # 0 for graph mode, 1 for pynative mode in MindSpore | ||
| distribute: True | ||
| amp_level: 'O3' | ||
| seed: 42 | ||
| log_interval: 100 | ||
| val_while_train: True | ||
| drop_overflow_update: False | ||
|
|
||
| common: | ||
| character_dict_path: &character_dict_path mindocr/utils/dict/en_dict.txt | ||
| num_classes: &num_classes 96 # num_chars_in_dict+1, TODO: retrieve it from dict or check correctness | ||
| max_text_len: &max_text_len 24 | ||
| infer_mode: &infer_mode False | ||
| use_space_char: &use_space_char True | ||
| lower: &lower False | ||
| batch_size: &batch_size 64 | ||
|
|
||
| model: | ||
| type: rec | ||
| transform: null | ||
| backbone: | ||
| name: rec_resnet34 | ||
| pretrained: False | ||
| neck: | ||
| name: RNNEncoder | ||
| hidden_size: 256 | ||
| head: | ||
| name: CTCHead | ||
| weight_init: crnn_customised | ||
| bias_init: crnn_customised | ||
| out_channels: *num_classes | ||
|
|
||
| postprocess: | ||
| name: RecCTCLabelDecode | ||
| character_dict_path: *character_dict_path | ||
| use_space_char: *use_space_char | ||
|
|
||
| metric: | ||
| name: RecMetric | ||
| main_indicator: acc | ||
| character_dict_path: *character_dict_path | ||
| ignore_space: True | ||
| print_flag: False | ||
|
|
||
| loss: | ||
| name: CTCLoss | ||
| pred_seq_len: 25 # TODO: retrieve from the network output shape. | ||
| max_label_len: *max_text_len # this value should be smaller than pred_seq_len | ||
| batch_size: *batch_size | ||
|
|
||
| scheduler: | ||
| scheduler: warmup_cosine_decay | ||
| min_lr: 0.000001 | ||
| lr: 0.001 | ||
| num_epochs: 30 | ||
| warmup_epochs: 2 | ||
| decay_epochs: 28 | ||
|
|
||
| optimizer: | ||
| opt: adamw | ||
| filter_bias_and_bn: True | ||
| momentum: 0.95 | ||
| weight_decay: 0.0001 | ||
| nesterov: False | ||
|
|
||
| loss_scaler: | ||
| type: dynamic | ||
| loss_scale: 512 | ||
| scale_factor: 2.0 | ||
| scale_window: 1000 | ||
|
|
||
| train: | ||
| ckpt_save_dir: './crnn_resnet34_server' | ||
| pred_cast_fp32: False # let CTCLoss cast internally | ||
| ema: True # added | ||
| dataset_sink_mode: False | ||
| dataset: | ||
| type: LMDBDataset | ||
| dataset_root: /path/to/data_lmdb_release/ | ||
| data_dir: training/ | ||
| # label_file: # not required when using LMDBDataset | ||
| sample_ratio: 1.0 | ||
| shuffle: True | ||
| transform_pipeline: | ||
| - DecodeImage: | ||
| img_mode: RGB # changed | ||
| to_float32: False | ||
| - RecCTCLabelEncode: | ||
| max_text_len: *max_text_len | ||
| character_dict_path: *character_dict_path | ||
| use_space_char: *use_space_char | ||
| lower: *lower | ||
| - RecResizeNormImg: | ||
| image_shape: [32, 100] # H, W | ||
| infer_mode: *infer_mode | ||
| character_dict_path: *character_dict_path | ||
| padding: True # aspect ratio will be preserved if true. changed | ||
| norm_before_pad: True # changed | ||
| - ToCHWImage: | ||
| # the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visualize | ||
| output_columns: ['image', 'text_seq'] #, 'length'] #'img_path'] | ||
| net_input_column_index: [0] # input indices for network forward func in output_columns | ||
| label_column_index: [1] # input indices marked as label | ||
| #keys_for_loss: 4 # num labels for loss func | ||
|
|
||
| loader: | ||
| shuffle: True | ||
| batch_size: *batch_size | ||
| drop_remainder: True | ||
| max_rowsize: 12 | ||
| num_workers: 8 | ||
|
|
||
| eval: | ||
| ckpt_load_path: ./crnn_resnet34_server/best.ckpt | ||
| dataset_sink_mode: False | ||
| dataset: | ||
| type: LMDBDataset | ||
| dataset_root: /path/to/data_lmdb_release/ | ||
| data_dir: validation/ | ||
| # label_file: # not required when using LMDBDataset | ||
| sample_ratio: 1.0 | ||
| shuffle: False | ||
| transform_pipeline: | ||
| - DecodeImage: | ||
| img_mode: RGB # changed | ||
| to_float32: False | ||
| - RecCTCLabelEncode: | ||
| max_text_len: *max_text_len | ||
| character_dict_path: *character_dict_path | ||
| use_space_char: *use_space_char | ||
| lower: *lower | ||
| - RecResizeNormImg: | ||
| image_shape: [32, 100] # H, W | ||
| infer_mode: *infer_mode | ||
| character_dict_path: *character_dict_path | ||
| padding: True # aspect ratio will be preserved if true. changed | ||
| norm_before_pad: True # changed | ||
| - ToCHWImage: | ||
| # the order of the dataloader list, matching the network input and the input labels for the loss function, and optional data for debug/visualize | ||
| output_columns: ['image', 'text_padded', 'text_length'] # TODO return text string padding w/ fixed length, and a scalar to indicate the length | ||
| net_input_column_index: [0] # input indices for network forward func in output_columns | ||
| label_column_index: [1, 2] # input indices marked as label | ||
|
|
||
| loader: | ||
| shuffle: False # TODO: tbc | ||
| batch_size: 64 | ||
| drop_remainder: False | ||
| max_rowsize: 12 | ||
| num_workers: 8 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,6 +11,7 @@ | |
| "RecCTCLabelEncode", | ||
| "RecAttnLabelEncode", | ||
| "RecResizeImg", | ||
| "RecResizeNormImg", | ||
| "RecResizeNormForInfer", | ||
| "SVTRRecResizeImg", | ||
| "Rotate90IfVertical", | ||
|
|
@@ -247,7 +248,13 @@ def str2idx(text: str, label_dict: Dict[str, int], max_text_len: int = 23, lower | |
|
|
||
|
|
||
| # TODO: reorganize the code for different resize transformation in rec task | ||
| def resize_norm_img(img, image_shape, padding=True, interpolation=cv2.INTER_LINEAR): | ||
| def resize_norm_img(img, | ||
| image_shape, | ||
| padding=True, | ||
| norm_before_pad=False, | ||
| mean=[127.0, 127.0, 127.0], | ||
| std=[127.0, 127.0, 127.0], | ||
| interpolation=cv2.INTER_LINEAR): | ||
| """ | ||
| resize image | ||
| Args: | ||
|
|
@@ -261,7 +268,8 @@ def resize_norm_img(img, image_shape, padding=True, interpolation=cv2.INTER_LINE | |
| w = img.shape[1] | ||
| c = img.shape[2] | ||
| if not padding: | ||
| resized_image = cv2.resize(img, (imgW, imgH), interpolation=interpolation) | ||
| resized_image = cv2.resize( | ||
| img, (imgW, imgH), interpolation=interpolation) | ||
| resized_w = imgW | ||
| else: | ||
| ratio = w / float(h) | ||
|
|
@@ -271,81 +279,126 @@ def resize_norm_img(img, image_shape, padding=True, interpolation=cv2.INTER_LINE | |
| resized_w = int(math.ceil(imgH * ratio)) | ||
| resized_image = cv2.resize(img, (resized_w, imgH)) | ||
|
|
||
| """ | ||
| resized_image = resized_image.astype('float32') | ||
| if image_shape[0] == 1: | ||
| resized_image = resized_image / 255 | ||
| resized_image = resized_image[np.newaxis, :] | ||
| else: | ||
| resized_image = resized_image.transpose((2, 0, 1)) / 255 | ||
| resized_image -= 0.5 | ||
| resized_image /= 0.5 | ||
| """ | ||
| padding_im = np.zeros((imgH, imgW, c), dtype=np.uint8) | ||
| padding_im[:, 0:resized_w, :] = resized_image | ||
| valid_ratio = min(1.0, float(resized_w / imgW)) | ||
| return padding_im, valid_ratio | ||
|
|
||
| if padding: | ||
| if norm_before_pad: | ||
| resized_image = (resized_image - mean) / std | ||
|
|
||
| padded_img = np.zeros((imgH, imgW, c), dtype=resized_image.dtype) | ||
| padded_img[:, 0:resized_w, :] = resized_image | ||
|
|
||
| if not norm_before_pad: | ||
| padded_img = (padded_img - mean) / std | ||
|
|
||
| return padded_img, valid_ratio | ||
| else: | ||
| resized_image = (resized_image - mean) / std | ||
| return resized_image, valid_ratio | ||
|
|
||
|
|
||
| # TODO: check diff from resize_norm_img | ||
| def resize_norm_img_chinese(img, image_shape): | ||
| """adopted from paddle""" | ||
| def resize_norm_img_chinese(img, | ||
| image_shape, | ||
| norm_before_pad=False, | ||
| mean=[127.0, 127.0, 127.0], | ||
| std=[127.0, 127.0, 127.0], | ||
| interpolation=cv2.INTER_LINEAR): | ||
| ''' | ||
| resize image with aspect-ratio keeping and padding | ||
| Args: | ||
| img: shape (H, W, C) | ||
| image_shape: image shape after resize, in (H, W) | ||
|
|
||
| ''' | ||
| imgH, imgW = image_shape | ||
| # todo: change to 0 and modified image shape | ||
| max_wh_ratio = imgW * 1.0 / imgH | ||
| h, w = img.shape[0], img.shape[1] | ||
| c = img.shape[2] | ||
| ratio = w * 1.0 / h | ||
|
|
||
| max_wh_ratio = min(max(max_wh_ratio, ratio), max_wh_ratio) | ||
| imgW = int(imgH * max_wh_ratio) | ||
| if math.ceil(imgH * ratio) > imgW: | ||
| resized_w = imgW | ||
| else: | ||
| resized_w = int(math.ceil(imgH * ratio)) | ||
| resized_image = cv2.resize(img, (resized_w, imgH)) | ||
|
|
||
| """ | ||
| resized_image = resized_image.astype('float32') | ||
| if image_shape[0] == 1: | ||
| resized_image = resized_image / 255 | ||
| resized_image = resized_image[np.newaxis, :] | ||
| else: | ||
| resized_image = resized_image.transpose((2, 0, 1)) / 255 | ||
| resized_image -= 0.5 | ||
| resized_image /= 0.5 | ||
| """ | ||
| # padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32) | ||
| padding_im = np.zeros((imgH, imgW, c), dtype=np.uint8) | ||
| # padding_im[:, :, 0:resized_w] = resized_image | ||
| padding_im[:, 0:resized_w, :] = resized_image | ||
| valid_ratio = min(1.0, float(resized_w / imgW)) | ||
| return padding_im, valid_ratio | ||
|
|
||
| if norm_before_pad: | ||
| resized_image = (resized_image - mean) / std | ||
|
|
||
| # TODO: remove infer_mode and character_dict_path if they are not necessary | ||
| class RecResizeImg(object): | ||
| """adopted from paddle | ||
| resize, convert from hwc to chw, rescale pixel value to -1 to 1 | ||
| """ | ||
| padded_img = np.zeros((imgH, imgW, c), dtype=resized_image.dtype) | ||
| padded_img[:, 0:resized_w, :] = resized_image | ||
|
|
||
| def __init__(self, image_shape, infer_mode=False, character_dict_path=None, padding=True, **kwargs): | ||
| if not norm_before_pad: | ||
| padded_img = (padded_img - mean) / std | ||
|
|
||
| return padded_img, valid_ratio | ||
|
|
||
|
|
||
| class RecResizeNormImg(object): | ||
| ''' adopted from paddle | ||
| Resize and normalize image, and pad image if needed. | ||
|
|
||
| Args: | ||
| norm_before_pad: If True, perform normalization before padding (by doing so, the padding values will be all zero; good practice). Otherwise, perform normalization after padding. Default: False | ||
| ''' | ||
| def __init__(self, | ||
| image_shape, | ||
| infer_mode=False, | ||
| character_dict_path=None, | ||
| padding=True, | ||
| norm_before_pad=False, | ||
| mean=[127.0, 127.0, 127.0], | ||
| std=[127.0, 127.0, 127.0], | ||
| **kwargs): | ||
| self.image_shape = image_shape | ||
| self.infer_mode = infer_mode | ||
| self.character_dict_path = character_dict_path | ||
| self.padding = padding | ||
| self.norm_before_pad = norm_before_pad | ||
| self.mean = np.array(mean, dtype="float32") | ||
| self.std = np.array(std, dtype="float32") | ||
|
|
||
| def __call__(self, data): | ||
| img = data["image"] | ||
| img = data['image'] | ||
| if self.infer_mode and self.character_dict_path is not None: | ||
| norm_img, valid_ratio = resize_norm_img_chinese(img, self.image_shape) | ||
| norm_img, valid_ratio = resize_norm_img_chinese(img, | ||
| self.image_shape, | ||
| self.norm_before_pad, | ||
| self.mean, | ||
| self.std | ||
| ) | ||
| else: | ||
| norm_img, valid_ratio = resize_norm_img(img, self.image_shape, self.padding) | ||
| data["image"] = norm_img | ||
| data["valid_ratio"] = valid_ratio | ||
| # TODO: data['shape_list'] = ? | ||
| norm_img, valid_ratio = resize_norm_img(img, | ||
| self.image_shape, | ||
| self.padding, | ||
| self.norm_before_pad, | ||
| self.mean, | ||
| self.std, | ||
| ) | ||
| data['image'] = norm_img | ||
| data['valid_ratio'] = valid_ratio | ||
| return data | ||
|
|
||
|
|
||
| # TODO: remove infer_mode and character_dict_path if they are not necessary | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 那个 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 好,我看下
|
||
class RecResizeImg(RecResizeNormImg):
    '''
    Compatibility shim for older code and yaml files that still use RecResizeImg.

    It delegates to RecResizeNormImg with mean 0 and std 1, so the
    "(img - mean) / std" step leaves pixel values numerically unchanged and
    only the resize (and optional padding) takes effect.

    Args:
        image_shape: target image shape, forwarded to RecResizeNormImg.
        infer_mode: forwarded to RecResizeNormImg.
        character_dict_path: forwarded to RecResizeNormImg.
        padding: forwarded to RecResizeNormImg.
        **kwargs: accepted for yaml compatibility; not forwarded.

    TODO: replace RecResizeImg followed by NormalizeImage in yaml files with the RecResizeNormImg op.
    '''

    def __init__(self, image_shape, infer_mode=False, character_dict_path=None, padding=True, **kwargs):
        # `super()` must be called to obtain the bound proxy; `super.__init__`
        # would invoke the initializer of the `super` type itself and fail.
        super().__init__(
            image_shape,
            infer_mode,
            character_dict_path,
            padding,
            # Spelled exactly as the parent's parameter; a misspelled keyword
            # would be swallowed by the parent's **kwargs and silently ignored.
            norm_before_pad=False,
            mean=[0.0, 0.0, 0.0],
            std=[1.0, 1.0, 1.0],
        )
|
|
||
|
|
||
| class SVTRRecResizeImg(object): | ||
| def __init__(self, image_shape, padding=True, **kwargs): | ||
| self.image_shape = image_shape | ||
|
|
@@ -425,9 +478,7 @@ def __call__(self, data): | |
|
|
||
| # TODO: norm before padding | ||
|
|
||
| data["shape_list"] = np.array( | ||
| [h, w, resize_h / h, resize_w / w], dtype=np.float32 | ||
| ) # TODO: reformat, currently align to det | ||
| data['shape_list'] = [h, w, resize_h / h, resize_w / w] # TODO: reformat, currently align to det | ||
| if self.norm_before_pad: | ||
| resized_img = self.norm(resized_img) | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
这一步是多余的。 等价于 max_wh_ratio=max_wh_ratio
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
嗯嗯fixed