Skip to content

Commit 58417d8

Browse files
TingquanGao, liushuai35, changdazhou
authored
[Cherry pick] PP-StructureV3 (#3620)
* fix pre cut & doc title * update pre_cut labels & fix table formula bug * don't process space for reference block * bugfix * sort the reference block that cross multiple columns independently * fix channel order of layout result * structurev3 support to render imgs in table when save markdown * sort formulas and text in a line && bug fix * bugfix: process imgs in doc * convert "References" and "Abstract" text into title format (#3615) * convert "References" and "Abstract" text into title format * add space when concatenating title texts * resolve conflicts --------- Co-authored-by: liushuai35 <2690170518@qq.com> Co-authored-by: zhouchangda <zhouchangda@baidu.com>
1 parent 75a9260 commit 58417d8

File tree

8 files changed

+2837
-2281
lines changed

8 files changed

+2837
-2281
lines changed

docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md

Lines changed: 1833 additions & 0 deletions
Large diffs are not rendered by default.

docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.md renamed to docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md

Lines changed: 294 additions & 158 deletions
Large diffs are not rendered by default.

docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.en.md

Lines changed: 0 additions & 1521 deletions
This file was deleted.

paddlex/configs/pipelines/PP-StructureV3.yaml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@ SubModules:
1616
7: 0.3
1717
layout_nms: True
1818
layout_unclip_ratio: 1.0
19-
layout_merge_bboxes_mode: "large"
19+
layout_merge_bboxes_mode:
20+
1: "large" # image
21+
18: "large" # chart
22+
7: "large" # formula
2023

2124
SubPipelines:
2225
DocPreprocessor:
@@ -41,7 +44,7 @@ SubPipelines:
4144
SubModules:
4245
TextDetection:
4346
module_name: text_detection
44-
model_name: PP-OCRv4_mobile_det
47+
model_name: PP-OCRv4_server_det
4548
model_dir: null
4649
limit_side_len: 960
4750
limit_type: max

paddlex/configs/pipelines/layout_parsing.yaml

Lines changed: 0 additions & 101 deletions
This file was deleted.

paddlex/inference/pipelines/layout_parsing/pipeline_v2.py

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from typing import Optional, Union, Tuple, Iterator
1717
import numpy as np
1818
import re
19+
import copy
1920

2021
from ....utils import logging
2122
from ...common.batch_sampler import ImageBatchSampler
@@ -25,8 +26,7 @@
2526
from ..base import BasePipeline
2627
from ..ocr.result import OCRResult
2728
from .result_v2 import LayoutParsingResultV2
28-
from .utils import get_single_block_parsing_res
29-
from .utils import get_sub_regions_ocr_res
29+
from .utils import get_single_block_parsing_res, get_sub_regions_ocr_res, gather_imgs
3030

3131

3232
class LayoutParsingPipelineV2(BasePipeline):
@@ -227,6 +227,7 @@ def get_layout_parsing_res(
227227
table_res_list: list,
228228
seal_res_list: list,
229229
formula_res_list: list,
230+
imgs_in_doc: list,
230231
text_det_limit_side_len: Optional[int] = None,
231232
text_det_limit_type: Optional[str] = None,
232233
text_det_thresh: Optional[float] = None,
@@ -309,14 +310,17 @@ def get_layout_parsing_res(
309310
del overall_ocr_res["rec_polys"][matched_idx]
310311
del overall_ocr_res["rec_scores"][matched_idx]
311312

312-
if sub_ocr_res["rec_boxes"] != []:
313+
if sub_ocr_res["rec_boxes"].size > 0:
314+
sub_ocr_res["rec_labels"] = ["text"] * len(sub_ocr_res["rec_texts"])
315+
313316
overall_ocr_res["dt_polys"].extend(sub_ocr_res["dt_polys"])
314317
overall_ocr_res["rec_texts"].extend(sub_ocr_res["rec_texts"])
315318
overall_ocr_res["rec_boxes"] = np.concatenate(
316319
[overall_ocr_res["rec_boxes"], sub_ocr_res["rec_boxes"]], axis=0
317320
)
318321
overall_ocr_res["rec_polys"].extend(sub_ocr_res["rec_polys"])
319322
overall_ocr_res["rec_scores"].extend(sub_ocr_res["rec_scores"])
323+
overall_ocr_res["rec_labels"].extend(sub_ocr_res["rec_labels"])
320324

321325
for formula_res in formula_res_list:
322326
x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
@@ -331,14 +335,17 @@ def get_layout_parsing_res(
331335
overall_ocr_res["rec_boxes"] = np.vstack(
332336
(overall_ocr_res["rec_boxes"], [formula_res["dt_polys"]])
333337
)
338+
overall_ocr_res["rec_labels"].append("formula")
334339
overall_ocr_res["rec_polys"].append(poly_points)
335340
overall_ocr_res["rec_scores"].append(1)
336341

337342
parsing_res_list = get_single_block_parsing_res(
343+
self.general_ocr_pipeline,
338344
overall_ocr_res=overall_ocr_res,
339345
layout_det_res=layout_det_res,
340346
table_res_list=table_res_list,
341347
seal_res_list=seal_res_list,
348+
imgs_in_doc=imgs_in_doc,
342349
)
343350

344351
return parsing_res_list
@@ -472,7 +479,7 @@ def predict(
472479
if not self.check_model_settings_valid(model_settings):
473480
yield {"error": "the input params for model settings are invalid!"}
474481

475-
for img_id, batch_data in enumerate(self.batch_sampler(input)):
482+
for batch_data in self.batch_sampler(input):
476483
image_array = self.img_reader(batch_data.instances)[0]
477484

478485
if model_settings["use_doc_preprocessor"]:
@@ -497,6 +504,7 @@ def predict(
497504
layout_merge_bboxes_mode=layout_merge_bboxes_mode,
498505
)
499506
)
507+
imgs_in_doc = gather_imgs(doc_preprocessor_image, layout_det_res["boxes"])
500508

501509
if model_settings["use_formula_recognition"]:
502510
formula_res_all = next(
@@ -535,15 +543,55 @@ def predict(
535543
else:
536544
overall_ocr_res = {}
537545

546+
overall_ocr_res["rec_labels"] = ["text"] * len(overall_ocr_res["rec_texts"])
547+
538548
if model_settings["use_table_recognition"]:
549+
table_contents = copy.deepcopy(overall_ocr_res)
550+
for formula_res in formula_res_list:
551+
x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
552+
poly_points = [
553+
(x_min, y_min),
554+
(x_max, y_min),
555+
(x_max, y_max),
556+
(x_min, y_max),
557+
]
558+
table_contents["dt_polys"].append(poly_points)
559+
table_contents["rec_texts"].append(
560+
f"${formula_res['rec_formula']}$"
561+
)
562+
table_contents["rec_boxes"] = np.vstack(
563+
(table_contents["rec_boxes"], [formula_res["dt_polys"]])
564+
)
565+
table_contents["rec_polys"].append(poly_points)
566+
table_contents["rec_scores"].append(1)
567+
568+
for img in imgs_in_doc:
569+
img_path = img["path"]
570+
x_min, y_min, x_max, y_max = img["coordinate"]
571+
poly_points = [
572+
(x_min, y_min),
573+
(x_max, y_min),
574+
(x_max, y_max),
575+
(x_min, y_max),
576+
]
577+
table_contents["dt_polys"].append(poly_points)
578+
table_contents["rec_texts"].append(
579+
f'<div style="text-align: center;"><img src="{img_path}" alt="Image" /></div>'
580+
)
581+
table_contents["rec_boxes"] = np.vstack(
582+
(table_contents["rec_boxes"], img["coordinate"])
583+
)
584+
table_contents["rec_polys"].append(poly_points)
585+
table_contents["rec_scores"].append(img["score"])
586+
539587
table_res_all = next(
540588
self.table_recognition_pipeline(
541589
doc_preprocessor_image,
542590
use_doc_orientation_classify=False,
543591
use_doc_unwarping=False,
544592
use_layout_detection=False,
545593
use_ocr_model=False,
546-
overall_ocr_res=overall_ocr_res,
594+
overall_ocr_res=table_contents,
547595
layout_det_res=layout_det_res,
548596
cell_sort_by_y_projection=True,
549597
),
@@ -579,6 +627,7 @@ def predict(
579627
table_res_list=table_res_list,
580628
seal_res_list=seal_res_list,
581629
formula_res_list=formula_res_list,
630+
imgs_in_doc=imgs_in_doc,
582631
text_det_limit_side_len=text_det_limit_side_len,
583632
text_det_limit_type=text_det_limit_type,
584633
text_det_thresh=text_det_thresh,
@@ -603,6 +652,7 @@ def predict(
603652
"seal_res_list": seal_res_list,
604653
"formula_res_list": formula_res_list,
605654
"parsing_res_list": parsing_res_list,
655+
"imgs_in_doc": imgs_in_doc,
606656
"model_settings": model_settings,
607657
}
608658
yield LayoutParsingResultV2(single_img_res)

paddlex/inference/pipelines/layout_parsing/result_v2.py

Lines changed: 22 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@
3030
XlsxMixin,
3131
)
3232
from .utils import get_layout_ordering
33-
from .utils import recursive_img_array2path
3433
from .utils import get_show_color
3534

3635

@@ -90,7 +89,7 @@ def _to_img(self) -> dict[str, np.ndarray]:
9089
res_img_dict[key] = sub_seal_res_dict["ocr_res_img"]
9190

9291
# for layout ordering image
93-
image = Image.fromarray(self["doc_preprocessor_res"]["output_img"])
92+
image = Image.fromarray(self["doc_preprocessor_res"]["output_img"][:, :, ::-1])
9493
draw = ImageDraw.Draw(image, "RGBA")
9594
parsing_result = self["parsing_res_list"]
9695
for block in parsing_result:
@@ -238,7 +237,6 @@ def _to_markdown(self) -> dict:
238237
Returns:
239238
Dict
240239
"""
241-
recursive_img_array2path(self["parsing_res_list"], labels=["block_image"])
242240

243241
def _format_data(obj):
244242

@@ -276,14 +274,16 @@ def format_image(label):
276274
)
277275
return "\n".join(img_tags)
278276

279-
def format_reference():
280-
pattern = r"\s*\[\s*\d+\s*\]\s*"
281-
res = re.sub(
282-
pattern,
283-
lambda match: "\n" + match.group(),
284-
block["reference"].replace("\n", ""),
285-
)
286-
return "\n" + res
277+
def format_first_line(templates, format_func, spliter):
278+
lines = block["block_content"].split(spliter)
279+
for idx in range(len(lines)):
280+
line = lines[idx]
281+
if line.strip() == "":
282+
continue
283+
if line.lower() in templates:
284+
lines[idx] = format_func(line)
285+
break
286+
return spliter.join(lines)
287287

288288
def format_table():
289289
return "\n" + block["block_content"]
@@ -300,19 +300,21 @@ def format_table():
300300
"text": lambda: block["block_content"]
301301
.replace("-\n", " ")
302302
.replace("\n", " "),
303-
"abstract": lambda: block["block_content"]
304-
.replace("-\n", " ")
305-
.replace("\n", " "),
303+
"abstract": lambda: format_first_line(
304+
["摘要", "abstract"], lambda l: f"## {l}\n", " "
305+
),
306306
"content": lambda: block["block_content"]
307307
.replace("-\n", " ")
308308
.replace("\n", " "),
309309
"image": lambda: format_image("block_image"),
310310
"chart": lambda: format_image("block_image"),
311311
"formula": lambda: f"$${block['block_content']}$$",
312312
"table": format_table,
313-
"reference": lambda: block["block_content"],
313+
"reference": lambda: format_first_line(
314+
["参考文献", "references"], lambda l: f"## {l}", "\n"
315+
),
314316
"algorithm": lambda: block["block_content"].strip("\n"),
315-
"seal": lambda: format_image("block_content"),
317+
"seal": lambda: f"Words of Seals:\n{block['block_content']}",
316318
}
317319
parsing_res_list = obj["parsing_res_list"]
318320
markdown_content = ""
@@ -382,10 +384,9 @@ def format_table():
382384
page_first_element_seg_start_flag,
383385
page_last_element_seg_end_flag,
384386
)
385-
markdown_info["markdown_images"] = dict()
386-
for block in self["parsing_res_list"]:
387-
if block["block_label"] in ["image", "chart"]:
388-
image_path, image_value = next(iter(block["block_image"].items()))
389-
markdown_info["markdown_images"][image_path] = image_value
387+
388+
markdown_info["markdown_images"] = {}
389+
for img in self["imgs_in_doc"]:
390+
markdown_info["markdown_images"][img["path"]] = img["img"]
390391

391392
return markdown_info

0 commit comments

Comments
 (0)