PaddlePaddle
diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md
Lines changed: 1833 additions & 0 deletions b/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md
Lines changed: 1833 additions & 0 deletions
diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.md renamed to docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md
Lines changed: 294 additions & 158 deletions b/docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.md renamed to docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md
Lines changed: 294 additions & 158 deletions
diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.en.md
Lines changed: 0 additions & 1521 deletions b/docs/pipeline_usage/tutorials/ocr_pipelines/layout_parsing.en.md
Lines changed: 0 additions & 1521 deletions
diff --git a/paddlex/configs/pipelines/PP-StructureV3.yaml
Lines changed: 5 additions & 2 deletions b/paddlex/configs/pipelines/PP-StructureV3.yaml
Lines changed: 5 additions & 2 deletions
diff --git a/paddlex/configs/pipelines/layout_parsing.yaml
Lines changed: 0 additions & 101 deletions b/paddlex/configs/pipelines/layout_parsing.yaml
Lines changed: 0 additions & 101 deletions
diff --git a/paddlex/inference/pipelines/layout_parsing/pipeline_v2.py
Lines changed: 55 additions & 5 deletions b/paddlex/inference/pipelines/layout_parsing/pipeline_v2.py
Lines changed: 55 additions & 5 deletions
diff --git a/paddlex/inference/pipelines/layout_parsing/result_v2.py
Lines changed: 22 additions & 21 deletions b/paddlex/inference/pipelines/layout_parsing/result_v2.py
Lines changed: 22 additions & 21 deletions
@@ -16,7 +16,10 @@ SubModules:
       7: 0.3
     layout_nms: True
     layout_unclip_ratio: 1.0
-    layout_merge_bboxes_mode: "large"
+    layout_merge_bboxes_mode: 
+      1: "large"  # image
+      18: "large" # chart
+      7: "large"  # formula
 
 SubPipelines:
   DocPreprocessor:
@@ -41,7 +44,7 @@ SubPipelines:
     SubModules:
       TextDetection:
         module_name: text_detection
-        model_name: PP-OCRv4_mobile_det
+        model_name: PP-OCRv4_server_det
         model_dir: null
         limit_side_len: 960
         limit_type: max
 
@@ -16,6 +16,7 @@
 from typing import Optional, Union, Tuple, Iterator
 import numpy as np
 import re
+import copy
 
 from ....utils import logging
 from ...common.batch_sampler import ImageBatchSampler
@@ -25,8 +26,7 @@
 from ..base import BasePipeline
 from ..ocr.result import OCRResult
 from .result_v2 import LayoutParsingResultV2
-from .utils import get_single_block_parsing_res
-from .utils import get_sub_regions_ocr_res
+from .utils import get_single_block_parsing_res, get_sub_regions_ocr_res, gather_imgs
 
 
 class LayoutParsingPipelineV2(BasePipeline):
@@ -227,6 +227,7 @@ def get_layout_parsing_res(
         table_res_list: list,
         seal_res_list: list,
         formula_res_list: list,
+        imgs_in_doc: list,
         text_det_limit_side_len: Optional[int] = None,
         text_det_limit_type: Optional[str] = None,
         text_det_thresh: Optional[float] = None,
@@ -309,14 +310,17 @@ def get_layout_parsing_res(
                     del overall_ocr_res["rec_polys"][matched_idx]
                     del overall_ocr_res["rec_scores"][matched_idx]
 
-                if sub_ocr_res["rec_boxes"] != []:
+                if sub_ocr_res["rec_boxes"].size > 0:
+                    sub_ocr_res["rec_labels"] = ["text"] * len(sub_ocr_res["rec_texts"])
+
                     overall_ocr_res["dt_polys"].extend(sub_ocr_res["dt_polys"])
                     overall_ocr_res["rec_texts"].extend(sub_ocr_res["rec_texts"])
                     overall_ocr_res["rec_boxes"] = np.concatenate(
                         [overall_ocr_res["rec_boxes"], sub_ocr_res["rec_boxes"]], axis=0
                     )
                     overall_ocr_res["rec_polys"].extend(sub_ocr_res["rec_polys"])
                     overall_ocr_res["rec_scores"].extend(sub_ocr_res["rec_scores"])
+                    overall_ocr_res["rec_labels"].extend(sub_ocr_res["rec_labels"])
 
         for formula_res in formula_res_list:
             x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
@@ -331,14 +335,17 @@ def get_layout_parsing_res(
             overall_ocr_res["rec_boxes"] = np.vstack(
                 (overall_ocr_res["rec_boxes"], [formula_res["dt_polys"]])
             )
+            overall_ocr_res["rec_labels"].append("formula")
             overall_ocr_res["rec_polys"].append(poly_points)
             overall_ocr_res["rec_scores"].append(1)
 
         parsing_res_list = get_single_block_parsing_res(
+            self.general_ocr_pipeline,
             overall_ocr_res=overall_ocr_res,
             layout_det_res=layout_det_res,
             table_res_list=table_res_list,
             seal_res_list=seal_res_list,
+            imgs_in_doc=imgs_in_doc,
         )
 
         return parsing_res_list
@@ -472,7 +479,7 @@ def predict(
         if not self.check_model_settings_valid(model_settings):
             yield {"error": "the input params for model settings are invalid!"}
 
-        for img_id, batch_data in enumerate(self.batch_sampler(input)):
+        for batch_data in self.batch_sampler(input):
             image_array = self.img_reader(batch_data.instances)[0]
 
             if model_settings["use_doc_preprocessor"]:
@@ -497,6 +504,7 @@ def predict(
                     layout_merge_bboxes_mode=layout_merge_bboxes_mode,
                 )
             )
+            imgs_in_doc = gather_imgs(doc_preprocessor_image, layout_det_res["boxes"])
 
             if model_settings["use_formula_recognition"]:
                 formula_res_all = next(
@@ -535,15 +543,55 @@ def predict(
             else:
                 overall_ocr_res = {}
 
+            overall_ocr_res["rec_labels"] = ["text"] * len(overall_ocr_res["rec_texts"])
+
             if model_settings["use_table_recognition"]:
+                table_contents = copy.deepcopy(overall_ocr_res)
+                for formula_res in formula_res_list:
+                    x_min, y_min, x_max, y_max = list(map(int, formula_res["dt_polys"]))
+                    poly_points = [
+                        (x_min, y_min),
+                        (x_max, y_min),
+                        (x_max, y_max),
+                        (x_min, y_max),
+                    ]
+                    table_contents["dt_polys"].append(poly_points)
+                    table_contents["rec_texts"].append(
+                        f"${formula_res['rec_formula']}$"
+                    )
+                    table_contents["rec_boxes"] = np.vstack(
+                        (table_contents["rec_boxes"], [formula_res["dt_polys"]])
+                    )
+                    table_contents["rec_polys"].append(poly_points)
+                    table_contents["rec_scores"].append(1)
+
+                for img in imgs_in_doc:
+                    img_path = img["path"]
+                    x_min, y_min, x_max, y_max = img["coordinate"]
+                    poly_points = [
+                        (x_min, y_min),
+                        (x_max, y_min),
+                        (x_max, y_max),
+                        (x_min, y_max),
+                    ]
+                    table_contents["dt_polys"].append(poly_points)
+                    table_contents["rec_texts"].append(
+                        f'<div style="text-align: center;"><img src="{img_path}" alt="Image" /></div>'
+                    )
+                    table_contents["rec_boxes"] = np.vstack(
+                        (table_contents["rec_boxes"], img["coordinate"])
+                    )
+                    table_contents["rec_polys"].append(poly_points)
+                    table_contents["rec_scores"].append(img["score"])
+
                 table_res_all = next(
                     self.table_recognition_pipeline(
                         doc_preprocessor_image,
                         use_doc_orientation_classify=False,
                         use_doc_unwarping=False,
                         use_layout_detection=False,
                         use_ocr_model=False,
-                        overall_ocr_res=overall_ocr_res,
+                        overall_ocr_res=table_contents,
                         layout_det_res=layout_det_res,
                         cell_sort_by_y_projection=True,
                     ),
@@ -579,6 +627,7 @@ def predict(
                 table_res_list=table_res_list,
                 seal_res_list=seal_res_list,
                 formula_res_list=formula_res_list,
+                imgs_in_doc=imgs_in_doc,
                 text_det_limit_side_len=text_det_limit_side_len,
                 text_det_limit_type=text_det_limit_type,
                 text_det_thresh=text_det_thresh,
@@ -603,6 +652,7 @@ def predict(
                 "seal_res_list": seal_res_list,
                 "formula_res_list": formula_res_list,
                 "parsing_res_list": parsing_res_list,
+                "imgs_in_doc": imgs_in_doc,
                 "model_settings": model_settings,
             }
             yield LayoutParsingResultV2(single_img_res)
 
@@ -30,7 +30,6 @@
     XlsxMixin,
 )
 from .utils import get_layout_ordering
-from .utils import recursive_img_array2path
 from .utils import get_show_color
 
 
@@ -90,7 +89,7 @@ def _to_img(self) -> dict[str, np.ndarray]:
                 res_img_dict[key] = sub_seal_res_dict["ocr_res_img"]
 
         # for layout ordering image
-        image = Image.fromarray(self["doc_preprocessor_res"]["output_img"])
+        image = Image.fromarray(self["doc_preprocessor_res"]["output_img"][:, :, ::-1])
         draw = ImageDraw.Draw(image, "RGBA")
         parsing_result = self["parsing_res_list"]
         for block in parsing_result:
@@ -238,7 +237,6 @@ def _to_markdown(self) -> dict:
         Returns:
             Dict
         """
-        recursive_img_array2path(self["parsing_res_list"], labels=["block_image"])
 
         def _format_data(obj):
 
@@ -276,14 +274,16 @@ def format_image(label):
                 )
                 return "\n".join(img_tags)
 
-            def format_reference():
-                pattern = r"\s*\[\s*\d+\s*\]\s*"
-                res = re.sub(
-                    pattern,
-                    lambda match: "\n" + match.group(),
-                    block["reference"].replace("\n", ""),
-                )
-                return "\n" + res
+            def format_first_line(templates, format_func, spliter):
+                lines = block["block_content"].split(spliter)
+                for idx in range(len(lines)):
+                    line = lines[idx]
+                    if line.strip() == "":
+                        continue
+                    if line.lower() in templates:
+                        lines[idx] = format_func(line)
+                    break
+                return spliter.join(lines)
 
             def format_table():
                 return "\n" + block["block_content"]
@@ -300,19 +300,21 @@ def format_table():
                 "text": lambda: block["block_content"]
                 .replace("-\n", " ")
                 .replace("\n", " "),
-                "abstract": lambda: block["block_content"]
-                .replace("-\n", " ")
-                .replace("\n", " "),
+                "abstract": lambda: format_first_line(
+                    ["摘要", "abstract"], lambda l: f"## {l}\n", " "
+                ),
                 "content": lambda: block["block_content"]
                 .replace("-\n", " ")
                 .replace("\n", " "),
                 "image": lambda: format_image("block_image"),
                 "chart": lambda: format_image("block_image"),
                 "formula": lambda: f"$${block['block_content']}$$",
                 "table": format_table,
-                "reference": lambda: block["block_content"],
+                "reference": lambda: format_first_line(
+                    ["参考文献", "references"], lambda l: f"## {l}", "\n"
+                ),
                 "algorithm": lambda: block["block_content"].strip("\n"),
-                "seal": lambda: format_image("block_content"),
+                "seal": lambda: f"Words of Seals:\n{block['block_content']}",
             }
             parsing_res_list = obj["parsing_res_list"]
             markdown_content = ""
@@ -382,10 +384,9 @@ def format_table():
             page_first_element_seg_start_flag,
             page_last_element_seg_end_flag,
         )
-        markdown_info["markdown_images"] = dict()
-        for block in self["parsing_res_list"]:
-            if block["block_label"] in ["image", "chart"]:
-                image_path, image_value = next(iter(block["block_image"].items()))
-                markdown_info["markdown_images"][image_path] = image_value
+
+        markdown_info["markdown_images"] = {}
+        for img in self["imgs_in_doc"]:
+            markdown_info["markdown_images"][img["path"]] = img["img"]
 
         return markdown_info