From 387ea449631f9a99e5c301d466c31870e5b9efce Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 24 Dec 2023 11:00:34 +0100 Subject: [PATCH] MAINT: Change the positions of the calls of the visitor-function Previously, the text-visitor-function was called at each change of the output. But this can lead to wrong coordinates because the output may be sent after changing the text-matrix for the next text. As an example have a look at resources/Sample_Td-matrix.pdf: The text_matrix is computed correctly at the Td-operations but the text was sent after applying the next transformation. In this pull request the texts are sent inside the TJ and Tj operations. This may lead to sending letters instead of words: ``` x=264.53, y=403.13, text='M' x=264.53, y=403.13, text='etad' x=264.53, y=403.13, text='ata' x=307.85, y=403.13, text=' ' ``` Therefore there is a second commit which introduces a temporary visitor inside the processing of TJ. The temp visitor is used to collect the letters of TJ, which will be sent after the processing of TJ. When setting the temp visitor the original parameter is manipulated. I don't know if this is bad style in Python. In case of bad style a local variable current_text_visitor may be introduced. See also issue #1377. I haven't checked if #1377 had the Td-matrix-problem or the one to be solved by this PR. -- This PR is a copy of https://github.com/py-pdf/pypdf/pull/1389 The PR#1389 was made a long time ago (before we renamed to pypdf), but it still seems valuable. This PR migrates the changes to the new codebase. Full credit to rogmann for all of the changes. 
Co-authored-by: rogmann --- pypdf/_page.py | 66 +++++++++++++++++++++--------- pypdf/_text_extraction/__init__.py | 43 +++++++++++++++---- tests/test_page.py | 38 ++++++++++++----- 3 files changed, 109 insertions(+), 38 deletions(-) diff --git a/pypdf/_page.py b/pypdf/_page.py index 120e15a19..8b6461082 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1873,6 +1873,7 @@ def _extract_text( visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, + group_TJ: bool = True, ) -> str: """ See extract_text for most arguments. @@ -1957,16 +1958,12 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: if operator == b"BT": tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() return None elif operator == b"ET": output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() @@ -1999,8 +1996,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] elif operator == b"cm": output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" cm_matrix = mult( [ @@ -2025,8 +2020,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: elif operator == b"Tf": if text != "": output += text # .translate(cmap) - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) text = "" memo_cm = cm_matrix.copy() memo_tm = tm_matrix.copy() @@ -2132,6 +2125,34 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: process_operation(b"TL", [-operands[1]]) 
process_operation(b"Td", operands) elif operator == b"TJ": + if visitor_text is not None and group_TJ: + # To prevent sending letters instead of words we + # override the visitor temporarily. + visitor_text_before = visitor_text + tm_matrix_before = [ + tm_matrix[0], + tm_matrix[1], + tm_matrix[2], + tm_matrix[3], + tm_matrix[4], + tm_matrix[5], + ] + text_TJ: List[str] = [] + + def visitor_text( + text: str, + cm_matrix: Any, + tm_matrix: Any, + font_dict: Any, + font_size: Any, + ) -> None: + # TODO cases where the current inserting order is kept + if rtl_dir: + # right-to-left + text_TJ.insert(0, text) # noqa + else: + text_TJ.append(text) # noqa + for op in operands[0]: if isinstance(op, (str, bytes)): process_operation(b"Tj", [op]) @@ -2141,10 +2162,17 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: and (text[-1] != " ") ): process_operation(b"Tj", [" "]) + if visitor_text is not None and group_TJ: + visitor_text = visitor_text_before + visitor_text( + "".join(text_TJ), + cm_matrix, + tm_matrix_before, + cmap[3], + font_size, + ) elif operator == b"Do": output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) try: if output[-1] != "\n": output += "\n" @@ -2168,16 +2196,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: visitor_operand_before, visitor_operand_after, visitor_text, + group_TJ, ) output += text - if visitor_text is not None: - visitor_text( - text, - memo_cm, - memo_tm, - cmap[3], - font_size, - ) except Exception: logger_warning( f" impossible to decode XFormObject {operands[0]}", @@ -2193,8 +2214,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None: if visitor_operand_after is not None: visitor_operand_after(operator, operands, cm_matrix, tm_matrix) output += text # just in case of - if text != "" and visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) return output def extract_text( @@ -2207,6 
+2226,7 @@ def extract_text( visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, + group_TJ: bool = True, ) -> str: """ Locate all text drawing commands, in the order they are provided in the @@ -2246,6 +2266,8 @@ def extract_text( text matrix, font-dictionary and font-size. The font-dictionary may be None in case of unknown fonts. If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold". + group_TJ: True for one call of visitor_text at each TJ, + False for calls of visitor_text at each text-fragment of TJ. Returns: The extracted text @@ -2295,6 +2317,7 @@ def extract_text( visitor_operand_before, visitor_operand_after, visitor_text, + group_TJ, ) def extract_xform_text( @@ -2305,6 +2328,7 @@ def extract_xform_text( visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, + group_TJ: bool = True, ) -> str: """ Extract text from an XObject. @@ -2316,6 +2340,8 @@ def extract_xform_text( visitor_operand_before: visitor_operand_after: visitor_text: + group_TJ: True for one call of visitor_text at each TJ, + False for calls of visitor_text at each text-fragment of TJ. 
Returns: The extracted text diff --git a/pypdf/_text_extraction/__init__.py b/pypdf/_text_extraction/__init__.py index 37af3cd54..9e1b08164 100644 --- a/pypdf/_text_extraction/__init__.py +++ b/pypdf/_text_extraction/__init__.py @@ -123,7 +123,7 @@ def crlf_space_check( output += text + "\n" if visitor_text is not None: visitor_text( - text + "\n", + "\n", memo_cm, memo_tm, cmap[3], @@ -136,13 +136,21 @@ def crlf_space_check( and (output + text)[-1] != " " ): text += " " + if visitor_text is not None: + visitor_text( + " ", + cm_matrix, + tm_matrix, + cmap[3], + font_size, + ) elif orientation == 180: if delta_y > 0.8 * f: if (output + text)[-1] != "\n": output += text + "\n" if visitor_text is not None: visitor_text( - text + "\n", + "\n", memo_cm, memo_tm, cmap[3], @@ -155,13 +163,21 @@ def crlf_space_check( and (output + text)[-1] != " " ): text += " " + if visitor_text is not None: + visitor_text( + " ", + cm_matrix, + tm_matrix, + cmap[3], + font_size, + ) elif orientation == 90: if delta_x > 0.8 * f: if (output + text)[-1] != "\n": output += text + "\n" if visitor_text is not None: visitor_text( - text + "\n", + "\n", memo_cm, memo_tm, cmap[3], @@ -180,7 +196,7 @@ def crlf_space_check( output += text + "\n" if visitor_text is not None: visitor_text( - text + "\n", + "\n", memo_cm, memo_tm, cmap[3], @@ -193,6 +209,14 @@ def crlf_space_check( and (output + text)[-1] != " " ): text += " " + if visitor_text is not None: + visitor_text( + " ", + cm_matrix, + tm_matrix, + cmap[3], + font_size, + ) except Exception: pass tm_prev = tm_matrix.copy() @@ -214,12 +238,13 @@ def handle_tj( rtl_dir: bool, visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], ) -> Tuple[str, bool]: - m = mult(tm_matrix, cm_matrix) orientation = orient(m) if orientation in orientations and len(operands) > 0: if isinstance(operands[0], str): text += operands[0] + if visitor_text is not None: + visitor_text(operands[0], cm_matrix, tm_matrix, cmap[3], font_size) else: t: str = "" 
tt: bytes = ( @@ -243,6 +268,7 @@ def handle_tj( [cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt] ) # "\u0590 - \u08FF \uFB50 - \uFDFF" + tj_text = "" for x in [cmap[1][x] if x in cmap[1] else x for x in t]: # x can be a sequence of bytes ; ex: habibi.pdf if len(x) == 1: @@ -258,7 +284,7 @@ def handle_tj( or 0x20A0 <= xx <= 0x21FF # but (numbers) indices/exponents or xx in CUSTOM_RTL_SPECIAL_CHARS # customized.... ): - text = x + text if rtl_dir else text + x + tj_text = x + tj_text if rtl_dir else tj_text + x elif ( # right-to-left characters set 0x0590 <= xx <= 0x08FF or 0xFB1D <= xx <= 0xFDFF @@ -280,6 +306,9 @@ def handle_tj( if visitor_text is not None: visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) text = "" - text = text + x + tj_text = tj_text + x # fmt: on + text = tj_text + text if rtl_dir else text + tj_text + if visitor_text is not None: + visitor_text(tj_text, cm_matrix, tm_matrix, cmap[3], font_size) return text, rtl_dir diff --git a/tests/test_page.py b/tests/test_page.py index 1c388c426..a8dbf0a79 100644 --- a/tests/test_page.py +++ b/tests/test_page.py @@ -1,6 +1,7 @@ """Test the pypdf._page module.""" import json import math +import re from copy import deepcopy from io import BytesIO from pathlib import Path @@ -545,7 +546,7 @@ def print_op_b(op, args, cm_matrix, tm_matrix) -> None: rectangles.append(r) def print_visi(text, cm_matrix, tm_matrix, font_dict, font_size) -> None: - if text.strip() != "": + if text != "": if logger.isEnabledFor(logging.DEBUG): logger.debug(f"at {cm_matrix}, {tm_matrix}, font size={font_size}") texts.append( @@ -571,7 +572,7 @@ def extract_table( It is expected that each cell is marked by a rectangle-object. It is expected that the page contains one table only. - It is expected that the table contains at least 3 columns and 2 rows. + It is expected that the table contains at least 2 columns and 2 rows. A list of rows is returned. Each row contains a list of cells. 
@@ -623,8 +624,8 @@ def extract_table( curr_y = None curr_row = None for r in rectangles_filtered: - if col2count[r.x] < 3 or row2count[r.y] < 2: - # We expect at least 3 columns and 2 rows. + if col2count[r.x] < 2 or row2count[r.y] < 2: + # We expect at least 2 columns and 2 rows. continue if curr_y is None or r.y != curr_y: # next row @@ -646,7 +647,8 @@ def extract_table( def extract_cell_text(cell_texts: List[PositionedText]) -> str: """Joins the text-objects of a cell.""" - return ("".join(t.text for t in cell_texts)).strip() + text_raw = "".join(t.text for t in cell_texts) + return re.sub(r" +\n", "\n", text_raw.strip()) # Test 1: We test the analysis of page 7 "2.1 LRS model". reader = PdfReader(RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf") @@ -667,12 +669,16 @@ def ignore_large_rectangles(r) -> bool: for t in texts: for r in rectangles: if r.contains(t.x, t.y): - texts = rectangle2texts.setdefault(r, []) - texts.append(t.text.strip()) + rtexts = rectangle2texts.setdefault(r, []) + if t.text != "": + rtexts.append(t.text) break # Five boxes and the figure-description below. - assert len(rectangle2texts) == 6 - box_texts = [" ".join(texts) for texts in rectangle2texts.values()] + assert len(rectangle2texts) == 11 + box_texts = [ + re.sub(" *\n", " ", "".join(texts).strip()) + for texts in rectangle2texts.values() + ] assert "Hydro Network" in box_texts assert "Hydro Events" in box_texts assert "Metadata" in box_texts @@ -697,10 +703,10 @@ def filter_first_table(r) -> bool: assert extract_cell_text(rows[0][2]) == "Description" assert extract_cell_text(rows[1][0]) == "September 2002" # The line break between "English review;" - # and "Remove" is not detected. + # and "Remove" is detected. assert ( extract_cell_text(rows[6][2]) - == "English review;Remove the UML model for the Segmented view." + == "English review;\nRemove the UML model for the Segmented view." ) assert extract_cell_text(rows[7][2]) == "Update from the March Workshop comments." 
@@ -738,6 +744,16 @@ def visitor_td(op, args, cm, tm) -> None: assert list_td[2] == (210.0, 210.0) assert list_td[3] == (410.0, 210.0) + # Test 3b: check extract_visitor in Sample_Td-matrix.pdf + # + (texts, rectangles) = extract_text_and_rectangles(page_td_model) + rows = extract_table(texts, rectangles) + assert len(rows) == 2 + assert extract_cell_text(rows[0][0]) == "Hello PDF!" + assert extract_cell_text(rows[0][1]) == "Hello PDF 200 0 Td!" + assert extract_cell_text(rows[1][0]) == "Hello PDF 2 1!" + assert extract_cell_text(rows[1][1]) == "Hello PDF 10 7!" + @pytest.mark.parametrize( ("pdf_path", "password", "embedded", "unembedded"),