Skip to content

Commit 387ea44

Browse files
MartinThomas and rogmann committed
MAINT: Change the positions of the calls of the visitor-function
Previously, the text-visitor-function was called at each change of the output. But this can lead to wrong coordinates because the output may be sent after changing the text-matrix for the next text. As an example, have a look at resources/Sample_Td-matrix.pdf: The text_matrix is computed correctly at the Td-operations, but the text was sent after applying the next transformation. In this pull request the texts are sent inside the TJ and Tj operations. This may lead to sending letters instead of words: ``` x=264.53, y=403.13, text='M' x=264.53, y=403.13, text='etad' x=264.53, y=403.13, text='ata' x=307.85, y=403.13, text=' ' ``` Therefore there is a second commit which introduces a temporary visitor inside the processing of TJ. The temp visitor is used to collect the letters of TJ, which will be sent after processing of TJ. When setting the temp visitor, the original parameter is manipulated. I don't know if this is bad style in Python. In case of bad style, a local variable current_text_visitor may be introduced. See also issue #1377. I haven't checked if #1377 had the Td-matrix-problem or the one to be solved by this PR. -- This PR is a copy of #1389. PR #1389 was made a long time ago (before we renamed to pypdf), but it still seems valuable. This PR migrated the changes to the new codebase. Full credit to rogmann for all of the changes. Co-authored-by: rogmann <github@rogmann.org>
1 parent 3ab1581 commit 387ea44

File tree

3 files changed

+109
-38
lines changed

3 files changed

+109
-38
lines changed

pypdf/_page.py

Lines changed: 46 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1873,6 +1873,7 @@ def _extract_text(
18731873
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
18741874
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
18751875
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
1876+
group_TJ: bool = True,
18761877
) -> str:
18771878
"""
18781879
See extract_text for most arguments.
@@ -1957,16 +1958,12 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
19571958
if operator == b"BT":
19581959
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
19591960
output += text
1960-
if visitor_text is not None:
1961-
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
19621961
text = ""
19631962
memo_cm = cm_matrix.copy()
19641963
memo_tm = tm_matrix.copy()
19651964
return None
19661965
elif operator == b"ET":
19671966
output += text
1968-
if visitor_text is not None:
1969-
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
19701967
text = ""
19711968
memo_cm = cm_matrix.copy()
19721969
memo_tm = tm_matrix.copy()
@@ -1999,8 +1996,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
19991996
cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
20001997
elif operator == b"cm":
20011998
output += text
2002-
if visitor_text is not None:
2003-
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
20041999
text = ""
20052000
cm_matrix = mult(
20062001
[
@@ -2025,8 +2020,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
20252020
elif operator == b"Tf":
20262021
if text != "":
20272022
output += text # .translate(cmap)
2028-
if visitor_text is not None:
2029-
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
20302023
text = ""
20312024
memo_cm = cm_matrix.copy()
20322025
memo_tm = tm_matrix.copy()
@@ -2132,6 +2125,34 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
21322125
process_operation(b"TL", [-operands[1]])
21332126
process_operation(b"Td", operands)
21342127
elif operator == b"TJ":
2128+
if visitor_text is not None and group_TJ:
2129+
# To prevent sending letters instead of words we
2130+
# override the visitor temporarily.
2131+
visitor_text_before = visitor_text
2132+
tm_matrix_before = [
2133+
tm_matrix[0],
2134+
tm_matrix[1],
2135+
tm_matrix[2],
2136+
tm_matrix[3],
2137+
tm_matrix[4],
2138+
tm_matrix[5],
2139+
]
2140+
text_TJ: List[str] = []
2141+
2142+
def visitor_text(
2143+
text: str,
2144+
cm_matrix: Any,
2145+
tm_matrix: Any,
2146+
font_dict: Any,
2147+
font_size: Any,
2148+
) -> None:
2149+
# TODO cases where the current inserting order is kept
2150+
if rtl_dir:
2151+
# right-to-left
2152+
text_TJ.insert(0, text) # noqa
2153+
else:
2154+
text_TJ.append(text) # noqa
2155+
21352156
for op in operands[0]:
21362157
if isinstance(op, (str, bytes)):
21372158
process_operation(b"Tj", [op])
@@ -2141,10 +2162,17 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
21412162
and (text[-1] != " ")
21422163
):
21432164
process_operation(b"Tj", [" "])
2165+
if visitor_text is not None and group_TJ:
2166+
visitor_text = visitor_text_before
2167+
visitor_text(
2168+
"".join(text_TJ),
2169+
cm_matrix,
2170+
tm_matrix_before,
2171+
cmap[3],
2172+
font_size,
2173+
)
21442174
elif operator == b"Do":
21452175
output += text
2146-
if visitor_text is not None:
2147-
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
21482176
try:
21492177
if output[-1] != "\n":
21502178
output += "\n"
@@ -2168,16 +2196,9 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
21682196
visitor_operand_before,
21692197
visitor_operand_after,
21702198
visitor_text,
2199+
group_TJ,
21712200
)
21722201
output += text
2173-
if visitor_text is not None:
2174-
visitor_text(
2175-
text,
2176-
memo_cm,
2177-
memo_tm,
2178-
cmap[3],
2179-
font_size,
2180-
)
21812202
except Exception:
21822203
logger_warning(
21832204
f" impossible to decode XFormObject {operands[0]}",
@@ -2193,8 +2214,6 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
21932214
if visitor_operand_after is not None:
21942215
visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
21952216
output += text # just in case of
2196-
if text != "" and visitor_text is not None:
2197-
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
21982217
return output
21992218

22002219
def extract_text(
@@ -2207,6 +2226,7 @@ def extract_text(
22072226
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
22082227
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
22092228
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
2229+
group_TJ: bool = True,
22102230
) -> str:
22112231
"""
22122232
Locate all text drawing commands, in the order they are provided in the
@@ -2246,6 +2266,8 @@ def extract_text(
22462266
text matrix, font-dictionary and font-size.
22472267
The font-dictionary may be None in case of unknown fonts.
22482268
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
2269+
group_TJ: True for one call of visitor_text at each TJ,
2270+
False for calls of visitor_text at each text-fragment of TJ.
22492271
22502272
Returns:
22512273
The extracted text
@@ -2295,6 +2317,7 @@ def extract_text(
22952317
visitor_operand_before,
22962318
visitor_operand_after,
22972319
visitor_text,
2320+
group_TJ,
22982321
)
22992322

23002323
def extract_xform_text(
@@ -2305,6 +2328,7 @@ def extract_xform_text(
23052328
visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
23062329
visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
23072330
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
2331+
group_TJ: bool = True,
23082332
) -> str:
23092333
"""
23102334
Extract text from an XObject.
@@ -2316,6 +2340,8 @@ def extract_xform_text(
23162340
visitor_operand_before:
23172341
visitor_operand_after:
23182342
visitor_text:
2343+
group_TJ: True for one call of visitor_text at each TJ,
2344+
False for calls of visitor_text at each text-fragment of TJ.
23192345
23202346
Returns:
23212347
The extracted text

pypdf/_text_extraction/__init__.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ def crlf_space_check(
123123
output += text + "\n"
124124
if visitor_text is not None:
125125
visitor_text(
126-
text + "\n",
126+
"\n",
127127
memo_cm,
128128
memo_tm,
129129
cmap[3],
@@ -136,13 +136,21 @@ def crlf_space_check(
136136
and (output + text)[-1] != " "
137137
):
138138
text += " "
139+
if visitor_text is not None:
140+
visitor_text(
141+
" ",
142+
cm_matrix,
143+
tm_matrix,
144+
cmap[3],
145+
font_size,
146+
)
139147
elif orientation == 180:
140148
if delta_y > 0.8 * f:
141149
if (output + text)[-1] != "\n":
142150
output += text + "\n"
143151
if visitor_text is not None:
144152
visitor_text(
145-
text + "\n",
153+
"\n",
146154
memo_cm,
147155
memo_tm,
148156
cmap[3],
@@ -155,13 +163,21 @@ def crlf_space_check(
155163
and (output + text)[-1] != " "
156164
):
157165
text += " "
166+
if visitor_text is not None:
167+
visitor_text(
168+
" ",
169+
cm_matrix,
170+
tm_matrix,
171+
cmap[3],
172+
font_size,
173+
)
158174
elif orientation == 90:
159175
if delta_x > 0.8 * f:
160176
if (output + text)[-1] != "\n":
161177
output += text + "\n"
162178
if visitor_text is not None:
163179
visitor_text(
164-
text + "\n",
180+
"\n",
165181
memo_cm,
166182
memo_tm,
167183
cmap[3],
@@ -180,7 +196,7 @@ def crlf_space_check(
180196
output += text + "\n"
181197
if visitor_text is not None:
182198
visitor_text(
183-
text + "\n",
199+
"\n",
184200
memo_cm,
185201
memo_tm,
186202
cmap[3],
@@ -193,6 +209,14 @@ def crlf_space_check(
193209
and (output + text)[-1] != " "
194210
):
195211
text += " "
212+
if visitor_text is not None:
213+
visitor_text(
214+
" ",
215+
cm_matrix,
216+
tm_matrix,
217+
cmap[3],
218+
font_size,
219+
)
196220
except Exception:
197221
pass
198222
tm_prev = tm_matrix.copy()
@@ -214,12 +238,13 @@ def handle_tj(
214238
rtl_dir: bool,
215239
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
216240
) -> Tuple[str, bool]:
217-
218241
m = mult(tm_matrix, cm_matrix)
219242
orientation = orient(m)
220243
if orientation in orientations and len(operands) > 0:
221244
if isinstance(operands[0], str):
222245
text += operands[0]
246+
if visitor_text is not None:
247+
visitor_text(operands[0], cm_matrix, tm_matrix, cmap[3], font_size)
223248
else:
224249
t: str = ""
225250
tt: bytes = (
@@ -243,6 +268,7 @@ def handle_tj(
243268
[cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt]
244269
)
245270
# "\u0590 - \u08FF \uFB50 - \uFDFF"
271+
tj_text = ""
246272
for x in [cmap[1][x] if x in cmap[1] else x for x in t]:
247273
# x can be a sequence of bytes ; ex: habibi.pdf
248274
if len(x) == 1:
@@ -258,7 +284,7 @@ def handle_tj(
258284
or 0x20A0 <= xx <= 0x21FF # but (numbers) indices/exponents
259285
or xx in CUSTOM_RTL_SPECIAL_CHARS # customized....
260286
):
261-
text = x + text if rtl_dir else text + x
287+
tj_text = x + tj_text if rtl_dir else tj_text + x
262288
elif ( # right-to-left characters set
263289
0x0590 <= xx <= 0x08FF
264290
or 0xFB1D <= xx <= 0xFDFF
@@ -280,6 +306,9 @@ def handle_tj(
280306
if visitor_text is not None:
281307
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
282308
text = ""
283-
text = text + x
309+
tj_text = tj_text + x
284310
# fmt: on
311+
text = tj_text + text if rtl_dir else text + tj_text
312+
if visitor_text is not None:
313+
visitor_text(tj_text, cm_matrix, tm_matrix, cmap[3], font_size)
285314
return text, rtl_dir

tests/test_page.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Test the pypdf._page module."""
22
import json
33
import math
4+
import re
45
from copy import deepcopy
56
from io import BytesIO
67
from pathlib import Path
@@ -545,7 +546,7 @@ def print_op_b(op, args, cm_matrix, tm_matrix) -> None:
545546
rectangles.append(r)
546547

547548
def print_visi(text, cm_matrix, tm_matrix, font_dict, font_size) -> None:
548-
if text.strip() != "":
549+
if text != "":
549550
if logger.isEnabledFor(logging.DEBUG):
550551
logger.debug(f"at {cm_matrix}, {tm_matrix}, font size={font_size}")
551552
texts.append(
@@ -571,7 +572,7 @@ def extract_table(
571572
572573
It is expected that each cell is marked by a rectangle-object.
573574
It is expected that the page contains one table only.
574-
It is expected that the table contains at least 3 columns and 2 rows.
575+
It is expected that the table contains at least 2 columns and 2 rows.
575576
576577
A list of rows is returned.
577578
Each row contains a list of cells.
@@ -623,8 +624,8 @@ def extract_table(
623624
curr_y = None
624625
curr_row = None
625626
for r in rectangles_filtered:
626-
if col2count[r.x] < 3 or row2count[r.y] < 2:
627-
# We expect at least 3 columns and 2 rows.
627+
if col2count[r.x] < 2 or row2count[r.y] < 2:
628+
# We expect at least 2 columns and 2 rows.
628629
continue
629630
if curr_y is None or r.y != curr_y:
630631
# next row
@@ -646,7 +647,8 @@ def extract_table(
646647

647648
def extract_cell_text(cell_texts: List[PositionedText]) -> str:
648649
"""Joins the text-objects of a cell."""
649-
return ("".join(t.text for t in cell_texts)).strip()
650+
text_raw = "".join(t.text for t in cell_texts)
651+
return re.sub(r" +\n", "\n", text_raw.strip())
650652

651653
# Test 1: We test the analysis of page 7 "2.1 LRS model".
652654
reader = PdfReader(RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf")
@@ -667,12 +669,16 @@ def ignore_large_rectangles(r) -> bool:
667669
for t in texts:
668670
for r in rectangles:
669671
if r.contains(t.x, t.y):
670-
texts = rectangle2texts.setdefault(r, [])
671-
texts.append(t.text.strip())
672+
rtexts = rectangle2texts.setdefault(r, [])
673+
if t.text != "":
674+
rtexts.append(t.text)
672675
break
673676
# Five boxes and the figure-description below.
674-
assert len(rectangle2texts) == 6
675-
box_texts = [" ".join(texts) for texts in rectangle2texts.values()]
677+
assert len(rectangle2texts) == 11
678+
box_texts = [
679+
re.sub(" *\n", " ", "".join(texts).strip())
680+
for texts in rectangle2texts.values()
681+
]
676682
assert "Hydro Network" in box_texts
677683
assert "Hydro Events" in box_texts
678684
assert "Metadata" in box_texts
@@ -697,10 +703,10 @@ def filter_first_table(r) -> bool:
697703
assert extract_cell_text(rows[0][2]) == "Description"
698704
assert extract_cell_text(rows[1][0]) == "September 2002"
699705
# The line break between "English review;"
700-
# and "Remove" is not detected.
706+
# and "Remove" is detected.
701707
assert (
702708
extract_cell_text(rows[6][2])
703-
== "English review;Remove the UML model for the Segmented view."
709+
== "English review;\nRemove the UML model for the Segmented view."
704710
)
705711
assert extract_cell_text(rows[7][2]) == "Update from the March Workshop comments."
706712

@@ -738,6 +744,16 @@ def visitor_td(op, args, cm, tm) -> None:
738744
assert list_td[2] == (210.0, 210.0)
739745
assert list_td[3] == (410.0, 210.0)
740746

747+
# Test 3b: check extract_visitor in Sample_Td-matrix.pdf
748+
#
749+
(texts, rectangles) = extract_text_and_rectangles(page_td_model)
750+
rows = extract_table(texts, rectangles)
751+
assert len(rows) == 2
752+
assert extract_cell_text(rows[0][0]) == "Hello PDF!"
753+
assert extract_cell_text(rows[0][1]) == "Hello PDF 200 0 Td!"
754+
assert extract_cell_text(rows[1][0]) == "Hello PDF 2 1!"
755+
assert extract_cell_text(rows[1][1]) == "Hello PDF 10 7!"
756+
741757

742758
@pytest.mark.parametrize(
743759
("pdf_path", "password", "embedded", "unembedded"),

0 commit comments

Comments
 (0)