Skip to content

Commit bcd85c4

Browse files
authored
BUG: invalid cm/tm in visitor functions (#2206)
Reworks and is still valid to close #2059 Closes #2200 Closes #2075
1 parent 126f6be commit bcd85c4

File tree

6 files changed

+171
-45
lines changed

6 files changed

+171
-45
lines changed

docs/user/extract-text.md

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,27 @@ Refer to [extract\_text](../modules/PageObject.html#pypdf._page.PageObject.extra
2727
You can use visitor-functions to control which part of a page you want to process and extract. The visitor-functions you provide will get called for each operator or for each text fragment.
2828

2929
The function provided in argument visitor_text of function extract_text has five arguments:
30-
text, current transformation matrix, text matrix, font-dictionary and font-size.
31-
In most cases the x and y coordinates of the current position
32-
are in index 4 and 5 of the current transformation matrix.
30+
* text: the current text (as long as possible, can be up to a full line)
31+
* user_matrix: current matrix to move from user coordinate space (also known as CTM)
32+
* tm_matrix: current matrix from text coordinate space
33+
* font-dictionary: full font dictionary
34+
* font-size: the size (in text coordinate space)
35+
36+
Each matrix stores 6 parameters. The first 4 provide the rotation/scaling matrix and the last two provide the translation (horizontal/vertical).
37+
It is recommended to use the user_matrix as it takes into account all transformations.
38+
39+
Notes :
40+
41+
- as indicated in the PDF 1.7 reference, page 204 the user matrix applies to text space/image space/form space/pattern space.
42+
- if you want to get the full transformation from text to user space, you can use the `mult` function (available as a top-level import: `from pypdf import mult`) as follows:
43+
`txt2user = mult(tm, cm)`
44+
The font-size is the raw text size, which is affected by the `user_matrix`.
45+
3346

3447
The font-dictionary may be None in case of unknown fonts.
3548
If not None it may e.g. contain key "/BaseFont" with value "/Arial,Bold".
3649

37-
**Caveat**: In complicated documents the calculated positions might be wrong.
50+
**Caveat**: In complicated documents the calculated positions may be difficult to determine (for example, if you move from multiple forms to page user space).
3851

3952
The function provided in argument visitor_operand_before has four arguments:
4053
operator, operand-arguments, current transformation matrix and text matrix.
@@ -53,7 +66,7 @@ parts = []
5366

5467

5568
def visitor_body(text, cm, tm, font_dict, font_size):
56-
y = tm[5]
69+
y = cm[5]
5770
if y > 50 and y < 720:
5871
parts.append(text)
5972

@@ -88,7 +101,7 @@ def visitor_svg_rect(op, args, cm, tm):
88101

89102

90103
def visitor_svg_text(text, cm, tm, fontDict, fontSize):
91-
(x, y) = (tm[4], tm[5])
104+
(x, y) = (cm[4], cm[5])
92105
dwg.add(dwg.text(text, insert=(x, y), fill="blue"))
93106

94107

pypdf/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from ._crypt_providers import crypt_provider
1111
from ._encryption import PasswordType
1212
from ._merger import PdfFileMerger, PdfMerger
13-
from ._page import PageObject, Transformation
13+
from ._page import PageObject, Transformation, mult
1414
from ._reader import DocumentInformation, PdfFileReader, PdfReader
1515
from ._version import __version__
1616
from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter
@@ -31,6 +31,7 @@
3131
__all__ = [
3232
"__version__",
3333
"_debug_versions",
34+
"mult",
3435
"PageRange",
3536
"PaperSize",
3637
"DocumentInformation",

pypdf/_page.py

Lines changed: 47 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1921,18 +1921,17 @@ def _extract_text(
19211921
# are strings where the byte->string encoding was unknown, so adding
19221922
# them to the text here would be gibberish.
19231923

1924-
cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
19251924
cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
19261925
cm_stack = []
19271926
tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1928-
tm_prev: List[float] = [
1929-
1.0,
1930-
0.0,
1931-
0.0,
1932-
1.0,
1933-
0.0,
1934-
0.0,
1935-
] # will store previous tm_matrix
1927+
1928+
# cm_prev/tm_prev store the last modified matrices; these can be an intermediate position
1929+
cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1930+
tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1931+
1932+
# memo_cm/tm will be used to store the position at the beginning of building the text
1933+
memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1934+
memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
19361935
char_scale = 1.0
19371936
space_scale = 1.0
19381937
_space_width: float = 500.0 # will be set correctly at first Tf
@@ -1943,9 +1942,9 @@ def current_spacewidth() -> float:
19431942
return _space_width / 1000.0
19441943

19451944
def process_operation(operator: bytes, operands: List) -> None:
1946-
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, output, text
1945+
nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
19471946
nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
1948-
nonlocal orientations, rtl_dir, visitor_text
1947+
nonlocal orientations, rtl_dir, visitor_text, output, text
19491948
global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
19501949

19511950
check_crlf_space: bool = False
@@ -1954,14 +1953,18 @@ def process_operation(operator: bytes, operands: List) -> None:
19541953
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
19551954
output += text
19561955
if visitor_text is not None:
1957-
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
1956+
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
19581957
text = ""
1958+
memo_cm = cm_matrix.copy()
1959+
memo_tm = tm_matrix.copy()
19591960
return None
19601961
elif operator == b"ET":
19611962
output += text
19621963
if visitor_text is not None:
1963-
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
1964+
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
19641965
text = ""
1966+
memo_cm = cm_matrix.copy()
1967+
memo_tm = tm_matrix.copy()
19651968
# table 4.7 "Graphics state operators", page 219
19661969
# cm_matrix calculation is a reserved for the moment
19671970
elif operator == b"q":
@@ -1992,7 +1995,7 @@ def process_operation(operator: bytes, operands: List) -> None:
19921995
elif operator == b"cm":
19931996
output += text
19941997
if visitor_text is not None:
1995-
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
1998+
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
19961999
text = ""
19972000
cm_matrix = mult(
19982001
[
@@ -2005,6 +2008,8 @@ def process_operation(operator: bytes, operands: List) -> None:
20052008
],
20062009
cm_matrix,
20072010
)
2011+
memo_cm = cm_matrix.copy()
2012+
memo_tm = tm_matrix.copy()
20082013
# Table 5.2 page 398
20092014
elif operator == b"Tz":
20102015
char_scale = float(operands[0]) / 100.0
@@ -2016,8 +2021,10 @@ def process_operation(operator: bytes, operands: List) -> None:
20162021
if text != "":
20172022
output += text # .translate(cmap)
20182023
if visitor_text is not None:
2019-
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
2024+
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
20202025
text = ""
2026+
memo_cm = cm_matrix.copy()
2027+
memo_tm = tm_matrix.copy()
20212028
try:
20222029
# charMapTuple: font_type, float(sp_width / 2), encoding,
20232030
# map_dict, font-dictionary
@@ -2088,17 +2095,19 @@ def process_operation(operator: bytes, operands: List) -> None:
20882095
try:
20892096
text, output, cm_prev, tm_prev = crlf_space_check(
20902097
text,
2091-
cm_prev,
2092-
tm_prev,
2093-
cm_matrix,
2094-
tm_matrix,
2098+
(cm_prev, tm_prev),
2099+
(cm_matrix, tm_matrix),
2100+
(memo_cm, memo_tm),
20952101
cmap,
20962102
orientations,
20972103
output,
20982104
font_size,
20992105
visitor_text,
21002106
current_spacewidth(),
21012107
)
2108+
if text == "":
2109+
memo_cm = cm_matrix.copy()
2110+
memo_tm = tm_matrix.copy()
21022111
except OrientationNotFoundError:
21032112
return None
21042113

@@ -2130,12 +2139,18 @@ def process_operation(operator: bytes, operands: List) -> None:
21302139
elif operator == b"Do":
21312140
output += text
21322141
if visitor_text is not None:
2133-
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
2142+
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
21342143
try:
21352144
if output[-1] != "\n":
21362145
output += "\n"
21372146
if visitor_text is not None:
2138-
visitor_text("\n", cm_matrix, tm_matrix, cmap[3], font_size)
2147+
visitor_text(
2148+
"\n",
2149+
memo_cm,
2150+
memo_tm,
2151+
cmap[3],
2152+
font_size,
2153+
)
21392154
except IndexError:
21402155
pass
21412156
try:
@@ -2151,21 +2166,30 @@ def process_operation(operator: bytes, operands: List) -> None:
21512166
)
21522167
output += text
21532168
if visitor_text is not None:
2154-
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
2169+
visitor_text(
2170+
text,
2171+
memo_cm,
2172+
memo_tm,
2173+
cmap[3],
2174+
font_size,
2175+
)
21552176
except Exception:
21562177
logger_warning(
21572178
f" impossible to decode XFormObject {operands[0]}",
21582179
__name__,
21592180
)
21602181
finally:
21612182
text = ""
2183+
memo_cm = cm_matrix.copy()
2184+
memo_tm = tm_matrix.copy()
2185+
21622186
else:
21632187
process_operation(operator, operands)
21642188
if visitor_operand_after is not None:
21652189
visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
21662190
output += text # just in case of
21672191
if text != "" and visitor_text is not None:
2168-
visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
2192+
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
21692193
return output
21702194

21712195
def extract_text(

pypdf/_text_extraction/__init__.py

Lines changed: 19 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,9 @@ def orient(m: List[float]) -> int:
8787

8888
def crlf_space_check(
8989
text: str,
90-
cm_prev: List[float],
91-
tm_prev: List[float],
92-
cm_matrix: List[float],
93-
tm_matrix: List[float],
90+
cmtm_prev: Tuple[List[float], List[float]],
91+
cmtm_matrix: Tuple[List[float], List[float]],
92+
memo_cmtm: Tuple[List[float], List[float]],
9493
cmap: Tuple[
9594
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
9695
],
@@ -100,13 +99,21 @@ def crlf_space_check(
10099
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
101100
spacewidth: float,
102101
) -> Tuple[str, str, List[float], List[float]]:
102+
cm_prev = cmtm_prev[0]
103+
tm_prev = cmtm_prev[1]
104+
cm_matrix = cmtm_matrix[0]
105+
tm_matrix = cmtm_matrix[1]
106+
memo_cm = memo_cmtm[0]
107+
memo_tm = memo_cmtm[1]
108+
103109
m_prev = mult(tm_prev, cm_prev)
104110
m = mult(tm_matrix, cm_matrix)
105111
orientation = orient(m)
106112
delta_x = m[4] - m_prev[4]
107113
delta_y = m[5] - m_prev[5]
108114
k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
109115
f = font_size * k
116+
cm_prev = m
110117
if orientation not in orientations:
111118
raise OrientationNotFoundError
112119
try:
@@ -117,8 +124,8 @@ def crlf_space_check(
117124
if visitor_text is not None:
118125
visitor_text(
119126
text + "\n",
120-
cm_prev,
121-
tm_prev,
127+
memo_cm,
128+
memo_tm,
122129
cmap[3],
123130
font_size,
124131
)
@@ -136,8 +143,8 @@ def crlf_space_check(
136143
if visitor_text is not None:
137144
visitor_text(
138145
text + "\n",
139-
cm_prev,
140-
tm_prev,
146+
memo_cm,
147+
memo_tm,
141148
cmap[3],
142149
font_size,
143150
)
@@ -155,8 +162,8 @@ def crlf_space_check(
155162
if visitor_text is not None:
156163
visitor_text(
157164
text + "\n",
158-
cm_prev,
159-
tm_prev,
165+
memo_cm,
166+
memo_tm,
160167
cmap[3],
161168
font_size,
162169
)
@@ -174,8 +181,8 @@ def crlf_space_check(
174181
if visitor_text is not None:
175182
visitor_text(
176183
text + "\n",
177-
cm_prev,
178-
tm_prev,
184+
memo_cm,
185+
memo_tm,
179186
cmap[3],
180187
font_size,
181188
)

tests/test_page.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1288,3 +1288,81 @@ def test_get_contents_from_nullobject():
12881288
p = writer.add_blank_page(100, 100)
12891289
p[NameObject("/Contents")] = writer._add_object(NullObject())
12901290
p.get_contents()
1291+
1292+
1293+
@pytest.mark.enable_socket()
1294+
def test_pos_text_in_textvisitor():
1295+
"""See #2200"""
1296+
url = "https://github.com/py-pdf/pypdf/files/12675974/page_178.pdf"
1297+
name = "test_text_pos.pdf"
1298+
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
1299+
p = ()
1300+
1301+
def visitor_body2(text, cm, tm, fontdict, fontsize) -> None:
1302+
nonlocal p
1303+
if text.startswith("5425."):
1304+
p = (tm[4], tm[5])
1305+
1306+
reader.pages[0].extract_text(visitor_text=visitor_body2)
1307+
assert abs(p[0] - 323.5) < 0.1
1308+
assert abs(p[1] - 457.4) < 0.1
1309+
1310+
1311+
@pytest.mark.enable_socket()
1312+
def test_pos_text_in_textvisitor2():
1313+
"""See #2075"""
1314+
url = "https://github.com/py-pdf/pypdf/files/12318042/LegIndex-page6.pdf"
1315+
name = "LegIndex-page6.pdf"
1316+
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
1317+
x_lvl = 26
1318+
lst = []
1319+
1320+
def visitor_lvl(text, cm, tm, fontdict, fontsize) -> None:
1321+
nonlocal x_lvl, lst
1322+
if abs(tm[4] - x_lvl) < 2 and tm[5] < 740 and tm[5] > 210:
1323+
lst.append(text.strip(" \n"))
1324+
1325+
reader.pages[0].extract_text(visitor_text=visitor_lvl)
1326+
assert lst == [
1327+
"ACUPUNCTURE BOARD",
1328+
"ACUPUNCTURISTS AND ACUPUNCTURE",
1329+
"ADMINISTRATIVE LAW AND PROCEDURE",
1330+
"ADMINISTRATIVE LAW, OFFICE OF",
1331+
"ADOPTION",
1332+
"ADULT EDUCATION",
1333+
"ADVERTISING. See also MARKETING; and particular subject matter (e.g.,",
1334+
]
1335+
x_lvl = 35
1336+
lst = []
1337+
reader.pages[0].extract_text(visitor_text=visitor_lvl)
1338+
assert lst == [
1339+
"members, AB 1264",
1340+
"assistants, acupuncture, AB 1264",
1341+
"complaints, investigations, etc., AB 1264",
1342+
"day, california acupuncture, HR 48",
1343+
"massage services, asian, AB 1264",
1344+
"supervising acupuncturists, AB 1264",
1345+
"supportive acupuncture services, basic, AB 1264",
1346+
"rules and regulations—",
1347+
"professional assistants and employees: employment and compensation, AB 916",
1348+
"adults, adoption of, AB 1756",
1349+
"agencies, organizations, etc.: requirements, prohibitions, etc., SB 807",
1350+
"assistance programs, adoption: nonminor dependents, SB 9",
1351+
"birth certificates, AB 1302",
1352+
"contact agreements, postadoption—",
1353+
"facilitators, adoption, AB 120",
1354+
"failed adoptions: reproductive loss leave, SB 848",
1355+
"hearings, adoption finalization: remote proceedings, technology, etc., SB 21",
1356+
"native american tribes, AB 120",
1357+
"parental rights, reinstatement of, AB 20",
1358+
"parents, prospective adoptive: criminal background checks, SB 824",
1359+
"services, adult educational, SB 877",
1360+
"week, adult education, ACR 31",
1361+
"alcoholic beverages: tied-house restrictions, AB 546",
1362+
"campaign re social equity, civil rights, etc., SB 447",
1363+
"cannabis, AB 794",
1364+
"elections. See ELECTIONS.",
1365+
"false, misleading, etc., advertising—",
1366+
"hotels, short-term rentals, etc., advertised rates: mandatory fee disclosures, SB 683",
1367+
"housing rental properties advertised rates: disclosures, SB 611",
1368+
]

0 commit comments

Comments
 (0)