Skip to content

Commit da7f0af

Browse files
MAINT: Move text extraction state to TextExtraction class (#3350)
---------

Co-authored-by: Stefan <96178532+stefan6419846@users.noreply.github.com>
1 parent 6b52a0d commit da7f0af

File tree

4 files changed: 312 additions (+312) and 254 deletions (-254).

pypdf/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,8 +11,9 @@
 from ._doc_common import DocumentInformation
 from ._encryption import PasswordType
 from ._merger import PdfMerger
-from ._page import PageObject, Transformation, mult
+from ._page import PageObject, Transformation
 from ._reader import PdfReader
+from ._text_extraction import mult
 from ._version import __version__
 from ._writer import ObjectDeletionFlag, PdfWriter
 from .constants import ImageType

pypdf/_page.py

Lines changed: 51 additions & 248 deletions
Original file line number | Diff line number | Diff line change
@@ -51,14 +51,10 @@
 
 from ._cmap import (
     build_char_map,
-    unknown_char_map,
 )
 from ._protocols import PdfCommonDocProtocol
 from ._text_extraction import (
-    OrientationNotFoundError,
     _layout_mode,
-    crlf_space_check,
-    mult,
 )
 from ._text_extraction._text_extractor import TextExtraction
 from ._utils import (
@@ -1657,7 +1653,7 @@ def _debug_for_extract(self) -> str: # pragma: no cover
16571653
out += "No Font\n"
16581654
return out
16591655

1660-
def _extract_text( # noqa: C901, PLR0915 # Will be fixed soon.
1656+
def _extract_text(
16611657
self,
16621658
obj: Any,
16631659
pdf: Any,
@@ -1678,9 +1674,6 @@ def _extract_text( # noqa: C901, PLR0915 # Will be fixed soon.
16781674
16791675
"""
16801676
extractor = TextExtraction()
1681-
text: str = ""
1682-
output: str = ""
1683-
rtl_dir: bool = False # right-to-left
16841677
cmaps: Dict[
16851678
str,
16861679
Tuple[
@@ -1707,14 +1700,6 @@ def _extract_text( # noqa: C901, PLR0915 # Will be fixed soon.
17071700
cmaps[f] = build_char_map(f, space_width, obj)
17081701
except TypeError:
17091702
pass
1710-
cmap: Tuple[
1711-
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
1712-
] = (
1713-
"charmap",
1714-
{},
1715-
"NotInitialized",
1716-
None,
1717-
) # (encoding, CMAP, font resource name, font)
17181703

17191704
try:
17201705
content = (
@@ -1728,245 +1713,57 @@ def _extract_text( # noqa: C901, PLR0915 # Will be fixed soon.
17281713
# are strings where the byte->string encoding was unknown, so adding
17291714
# them to the text here would be gibberish.
17301715

1731-
cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1732-
tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1733-
cm_stack = []
1734-
1735-
# Store the last modified matrices; can be an intermediate position
1736-
cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1737-
tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1738-
1739-
# Store the position at the beginning of building the text
1740-
memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1741-
memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1742-
1743-
char_scale = 1.0
1744-
space_scale = 1.0
1745-
_space_width: float = 500.0 # will be set correctly at first Tf
1746-
_actual_str_size: Dict[str, float] = {
1747-
"str_widths": 0.0, "space_width": 0.0, "str_height": 0.0
1748-
} # will be set to string length calculation result
1749-
TL = 0.0
1750-
font_size = 12.0 # init just in case of
1751-
1752-
def compute_str_widths(str_widths: float) -> float:
1753-
return str_widths / 1000
1754-
1755-
def process_operation(operator: bytes, operands: List[Any]) -> None:
1756-
nonlocal cm_matrix, tm_matrix, cm_stack, cm_prev, tm_prev, memo_cm, memo_tm
1757-
nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
1758-
nonlocal orientations, rtl_dir, visitor_text, output, text, _actual_str_size
1759-
1760-
str_widths: float = 0.0
1761-
1762-
# Table 5.4 page 405
1763-
if operator == b"BT": # Begin Text
1764-
tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1765-
# Flush text:
1766-
output += text
1767-
if visitor_text is not None:
1768-
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
1769-
text = ""
1770-
memo_cm = cm_matrix.copy()
1771-
memo_tm = tm_matrix.copy()
1772-
return
1773-
if operator == b"ET": # End Text
1774-
# Flush text:
1775-
output += text
1776-
if visitor_text is not None:
1777-
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
1778-
text = ""
1779-
memo_cm = cm_matrix.copy()
1780-
memo_tm = tm_matrix.copy()
1781-
1782-
# Table 4.7 "Graphics state operators", page 219
1783-
# cm_matrix calculation is reserved for later
1784-
elif operator == b"q": # Save graphics state
1785-
cm_stack.append(
1786-
(
1787-
cm_matrix,
1788-
cmap,
1789-
font_size,
1790-
char_scale,
1791-
space_scale,
1792-
_space_width,
1793-
TL,
1794-
)
1795-
)
1796-
elif operator == b"Q": # Restore graphics state
1797-
try:
1798-
(
1799-
cm_matrix,
1800-
cmap,
1801-
font_size,
1802-
char_scale,
1803-
space_scale,
1804-
_space_width,
1805-
TL,
1806-
) = cm_stack.pop()
1807-
except Exception:
1808-
cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1809-
elif operator == b"cm": # Modify current matrix
1810-
output += text
1811-
if visitor_text is not None:
1812-
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
1813-
text = ""
1814-
try:
1815-
cm_matrix = mult(
1816-
[float(operand) for operand in operands[:6]],
1817-
cm_matrix
1818-
)
1819-
except Exception:
1820-
cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
1821-
memo_cm = cm_matrix.copy()
1822-
memo_tm = tm_matrix.copy()
1823-
1824-
# Table 5.2 page 398
1825-
elif operator == b"Tz": # Set horizontal text scaling
1826-
char_scale = float(operands[0]) / 100 if operands else 1.0
1827-
elif operator == b"Tw": # Set word spacing
1828-
space_scale = 1.0 + float(operands[0] if operands else 0.0)
1829-
elif operator == b"TL": # Set Text Leading
1830-
scale_x = math.sqrt(tm_matrix[0]**2 + tm_matrix[2]**2)
1831-
TL = float(operands[0] if operands else 0.0) * font_size * scale_x
1832-
elif operator == b"Tf": # Set font size
1833-
if text != "":
1834-
output += text # .translate(cmap)
1835-
if visitor_text is not None:
1836-
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
1837-
text = ""
1838-
memo_cm = cm_matrix.copy()
1839-
memo_tm = tm_matrix.copy()
1840-
try:
1841-
# char_map_tuple: font_type,
1842-
# float(sp_width / 2),
1843-
# encoding,
1844-
# map_dict,
1845-
# font_dict (describes the font)
1846-
char_map_tuple = cmaps[operands[0]]
1847-
# current cmap: encoding,
1848-
# map_dict,
1849-
# font resource name (internal name, not the real font name),
1850-
# font_dict
1851-
cmap = (
1852-
char_map_tuple[2],
1853-
char_map_tuple[3],
1854-
operands[0],
1855-
char_map_tuple[4],
1856-
)
1857-
_space_width = char_map_tuple[1]
1858-
except KeyError: # font not found
1859-
cmap = (
1860-
unknown_char_map[2],
1861-
unknown_char_map[3],
1862-
f"???{operands[0]}",
1863-
None,
1864-
)
1865-
_space_width = unknown_char_map[1]
1866-
try:
1867-
font_size = float(operands[1])
1868-
except Exception:
1869-
pass # keep previous size
1870-
# Table 5.5 page 406
1871-
elif operator == b"Td": # Move text position
1872-
# A special case is a translating only tm:
1873-
# tm = [1, 0, 0, 1, e, f]
1874-
# i.e. tm[4] += tx, tm[5] += ty.
1875-
tx, ty = float(operands[0]), float(operands[1])
1876-
tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
1877-
tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
1878-
str_widths = compute_str_widths(_actual_str_size["str_widths"])
1879-
_actual_str_size["str_widths"] = 0.0
1880-
elif operator == b"Tm": # Set text matrix
1881-
tm_matrix = [float(operand) for operand in operands[:6]]
1882-
str_widths = compute_str_widths(_actual_str_size["str_widths"])
1883-
_actual_str_size["str_widths"] = 0.0
1884-
elif operator == b"T*": # Move to next line
1885-
tm_matrix[4] -= TL * tm_matrix[2]
1886-
tm_matrix[5] -= TL * tm_matrix[3]
1887-
str_widths = compute_str_widths(_actual_str_size["str_widths"])
1888-
_actual_str_size["str_widths"] = 0.0
1889-
elif operator == b"Tj": # Show text
1890-
text, rtl_dir, _actual_str_size = extractor._handle_tj(
1891-
text,
1892-
operands,
1893-
cm_matrix,
1894-
tm_matrix,
1895-
cmap,
1896-
orientations,
1897-
font_size,
1898-
rtl_dir,
1899-
visitor_text,
1900-
_space_width,
1901-
_actual_str_size,
1902-
)
1903-
else:
1904-
return
1905-
1906-
if operator in {b"Td", b"Tm", b"T*", b"Tj"}:
1907-
try:
1908-
text, output, cm_prev, tm_prev = crlf_space_check(
1909-
text,
1910-
(cm_prev, tm_prev),
1911-
(cm_matrix, tm_matrix),
1912-
(memo_cm, memo_tm),
1913-
cmap,
1914-
orientations,
1915-
output,
1916-
font_size,
1917-
visitor_text,
1918-
str_widths,
1919-
compute_str_widths(_actual_str_size["space_width"]),
1920-
_actual_str_size["str_height"]
1921-
)
1922-
if text == "":
1923-
memo_cm = cm_matrix.copy()
1924-
memo_tm = tm_matrix.copy()
1925-
except OrientationNotFoundError:
1926-
return
1716+
# Initialize the extractor with the necessary parameters
1717+
extractor.initialize_extraction(orientations, visitor_text, cmaps)
19271718

19281719
for operands, operator in content.operations:
19291720
if visitor_operand_before is not None:
1930-
visitor_operand_before(operator, operands, cm_matrix, tm_matrix)
1721+
visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
19311722
# Multiple operators are handled here
19321723
if operator == b"'":
1933-
process_operation(b"T*", [])
1934-
process_operation(b"Tj", operands)
1724+
extractor.process_operation(b"T*", [])
1725+
extractor.process_operation(b"Tj", operands)
19351726
elif operator == b'"':
1936-
process_operation(b"Tw", [operands[0]])
1937-
process_operation(b"Tc", [operands[1]])
1938-
process_operation(b"T*", [])
1939-
process_operation(b"Tj", operands[2:])
1727+
extractor.process_operation(b"Tw", [operands[0]])
1728+
extractor.process_operation(b"Tc", [operands[1]])
1729+
extractor.process_operation(b"T*", [])
1730+
extractor.process_operation(b"Tj", operands[2:])
19401731
elif operator == b"TJ":
19411732
# The space width may be smaller than the font width, so the width should be 95%.
1942-
_confirm_space_width = _space_width * 0.95
1733+
_confirm_space_width = extractor._space_width * 0.95
19431734
if operands:
19441735
for op in operands[0]:
19451736
if isinstance(op, (str, bytes)):
1946-
process_operation(b"Tj", [op])
1737+
extractor.process_operation(b"Tj", [op])
19471738
if isinstance(op, (int, float, NumberObject, FloatObject)) and (
19481739
abs(float(op)) >= _confirm_space_width
1949-
and text
1950-
and text[-1] != " "
1740+
and extractor.text
1741+
and extractor.text[-1] != " "
19511742
):
1952-
process_operation(b"Tj", [" "])
1743+
extractor.process_operation(b"Tj", [" "])
19531744
elif operator == b"TD":
1954-
process_operation(b"TL", [-operands[1]])
1955-
process_operation(b"Td", operands)
1745+
extractor.process_operation(b"TL", [-operands[1]])
1746+
extractor.process_operation(b"Td", operands)
19561747
elif operator == b"Do":
1957-
output += text
1748+
extractor.output += extractor.text
19581749
if visitor_text is not None:
1959-
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
1750+
visitor_text(
1751+
extractor.text,
1752+
extractor.memo_cm,
1753+
extractor.memo_tm,
1754+
extractor.cmap[3],
1755+
extractor.font_size,
1756+
)
19601757
try:
1961-
if output[-1] != "\n":
1962-
output += "\n"
1758+
if extractor.output[-1] != "\n":
1759+
extractor.output += "\n"
19631760
if visitor_text is not None:
19641761
visitor_text(
19651762
"\n",
1966-
memo_cm,
1967-
memo_tm,
1968-
cmap[3],
1969-
font_size,
1763+
extractor.memo_cm,
1764+
extractor.memo_tm,
1765+
extractor.cmap[3],
1766+
extractor.font_size,
19701767
)
19711768
except IndexError:
19721769
pass
@@ -1981,32 +1778,38 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
19811778
visitor_operand_after,
19821779
visitor_text,
19831780
)
1984-
output += text
1781+
extractor.output += text
19851782
if visitor_text is not None:
19861783
visitor_text(
19871784
text,
1988-
memo_cm,
1989-
memo_tm,
1990-
cmap[3],
1991-
font_size,
1785+
extractor.memo_cm,
1786+
extractor.memo_tm,
1787+
extractor.cmap[3],
1788+
extractor.font_size,
19921789
)
19931790
except Exception as exception:
19941791
logger_warning(
19951792
f"Impossible to decode XFormObject {operands[0]}: {exception}",
19961793
__name__,
19971794
)
19981795
finally:
1999-
text = ""
2000-
memo_cm = cm_matrix.copy()
2001-
memo_tm = tm_matrix.copy()
1796+
extractor.text = ""
1797+
extractor.memo_cm = extractor.cm_matrix.copy()
1798+
extractor.memo_tm = extractor.tm_matrix.copy()
20021799
else:
2003-
process_operation(operator, operands)
1800+
extractor.process_operation(operator, operands)
20041801
if visitor_operand_after is not None:
2005-
visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
2006-
output += text # just in case
2007-
if text != "" and visitor_text is not None:
2008-
visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
2009-
return output
1802+
visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix)
1803+
extractor.output += extractor.text # just in case
1804+
if extractor.text != "" and visitor_text is not None:
1805+
visitor_text(
1806+
extractor.text,
1807+
extractor.memo_cm,
1808+
extractor.memo_tm,
1809+
extractor.cmap[3],
1810+
extractor.font_size,
1811+
)
1812+
return extractor.output
20101813

20111814
def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]:
20121815
"""

0 commit comments

Comments
 (0)