Skip to content

Commit 76ca472

Browse files
committed
MAINT: Move code from _page.py to _text_extraction
1 parent 323e649 commit 76ca472

File tree

2 files changed

+122
-71
lines changed

2 files changed

+122
-71
lines changed

pypdf/_page.py

Lines changed: 4 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@
6565
get_text_operands,
6666
mult,
6767
)
68+
from ._text_extraction._text_extractor import TextExtraction
6869
from ._utils import (
6970
CompressedTransformationMatrix,
7071
TransformationMatrixType,
@@ -1662,77 +1663,8 @@ def _debug_for_extract(self) -> str: # pragma: no cover
16621663
out += "No Font\n"
16631664
return out
16641665

1665-
def _get_actual_font_widths(
    self,
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    text_operands: str,
    font_size: float,
    space_width: float
) -> Tuple[float, float, float]:
    """
    Measure ``text_operands`` using the width map of the font in ``cmap``.

    The width map, the key used for the space character and the width of a
    space are computed once per font name (``cmap[2]``) and kept in
    ``self._font_width_maps``; later calls for the same font name reuse the
    stored values, whatever ``space_width`` they are given.

    Args:
        cmap: Font tuple. ``cmap[2]`` is used as the font name and cache key,
            ``cmap[3]`` (may be ``None``) is handed to
            ``build_font_width_map``, ``cmap[0]`` and ``cmap[1]`` are handed
            to ``get_actual_str_key``.
        text_operands: The characters to measure.
        font_size: Factor applied to the returned widths.
        space_width: Fallback width of a space; twice this value is used as
            the default width when building the width map.

    Returns:
        ``(summed width * font_size, space_width * font_size, font_size)``.
        The second element is based on the ``space_width`` argument, not on
        the cached space width.
    """
    name: str = cmap[2]
    if name not in self._font_width_maps:
        font_resource = cmap[3]
        if font_resource is not None:
            space_key = get_actual_str_key(" ", cmap[0], cmap[1])
            widths: Dict[Any, float] = build_font_width_map(font_resource, space_width * 2)
            space_w: float = compute_font_width(widths, space_key)
            if space_w == 0:
                # The font reports no width for a space: use the caller's value.
                space_w = space_width
        else:
            # Nothing to build a width map from: only a default entry exists.
            space_key = " "
            space_w = space_width
            widths = {"default": space_w * 2}
        self._font_width_maps[name] = (widths, space_key, space_w)

    widths, space_key, space_w = self._font_width_maps[name]

    total: float = 0
    if text_operands:
        for ch in text_operands:
            total += space_w if ch == space_key else compute_font_width(widths, ch)
    return (total * font_size, space_width * font_size, font_size)
17001666

1701-
def _handle_tj(
    self,
    text: str,
    operands: List[Union[str, TextStringObject]],
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    space_width: float,
    actual_str_size: Dict[str, float]
) -> Tuple[str, bool, Dict[str, float]]:
    """
    Process the operands of a ``Tj`` (show text) operation.

    The operands are resolved with ``get_text_operands``. When that helper
    flags the result as string operands it is appended to ``text`` directly;
    otherwise ``get_display_str`` produces the new ``text`` and ``rtl_dir``.
    In both cases the measured size of the operands is recorded in
    ``actual_str_size``.

    Args:
        text: Text accumulated so far.
        operands: Operands of the operation.
        cm_matrix: Forwarded to the helpers.
        tm_matrix: Text matrix, forwarded to the helpers.
        cmap: Font tuple, forwarded to the helpers and to
            ``_get_actual_font_widths``.
        orientations: Forwarded to ``get_text_operands``.
        font_size: Current font size.
        rtl_dir: Current right-to-left flag.
        visitor_text: Optional callback forwarded to ``get_display_str``.
        space_width: Width of a space, forwarded to
            ``_get_actual_font_widths``.
        actual_str_size: Modified in place: ``"space_width"`` and
            ``"str_height"`` are overwritten, ``"str_widths"`` (which must
            already exist) is incremented.

    Returns:
        ``(text, rtl_dir, actual_str_size)`` where the last item is the very
        dictionary that was passed in.
    """
    shown, plain_append = get_text_operands(
        operands, cm_matrix, tm_matrix, cmap, orientations
    )
    if not plain_append:
        text, rtl_dir = get_display_str(
            text, cm_matrix, tm_matrix, cmap, shown, font_size, rtl_dir, visitor_text
        )
    else:
        text += shown

    width, scaled_space, height = self._get_actual_font_widths(
        cmap, shown, font_size, space_width
    )
    actual_str_size["space_width"] = scaled_space
    actual_str_size["str_height"] = height
    actual_str_size["str_widths"] += width

    return text, rtl_dir, actual_str_size
1667+
17361668

17371669
def _extract_text(
17381670
self,
@@ -1754,6 +1686,7 @@ def _extract_text(
17541686
default = "/Content"
17551687
17561688
"""
1689+
extractor = TextExtraction()
17571690
text: str = ""
17581691
output: str = ""
17591692
rtl_dir: bool = False # right-to-left
@@ -1960,7 +1893,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
19601893
str_widths = compute_str_widths(_actual_str_size["str_widths"])
19611894
_actual_str_size["str_widths"] = 0.0
19621895
elif operator == b"Tj": # Show text
1963-
text, rtl_dir, _actual_str_size = self._handle_tj(
1896+
text, rtl_dir, _actual_str_size = extractor._handle_tj(
19641897
text,
19651898
operands,
19661899
cm_matrix,
Lines changed: 118 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,118 @@
1+
# Copyright (c) 2006, Mathieu Fenniak
2+
# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3+
#
4+
# All rights reserved.
5+
#
6+
# Redistribution and use in source and binary forms, with or without
7+
# modification, are permitted provided that the following conditions are
8+
# met:
9+
#
10+
# * Redistributions of source code must retain the above copyright notice,
11+
# this list of conditions and the following disclaimer.
12+
# * Redistributions in binary form must reproduce the above copyright notice,
13+
# this list of conditions and the following disclaimer in the documentation
14+
# and/or other materials provided with the distribution.
15+
# * The name of the author may not be used to endorse or promote products
16+
# derived from this software without specific prior written permission.
17+
#
18+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21+
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22+
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23+
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24+
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25+
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27+
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28+
# POSSIBILITY OF SUCH DAMAGE.
29+
30+
from typing import Tuple, Union, Dict, Optional, Any, List, Callable
31+
32+
from .._cmap import get_actual_str_key, build_font_width_map, compute_font_width
33+
from ..generic import DictionaryObject, TextStringObject
34+
from . import get_text_operands, get_display_str
35+
36+
class TextExtraction:
    """
    Helper object for extracting text from a PDF content stream.

    An instance keeps a per-font cache of character widths and provides the
    handler for the ``Tj`` (show text) operator. One instance is meant to be
    created per extraction run so that the cache is not shared between runs.
    """

    def __init__(self) -> None:
        # font name -> (character width map, key of the space character,
        #               width used for a space)
        self._font_width_maps: Dict[str, Tuple[Dict[Any, float], str, float]] = {}

    def _get_actual_font_widths(
        self,
        cmap: Tuple[
            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
        ],
        text_operands: str,
        font_size: float,
        space_width: float,
    ) -> Tuple[float, float, float]:
        """
        Measure ``text_operands`` using the width map of the font in ``cmap``.

        The width data of a font is computed on first use and cached under the
        font name (``cmap[2]``). Later calls for the same font name reuse the
        cached data, including the space width derived from the
        ``space_width`` passed on that first call.

        Args:
            cmap: Font tuple. ``cmap[2]`` is the font name used as cache key,
                ``cmap[3]`` (may be ``None``) is passed to
                ``build_font_width_map``, ``cmap[0]`` and ``cmap[1]`` are
                passed to ``get_actual_str_key`` to find the space character.
            text_operands: The characters to measure.
            font_size: Factor applied to the returned widths.
            space_width: Fallback width of a space; twice this value serves
                as the default character width of the width map.

        Returns:
            ``(summed width * font_size, space_width * font_size, font_size)``.
            The second element is derived from the ``space_width`` argument,
            not from the cached space width.
        """
        font_name: str = cmap[2]
        cached = self._font_width_maps.get(font_name)
        if cached is None:
            font_width_map: Dict[Any, float]
            actual_space_width: float
            if cmap[3] is None:
                # Nothing to build a width map from: only a default exists.
                space_char = " "
                actual_space_width = space_width
                font_width_map = {"default": actual_space_width * 2}
            else:
                space_char = get_actual_str_key(" ", cmap[0], cmap[1])
                font_width_map = build_font_width_map(cmap[3], space_width * 2)
                actual_space_width = compute_font_width(font_width_map, space_char)
                if actual_space_width == 0:
                    # The font gives no usable width for a space.
                    actual_space_width = space_width
            cached = (font_width_map, space_char, actual_space_width)
            self._font_width_maps[font_name] = cached
        font_width_map, space_char, actual_space_width = cached

        font_widths: float = 0
        if text_operands:
            for char in text_operands:
                if char == space_char:
                    font_widths += actual_space_width
                else:
                    font_widths += compute_font_width(font_width_map, char)
        return (font_widths * font_size, space_width * font_size, font_size)

    def _handle_tj(
        self,
        text: str,
        operands: List[Union[str, TextStringObject]],
        cm_matrix: List[float],
        tm_matrix: List[float],
        cmap: Tuple[
            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
        ],
        orientations: Tuple[int, ...],
        font_size: float,
        rtl_dir: bool,
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
        space_width: float,
        actual_str_size: Dict[str, float],
    ) -> Tuple[str, bool, Dict[str, float]]:
        """
        Process the operands of a ``Tj`` (show text) operation.

        The operands are resolved with ``get_text_operands``. If that helper
        reports string operands, the result is appended to ``text`` as is;
        otherwise ``get_display_str`` computes the new ``text`` and
        ``rtl_dir``. In both cases the size of the operands is measured and
        recorded in ``actual_str_size``.

        Args:
            text: Text accumulated so far.
            operands: Operands of the operation.
            cm_matrix: Forwarded to the helpers.
            tm_matrix: Text matrix, forwarded to the helpers.
            cmap: Font tuple, forwarded to the helpers and to
                ``_get_actual_font_widths``.
            orientations: Forwarded to ``get_text_operands``.
            font_size: Current font size.
            rtl_dir: Current right-to-left flag.
            visitor_text: Optional callback forwarded to ``get_display_str``.
            space_width: Width of a space, forwarded to
                ``_get_actual_font_widths``.
            actual_str_size: Updated in place: ``"space_width"`` and
                ``"str_height"`` are overwritten and ``"str_widths"`` (which
                must already be present) is incremented.

        Returns:
            ``(text, rtl_dir, actual_str_size)``; the last item is the same
            dictionary object that was passed in.
        """
        text_operands, is_str_operands = get_text_operands(
            operands, cm_matrix, tm_matrix, cmap, orientations
        )
        if is_str_operands:
            text += text_operands
        else:
            text, rtl_dir = get_display_str(
                text,
                cm_matrix,
                tm_matrix,  # text matrix
                cmap,
                text_operands,
                font_size,
                rtl_dir,
                visitor_text,
            )

        font_widths, scaled_space_width, str_height = self._get_actual_font_widths(
            cmap, text_operands, font_size, space_width
        )
        actual_str_size["space_width"] = scaled_space_width
        actual_str_size["str_height"] = str_height
        actual_str_size["str_widths"] += font_widths

        return text, rtl_dir, actual_str_size

0 commit comments

Comments
 (0)