Skip to content

Commit af645a4

Browse files
authored
MAINT: Move code from _page.py to _text_extraction (#3343)
1 parent ffd406a commit af645a4

File tree

2 files changed

+123
-77
lines changed

2 files changed

+123
-77
lines changed

pypdf/_page.py

Lines changed: 4 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -51,20 +51,16 @@
5151

5252
from ._cmap import (
5353
build_char_map,
54-
build_font_width_map,
55-
compute_font_width,
56-
get_actual_str_key,
5754
unknown_char_map,
5855
)
5956
from ._protocols import PdfCommonDocProtocol
6057
from ._text_extraction import (
6158
OrientationNotFoundError,
6259
_layout_mode,
6360
crlf_space_check,
64-
get_display_str,
65-
get_text_operands,
6661
mult,
6762
)
63+
from ._text_extraction._text_extractor import TextExtraction
6864
from ._utils import (
6965
CompressedTransformationMatrix,
7066
TransformationMatrixType,
@@ -92,7 +88,6 @@
9288
PdfObject,
9389
RectangleObject,
9490
StreamObject,
95-
TextStringObject,
9691
is_null_or_none,
9792
)
9893

@@ -1662,77 +1657,8 @@ def _debug_for_extract(self) -> str: # pragma: no cover
16621657
out += "No Font\n"
16631658
return out
16641659

1665-
def _get_actual_font_widths(
1666-
self,
1667-
cmap: Tuple[
1668-
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
1669-
],
1670-
text_operands: str,
1671-
font_size: float,
1672-
space_width: float
1673-
) -> Tuple[float, float, float]:
1674-
font_widths: float = 0
1675-
font_name: str = cmap[2]
1676-
if font_name not in self._font_width_maps:
1677-
if cmap[3] is None:
1678-
font_width_map: Dict[Any, float] = {}
1679-
space_char = " "
1680-
actual_space_width: float = space_width
1681-
font_width_map["default"] = actual_space_width * 2
1682-
else:
1683-
space_char = get_actual_str_key(" ", cmap[0], cmap[1])
1684-
font_width_map = build_font_width_map(cmap[3], space_width * 2)
1685-
actual_space_width = compute_font_width(font_width_map, space_char)
1686-
if actual_space_width == 0:
1687-
actual_space_width = space_width
1688-
self._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width)
1689-
font_width_map = self._font_width_maps[font_name][0]
1690-
space_char = self._font_width_maps[font_name][1]
1691-
actual_space_width = self._font_width_maps[font_name][2]
1692-
1693-
if text_operands:
1694-
for char in text_operands:
1695-
if char == space_char:
1696-
font_widths += actual_space_width
1697-
continue
1698-
font_widths += compute_font_width(font_width_map, char)
1699-
return (font_widths * font_size, space_width * font_size, font_size)
17001660

1701-
def _handle_tj(
1702-
self,
1703-
text: str,
1704-
operands: List[Union[str, TextStringObject]],
1705-
cm_matrix: List[float],
1706-
tm_matrix: List[float],
1707-
cmap: Tuple[
1708-
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
1709-
],
1710-
orientations: Tuple[int, ...],
1711-
font_size: float,
1712-
rtl_dir: bool,
1713-
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
1714-
space_width: float,
1715-
actual_str_size: Dict[str, float]
1716-
) -> Tuple[str, bool, Dict[str, float]]:
1717-
text_operands, is_str_operands = get_text_operands(
1718-
operands, cm_matrix, tm_matrix, cmap, orientations)
1719-
if is_str_operands:
1720-
text += text_operands
1721-
else:
1722-
text, rtl_dir = get_display_str(
1723-
text,
1724-
cm_matrix,
1725-
tm_matrix, # text matrix
1726-
cmap,
1727-
text_operands,
1728-
font_size,
1729-
rtl_dir,
1730-
visitor_text)
1731-
font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = (
1732-
self._get_actual_font_widths(cmap, text_operands, font_size, space_width))
1733-
actual_str_size["str_widths"] += font_widths
1734-
1735-
return text, rtl_dir, actual_str_size
1661+
17361662

17371663
def _extract_text(
17381664
self,
@@ -1754,6 +1680,7 @@ def _extract_text(
17541680
default = "/Content"
17551681
17561682
"""
1683+
extractor = TextExtraction()
17571684
text: str = ""
17581685
output: str = ""
17591686
rtl_dir: bool = False # right-to-left
@@ -1960,7 +1887,7 @@ def process_operation(operator: bytes, operands: List[Any]) -> None:
19601887
str_widths = compute_str_widths(_actual_str_size["str_widths"])
19611888
_actual_str_size["str_widths"] = 0.0
19621889
elif operator == b"Tj": # Show text
1963-
text, rtl_dir, _actual_str_size = self._handle_tj(
1890+
text, rtl_dir, _actual_str_size = extractor._handle_tj(
19641891
text,
19651892
operands,
19661893
cm_matrix,
pypdf/_text_extraction/_text_extractor.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Copyright (c) 2006, Mathieu Fenniak
2+
# Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3+
#
4+
# All rights reserved.
5+
#
6+
# Redistribution and use in source and binary forms, with or without
7+
# modification, are permitted provided that the following conditions are
8+
# met:
9+
#
10+
# * Redistributions of source code must retain the above copyright notice,
11+
# this list of conditions and the following disclaimer.
12+
# * Redistributions in binary form must reproduce the above copyright notice,
13+
# this list of conditions and the following disclaimer in the documentation
14+
# and/or other materials provided with the distribution.
15+
# * The name of the author may not be used to endorse or promote products
16+
# derived from this software without specific prior written permission.
17+
#
18+
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19+
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20+
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21+
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22+
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23+
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24+
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25+
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26+
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27+
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28+
# POSSIBILITY OF SUCH DAMAGE.
29+
30+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
31+
32+
from .._cmap import build_font_width_map, compute_font_width, get_actual_str_key
33+
from ..generic import DictionaryObject, TextStringObject
34+
from . import get_display_str, get_text_operands
35+
36+
37+
class TextExtraction:
    """
    Helper object holding state used while extracting text from a page.

    The class currently owns a per-font cache of character widths and the
    handling of the ``Tj`` (show text) operator. One instance is meant to be
    created per extraction run, so the cache lives only as long as that run.
    """

    def __init__(self) -> None:
        # Cache keyed by font name. Each entry is the tuple
        # (font width map, key used for the space character, space width).
        self._font_width_maps: Dict[str, Tuple[Dict[Any, float], str, float]] = {}

    def _get_actual_font_widths(
        self,
        cmap: Tuple[
            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
        ],
        text_operands: str,
        font_size: float,
        space_width: float
    ) -> Tuple[float, float, float]:
        """
        Compute the scaled width of ``text_operands`` for the font in ``cmap``.

        The width data of a font is computed the first time its name
        (``cmap[2]``) is seen and cached afterwards. Cached entries are reused
        as they are: a different ``space_width`` passed on a later call for
        the same font name does not update the cached values.

        Args:
            cmap: Character map tuple. ``cmap[0]`` and ``cmap[1]`` are handed
                to ``get_actual_str_key``, ``cmap[2]`` is the font name and
                ``cmap[3]`` is the font dictionary, or ``None`` if there is
                none.
            text_operands: The text whose width is summed up.
            font_size: Factor applied to the returned width values.
            space_width: Fallback width of a space; twice this value is used
                as the default character width.

        Returns:
            A tuple ``(font_widths * font_size, space_width * font_size,
            font_size)``, where ``font_widths`` is the sum of the widths of
            all characters in ``text_operands``.
        """
        font_name: str = cmap[2]
        if font_name not in self._font_width_maps:
            font_width_map: Dict[Any, float]
            if cmap[3] is None:
                # No font dictionary: only a default width is known.
                space_char = " "
                actual_space_width: float = space_width
                font_width_map = {"default": actual_space_width * 2}
            else:
                space_char = get_actual_str_key(" ", cmap[0], cmap[1])
                font_width_map = build_font_width_map(cmap[3], space_width * 2)
                actual_space_width = compute_font_width(font_width_map, space_char)
            if actual_space_width == 0:
                # A zero width is replaced by the fallback width.
                actual_space_width = space_width
            self._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width)

        # Single lookup instead of indexing the cache entry three times.
        font_width_map, space_char, actual_space_width = self._font_width_maps[font_name]

        font_widths: float = 0.0
        for char in text_operands:
            if char == space_char:
                font_widths += actual_space_width
            else:
                font_widths += compute_font_width(font_width_map, char)
        return (font_widths * font_size, space_width * font_size, font_size)

    def _handle_tj(
        self,
        text: str,
        operands: List[Union[str, TextStringObject]],
        cm_matrix: List[float],
        tm_matrix: List[float],
        cmap: Tuple[
            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
        ],
        orientations: Tuple[int, ...],
        font_size: float,
        rtl_dir: bool,
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
        space_width: float,
        actual_str_size: Dict[str, float]
    ) -> Tuple[str, bool, Dict[str, float]]:
        """
        Process the operands of a ``Tj`` (show text) operation.

        The operands are converted with ``get_text_operands``. If that reports
        string operands, the result is appended to ``text`` directly;
        otherwise ``text`` and ``rtl_dir`` are taken from ``get_display_str``.
        Afterwards the size bookkeeping in ``actual_str_size`` is updated.

        Args:
            text: The text collected so far.
            operands: The operands of the operation.
            cm_matrix: The current transformation matrix.
            tm_matrix: The text matrix.
            cmap: Character map tuple of the current font; see
                ``_get_actual_font_widths``.
            orientations: Passed through to ``get_text_operands``.
            font_size: The current font size.
            rtl_dir: Whether the current text direction is right-to-left.
            visitor_text: Optional callback passed through to
                ``get_display_str``.
            space_width: Fallback width of a space character.
            actual_str_size: Mapping updated in place. ``"space_width"`` and
                ``"str_height"`` are overwritten and the computed width is
                added to ``"str_widths"``.

        Returns:
            A tuple of the updated text, the updated right-to-left flag and
            the same ``actual_str_size`` mapping that was passed in.
        """
        text_operands, is_str_operands = get_text_operands(
            operands, cm_matrix, tm_matrix, cmap, orientations
        )
        if is_str_operands:
            text += text_operands
        else:
            text, rtl_dir = get_display_str(
                text,
                cm_matrix,
                tm_matrix,  # text matrix
                cmap,
                text_operands,
                font_size,
                rtl_dir,
                visitor_text,
            )

        font_widths, scaled_space_width, str_height = self._get_actual_font_widths(
            cmap, text_operands, font_size, space_width
        )
        actual_str_size["space_width"] = scaled_space_width
        actual_str_size["str_height"] = str_height
        actual_str_size["str_widths"] += font_widths

        return text, rtl_dir, actual_str_size

0 commit comments

Comments
 (0)