Skip to content

Commit f082a56

Browse files
committed
Reduce diff
1 parent 2c643d8 commit f082a56

File tree

1 file changed

+39
-42
lines changed

1 file changed

+39
-42
lines changed

pypdf/_text_extraction/_text_extractor.py

Lines changed: 39 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -81,28 +81,32 @@ def __init__(
8181
self.page_obj = page_obj # Reference to the PageObject for font width maps
8282
self.obj = obj
8383
self.pdf = pdf
84-
self.orientations = orientations
84+
8585
self.space_width = space_width
8686
self.content_key = content_key
8787
self.visitor_operand_before = visitor_operand_before
8888
self.visitor_operand_after = visitor_operand_after
89-
self.visitor_text = visitor_text
90-
91-
# Text state
92-
self.text: str = ""
93-
self.output: str = ""
94-
self.rtl_dir: bool = False # right-to-left
9589

9690
# Matrix state
9791
self.cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
9892
self.tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
99-
self.cm_stack: List[Tuple[Any, ...]] = []
100-
101-
# Previous matrices for tracking changes
93+
self.cm_stack: List[
94+
Tuple[
95+
List[float],
96+
Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
97+
float,
98+
float,
99+
float,
100+
float,
101+
float,
102+
]
103+
] = []
104+
105+
# Store the last modified matrices; can be an intermediate position
102106
self.cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
103107
self.tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
104108

105-
# Memo matrices for visitor callbacks
109+
# Store the position at the beginning of building the text
106110
self.memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
107111
self.memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
108112

@@ -113,23 +117,25 @@ def __init__(
113117
self.TL: float = 0.0
114118
self.font_size: float = 12.0 # init just in case
115119

120+
# Text state
121+
self.text: str = ""
122+
self.output: str = ""
123+
self.rtl_dir: bool = False # right-to-left
124+
116125
# Character map state
117126
self.cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]] = (
118127
"charmap",
119128
{},
120129
"NotInitialized",
121130
None,
122131
) # (encoding, CMAP, font resource name, font)
132+
self.orientations: Tuple[int, ...] = orientations
133+
self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
134+
self.cmaps: Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]] = {}
123135

124136
# Actual string size tracking
125137
self._actual_str_size: Dict[str, float] = {"str_widths": 0.0, "space_width": 0.0, "str_height": 0.0}
126138

127-
# Character maps for fonts
128-
self.cmaps: Dict[
129-
str,
130-
Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject],
131-
] = {}
132-
133139
# Resources dictionary
134140
self.resources_dict: Optional[DictionaryObject] = None
135141

@@ -231,8 +237,7 @@ def _process_operation(self, operator: bytes, operands: List[Any]) -> None:
231237
if self.visitor_operand_after is not None:
232238
self.visitor_operand_after(operator, operands, self.cm_matrix, self.tm_matrix)
233239

234-
def _compute_str_widths(self, str_widths: float) -> float:
235-
"""Compute string widths."""
240+
def compute_str_widths(self, str_widths: float) -> float:
236241
return str_widths / 1000
237242

238243
def _flush_text(self) -> None:
@@ -355,22 +360,22 @@ def _handle_operation_move_text_position(self, operands: List[Any]) -> None:
355360
tx, ty = float(operands[0]), float(operands[1])
356361
self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
357362
self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
358-
str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
363+
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
359364
self._actual_str_size["str_widths"] = 0.0
360365
self._handle_position_change(str_widths)
361366

362367
def _handle_operation_set_text_matrix(self, operands: List[Any]) -> None:
363368
"""Handle Tm (Set text matrix) operation."""
364369
self.tm_matrix = [float(operand) for operand in operands[:6]]
365-
str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
370+
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
366371
self._actual_str_size["str_widths"] = 0.0
367372
self._handle_position_change(str_widths)
368373

369374
def _handle_operation_move_to_next_line(self, operands: List[Any]) -> None:
370375
"""Handle T* (Move to next line) operation."""
371376
self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
372377
self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
373-
str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
378+
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
374379
self._actual_str_size["str_widths"] = 0.0
375380
self._handle_position_change(str_widths)
376381

@@ -389,7 +394,7 @@ def _handle_operation_show_text(self, operands: List[Any]) -> None:
389394
self._space_width,
390395
self._actual_str_size,
391396
)
392-
str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
397+
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
393398
self._handle_position_change(str_widths)
394399

395400
def _handle_operation_show_text_with_positioning(self, operands: List[Any]) -> None:
@@ -471,7 +476,7 @@ def _handle_position_change(self, str_widths: float) -> None:
471476
self.font_size,
472477
self.visitor_text,
473478
str_widths,
474-
self._compute_str_widths(self._actual_str_size["space_width"]),
479+
self.compute_str_widths(self._actual_str_size["space_width"]),
475480
self._actual_str_size["str_height"],
476481
)
477482
if self.text == "":
@@ -482,16 +487,15 @@ def _handle_position_change(self, str_widths: float) -> None:
482487

483488
def _get_actual_font_widths(
484489
self,
485-
cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
490+
cmap: Tuple[
491+
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
492+
],
486493
text_operands: str,
487494
font_size: float,
488495
space_width: float,
489496
) -> Tuple[float, float, float]:
490-
"""Get actual font widths for text operands."""
491497
font_widths: float = 0
492498
font_name: str = cmap[2]
493-
494-
# Use the page object's font width maps
495499
if font_name not in self.page_obj._font_width_maps:
496500
if cmap[3] is None:
497501
font_width_map: Dict[Any, float] = {}
@@ -505,7 +509,6 @@ def _get_actual_font_widths(
505509
if actual_space_width == 0:
506510
actual_space_width = space_width
507511
self.page_obj._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width)
508-
509512
font_width_map = self.page_obj._font_width_maps[font_name][0]
510513
space_char = self.page_obj._font_width_maps[font_name][1]
511514
actual_space_width = self.page_obj._font_width_maps[font_name][2]
@@ -516,27 +519,26 @@ def _get_actual_font_widths(
516519
font_widths += actual_space_width
517520
continue
518521
font_widths += compute_font_width(font_width_map, char)
519-
520522
return (font_widths * font_size, space_width * font_size, font_size)
521523

522-
523-
524524
def _handle_tj(
525525
self,
526526
text: str,
527527
operands: List[Union[str, TextStringObject]],
528528
cm_matrix: List[float],
529529
tm_matrix: List[float],
530-
cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
530+
cmap: Tuple[
531+
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
532+
],
531533
orientations: Tuple[int, ...],
532534
font_size: float,
533535
rtl_dir: bool,
534536
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
535537
space_width: float,
536538
actual_str_size: Dict[str, float],
537539
) -> Tuple[str, bool, Dict[str, float]]:
538-
"""Handle text showing operations."""
539-
text_operands, is_str_operands = get_text_operands(operands, cm_matrix, tm_matrix, cmap, orientations)
540+
text_operands, is_str_operands = get_text_operands(
541+
operands, cm_matrix, tm_matrix, cmap, orientations)
540542
if is_str_operands:
541543
text += text_operands
542544
else:
@@ -550,13 +552,8 @@ def _handle_tj(
550552
rtl_dir,
551553
visitor_text,
552554
)
553-
554-
font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = self._get_actual_font_widths(
555-
cmap,
556-
text_operands,
557-
font_size,
558-
space_width,
559-
)
555+
font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = (
556+
self._get_actual_font_widths(cmap, text_operands, font_size, space_width))
560557
actual_str_size["str_widths"] += font_widths
561558

562559
return text, rtl_dir, actual_str_size

0 commit comments

Comments
 (0)