Skip to content

Commit c93aadd

Browse files
committed
Reduce diff
1 parent 2c643d8 commit c93aadd

File tree

1 file changed

+43
-47
lines changed

1 file changed

+43
-47
lines changed

pypdf/_text_extraction/_text_extractor.py

Lines changed: 43 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -81,55 +81,60 @@ def __init__(
8181
self.page_obj = page_obj # Reference to the PageObject for font width maps
8282
self.obj = obj
8383
self.pdf = pdf
84-
self.orientations = orientations
84+
8585
self.space_width = space_width
8686
self.content_key = content_key
8787
self.visitor_operand_before = visitor_operand_before
8888
self.visitor_operand_after = visitor_operand_after
89-
self.visitor_text = visitor_text
90-
91-
# Text state
92-
self.text: str = ""
93-
self.output: str = ""
94-
self.rtl_dir: bool = False # right-to-left
9589

9690
# Matrix state
9791
self.cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
9892
self.tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
99-
self.cm_stack: List[Tuple[Any, ...]] = []
100-
101-
# Previous matrices for tracking changes
93+
self.cm_stack: List[
94+
Tuple[
95+
List[float],
96+
Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
97+
float,
98+
float,
99+
float,
100+
float,
101+
float,
102+
]
103+
] = []
104+
105+
# Store the last modified matrices; can be an intermediate position
102106
self.cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
103107
self.tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
104108

105-
# Memo matrices for visitor callbacks
109+
# Store the position at the beginning of building the text
106110
self.memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
107111
self.memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
108112

109113
# Font and text scaling state
110-
self.char_scale: float = 1.0
111-
self.space_scale: float = 1.0
114+
self.char_scale = 1.0
115+
self.space_scale = 1.0
112116
self._space_width: float = 500.0 # will be set correctly at first Tf
113-
self.TL: float = 0.0
114-
self.font_size: float = 12.0 # init just in case
117+
self.TL = 0.0
118+
self.font_size = 12.0 # init just in case
119+
120+
# Text state
121+
self.text: str = ""
122+
self.output: str = ""
123+
self.rtl_dir: bool = False # right-to-left
115124

116-
# Character map state
117125
self.cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]] = (
118126
"charmap",
119127
{},
120128
"NotInitialized",
121129
None,
122130
) # (encoding, CMAP, font resource name, font)
131+
self.orientations: Tuple[int, ...] = orientations
132+
self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
133+
self.cmaps: Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]] = {}
123134

124135
# Actual string size tracking
125136
self._actual_str_size: Dict[str, float] = {"str_widths": 0.0, "space_width": 0.0, "str_height": 0.0}
126137

127-
# Character maps for fonts
128-
self.cmaps: Dict[
129-
str,
130-
Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject],
131-
] = {}
132-
133138
# Resources dictionary
134139
self.resources_dict: Optional[DictionaryObject] = None
135140

@@ -231,8 +236,7 @@ def _process_operation(self, operator: bytes, operands: List[Any]) -> None:
231236
if self.visitor_operand_after is not None:
232237
self.visitor_operand_after(operator, operands, self.cm_matrix, self.tm_matrix)
233238

234-
def _compute_str_widths(self, str_widths: float) -> float:
235-
"""Compute string widths."""
239+
def compute_str_widths(self, str_widths: float) -> float:
236240
return str_widths / 1000
237241

238242
def _flush_text(self) -> None:
@@ -355,22 +359,22 @@ def _handle_operation_move_text_position(self, operands: List[Any]) -> None:
355359
tx, ty = float(operands[0]), float(operands[1])
356360
self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
357361
self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
358-
str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
362+
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
359363
self._actual_str_size["str_widths"] = 0.0
360364
self._handle_position_change(str_widths)
361365

362366
def _handle_operation_set_text_matrix(self, operands: List[Any]) -> None:
363367
"""Handle Tm (Set text matrix) operation."""
364368
self.tm_matrix = [float(operand) for operand in operands[:6]]
365-
str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
369+
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
366370
self._actual_str_size["str_widths"] = 0.0
367371
self._handle_position_change(str_widths)
368372

369373
def _handle_operation_move_to_next_line(self, operands: List[Any]) -> None:
370374
"""Handle T* (Move to next line) operation."""
371375
self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
372376
self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
373-
str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
377+
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
374378
self._actual_str_size["str_widths"] = 0.0
375379
self._handle_position_change(str_widths)
376380

@@ -389,7 +393,7 @@ def _handle_operation_show_text(self, operands: List[Any]) -> None:
389393
self._space_width,
390394
self._actual_str_size,
391395
)
392-
str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
396+
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
393397
self._handle_position_change(str_widths)
394398

395399
def _handle_operation_show_text_with_positioning(self, operands: List[Any]) -> None:
@@ -471,7 +475,7 @@ def _handle_position_change(self, str_widths: float) -> None:
471475
self.font_size,
472476
self.visitor_text,
473477
str_widths,
474-
self._compute_str_widths(self._actual_str_size["space_width"]),
478+
self.compute_str_widths(self._actual_str_size["space_width"]),
475479
self._actual_str_size["str_height"],
476480
)
477481
if self.text == "":
@@ -482,16 +486,15 @@ def _handle_position_change(self, str_widths: float) -> None:
482486

483487
def _get_actual_font_widths(
484488
self,
485-
cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
489+
cmap: Tuple[
490+
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
491+
],
486492
text_operands: str,
487493
font_size: float,
488494
space_width: float,
489495
) -> Tuple[float, float, float]:
490-
"""Get actual font widths for text operands."""
491496
font_widths: float = 0
492497
font_name: str = cmap[2]
493-
494-
# Use the page object's font width maps
495498
if font_name not in self.page_obj._font_width_maps:
496499
if cmap[3] is None:
497500
font_width_map: Dict[Any, float] = {}
@@ -505,7 +508,6 @@ def _get_actual_font_widths(
505508
if actual_space_width == 0:
506509
actual_space_width = space_width
507510
self.page_obj._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width)
508-
509511
font_width_map = self.page_obj._font_width_maps[font_name][0]
510512
space_char = self.page_obj._font_width_maps[font_name][1]
511513
actual_space_width = self.page_obj._font_width_maps[font_name][2]
@@ -516,27 +518,26 @@ def _get_actual_font_widths(
516518
font_widths += actual_space_width
517519
continue
518520
font_widths += compute_font_width(font_width_map, char)
519-
520521
return (font_widths * font_size, space_width * font_size, font_size)
521522

522-
523-
524523
def _handle_tj(
525524
self,
526525
text: str,
527526
operands: List[Union[str, TextStringObject]],
528527
cm_matrix: List[float],
529528
tm_matrix: List[float],
530-
cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
529+
cmap: Tuple[
530+
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
531+
],
531532
orientations: Tuple[int, ...],
532533
font_size: float,
533534
rtl_dir: bool,
534535
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
535536
space_width: float,
536537
actual_str_size: Dict[str, float],
537538
) -> Tuple[str, bool, Dict[str, float]]:
538-
"""Handle text showing operations."""
539-
text_operands, is_str_operands = get_text_operands(operands, cm_matrix, tm_matrix, cmap, orientations)
539+
text_operands, is_str_operands = get_text_operands(
540+
operands, cm_matrix, tm_matrix, cmap, orientations)
540541
if is_str_operands:
541542
text += text_operands
542543
else:
@@ -550,13 +551,8 @@ def _handle_tj(
550551
rtl_dir,
551552
visitor_text,
552553
)
553-
554-
font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = self._get_actual_font_widths(
555-
cmap,
556-
text_operands,
557-
font_size,
558-
space_width,
559-
)
554+
font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = (
555+
self._get_actual_font_widths(cmap, text_operands, font_size, space_width))
560556
actual_str_size["str_widths"] += font_widths
561557

562558
return text, rtl_dir, actual_str_size

0 commit comments

Comments
 (0)