Skip to content

Commit b85814e

Browse files
committed
Reduce diff
1 parent 2c643d8 commit b85814e

File tree

1 file changed

+33
-33
lines changed

1 file changed

+33
-33
lines changed

pypdf/_text_extraction/_text_extractor.py

Lines changed: 33 additions & 33 deletions
Original file line number | Diff line number | Diff line change
@@ -81,12 +81,11 @@ def __init__(
8181
self.page_obj = page_obj # Reference to the PageObject for font width maps
8282
self.obj = obj
8383
self.pdf = pdf
84-
self.orientations = orientations
84+
8585
self.space_width = space_width
8686
self.content_key = content_key
8787
self.visitor_operand_before = visitor_operand_before
8888
self.visitor_operand_after = visitor_operand_after
89-
self.visitor_text = visitor_text
9089

9190
# Text state
9291
self.text: str = ""
@@ -96,7 +95,17 @@ def __init__(
9695
# Matrix state
9796
self.cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
9897
self.tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
99-
self.cm_stack: List[Tuple[Any, ...]] = []
98+
self.cm_stack: List[
99+
Tuple[
100+
List[float],
101+
Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
102+
float,
103+
float,
104+
float,
105+
float,
106+
float,
107+
]
108+
] = []
100109

101110
# Previous matrices for tracking changes
102111
self.cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
@@ -120,16 +129,16 @@ def __init__(
120129
"NotInitialized",
121130
None,
122131
) # (encoding, CMAP, font resource name, font)
123-
124-
# Actual string size tracking
125-
self._actual_str_size: Dict[str, float] = {"str_widths": 0.0, "space_width": 0.0, "str_height": 0.0}
126-
127-
# Character maps for fonts
132+
self.orientations = orientations
133+
self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
128134
self.cmaps: Dict[
129135
str,
130136
Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject],
131137
] = {}
132138

139+
# Actual string size tracking
140+
self._actual_str_size: Dict[str, float] = {"str_widths": 0.0, "space_width": 0.0, "str_height": 0.0}
141+
133142
# Resources dictionary
134143
self.resources_dict: Optional[DictionaryObject] = None
135144

@@ -231,8 +240,7 @@ def _process_operation(self, operator: bytes, operands: List[Any]) -> None:
231240
if self.visitor_operand_after is not None:
232241
self.visitor_operand_after(operator, operands, self.cm_matrix, self.tm_matrix)
233242

234-
def _compute_str_widths(self, str_widths: float) -> float:
235-
"""Compute string widths."""
243+
def compute_str_widths(self, str_widths: float) -> float:
236244
return str_widths / 1000
237245

238246
def _flush_text(self) -> None:
@@ -355,22 +363,22 @@ def _handle_operation_move_text_position(self, operands: List[Any]) -> None:
355363
tx, ty = float(operands[0]), float(operands[1])
356364
self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
357365
self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
358-
str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
366+
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
359367
self._actual_str_size["str_widths"] = 0.0
360368
self._handle_position_change(str_widths)
361369

362370
def _handle_operation_set_text_matrix(self, operands: List[Any]) -> None:
363371
"""Handle Tm (Set text matrix) operation."""
364372
self.tm_matrix = [float(operand) for operand in operands[:6]]
365-
str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
373+
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
366374
self._actual_str_size["str_widths"] = 0.0
367375
self._handle_position_change(str_widths)
368376

369377
def _handle_operation_move_to_next_line(self, operands: List[Any]) -> None:
370378
"""Handle T* (Move to next line) operation."""
371379
self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
372380
self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
373-
str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
381+
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
374382
self._actual_str_size["str_widths"] = 0.0
375383
self._handle_position_change(str_widths)
376384

@@ -389,7 +397,7 @@ def _handle_operation_show_text(self, operands: List[Any]) -> None:
389397
self._space_width,
390398
self._actual_str_size,
391399
)
392-
str_widths = self._compute_str_widths(self._actual_str_size["str_widths"])
400+
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
393401
self._handle_position_change(str_widths)
394402

395403
def _handle_operation_show_text_with_positioning(self, operands: List[Any]) -> None:
@@ -471,7 +479,7 @@ def _handle_position_change(self, str_widths: float) -> None:
471479
self.font_size,
472480
self.visitor_text,
473481
str_widths,
474-
self._compute_str_widths(self._actual_str_size["space_width"]),
482+
self.compute_str_widths(self._actual_str_size["space_width"]),
475483
self._actual_str_size["str_height"],
476484
)
477485
if self.text == "":
@@ -482,16 +490,15 @@ def _handle_position_change(self, str_widths: float) -> None:
482490

483491
def _get_actual_font_widths(
484492
self,
485-
cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
493+
cmap: Tuple[
494+
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
495+
],
486496
text_operands: str,
487497
font_size: float,
488498
space_width: float,
489499
) -> Tuple[float, float, float]:
490-
"""Get actual font widths for text operands."""
491500
font_widths: float = 0
492501
font_name: str = cmap[2]
493-
494-
# Use the page object's font width maps
495502
if font_name not in self.page_obj._font_width_maps:
496503
if cmap[3] is None:
497504
font_width_map: Dict[Any, float] = {}
@@ -505,7 +512,6 @@ def _get_actual_font_widths(
505512
if actual_space_width == 0:
506513
actual_space_width = space_width
507514
self.page_obj._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width)
508-
509515
font_width_map = self.page_obj._font_width_maps[font_name][0]
510516
space_char = self.page_obj._font_width_maps[font_name][1]
511517
actual_space_width = self.page_obj._font_width_maps[font_name][2]
@@ -516,27 +522,26 @@ def _get_actual_font_widths(
516522
font_widths += actual_space_width
517523
continue
518524
font_widths += compute_font_width(font_width_map, char)
519-
520525
return (font_widths * font_size, space_width * font_size, font_size)
521526

522-
523-
524527
def _handle_tj(
525528
self,
526529
text: str,
527530
operands: List[Union[str, TextStringObject]],
528531
cm_matrix: List[float],
529532
tm_matrix: List[float],
530-
cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]],
533+
cmap: Tuple[
534+
Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
535+
],
531536
orientations: Tuple[int, ...],
532537
font_size: float,
533538
rtl_dir: bool,
534539
visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
535540
space_width: float,
536541
actual_str_size: Dict[str, float],
537542
) -> Tuple[str, bool, Dict[str, float]]:
538-
"""Handle text showing operations."""
539-
text_operands, is_str_operands = get_text_operands(operands, cm_matrix, tm_matrix, cmap, orientations)
543+
text_operands, is_str_operands = get_text_operands(
544+
operands, cm_matrix, tm_matrix, cmap, orientations)
540545
if is_str_operands:
541546
text += text_operands
542547
else:
@@ -550,13 +555,8 @@ def _handle_tj(
550555
rtl_dir,
551556
visitor_text,
552557
)
553-
554-
font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = self._get_actual_font_widths(
555-
cmap,
556-
text_operands,
557-
font_size,
558-
space_width,
559-
)
558+
font_widths, actual_str_size["space_width"], actual_str_size["str_height"] = (
559+
self._get_actual_font_widths(cmap, text_operands, font_size, space_width))
560560
actual_str_size["str_widths"] += font_widths
561561

562562
return text, rtl_dir, actual_str_size

0 commit comments

Comments
 (0)