diff --git a/pypdf/_page.py b/pypdf/_page.py index f71ea6c04..557d62921 100644 --- a/pypdf/_page.py +++ b/pypdf/_page.py @@ -1673,143 +1673,18 @@ def _extract_text( default = "/Content" """ - extractor = TextExtraction() - cmaps: Dict[ - str, - Tuple[ - str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject - ], - ] = {} - - try: - objr = obj - while NameObject(PG.RESOURCES) not in objr: - # /Resources can be inherited so we look to parents - objr = objr["/Parent"].get_object() - # If no parents then no /Resources will be available, - # so an exception will be raised - resources_dict = cast(DictionaryObject, objr[PG.RESOURCES]) - except Exception: - # No resources means no text is possible (no font); we consider the - # file as not damaged, no need to check for TJ or Tj - return "" - - if not is_null_or_none(resources_dict) and "/Font" in resources_dict and (font := resources_dict["/Font"]): - for f in cast(DictionaryObject, font): - try: - cmaps[f] = build_char_map(f, space_width, obj) - except TypeError: - pass - - try: - content = ( - obj[content_key].get_object() if isinstance(content_key, str) else obj - ) - if not isinstance(content, ContentStream): - content = ContentStream(content, pdf, "bytes") - except (AttributeError, KeyError): # no content can be extracted (certainly empty page) - return "" - # We check all strings are TextStringObjects. ByteStringObjects - # are strings where the byte->string encoding was unknown, so adding - # them to the text here would be gibberish. - - # Initialize the extractor with the necessary parameters - extractor.initialize_extraction(orientations, visitor_text, cmaps) - - for operands, operator in content.operations: - if visitor_operand_before is not None: - visitor_operand_before(operator, operands, extractor.cm_matrix, extractor.tm_matrix) - # Multiple operators are handled here - if operator == b"'": - extractor.process_operation(b"T*", []) - extractor.process_operation(b"Tj", operands) - elif operator == b'"': - extractor.process_operation(b"Tw", [operands[0]]) - extractor.process_operation(b"Tc", [operands[1]]) - extractor.process_operation(b"T*", []) - extractor.process_operation(b"Tj", operands[2:]) - elif operator == b"TJ": - # The space width may be smaller than the font width, so the width should be 95%. - _confirm_space_width = extractor._space_width * 0.95 - if operands: - for op in operands[0]: - if isinstance(op, (str, bytes)): - extractor.process_operation(b"Tj", [op]) - if isinstance(op, (int, float, NumberObject, FloatObject)) and ( - abs(float(op)) >= _confirm_space_width - and extractor.text - and extractor.text[-1] != " " - ): - extractor.process_operation(b"Tj", [" "]) - elif operator == b"TD": - extractor.process_operation(b"TL", [-operands[1]]) - extractor.process_operation(b"Td", operands) - elif operator == b"Do": - extractor.output += extractor.text - if visitor_text is not None: - visitor_text( - extractor.text, - extractor.memo_cm, - extractor.memo_tm, - extractor.cmap[3], - extractor.font_size, - ) - try: - if extractor.output[-1] != "\n": - extractor.output += "\n" - if visitor_text is not None: - visitor_text( - "\n", - extractor.memo_cm, - extractor.memo_tm, - extractor.cmap[3], - extractor.font_size, - ) - except IndexError: - pass - try: - xobj = resources_dict["/XObject"] - if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore - text = self.extract_xform_text( - xobj[operands[0]], # type: ignore - orientations, - space_width, - visitor_operand_before, - visitor_operand_after, - visitor_text, - ) - extractor.output += text - if visitor_text is not None: - visitor_text( - text, - extractor.memo_cm, - extractor.memo_tm, - extractor.cmap[3], - extractor.font_size, - ) - except Exception as exception: - logger_warning( - f"Impossible to decode XFormObject {operands[0]}: {exception}", - __name__, - ) - finally: - extractor.text = "" - extractor.memo_cm = extractor.cm_matrix.copy() - extractor.memo_tm = extractor.tm_matrix.copy() - else: - extractor.process_operation(operator, operands) - if visitor_operand_after is not None: - visitor_operand_after(operator, operands, extractor.cm_matrix, extractor.tm_matrix) - extractor.output += extractor.text # just in case - if extractor.text != "" and visitor_text is not None: - visitor_text( - extractor.text, - extractor.memo_cm, - extractor.memo_tm, - extractor.cmap[3], - extractor.font_size, - ) - return extractor.output + extractor = TextExtraction( + self, # Pass the page object for font width maps + obj, + pdf, + orientations, + space_width, + content_key, + visitor_operand_before, + visitor_operand_after, + visitor_text, + ) + return extractor.extract_text() def _layout_mode_fonts(self) -> Dict[str, _layout_mode.Font]: """ diff --git a/pypdf/_text_extraction/_text_extractor.py b/pypdf/_text_extraction/_text_extractor.py index 8739b25a1..891ae058f 100644 --- a/pypdf/_text_extraction/_text_extractor.py +++ b/pypdf/_text_extraction/_text_extractor.py @@ -28,11 +28,32 @@ # POSSIBILITY OF SUCH DAMAGE. import math -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast -from .._cmap import build_font_width_map, compute_font_width, get_actual_str_key -from ..generic import DictionaryObject, TextStringObject -from . import OrientationNotFoundError, crlf_space_check, get_display_str, get_text_operands, mult +from .._cmap import ( + build_char_map, + build_font_width_map, + compute_font_width, + get_actual_str_key, + unknown_char_map, +) +from .._utils import logger_warning +from ..constants import PageAttributes as PG +from ..generic import ( + ContentStream, + DictionaryObject, + FloatObject, + NameObject, + NumberObject, + TextStringObject, +) +from . import ( + OrientationNotFoundError, + crlf_space_check, + get_display_str, + get_text_operands, + mult, +) class TextExtraction: @@ -44,10 +65,29 @@ class TextExtraction: variables in the original implementation. """ - def __init__(self) -> None: - self._font_width_maps: Dict[str, Tuple[Dict[Any, float], str, float]] = {} + def __init__( + self, + page_obj: Any, # PageObject reference + obj: Any, + pdf: Any, + orientations: Tuple[int, ...] = (0, 90, 180, 270), + space_width: float = 200.0, + content_key: Optional[str] = PG.CONTENTS, + visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, + visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, + visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, + ) -> None: + """Initialize the text extraction with parameters and state.""" + self.page_obj = page_obj # Reference to the PageObject for font width maps + self.obj = obj + self.pdf = pdf + + self.space_width = space_width + self.content_key = content_key + self.visitor_operand_before = visitor_operand_before + self.visitor_operand_after = visitor_operand_after - # Text extraction state variables + # Matrix state self.cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] self.tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] self.cm_stack: List[ @@ -70,221 +110,380 @@ def __init__(self) -> None: self.memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] self.memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + # Font and text scaling state self.char_scale = 1.0 self.space_scale = 1.0 self._space_width: float = 500.0 # will be set correctly at first Tf - self._actual_str_size: Dict[str, float] = { - "str_widths": 0.0, - "space_width": 0.0, - "str_height": 0.0, - } # will be set to string length calculation result self.TL = 0.0 - self.font_size = 12.0 # init just in case of + self.font_size = 12.0 # init just in case - # Text extraction variables + # Text state self.text: str = "" self.output: str = "" self.rtl_dir: bool = False # right-to-left + self.cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]] = ( "charmap", {}, "NotInitialized", None, ) # (encoding, CMAP, font resource name, font) - self.orientations: Tuple[int, ...] = (0, 90, 180, 270) + self.orientations: Tuple[int, ...] = orientations self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None self.cmaps: Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]] = {} - def initialize_extraction( - self, - orientations: Tuple[int, ...] = (0, 90, 180, 270), - visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, - cmaps: Optional[ - Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]] - ] = None, - ) -> None: - """Initialize the extractor with extraction parameters.""" - self.orientations = orientations - self.visitor_text = visitor_text - self.cmaps = cmaps or {} + # Actual string size tracking + self._actual_str_size: Dict[str, float] = {"str_widths": 0.0, "space_width": 0.0, "str_height": 0.0} - # Reset state - self.text = "" - self.output = "" - self.rtl_dir = False + # Resources dictionary + self.resources_dict: Optional[DictionaryObject] = None + + # Operation handler mapping + self.operation_handlers = { + b"BT": self._handle_operation_begin_text, + b"ET": self._handle_operation_end_text, + b"q": self._handle_operation_save_graphics_state, + b"Q": self._handle_operation_restore_graphics_state, + b"cm": self._handle_operation_modify_current_matrix, + b"Tz": self._handle_operation_horizontal_text_scaling, + b"Tw": self._handle_operation_word_spacing, + b"TL": self._handle_operation_text_leading, + b"Tf": self._handle_operation_set_font, + b"Td": self._handle_operation_move_text_position, + b"Tm": self._handle_operation_set_text_matrix, + b"T*": self._handle_operation_move_to_next_line, + b"Tj": self._handle_operation_show_text, + } + + def extract_text(self) -> str: + """Extract text from the PDF object.""" + # Initialize resources and content + if not self._initialize_resources(): + return "" + + content = self._get_content() + if content is None: + return "" + + # Process all operations in the content stream + for operands, operator in content.operations: + self._process_operation(operator, operands) + + # Add any remaining text to output + self.output += self.text + if self.text != "" and self.visitor_text is not None: + self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size) + + return self.output + + def _initialize_resources(self) -> bool: + """Initialize resources dictionary and character maps.""" + try: + objr = self.obj + while NameObject(PG.RESOURCES) not in objr: + # /Resources can be inherited so we look to parents + objr = objr["/Parent"].get_object() + # If no parents then no /Resources will be available, + # so an exception will be raised + self.resources_dict = cast("DictionaryObject", objr[PG.RESOURCES]) + except Exception: + # No resources means no text is possible (no font) + return False + + if "/Font" in self.resources_dict and (font := self.resources_dict["/Font"]): + for f in cast("DictionaryObject", font): + self.cmaps[f] = build_char_map(f, self.space_width, self.obj) + + return True + + def _get_content(self) -> Optional[ContentStream]: + """Get the content stream from the object.""" + try: + content = self.obj[self.content_key].get_object() if isinstance(self.content_key, str) else self.obj + if not isinstance(content, ContentStream): + content = ContentStream(content, self.pdf, "bytes") + return content + except (AttributeError, KeyError): + return None + + def _process_operation(self, operator: bytes, operands: List[Any]) -> None: + """Process a single PDF operation.""" + if self.visitor_operand_before is not None: + self.visitor_operand_before(operator, operands, self.cm_matrix, self.tm_matrix) + + # Handle compound operators + if operator == b"'": + self._handle_operation_move_to_next_line([]) + self._handle_operation_show_text(operands) + elif operator == b'"': + self._handle_operation_word_spacing([operands[0]]) + self._handle_operation_character_spacing([operands[1]]) + self._handle_operation_move_to_next_line([]) + self._handle_operation_show_text(operands[2:]) + elif operator == b"TJ": + self._handle_operation_show_text_with_positioning(operands) + elif operator == b"TD": + self._handle_operation_text_leading([-operands[1]]) + self._handle_operation_move_text_position(operands) + elif operator == b"Do": + self._handle_operation_do(operands) + else: + # Use the operation handler mapping + handler = self.operation_handlers.get(operator) + if handler: + handler(operands) + + if self.visitor_operand_after is not None: + self.visitor_operand_after(operator, operands, self.cm_matrix, self.tm_matrix) def compute_str_widths(self, str_widths: float) -> float: return str_widths / 1000 - def process_operation(self, operator: bytes, operands: List[Any]) -> None: - str_widths: float = 0.0 + def _flush_text(self) -> None: + """Flush current text to output.""" + self.output += self.text + if self.visitor_text is not None: + self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size) + self.text = "" + self.memo_cm = self.cm_matrix.copy() + self.memo_tm = self.tm_matrix.copy() - # Table 5.4 page 405 - if operator == b"BT": # Begin Text - self.tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - # Flush text: - self.output += self.text - if self.visitor_text is not None: - self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size) - self.text = "" - self.memo_cm = self.cm_matrix.copy() - self.memo_tm = self.tm_matrix.copy() - return - if operator == b"ET": # End Text - # Flush text: - self.output += self.text - if self.visitor_text is not None: - self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size) - self.text = "" - self.memo_cm = self.cm_matrix.copy() - self.memo_tm = self.tm_matrix.copy() + # Operation handlers + def _handle_operation_begin_text(self, operands: List[Any]) -> None: + """Handle BT (Begin Text) operation.""" + self.tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + self._flush_text() - # Table 4.7 "Graphics state operators", page 219 - # cm_matrix calculation is reserved for later - elif operator == b"q": # Save graphics state - self.cm_stack.append( - ( - self.cm_matrix, - self.cmap, - self.font_size, - self.char_scale, - self.space_scale, - self._space_width, - self.TL, - ) + def _handle_operation_end_text(self, operands: List[Any]) -> None: + """Handle ET (End Text) operation.""" + self._flush_text() + + def _handle_operation_save_graphics_state(self, operands: List[Any]) -> None: + """Handle q (Save graphics state) operation.""" + self.cm_stack.append( + ( + self.cm_matrix, + self.cmap, + self.font_size, + self.char_scale, + self.space_scale, + self._space_width, + self.TL, + ), + ) + + def _handle_operation_restore_graphics_state(self, operands: List[Any]) -> None: + """Handle Q (Restore graphics state) operation.""" + try: + ( + self.cm_matrix, + self.cmap, + self.font_size, + self.char_scale, + self.space_scale, + self._space_width, + self.TL, + ) = self.cm_stack.pop() + except Exception: + self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + + def _handle_operation_modify_current_matrix(self, operands: List[Any]) -> None: + """Handle cm (Modify current matrix) operation.""" + self._flush_text() + try: + self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix) + except Exception: + self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] + self.memo_cm = self.cm_matrix.copy() + self.memo_tm = self.tm_matrix.copy() + + def _handle_operation_horizontal_text_scaling(self, operands: List[Any]) -> None: + """Handle Tz (Set horizontal text scaling) operation.""" + self.char_scale = float(operands[0]) / 100 if operands else 1.0 + + def _handle_operation_word_spacing(self, operands: List[Any]) -> None: + """Handle Tw (Set word spacing) operation.""" + self.space_scale = 1.0 + float(operands[0] if operands else 0.0) + + def _handle_operation_character_spacing(self, operands: List[Any]) -> None: + """Handle Tc (Set character spacing) operation.""" + # This is a placeholder for character spacing handling + + def _handle_operation_text_leading(self, operands: List[Any]) -> None: + """Handle TL (Set Text Leading) operation.""" + scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2) + self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x + + def _handle_operation_set_font(self, operands: List[Any]) -> None: + """Handle Tf (Set font size) operation.""" + if self.text != "": + self._flush_text() + + try: + # char_map_tuple: font_type, + # float(sp_width / 2), + # encoding, + # map_dict, + # font_dict (describes the font) + char_map_tuple = self.cmaps[operands[0]] + # current cmap: encoding, + # map_dict, + # font resource name (internal name, not the real font name), + # font_dict + self.cmap = ( + char_map_tuple[2], + char_map_tuple[3], + operands[0], + char_map_tuple[4], ) - elif operator == b"Q": # Restore graphics state - try: - ( - self.cm_matrix, - self.cmap, - self.font_size, - self.char_scale, - self.space_scale, - self._space_width, - self.TL, - ) = self.cm_stack.pop() - except Exception: - self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - elif operator == b"cm": # Modify current matrix - self.output += self.text - if self.visitor_text is not None: - self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size) - self.text = "" - try: - self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix) - except Exception: - self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - self.memo_cm = self.cm_matrix.copy() - self.memo_tm = self.tm_matrix.copy() + self._space_width = char_map_tuple[1] + except KeyError: # font not found + self.cmap = ( + unknown_char_map[2], + unknown_char_map[3], + f"???{operands[0]}", + None, + ) + self._space_width = unknown_char_map[1] + + try: + self.font_size = float(operands[1]) + except Exception: + pass # keep previous size + + def _handle_operation_move_text_position(self, operands: List[Any]) -> None: + """Handle Td (Move text position) operation.""" + # A special case is a translating only tm: + # tm = [1, 0, 0, 1, e, f] + # i.e. tm[4] += tx, tm[5] += ty. + tx, ty = float(operands[0]), float(operands[1]) + self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2] + self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3] + str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) + self._actual_str_size["str_widths"] = 0.0 + self._handle_position_change(str_widths) + + def _handle_operation_set_text_matrix(self, operands: List[Any]) -> None: + """Handle Tm (Set text matrix) operation.""" + self.tm_matrix = [float(operand) for operand in operands[:6]] + str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) + self._actual_str_size["str_widths"] = 0.0 + self._handle_position_change(str_widths) - # Table 5.2 page 398 - elif operator == b"Tz": # Set horizontal text scaling - self.char_scale = float(operands[0]) / 100 if operands else 1.0 - elif operator == b"Tw": # Set word spacing - self.space_scale = 1.0 + float(operands[0] if operands else 0.0) - elif operator == b"TL": # Set Text Leading - scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2) - self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x - elif operator == b"Tf": # Set font size - if self.text != "": - self.output += self.text # .translate(cmap) + def _handle_operation_move_to_next_line(self, operands: List[Any]) -> None: + """Handle T* (Move to next line) operation.""" + self.tm_matrix[4] -= self.TL * self.tm_matrix[2] + self.tm_matrix[5] -= self.TL * self.tm_matrix[3] + str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) + self._actual_str_size["str_widths"] = 0.0 + self._handle_position_change(str_widths) + + def _handle_operation_show_text(self, operands: List[Any]) -> None: + """Handle Tj (Show text) operation.""" + self.text, self.rtl_dir, self._actual_str_size = self._handle_tj( + self.text, + operands, + self.cm_matrix, + self.tm_matrix, + self.cmap, + self.orientations, + self.font_size, + self.rtl_dir, + self.visitor_text, + self._space_width, + self._actual_str_size, + ) + str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) + self._handle_position_change(str_widths) + + def _handle_operation_show_text_with_positioning(self, operands: List[Any]) -> None: + """Handle TJ (Show text with positioning) operation.""" + # The space width may be smaller than the font width, so the width should be 95%. + _confirm_space_width = self._space_width * 0.95 + if operands: + for op in operands[0]: + if isinstance(op, (str, bytes)): + self._handle_operation_show_text([op]) + if isinstance(op, (int, float, NumberObject, FloatObject)) and ( + abs(float(op)) >= _confirm_space_width and self.text and self.text[-1] != " " + ): + self._handle_operation_show_text([" "]) + + def _handle_operation_do(self, operands: List[Any]) -> None: + """Handle Do (Execute XObject) operation.""" + self._flush_text() + try: + if self.output and self.output[-1] != "\n": + self.output += "\n" + if self.visitor_text is not None: + self.visitor_text( + "\n", + self.memo_cm, + self.memo_tm, + self.cmap[3], + self.font_size, + ) + except IndexError: + pass + + try: + xobj = self.resources_dict["/XObject"] # type: ignore + if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore + # Extract text from XForm object + xform_extractor = TextExtraction( + self.page_obj, + xobj[operands[0]], # type: ignore + self.pdf, + self.orientations, + self.space_width, + None, # content_key = None for XForm objects + self.visitor_operand_before, + self.visitor_operand_after, + self.visitor_text, + ) + text = xform_extractor.extract_text() + self.output += text if self.visitor_text is not None: - self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size) + self.visitor_text( + text, + self.memo_cm, + self.memo_tm, + self.cmap[3], + self.font_size, + ) + except Exception as exception: + logger_warning( + f"Impossible to decode XFormObject {operands[0]}: {exception}", + __name__, + ) + finally: self.text = "" self.memo_cm = self.cm_matrix.copy() self.memo_tm = self.tm_matrix.copy() - try: - # Import here to avoid circular imports - from .._cmap import unknown_char_map # noqa: PLC0415 - - # char_map_tuple: font_type, - # float(sp_width / 2), - # encoding, - # map_dict, - # font_dict (describes the font) - char_map_tuple = self.cmaps[operands[0]] - # current cmap: encoding, - # map_dict, - # font resource name (internal name, not the real font name), - # font_dict - self.cmap = ( - char_map_tuple[2], - char_map_tuple[3], - operands[0], - char_map_tuple[4], - ) - self._space_width = char_map_tuple[1] - except KeyError: # font not found - self.cmap = ( - unknown_char_map[2], - unknown_char_map[3], - f"???{operands[0]}", - None, - ) - self._space_width = unknown_char_map[1] - try: - self.font_size = float(operands[1]) - except Exception: - pass # keep previous size - # Table 5.5 page 406 - elif operator == b"Td": # Move text position - # A special case is a translating only tm: - # tm = [1, 0, 0, 1, e, f] - # i.e. tm[4] += tx, tm[5] += ty. - tx, ty = float(operands[0]), float(operands[1]) - self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2] - self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3] - str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) - self._actual_str_size["str_widths"] = 0.0 - elif operator == b"Tm": # Set text matrix - self.tm_matrix = [float(operand) for operand in operands[:6]] - str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) - self._actual_str_size["str_widths"] = 0.0 - elif operator == b"T*": # Move to next line - self.tm_matrix[4] -= self.TL * self.tm_matrix[2] - self.tm_matrix[5] -= self.TL * self.tm_matrix[3] - str_widths = self.compute_str_widths(self._actual_str_size["str_widths"]) - self._actual_str_size["str_widths"] = 0.0 - elif operator == b"Tj": # Show text - self.text, self.rtl_dir, self._actual_str_size = self._handle_tj( + + def _handle_position_change(self, str_widths: float) -> None: + """Handle position changes for text positioning operations.""" + try: + self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check( self.text, - operands, - self.cm_matrix, - self.tm_matrix, + (self.cm_prev, self.tm_prev), + (self.cm_matrix, self.tm_matrix), + (self.memo_cm, self.memo_tm), self.cmap, self.orientations, + self.output, self.font_size, - self.rtl_dir, self.visitor_text, - self._space_width, - self._actual_str_size, + str_widths, + self.compute_str_widths(self._actual_str_size["space_width"]), + self._actual_str_size["str_height"], ) - else: + if self.text == "": + self.memo_cm = self.cm_matrix.copy() + self.memo_tm = self.tm_matrix.copy() + except OrientationNotFoundError: return - if operator in {b"Td", b"Tm", b"T*", b"Tj"}: - try: - self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check( - self.text, - (self.cm_prev, self.tm_prev), - (self.cm_matrix, self.tm_matrix), - (self.memo_cm, self.memo_tm), - self.cmap, - self.orientations, - self.output, - self.font_size, - self.visitor_text, - str_widths, - self.compute_str_widths(self._actual_str_size["space_width"]), - self._actual_str_size["str_height"], - ) - if self.text == "": - self.memo_cm = self.cm_matrix.copy() - self.memo_tm = self.tm_matrix.copy() - except OrientationNotFoundError: - return - def _get_actual_font_widths( self, cmap: Tuple[ @@ -296,7 +495,7 @@ def _get_actual_font_widths( ) -> Tuple[float, float, float]: font_widths: float = 0 font_name: str = cmap[2] - if font_name not in self._font_width_maps: + if font_name not in self.page_obj._font_width_maps: if cmap[3] is None: font_width_map: Dict[Any, float] = {} space_char = " " @@ -308,10 +507,10 @@ def _get_actual_font_widths( actual_space_width = compute_font_width(font_width_map, space_char) if actual_space_width == 0: actual_space_width = space_width - self._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width) - font_width_map = self._font_width_maps[font_name][0] - space_char = self._font_width_maps[font_name][1] - actual_space_width = self._font_width_maps[font_name][2] + self.page_obj._font_width_maps[font_name] = (font_width_map, space_char, actual_space_width) + font_width_map = self.page_obj._font_width_maps[font_name][0] + space_char = self.page_obj._font_width_maps[font_name][1] + actual_space_width = self.page_obj._font_width_maps[font_name][2] if text_operands: for char in text_operands: