Skip to content

Commit d9649d0

Browse files
authored
MAINT: Use dispatch table for TextExtraction.process_operation (#3360)
1 parent 38db653 commit d9649d0

File tree

1 file changed

+201
-160
lines changed

1 file changed

+201
-160
lines changed

pypdf/_text_extraction/_text_extractor.py

Lines changed: 201 additions & 160 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,22 @@ def __init__(self) -> None:
9595
self.visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None
9696
self.cmaps: Dict[str, Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject]] = {}
9797

98+
self.operation_handlers = {
99+
b"BT": self._handle_bt,
100+
b"ET": self._handle_et,
101+
b"q": self._handle_save_graphics_state,
102+
b"Q": self._handle_restore_graphics_state,
103+
b"cm": self._handle_cm,
104+
b"Tz": self._handle_tz,
105+
b"Tw": self._handle_tw,
106+
b"TL": self._handle_tl,
107+
b"Tf": self._handle_tf,
108+
b"Td": self._handle_td,
109+
b"Tm": self._handle_tm,
110+
b"T*": self._handle_t_star,
111+
b"Tj": self._handle_tj_operation,
112+
}
113+
98114
def initialize_extraction(
99115
self,
100116
orientations: Tuple[int, ...] = (0, 90, 180, 270),
@@ -117,173 +133,36 @@ def compute_str_widths(self, str_widths: float) -> float:
117133
return str_widths / 1000
118134

119135
def process_operation(self, operator: bytes, operands: List[Any]) -> None:
120-
str_widths: float = 0.0
136+
if operator in self.operation_handlers:
137+
handler = self.operation_handlers[operator]
138+
str_widths = handler(operands)
121139

122-
# Table 5.4 page 405
123-
if operator == b"BT": # Begin Text
124-
self.tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
125-
# Flush text:
126-
self.output += self.text
127-
if self.visitor_text is not None:
128-
self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
129-
self.text = ""
130-
self.memo_cm = self.cm_matrix.copy()
131-
self.memo_tm = self.tm_matrix.copy()
132-
return
133-
if operator == b"ET": # End Text
134-
# Flush text:
135-
self.output += self.text
136-
if self.visitor_text is not None:
137-
self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
138-
self.text = ""
139-
self.memo_cm = self.cm_matrix.copy()
140-
self.memo_tm = self.tm_matrix.copy()
141-
142-
# Table 4.7 "Graphics state operators", page 219
143-
# cm_matrix calculation is reserved for later
144-
elif operator == b"q": # Save graphics state
145-
self.cm_stack.append(
146-
(
147-
self.cm_matrix,
148-
self.cmap,
149-
self.font_size,
150-
self.char_scale,
151-
self.space_scale,
152-
self._space_width,
153-
self.TL,
154-
)
155-
)
156-
elif operator == b"Q": # Restore graphics state
157-
try:
158-
(
159-
self.cm_matrix,
160-
self.cmap,
161-
self.font_size,
162-
self.char_scale,
163-
self.space_scale,
164-
self._space_width,
165-
self.TL,
166-
) = self.cm_stack.pop()
167-
except Exception:
168-
self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
169-
elif operator == b"cm": # Modify current matrix
170-
self.output += self.text
171-
if self.visitor_text is not None:
172-
self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
173-
self.text = ""
174-
try:
175-
self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix)
176-
except Exception:
177-
self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
178-
self.memo_cm = self.cm_matrix.copy()
179-
self.memo_tm = self.tm_matrix.copy()
180-
181-
# Table 5.2 page 398
182-
elif operator == b"Tz": # Set horizontal text scaling
183-
self.char_scale = float(operands[0]) / 100 if operands else 1.0
184-
elif operator == b"Tw": # Set word spacing
185-
self.space_scale = 1.0 + float(operands[0] if operands else 0.0)
186-
elif operator == b"TL": # Set Text Leading
187-
scale_x = math.sqrt(self.tm_matrix[0] ** 2 + self.tm_matrix[2] ** 2)
188-
self.TL = float(operands[0] if operands else 0.0) * self.font_size * scale_x
189-
elif operator == b"Tf": # Set font size
190-
if self.text != "":
191-
self.output += self.text # .translate(cmap)
192-
if self.visitor_text is not None:
193-
self.visitor_text(self.text, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
194-
self.text = ""
195-
self.memo_cm = self.cm_matrix.copy()
196-
self.memo_tm = self.tm_matrix.copy()
197-
try:
198-
# Import here to avoid circular imports
199-
from .._cmap import unknown_char_map # noqa: PLC0415
200-
201-
# char_map_tuple: font_type,
202-
# float(sp_width / 2),
203-
# encoding,
204-
# map_dict,
205-
# font_dict (describes the font)
206-
char_map_tuple = self.cmaps[operands[0]]
207-
# current cmap: encoding,
208-
# map_dict,
209-
# font resource name (internal name, not the real font name),
210-
# font_dict
211-
self.cmap = (
212-
char_map_tuple[2],
213-
char_map_tuple[3],
214-
operands[0],
215-
char_map_tuple[4],
216-
)
217-
self._space_width = char_map_tuple[1]
218-
except KeyError: # font not found
219-
self.cmap = (
220-
unknown_char_map[2],
221-
unknown_char_map[3],
222-
f"???{operands[0]}",
223-
None,
224-
)
225-
self._space_width = unknown_char_map[1]
226-
try:
227-
self.font_size = float(operands[1])
228-
except Exception:
229-
pass # keep previous size
230-
# Table 5.5 page 406
231-
elif operator == b"Td": # Move text position
232-
# A special case is a translating only tm:
233-
# tm = [1, 0, 0, 1, e, f]
234-
# i.e. tm[4] += tx, tm[5] += ty.
235-
tx, ty = float(operands[0]), float(operands[1])
236-
self.tm_matrix[4] += tx * self.tm_matrix[0] + ty * self.tm_matrix[2]
237-
self.tm_matrix[5] += tx * self.tm_matrix[1] + ty * self.tm_matrix[3]
238-
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
239-
self._actual_str_size["str_widths"] = 0.0
240-
elif operator == b"Tm": # Set text matrix
241-
self.tm_matrix = [float(operand) for operand in operands[:6]]
242-
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
243-
self._actual_str_size["str_widths"] = 0.0
244-
elif operator == b"T*": # Move to next line
245-
self.tm_matrix[4] -= self.TL * self.tm_matrix[2]
246-
self.tm_matrix[5] -= self.TL * self.tm_matrix[3]
247-
str_widths = self.compute_str_widths(self._actual_str_size["str_widths"])
248-
self._actual_str_size["str_widths"] = 0.0
249-
elif operator == b"Tj": # Show text
250-
self.text, self.rtl_dir, self._actual_str_size = self._handle_tj(
140+
# Post-process operations that affect text positioning
141+
if operator in {b"Td", b"Tm", b"T*", b"Tj"}:
142+
self._post_process_text_operation(str_widths or 0.0)
143+
144+
def _post_process_text_operation(self, str_widths: float) -> None:
145+
"""Handle common post-processing for text positioning operations."""
146+
try:
147+
self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check(
251148
self.text,
252-
operands,
253-
self.cm_matrix,
254-
self.tm_matrix,
149+
(self.cm_prev, self.tm_prev),
150+
(self.cm_matrix, self.tm_matrix),
151+
(self.memo_cm, self.memo_tm),
255152
self.cmap,
256153
self.orientations,
154+
self.output,
257155
self.font_size,
258-
self.rtl_dir,
259156
self.visitor_text,
260-
self._space_width,
261-
self._actual_str_size,
157+
str_widths,
158+
self.compute_str_widths(self._actual_str_size["space_width"]),
159+
self._actual_str_size["str_height"],
262160
)
263-
else:
264-
return
265-
266-
if operator in {b"Td", b"Tm", b"T*", b"Tj"}:
267-
try:
268-
self.text, self.output, self.cm_prev, self.tm_prev = crlf_space_check(
269-
self.text,
270-
(self.cm_prev, self.tm_prev),
271-
(self.cm_matrix, self.tm_matrix),
272-
(self.memo_cm, self.memo_tm),
273-
self.cmap,
274-
self.orientations,
275-
self.output,
276-
self.font_size,
277-
self.visitor_text,
278-
str_widths,
279-
self.compute_str_widths(self._actual_str_size["space_width"]),
280-
self._actual_str_size["str_height"],
281-
)
282-
if self.text == "":
283-
self.memo_cm = self.cm_matrix.copy()
284-
self.memo_tm = self.tm_matrix.copy()
285-
except OrientationNotFoundError:
286-
return
161+
if self.text == "":
162+
self.memo_cm = self.cm_matrix.copy()
163+
self.memo_tm = self.tm_matrix.copy()
164+
except OrientationNotFoundError:
165+
pass
287166

288167
def _get_actual_font_widths(
289168
self,
@@ -357,3 +236,165 @@ def _handle_tj(
357236
actual_str_size["str_widths"] += font_widths
358237

359238
return text, rtl_dir, actual_str_size
239+
240+
def _flush_text(self) -> None:
    """Append the pending text buffer to the output and start a fresh buffer.

    If a ``visitor_text`` callback is set, it is called with the pending
    text, the memoized cm/tm matrices, ``self.cmap[3]`` and the current
    font size. The callback sees the memo matrices as they were *before*
    this call refreshes them. Afterwards the buffer is emptied and the memo
    matrices are replaced by copies of the current cm/tm matrices.
    """
    pending = self.text
    self.output += pending

    visitor = self.visitor_text
    if visitor is not None:
        visitor(pending, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)

    self.text = ""
    # Copies, so later in-place edits of the live matrices do not leak
    # into the memoized ones.
    self.memo_cm = self.cm_matrix.copy()
    self.memo_tm = self.tm_matrix.copy()
248+
249+
# Operation handlers
250+
251+
def _handle_bt(self, operands: List[Any]) -> None:
    """Handle BT (Begin Text) operation - Table 5.4 page 405.

    Resets the text matrix to the identity matrix and then flushes the
    pending text. ``operands`` is accepted only to match the common handler
    signature and is not used.
    """
    identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    # Reset first: the flush memoizes a copy of the (now identity) text matrix.
    self.tm_matrix = identity
    self._flush_text()
255+
256+
def _handle_et(self, operands: List[Any]) -> None:
    """Handle ET (End Text) operation - Table 5.4 page 405.

    Flushes the pending text; the text matrix is left untouched.
    ``operands`` is accepted only to match the common handler signature
    and is not used.
    """
    self._flush_text()
259+
260+
def _handle_save_graphics_state(self, operands: List[Any]) -> None:
    """Handle q (Save graphics state) operation - Table 4.7 page 219.

    Pushes a snapshot of the extractor state onto ``self.cm_stack``. The
    field order must stay in sync with the unpacking done by the Q handler.
    ``operands`` is not used.
    """
    snapshot = (
        self.cm_matrix,
        self.cmap,
        self.font_size,
        self.char_scale,
        self.space_scale,
        self._space_width,
        self.TL,
    )
    self.cm_stack.append(snapshot)
273+
274+
def _handle_restore_graphics_state(self, operands: List[Any]) -> None:
    """Handle Q (Restore graphics state) operation - Table 4.7 page 219.

    Pops the most recent snapshot from ``self.cm_stack`` and restores the
    seven saved fields from it. If that fails for any reason (for example
    an unbalanced Q on an empty stack), only ``cm_matrix`` is reset to the
    identity matrix and the other fields keep their current values.
    ``operands`` is not used.
    """
    try:
        saved = self.cm_stack.pop()
        (
            self.cm_matrix,
            self.cmap,
            self.font_size,
            self.char_scale,
            self.space_scale,
            self._space_width,
            self.TL,
        ) = saved
    except Exception:
        # Deliberately broad: a malformed content stream must not abort extraction.
        self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
288+
289+
def _handle_cm(self, operands: List[Any]) -> None:
    """Handle cm (Modify current matrix) operation - Table 4.7 page 219.

    Flushes the pending text through the shared ``_flush_text`` helper, so
    the visitor sees it with the matrices memoized before this change. The
    first six operands are then multiplied into ``cm_matrix``. If they
    cannot be converted or multiplied, ``cm_matrix`` falls back to the
    identity matrix. Finally the memo matrices are refreshed from the
    updated state.
    """
    # Same output/visitor/reset sequence as BT and ET, via the one helper.
    self._flush_text()
    try:
        self.cm_matrix = mult([float(operand) for operand in operands[:6]], self.cm_matrix)
    except Exception:
        # Deliberately broad: a malformed operand list must not abort extraction.
        self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    # The flush memoized the old cm matrix; overwrite with the new one.
    self.memo_cm = self.cm_matrix.copy()
    self.memo_tm = self.tm_matrix.copy()
301+
302+
def _handle_tz(self, operands: List[Any]) -> None:
    """Handle Tz (Set horizontal text scaling) operation - Table 5.2 page 398.

    The first operand is read as a percentage and stored in
    ``self.char_scale`` as a fraction. With no operands the scale is 1.0.
    """
    if operands:
        self.char_scale = float(operands[0]) / 100
    else:
        self.char_scale = 1.0
305+
306+
def _handle_tw(self, operands: List[Any]) -> None:
    """Handle Tw (Set word spacing) operation - Table 5.2 page 398.

    Stores ``1.0 + <first operand>`` in ``self.space_scale``. A missing
    operand counts as 0.0.
    """
    word_spacing = operands[0] if operands else 0.0
    self.space_scale = 1.0 + float(word_spacing)
309+
310+
def _handle_tl(self, operands: List[Any]) -> None:
    """Handle TL (Set Text Leading) operation - Table 5.2 page 398.

    ``self.TL`` is set to the first operand (0.0 if missing) multiplied by
    the current font size and by a scale factor taken from elements 0 and 2
    of the current text matrix.
    """
    tm_a = self.tm_matrix[0]
    tm_c = self.tm_matrix[2]
    scale_x = math.sqrt(tm_a ** 2 + tm_c ** 2)
    leading = float(operands[0] if operands else 0.0)
    self.TL = leading * self.font_size * scale_x
314+
315+
def _handle_tf(self, operands: List[Any]) -> None:
    """Handle Tf (Set text font and size) operation - Table 5.2 page 398.

    ``operands[0]`` is the font resource name used as key into
    ``self.cmaps``; ``operands[1]`` is the font size.

    Any pending text is flushed first, so it is reported with the previous
    character map and font size. The active ``self.cmap`` and
    ``self._space_width`` are then taken from the matching ``self.cmaps``
    entry. If the font name is unknown, they come from ``unknown_char_map``
    and the resource name is prefixed with ``???``. The font size is
    updated only when ``operands[1]`` exists and converts to ``float``;
    otherwise the previous size is kept.
    """
    if self.text != "":
        # Same flush sequence as BT/ET; only needed when there is pending text.
        self._flush_text()

    # Import here to avoid circular imports
    from .._cmap import unknown_char_map  # noqa: PLC0415

    try:
        # char_map_tuple: font_type,
        #                 float(sp_width / 2),
        #                 encoding,
        #                 map_dict,
        #                 font_dict (describes the font)
        char_map_tuple = self.cmaps[operands[0]]
        # current cmap: encoding,
        #               map_dict,
        #               font resource name (internal name, not the real font name),
        #               font_dict
        self.cmap = (
            char_map_tuple[2],
            char_map_tuple[3],
            operands[0],
            char_map_tuple[4],
        )
        self._space_width = char_map_tuple[1]
    except KeyError:  # font not found
        self.cmap = (
            unknown_char_map[2],
            unknown_char_map[3],
            f"???{operands[0]}",
            None,
        )
        self._space_width = unknown_char_map[1]

    try:
        self.font_size = float(operands[1])
    except Exception:
        pass  # keep previous size
357+
358+
def _handle_td(self, operands: List[Any]) -> float:
    """Handle Td (Move text position) operation - Table 5.5 page 406.

    Translates the text matrix in place by ``(tx, ty)`` taken from the
    first two operands. The translation is expressed in the matrix's own
    coordinate system; for a translation-only matrix
    ``[1, 0, 0, 1, e, f]`` this reduces to ``e += tx`` and ``f += ty``.

    Returns the width accumulated in ``_actual_str_size["str_widths"]``,
    converted by ``compute_str_widths``, and resets that accumulator to 0.0.
    """
    # Convert both operands before mutating anything, so a bad operand
    # leaves the matrix untouched.
    tx = float(operands[0])
    ty = float(operands[1])

    tm = self.tm_matrix
    tm[4] += tx * tm[0] + ty * tm[2]
    tm[5] += tx * tm[1] + ty * tm[3]

    pending_width = self.compute_str_widths(self._actual_str_size["str_widths"])
    self._actual_str_size["str_widths"] = 0.0
    return pending_width
369+
370+
def _handle_tm(self, operands: List[Any]) -> float:
    """Handle Tm (Set text matrix) operation - Table 5.5 page 406.

    Replaces the text matrix with the first six operands converted to
    ``float``. Returns the width accumulated in
    ``_actual_str_size["str_widths"]``, converted by ``compute_str_widths``,
    and resets that accumulator to 0.0.
    """
    self.tm_matrix = list(map(float, operands[:6]))

    pending_width = self.compute_str_widths(self._actual_str_size["str_widths"])
    self._actual_str_size["str_widths"] = 0.0
    return pending_width
376+
377+
def _handle_t_star(self, operands: List[Any]) -> float:
    """Handle T* (Move to next line) operation - Table 5.5 page 406.

    Shifts the translation part of the text matrix in place by the current
    leading ``self.TL``, scaled by matrix elements 2 and 3. Returns the
    width accumulated in ``_actual_str_size["str_widths"]``, converted by
    ``compute_str_widths``, and resets that accumulator to 0.0.
    ``operands`` is not used.
    """
    tm = self.tm_matrix
    leading = self.TL
    tm[4] -= leading * tm[2]
    tm[5] -= leading * tm[3]

    pending_width = self.compute_str_widths(self._actual_str_size["str_widths"])
    self._actual_str_size["str_widths"] = 0.0
    return pending_width
384+
385+
def _handle_tj_operation(self, operands: List[Any]) -> float:
    """Handle Tj (Show text) operation - Table 5.5 page 406.

    Delegates to ``_handle_tj`` with the current extraction state and stores
    the returned text buffer, right-to-left flag and string-size record back
    on the instance.

    Always returns 0.0: the caller's post-processing step handles the
    string width for this operator.
    """
    result = self._handle_tj(
        self.text,
        operands,
        self.cm_matrix,
        self.tm_matrix,
        self.cmap,
        self.orientations,
        self.font_size,
        self.rtl_dir,
        self.visitor_text,
        self._space_width,
        self._actual_str_size,
    )
    self.text, self.rtl_dir, self._actual_str_size = result
    return 0.0  # str_widths will be handled in post-processing

0 commit comments

Comments
 (0)