@@ -95,6 +95,22 @@ def __init__(self) -> None:
95
95
self .visitor_text : Optional [Callable [[Any , Any , Any , Any , Any ], None ]] = None
96
96
self .cmaps : Dict [str , Tuple [str , float , Union [str , Dict [int , str ]], Dict [str , str ], DictionaryObject ]] = {}
97
97
98
+ self .operation_handlers = {
99
+ b"BT" : self ._handle_bt ,
100
+ b"ET" : self ._handle_et ,
101
+ b"q" : self ._handle_save_graphics_state ,
102
+ b"Q" : self ._handle_restore_graphics_state ,
103
+ b"cm" : self ._handle_cm ,
104
+ b"Tz" : self ._handle_tz ,
105
+ b"Tw" : self ._handle_tw ,
106
+ b"TL" : self ._handle_tl ,
107
+ b"Tf" : self ._handle_tf ,
108
+ b"Td" : self ._handle_td ,
109
+ b"Tm" : self ._handle_tm ,
110
+ b"T*" : self ._handle_t_star ,
111
+ b"Tj" : self ._handle_tj_operation ,
112
+ }
113
+
98
114
def initialize_extraction (
99
115
self ,
100
116
orientations : Tuple [int , ...] = (0 , 90 , 180 , 270 ),
@@ -117,173 +133,36 @@ def compute_str_widths(self, str_widths: float) -> float:
117
133
return str_widths / 1000
118
134
119
135
def process_operation(self, operator: bytes, operands: List[Any]) -> None:
    """
    Dispatch one content-stream operator to its registered handler.

    The handler is looked up in ``self.operation_handlers``. Operators
    without an entry are ignored. After the handler has run, the operators
    Td, Tm, T* and Tj are additionally passed through
    ``_post_process_text_operation`` together with the value the handler
    returned (``0.0`` is used when the handler returned a falsy value).

    Args:
        operator: The operator token, e.g. ``b"Tj"``.
        operands: The operands that belong to the operator.

    """
    # A single lookup: an unregistered operator returns here, so the
    # post-processing below can never see an unbound width value, even if
    # an entry was removed from the handler table.
    handler = self.operation_handlers.get(operator)
    if handler is None:
        return

    str_widths = handler(operands)

    # Post-process operations that affect text positioning
    if operator in (b"Td", b"Tm", b"T*", b"Tj"):
        self._post_process_text_operation(str_widths or 0.0)
143
+
144
def _post_process_text_operation(self, str_widths: float) -> None:
    """
    Handle common post-processing for text positioning operations.

    Calls ``crlf_space_check`` with the current text, the previous, current
    and memorised (cm, tm) matrix pairs, the active cmap, the accepted
    orientations, the output collected so far, the font size, the visitor
    callback, ``str_widths``, the converted space width and the string
    height. Its four results replace ``self.text``, ``self.output``,
    ``self.cm_prev`` and ``self.tm_prev``.

    If the text buffer is empty afterwards, the memorised matrices are
    refreshed with copies of the current ones. An
    ``OrientationNotFoundError`` leaves the state untouched.

    Args:
        str_widths: Width value returned by the operator handler.

    """
    try:
        size_info = self._actual_str_size
        checked = crlf_space_check(
            self.text,
            (self.cm_prev, self.tm_prev),
            (self.cm_matrix, self.tm_matrix),
            (self.memo_cm, self.memo_tm),
            self.cmap,
            self.orientations,
            self.output,
            self.font_size,
            self.visitor_text,
            str_widths,
            self.compute_str_widths(size_info["space_width"]),
            size_info["str_height"],
        )
    except OrientationNotFoundError:
        return
    else:
        self.text, self.output, self.cm_prev, self.tm_prev = checked
        if self.text == "":
            self.memo_cm = self.cm_matrix.copy()
            self.memo_tm = self.tm_matrix.copy()
287
166
288
167
def _get_actual_font_widths (
289
168
self ,
@@ -357,3 +236,165 @@ def _handle_tj(
357
236
actual_str_size ["str_widths" ] += font_widths
358
237
359
238
return text , rtl_dir , actual_str_size
239
+
240
def _flush_text(self) -> None:
    """
    Flush accumulated text to output and call visitor if present.

    The pending text is appended to ``self.output``. If a visitor callback
    is set, it receives the pending text, the memorised cm and tm matrices,
    ``self.cmap[3]`` and the current font size. Afterwards the text buffer
    is emptied and the memorised matrices are replaced by copies of the
    current cm and tm matrices.
    """
    pending = self.text
    self.output += pending
    visitor = self.visitor_text
    if visitor is not None:
        visitor(pending, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
    self.text = ""
    # Copies, so later in-place changes of the live matrices do not leak
    # into the memorised ones.
    self.memo_cm = self.cm_matrix.copy()
    self.memo_tm = self.tm_matrix.copy()
248
+
249
+ # Operation handlers
250
+
251
def _handle_bt(self, operands: List[Any]) -> None:
    """
    Handle BT (Begin Text) operation - Table 5.4 page 405.

    Resets the text matrix to the identity matrix and then flushes the
    pending text. ``operands`` is accepted only to match the common handler
    signature and is not read.
    """
    identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    self.tm_matrix = identity
    self._flush_text()
255
+
256
def _handle_et(self, operands: List[Any]) -> None:
    """
    Handle ET (End Text) operation - Table 5.4 page 405.

    Flushes the pending text. ``operands`` is accepted only to match the
    common handler signature and is not read.
    """
    self._flush_text()
259
+
260
def _handle_save_graphics_state(self, operands: List[Any]) -> None:
    """
    Handle q (Save graphics state) operation - Table 4.7 page 219.

    Pushes a 7-tuple onto ``self.cm_stack``: cm matrix, cmap, font size,
    character scale, space scale, space width and text leading, in that
    order. The values are stored as they are; no copies are made.
    ``operands`` is not read.
    """
    snapshot = (
        self.cm_matrix,
        self.cmap,
        self.font_size,
        self.char_scale,
        self.space_scale,
        self._space_width,
        self.TL,
    )
    self.cm_stack.append(snapshot)
273
+
274
def _handle_restore_graphics_state(self, operands: List[Any]) -> None:
    """
    Handle Q (Restore graphics state) operation - Table 4.7 page 219.

    Pops the most recent entry from ``self.cm_stack`` and restores cm
    matrix, cmap, font size, character scale, space scale, space width and
    text leading from it. If that fails (for example because the stack is
    empty), only the cm matrix is reset to the identity matrix.
    ``operands`` is not read.
    """
    try:
        saved_state = self.cm_stack.pop()
        (
            self.cm_matrix,
            self.cmap,
            self.font_size,
            self.char_scale,
            self.space_scale,
            self._space_width,
            self.TL,
        ) = saved_state
    except Exception:
        # Unbalanced Q: fall back to an identity cm matrix and keep the rest.
        self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
288
+
289
def _handle_cm(self, operands: List[Any]) -> None:
    """
    Handle cm (Modify current matrix) operation - Table 4.7 page 219.

    The pending text is flushed to ``self.output`` (and reported to the
    visitor callback, if one is set) before the matrix changes. The first
    six operands are converted to floats and combined with the current cm
    matrix through ``mult``. If that raises, the cm matrix is reset to the
    identity matrix. Finally the memorised matrices are refreshed with
    copies of the current cm and tm matrices.
    """
    flushed = self.text
    self.output += flushed
    if self.visitor_text is not None:
        self.visitor_text(flushed, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
    self.text = ""

    try:
        values = [float(operand) for operand in operands[:6]]
        self.cm_matrix = mult(values, self.cm_matrix)
    except Exception:
        self.cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    self.memo_cm = self.cm_matrix.copy()
    self.memo_tm = self.tm_matrix.copy()
301
+
302
def _handle_tz(self, operands: List[Any]) -> None:
    """
    Handle Tz (Set horizontal text scaling) operation - Table 5.2 page 398.

    Stores the first operand divided by 100 in ``self.char_scale``. Without
    operands the scale is set to ``1.0``.
    """
    if operands:
        self.char_scale = float(operands[0]) / 100
    else:
        self.char_scale = 1.0
305
+
306
def _handle_tw(self, operands: List[Any]) -> None:
    """
    Handle Tw (Set word spacing) operation - Table 5.2 page 398.

    Stores ``1.0`` plus the first operand in ``self.space_scale``. Without
    operands the result is ``1.0``.
    """
    spacing = operands[0] if operands else 0.0
    self.space_scale = 1.0 + float(spacing)
309
+
310
def _handle_tl(self, operands: List[Any]) -> None:
    """
    Handle TL (Set Text Leading) operation - Table 5.2 page 398.

    Stores in ``self.TL`` the first operand (``0.0`` without operands)
    multiplied by the font size and by a scale factor derived from entries
    0 and 2 of the text matrix.
    """
    tm_a = self.tm_matrix[0]
    tm_c = self.tm_matrix[2]
    scale_x = math.sqrt(tm_a ** 2 + tm_c ** 2)
    leading = float(operands[0] if operands else 0.0)
    self.TL = leading * self.font_size * scale_x
314
+
315
def _handle_tf(self, operands: List[Any]) -> None:
    """
    Handle Tf (Set font and font size) operation - Table 5.2 page 398.

    Non-empty pending text is flushed to ``self.output`` (and reported to
    the visitor callback, if one is set) before the font changes. The text
    buffer is then cleared and the memorised matrices are refreshed.

    ``operands[0]`` is the font resource name looked up in ``self.cmaps``;
    an unknown name falls back to ``unknown_char_map`` and is recorded with
    a ``???`` prefix. ``operands[1]`` is the font size; if it is missing or
    cannot be converted to a float, the previous size is kept.
    """
    pending = self.text
    if pending != "":
        self.output += pending  # .translate(cmap)
        if self.visitor_text is not None:
            self.visitor_text(pending, self.memo_cm, self.memo_tm, self.cmap[3], self.font_size)
    self.text = ""
    self.memo_cm = self.cm_matrix.copy()
    self.memo_tm = self.tm_matrix.copy()

    try:
        # Import here to avoid circular imports
        from .._cmap import unknown_char_map  # noqa: PLC0415

        font_key = operands[0]
        # Entry layout: font_type,
        #               float(sp_width / 2),
        #               encoding,
        #               map_dict,
        #               font_dict (describes the font)
        entry = self.cmaps[font_key]
        # Current cmap layout: encoding,
        #                      map_dict,
        #                      font resource name (internal name, not the real font name),
        #                      font_dict
        self.cmap = (entry[2], entry[3], font_key, entry[4])
        self._space_width = entry[1]
    except KeyError:  # font not found
        self.cmap = (
            unknown_char_map[2],
            unknown_char_map[3],
            f"???{font_key}",
            None,
        )
        self._space_width = unknown_char_map[1]

    try:
        self.font_size = float(operands[1])
    except Exception:
        pass  # keep previous size
357
+
358
def _handle_td(self, operands: List[Any]) -> float:
    """
    Handle Td (Move text position) operation - Table 5.5 page 406.

    Translates the text matrix in place by the offsets in ``operands[0]``
    (tx) and ``operands[1]`` (ty), then resets the accumulated string width.

    Returns:
        The string width accumulated so far, converted by
        ``compute_str_widths``.

    """
    tx = float(operands[0])
    ty = float(operands[1])
    # For a translation-only matrix [1, 0, 0, 1, e, f] this reduces to
    # e += tx and f += ty.
    tm = self.tm_matrix
    tm[4] += tx * tm[0] + ty * tm[2]
    tm[5] += tx * tm[1] + ty * tm[3]

    size_info = self._actual_str_size
    accumulated = self.compute_str_widths(size_info["str_widths"])
    size_info["str_widths"] = 0.0
    return accumulated
369
+
370
def _handle_tm(self, operands: List[Any]) -> float:
    """
    Handle Tm (Set text matrix) operation - Table 5.5 page 406.

    Replaces the text matrix with the first six operands converted to
    floats, then resets the accumulated string width.

    Returns:
        The string width accumulated so far, converted by
        ``compute_str_widths``.

    """
    self.tm_matrix = list(map(float, operands[:6]))

    size_info = self._actual_str_size
    accumulated = self.compute_str_widths(size_info["str_widths"])
    size_info["str_widths"] = 0.0
    return accumulated
376
+
377
def _handle_t_star(self, operands: List[Any]) -> float:
    """
    Handle T* (Move to next line) operation - Table 5.5 page 406.

    Moves the text matrix in place by the text leading ``self.TL`` along
    its entries 2 and 3, then resets the accumulated string width.
    ``operands`` is not read.

    Returns:
        The string width accumulated so far, converted by
        ``compute_str_widths``.

    """
    tm = self.tm_matrix
    tm[4] -= self.TL * tm[2]
    tm[5] -= self.TL * tm[3]

    size_info = self._actual_str_size
    accumulated = self.compute_str_widths(size_info["str_widths"])
    size_info["str_widths"] = 0.0
    return accumulated
384
+
385
def _handle_tj_operation(self, operands: List[Any]) -> float:
    """
    Handle Tj (Show text) operation - Table 5.5 page 406.

    Delegates to ``_handle_tj`` with the current extraction state and
    stores its three results in ``self.text``, ``self.rtl_dir`` and
    ``self._actual_str_size``.

    Returns:
        Always ``0.0``; str_widths will be handled in post-processing.

    """
    outcome = self._handle_tj(
        self.text,
        operands,
        self.cm_matrix,
        self.tm_matrix,
        self.cmap,
        self.orientations,
        self.font_size,
        self.rtl_dir,
        self.visitor_text,
        self._space_width,
        self._actual_str_size,
    )
    self.text, self.rtl_dir, self._actual_str_size = outcome
    return 0.0
0 commit comments