1
+ # Copyright (c) 2006, Mathieu Fenniak
2
+ # Copyright (c) 2007, Ashish Kulkarni <kulkarni.ashish@gmail.com>
3
+ #
4
+ # All rights reserved.
5
+ #
6
+ # Redistribution and use in source and binary forms, with or without
7
+ # modification, are permitted provided that the following conditions are
8
+ # met:
9
+ #
10
+ # * Redistributions of source code must retain the above copyright notice,
11
+ # this list of conditions and the following disclaimer.
12
+ # * Redistributions in binary form must reproduce the above copyright notice,
13
+ # this list of conditions and the following disclaimer in the documentation
14
+ # and/or other materials provided with the distribution.
15
+ # * The name of the author may not be used to endorse or promote products
16
+ # derived from this software without specific prior written permission.
17
+ #
18
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
+ # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
+ # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
+ # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
22
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
+ # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26
+ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28
+ # POSSIBILITY OF SUCH DAMAGE.
29
+
30
+ from typing import Tuple , Union , Dict , Optional , Any , List , Callable
31
+
32
+ from .._cmap import get_actual_str_key , build_font_width_map , compute_font_width
33
+ from ..generic import DictionaryObject , TextStringObject
34
+ from . import get_text_operands , get_display_str
35
+
36
class TextExtraction:
    """
    Stateful helper for extracting text from PDF content streams.

    The per-font width information needed while extracting is stored on the
    instance, so the extraction steps can be ordinary methods instead of
    nested functions that share nonlocal variables.
    """

    def __init__(self):
        # font name -> (width map, space character, width used for a space)
        self._font_width_maps: Dict[str, Tuple[Dict[Any, float], str, float]] = {}

    def _get_actual_font_widths(
        self,
        cmap: Tuple[
            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
        ],
        text_operands: str,
        font_size: float,
        space_width: float
    ) -> Tuple[float, float, float]:
        """
        Measure ``text_operands`` using the width data of the font in ``cmap``.

        The width data for a font is built on the first call that sees its
        name (``cmap[2]``) and reused afterwards, so the ``space_width``
        passed on that first call is the one stored in the cache entry.

        Returns a tuple of: the summed character widths scaled by
        ``font_size``, ``space_width`` scaled by ``font_size``, and
        ``font_size`` itself.
        """
        font_name: str = cmap[2]
        if font_name not in self._font_width_maps:
            font_dict = cmap[3]
            if font_dict is None:
                # No font dictionary: only a default width is known.
                blank = " "
                blank_width: float = space_width
                widths: Dict[Any, float] = {"default": blank_width * 2}
            else:
                blank = get_actual_str_key(" ", cmap[0], cmap[1])
                widths = build_font_width_map(font_dict, space_width * 2)
                blank_width = compute_font_width(widths, blank)
            if blank_width == 0:
                # A zero-width space is replaced by the caller's space width.
                blank_width = space_width
            self._font_width_maps[font_name] = (widths, blank, blank_width)
        widths, blank, blank_width = self._font_width_maps[font_name]

        # Accumulate one character at a time, in order; a falsy
        # ``text_operands`` contributes nothing.
        total: float = 0
        for glyph in text_operands or "":
            total += (
                blank_width if glyph == blank else compute_font_width(widths, glyph)
            )
        return (total * font_size, space_width * font_size, font_size)

    def _handle_tj(
        self,
        text: str,
        operands: List[Union[str, TextStringObject]],
        cm_matrix: List[float],
        tm_matrix: List[float],
        cmap: Tuple[
            Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
        ],
        orientations: Tuple[int, ...],
        font_size: float,
        rtl_dir: bool,
        visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
        space_width: float,
        actual_str_size: Dict[str, float]
    ) -> Tuple[str, bool, Dict[str, float]]:
        """
        Fold the operands of one text-showing operation into ``text``.

        ``get_text_operands`` yields the operand text and a flag. When the
        flag is set the operand text is appended to ``text`` as is; otherwise
        ``get_display_str`` produces the new ``text`` and ``rtl_dir``.

        ``actual_str_size`` is updated in place: ``"space_width"`` and
        ``"str_height"`` are overwritten and ``"str_widths"`` is incremented,
        so that key must already be present.

        Returns the updated ``text``, ``rtl_dir`` and the same
        ``actual_str_size`` mapping.
        """
        operand_text, is_plain_str = get_text_operands(
            operands, cm_matrix, tm_matrix, cmap, orientations
        )
        if not is_plain_str:
            text, rtl_dir = get_display_str(
                text,
                cm_matrix,
                tm_matrix,  # text matrix
                cmap,
                operand_text,
                font_size,
                rtl_dir,
                visitor_text,
            )
        else:
            text += operand_text

        measured = self._get_actual_font_widths(
            cmap, operand_text, font_size, space_width
        )
        run_width, scaled_space, height = measured
        actual_str_size["space_width"] = scaled_space
        actual_str_size["str_height"] = height
        actual_str_size["str_widths"] += run_width

        return text, rtl_dir, actual_str_size
0 commit comments