3
3
import math
4
4
import os
5
5
import subprocess
6
+ from collections import namedtuple
6
7
from typing import List , Optional , Tuple
7
8
8
9
import numpy as np
9
10
10
11
from dedoc .common .exceptions .java_not_found_error import JavaNotFoundError
11
12
from dedoc .common .exceptions .tabby_pdf_error import TabbyPdfError
12
13
from dedoc .data_structures .bbox import BBox
14
+ from dedoc .data_structures .concrete_annotations .bbox_annotation import BBoxAnnotation
13
15
from dedoc .data_structures .concrete_annotations .bold_annotation import BoldAnnotation
14
16
from dedoc .data_structures .concrete_annotations .indentation_annotation import IndentationAnnotation
15
17
from dedoc .data_structures .concrete_annotations .italic_annotation import ItalicAnnotation
33
35
from dedoc .utils .parameter_utils import get_param_page_slice
34
36
from dedoc .utils .utils import calculate_file_hash
35
37
38
# Properties of a single table cell, built from the "col_span", "row_span" and "invisible"
# fields of a table's "cell_properties" entries.
# The typename matches the bound name so that repr() and __name__ identify the type correctly.
CellPropertyInfo = namedtuple("CellPropertyInfo", ["colspan", "rowspan", "invisible"])
39
+
36
40
37
41
class PdfTabbyReader (PdfBaseReader ):
38
42
"""
@@ -76,7 +80,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
76
80
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
77
81
"""
78
82
parameters = {} if parameters is None else parameters
79
- lines , scan_tables = self .__extract (path = path )
83
+ lines , scan_tables , tables_cell_properties = self .__extract (path = path )
80
84
warnings = []
81
85
document_metadata = None
82
86
@@ -93,10 +97,12 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
93
97
94
98
lines = self .linker .link_objects (lines = lines , tables = scan_tables , images = [])
95
99
tables = []
96
- for scan_table in scan_tables :
100
+ assert len (scan_tables ) == len (tables_cell_properties )
101
+ for scan_table , table_cells_property in zip (scan_tables , tables_cell_properties ):
102
+ cell_properties = [[cellp for cellp in row ] for row in table_cells_property ]
97
103
metadata = TableMetadata (page_id = scan_table .page_number , uid = scan_table .name )
98
104
cells = [[cell for cell in row ] for row in scan_table .matrix_cells ]
99
- table = Table (metadata = metadata , cells = cells )
105
+ table = Table (metadata = metadata , cells = cells , cells_properties = cell_properties )
100
106
tables .append (table )
101
107
102
108
attachments = []
@@ -111,23 +117,26 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
111
117
112
118
return self ._postprocess (result )
113
119
114
def __extract(self, path: str, start_page: Optional[int] = None, end_page: Optional[int] = None) \
        -> Tuple[List[LineWithMeta], List[ScanTable], List[List[List[CellPropertyInfo]]]]:
    """
    Process the PDF file and gather lines, tables and table cell properties from every returned page.

    :param path: path to the PDF file
    :param start_page: passed to ``__process_pdf`` unchanged
    :param end_page: passed to ``__process_pdf`` unchanged
    :return: tuple ``(lines, tables, cell_properties)``; ``cell_properties`` holds one entry per table \
    (in the same order as ``tables``), each entry being a list of rows of :class:`CellPropertyInfo`
    """
    file_hash = calculate_file_hash(path=path)
    document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page)

    all_lines = []
    all_tables = []
    all_cell_properties = []

    for page in document.get("pages", []):
        lines = self.__get_lines_with_location(page, file_hash)
        if lines:
            all_lines.extend(lines)

        tables, cell_properties = self.__get_tables(page, file_hash)
        if tables:
            # tables and their cell properties are extended together so that both lists stay aligned
            all_tables.extend(tables)
            all_cell_properties.extend(cell_properties)

    return all_lines, all_tables, all_cell_properties
128
136
129
137
def __get_tables (self , page : dict , file_hash : str ) -> List [ScanTable ]:
130
138
tables = []
139
+ cell_properties = []
131
140
page_number = page ["number" ]
132
141
i = 0
133
142
for table in page ["tables" ]:
@@ -138,26 +147,44 @@ def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
138
147
y_bottom_right = y_top_left + table ["height" ]
139
148
order = table ["order" ]
140
149
rows = table ["rows" ]
150
+ cell_properties_json = table ["cell_properties" ]
151
+ cell_property_list = []
152
+
153
+ for cell_properties_row in cell_properties_json :
154
+ cell_property_row_list = []
155
+
156
+ for cell_property in cell_properties_row :
157
+ cell_property_info = CellPropertyInfo (cell_property ["col_span" ],
158
+ cell_property ["row_span" ],
159
+ bool (cell_property ["invisible" ]))
160
+
161
+ cell_property_row_list .append (cell_property_info )
162
+
163
+ cell_property_list .append (cell_property_row_list )
164
+
141
165
cells = [row for row in rows ]
142
166
bbox = BBox .from_two_points ((x_top_left , y_top_left ), (x_bottom_right , y_bottom_right ))
143
167
144
168
tables .append (ScanTable (matrix_cells = cells , page_number = page_number , bbox = bbox , name = file_hash + str (page_number ) + str (i ), order = order ))
169
+ cell_properties .append (cell_property_list )
145
170
146
- return tables
171
+ return tables , cell_properties
147
172
148
173
def __get_lines_with_location (self , page : dict , file_hash : str ) -> List [LineWithLocation ]:
149
174
lines = []
150
175
page_number = page ["number" ]
176
+ page_width = int (page ["width" ])
177
+ page_height = int (page ["height" ])
151
178
prev_line = None
152
179
153
180
for block in page ["blocks" ]:
154
181
annotations = []
155
182
order = block ["order" ]
156
183
block_text = block ["text" ]
157
- bx_top_left = block ["x_top_left" ]
158
- by_top_left = block ["y_top_left" ]
159
- bx_bottom_right = bx_top_left + block ["width" ]
160
- by_bottom_right = by_top_left + block ["height" ]
184
+ bx_top_left = int ( block ["x_top_left" ])
185
+ by_top_left = int ( block ["y_top_left" ])
186
+ bx_bottom_right = bx_top_left + int ( block ["width" ])
187
+ by_bottom_right = by_top_left + int ( block ["height" ])
161
188
indent = block ["indent" ]
162
189
spacing = block ["spacing" ]
163
190
len_block = len (block_text )
@@ -173,7 +200,12 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
173
200
url = annotation ["url" ]
174
201
start = annotation ["start" ]
175
202
end = annotation ["end" ]
176
-
203
+ x_top_left = int (annotation ["x_top_left" ])
204
+ y_top_left = int (annotation ["y_top_left" ])
205
+ x_bottom_right = bx_top_left + int (annotation ["width" ])
206
+ y_bottom_right = by_top_left + int (annotation ["height" ])
207
+ box = BBox .from_two_points ((x_top_left , y_top_left ), (x_bottom_right , y_bottom_right ))
208
+ annotations .append (BBoxAnnotation (start , end , box , page_width = page_width , page_height = page_height ))
177
209
annotations .append (SizeAnnotation (start , end , str (font_size )))
178
210
annotations .append (StyleAnnotation (start , end , font_name ))
179
211
@@ -189,6 +221,7 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
189
221
meta = block ["metadata" ].lower ()
190
222
uid = f"txt_{ file_hash } _{ order } "
191
223
bbox = BBox .from_two_points ((bx_top_left , by_top_left ), (bx_bottom_right , by_bottom_right ))
224
+ annotations .append (BBoxAnnotation (0 , len_block , bbox , page_width = page_width , page_height = page_height ))
192
225
193
226
metadata = LineMetadata (page_id = page_number , line_id = order )
194
227
line_with_location = LineWithLocation (line = block_text ,
0 commit comments