2
2
import logging
3
3
import os
4
4
from itertools import chain
5
- from typing import Optional , Tuple , List
6
- import numpy as np
5
+ from typing import Optional
7
6
8
- from dedoc .common .exceptions .bad_file_exception import BadFileFormatException
9
- from dedoc .config import get_config
10
7
from dedoc .data_structures .concrete_annotations .table_annotation import TableAnnotation
11
8
from dedoc .data_structures .line_with_meta import LineWithMeta
12
9
from dedoc .data_structures .unstructured_document import UnstructuredDocument
13
10
from dedoc .extensions import recognized_mimes
14
11
from dedoc .readers .base_reader import BaseReader
15
- from dedoc .readers .pdf_reader .pdf_txtlayer_reader .pdf_tabby_reader import PdfTabbyReader
16
- from dedoc .utils .parameter_utils import get_param_pdf_with_txt_layer
17
- from dedoc .utils .pdf_utils import get_page_slice , get_page_image , get_pdf_page_count
18
- from dedoc .readers .pdf_reader .pdf_image_reader .columns_orientation_classifier .columns_orientation_classifier import ColumnsOrientationClassifier
12
+ from dedoc .readers .pdf_reader .pdf_auto_reader .txtlayer_detector import TxtLayerDetector
19
13
from dedoc .readers .pdf_reader .pdf_image_reader .pdf_image_reader import PdfImageReader
14
+ from dedoc .readers .pdf_reader .pdf_txtlayer_reader .pdf_tabby_reader import PdfTabbyReader
20
15
from dedoc .readers .pdf_reader .pdf_txtlayer_reader .pdf_txtlayer_reader import PdfTxtlayerReader
21
- from dedoc .readers . pdf_reader . pdf_auto_reader . pdf_txtlayer_correctness import PdfTextLayerCorrectness
16
+ from dedoc .utils . parameter_utils import get_param_pdf_with_txt_layer , get_param_page_slice
22
17
23
18
24
- # TODO delete parameter is_one_column_document_list
25
19
class PdfAutoReader (BaseReader ):
26
20
"""
27
21
This class allows to extract content from the .pdf documents of any kind.
@@ -40,23 +34,13 @@ def __init__(self, *, config: dict) -> None:
40
34
"""
41
35
:param config: configuration of the reader, e.g. logger for logging
42
36
"""
43
- self .pdf_parser = PdfTxtlayerReader (config = config )
44
- self .tabby_parser = PdfTabbyReader (config = config )
37
+ self .pdf_txtlayer_reader = PdfTxtlayerReader (config = config )
38
+ self .pdf_tabby_reader = PdfTabbyReader (config = config )
45
39
self .pdf_image_reader = PdfImageReader (config = config )
40
+ self .txtlayer_detector = TxtLayerDetector (pdf_txtlayer_reader = self .pdf_txtlayer_reader , pdf_tabby_reader = self .pdf_tabby_reader , config = config )
41
+
46
42
self .config = config
47
43
self .logger = config .get ("logger" , logging .getLogger ())
48
- self .__checkpoint_path = get_config ()["resources_path" ]
49
- self ._orientation_classifier = None
50
- self .pdf_correctness = PdfTextLayerCorrectness (config = config )
51
-
52
- @property
53
- def orientation_classifier (self ) -> ColumnsOrientationClassifier :
54
- if self ._orientation_classifier is None :
55
- self ._orientation_classifier = ColumnsOrientationClassifier (on_gpu = False ,
56
- checkpoint_path = self .__checkpoint_path ,
57
- delete_lines = False ,
58
- config = self .config )
59
- return self ._orientation_classifier
60
44
61
45
def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
    """
    Check if the document is a PDF-like file that should be handled by the automatic reader selection.

    The reader is applicable only when the mime type belongs to the PDF-like formats
    and the `pdf_with_txt_layer` parameter is set to "auto" or "auto_tabby".

    Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
    """
    is_pdf_like = mime in recognized_mimes.pdf_like_format
    if not is_pdf_like:
        return False

    # a missing parameters dictionary is treated as an empty one
    effective_parameters = parameters if parameters is not None else {}
    txt_layer_mode = get_param_pdf_with_txt_layer(effective_parameters)
    return txt_layer_mode in ("auto", "auto_tabby")
80
62
81
63
def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument:
    """
    Return the document content with all document's lines, tables and attachments.
    This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
    Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.

    The textual layer detector decides which underlying reader is used: a textual-layer reader when
    the layer is considered correct, the image (scanned document) reader otherwise.
    """
    # `parameters` is optional in the signature, but the page-slice helpers of this class
    # assign into it (parameters["pages"] = ...), so None is normalized to an empty dictionary
    # in the same way as can_read does
    parameters = {} if parameters is None else parameters

    warnings = []
    txtlayer_parameters = self.txtlayer_detector.detect_txtlayer(path=path, parameters=parameters)

    if txtlayer_parameters.is_correct_text_layer:
        result = self.__handle_correct_text_layer(is_first_page_correct=txtlayer_parameters.is_first_page_correct,
                                                  parameters=parameters,
                                                  path=path,
                                                  warnings=warnings)
    else:
        result = self.__handle_incorrect_text_layer(parameters, path, warnings)

    # warnings collected during the reader selection are attached to the resulting document
    result.warnings.extend(warnings)
    return result
123
82
124
- def _handle_incorrect_text_layer (self , document_type : str , parameters_copy : dict , path : str , warnings : list ) -> UnstructuredDocument :
125
- message = "assume document has incorrect text layer"
126
- warnings .append (message )
127
- warnings .append (message + " " + os .path .basename (path ))
128
- self .logger .info (message .format (os .path .basename (path )))
129
- result = self .pdf_image_reader .read (path = path , document_type = document_type , parameters = parameters_copy )
83
def __handle_incorrect_text_layer(self, parameters_copy: dict, path: str, warnings: list) -> UnstructuredDocument:
    """
    Read the document with the image reader because its textual layer is considered incorrect.

    :param parameters_copy: parameters passed to the image reader as is
    :param path: path to the PDF file
    :param warnings: list of warnings, a note about the incorrect textual layer is appended to it
    :return: the document returned by the image reader
    """
    file_name = os.path.basename(path)
    self.logger.info(f"Assume document {file_name} has incorrect textual layer")
    warnings.append("Assume document has incorrect textual layer")
    return self.pdf_image_reader.read(path=path, parameters=parameters_copy)
131
88
132
- def _handle_correct_layer (self ,
133
- document_type : str ,
134
- is_first_page_correct : bool ,
135
- parameters : dict ,
136
- parameters_copy : dict ,
137
- path : str ,
138
- pdf_with_txt_layer : str ,
139
- warnings : list ) -> UnstructuredDocument :
140
- message = "assume {} has correct text layer"
141
- self .logger .info (message .format (os .path .basename (path )))
142
- warnings .append (message .format ("document" ))
143
- prefix = None
89
def __handle_correct_text_layer(self,
                                is_first_page_correct: bool,
                                parameters: dict,
                                path: str,
                                warnings: list) -> UnstructuredDocument:
    """
    Read the document with a textual-layer reader; the first page is read with the image reader
    when its textual layer is considered incorrect.

    :param is_first_page_correct: whether the first page has a correct textual layer
    :param parameters: parameters of the current reading request
    :param path: path to the PDF file
    :param warnings: list of warnings, notes about the made assumptions are appended to it
    :return: the document from the textual-layer reader, merged with the scanned first page if there is one
    """
    self.logger.info(f"Assume document {os.path.basename(path)} has a correct textual layer")
    warnings.append("Assume document has a correct textual layer")

    scanned_first_page = None
    if not is_first_page_correct:
        first_page_note = "Assume the first page hasn't a textual layer"
        warnings.append(first_page_note)
        self.logger.info(first_page_note)

        # the first page is recognized as a scanned page
        first_page_parameters = self.__preparing_first_page_parameters(parameters)
        scanned_first_page = self.pdf_image_reader.read(path=path, parameters=first_page_parameters)

        # the remaining pages are read as a PDF with a textual layer
        parameters = self.__preparing_other_pages_parameters(parameters)

    # "auto" selects the plain textual-layer reader, any other value selects the tabby reader
    if get_param_pdf_with_txt_layer(parameters) == "auto":
        reader = self.pdf_txtlayer_reader
    else:
        reader = self.pdf_tabby_reader

    text_layer_document = reader.read(path=path, parameters=parameters)
    if scanned_first_page is None:
        return text_layer_document
    return self.__merge_documents(scanned_first_page, text_layer_document)
163
115
164
- def _merge_documents (self , first : UnstructuredDocument , second : UnstructuredDocument ) -> UnstructuredDocument :
116
def __preparing_first_page_parameters(self, parameters: dict) -> dict:
    """
    Build the parameters for reading only the first page of the document.

    :param parameters: parameters of the current reading request, they are not modified
    :return: a deep copy of the parameters with the "pages" value restricted to the first page
    """
    requested_first_page, _ = get_param_page_slice(parameters)
    start_index = requested_first_page if requested_first_page is not None else 0
    end_index = 0

    first_page_parameters = copy.deepcopy(parameters)
    # the "pages" parameter is numbered from 1 and includes both ends;
    # when the requested slice doesn't start from the first page, start > end and nothing is read
    first_page_parameters["pages"] = f"{start_index + 1}:{end_index + 1}"
    return first_page_parameters
127
+
128
def __preparing_other_pages_parameters(self, parameters: dict) -> dict:
    """
    Build the parameters for reading the pages that follow the first page of the document.

    :param parameters: parameters of the current reading request, they are not modified
    :return: a copy of the parameters whose "pages" slice starts not earlier than the second page
    """
    first_page, last_page = get_param_page_slice(parameters)

    # first_page is used as a zero-based index (it is converted with "+ 1" below, the "pages" parameter is one-based).
    # This method is used when the first page is read separately as a scanned page,
    # so the slice must not start before index 1, otherwise the first page would be read twice
    first_page_index = 1 if first_page is None else max(first_page, 1)
    last_page_index = "" if last_page is None else last_page

    # shallow copy is enough: only the "pages" key is rebound, the caller's dictionary stays untouched
    other_pages_parameters = dict(parameters)
    other_pages_parameters["pages"] = f"{first_page_index + 1}:{last_page_index}"
    return other_pages_parameters
136
+
137
+ def __merge_documents (self , first : UnstructuredDocument , second : UnstructuredDocument ) -> UnstructuredDocument :
165
138
tables = first .tables
166
139
dropped_tables = set ()
167
140
for table in second .tables :
@@ -183,50 +156,3 @@ def _merge_documents(self, first: UnstructuredDocument, second: UnstructuredDocu
183
156
lines = lines ,
184
157
attachments = first .attachments + second .attachments ,
185
158
metadata = second .metadata )
186
-
187
- def __get_one_column_document (self , parameters : Optional [dict ], path : str ) -> Tuple [List [bool ], List [Optional [str ]]]:
188
- if parameters is None :
189
- parameters = {}
190
- is_one_column_document = str (parameters .get ("is_one_column_document" , "auto" ))
191
- page_count = get_pdf_page_count (path )
192
- if is_one_column_document .lower () != "auto" :
193
- return [is_one_column_document .lower () == "true" for _ in range (page_count )], [None ]
194
-
195
- if page_count is None :
196
- return self ._get_page_is_one_columns_list (path = path , start = 0 , stop = 1 )[0 ], [None ]
197
- page_check_count = min (3 , page_count )
198
- is_one_columns_list , warnings = self ._get_page_is_one_columns_list (path = path , start = 0 , stop = page_check_count )
199
- if page_count == page_check_count :
200
- self .logger .info (warnings )
201
- return is_one_columns_list , warnings
202
-
203
- if is_one_columns_list [1 ] == is_one_columns_list [2 ]:
204
- is_one_columns_list .extend (is_one_columns_list [1 ] for _ in range (page_count - page_check_count ))
205
- warnings_count = min (5 , page_count )
206
- for i in range (page_check_count , warnings_count ):
207
- warning = warnings [2 ].replace ("page " + str (page_check_count - 1 ), "page " + str (i ))
208
- warnings .append (warning )
209
- else :
210
- is_one_columns , warnings_next = self ._get_page_is_one_columns_list (path = path , start = page_check_count ,
211
- stop = page_count )
212
- is_one_columns_list += is_one_columns
213
- warnings += warnings_next [:5 ]
214
- self .logger .info (warnings )
215
- return is_one_columns_list , warnings
216
-
217
- def _get_page_is_one_columns_list (self , path : str , start : int , stop : int ) -> Tuple [List [bool ], List [Optional [str ]]]:
218
- is_one_columns_list = []
219
- warnings = []
220
- for page_id in range (start , stop ):
221
- try :
222
- image = get_page_image (path = path , page_id = page_id )
223
- if image is None :
224
- return [False ], ["fail to read image from pdf" ]
225
- except Exception as ex :
226
- self .logger .warning ("It seems the input PDF-file is uncorrected" )
227
- raise BadFileFormatException (msg = f"It seems the input PDF-file is uncorrected. Exception: { ex } " )
228
-
229
- columns , _ = self .orientation_classifier .predict (np .array (image ))
230
- is_one_columns_list .append (columns == 1 )
231
- warnings .append ("assume page {} has {} columns" .format (page_id , columns ))
232
- return is_one_columns_list , warnings
0 commit comments