Skip to content

Commit 0b19ba4

Browse files
dronperminov, NastyBoget, alexander1999-hub, Alexander Golodkov
authored
update master (#297)
* TLDR-387 fix bug for diplomas with insert_table=true (#289)
* Tldr 353 refactor detect text correction (#290)
* TLDR-353 start refactoring
* TLDR-353 pdf_auto_reader refactoring
* TLDR-353 tests fixed
* TLDR-384 change PdfAutoReader logic and fix PDF slicing (#292)
* TLDR-384 change PdfAutoReader logic and fix page slicing and logging for pdf
* TLDR-384 review fixes
* TLDR-379 update bold classifier (#294)
* add bold classifier based on rules
* replace nn classifier with ruled classifier
* remove downloading deleted model
* fix style tests
* review fixes
* add test for bold classifier
* TLDR-293 Tests Refactoring (#293)
* added comments with suggestions for refactoring and fixed some tests
* refactored another batch of files
* refactored one more batch of files
* refactored one more batch of files
* renamed files
* deleted unused data files and old unit tests
* fixed code according to comments
* fixed test for font classifier
---------
Co-authored-by: Alexander Golodkov <golodkov@ispras.ru>
* TLDR-413 fix bug in models downloading inside docker container (#295)
* TLDR-413 fix bug in models downloading inside docker container
* TLDR-413 review fixes
* new version 0.9.2 (#296)
---------
Co-authored-by: Bogatenkova Anastasiya <bogatenkova.anastasiya@mail.ru>
Co-authored-by: Alexander Golodkov <55749660+alexander1999-hub@users.noreply.github.com>
Co-authored-by: Alexander Golodkov <golodkov@ispras.ru>
1 parent a2e8feb commit 0b19ba4

File tree

114 files changed

+831
-1589
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

114 files changed

+831
-1589
lines changed

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.9.1
1+
0.9.2

dedoc/config.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,12 @@
99
format="%(asctime)s - %(pathname)s - %(levelname)s - %(message)s")
1010

1111
DEBUG_MODE = False
12+
RESOURCES_PATH = os.environ.get('RESOURCES_PATH', os.path.join(os.path.expanduser('~'), ".cache", "dedoc", "resources"))
1213

1314
_config = dict(
1415
# -----------------------------------------RESOURCES PATH SETTINGS----------------------------------------------------
15-
resources_path=os.path.join(os.path.expanduser('~'), ".cache", "dedoc", "resources"),
16-
intermediate_data_path=os.path.join(os.path.expanduser('~'), ".cache", "dedoc", "resources", "datasets"),
16+
resources_path=RESOURCES_PATH,
17+
intermediate_data_path=os.path.join(RESOURCES_PATH, "datasets"),
1718

1819
# -----------------------------------------COMMON DEBUG SETTINGS----------------------------------------------------
1920
debug_mode=DEBUG_MODE,

dedoc/download_models.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str
2828

2929
def download(resources_path: str) -> None:
3030
download_from_hub(out_dir=resources_path,
31-
out_name="catboost_detect_tl_correctness.pth",
31+
out_name="catboost_detect_tl_correctness.pkl.gz",
3232
repo_name="catboost_detect_tl_correctness",
3333
hub_name="model.pkl.gz")
3434

@@ -37,11 +37,6 @@ def download(resources_path: str) -> None:
3737
repo_name="scan_orientation_efficient_net_b0",
3838
hub_name="model.pth")
3939

40-
download_from_hub(out_dir=resources_path,
41-
out_name="font_classifier.pth",
42-
repo_name="font_classifier",
43-
hub_name="model.pth")
44-
4540
download_from_hub(out_dir=resources_path,
4641
out_name="paragraph_classifier.pkl.gz",
4742
repo_name="paragraph_classifier",

dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py

Lines changed: 66 additions & 140 deletions
Original file line numberDiff line numberDiff line change
@@ -2,26 +2,20 @@
22
import logging
33
import os
44
from itertools import chain
5-
from typing import Optional, Tuple, List
6-
import numpy as np
5+
from typing import Optional
76

8-
from dedoc.common.exceptions.bad_file_exception import BadFileFormatException
9-
from dedoc.config import get_config
107
from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
118
from dedoc.data_structures.line_with_meta import LineWithMeta
129
from dedoc.data_structures.unstructured_document import UnstructuredDocument
1310
from dedoc.extensions import recognized_mimes
1411
from dedoc.readers.base_reader import BaseReader
15-
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
16-
from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer
17-
from dedoc.utils.pdf_utils import get_page_slice, get_page_image, get_pdf_page_count
18-
from dedoc.readers.pdf_reader.pdf_image_reader.columns_orientation_classifier.columns_orientation_classifier import ColumnsOrientationClassifier
12+
from dedoc.readers.pdf_reader.pdf_auto_reader.txtlayer_detector import TxtLayerDetector
1913
from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
14+
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
2015
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
21-
from dedoc.readers.pdf_reader.pdf_auto_reader.pdf_txtlayer_correctness import PdfTextLayerCorrectness
16+
from dedoc.utils.parameter_utils import get_param_pdf_with_txt_layer, get_param_page_slice
2217

2318

24-
# TODO delete parameter is_one_column_document_list
2519
class PdfAutoReader(BaseReader):
2620
"""
2721
This class allows to extract content from the .pdf documents of any kind.
@@ -40,23 +34,13 @@ def __init__(self, *, config: dict) -> None:
4034
"""
4135
:param config: configuration of the reader, e.g. logger for logging
4236
"""
43-
self.pdf_parser = PdfTxtlayerReader(config=config)
44-
self.tabby_parser = PdfTabbyReader(config=config)
37+
self.pdf_txtlayer_reader = PdfTxtlayerReader(config=config)
38+
self.pdf_tabby_reader = PdfTabbyReader(config=config)
4539
self.pdf_image_reader = PdfImageReader(config=config)
40+
self.txtlayer_detector = TxtLayerDetector(pdf_txtlayer_reader=self.pdf_txtlayer_reader, pdf_tabby_reader=self.pdf_tabby_reader, config=config)
41+
4642
self.config = config
4743
self.logger = config.get("logger", logging.getLogger())
48-
self.__checkpoint_path = get_config()["resources_path"]
49-
self._orientation_classifier = None
50-
self.pdf_correctness = PdfTextLayerCorrectness(config=config)
51-
52-
@property
53-
def orientation_classifier(self) -> ColumnsOrientationClassifier:
54-
if self._orientation_classifier is None:
55-
self._orientation_classifier = ColumnsOrientationClassifier(on_gpu=False,
56-
checkpoint_path=self.__checkpoint_path,
57-
delete_lines=False,
58-
config=self.config)
59-
return self._orientation_classifier
6044

6145
def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
6246
"""
@@ -69,99 +53,88 @@ def can_read(self, path: str, mime: str, extension: str, document_type: Optional
6953
7054
Look to the documentation of :meth:`~dedoc.readers.BaseReader.can_read` to get information about the method's parameters.
7155
"""
72-
parameters = {} if parameters is None else parameters
73-
74-
is_pdf = mime in recognized_mimes.pdf_like_format
75-
if not is_pdf:
56+
if mime not in recognized_mimes.pdf_like_format:
7657
return False
7758

59+
parameters = {} if parameters is None else parameters
7860
pdf_with_txt_layer = get_param_pdf_with_txt_layer(parameters)
79-
return is_pdf and pdf_with_txt_layer in ("auto", "auto_tabby")
61+
return pdf_with_txt_layer in ("auto", "auto_tabby")
8062

8163
def read(self, path: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> UnstructuredDocument:
8264
"""
8365
The method return document content with all document's lines, tables and attachments.
8466
This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
8567
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
8668
"""
87-
pdf_with_txt_layer = get_param_pdf_with_txt_layer(parameters)
8869
warnings = []
70+
txtlayer_parameters = self.txtlayer_detector.detect_txtlayer(path=path, parameters=parameters)
8971

90-
is_one_column_document_list, warning_list = self.__get_one_column_document(parameters, path=path)
91-
parameters["is_one_column_document_list"] = is_one_column_document_list
92-
parameters_copy = copy.deepcopy(parameters)
93-
parameters_copy["is_one_column_document"] = "true" if is_one_column_document_list[0] else "false"
94-
for warning in warning_list:
95-
if warning is not None:
96-
warnings.append(warning)
97-
text_layer_parameters = self.pdf_correctness.with_text_layer(path=path,
98-
parameters=parameters,
99-
is_one_column_list=is_one_column_document_list)
100-
is_booklet = text_layer_parameters.is_booklet
101-
pdf_with_text_layer = text_layer_parameters.correct_text_layout
102-
is_first_page_correct = text_layer_parameters.correct_first_page
103-
104-
if is_booklet:
105-
message = "assume document is booklet"
106-
warnings.append(message)
107-
self.logger.warning(message + " " + os.path.basename(path))
108-
109-
if pdf_with_text_layer:
110-
result = self._handle_correct_layer(document_type=document_type,
111-
is_first_page_correct=is_first_page_correct,
112-
parameters=parameters,
113-
parameters_copy=parameters_copy,
114-
path=path,
115-
warnings=warnings,
116-
pdf_with_txt_layer=pdf_with_txt_layer)
72+
if txtlayer_parameters.is_correct_text_layer:
73+
result = self.__handle_correct_text_layer(is_first_page_correct=txtlayer_parameters.is_first_page_correct,
74+
parameters=parameters,
75+
path=path,
76+
warnings=warnings)
11777
else:
118-
result = self._handle_incorrect_text_layer(document_type, parameters_copy, path, warnings)
119-
parameters_copy["pdf_with_text_layer"] = str(pdf_with_text_layer)
78+
result = self.__handle_incorrect_text_layer(parameters, path, warnings)
12079

12180
result.warnings.extend(warnings)
12281
return result
12382

124-
def _handle_incorrect_text_layer(self, document_type: str, parameters_copy: dict, path: str, warnings: list) -> UnstructuredDocument:
125-
message = "assume document has incorrect text layer"
126-
warnings.append(message)
127-
warnings.append(message + " " + os.path.basename(path))
128-
self.logger.info(message.format(os.path.basename(path)))
129-
result = self.pdf_image_reader.read(path=path, document_type=document_type, parameters=parameters_copy)
83+
def __handle_incorrect_text_layer(self, parameters_copy: dict, path: str, warnings: list) -> UnstructuredDocument:
84+
self.logger.info(f"Assume document {os.path.basename(path)} has incorrect textual layer")
85+
warnings.append("Assume document has incorrect textual layer")
86+
result = self.pdf_image_reader.read(path=path, parameters=parameters_copy)
13087
return result
13188

132-
def _handle_correct_layer(self,
133-
document_type: str,
134-
is_first_page_correct: bool,
135-
parameters: dict,
136-
parameters_copy: dict,
137-
path: str,
138-
pdf_with_txt_layer: str,
139-
warnings: list) -> UnstructuredDocument:
140-
message = "assume {} has correct text layer"
141-
self.logger.info(message.format(os.path.basename(path)))
142-
warnings.append(message.format("document"))
143-
prefix = None
89+
def __handle_correct_text_layer(self,
90+
is_first_page_correct: bool,
91+
parameters: dict,
92+
path: str,
93+
warnings: list) -> UnstructuredDocument:
94+
self.logger.info(f"Assume document {os.path.basename(path)} has a correct textual layer")
95+
warnings.append("Assume document has a correct textual layer")
96+
recognized_first_page = None
97+
14498
if not is_first_page_correct:
145-
message = "assume first page has no text layer"
99+
message = "Assume the first page hasn't a textual layer"
146100
warnings.append(message)
147101
self.logger.info(message)
148-
first_page, last_page = get_page_slice(parameters_copy)
149-
first_page = 1 if first_page is None else first_page + 1
150-
last_page = 1
151-
scan_parameters = copy.deepcopy(parameters)
152-
scan_parameters["pages"] = f"{first_page}:{last_page}"
153-
prefix = self.pdf_image_reader.read(path=path, document_type=document_type, parameters=scan_parameters)
154-
reader = self.pdf_parser if pdf_with_txt_layer == "auto" else self.tabby_parser
155-
if not is_first_page_correct:
156-
first_page, last_page = get_page_slice(parameters_copy)
157-
first_page = 2 if first_page is None else first_page + 1
158-
last_page = "" if last_page is None else last_page
159-
parameters_copy["pages"] = f"{first_page}:{last_page}"
160-
result = reader.read(path=path, document_type=document_type, parameters=parameters_copy)
161-
result = self._merge_documents(prefix, result) if prefix is not None else result
102+
103+
# GET THE FIRST PAGE: recognize the first page like a scanned page
104+
scan_parameters = self.__preparing_first_page_parameters(parameters)
105+
recognized_first_page = self.pdf_image_reader.read(path=path, parameters=scan_parameters)
106+
107+
# PREPARE PARAMETERS: from the second page we recognize the content like PDF with a textual layer
108+
parameters = self.__preparing_other_pages_parameters(parameters)
109+
110+
pdf_with_txt_layer = get_param_pdf_with_txt_layer(parameters)
111+
reader = self.pdf_txtlayer_reader if pdf_with_txt_layer == "auto" else self.pdf_tabby_reader
112+
result = reader.read(path=path, parameters=parameters)
113+
result = self.__merge_documents(recognized_first_page, result) if recognized_first_page is not None else result
162114
return result
163115

164-
def _merge_documents(self, first: UnstructuredDocument, second: UnstructuredDocument) -> UnstructuredDocument:
116+
def __preparing_first_page_parameters(self, parameters: dict) -> dict:
117+
first_page, last_page = get_param_page_slice(parameters)
118+
# calculate indexes for the first page parsing
119+
first_page_index = 0 if first_page is None else first_page
120+
last_page_index = 0
121+
scan_parameters = copy.deepcopy(parameters)
122+
123+
# page numeration in parameters starts with 1, both ends are included
124+
scan_parameters["pages"] = f"{first_page_index + 1}:{last_page_index + 1}"
125+
# if the first page != 0 then we won't read it (because first_page_index > last_page_index)
126+
return scan_parameters
127+
128+
def __preparing_other_pages_parameters(self, parameters: dict) -> dict:
129+
first_page, last_page = get_param_page_slice(parameters)
130+
# parameters for reading pages from the second page
131+
first_page_index = 1 if first_page is None else first_page
132+
last_page_index = "" if last_page is None else last_page
133+
parameters["pages"] = f"{first_page_index + 1}:{last_page_index}"
134+
135+
return parameters
136+
137+
def __merge_documents(self, first: UnstructuredDocument, second: UnstructuredDocument) -> UnstructuredDocument:
165138
tables = first.tables
166139
dropped_tables = set()
167140
for table in second.tables:
@@ -183,50 +156,3 @@ def _merge_documents(self, first: UnstructuredDocument, second: UnstructuredDocu
183156
lines=lines,
184157
attachments=first.attachments + second.attachments,
185158
metadata=second.metadata)
186-
187-
def __get_one_column_document(self, parameters: Optional[dict], path: str) -> Tuple[List[bool], List[Optional[str]]]:
188-
if parameters is None:
189-
parameters = {}
190-
is_one_column_document = str(parameters.get("is_one_column_document", "auto"))
191-
page_count = get_pdf_page_count(path)
192-
if is_one_column_document.lower() != "auto":
193-
return [is_one_column_document.lower() == "true" for _ in range(page_count)], [None]
194-
195-
if page_count is None:
196-
return self._get_page_is_one_columns_list(path=path, start=0, stop=1)[0], [None]
197-
page_check_count = min(3, page_count)
198-
is_one_columns_list, warnings = self._get_page_is_one_columns_list(path=path, start=0, stop=page_check_count)
199-
if page_count == page_check_count:
200-
self.logger.info(warnings)
201-
return is_one_columns_list, warnings
202-
203-
if is_one_columns_list[1] == is_one_columns_list[2]:
204-
is_one_columns_list.extend(is_one_columns_list[1] for _ in range(page_count - page_check_count))
205-
warnings_count = min(5, page_count)
206-
for i in range(page_check_count, warnings_count):
207-
warning = warnings[2].replace("page " + str(page_check_count - 1), "page " + str(i))
208-
warnings.append(warning)
209-
else:
210-
is_one_columns, warnings_next = self._get_page_is_one_columns_list(path=path, start=page_check_count,
211-
stop=page_count)
212-
is_one_columns_list += is_one_columns
213-
warnings += warnings_next[:5]
214-
self.logger.info(warnings)
215-
return is_one_columns_list, warnings
216-
217-
def _get_page_is_one_columns_list(self, path: str, start: int, stop: int) -> Tuple[List[bool], List[Optional[str]]]:
218-
is_one_columns_list = []
219-
warnings = []
220-
for page_id in range(start, stop):
221-
try:
222-
image = get_page_image(path=path, page_id=page_id)
223-
if image is None:
224-
return [False], ["fail to read image from pdf"]
225-
except Exception as ex:
226-
self.logger.warning("It seems the input PDF-file is uncorrected")
227-
raise BadFileFormatException(msg=f"It seems the input PDF-file is uncorrected. Exception: {ex}")
228-
229-
columns, _ = self.orientation_classifier.predict(np.array(image))
230-
is_one_columns_list.append(columns == 1)
231-
warnings.append("assume page {} has {} columns".format(page_id, columns))
232-
return is_one_columns_list, warnings

0 commit comments

Comments
 (0)