Skip to content

Commit 8a2678c

Browse files
alexander1999-hub, Alexander Golodkov, dronperminov, NastyBoget
authored
update master (#472)
Co-authored-by: Alexander Golodkov <golodkov@ispras.ru>
Co-authored-by: Andrew Perminov <perminov@ispras.ru>
Co-authored-by: Bogatenkova Anastasiya <bogatenkova.anastasiya@mail.ru>
1 parent 5750d57 commit 8a2678c

34 files changed

+957
-119
lines changed

.flake8

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,3 +49,4 @@ per-file-ignores =
4949
scripts/benchmark_pdf_performance*:JS101
5050
tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
5151
docs/source/_static/code_examples/*:I251
52+
docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251

Dockerfile

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,22 @@
11
ARG REPOSITORY="docker.io"
22
FROM dedocproject/dedoc_p3.9_base:version_2023_08_28
33
ARG LANGUAGES=""
4-
RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$lang; done
4+
RUN for lang in $LANGUAGES; do apt install -y tesseract-ocr-$(echo $lang | tr "_" "-"); done
55

66
ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root"
77
ENV RESOURCES_PATH "/dedoc_root/resources"
88

9-
ADD requirements.txt .
9+
COPY requirements.txt .
1010
RUN pip3 install --no-cache-dir -r requirements.txt
1111

1212
RUN mkdir /dedoc_root
1313
RUN mkdir /dedoc_root/dedoc
14-
ADD dedoc/config.py /dedoc_root/dedoc/config.py
15-
ADD dedoc/download_models.py /dedoc_root/dedoc/download_models.py
14+
COPY dedoc/config.py /dedoc_root/dedoc/config.py
15+
COPY dedoc/download_models.py /dedoc_root/dedoc/download_models.py
1616
RUN python3 /dedoc_root/dedoc/download_models.py
1717

18-
ADD dedoc /dedoc_root/dedoc
19-
ADD VERSION /dedoc_root
18+
COPY dedoc /dedoc_root/dedoc
19+
COPY VERSION /dedoc_root
2020
RUN echo "__version__ = \"$(cat /dedoc_root/VERSION)\"" > /dedoc_root/dedoc/version.py
2121

22-
ADD tests /dedoc_root/tests
23-
ADD resources /dedoc_root/resources
24-
25-
CMD ["python3", "/dedoc_root/dedoc/main.py"]
22+
CMD [ "python3", "/dedoc_root/dedoc/main.py" ]

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.2.4
1+
2.2.5

dedoc/download_models.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,12 @@
55
Keys are the names of repositories with models.
66
"""
77
model_hash_dict = dict(
8-
txtlayer_classifier="94e27e184fa2876883d260e0aa58b042e6ab3e35",
8+
txtlayer_classifier="9ca1de749d8d37147b00a3a228e03ee1776c695f",
99
scan_orientation_efficient_net_b0="9ea283f3d346ae4fdd82463a9f60b5369a3ffb58",
1010
font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
11-
paragraph_classifier="00bf989876cec171c1cf9859a6b712af6445e864",
12-
line_type_classifiers="2e498d1ec82b72c1a96ba0d25344b71402997013",
13-
fintoc_classifiers="42f8ada99a5da608139b078c93bebfffc5b30263"
11+
paragraph_classifier="c26a10193499d3cbc77ffec9842bece24fa8950b",
12+
line_type_classifiers="0568c6e1f49612c0c351f10b80a26dc05f796683",
13+
fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8"
1414
)
1515

1616

@@ -27,29 +27,29 @@ def download_from_hub(out_dir: str, out_name: str, repo_name: str, hub_name: str
2727
def download(resources_path: str) -> None:
2828
import os
2929

30-
download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.pkl.gz", repo_name="txtlayer_classifier", hub_name="model.pkl.gz")
30+
download_from_hub(out_dir=resources_path, out_name="txtlayer_classifier.json", repo_name="txtlayer_classifier", hub_name="model.json")
3131

3232
download_from_hub(out_dir=resources_path,
3333
out_name="scan_orientation_efficient_net_b0.pth",
3434
repo_name="scan_orientation_efficient_net_b0",
3535
hub_name="model.pth")
3636

37-
download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.pkl.gz", repo_name="paragraph_classifier", hub_name="model.pkl.gz")
37+
download_from_hub(out_dir=resources_path, out_name="paragraph_classifier.zip", repo_name="paragraph_classifier", hub_name="model.zip")
3838

3939
line_clf_resources_path = os.path.join(resources_path, "line_type_classifiers")
4040
for classifier_type in ("diploma", "law", "law_txt", "tz", "tz_txt"):
4141
download_from_hub(out_dir=line_clf_resources_path,
42-
out_name=f"{classifier_type}_classifier.pkl.gz",
42+
out_name=f"{classifier_type}_classifier.zip",
4343
repo_name="line_type_classifiers",
44-
hub_name=f"{classifier_type}.pkl.gz")
44+
hub_name=f"{classifier_type}.zip")
4545

4646
fintoc_classifiers_resources_path = os.path.join(resources_path, "fintoc_classifiers")
4747
for language in ("en", "fr", "sp"):
4848
for classifier_type in ("target", "binary"):
4949
download_from_hub(out_dir=fintoc_classifiers_resources_path,
50-
out_name=f"{classifier_type}_classifier_{language}.pkg.gz",
50+
out_name=f"{classifier_type}_classifier_{language}.json",
5151
repo_name="fintoc_classifiers",
52-
hub_name=f"{classifier_type}_classifier_{language}_txt_layer.pkg.gz")
52+
hub_name=f"{classifier_type}_classifier_{language}_txt_layer.json")
5353

5454

5555
if __name__ == "__main__":

dedoc/extensions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020

2121
converted_extensions = Extensions(
22-
excel_like_format={".ods", "xls"},
22+
excel_like_format={".ods", ".xls"},
2323
docx_like_format={".odt", ".doc", ".rtf"},
2424
pptx_like_format={".odp", ".ppt"},
2525
html_like_format={},

dedoc/readers/email_reader/email_reader.py

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,13 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
4444
import os
4545
import uuid
4646
from dedoc.data_structures.attached_file import AttachedFile
47-
from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis
47+
from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments
4848
from dedoc.utils.utils import get_unique_name
4949

5050
parameters = {} if parameters is None else parameters
5151
attachments_dir = get_param_attachments_dir(parameters, file_path)
52+
with_attachments = get_param_with_attachments(parameters)
53+
need_content_analysis = get_param_need_content_analysis(parameters)
5254

5355
with open(file_path, "rb") as f:
5456
msg = email.message_from_binary_file(f)
@@ -58,16 +60,15 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
5860
lines = self.__get_main_fields(msg)
5961
header_filename = "message_header_" + get_unique_name("message_header.json")
6062

61-
# saving message header into separated file as an attachment
62-
header_file_path = os.path.join(attachments_dir, header_filename)
63-
with open(header_file_path, "w", encoding="utf-8") as f:
64-
json.dump(all_header_fields, f, ensure_ascii=False, indent=4)
65-
66-
need_content_analysis = get_param_need_content_analysis(parameters)
67-
attachments.append(AttachedFile(original_name=header_filename,
68-
tmp_file_path=header_file_path,
69-
uid=f"attach_{uuid.uuid1()}",
70-
need_content_analysis=need_content_analysis))
63+
if with_attachments:
64+
# saving message header into separated file as an attachment
65+
header_file_path = os.path.join(attachments_dir, header_filename)
66+
with open(header_file_path, "w", encoding="utf-8") as f:
67+
json.dump(all_header_fields, f, ensure_ascii=False, indent=4)
68+
attachments.append(AttachedFile(original_name=header_filename,
69+
tmp_file_path=header_file_path,
70+
uid=f"attach_{uuid.uuid1()}",
71+
need_content_analysis=need_content_analysis))
7172

7273
html_found = False
7374
text_parts = []
@@ -92,7 +93,8 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
9293
if part.is_multipart():
9394
continue
9495

95-
self.__add_attachment(part, attachments_dir, attachments, need_content_analysis)
96+
if with_attachments:
97+
self.__add_attachment(part, attachments_dir, attachments, need_content_analysis)
9698

9799
# text/plain has the same content as text/html
98100
if not html_found:

dedoc/readers/mhtml_reader/mhtml_reader.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
3636
This reader is able to add some additional information to the `tag_hierarchy_level` of :class:`~dedoc.data_structures.LineMetadata`.
3737
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
3838
"""
39-
from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis
39+
from dedoc.utils.parameter_utils import get_param_attachments_dir, get_param_need_content_analysis, get_param_with_attachments
4040

4141
parameters = {} if parameters is None else parameters
4242
attachments_dir = get_param_attachments_dir(parameters, file_path)
@@ -51,16 +51,21 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
5151
lines.extend(result.lines)
5252
tables.extend(result.tables)
5353

54-
need_content_analysis = get_param_need_content_analysis(parameters)
5554
tmp_file_names = []
5655
original_file_names = []
5756
for tmp_file_name, original_file_name in zip(names_list, original_names_list):
5857
if tmp_file_name not in names_html:
5958
tmp_file_names.append(tmp_file_name)
6059
original_file_names.append(original_file_name)
6160

62-
attachments = self.__get_attachments(save_dir=attachments_dir, tmp_names_list=tmp_file_names, original_names_list=original_file_names,
63-
need_content_analysis=need_content_analysis)
61+
with_attachments = get_param_with_attachments(parameters)
62+
need_content_analysis = get_param_need_content_analysis(parameters)
63+
if with_attachments:
64+
attachments = self.__get_attachments(
65+
save_dir=attachments_dir, tmp_names_list=tmp_file_names, original_names_list=original_file_names, need_content_analysis=need_content_analysis
66+
)
67+
else:
68+
attachments = []
6469

6570
return UnstructuredDocument(tables=tables, lines=lines, attachments=attachments)
6671

dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_classifier.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
1-
import gzip
21
import logging
32
import os
4-
import pickle
53
from typing import List
64

75
from xgboost import XGBClassifier
@@ -22,7 +20,7 @@ def __init__(self, *, config: dict) -> None:
2220
self.logger = config.get("logger", logging.getLogger())
2321

2422
self.feature_extractor = TxtlayerFeatureExtractor()
25-
self.path = os.path.join(get_config()["resources_path"], "txtlayer_classifier.pkl.gz")
23+
self.path = os.path.join(get_config()["resources_path"], "txtlayer_classifier.json")
2624
self.__model = None
2725

2826
@property
@@ -32,11 +30,11 @@ def __get_model(self) -> XGBClassifier:
3230

3331
if not os.path.isfile(self.path):
3432
out_dir, out_name = os.path.split(self.path)
35-
download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="txtlayer_classifier", hub_name="model.pkl.gz")
33+
download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="txtlayer_classifier", hub_name="model.json")
3634

3735
assert os.path.isfile(self.path)
38-
with gzip.open(self.path, "rb") as f:
39-
self.__model = pickle.load(f)
36+
self.__model = XGBClassifier()
37+
self.__model.load_model(self.path)
4038

4139
if get_param_gpu_available(self.config, self.logger):
4240
gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0)

dedoc/readers/pdf_reader/pdf_image_reader/paragraph_extractor/scan_paragraph_classifier_extractor.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
import gzip
1+
import json
22
import logging
33
import os
4-
import pickle
4+
import tempfile
5+
import zipfile
56
from typing import List
67

78
from xgboost import XGBClassifier
@@ -21,7 +22,7 @@ class ScanParagraphClassifierExtractor(object):
2122
def __init__(self, *, config: dict) -> None:
2223
super().__init__()
2324
self.logger = config.get("logger", logging.getLogger())
24-
self.path = os.path.join(get_config()["resources_path"], "paragraph_classifier.pkl.gz")
25+
self.path = os.path.join(get_config()["resources_path"], "paragraph_classifier.zip")
2526
self.config = config
2627
self._feature_extractor = None
2728
self._classifier = None
@@ -41,11 +42,17 @@ def classifier(self) -> XGBClassifier:
4142
def _unpickle(self) -> None:
4243
if not os.path.isfile(self.path):
4344
out_dir, out_name = os.path.split(self.path)
44-
download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="paragraph_classifier", hub_name="model.pkl.gz")
45+
download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="paragraph_classifier", hub_name="model.zip")
4546

46-
with gzip.open(self.path) as file:
47-
self._classifier, parameters = pickle.load(file)
48-
self._feature_extractor = ParagraphFeatureExtractor(**parameters, config=self.config)
47+
with tempfile.TemporaryDirectory() as tmpdir:
48+
with zipfile.ZipFile(self.path) as archive:
49+
archive.extractall(tmpdir)
50+
51+
with open(os.path.join(tmpdir, "parameters.json")) as parameters_file:
52+
parameters = json.load(parameters_file)
53+
self._classifier = XGBClassifier()
54+
self._classifier.load_model(os.path.join(tmpdir, "classifier.json"))
55+
self._feature_extractor = ParagraphFeatureExtractor(**parameters, config=self.config)
4956

5057
if get_param_gpu_available(self.config, self.logger):
5158
gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0)

dedoc/structure_extractors/concrete_structure_extractors/abstract_law_structure_extractor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
2626
from dedoc.structure_extractors.line_type_classifiers.law_classifier import LawLineTypeClassifier
2727

2828
path = os.path.join(get_config()["resources_path"], "line_type_classifiers")
29-
self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.pkl.gz"), config=self.config)
30-
self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.pkl.gz"), config=self.config)
29+
self.classifier = LawLineTypeClassifier(classifier_type="law", path=os.path.join(path, "law_classifier.zip"), config=self.config)
30+
self.txt_classifier = LawLineTypeClassifier(classifier_type="law_txt", path=os.path.join(path, "law_txt_classifier.zip"), config=self.config)
3131
self.hierarchy_level_builders = [StubHierarchyLevelBuilder()]
3232
self.hl_type = "law"
3333
self.init_hl_depth = 1

dedoc/structure_extractors/concrete_structure_extractors/diploma_structure_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
3232
self.toc_builder = TocBuilder()
3333
self.body_builder = DiplomaBodyBuilder()
3434
path = os.path.join(get_config()["resources_path"], "line_type_classifiers")
35-
self.classifier = DiplomaLineTypeClassifier(path=os.path.join(path, "diploma_classifier.pkl.gz"), config=self.config)
35+
self.classifier = DiplomaLineTypeClassifier(path=os.path.join(path, "diploma_classifier.zip"), config=self.config)
3636
self.footnote_start_regexp = re.compile(r"^\d+ ")
3737

3838
def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:

dedoc/structure_extractors/concrete_structure_extractors/tz_structure_extractor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ def __init__(self, *, config: Optional[dict] = None) -> None:
2929
self.body_builder = TzBodyBuilder()
3030
self.toc_builder = TocBuilder()
3131
path = os.path.join(get_config()["resources_path"], "line_type_classifiers")
32-
self.classifier = TzLineTypeClassifier(classifier_type="tz", path=os.path.join(path, "tz_classifier.pkl.gz"), config=self.config)
33-
self.txt_classifier = TzLineTypeClassifier(classifier_type="tz_txt", path=os.path.join(path, "tz_txt_classifier.pkl.gz"), config=self.config)
32+
self.classifier = TzLineTypeClassifier(classifier_type="tz", path=os.path.join(path, "tz_classifier.zip"), config=self.config)
33+
self.txt_classifier = TzLineTypeClassifier(classifier_type="tz_txt", path=os.path.join(path, "tz_txt_classifier.zip"), config=self.config)
3434

3535
def extract(self, document: UnstructuredDocument, parameters: Optional[dict] = None) -> UnstructuredDocument:
3636
"""

dedoc/structure_extractors/line_type_classifiers/abstract_pickled_classifier.py

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
import gzip
1+
import json
22
import logging
33
import os
4-
import pickle
4+
import tempfile
5+
import zipfile
56
from abc import ABC
67
from typing import Optional, Tuple
78

@@ -32,10 +33,16 @@ def load(self, classifier_type: str, path: str) -> Tuple[XGBClassifier, dict]:
3233
"""
3334
if not os.path.isfile(path):
3435
out_dir, out_name = os.path.split(path)
35-
download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="line_type_classifiers", hub_name=f"{classifier_type}.pkl.gz")
36+
download_from_hub(out_dir=out_dir, out_name=out_name, repo_name="line_type_classifiers", hub_name=f"{classifier_type}.zip")
3637

37-
with gzip.open(path) as file:
38-
classifier, feature_extractor_parameters = pickle.load(file)
38+
with tempfile.TemporaryDirectory() as tmpdir:
39+
with zipfile.ZipFile(path) as archive:
40+
archive.extractall(tmpdir)
41+
42+
with open(os.path.join(tmpdir, "parameters.json")) as parameters_file:
43+
feature_extractor_parameters = json.load(parameters_file)
44+
classifier = XGBClassifier()
45+
classifier.load_model(os.path.join(tmpdir, "classifier.json"))
3946

4047
if get_param_gpu_available(self.config, self.logger):
4148
gpu_params = dict(predictor="gpu_predictor", tree_method="auto", gpu_id=0)
@@ -44,19 +51,27 @@ def load(self, classifier_type: str, path: str) -> Tuple[XGBClassifier, dict]:
4451

4552
return classifier, feature_extractor_parameters
4653

47-
def save(self, path_out: str, object_for_saving: object) -> str:
54+
@staticmethod
55+
def save(path_out: str, classifier: XGBClassifier, parameters: dict) -> str:
4856
"""
49-
Save the pickled classifier (with initialization parameters for a feature extractor) into the `.pkl.gz` file with path=`path_out`
57+
Save the classifier (with initialization parameters for a feature extractor) into the `.zip` file with path=`path_out`
58+
59+
* classifier -> classifier.json
60+
* parameters -> parameters.json
5061
5162
:param path_out: path (with file name) where to save the object
52-
:param object_for_saving: classifier with feature extractor's parameters to save
63+
:param classifier: classifier to save
64+
:param parameters: feature extractor parameters to save
5365
:return: the resulting path of the saved file
5466
"""
55-
if path_out.endswith(".pkl"):
56-
path_out += ".gz"
57-
elif not path_out.endswith(".gz"):
58-
path_out += ".pkl.gz"
67+
with tempfile.TemporaryDirectory() as tmpdir:
68+
clf_path = os.path.join(tmpdir, "classifier.json")
69+
params_path = os.path.join(tmpdir, "parameters.json")
70+
classifier.save_model(clf_path)
71+
with open(params_path, "w") as out_file:
72+
json.dump(parameters, out_file)
5973

60-
with gzip.open(path_out, "wb") as file_out:
61-
pickle.dump(obj=object_for_saving, file=file_out)
74+
with zipfile.ZipFile(path_out, "w") as archive:
75+
archive.write(clf_path, os.path.basename(clf_path))
76+
archive.write(params_path, os.path.basename(params_path))
6277
return path_out

0 commit comments

Comments
 (0)