Skip to content

Commit 765aae2

Browse files
NastyBoget, sunveil, alexander1999-hub, Alexander Golodkov, Travvy88
authored
new version 2.2.7 (#486)
Co-authored-by: Andrey Mikhailov <mikhailov@icc.ru> Co-authored-by: Alexander Golodkov <55749660+alexander1999-hub@users.noreply.github.com> Co-authored-by: Alexander Golodkov <golodkov@ispras.ru> Co-authored-by: Nikita Shevtsov <61932814+Travvy88@users.noreply.github.com> Co-authored-by: Nikita Shevtsov <shevtsov@ispras.ru> Co-authored-by: Oksana Belyaeva <belyaeva@ispras.ru>
1 parent d67e6ef commit 765aae2

File tree

79 files changed

+1187
-768
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

79 files changed

+1187
-768
lines changed

.github/workflows/test_labeling.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,4 @@ jobs:
3131
python-version: '3.9'
3232
- name: Run tests for labeling
3333
run: |
34-
test="true" docker-compose -f labeling/docker-compose.yml up --build --exit-code-from test
34+
test="true" docker compose -f labeling/docker-compose.yml up --build --exit-code-from test

.github/workflows/test_on_push.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,4 +36,4 @@ jobs:
3636
flake8 .
3737
- name: Run tests
3838
run: |
39-
test="true" docker-compose up --build --exit-code-from test
39+
test="true" docker compose up --build --exit-code-from test

README.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,10 @@ It extracts a document’s logical structure and content: tables, text formattin
1717
The document’s content is represented as a tree storing headings and lists of any level.
1818
Dedoc can be integrated in a document contents and structure analysis system as a separate module.
1919

20-
## Workflow
20+
## Star History
21+
[![Star History Chart](https://api.star-history.com/svg?repos=ispras/dedoc&type=Date)](https://star-history.com/#ispras/dedoc&Date)
2122

23+
## Workflow
2224
![Workflow](https://github.com/ispras/dedoc/raw/master/docs/source/_static/workflow.png)
2325

2426
Workflow description is given [`here`](https://dedoc.readthedocs.io/en/latest/?badge=latest#workflow)
@@ -136,12 +138,12 @@ cd dedoc
136138

137139
### 3. Build the image and run the application
138140
```shell
139-
docker-compose up --build
141+
docker compose up --build
140142
```
141143

142144
### 4. Run container with tests
143145
```shell
144-
test="true" docker-compose up --build
146+
test="true" docker compose up --build
145147
```
146148

147149
If you need to change some application settings, you may update `config.py` according to your needs and re-build the image.

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.2.6
1+
2.2.7

dedoc/api/api_args.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@ class QueryParameters:
2828
# pdf handling
2929
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
3030
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
31+
fast_textual_layer_detection: str = Form("false", enum=["true", "false"],
32+
description="Use non-ML solution to detect textual layer. Much faster but less accurate.")
3133
language: str = Form("rus+eng", description="Recognition language ('rus+eng', 'rus', 'eng', 'fra', 'spa')")
3234
pages: str = Form(":", description='Page numbers range for reading PDF or images, "left:right" means read pages from left to right')
3335
is_one_column_document: str = Form("auto", enum=["auto", "true", "false"],

dedoc/api/web/index.html

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,10 @@ <h4>PDF handling</h4>
128128
</label>
129129
</p>
130130

131+
<p>
132+
<label><input name="fast_textual_layer_detection" type="checkbox" value="true"> fast_textual_layer_detection</label>
133+
</p>
134+
131135
<p>
132136
<label> language
133137
<input name="language" list="language" size="8" placeholder="rus+eng">

dedoc/data_structures/concrete_annotations/table_annotation.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,10 @@ class TableAnnotation(Annotation):
88
"""
99
name = "table"
1010

11-
def __init__(self, name: str, start: int, end: int) -> None:
11+
def __init__(self, value: str, start: int, end: int) -> None:
1212
"""
13-
:param name: unique identifier of the table which is referenced inside this annotation
13+
:param value: unique identifier of the table which is referenced inside this annotation
1414
:param start: start of the annotated text (usually zero)
1515
:param end: end of the annotated text (usually end of the line)
1616
"""
17-
super().__init__(start=start, end=end, name=TableAnnotation.name, value=name, is_mergeable=False)
17+
super().__init__(start=start, end=end, name=TableAnnotation.name, value=value, is_mergeable=False)

dedoc/data_structures/line_with_meta.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,9 @@ def uid(self) -> str:
136136
def set_line(self, line: str) -> None:
137137
self._line = line
138138

139+
def set_metadata(self, metadata: LineMetadata) -> None:
140+
self._metadata = metadata
141+
139142
def __repr__(self) -> str:
140143
return (f"LineWithMeta({self.line[:65]}, "
141144
f"tagHL={self.metadata.tag_hierarchy_level.level_1, self.metadata.tag_hierarchy_level.level_2, self.metadata.tag_hierarchy_level.line_type})")

dedoc/download_models.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@
66
"""
77
model_hash_dict = dict(
88
txtlayer_classifier="9ca1de749d8d37147b00a3a228e03ee1776c695f",
9-
scan_orientation_efficient_net_b0="9ea283f3d346ae4fdd82463a9f60b5369a3ffb58",
9+
scan_orientation_efficient_net_b0="c60812552a1be624476c1e5b58599867b36f8d4e",
1010
font_classifier="db4481ad60ab050cbb42079b64f97f9e431feb07",
11-
paragraph_classifier="c26a10193499d3cbc77ffec9842bece24fa8950b",
12-
line_type_classifiers="0568c6e1f49612c0c351f10b80a26dc05f796683",
11+
paragraph_classifier="97c4b78bc20d87ec7d53389e09f1ca35c6ade067",
12+
line_type_classifiers="6ad0eacbfdea065b658cb6f039d13f75245d51ae",
1313
fintoc_classifiers="6a907b7d2437c3f61ac9c506f67175207982fae8"
1414
)
1515

dedoc/readers/article_reader/article_reader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ def __create_line_with_refs(self, content: List[Tuple[str, Tag]], bib2uid: dict,
235235
if subpart.get("type") == "bibr" and target in bib2uid:
236236
annotations.append(ReferenceAnnotation(value=bib2uid[target], start=start, end=start + len(sub_text)))
237237
if subpart.get("type") == "table" and target in table2uid:
238-
annotations.append(TableAnnotation(name=table2uid[target], start=start, end=start + len(sub_text)))
238+
annotations.append(TableAnnotation(value=table2uid[target], start=start, end=start + len(sub_text)))
239239
if subpart.get("type") == "figure" and target in attachment2uid:
240240
annotations.append(AttachAnnotation(attach_uid=attachment2uid[target], start=start, end=start + len(sub_text)))
241241
else:

dedoc/readers/docx_reader/data_structures/docx_document.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ def __paragraphs2lines(self, image_refs: dict, table_refs: dict, diagram_refs: d
110110

111111
if i in table_refs:
112112
for table_uid in table_refs[i]:
113-
annotation = TableAnnotation(name=table_uid, start=0, end=len(line))
113+
annotation = TableAnnotation(value=table_uid, start=0, end=len(line))
114114
line.annotations.append(annotation)
115115

116116
paragraph_id += 1

dedoc/readers/docx_reader/data_structures/table.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,14 @@ def to_table(self) -> Table:
5454
if cell.vMerge:
5555
value = cell.vMerge.get("w:val", "continue")
5656
if value == "continue":
57-
cell_lines = cell_list[-1][cell_ind].lines
58-
cell_row_list.append(CellWithMeta(lines=cell_lines, colspan=1, rowspan=1, invisible=True))
59-
last_cell_rowspan = cell_list[rowspan_start_info[cell_ind]][cell_ind]
60-
last_cell_rowspan.rowspan += 1
61-
cell_list[rowspan_start_info[cell_ind]][cell_ind] = last_cell_rowspan
57+
if cell_ind in rowspan_start_info:
58+
cell_lines = cell_list[-1][cell_ind].lines
59+
cell_row_list.append(CellWithMeta(lines=cell_lines, colspan=1, rowspan=1, invisible=True))
60+
last_cell_rowspan = cell_list[rowspan_start_info[cell_ind]][cell_ind]
61+
last_cell_rowspan.rowspan += 1
62+
cell_list[rowspan_start_info[cell_ind]][cell_ind] = last_cell_rowspan
63+
else:
64+
cell_row_list.append(CellWithMeta(lines=cell_lines, colspan=grid_span, rowspan=1, invisible=False))
6265
elif value == "restart":
6366
rowspan_start_info[cell_ind] = row_index
6467
cell_row_list.append(CellWithMeta(lines=cell_lines, colspan=grid_span, rowspan=1, invisible=False))

dedoc/readers/html2pdf_reader/html2pdf_reader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def _add_tables(self, document: UnstructuredDocument, tables: Dict[str, Table])
4747
line_id += 1
4848
lines.append(line)
4949
elif previous_line is not None:
50-
table_annotation = TableAnnotation(name=table_uid, start=0, end=len(line.line))
50+
table_annotation = TableAnnotation(value=table_uid, start=0, end=len(line.line))
5151
previous_line.annotations.append(table_annotation)
5252
tables_result.append(tables[table_uid])
5353
return UnstructuredDocument(lines=lines, tables=tables_result, attachments=document.attachments)

dedoc/readers/pdf_reader/data_classes/page_with_bboxes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
class PageWithBBox:
1010

11-
def __init__(self, image: ndarray, bboxes: List[TextWithBBox], page_num: int, attachments: List[PdfImageAttachment] = None,
11+
def __init__(self, image: Optional[ndarray], bboxes: List[TextWithBBox], page_num: int, attachments: List[PdfImageAttachment] = None,
1212
pdf_page_width: Optional[int] = None, pdf_page_height: Optional[int] = None) -> None:
1313
self.image = image
1414
self.bboxes = bboxes

dedoc/readers/pdf_reader/pdf_auto_reader/pdf_auto_reader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ def read(self, file_path: str, parameters: Optional[dict] = None) -> Unstructure
5252
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
5353
You can also see :ref:`pdf_handling_parameters` to get more information about `parameters` dictionary possible arguments.
5454
"""
55+
parameters = {} if parameters is None else parameters
5556
warnings = []
5657
txtlayer_parameters = self.txtlayer_detector.detect_txtlayer(path=file_path, parameters=parameters)
5758

dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_detector.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,13 @@ def detect_txtlayer(self, path: str, parameters: dict) -> PdfTxtlayerParameters:
2929
"""
3030
try:
3131
lines = self.__get_lines_for_predict(path=path, parameters=parameters)
32-
is_correct = self.txtlayer_classifier.predict(lines)
33-
first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct)
32+
if str(parameters.get("fast_textual_layer_detection", "false")).lower() == "true":
33+
is_correct = any(line.line.strip() for line in lines)
34+
first_page_lines = [line for line in lines if line.metadata.page_id == 0]
35+
first_page_correct = bool(first_page_lines) and any(line.line.strip() for line in first_page_lines)
36+
else:
37+
is_correct = self.txtlayer_classifier.predict(lines)
38+
first_page_correct = self.__is_first_page_correct(lines=lines, is_txt_layer_correct=is_correct)
3439
return PdfTxtlayerParameters(is_correct_text_layer=is_correct, is_first_page_correct=first_page_correct)
3540

3641
except Exception as e:

dedoc/readers/pdf_reader/pdf_auto_reader/txtlayer_feature_extractor.py

Lines changed: 15 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,29 +7,19 @@
77

88
class TxtlayerFeatureExtractor:
99

10-
def __init__(self) -> None:
11-
self.eng = "".join(list(map(chr, range(ord("a"), ord("z") + 1))))
12-
self.rus = "".join([chr(i) for i in range(ord("а"), ord("а") + 32)] + ["ё"])
13-
self.lower_letters = self.eng + self.rus
14-
self.upper_letters = self.lower_letters.upper()
15-
self.letters = self.upper_letters + self.lower_letters
16-
self.digits = "".join([str(i) for i in range(10)])
17-
self.special_symbols = "<>~!@#$%^&*_+-/\"|?.,:;'`= "
18-
self.brackets = "{}[]()"
19-
self.symbols = self.letters + self.digits + self.brackets + self.special_symbols
20-
21-
self.prohibited_symbols = {s: i for i, s in enumerate("[]<")}
22-
2310
def transform(self, texts: List[str]) -> pd.DataFrame:
11+
from dedoc.structure_extractors.feature_extractors.char_features import letters, digits, special_symbols, brackets, rus, eng, prohibited_symbols, \
12+
lower_letters, upper_letters, symbols, count_symbols
13+
2414
features = defaultdict(list)
2515

2616
for text in texts:
27-
num_letters = self.__count_symbols(text, self.letters)
28-
num_digits = self.__count_symbols(text, self.digits)
29-
num_special_symbols = self.__count_symbols(text, self.special_symbols)
30-
num_brackets = self.__count_symbols(text, self.brackets)
31-
num_rus = self.__count_symbols(text, self.rus + self.rus.upper())
32-
num_eng = self.__count_symbols(text, self.eng + self.eng.upper())
17+
num_letters = count_symbols(text, letters)
18+
num_digits = count_symbols(text, digits)
19+
num_special_symbols = count_symbols(text, special_symbols)
20+
num_brackets = count_symbols(text, brackets)
21+
num_rus = count_symbols(text, rus + rus.upper())
22+
num_eng = count_symbols(text, eng + eng.upper())
3323

3424
features["letters_proportion"].append(num_letters / len(text))
3525
features["digits_proportion"].append(num_digits / len(text))
@@ -38,24 +28,24 @@ def transform(self, texts: List[str]) -> pd.DataFrame:
3828
features["rus_proportion"].append(num_rus / len(text))
3929
features["eng_proportion"].append(num_eng / len(text))
4030

41-
for symbol in self.letters + self.digits:
31+
for symbol in letters + digits:
4232
n = num_letters + num_digits
4333
# proportion of occurring english and russian letters
4434
features[f"{symbol}_proportion"].append(text.count(symbol) / n if n != 0 else 0.0)
4535

46-
for symbol in self.special_symbols + self.brackets:
36+
for symbol in special_symbols + brackets:
4737
# number of symbols
48-
symbol_name = symbol if symbol not in self.prohibited_symbols else f"symbol{self.prohibited_symbols[symbol]}"
38+
symbol_name = symbol if symbol not in prohibited_symbols else f"symbol{prohibited_symbols[symbol]}"
4939
features[f"{symbol_name}_number"].append(text.count(symbol))
5040

5141
# proportion of letters with symbols
5242
features["all_proportion"].append((num_letters + num_digits + num_brackets + num_special_symbols) / len(text) if len(text) != 0 else 0)
5343

54-
case_changes = sum(1 for s1, s2 in zip(text[:-1], text[1:]) if (s1 in self.lower_letters) and (s2 in self.upper_letters))
44+
case_changes = sum(1 for s1, s2 in zip(text[:-1], text[1:]) if (s1 in lower_letters) and (s2 in upper_letters))
5545
features["case_changes"].append(case_changes / len(text))
56-
symbol_changes = sum(1 for s1, s2 in zip(text[:-1], text[1:]) if (s1 in self.symbols) != (s2 in self.symbols))
46+
symbol_changes = sum(1 for s1, s2 in zip(text[:-1], text[1:]) if (s1 in symbols) != (s2 in symbols))
5747
features["symbol_changes"].append(symbol_changes / len(text))
58-
letter_changes = sum(1 for s1, s2 in zip(text[:-1], text[1:]) if (s1 in self.letters) and (s2 not in self.symbols))
48+
letter_changes = sum(1 for s1, s2 in zip(text[:-1], text[1:]) if (s1 in letters) and (s2 not in symbols))
5949
features["letter_changes"].append(letter_changes / len(text))
6050

6151
features["mean_word_length"].append(np.mean([len(word) for word in text.split()]))
@@ -70,6 +60,3 @@ def transform(self, texts: List[str]) -> pd.DataFrame:
7060
features["median_char_ord"].append(np.median(all_characters_ord))
7161
features = pd.DataFrame(features)
7262
return features[sorted(features.columns)].astype(float)
73-
74-
def __count_symbols(self, text: str, symbol_list: str) -> int:
75-
return sum(1 for symbol in text if symbol in symbol_list)

dedoc/readers/pdf_reader/pdf_image_reader/columns_orientation_classifier/columns_orientation_classifier.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import os
23
import warnings
34
from os import path
45
from typing import Optional, Tuple
@@ -30,11 +31,9 @@ def __init__(self, on_gpu: bool, checkpoint_path: Optional[str], *, config: dict
3031
@property
3132
def net(self) -> ClassificationModelTorch:
3233
if self._net is None:
34+
net = ClassificationModelTorch(self.checkpoint_path)
3335
if self.checkpoint_path is not None:
34-
net = ClassificationModelTorch(path.join(self.checkpoint_path, "scan_orientation_efficient_net_b0.pth"))
3536
self._load_weights(net)
36-
else:
37-
net = ClassificationModelTorch(None)
3837
self._net = net
3938
self._net.to(self.device)
4039
return self._net
@@ -61,17 +60,18 @@ def _set_device(self, on_gpu: bool) -> None:
6160
self.logger.warning(f"Classifier is set to device {self.device}")
6261

6362
def _load_weights(self, net: ClassificationModelTorch) -> None:
64-
path_checkpoint = path.join(self.checkpoint_path, "scan_orientation_efficient_net_b0.pth")
65-
if not path.isfile(path_checkpoint):
66-
download_from_hub(out_dir=self.checkpoint_path,
63+
if not path.isfile(self.checkpoint_path):
64+
from dedoc.config import get_config
65+
self.checkpoint_path = os.path.join(get_config()["resources_path"], "scan_orientation_efficient_net_b0.pth")
66+
download_from_hub(out_dir=os.path.dirname(os.path.abspath(self.checkpoint_path)),
6767
out_name="scan_orientation_efficient_net_b0.pth",
6868
repo_name="scan_orientation_efficient_net_b0",
6969
hub_name="model.pth")
7070

7171
with warnings.catch_warnings():
7272
warnings.simplefilter("ignore")
73-
net.load_state_dict(torch.load(path_checkpoint, map_location=self.location))
74-
self.logger.info(f"Weights were loaded from {path_checkpoint}")
73+
net.load_state_dict(torch.load(self.checkpoint_path, map_location=self.location))
74+
self.logger.info(f"Weights were loaded from {self.checkpoint_path}")
7575

7676
def save_weights(self, path_checkpoint: str) -> None:
7777
torch.save(self.net.state_dict(), path_checkpoint)

dedoc/readers/pdf_reader/pdf_image_reader/line_metadata_extractor/bold_classifier/agglomerative_clusterizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ def __get_f1_homogeneous(self, x: np.ndarray, x_clusters: np.ndarray) -> float:
5757

5858
w1 = np.std(x) * len(x)
5959
w2 = np.std(x_clust0) * len(x_clust0) + np.std(x_clust1) * len(x_clust1)
60-
f1 = w2 / w1
60+
f1 = w2 / w1 if w1 != 0. else 0.
6161
return f1
6262

6363
def __get_f_criterion_homogeneous(self, n: int, p: int = 2) -> float:

dedoc/readers/pdf_reader/pdf_image_reader/line_metadata_extractor/bold_classifier/valley_emphasis_binarizer.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,13 @@ def __init__(self, n: int = 5) -> None:
77
self.n = n
88

99
def binarize(self, image: np.ndarray) -> np.ndarray:
10-
gray_img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
11-
threshold = self.__get_threshold(gray_img)
10+
if image.shape[-1] == 3:
11+
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
12+
threshold = self.__get_threshold(image)
1213

13-
gray_img[gray_img <= threshold] = 0
14-
gray_img[gray_img > threshold] = 1
15-
return gray_img
14+
image[image <= threshold] = 0
15+
image[image > threshold] = 1
16+
return image
1617

1718
def __get_threshold(self, gray_img: np.ndarray) -> int:
1819
c, x = np.histogram(gray_img, bins=255)
@@ -33,8 +34,8 @@ def __get_threshold(self, gray_img: np.ndarray) -> int:
3334
omega_1 = omega_1 + c[t] / total
3435
omega_2 = 1 - omega_1
3536
mu_k = mu_k + t * (c[t] / total)
36-
mu_1 = mu_k / omega_1
37-
mu_2 = (sum_val - mu_k) / omega_2
37+
mu_1 = mu_k / omega_1 if omega_1 != 0. else 0.
38+
mu_2 = (sum_val - mu_k) / omega_2 if omega_2 != 0. else 0.
3839
sum_of_neighbors = np.sum(c[max(1, t - self.n):min(255, t + self.n)])
3940
denom = total
4041
current_var = (1 - sum_of_neighbors / denom) * (omega_1 * mu_1 ** 2 + omega_2 * mu_2 ** 2)

dedoc/readers/pdf_reader/pdf_image_reader/line_metadata_extractor/metadata_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def extract_metadata_and_set_annotations(self, page_with_lines: PageWithBBox, ca
4949
lines = []
5050
for bbox in page_with_lines.bboxes:
5151
lines.append(LineMetadataExtractor.get_line_with_meta(bbox=bbox))
52-
if page_with_lines.image.ndim == 3 and page_with_lines.image.shape[2] == 3:
52+
if page_with_lines.image is not None and page_with_lines.image.ndim == 3 and page_with_lines.image.shape[2] == 3:
5353
color_annotation = self.__get_color_annotation(bbox, page_with_lines.image)
5454
bbox.annotations.append(color_annotation)
5555
self.__add_spacing_annotations(lines)

0 commit comments

Comments
 (0)