Skip to content

Commit 79fb6e3

Browse files
NastyBoget, sunveil, oksidgy, dronperminov
authored
update master (#321)
* Add BBoxAnnotation to TabbyPDF reader (#312) * Add BBoxAnnotation to TabbyPDF reader * Fix import and add test * Remove unused import * TLDR-444 added words bbox pdfminer (#313) * TLDR-444 added word supporting into pdfminer-reader * TLDR-444 added word extraction from pdfminer; pdfminer refactoring * TLDR-444 added tests (word bounding box) * TLDR-444 fixed code style * TLDR-444 fixed after review * TLDR-437 plain_text return format added (#314) * Change base image name and tesseract benchmark script (#315) * Change base image name and tesseract benchmark script * Benchmarks updated * Added error hint * Small fix * add version ranges to requirements (#316) * TLDR-440: Tabby pdf cell properties (#319) * Add BBoxAnnotation to TabbyPDF reader * Fix import and add test * Remove unused import * Add cell properties * Fix add CellPropertyInfo * Fix invisible property * Fix colspan and row_span * Add test for tables with merged cells * Add data to test merged cells * Add BBoxAnnotation to TabbyPDF reader (#312) * Add BBoxAnnotation to TabbyPDF reader * Fix import and add test * Remove unused import * TLDR-444 added words bbox pdfminer (#313) * TLDR-444 added word supporting into pdfminer-reader * TLDR-444 added word extraction from pdfminer; pdfminer refactoring * TLDR-444 added tests (word bounding box) * TLDR-444 fixed code style * TLDR-444 fixed after review * TLDR-437 plain_text return format added (#314) * Change base image name and tesseract benchmark script (#315) * Change base image name and tesseract benchmark script * Benchmarks updated * Added error hint * Small fix * Add BBoxAnnotation to TabbyPDF reader * Fix import and add test * Remove unused import * Add BBoxAnnotation to TabbyPDF reader (#312) * Add BBoxAnnotation to TabbyPDF reader * Fix import and add test * Remove unused import * Fix import and add test * Remove unused import * Add test for tables with merged cells * Fix flake8 warnings * Resolve comments Add assert --------- Co-authored-by: Oksana 
Belyaeva <belyaeva@ispras.ru> Co-authored-by: Bogatenkova Anastasiya <bogatenkova.anastasiya@mail.ru> * new version 0.11.1 (#320) * new version 0.11.1 --------- Co-authored-by: Andrey Mikhailov <mikhailov@icc.ru> Co-authored-by: Oksana Belyaeva <belyaeva@ispras.ru> Co-authored-by: Andrew Perminov <perminov@ispras.ru>
1 parent 9a1f7ff commit 79fb6e3

26 files changed

+435
-416
lines changed

docker/Dockerfile renamed to Dockerfile

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
ARG REPOSITORY="docker.io"
2-
FROM dedocproject/baseimg
2+
FROM dedocproject/dedoc_p3.9_base:version_2023_08_28
33

44
ENV PYTHONPATH "${PYTHONPATH}:/dedoc_root"
55
ENV RESOURCES_PATH "/dedoc_root/resources"
66

77
ADD requirements.txt .
8-
RUN pip3 install -r requirements.txt
8+
RUN pip3 install --no-cache-dir -r requirements.txt
99

1010
RUN mkdir /dedoc_root
1111
ADD dedoc /dedoc_root/dedoc
@@ -17,4 +17,4 @@ RUN python3 /dedoc_root/dedoc/download_models.py
1717
ADD tests /dedoc_root/tests
1818
ADD resources /dedoc_root/resources
1919

20-
CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"]
20+
CMD ["python3", "/dedoc_root/dedoc/main.py", "-c", "/dedoc_root/dedoc/config.py"]

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.11.0
1+
0.11.1

dedoc/api/api_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,3 +219,9 @@ def __table2html(table: Table, table2id: Dict[str, int]) -> str:
219219
text += "</tr>\n"
220220
text += "</tbody>\n</table>"
221221
return text
222+
223+
224+
def json2txt(paragraph: TreeNode) -> str:
225+
subparagraphs_text = "\n".join([json2txt(subparagraph) for subparagraph in paragraph.subparagraphs])
226+
text = f"{paragraph.text}\n{subparagraphs_text}"
227+
return text

dedoc/api/dedoc_api.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
import dedoc
1212
from dedoc.api.api_args import QueryParameters
13-
from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree
13+
from dedoc.api.api_utils import json2collapsed_tree, json2html, json2tree, json2txt
1414
from dedoc.common.exceptions.dedoc_error import DedocError
1515
from dedoc.common.exceptions.missing_file_error import MissingFileError
1616
from dedoc.config import get_config
@@ -76,6 +76,9 @@ async def upload(file: UploadFile = File(...), query_params: QueryParameters = D
7676
if return_format == "html":
7777
html_content = json2html(text="", paragraph=document_tree.content.structure, tables=document_tree.content.tables, tabs=0)
7878
return HTMLResponse(content=html_content, status_code=200)
79+
elif return_format == "plain_text":
80+
txt_content = json2txt(paragraph=document_tree.content.structure)
81+
return PlainTextResponse(content=txt_content, status_code=200)
7982
elif return_format == "tree":
8083
html_content = json2tree(paragraph=document_tree.content.structure)
8184
return HTMLResponse(content=html_content, status_code=200)

dedoc/api/static/html_eng/form_input.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ <h2>Structure Document Recognition</h2>
3232
<select name="return_format">
3333
<option value="html" selected>html</option>
3434
<option value="pretty_json">pretty_json</option>
35+
<option value="plain_text">plain_text</option>
3536
<option value="tree">tree</option>
3637
<option value="json">json</option>
3738
<option value="collapsed_tree">collapsed_tree</option>

dedoc/api/static/html_rus/form_input.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ <h2>Распознавание структуры документа</h2>
3333
<select name="return_format">
3434
<option value="html" selected>html</option>
3535
<option value="pretty_json">pretty_json</option>
36+
<option value="plain_text">plain_text</option>
3637
<option value="tree">tree</option>
3738
<option value="json">json</option>
3839
<option value="collapsed_tree">collapsed_tree</option>

dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py

Lines changed: 45 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -3,13 +3,15 @@
33
import math
44
import os
55
import subprocess
6+
from collections import namedtuple
67
from typing import List, Optional, Tuple
78

89
import numpy as np
910

1011
from dedoc.common.exceptions.java_not_found_error import JavaNotFoundError
1112
from dedoc.common.exceptions.tabby_pdf_error import TabbyPdfError
1213
from dedoc.data_structures.bbox import BBox
14+
from dedoc.data_structures.concrete_annotations.bbox_annotation import BBoxAnnotation
1315
from dedoc.data_structures.concrete_annotations.bold_annotation import BoldAnnotation
1416
from dedoc.data_structures.concrete_annotations.indentation_annotation import IndentationAnnotation
1517
from dedoc.data_structures.concrete_annotations.italic_annotation import ItalicAnnotation
@@ -33,6 +35,8 @@
3335
from dedoc.utils.parameter_utils import get_param_page_slice
3436
from dedoc.utils.utils import calculate_file_hash
3537

38+
CellPropertyInfo = namedtuple("NamedTuple", "colspan, rowspan, invisible")
39+
3640

3741
class PdfTabbyReader(PdfBaseReader):
3842
"""
@@ -76,7 +80,7 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
7680
Look to the documentation of :meth:`~dedoc.readers.BaseReader.read` to get information about the method's parameters.
7781
"""
7882
parameters = {} if parameters is None else parameters
79-
lines, scan_tables = self.__extract(path=path)
83+
lines, scan_tables, tables_cell_properties = self.__extract(path=path)
8084
warnings = []
8185
document_metadata = None
8286

@@ -93,10 +97,12 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
9397

9498
lines = self.linker.link_objects(lines=lines, tables=scan_tables, images=[])
9599
tables = []
96-
for scan_table in scan_tables:
100+
assert len(scan_tables) == len(tables_cell_properties)
101+
for scan_table, table_cells_property in zip(scan_tables, tables_cell_properties):
102+
cell_properties = [[cellp for cellp in row] for row in table_cells_property]
97103
metadata = TableMetadata(page_id=scan_table.page_number, uid=scan_table.name)
98104
cells = [[cell for cell in row] for row in scan_table.matrix_cells]
99-
table = Table(metadata=metadata, cells=cells)
105+
table = Table(metadata=metadata, cells=cells, cells_properties=cell_properties)
100106
tables.append(table)
101107

102108
attachments = []
@@ -111,23 +117,26 @@ def read(self, path: str, document_type: Optional[str] = None, parameters: Optio
111117

112118
return self._postprocess(result)
113119

114-
def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[ScanTable]]:
120+
def __extract(self, path: str, start_page: int = None, end_page: int = None) -> Tuple[List[LineWithMeta], List[ScanTable], List[List[CellPropertyInfo]]]:
115121
file_hash = calculate_file_hash(path=path)
116122
document = self.__process_pdf(path=path, start_page=start_page, end_page=end_page)
117123
all_lines = []
118124
all_tables = []
125+
all_cell_properties = []
119126
for page in document.get("pages", []):
120127
lines = self.__get_lines_with_location(page, file_hash)
121128
if lines:
122129
all_lines.extend(lines)
123-
tables = self.__get_tables(page, file_hash)
130+
tables, cell_properties = self.__get_tables(page, file_hash)
124131
if tables:
125132
all_tables.extend(tables)
133+
all_cell_properties.extend(cell_properties)
126134

127-
return all_lines, all_tables
135+
return all_lines, all_tables, all_cell_properties
128136

129137
def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
130138
tables = []
139+
cell_properties = []
131140
page_number = page["number"]
132141
i = 0
133142
for table in page["tables"]:
@@ -138,26 +147,44 @@ def __get_tables(self, page: dict, file_hash: str) -> List[ScanTable]:
138147
y_bottom_right = y_top_left + table["height"]
139148
order = table["order"]
140149
rows = table["rows"]
150+
cell_properties_json = table["cell_properties"]
151+
cell_property_list = []
152+
153+
for cell_properties_row in cell_properties_json:
154+
cell_property_row_list = []
155+
156+
for cell_property in cell_properties_row:
157+
cell_property_info = CellPropertyInfo(cell_property["col_span"],
158+
cell_property["row_span"],
159+
bool(cell_property["invisible"]))
160+
161+
cell_property_row_list.append(cell_property_info)
162+
163+
cell_property_list.append(cell_property_row_list)
164+
141165
cells = [row for row in rows]
142166
bbox = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))
143167

144168
tables.append(ScanTable(matrix_cells=cells, page_number=page_number, bbox=bbox, name=file_hash + str(page_number) + str(i), order=order))
169+
cell_properties.append(cell_property_list)
145170

146-
return tables
171+
return tables, cell_properties
147172

148173
def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWithLocation]:
149174
lines = []
150175
page_number = page["number"]
176+
page_width = int(page["width"])
177+
page_height = int(page["height"])
151178
prev_line = None
152179

153180
for block in page["blocks"]:
154181
annotations = []
155182
order = block["order"]
156183
block_text = block["text"]
157-
bx_top_left = block["x_top_left"]
158-
by_top_left = block["y_top_left"]
159-
bx_bottom_right = bx_top_left + block["width"]
160-
by_bottom_right = by_top_left + block["height"]
184+
bx_top_left = int(block["x_top_left"])
185+
by_top_left = int(block["y_top_left"])
186+
bx_bottom_right = bx_top_left + int(block["width"])
187+
by_bottom_right = by_top_left + int(block["height"])
161188
indent = block["indent"]
162189
spacing = block["spacing"]
163190
len_block = len(block_text)
@@ -173,7 +200,12 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
173200
url = annotation["url"]
174201
start = annotation["start"]
175202
end = annotation["end"]
176-
203+
x_top_left = int(annotation["x_top_left"])
204+
y_top_left = int(annotation["y_top_left"])
205+
x_bottom_right = bx_top_left + int(annotation["width"])
206+
y_bottom_right = by_top_left + int(annotation["height"])
207+
box = BBox.from_two_points((x_top_left, y_top_left), (x_bottom_right, y_bottom_right))
208+
annotations.append(BBoxAnnotation(start, end, box, page_width=page_width, page_height=page_height))
177209
annotations.append(SizeAnnotation(start, end, str(font_size)))
178210
annotations.append(StyleAnnotation(start, end, font_name))
179211

@@ -189,6 +221,7 @@ def __get_lines_with_location(self, page: dict, file_hash: str) -> List[LineWith
189221
meta = block["metadata"].lower()
190222
uid = f"txt_{file_hash}_{order}"
191223
bbox = BBox.from_two_points((bx_top_left, by_top_left), (bx_bottom_right, by_bottom_right))
224+
annotations.append(BBoxAnnotation(0, len_block, bbox, page_width=page_width, page_height=page_height))
192225

193226
metadata = LineMetadata(page_id=page_number, line_id=order)
194227
line_with_location = LineWithLocation(line=block_text,

dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
99
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
1010
from dedoc.readers.pdf_reader.pdf_base_reader import ParametersForParseDoc, PdfBaseReader
11-
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.extractor_pdf_textlayer import ExtractorPdfTextLayer
11+
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdfminer_reader.pdfminer_extractor import PdfminerExtractor
1212
from dedoc.train_dataset.train_dataset_utils import save_page_with_bbox
1313

1414

@@ -25,7 +25,7 @@ def __init__(self, *, config: dict) -> None:
2525
:param config: configuration of the reader, e.g. logger for logging
2626
"""
2727
super().__init__(config=config)
28-
self.extractor_layer = ExtractorPdfTextLayer(config=config)
28+
self.extractor_layer = PdfminerExtractor(config=config)
2929

3030
def can_read(self, path: str, mime: str, extension: str, document_type: Optional[str] = None, parameters: Optional[dict] = None) -> bool:
3131
"""

dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdfminer_reader/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)