Skip to content

Commit 370f6ef

Browse files
update master (#451)
Co-authored-by: Andrew Perminov <perminov@ispras.ru>
1 parent 6b84563 commit 370f6ef

File tree

22 files changed

+672
-111
lines changed

22 files changed

+672
-111
lines changed

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.2.2
1+
2.2.3

dedoc/api/api_args.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ class QueryParameters:
1717
need_content_analysis: str = Form("false", enum=["true", "false"], description="Enable parsing contents of the attached files")
1818
recursion_deep_attachments: str = Form("10", description="Depth on which nested attachments will be parsed if need_content_analysis=true")
1919
return_base64: str = Form("false", enum=["true", "false"], description="Save attached images to the document metadata in base64 format")
20-
attachments_dir: Optional[str] = Form(None, description="Path to the directory where to save files' attachments")
2120

2221
# tables handling
2322
need_pdf_table_analysis: str = Form("true", enum=["true", "false"], description="Enable table recognition for pdf")

dedoc/api/api_utils.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from dedoc.data_structures.parsed_document import ParsedDocument
1515
from dedoc.data_structures.table import Table
1616
from dedoc.data_structures.tree_node import TreeNode
17+
from dedoc.extensions import converted_mimes, recognized_mimes
1718

1819

1920
def __prettify_text(text: str) -> Iterator[str]:
@@ -148,11 +149,22 @@ def json2html(text: str,
148149
text += table2html(table, table2id)
149150
text += "<p>&nbsp;</p>"
150151

152+
image_mimes = recognized_mimes.image_like_format.union(converted_mimes.image_like_format)
153+
151154
if attachments is not None and len(attachments) > 0:
152155
text += "<h3> Attachments: </h3>"
153156
for attachment_id, attachment in enumerate(attachments):
154157
attachment_text = json2html(text="", paragraph=attachment.content.structure, tables=attachment.content.tables, attachments=attachment.attachments)
155-
text += f'<div id="{attachment.metadata.uid}"><h4>attachment {attachment_id} ({attachment.metadata.file_name}):</h4>{attachment_text}</div>'
158+
attachment_base64 = f'data:{attachment.metadata.file_type};base64,{attachment.metadata.base64}"'
159+
attachment_link = f'<a href="{attachment_base64}" download="{attachment.metadata.file_name}">{attachment.metadata.file_name}</a>'
160+
is_image = attachment.metadata.file_type in image_mimes
161+
attachment_image = f'<img src="{attachment_base64}">' if is_image else ""
162+
163+
text += f"""<div id="{attachment.metadata.uid}">
164+
<h4>attachment {attachment_id} ({attachment_link}):</h4>
165+
{attachment_image}
166+
{attachment_text}
167+
</div>"""
156168

157169
return text
158170

@@ -193,12 +205,9 @@ def __annotations2html(paragraph: TreeNode, table2id: Dict[str, int], attach2id:
193205
name = annotation.name
194206
value = annotation.value
195207

196-
bool_annotations = [BoldAnnotation.name,
197-
ItalicAnnotation.name,
198-
StrikeAnnotation.name,
199-
SubscriptAnnotation.name,
200-
SuperscriptAnnotation.name,
201-
UnderlinedAnnotation.name]
208+
bool_annotations = [
209+
BoldAnnotation.name, ItalicAnnotation.name, StrikeAnnotation.name, SubscriptAnnotation.name, SuperscriptAnnotation.name, UnderlinedAnnotation.name
210+
]
202211
check_annotations = bool_annotations + [TableAnnotation.name, ReferenceAnnotation.name, AttachAnnotation.name]
203212
if name not in check_annotations and not value.startswith("heading "):
204213
continue

dedoc/api/dedoc_api.py

Lines changed: 28 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import base64
12
import dataclasses
23
import importlib
34
import json
@@ -62,41 +63,57 @@ def _get_static_file_path(request: Request) -> str:
6263
return os.path.abspath(os.path.join(directory, file))
6364

6465

66+
def __add_base64_info_to_attachments(document_tree: ParsedDocument, attachments_dir: str) -> None:
    """
    Add the base64-encoded file content to the metadata of every attachment of the document.

    The html renderer reads the ``base64`` metadata attribute of each attachment and renders
    attachments of attachments recursively, so the whole attachments tree is processed here, not only its first level.

    :param document_tree: parsed document whose attachments (and their nested attachments) should be enriched
    :param attachments_dir: directory where the attachments' files were saved during parsing
    """
    # NOTE(review): nested attachments are assumed to be saved in the same attachments_dir as the top-level ones - confirm
    pending = list(document_tree.attachments or [])
    while pending:
        attachment = pending.pop()
        attachment_path = os.path.join(attachments_dir, attachment.metadata.temporary_file_name)
        with open(attachment_path, "rb") as attachment_file:
            encoded_content = base64.b64encode(attachment_file.read()).decode("utf-8")
        attachment.metadata.add_attribute("base64", encoded_content)
        # the attachments list may be missing (None) for documents without attachments
        pending.extend(attachment.attachments or [])
70+
71+
6572
@app.post("/upload", response_model=ParsedDocument)
6673
async def upload(file: UploadFile = File(...), query_params: QueryParameters = Depends()) -> Response: # noqa
6774
parameters = dataclasses.asdict(query_params)
6875
if not file or file.filename == "":
6976
raise MissingFileError("Error: Missing content in request_post file parameter", version=dedoc.__version__)
7077

78+
return_format = str(parameters.get("return_format", "json")).lower()
79+
7180
with tempfile.TemporaryDirectory() as tmpdir:
7281
file_path = save_upload_file(file, tmpdir)
73-
document_tree = manager.parse(file_path, parameters=dict(parameters))
82+
document_tree = manager.parse(file_path, parameters={**dict(parameters), "attachments_dir": tmpdir})
83+
84+
if return_format == "html":
85+
__add_base64_info_to_attachments(document_tree, tmpdir)
7486

75-
return_format = str(parameters.get("return_format", "json")).lower()
7687
if return_format == "html":
7788
html_content = json2html(
7889
text="",
7990
paragraph=document_tree.content.structure,
8091
tables=document_tree.content.tables,
81-
attachments=document_tree.attachments, tabs=0
92+
attachments=document_tree.attachments,
93+
tabs=0
8294
)
8395
return HTMLResponse(content=html_content)
84-
elif return_format == "plain_text":
96+
97+
if return_format == "plain_text":
8598
txt_content = json2txt(paragraph=document_tree.content.structure)
8699
return PlainTextResponse(content=txt_content)
87-
elif return_format == "tree":
100+
101+
if return_format == "tree":
88102
html_content = json2tree(paragraph=document_tree.content.structure)
89103
return HTMLResponse(content=html_content)
90-
elif return_format == "ujson":
104+
105+
if return_format == "ujson":
91106
return UJSONResponse(content=document_tree.to_api_schema().model_dump())
92-
elif return_format == "collapsed_tree":
107+
108+
if return_format == "collapsed_tree":
93109
html_content = json2collapsed_tree(paragraph=document_tree.content.structure)
94110
return HTMLResponse(content=html_content)
95-
elif return_format == "pretty_json":
111+
112+
if return_format == "pretty_json":
96113
return PlainTextResponse(content=json.dumps(document_tree.to_api_schema().model_dump(), ensure_ascii=False, indent=2))
97-
else:
98-
logger.info(f"Send result. File {file.filename} with parameters {parameters}")
99-
return ORJSONResponse(content=document_tree.to_api_schema().model_dump())
114+
115+
logger.info(f"Send result. File {file.filename} with parameters {parameters}")
116+
return ORJSONResponse(content=document_tree.to_api_schema().model_dump())
100117

101118

102119
@app.get("/upload_example")

dedoc/api/web/index.html

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ <h4>Type of document structure parsing</h4>
7070

7171
<div class="parameters">
7272
<h4>Attachments handling</h4>
73-
<details><summary>with_attachments, need_content_analysis, recursion_deep_attachments, return_base64, attachments_dir</summary>
73+
<details><summary>with_attachments, need_content_analysis, recursion_deep_attachments, return_base64</summary>
7474
<br>
7575
<p>
7676
<label><input name="with_attachments" type="checkbox" value="true"> with_attachments </label>
@@ -87,10 +87,6 @@ <h4>Attachments handling</h4>
8787
<p>
8888
<label><input name="return_base64" type="checkbox" value="true"> return_base64 </label>
8989
</p>
90-
91-
<p>
92-
<label>attachments_dir <input name="attachments_dir" type="text" size="35" value=""></label>
93-
</p>
9490
</details>
9591
</div>
9692

dedoc/data_structures/document_metadata.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import uuid
2-
from typing import Dict, Union
2+
from typing import Any, Dict, Union
33

44
from dedoc.api.schema.document_metadata import DocumentMetadata as ApiDocumentMetadata
55
from dedoc.data_structures.serializable import Serializable
@@ -38,8 +38,11 @@ def __init__(self,
3838
self.access_time = access_time
3939
self.file_type = file_type
4040
for key, value in kwargs.items():
41-
setattr(self, key, value)
41+
self.add_attribute(key, value)
4242
self.uid = f"doc_uid_auto_{uuid.uuid1()}" if uid is None else uid
4343

44+
def add_attribute(self, key: str, value: Any) -> None:  # noqa
    """
    Set an attribute with the given name on this metadata object (an existing attribute with the same name is overwritten).

    :param key: name of the attribute
    :param value: value to store under this name
    """
    setattr(self, key, value)
46+
4447
def to_api_schema(self) -> ApiDocumentMetadata:
4548
return ApiDocumentMetadata(**vars(self))

dedoc/readers/docx_reader/data_structures/docx_document.py

Lines changed: 10 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,11 @@
11
import hashlib
22
import logging
3-
import os
43
import re
5-
import zipfile
64
from collections import defaultdict
7-
from typing import List, Optional
5+
from typing import List
86

97
from bs4 import BeautifulSoup, Tag
108

11-
from dedoc.common.exceptions.bad_file_error import BadFileFormatError
129
from dedoc.data_structures.attached_file import AttachedFile
1310
from dedoc.data_structures.concrete_annotations.attach_annotation import AttachAnnotation
1411
from dedoc.data_structures.concrete_annotations.table_annotation import TableAnnotation
@@ -19,6 +16,7 @@
1916
from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter
2017
from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
2118
from dedoc.readers.docx_reader.styles_extractor import StylesExtractor
19+
from dedoc.utils.office_utils import get_bs_from_zip
2220
from dedoc.utils.utils import calculate_file_hash
2321

2422

@@ -28,8 +26,8 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L
2826
self.path = path
2927
self.attachment_name2uid = {attachment.original_name: attachment.uid for attachment in attachments}
3028

31-
self.document_bs_tree = self.__get_bs_tree("word/document.xml")
32-
self.document_bs_tree = self.__get_bs_tree("word/document2.xml") if self.document_bs_tree is None else self.document_bs_tree
29+
self.document_bs_tree = get_bs_from_zip(self.path, "word/document.xml")
30+
self.document_bs_tree = get_bs_from_zip(self.path, "word/document2.xml") if self.document_bs_tree is None else self.document_bs_tree
3331
self.body = self.document_bs_tree.body if self.document_bs_tree else None
3432
self.paragraph_maker = self.__get_paragraph_maker()
3533

@@ -39,8 +37,8 @@ def __init__(self, path: str, attachments: List[AttachedFile], logger: logging.L
3937
self.lines = self.__get_lines()
4038

4139
def __get_paragraph_maker(self) -> ParagraphMaker:
42-
styles_extractor = StylesExtractor(self.__get_bs_tree("word/styles.xml"), self.logger)
43-
num_tree = self.__get_bs_tree("word/numbering.xml")
40+
styles_extractor = StylesExtractor(get_bs_from_zip(self.path, "word/styles.xml"), self.logger)
41+
num_tree = get_bs_from_zip(self.path, "word/numbering.xml")
4442
numbering_extractor = NumberingExtractor(num_tree, styles_extractor) if num_tree else None
4543
styles_extractor.numbering_extractor = numbering_extractor
4644

@@ -49,8 +47,8 @@ def __get_paragraph_maker(self) -> ParagraphMaker:
4947
path_hash=calculate_file_hash(path=self.path),
5048
styles_extractor=styles_extractor,
5149
numbering_extractor=numbering_extractor,
52-
footnote_extractor=FootnoteExtractor(self.__get_bs_tree("word/footnotes.xml")),
53-
endnote_extractor=FootnoteExtractor(self.__get_bs_tree("word/endnotes.xml"), key="endnote")
50+
footnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")),
51+
endnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote")
5452
)
5553

5654
def __get_lines(self) -> List[LineWithMeta]:
@@ -120,23 +118,6 @@ def __paragraphs2lines(self, image_refs: dict, table_refs: dict, diagram_refs: d
120118

121119
return lines_with_meta
122120

123-
def __get_bs_tree(self, filename: str) -> Optional[BeautifulSoup]:
124-
"""
125-
Gets xml bs tree from the given file inside the self.path.
126-
:param filename: name of file to extract the tree
127-
:return: BeautifulSoup tree or None if file wasn't found
128-
"""
129-
try:
130-
with zipfile.ZipFile(self.path) as document:
131-
content = document.read(filename)
132-
content = re.sub(br"\n[\t ]*", b"", content)
133-
soup = BeautifulSoup(content, "xml")
134-
return soup
135-
except KeyError:
136-
return None
137-
except zipfile.BadZipFile:
138-
raise BadFileFormatError(f"Bad docx file:\n file_name = {os.path.basename(self.path)}. Seems docx is broken")
139-
140121
def __handle_table_xml(self, xml: Tag, table_refs: dict) -> None:
141122
table = DocxTable(xml, self.paragraph_maker)
142123
self.tables.append(table.to_table())
@@ -150,9 +131,9 @@ def __handle_table_xml(self, xml: Tag, table_refs: dict) -> None:
150131
table_refs[len(self.paragraph_list) - 1].append(table_uid)
151132

152133
def __handle_images_xml(self, xmls: List[Tag], image_refs: dict) -> None:
153-
rels = self.__get_bs_tree("word/_rels/document.xml.rels")
134+
rels = get_bs_from_zip(self.path, "word/_rels/document.xml.rels")
154135
if rels is None:
155-
rels = self.__get_bs_tree("word/_rels/document2.xml.rels")
136+
rels = get_bs_from_zip(self.path, "word/_rels/document2.xml.rels")
156137

157138
images_rels = dict()
158139
for rel in rels.find_all("Relationship"):
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
class NumberingExtractor:
    """
    This class is used to compute numbering text for items of numbered lists.
    For example: "1. ", "(b) ", "iv) "
    """
    def __init__(self) -> None:
        # Mapping according to the ST_TextAutonumberScheme
        # NOTE: Chinese, Japanese, Hindi and Thai schemes are ignored
        self.numbering_types = dict(
            arabic="1",  # 1, 2, 3, ..., 10, 11, 12, ...
            alphaLc="a",  # a, b, c, ..., y, z, aa, bb, cc, ..., yy, zz, aaa, bbb, ccc, ...
            alphaUc="A",  # A, B, C, ..., Y, Z, AA, BB, CC, ..., YY, ZZ, AAA, BBB, CCC, ...
            romanLc="i",  # i, ii, iii, iv, ..., xviii, xix, xx, xxi, ...
            romanUc="I"  # I, II, III, IV, ..., XVIII, XIX, XX, XXI, ...
        )

        self.numbering_formatting = dict(
            ParenBoth="({}) ",
            ParenR="{}) ",
            Period="{}. ",
            Plain="{} "
        )

        # e.g. "arabicPeriod" -> ("arabic", "Period")
        self.combined_types = {
            num_type + num_formatting: (num_type, num_formatting) for num_type in self.numbering_types for num_formatting in self.numbering_formatting
        }
        # Ordered from the largest value to the smallest one, subtractive pairs (iv, ix, xl, ...) are included
        self.roman_mapping = [
            (1000, "m"), (900, "cm"), (500, "d"), (400, "cd"), (100, "c"), (90, "xc"), (50, "l"), (40, "xl"), (10, "x"), (9, "ix"), (5, "v"), (4, "iv"), (1, "i")
        ]

    def get_text(self, numbering: str, shift: int) -> str:
        """
        Computes the text of the list item located at the given position of the list sequence.

        :param numbering: type of the numbering, e.g. "arabicPeriod"; unknown types are handled as "arabicPeriod"
        :param shift: zero-based shift from the beginning of list numbering (0 corresponds to "1", "a", "i")
        :return: string representation of the numbering item including its formatting, e.g. "1. " or "(iv) "
        """
        num_type, num_formatting = self.combined_types.get(numbering, ("arabic", "Period"))

        if num_type in ("alphaLc", "alphaUc"):
            # after "z" the letters are repeated: aa, bb, ...
            repeats, letter_offset = divmod(shift, 26)
            num_char = chr(ord(self.numbering_types[num_type]) + letter_offset) * (repeats + 1)
        elif num_type in ("romanLc", "romanUc"):
            # shift is zero-based like for other numbering types, so the first item has the value 1
            remainder = shift + 1
            roman_parts = []
            for number, letters in self.roman_mapping:
                count, remainder = divmod(remainder, number)
                roman_parts.append(letters * count)
            num_char = "".join(roman_parts)
            if num_type == "romanUc":
                num_char = num_char.upper()
        else:
            num_char = str(int(self.numbering_types["arabic"]) + shift)

        return self.numbering_formatting[num_formatting].format(num_char)
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from bs4 import Tag
2+
3+
from dedoc.data_structures import AlignmentAnnotation, BoldAnnotation, HierarchyLevel, ItalicAnnotation, LineMetadata, LineWithMeta, SizeAnnotation, \
4+
StrikeAnnotation, SubscriptAnnotation, SuperscriptAnnotation, UnderlinedAnnotation
5+
from dedoc.readers.pptx_reader.numbering_extractor import NumberingExtractor
6+
from dedoc.readers.pptx_reader.properties_extractor import PropertiesExtractor
7+
from dedoc.utils.annotation_merger import AnnotationMerger
8+
9+
10+
class PptxParagraph:
    """
    Representation of a single textual paragraph (tag <a:p>) that belongs to some entity, e.g. a shape or a table cell.
    """
    def __init__(self, xml: Tag, numbering_extractor: NumberingExtractor, properties_extractor: PropertiesExtractor) -> None:
        self.xml = xml

        # type of the automatic numbering if the paragraph is an item of a numbered list
        if self.xml.buAutoNum:
            self.numbered_list_type = self.xml.buAutoNum.get("type", "arabicPeriod")
        else:
            self.numbered_list_type = None

        # the "lvl" attribute is zero-based, the stored level starts from 1
        if self.xml.pPr:
            self.level = int(self.xml.pPr.get("lvl", 0)) + 1
        else:
            self.level = 1

        self.numbering_extractor = numbering_extractor
        self.properties_extractor = properties_extractor
        self.annotation_merger = AnnotationMerger()

        # boolean run properties that are converted to annotations, keyed by the annotation name
        self.dict2annotation = {}
        for annotation_class in (BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation):
            self.dict2annotation[annotation_class.name] = annotation_class

    def get_line_with_meta(self, page_id: int, line_id: int, is_title: bool, shift: int = 0) -> LineWithMeta:
        """
        Build a line with metadata and annotations from the paragraph.

        :param page_id: value stored as page_id in the line metadata
        :param line_id: value stored as line_id in the line metadata
        :param is_title: if True, the line gets a header hierarchy level
        :param shift: shift passed to the numbering extractor for numbered list items
        :return: line whose text ends with a line break
        """
        paragraph_properties = self.properties_extractor.get_properties(self.xml.pPr, level=self.level)
        hierarchy_level = HierarchyLevel.create_raw_text()
        text = ""

        # header > numbered list item > bullet list item > raw text
        line_type, level_1 = None, None
        if is_title or paragraph_properties.title:
            line_type, level_1 = HierarchyLevel.header, 1
        elif self.numbered_list_type:
            text = text + self.numbering_extractor.get_text(self.numbered_list_type, shift)
            line_type, level_1 = HierarchyLevel.list_item, 2
        elif self.xml.buChar:
            text = text + self.xml.buChar["char"] + " "
            line_type, level_1 = HierarchyLevel.list_item, 3

        if level_1 is not None:
            hierarchy_level = HierarchyLevel(line_type=line_type, level_1=level_1, level_2=self.level, can_be_multiline=False)

        annotations = []
        runs = self.xml.find_all("a:r") if self.xml.r else []
        for run in runs:
            start = len(text)
            for child in run:
                if child.name == "t" and run.text:
                    text += run.text
            end = len(text)

            run_properties = self.properties_extractor.get_properties(run.rPr, level=self.level, properties=paragraph_properties)
            annotations.append(SizeAnnotation(start=start, end=end, value=str(run_properties.size)))
            for property_name, annotation_class in self.dict2annotation.items():
                if getattr(run_properties, property_name):
                    annotations.append(annotation_class(start=start, end=end, value="True"))

        text += "\n"
        annotations = self.annotation_merger.merge_annotations(annotations, text)
        annotations.append(AlignmentAnnotation(start=0, end=len(text), value=paragraph_properties.alignment))

        line_metadata = LineMetadata(page_id=page_id, line_id=line_id, tag_hierarchy_level=hierarchy_level)
        return LineWithMeta(text, metadata=line_metadata, annotations=annotations)

0 commit comments

Comments
 (0)