Skip to content

Commit 724e2d2

Browse files
NastyBoget, sunveil, alexander1999-hub, Alexander Golodkov
authored
new version 2.3 (#498)
Co-authored-by: Andrey Mikhailov <mikhailov@icc.ru> Co-authored-by: Alexander Golodkov <55749660+alexander1999-hub@users.noreply.github.com> Co-authored-by: Alexander Golodkov <golodkov@ispras.ru>
1 parent 765aae2 commit 724e2d2

File tree

111 files changed

+2683
-316
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

111 files changed

+2683
-316
lines changed

.flake8

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ exclude =
2828
*__init__.py,
2929
resources,
3030
venv,
31+
.venv,
3132
build,
3233
dedoc.egg-info,
3334
docs/_build,
@@ -48,5 +49,5 @@ per-file-ignores =
4849
scripts/*:T201
4950
scripts/benchmark_pdf_performance*:JS101
5051
tests/custom_test_runner.py:ANN001,ANN201,ANN202,ANN204,N802
51-
docs/source/_static/code_examples/*:I251
52+
docs/source/_static/code_examples/*:I251,T201
5253
docs/source/_static/code_examples/langchain/*:FOL001,FOL002,FOL003,FOL004,FOL005,I100,I202,I251

.github/workflows/docs.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,3 +33,4 @@ jobs:
3333
python dedoc_usage_tutorial.py
3434
python dedoc_add_new_doc_type_tutorial.py
3535
python dedoc_add_new_structure_type_tutorial.py
36+
python dedoc_using_patterns_tutorial.py

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ repos:
33
rev: 5.0.4
44
hooks:
55
- id: flake8
6-
exclude: \.github|.*__init__\.py|resources|docs|venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
6+
exclude: \.github|.*__init__\.py|resources|docs|venv|\.venv|build|dedoc\.egg-info|scripts/fintoc2022/metric.py
77
args:
88
- "--config=.flake8"
99
additional_dependencies: [

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
# Dedoc
22

3+
[![Telegram](https://img.shields.io/badge/chat-on%20Telegram-2ba2d9.svg)](https://t.me/dedoc_chat)
34
[![image](https://img.shields.io/pypi/pyversions/dedoc.svg)](https://pypi.python.org/pypi/dedoc)
45
[![GitHub release](https://img.shields.io/github/release/ispras/dedoc.svg)](https://github.com/ispras/dedoc/releases/)
56
[![PyPI version](https://badge.fury.io/py/dedoc.svg)](https://badge.fury.io/py/dedoc)
@@ -94,6 +95,12 @@ Relevant documentation of dedoc is available [here](https://dedoc.readthedocs.io
9495
* Article on habr.com [Dedoc: как автоматически извлечь из текстового документа всё и даже немного больше](https://habr.com/ru/companies/isp_ras/articles/779390/) in Russian (2023)
9596
* Article [Dedoc: A Universal System for Extracting Content and Logical Structure From Textual Documents](https://ieeexplore.ieee.org/abstract/document/10508151/) in English (2023)
9697

98+
# Join Our Community
99+
100+
Have questions or want to discuss Dedoc? Join our [Telegram chat](https://t.me/dedoc_chat) and connect with the community and the developers.
101+
102+
Join our [Telegram channel](https://t.me/dedoc_channel) to get notifications about the most recent updates.
103+
97104
# Installation instructions
98105

99106
This project has a REST api and you can run it in Docker container.

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.2.7
1+
2.3

dedoc/api/api_args.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
class QueryParameters:
99
# type of document structure parsing
1010
document_type: str = Form("", enum=["", "law", "tz", "diploma", "article", "fintoc"], description="Document domain")
11+
patterns: str = Form("", description='Patterns for default document type (when document_type="")')
1112
structure_type: str = Form("tree", enum=["linear", "tree"], description="Output structure type")
1213
return_format: str = Form("json", enum=["json", "html", "plain_text", "tree", "collapsed_tree", "ujson", "pretty_json"],
1314
description="Response representation, most types (except json) are used for debug purposes only")
@@ -39,6 +40,7 @@ class QueryParameters:
3940
'"no_change" - set vertical orientation of the document without using an orientation classifier')
4041
need_header_footer_analysis: str = Form("false", enum=["true", "false"], description="Exclude headers and footers from PDF parsing result")
4142
need_binarization: str = Form("false", enum=["true", "false"], description="Binarize document pages (for images or PDF without a textual layer)")
43+
need_gost_frame_analysis: str = Form("false", enum=["true", "false"], description="Parameter for detecting and ignoring GOST frame of the document")
4244

4345
# other formats handling
4446
delimiter: Optional[str] = Form(None, description="Column separator for CSV files")

dedoc/api/schema/annotation.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,16 @@ class Annotation(BaseModel):
55
"""
66
The piece of information about the text line: its appearance or links to another document object.
77
For example Annotation(1, 13, "italic", "True") says that text between 1st and 13th symbol was written in italic.
8+
9+
:ivar start: start of the annotated text
10+
:ivar end: end of the annotated text (end isn't included)
11+
:ivar name: annotation's name, specific for each type of annotation
12+
:ivar value: information about annotated text, depends on the type of annotation, e.g. "True"/"False", "10.0", etc.
13+
14+
:vartype start: int
15+
:vartype end: int
16+
:vartype name: str
17+
:vartype value: str
818
"""
919
start: int = Field(description="Start of the annotated text", example=0)
1020
end: int = Field(description="End of the annotated text (end isn't included)", example=5)

dedoc/api/schema/cell_with_meta.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,16 @@
88
class CellWithMeta(BaseModel):
99
"""
1010
Holds the information about the cell: list of lines and cell properties (rowspan, colspan, invisible).
11+
12+
:ivar lines: list of textual lines of the cell
13+
:ivar colspan: number of columns to span (for cells merged horizontally)
14+
:ivar rowspan: number of rows to span (for cells merged vertically)
15+
:ivar invisible: indicator for displaying or hiding cell text - cells that are merged with others are hidden (for HTML display)
16+
17+
:vartype lines: List[LineWithMeta]
18+
:vartype colspan: int
19+
:vartype rowspan: int
20+
:vartype invisible: bool
1121
"""
1222
lines: List[LineWithMeta] = Field(description="Textual lines of the cell with annotations")
1323
rowspan: int = Field(description="Number of rows to span like in HTML format", example=1)

dedoc/api/schema/document_content.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,12 @@
99
class DocumentContent(BaseModel):
1010
"""
1111
Content of the document - structured text and tables.
12+
13+
:ivar tables: list of document tables
14+
:ivar structure: tree structure of the document nodes with text and additional metadata
15+
16+
:vartype tables: List[Table]
17+
:vartype structure: TreeNode
1218
"""
1319
structure: TreeNode = Field(description="Tree structure where content of the document is organized")
1420
tables: List[Table] = Field(description="List of document tables")

dedoc/api/schema/document_metadata.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,26 @@
44
class DocumentMetadata(BaseModel):
55
"""
66
Document metadata like its name, size, author, etc.
7+
8+
:ivar file_name: original document name (before rename and conversion, so it can contain non-ascii symbols, spaces and so on)
9+
:ivar temporary_file_name: file name during parsing (unique name after rename and conversion)
10+
:ivar size: size of the original file in bytes
11+
:ivar modified_time: time of the last modification in unix time format (seconds since the epoch)
12+
:ivar created_time: time of the creation in unixtime
13+
:ivar access_time: time of the last access to the file in unixtime
14+
:ivar file_type: mime type of the file
15+
:ivar uid: document unique identifier (useful for attached files)
16+
17+
:vartype file_name: str
18+
:vartype temporary_file_name: str
19+
:vartype size: int
20+
:vartype modified_time: int
21+
:vartype created_time: int
22+
:vartype access_time: int
23+
:vartype file_type: str
24+
:vartype uid: str
25+
26+
Additional variables may be added with other file metadata.
727
"""
828
class Config:
929
extra = Extra.allow

dedoc/api/schema/line_metadata.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,20 @@
66
class LineMetadata(BaseModel):
77
"""
88
Holds information about document node/line metadata, such as page number or line type.
9+
10+
:ivar paragraph_type: type of the document line/paragraph (header, list_item, list, etc.)
11+
:ivar page_id: page number where paragraph starts, the numeration starts from page 0
12+
:ivar line_id: line number inside the entire document, the numeration starts from line 0
13+
14+
:vartype paragraph_type: str
15+
:vartype page_id: int
16+
:vartype line_id: Optional[int]
17+
18+
Additional variables may be added with other line metadata.
919
"""
1020
class Config:
1121
extra = Extra.allow
1222

13-
paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list) and etc.", example="raw_text")
23+
paragraph_type: str = Field(description="Type of the document line/paragraph (header, list_item, list, etc.)", example="raw_text")
1424
page_id: int = Field(description="Page number of the line/paragraph beginning", example=0)
1525
line_id: Optional[int] = Field(description="Line number", example=1)

dedoc/api/schema/line_with_meta.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,12 @@
88
class LineWithMeta(BaseModel):
99
"""
1010
Textual line with text annotations.
11+
12+
:ivar text: text of the line
13+
:ivar annotations: text annotations (font, size, bold, italic, etc.)
14+
15+
:vartype text: str
16+
:vartype annotations: List[Annotation]
1117
"""
1218
text: str = Field(description="Text of the line", example="Some text")
13-
annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic and etc)")
19+
annotations: List[Annotation] = Field(description="Text annotations (font, size, bold, italic, etc.)")

dedoc/api/schema/parsed_document.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,18 @@
99
class ParsedDocument(BaseModel):
1010
"""
1111
Holds information about the document content, metadata and attachments.
12+
13+
:ivar content: document text (hierarchy of nodes) and tables
14+
:ivar attachments: result of analysis of attached files (empty if with_attachments=False)
15+
:ivar metadata: document metadata such as size, creation date and so on.
16+
:ivar warnings: list of warnings and possible errors, arising in the process of document parsing
17+
:ivar version: version of the program that parsed this document
18+
19+
:vartype content: DocumentContent
20+
:vartype attachments: List[ParsedDocument]
21+
:vartype metadata: DocumentMetadata
22+
:vartype warnings: List[str]
23+
:vartype version: str
1224
"""
1325
content: DocumentContent = Field(description="Document text and tables")
1426
metadata: DocumentMetadata = Field(description="Document metadata such as size, creation date and so on")

dedoc/api/schema/table.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@ class Table(BaseModel):
1111
Holds information about tables in the document.
1212
We assume that a table has rectangle form (has the same number of columns in each row).
1313
Table representation is row-based i.e. external list contains list of rows.
14+
15+
:ivar metadata: table metadata such as location, title and so on
16+
:ivar cells: a list of lists of table cells (cell has text lines, colspan and rowspan attributes)
17+
18+
:vartype metadata: TableMetadata
19+
:vartype cells: List[List[CellWithMeta]]
1420
"""
1521
cells: List[List[CellWithMeta]] = Field(description="List of lists of table cells (cell has text, colspan and rowspan attributes)")
1622
metadata: TableMetadata = Field(description="Table meta information")

dedoc/api/schema/table_metadata.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,16 @@
66
class TableMetadata(BaseModel):
77
"""
88
Holds the information about table unique identifier, rotation angle (if table has been rotated - for images) and so on.
9+
10+
:ivar page_id: number of the page where table starts
11+
:ivar uid: unique identifier of the table (used for linking table to text)
12+
:ivar rotated_angle: value of the rotation angle by which the table was rotated during recognition
13+
:ivar title: table's title
14+
15+
:vartype page_id: Optional[int]
16+
:vartype uid: str
17+
:vartype rotated_angle: float
18+
:vartype title: str
919
"""
1020
page_id: Optional[int] = Field(description="Number of the page where the table starts", example=0)
1121
uid: str = Field(description="Unique identifier of the table", example="e8ba5523-8546-4804-898c-2f4835a1804f")

dedoc/api/schema/tree_node.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,18 @@ class TreeNode(BaseModel):
1010
"""
1111
Helps to represent document as recursive tree structure.
1212
It has list of children `TreeNode` nodes (empty list for a leaf node).
13+
14+
:ivar node_id: unique node identifier
15+
:ivar text: text of the node (may contain several lines)
16+
:ivar annotations: metadata related to a part of the text (such as font size)
17+
:ivar metadata: metadata that refers to the entire node (such as node type)
18+
:ivar subparagraphs: list of children of this node
19+
20+
:vartype node_id: str
21+
:vartype text: str
22+
:vartype annotations: List[Annotation]
23+
:vartype metadata: LineMetadata
24+
:vartype subparagraphs: List[TreeNode]
1325
"""
1426
node_id: str = Field(description="Document element identifier. It is unique within a document content tree. "
1527
"The identifier consists of numbers separated by dots where each number "

dedoc/api/web/index.html

Lines changed: 27 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ <h3>Parameters configuration</h3>
2828

2929
<div class="parameters">
3030
<h4>Type of document structure parsing</h4>
31-
<details><summary>document_type, structure_type, return_format</summary>
31+
<details><summary>document_type, patterns, structure_type, return_format</summary>
3232
<br>
3333
<p>
3434
<label>
@@ -43,6 +43,14 @@ <h4>Type of document structure parsing</h4>
4343
</label>
4444
</p>
4545

46+
<p>
47+
<div>
48+
Patterns for default structure extractor (document_type="other")<br>
49+
<label><textarea id="patterns" name="patterns" style="width:450px;height:75px;"></textarea></label><br>
50+
<button type="button" onclick="Format()">Format</button>
51+
</div>
52+
</p>
53+
4654
<p>
4755
<label>
4856
<select name="structure_type">
@@ -114,7 +122,7 @@ <h4>Tables handling </h4>
114122

115123
<div class="parameters">
116124
<h4>PDF handling</h4>
117-
<details><summary>pdf_with_text_layer, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization</summary>
125+
<details><summary>pdf_with_text_layer, fast_textual_layer_detection, language, pages, is_one_column_document, document_orientation, need_header_footer_analysis, need_binarization, need_gost_frame_analysis</summary>
118126
<br>
119127
<p>
120128
<label>
@@ -175,6 +183,9 @@ <h4>PDF handling</h4>
175183
<p>
176184
<label><input name="need_binarization" type="checkbox" value="true"> need_binarization</label>
177185
</p>
186+
<p>
187+
<label><input name="need_gost_frame_analysis" type="checkbox" value="true"> need_gost_frame_analysis</label>
188+
</p>
178189
</details>
179190
</div>
180191

@@ -213,4 +224,18 @@ <h3>Useful links</h3>
213224
</ul>
214225

215226
</body>
227+
228+
<script>
229+
function Format() {
230+
try {
231+
let input = document.getElementById("patterns")
232+
let data = JSON.parse(input.value.replaceAll("\\", "\\\\"))
233+
input.value = JSON.stringify(data, null, 2).replaceAll("\\\\", "\\")
234+
}
235+
catch (error) {
236+
alert("Incorrect JSON syntax")
237+
}
238+
}
239+
</script>
240+
216241
</html>

dedoc/data_structures/annotation.py

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,18 @@ class Annotation(Serializable):
77
Base class for text annotations of all kinds.
88
Annotation is the piece of information about the text line: its appearance or links to another document object.
99
Look at the concrete kinds of annotations to get more examples.
10+
11+
:ivar start: start of the annotated text
12+
:ivar end: end of the annotated text (end isn't included)
13+
:ivar name: annotation's name, specific for each type of annotation
14+
:ivar value: information about annotated text, depends on the type of annotation, e.g. "True"/"False", "10.0", etc.
15+
:ivar is_mergeable: is it possible to merge annotations with the same value
16+
17+
:vartype start: int
18+
:vartype end: int
19+
:vartype name: str
20+
:vartype value: str
21+
:vartype is_mergeable: bool
1022
"""
1123

1224
def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bool = True) -> None:
@@ -20,11 +32,11 @@ def __init__(self, start: int, end: int, name: str, value: str, is_mergeable: bo
2032
:param value: information about annotated text
2133
:param is_mergeable: is it possible to merge annotations with the same value
2234
"""
23-
self.start = start
24-
self.end = end
25-
self.name = name
26-
self.value = value
27-
self.is_mergeable = is_mergeable
35+
self.start: int = start
36+
self.end: int = end
37+
self.name: str = name
38+
self.value: str = value
39+
self.is_mergeable: bool = is_mergeable
2840

2941
def __eq__(self, o: object) -> bool:
3042
if not isinstance(o, Annotation):
@@ -35,7 +47,7 @@ def __str__(self) -> str:
3547
return f"{self.name.capitalize()}({self.start}:{self.end}, {self.value})"
3648

3749
def __repr__(self) -> str:
38-
return f"{self.name.capitalize()}(...)"
50+
return self.__str__()
3951

4052
def to_api_schema(self) -> ApiAnnotation:
4153
return ApiAnnotation(start=self.start, end=self.end, name=self.name, value=self.value)

dedoc/data_structures/attached_file.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,28 @@
11
class AttachedFile:
22
"""
33
Holds information about files, attached to the parsed document.
4+
5+
:ivar original_name: original name of the attached file if it was possible to extract it
6+
:ivar tmp_file_path: path to the attached file on disk - its name is different from original_name
7+
:ivar need_content_analysis: does the attached file need parsing (enable recursive parsing in :class:`~dedoc.DedocManager`)
8+
:ivar uid: unique identifier of the attached file
9+
10+
:vartype original_name: str
11+
:vartype tmp_file_path: str
12+
:vartype need_content_analysis: bool
13+
:vartype uid: str
414
"""
515
def __init__(self, original_name: str, tmp_file_path: str, need_content_analysis: bool, uid: str) -> None:
616
"""
7-
:param original_name: Name of the file from which the attachments are extracted
8-
:param tmp_file_path: path to the attachment file.
17+
:param original_name: original name of the attached file
18+
:param tmp_file_path: path to the attachment file
919
:param need_content_analysis: indicator of whether we should parse the attachment's content or simply save it without parsing
1020
:param uid: unique identifier of the attachment
1121
"""
12-
self.original_name = original_name
13-
self.tmp_file_path = tmp_file_path
14-
self.need_content_analysis = need_content_analysis
15-
self.uid = uid
22+
self.original_name: str = original_name
23+
self.tmp_file_path: str = tmp_file_path
24+
self.need_content_analysis: bool = need_content_analysis
25+
self.uid: str = uid
1626

1727
def get_filename_in_path(self) -> str:
1828
return self.tmp_file_path

0 commit comments

Comments
 (0)