Skip to content

pdf broken encoding reader #522

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 28 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
29454d6
встроил, надо разобраться с импортами и т.п.
sinkudo Mar 25, 2025
d3ef3cf
changed script to extract text, but problem with get_text() remains (…
sinkudo Mar 25, 2025
62ec1d1
adding reader to manager, cleaning comments
sinkudo Mar 27, 2025
d0179da
added reader to api
sinkudo Mar 27, 2025
a3b51e7
is pdf valid check, (cid:xxx) instead of chars fix
sinkudo Apr 1, 2025
89a320e
redundant funcs
sinkudo Apr 2, 2025
db17824
imports
sinkudo Apr 2, 2025
fad75fc
tf optional import(soon will remove and replace with torch)
sinkudo Apr 22, 2025
f9a877b
unit test
sinkudo Apr 22, 2025
78271db
txt for test and remove script
sinkudo Apr 22, 2025
c072785
reader and parameters into docs
sinkudo Apr 22, 2025
9738962
added docstrings, removed redundant tables var, removed duplicate of…
sinkudo Apr 22, 2025
7ece585
now download model to resources_path, saving pdfdata needed for extra…
sinkudo Apr 22, 2025
25d0adc
moved external functions into functions
sinkudo Apr 22, 2025
a7659dd
model imports
sinkudo Apr 22, 2025
2e6d99b
api test
sinkudo Apr 29, 2025
96b7683
quotes
sinkudo Apr 29, 2025
bff37c5
multilines
sinkudo Apr 29, 2025
847a9e4
imports + return types + var rename
sinkudo Apr 29, 2025
2b17ed0
TLDR-903 upgrade PyPDF2 to pypdf>4; fix bug with PDF attachments (#515)
NastyBoget Feb 14, 2025
e01bd93
torch cnn, change docker to install fontforge, specified exception, …
sinkudo May 6, 2025
86237ea
style changes
sinkudo May 7, 2025
0e7b6b4
more style changes, documentation fix
sinkudo May 14, 2025
7afe558
more style
sinkudo May 14, 2025
5f735b9
style
sinkudo May 14, 2025
425b311
teper tochno
sinkudo May 14, 2025
ce2bf87
dockerfile fix, txt path in api test fix
sinkudo May 21, 2025
edc7c82
remove optional dependency
sinkudo Jul 10, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ var/
*.egg-info/
.installed.cfg
*.egg
dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_broken_encoding_reader/data/pdfdata

# PyInstaller
# Usually these files are written by a python script from a template
Expand Down Expand Up @@ -148,4 +149,4 @@ crashlytics-build.properties
fabric.properties

# Mac OS extentions
*.DS_Store
*.DS_Store
2 changes: 1 addition & 1 deletion dedoc/api/api_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class QueryParameters:
table_type: str = Form("", description="Pipeline mode for table recognition")

# pdf handling
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby"],
pdf_with_text_layer: str = Form("auto_tabby", enum=["true", "false", "auto", "auto_tabby", "tabby","bad_encoding_reader"],
description="Extract text from a text layer of PDF or using OCR methods for image-like documents")
fast_textual_layer_detection: str = Form("false", enum=["true", "false"],
description="Use non-ML solution to detect textual layer. Much faster but less accurate.")
Expand Down
1 change: 1 addition & 0 deletions dedoc/api/web/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,7 @@ <h4>PDF handling</h4>
<option value="auto">auto</option>
<option value="auto_tabby" selected>auto_tabby</option>
<option value="tabby">tabby</option>
<option value="bad_encoding_reader">bad_encoding_reader</option>
</select> pdf_with_text_layer
</label>
</p>
Expand Down
2 changes: 2 additions & 0 deletions dedoc/manager_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def _get_manager_config(config: dict) -> dict:
from dedoc.readers.pdf_reader.pdf_image_reader.pdf_image_reader import PdfImageReader
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_tabby_reader import PdfTabbyReader
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_txtlayer_reader import PdfTxtlayerReader
from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_broken_encoding_reader.pdf_broken_encoding_reader import PdfBrokenEncodingReader
from dedoc.readers.pptx_reader.pptx_reader import PptxReader
from dedoc.readers.reader_composition import ReaderComposition
from dedoc.readers.txt_reader.raw_text_reader import RawTextReader
Expand Down Expand Up @@ -73,6 +74,7 @@ def _get_manager_config(config: dict) -> dict:
PdfAutoReader(config=config),
PdfTabbyReader(config=config),
PdfTxtlayerReader(config=config),
PdfBrokenEncodingReader(config=config),
PdfImageReader(config=config),
EmailReader(config=config),
MhtmlReader(config=config)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import enum
import glob
import os
from pathlib import Path

from keras.models import load_model

from dedoc.readers.pdf_reader.pdf_txtlayer_reader.pdf_broken_encoding_reader.functions import get_project_root

# Base directory from which every path in `folders` below is built.
# NOTE(review): the exact location is whatever get_project_root() returns - confirm in functions.py.
ROOT_DIR = get_project_root()

# Named character sets. Every value is an ordered list of single-character strings;
# the lists are exposed through the `Language` enum defined later in this module.
# The `*_no_reg_diff` variants contain lowercase letters only (presumably "no register
# difference", i.e. case-insensitive - confirm with the model training code).
# All sets share the same digits and punctuation and end with '©' and '™'.
# The Cyrillic ranges do not include 'ё' / 'Ё'.
# NOTE(review): `rus_eng` and `rus_eng_no_reg_diff` list '-', '.', ',' whereas the other
# four sets list '-', ',', '.'. If list positions are used as class indices, confirm that
# this ordering matches the corresponding trained models before "normalising" it.
char_pool = dict(
# Latin + Cyrillic, both cases.
rus_eng=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к',
'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я',
'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', 'И', 'Й', 'К', 'Л', 'М', 'Н', 'О', 'П', 'Р', 'С', 'Т', 'У', 'Ф',
'Х', 'Ц', 'Ч', 'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю', 'Я', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
'!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', '-', '.', ',', '/', ':', ';', '<', '=', '>', '?',
'@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '©', '™'],
# Latin + Cyrillic, lowercase only.
rus_eng_no_reg_diff=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's',
't', 'u', 'v', 'w', 'x', 'y', 'z', 'а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к',
'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э',
'ю', 'я', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '!', '"', '#', '$', '%', '&', "'",
'(', ')', '*', '+', '-', '.', ',', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^',
'_', '`', '{', '|', '}', '~', '©', '™'],
# Cyrillic, both cases.
rus=['а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у', 'ф',
'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', 'А', 'Б', 'В', 'Г', 'Д', 'Е', 'Ж', 'З', 'И', 'Й',
'К', 'Л', 'М', 'Н', 'О', 'П', 'Р', 'С', 'Т', 'У', 'Ф', 'Х', 'Ц', 'Ч', 'Ш', 'Щ', 'Ъ', 'Ы', 'Ь', 'Э', 'Ю',
'Я', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*',
'+', '-', ',', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|',
'}', '~', '©', '™'],
# Cyrillic, lowercase only.
rus_no_reg_diff=['а', 'б', 'в', 'г', 'д', 'е', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', 'о', 'п', 'р', 'с', 'т', 'у',
'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', 'э', 'ю', 'я', '0', '1', '2', '3', '4', '5', '6', '7',
'8', '9', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', '-', ',', '.', '/', ':', ';', '<',
'=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '©', '™'],
# Latin, both cases.
eng=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u',
'v', 'w', 'x', 'y', 'z', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '!',
'"', '#', '$', '%', '&', "'", '(', ')', '*', '+', '-', ',', '.', '/', ':', ';', '<', '=', '>', '?', '@',
'[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', '©', '™'],
# Latin, lowercase only.
eng_no_reg_diff=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't',
'u', 'v', 'w', 'x', 'y', 'z', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '!', '"', '#', '$',
'%', '&', "'", '(', ')', '*', '+', '-', ',', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[',
'\\', ']', '^', '_', '`', '{', '|', '}', '~', '©', '™']
)

# Auxiliary glyph groups, keyed by purpose.
#   bottom_align - presumably glyphs that sit on the baseline when rendered; confirm with the consumer.
#   dont_aug     - presumably glyphs (given either as the character itself or as a glyph name such as
#                  "dot" / "quotedbl") that are excluded from augmentation; confirm with the consumer.
other = dict(
    bottom_align=[',', '.', '_'],
    # A comma was missing between "]" and "<": Python's implicit string-literal concatenation
    # silently produced the single entry "]<", so neither "]" nor "<" was actually in the list.
    dont_aug=[",", "dot", "\\", "`", "_", "-", "=", ";", ":", "quotedbl", "colon", "backslash", ")", "(", "[", "]",
              "<", ">", "~", "+", "'"]
)
# Character substitution tables.
# `convert_chars_to_rus` maps a Latin lowercase letter to a Cyrillic lowercase letter.
convert = {
    "convert_chars_to_rus": {
        "a": "а",
        "b": "в",
        "c": "с",
        "d": "д",
        "e": "е",
        "h": "н",
        "k": "к",
        "m": "м",
        "o": "о",
        "p": "р",
        "r": "г",
        "y": "у",
        "t": "т",
        "u": "и",
        "x": "х",
    },
}

# Filesystem locations used by this package, all resolved relative to ROOT_DIR.
# Every value is a pathlib.Path; nothing is created or checked for existence here.
folders = {
    # Font sources and dataset locations.
    "fonts_folders": Path(ROOT_DIR, 'data', 'fonts_folders'),
    "images_folder": Path(ROOT_DIR, "data/datasets/test2"),
    "output_train": Path(ROOT_DIR, "data/datasets/images/output"),
    "last_prepared_data": Path(ROOT_DIR, "data/datasets/last_prepared"),
    # Locations under data/pdfdata.
    "extracted_data_folder": Path(ROOT_DIR, "data/pdfdata"),
    "extracted_fonts_folder": Path(ROOT_DIR, "data/pdfdata/extracted_fonts"),
    "extracted_glyphs_folder": Path(ROOT_DIR, "data/pdfdata/glyph_images"),
    # Model weights.
    "default_models_folder": Path(ROOT_DIR, "data/models/default_models"),
    "custom_models_folder": Path(ROOT_DIR, "data/models/custom_models"),
    "datasets_folder": Path(ROOT_DIR, 'data', 'datasets'),
    # NOTE(review): despite the "_folder" suffix this entry points at a .py file, not a directory.
    "ffwraper_folder": Path(ROOT_DIR, 'ffwrapper', 'fontforge_wrapper.py'),
}

# Names of the bundled models: one entry per "*.h5" file found in the default models folder,
# reduced to the file name up to its first dot (e.g. ".../rus_eng.h5" -> "rus_eng").
# The base name is taken with pathlib rather than by splitting on a backslash, which only
# worked with Windows path separators; with "/" separators the directory part was never
# stripped from the entry.
default_models = [
    Path(model_path).name.split('.')[0]
    for model_path in glob.glob(os.path.join(folders.get('default_models_folder'), "*.h5"))
]



def chars_to_code(char_list: list):
    """Return the Unicode code point of every character in *char_list*, in order.

    :param char_list: iterable of single-character strings
    :return: list of integer code points, one per input character
    """
    return list(map(ord, char_list))


class Language(enum.Enum):
    """Character sets selectable by language.

    The value of each member is the corresponding character list from ``char_pool``
    (the very same list object, not a copy).
    """

    # Lowercase-only ("no_reg_diff") sets.
    Russian_and_English_no_reg_diff = char_pool["rus_eng_no_reg_diff"]
    Russian_no_reg_diff = char_pool["rus_no_reg_diff"]
    English_no_reg_diff = char_pool["eng_no_reg_diff"]

    # Sets containing both lowercase and uppercase letters.
    Russian_and_English = char_pool["rus_eng"]
    Russian = char_pool["rus"]
    English = char_pool["eng"]


class DefaultModel(enum.Enum):
    """Bundled recognition models together with their label sets.

    The value of each member is a dict with two keys:

    * ``"model"``  - the object returned by ``load_model`` for the matching ``.h5`` file in
      ``folders["default_models_folder"]``;
    * ``"labels"`` - the character list of the associated ``Language`` member.

    The ``load_model`` calls are part of the class body, so all three models are loaded as soon as
    this module is imported.
    """

    Russian_and_English = {
        "model": load_model(Path(folders["default_models_folder"], "rus_eng.h5")),
        "labels": Language.Russian_and_English.value,
    }
    # NOTE(review): the single-language models are paired with the lowercase-only ("no_reg_diff")
    # label sets, unlike Russian_and_English above - confirm this matches how they were trained.
    Russian = {
        "model": load_model(Path(folders["default_models_folder"], "rus.h5")),
        "labels": Language.Russian_no_reg_diff.value,
    }
    English = {
        "model": load_model(Path(folders["default_models_folder"], "eng.h5")),
        "labels": Language.English_no_reg_diff.value,
    }

    @classmethod
    def from_string(cls, model_name: str) -> "DefaultModel":
        """Return the member selected by a short model name.

        :param model_name: one of ``"rus"``, ``"eng"``, ``"ruseng"`` (matched case-insensitively)
        :return: the corresponding ``DefaultModel`` member
        :raises ValueError: if ``model_name`` is not one of the supported names
        """
        mapping = {
            "ruseng": cls.Russian_and_English,
            "rus": cls.Russian,
            "eng": cls.English,
        }
        try:
            return mapping[model_name.lower()]
        except KeyError:
            # `from None` keeps the internal KeyError out of the traceback shown to the caller.
            raise ValueError("Incorrect model_name (rus, eng, ruseng)") from None
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading
Loading