Skip to content

Commit 7eb2775

Browse files
✨ add support for invoice splitter auto-extraction (#253)
1 parent eda5fba commit 7eb2775

19 files changed

+355
-12
lines changed
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
"""
Example: split a multi-invoice PDF with Invoice Splitter, then parse each
extracted invoice with the Invoice API.

Anything that is not a multi-page PDF is sent to the Invoice API as is.
"""
import os

from mindee import Client
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
from mindee.input import PathInput
from mindee.product import InvoiceSplitterV1, InvoiceV4

api_key = os.getenv("MINDEE_API_KEY")
mindee_client = Client(api_key=api_key)

input_path = "path/to/your/file.ext"
input_source = PathInput(input_path)

# An extractor is only needed for PDFs; other inputs are parsed directly.
pdf_extractor = PdfExtractor(input_source) if input_source.is_pdf() else None

if pdf_extractor is not None and pdf_extractor.get_page_count() > 1:
    # The extractor reads the same file object after this call, hence
    # close_file=False (presumably keeps the source open -- confirm in the
    # client documentation).
    invoice_splitter_response = mindee_client.enqueue_and_parse(
        InvoiceSplitterV1, input_source, close_file=False
    )
    page_groups = (
        invoice_splitter_response.document.inference.prediction.invoice_page_groups
    )
    extracted_pdfs = pdf_extractor.extract_invoices(page_groups, strict=False)

    for extracted_pdf in extracted_pdfs:
        # Optional: Save the files locally
        # extracted_pdf.write_to_file("output/path")

        invoice_result = mindee_client.parse(
            InvoiceV4, extracted_pdf.as_input_source()
        )
        print(invoice_result.document)
else:
    # Single-page PDFs and non-PDF inputs hold at most one invoice.
    invoice_result = mindee_client.parse(InvoiceV4, input_source)
    print(invoice_result.document)

examples/multi_receipts_tutorial.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from mindee import Client, PredictResponse, product
2-
from mindee.image_extraction.multi_receipts_extractor.multi_receipts_extractor import (
2+
from mindee.extraction.multi_receipts_extractor.multi_receipts_extractor import (
33
extract_receipts,
44
)
55

mindee/extraction/__init__.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
from mindee.extraction.common.extracted_image import ExtractedImage
2+
from mindee.extraction.common.image_extractor import (
3+
attach_image_as_new_file,
4+
extract_multiple_images_from_source,
5+
)
6+
from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor
7+
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
8+
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor

mindee/extraction/common/__init__.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
from mindee.extraction.common.extracted_image import ExtractedImage
2+
from mindee.extraction.common.image_extractor import (
3+
attach_image_as_new_file,
4+
extract_multiple_images_from_source,
5+
)

mindee/image_extraction/common/image_extractor.py renamed to mindee/extraction/common/image_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,8 @@
55
from PIL import Image
66

77
from mindee.error import MindeeError
8+
from mindee.extraction.common import ExtractedImage
89
from mindee.geometry import Point, get_min_max_x, get_min_max_y
9-
from mindee.image_extraction.common import ExtractedImage
1010
from mindee.input import BytesInput, LocalInputSource
1111

1212

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor

mindee/image_extraction/multi_receipts_extractor/multi_receipts_extractor.py renamed to mindee/extraction/multi_receipts_extractor/multi_receipts_extractor.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from typing import List
22

33
from mindee.error import MindeeError
4-
from mindee.image_extraction.common.extracted_image import ExtractedImage
5-
from mindee.image_extraction.common.image_extractor import (
4+
from mindee.extraction.common.extracted_image import ExtractedImage
5+
from mindee.extraction.common.image_extractor import (
66
extract_multiple_images_from_source,
77
)
88
from mindee.input import LocalInputSource
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
2+
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
from pathlib import Path
2+
from typing import BinaryIO
3+
4+
import pypdfium2 as pdfium
5+
6+
from mindee.error import MindeeError
7+
from mindee.input import BytesInput
8+
9+
10+
class ExtractedPdf:
    """
    An extracted sub-Pdf.

    :param pdf_bytes: Binary buffer holding the PDF data.
    :param filename: Name given to the extracted document.
    """

    pdf_bytes: BinaryIO
    filename: str

    def __init__(self, pdf_bytes: BinaryIO, filename: str):
        self.pdf_bytes = pdf_bytes
        self.filename = filename

    def get_page_count(self) -> int:
        """
        Get the number of pages in the PDF file.

        :return: The number of pages.
        :raises MindeeError: If the buffer cannot be opened or counted.
        """
        try:
            pdf = pdfium.PdfDocument(self.pdf_bytes)
            return len(pdf)
        except Exception as exc:
            raise MindeeError(
                "Could not retrieve page count from Extracted PDF object."
            ) from exc

    def write_to_file(self, output_path: str):
        """
        Writes the contents of the current PDF object to a file.

        :param output_path: Path of the destination file. If no extension is provided, or if the
            extension is not ``.pdf``, the file is written with a ``.pdf`` extension instead.
        :raises MindeeError: If the path points to a directory, is empty, or its parent directory
            does not exist.
        """
        out_path = Path(output_path)
        if out_path.resolve().is_dir():
            raise MindeeError("Provided path is not a file.")
        if not output_path or not out_path.parent.exists():
            raise MindeeError(f"Invalid save path provided {output_path}.")
        # Path.suffix keeps the leading dot, so compare against ".pdf".
        if out_path.suffix.lower() != ".pdf":
            out_path = out_path.parent / f"{out_path.stem}.pdf"
        # Rewind first: the buffer may sit at end-of-stream after being written
        # or read, in which case read() would return nothing.
        self.pdf_bytes.seek(0)
        with open(out_path, "wb") as out_file:
            out_file.write(self.pdf_bytes.read())

    def as_input_source(self) -> "BytesInput":
        """
        Returns the current PDF object as a usable BytesInput source.

        The buffer is rewound before being read, so this can be called repeatedly.
        """
        self.pdf_bytes.seek(0)
        return BytesInput(self.pdf_bytes.read(), self.filename)
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
import io
2+
from pathlib import Path
3+
from typing import BinaryIO, List, Optional, Union
4+
5+
import pypdfium2 as pdfium
6+
from PIL import Image
7+
8+
from mindee.error import MindeeError
9+
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
10+
from mindee.input.sources import LocalInputSource
11+
from mindee.product.invoice_splitter import InvoiceSplitterV1PageGroup
12+
13+
14+
class PdfExtractor:
    """
    PDF extraction class.

    Holds a source document and cuts it into sub-documents by page index.
    """

    _source_pdf: BinaryIO
    _filename: str

    def __init__(self, local_input: LocalInputSource):
        """
        :param local_input: Source to extract from. A non-PDF source is opened
            as an image and converted to an in-memory PDF.
        """
        self._filename = local_input.filename
        if local_input.is_pdf():
            self._source_pdf = local_input.file_object
        else:
            pdf_image = Image.open(local_input.file_object)
            self._source_pdf = io.BytesIO()
            pdf_image.save(self._source_pdf, format="PDF")

    def get_page_count(self) -> int:
        """Get the number of pages in the PDF file."""
        pdf = pdfium.PdfDocument(self._source_pdf)
        return len(pdf)

    def cut_pages(self, page_indexes: List) -> BinaryIO:
        """
        Create a new PDF from pages and save it into a buffer.

        :param page_indexes: List of pages number to use for merging in the original PDF.
        :return: The buffer containing the new PDF. It is not rewound after saving.
        """
        self._source_pdf.seek(0)
        new_pdf = pdfium.PdfDocument.new()
        pdf = pdfium.PdfDocument(self._source_pdf)
        new_pdf.import_pages(pdf, page_indexes)
        bytes_io = io.BytesIO()
        new_pdf.save(bytes_io)
        return bytes_io

    def extract_sub_documents(
        self, page_indexes: List[List[int]]
    ) -> List[ExtractedPdf]:
        """
        Extract the sub-documents from the main pdf, based on the given list of page indexes.

        Each sub-document is named ``<stem>_<first>-<last><extension>``, where the page
        numbers are one-based and zero-padded to three digits.

        :param page_indexes: List of list of numbers, representing zero-based page indexes.
        :return: A list of created PDFS.
        :raises MindeeError: If a group is empty or holds an index outside the document.
        """
        extracted_pdfs: List[ExtractedPdf] = []
        source_name = Path(self._filename)
        extension = source_name.suffix
        stem = source_name.stem
        # Counting pages opens the document, so do it once rather than per index.
        page_count = self.get_page_count()
        for page_index_elem in page_indexes:
            if not page_index_elem:
                raise MindeeError("Empty indexes aren't allowed for extraction.")
            for page_index in page_index_elem:
                # Indexes are zero-based: the last valid one is page_count - 1.
                if page_index < 0 or page_index >= page_count:
                    raise MindeeError(f"Index {page_index} is out of range.")
            first_page = page_index_elem[0] + 1
            last_page = page_index_elem[-1] + 1
            field_filename = f"{stem}_{first_page:03d}-{last_page:03d}{extension}"
            extracted_pdfs.append(
                ExtractedPdf(self.cut_pages(page_index_elem), field_filename)
            )
        return extracted_pdfs

    def extract_invoices(
        self,
        page_indexes: List[Union[InvoiceSplitterV1PageGroup, List[int]]],
        strict: bool = False,
    ) -> List[ExtractedPdf]:
        """
        Extracts invoices as complete PDFs from the document.

        Plain lists of indexes are extracted as given. Page groups are extracted one
        document per group unless ``strict`` is set, in which case groups are regrouped
        according to their confidence score first.

        :param page_indexes: List of sub-lists of pages to keep.
        :param strict: Whether to trust confidence scores above 0.5 (included) or not.
        :return: A list of extracted invoices.
        :raises MindeeError: If no indexes are given, or if page groups are mixed with
            plain lists in strict mode.
        """
        if len(page_indexes) < 1:
            raise MindeeError("No indexes provided.")
        if not isinstance(page_indexes[0], InvoiceSplitterV1PageGroup):
            return self.extract_sub_documents(page_indexes)  # type: ignore
        if not strict:
            indexes_as_list = [
                page_index.page_indexes for page_index in page_indexes  # type: ignore
            ]
            return self.extract_sub_documents(indexes_as_list)

        # NOTE(review): the regrouping below is kept exactly as written. Tracing it
        # shows two likely-unintended outcomes to confirm against the intended rule:
        # a single confident group is stored but never emitted, and a low-confidence
        # group that is not last causes the pending group to be emitted again later.
        correct_page_indexes: List[List[int]] = []
        current_list: List[int] = []
        previous_confidence: Optional[float] = None
        last_position = len(page_indexes) - 1
        for i, page_index in enumerate(page_indexes):
            # Explicit check instead of assert, which is stripped under -O.
            if not isinstance(page_index, InvoiceSplitterV1PageGroup):
                raise MindeeError("Page groups can't be mixed with lists of indexes.")
            confidence = page_index.confidence
            # Work on a copy so extending a group never alters the caller's data.
            page_list = list(page_index.page_indexes)

            if confidence >= 0.5 and previous_confidence is None:
                current_list = page_list
            elif confidence >= 0.5 and i != last_position:
                correct_page_indexes.append(current_list)
                current_list = page_list
            elif confidence < 0.5 and i == last_position:
                current_list.extend(page_list)
                correct_page_indexes.append(current_list)
            else:
                correct_page_indexes.append(current_list)
                correct_page_indexes.append(page_list)
            previous_confidence = confidence
        return self.extract_sub_documents(correct_page_indexes)

mindee/image_extraction/common/__init__.py

Lines changed: 0 additions & 5 deletions
This file was deleted.

mindee/image_extraction/multi_receipts_extractor/__init__.py

Lines changed: 0 additions & 1 deletion
This file was deleted.
File renamed without changes.

tests/image_extraction/test_image_extractor.py renamed to tests/extraction/test_image_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import pytest
44
from PIL import Image
55

6-
from mindee.image_extraction.common import extract_multiple_images_from_source
6+
from mindee.extraction.common import extract_multiple_images_from_source
77
from mindee.input import PathInput
88
from mindee.product import BarcodeReaderV1
99
from tests.test_inputs import PRODUCT_DATA_DIR
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
from pathlib import Path
2+
3+
import pytest
4+
5+
from mindee import Client
6+
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
7+
from mindee.input import PathInput
8+
from mindee.parsing.common import Document
9+
from mindee.product import InvoiceSplitterV1, InvoiceV4
10+
from tests.product import get_id, get_version
11+
from tests.test_inputs import PRODUCT_DATA_DIR
12+
13+
14+
@pytest.fixture
def invoice_splitter_5p_path():
    """Path to the ``invoice_5p.pdf`` sample in the invoice splitter data directory."""
    sample_dir = PRODUCT_DATA_DIR / "invoice_splitter"
    return sample_dir / "invoice_5p.pdf"
17+
18+
19+
def prepare_invoice_return(rst_file_path: Path, invoice_prediction: Document) -> str:
    """
    Load a reference summary and align its version and ID with a prediction.

    The version and ID found in the reference text are replaced by those of the
    prediction, so the result can be compared to ``str(invoice_prediction)``.

    :param rst_file_path: Path of the reference summary file.
    :param invoice_prediction: Document whose product version and ID are injected.
    :return: The reference text with version and ID substituted.
    """
    # Pin the encoding so decoding does not depend on the platform locale
    # (assumes the reference files are UTF-8 -- TODO confirm).
    with open(rst_file_path, "r", encoding="utf-8") as rst_file:
        rst_content = rst_file.read()
    parsing_version = invoice_prediction.inference.product.version
    parsing_id = invoice_prediction.id
    rst_content = rst_content.replace(get_version(rst_content), parsing_version)
    rst_content = rst_content.replace(get_id(rst_content), parsing_id)
    return rst_content
27+
28+
29+
@pytest.mark.regression
def test_pdf_should_extract_invoices_strict():
    """
    Split the two-page sample, then check the first extracted invoice parses
    to the expected summary.

    NOTE(review): despite the test name, ``strict`` is left at its default
    value in the ``extract_invoices`` call below.
    """
    client = Client()
    sample_path = PRODUCT_DATA_DIR / "invoice_splitter" / "default_sample.pdf"
    invoice_splitter_input = PathInput(sample_path)
    # The extractor reads the same input afterwards, hence close_file=False.
    response = client.enqueue_and_parse(
        InvoiceSplitterV1, invoice_splitter_input, close_file=False
    )
    inference = response.document.inference
    pdf_extractor = PdfExtractor(invoice_splitter_input)
    assert pdf_extractor.get_page_count() == 2

    page_groups = inference.prediction.invoice_page_groups
    extracted_pdfs = pdf_extractor.extract_invoices(page_groups)

    assert len(extracted_pdfs) == 2
    first_pdf, second_pdf = extracted_pdfs
    assert first_pdf.filename == "default_sample_001-001.pdf"
    assert second_pdf.filename == "default_sample_002-002.pdf"

    invoice_0 = client.parse(InvoiceV4, first_pdf.as_input_source())
    expected_summary_path = (
        PRODUCT_DATA_DIR / "invoices" / "response_v4" / "summary_full_invoice_p1.rst"
    )
    test_string_rst_invoice_0 = prepare_invoice_return(
        expected_summary_path, invoice_0.document
    )
    assert test_string_rst_invoice_0 == str(invoice_0.document)

tests/image_extraction/test_multi_receipts_extractor.py renamed to tests/extraction/test_multi_receipts_extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import pytest
44
from PIL import Image
55

6-
from mindee.image_extraction.multi_receipts_extractor.multi_receipts_extractor import (
6+
from mindee.extraction.multi_receipts_extractor.multi_receipts_extractor import (
77
extract_receipts,
88
)
99
from mindee.input import PathInput

0 commit comments

Comments
 (0)