Skip to content

✨ add support for invoice splitter auto-extraction #253

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions examples/auto_invoice_splitter_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""
Example: split a multi-invoice PDF and run invoice extraction on each part.

A multi-page PDF is first sent to the Invoice Splitter API, the returned page
groups are cut into separate PDFs, and each of them is parsed with the
Invoice API. Anything else (single-page PDF, non-PDF file) is parsed directly.
"""
import os

from mindee import Client
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
from mindee.input import PathInput
from mindee.product import InvoiceSplitterV1, InvoiceV4

# The API key is read from the environment rather than hard-coded.
api_key = os.getenv("MINDEE_API_KEY")
mindee_client = Client(api_key=api_key)

input_path = "path/to/your/file.ext"
input_source = PathInput(input_path)

if input_source.is_pdf():
    pdf_extractor = PdfExtractor(input_source)
    if pdf_extractor.get_page_count() > 1:
        # close_file=False: presumably keeps the input's file object open so
        # that the extractor can still read it when cutting pages below.
        invoice_splitter_response = mindee_client.enqueue_and_parse(
            InvoiceSplitterV1, input_source, close_file=False
        )
        page_groups = (
            invoice_splitter_response.document.inference.prediction.invoice_page_groups
        )
        extracted_pdfs = pdf_extractor.extract_invoices(page_groups, strict=False)

        for extracted_pdf in extracted_pdfs:
            # Optional: Save the files locally
            # extracted_pdf.write_to_file("output/path")

            invoice_result = mindee_client.parse(
                InvoiceV4, extracted_pdf.as_input_source()
            )
            print(invoice_result.document)
    else:
        # Single-page PDF: nothing to split, parse it as one invoice.
        invoice_result = mindee_client.parse(InvoiceV4, input_source)
        print(invoice_result.document)
else:
    # Not a PDF (e.g. an image): parse it as one invoice.
    invoice_result = mindee_client.parse(InvoiceV4, input_source)
    print(invoice_result.document)
2 changes: 1 addition & 1 deletion examples/multi_receipts_tutorial.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from mindee import Client, PredictResponse, product
from mindee.image_extraction.multi_receipts_extractor.multi_receipts_extractor import (
from mindee.extraction.multi_receipts_extractor.multi_receipts_extractor import (
extract_receipts,
)

Expand Down
8 changes: 8 additions & 0 deletions mindee/extraction/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from mindee.extraction.common.extracted_image import ExtractedImage
from mindee.extraction.common.image_extractor import (
attach_image_as_new_file,
extract_multiple_images_from_source,
)
from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
5 changes: 5 additions & 0 deletions mindee/extraction/common/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
from mindee.extraction.common.extracted_image import ExtractedImage
from mindee.extraction.common.image_extractor import (
attach_image_as_new_file,
extract_multiple_images_from_source,
)
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from PIL import Image

from mindee.error import MindeeError
from mindee.extraction.common import ExtractedImage
from mindee.geometry import Point, get_min_max_x, get_min_max_y
from mindee.image_extraction.common import ExtractedImage
from mindee.input import BytesInput, LocalInputSource


Expand Down
1 change: 1 addition & 0 deletions mindee/extraction/multi_receipts_extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from mindee.extraction.multi_receipts_extractor import multi_receipts_extractor
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
from typing import List

from mindee.error import MindeeError
from mindee.image_extraction.common.extracted_image import ExtractedImage
from mindee.image_extraction.common.image_extractor import (
from mindee.extraction.common.extracted_image import ExtractedImage
from mindee.extraction.common.image_extractor import (
extract_multiple_images_from_source,
)
from mindee.input import LocalInputSource
Expand Down
2 changes: 2 additions & 0 deletions mindee/extraction/pdf_extractor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
49 changes: 49 additions & 0 deletions mindee/extraction/pdf_extractor/extracted_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from pathlib import Path
from typing import BinaryIO

import pypdfium2 as pdfium

from mindee.error import MindeeError
from mindee.input import BytesInput


class ExtractedPdf:
    """
    An extracted sub-Pdf.

    Holds the binary stream of a PDF together with the filename it should be
    known by, and offers helpers to count its pages, save it to disk or turn it
    into an input source.
    """

    # Binary stream containing the PDF data.
    pdf_bytes: BinaryIO
    # Filename handed over to the input source built by `as_input_source`.
    filename: str

    def __init__(self, pdf_bytes: BinaryIO, filename: str):
        """
        :param pdf_bytes: Binary stream containing the PDF data.
        :param filename: Name to associate with the PDF.
        """
        self.pdf_bytes = pdf_bytes
        self.filename = filename

    def get_page_count(self) -> int:
        """
        Get the number of pages in the PDF file.

        :return: The number of pages.
        :raises MindeeError: If the stream can't be opened or read as a PDF.
        """
        try:
            pdf = pdfium.PdfDocument(self.pdf_bytes)
            return len(pdf)
        except Exception as exc:
            raise MindeeError(
                "Could not retrieve page count from Extracted PDF object."
            ) from exc

    def write_to_file(self, output_path: str):
        """
        Writes the contents of the current PDF object to a file.

        :param output_path: Path of the destination file. If no extension is provided, '.pdf' is appended;
            any extension other than '.pdf' is replaced by '.pdf'.
        :raises MindeeError: If the path is a directory, is empty, or its parent directory doesn't exist.
        """
        out_path = Path(output_path)
        if out_path.resolve().is_dir():
            raise MindeeError("Provided path is not a file.")
        if not output_path or not out_path.parent.exists():
            raise MindeeError(f"Invalid save path provided {output_path}.")
        # Path.suffix includes the leading dot, so compare against ".pdf".
        if out_path.suffix.lower() != ".pdf":
            out_path = out_path.parent / (out_path.stem + ".pdf")
        # Rewind first: the stream may be positioned at its end (e.g. right
        # after having been written to), in which case read() returns nothing.
        self.pdf_bytes.seek(0)
        with open(out_path, "wb") as out_file:
            out_file.write(self.pdf_bytes.read())

    def as_input_source(self) -> "BytesInput":
        """
        Returns the current PDF object as a usable BytesInput source.

        The stream is rewound before being read, so the whole PDF is included.
        """
        self.pdf_bytes.seek(0)
        return BytesInput(self.pdf_bytes.read(), self.filename)
116 changes: 116 additions & 0 deletions mindee/extraction/pdf_extractor/pdf_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import io
from pathlib import Path
from typing import BinaryIO, List, Optional, Union

import pypdfium2 as pdfium
from PIL import Image

from mindee.error import MindeeError
from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf
from mindee.input.sources import LocalInputSource
from mindee.product.invoice_splitter import InvoiceSplitterV1PageGroup


class PdfExtractor:
    """
    PDF extraction class.

    Wraps a local input source and cuts it into sub-PDFs, either from raw lists
    of page indexes or from the page groups returned by the Invoice Splitter.
    """

    # Stream of the source PDF: the input's own file object, or an in-memory
    # PDF generated from the input when it isn't a PDF.
    _source_pdf: BinaryIO
    # Filename of the input; used to build the names of extracted sub-PDFs.
    _filename: str

    def __init__(self, local_input: LocalInputSource):
        """
        :param local_input: Local source to extract from. When it isn't a PDF,
            it is opened as an image and converted to an in-memory PDF.
        """
        self._filename = local_input.filename
        if local_input.is_pdf():
            self._source_pdf = local_input.file_object
        else:
            pdf_image = Image.open(local_input.file_object)
            self._source_pdf = io.BytesIO()
            pdf_image.save(self._source_pdf, format="PDF")

    def get_page_count(self) -> int:
        """Get the number of pages in the PDF file."""
        pdf = pdfium.PdfDocument(self._source_pdf)
        return len(pdf)

    def cut_pages(self, page_indexes: List) -> BinaryIO:
        """
        Create a new PDF from pages and save it into a buffer.

        :param page_indexes: List of pages number to use for merging in the original PDF.
        :return: The buffer containing the new PDF, positioned at its start.
        """
        self._source_pdf.seek(0)
        new_pdf = pdfium.PdfDocument.new()
        pdf = pdfium.PdfDocument(self._source_pdf)
        new_pdf.import_pages(pdf, page_indexes)
        bytes_io = io.BytesIO()
        new_pdf.save(bytes_io)
        # Rewind so that a plain read() on the returned buffer yields the PDF
        # instead of an empty byte string.
        bytes_io.seek(0)
        return bytes_io

    def extract_sub_documents(
        self, page_indexes: List[List[int]]
    ) -> List[ExtractedPdf]:
        """
        Extract the sub-documents from the main pdf, based on the given list of page indexes.

        Each sub-document is named ``<stem>_<first>-<last><extension>``, where
        first/last are the 1-based, zero-padded first and last page of the group.

        :param page_indexes: List of list of numbers, representing 0-based page indexes.
        :return: A list of created PDFS.
        :raises MindeeError: If a group is empty or contains an index outside of the document.
        """
        extracted_pdfs: List[ExtractedPdf] = []
        source_name = Path(self._filename)
        extension = source_name.suffix
        stem = source_name.stem
        # Looked up lazily and only once: counting pages re-opens the PDF.
        page_count: Optional[int] = None
        for page_index_elem in page_indexes:
            if not page_index_elem:
                raise MindeeError("Empty indexes aren't allowed for extraction.")
            if page_count is None:
                page_count = self.get_page_count()
            for page_index in page_index_elem:
                # Indexes are 0-based, so page_count itself is already out of range.
                if not 0 <= page_index < page_count:
                    raise MindeeError(f"Index {page_index} is out of range.")
            formatted_min_index = f"{page_index_elem[0] + 1:03d}"
            formatted_max_index = f"{page_index_elem[-1] + 1:03d}"
            field_filename = (
                f"{stem}_{formatted_min_index}-{formatted_max_index}{extension}"
            )
            extracted_pdf = ExtractedPdf(
                self.cut_pages(page_index_elem), field_filename
            )
            extracted_pdfs.append(extracted_pdf)
        return extracted_pdfs

    def extract_invoices(
        self,
        page_indexes: List[Union[InvoiceSplitterV1PageGroup, List[int]]],
        strict: bool = False,
    ) -> List[ExtractedPdf]:
        """
        Extracts invoices as complete PDFs from the document.

        :param page_indexes: List of sub-lists of pages to keep, or list of Invoice Splitter page groups.
        :param strict: Whether to trust confidence scores above 0.5 (included) or not.
            Only applies to Invoice Splitter page groups.
        :return: A list of extracted invoices.
        :raises MindeeError: If no indexes are provided.
        """
        if len(page_indexes) < 1:
            raise MindeeError("No indexes provided.")
        # The type of the first element decides how the whole list is handled.
        if not isinstance(page_indexes[0], InvoiceSplitterV1PageGroup):
            return self.extract_sub_documents(page_indexes)  # type: ignore
        if not strict:
            indexes_as_list = [
                page_index.page_indexes for page_index in page_indexes  # type: ignore
            ]
            return self.extract_sub_documents(indexes_as_list)
        correct_page_indexes: List[List[int]] = []
        current_list: List[int] = []
        previous_confidence: Optional[float] = None
        last_position = len(page_indexes) - 1
        # NOTE(review): the grouping rules below are kept as they were. Two
        # cases look unintended and should be confirmed: a single group with
        # confidence >= 0.5 is never added to the result, and a low-confidence
        # first group that isn't the last one adds an empty list, which
        # extract_sub_documents rejects.
        for i, page_index in enumerate(page_indexes):
            assert isinstance(page_index, InvoiceSplitterV1PageGroup)
            confidence = page_index.confidence
            page_list = page_index.page_indexes

            if confidence >= 0.5 and previous_confidence is None:
                # Copy: current_list may be extended later on, which must not
                # modify the page group received from the caller.
                current_list = list(page_list)
            elif confidence >= 0.5 and i != last_position:
                correct_page_indexes.append(current_list)
                current_list = list(page_list)
            elif confidence < 0.5 and i == last_position:
                current_list.extend(page_list)
                correct_page_indexes.append(current_list)
            else:
                correct_page_indexes.append(current_list)
                correct_page_indexes.append(page_list)
            previous_confidence = confidence
        return self.extract_sub_documents(correct_page_indexes)
5 changes: 0 additions & 5 deletions mindee/image_extraction/common/__init__.py

This file was deleted.

This file was deleted.

File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest
from PIL import Image

from mindee.image_extraction.common import extract_multiple_images_from_source
from mindee.extraction.common import extract_multiple_images_from_source
from mindee.input import PathInput
from mindee.product import BarcodeReaderV1
from tests.test_inputs import PRODUCT_DATA_DIR
Expand Down
55 changes: 55 additions & 0 deletions tests/extraction/test_invoice_splitter_auto_extraction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from pathlib import Path

import pytest

from mindee import Client
from mindee.extraction.pdf_extractor.pdf_extractor import PdfExtractor
from mindee.input import PathInput
from mindee.parsing.common import Document
from mindee.product import InvoiceSplitterV1, InvoiceV4
from tests.product import get_id, get_version
from tests.test_inputs import PRODUCT_DATA_DIR


@pytest.fixture
def invoice_splitter_5p_path():
    """Path of the `invoice_5p.pdf` sample in the invoice splitter data directory."""
    splitter_data_dir = PRODUCT_DATA_DIR / "invoice_splitter"
    return splitter_data_dir / "invoice_5p.pdf"


def prepare_invoice_return(rst_file_path: Path, invoice_prediction: Document):
    """
    Load a reference RST summary and align it with an actual API response.

    The version and ID found in the reference text are replaced by the product
    version and document ID of the given prediction, so that the result can be
    compared directly with the string form of the parsed document.

    :param rst_file_path: Path of the reference RST file.
    :param invoice_prediction: Parsed document providing the version and ID to inject.
    :return: The reference text with version and ID substituted.
    """
    # Explicit encoding: without it the platform default is used, so the same
    # reference file could decode differently from one machine to another.
    with open(rst_file_path, "r", encoding="utf-8") as rst_file:
        rst_content = rst_file.read()
    parsing_version = invoice_prediction.inference.product.version
    parsing_id = invoice_prediction.id
    # The version is substituted first; the ID is then looked up in the
    # already-updated text.
    rst_content = rst_content.replace(get_version(rst_content), parsing_version)
    rst_content = rst_content.replace(get_id(rst_content), parsing_id)
    return rst_content


@pytest.mark.regression
def test_pdf_should_extract_invoices_strict():
    """
    Split the two-page default sample and check the first extracted invoice.

    The sample is sent to the Invoice Splitter, cut according to the returned
    page groups, and the first resulting PDF is parsed with the Invoice API and
    compared to its reference summary.
    """
    mindee_client = Client()
    splitter_source = PathInput(
        PRODUCT_DATA_DIR / "invoice_splitter" / "default_sample.pdf"
    )
    # The file is kept open since the extractor reads the same input afterwards.
    splitter_response = mindee_client.enqueue_and_parse(
        InvoiceSplitterV1, splitter_source, close_file=False
    )
    splitter_inference = splitter_response.document.inference
    extractor = PdfExtractor(splitter_source)
    assert extractor.get_page_count() == 2

    # `strict` isn't passed here, so the extraction runs in its default mode.
    page_groups = splitter_inference.prediction.invoice_page_groups
    extracted_invoices = extractor.extract_invoices(page_groups)

    assert len(extracted_invoices) == 2
    first_invoice_pdf, second_invoice_pdf = extracted_invoices
    assert first_invoice_pdf.filename == "default_sample_001-001.pdf"
    assert second_invoice_pdf.filename == "default_sample_002-002.pdf"

    first_invoice = mindee_client.parse(InvoiceV4, first_invoice_pdf.as_input_source())
    reference_path = (
        PRODUCT_DATA_DIR / "invoices" / "response_v4" / "summary_full_invoice_p1.rst"
    )
    expected_summary = prepare_invoice_return(reference_path, first_invoice.document)
    assert expected_summary == str(first_invoice.document)
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest
from PIL import Image

from mindee.image_extraction.multi_receipts_extractor.multi_receipts_extractor import (
from mindee.extraction.multi_receipts_extractor.multi_receipts_extractor import (
extract_receipts,
)
from mindee.input import PathInput
Expand Down
Loading
Loading