From 41793b9a2df74a05d9c6e4e52411ea69d368c0b9 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Fri, 7 Mar 2025 14:07:27 +0100 Subject: [PATCH 1/2] :recycle: update display & structure for invoice splitter v1 --- .../invoice_splitter_v1_async.txt | 4 +- docs/extras/guide/invoice_splitter_v1.md | 78 +++++++++++-------- docs/product/invoice_splitter_v1.rst | 4 + mindee/product/invoice_splitter/__init__.py | 3 + .../invoice_splitter/invoice_splitter_v1.py | 12 ++- .../invoice_splitter_v1_document.py | 61 ++++++++++----- .../invoice_splitter_v1_invoice_page_group.py | 56 +++++++++++++ .../test_invoice_splitter_v1.py | 47 ++++++----- 8 files changed, 179 insertions(+), 86 deletions(-) create mode 100644 mindee/product/invoice_splitter/invoice_splitter_v1_invoice_page_group.py diff --git a/docs/extras/code_samples/invoice_splitter_v1_async.txt b/docs/extras/code_samples/invoice_splitter_v1_async.txt index e598f3a3..20a96b6d 100644 --- a/docs/extras/code_samples/invoice_splitter_v1_async.txt +++ b/docs/extras/code_samples/invoice_splitter_v1_async.txt @@ -1,6 +1,4 @@ -from mindee import Client, product -from time import sleep -from mindee.parsing.common import AsyncPredictResponse +from mindee import Client, product, AsyncPredictResponse # Init a new client mindee_client = Client(api_key="my-api-key") diff --git a/docs/extras/guide/invoice_splitter_v1.md b/docs/extras/guide/invoice_splitter_v1.md index 59ea0872..5f462f53 100644 --- a/docs/extras/guide/invoice_splitter_v1.md +++ b/docs/extras/guide/invoice_splitter_v1.md @@ -1,21 +1,17 @@ --- -title: Invoice Splitter API Python +title: Invoice Splitter OCR Python category: 622b805aaec68102ea7fcbc2 -slug: python-invoice-splitter-api +slug: python-invoice-splitter-ocr parentDoc: 609808f773b0b90051d839de --- The Python OCR SDK supports the [Invoice Splitter API](https://platform.mindee.com/mindee/invoice_splitter). 
-Using [this sample](https://github.com/mindee/client-lib-test-data/blob/main/products/invoice_splitter/default_sample.pdf), we are going to illustrate how to detect the pages of multiple invoices within the same document. +Using the [sample below](https://github.com/mindee/client-lib-test-data/blob/main/products/invoice_splitter/default_sample.pdf), we are going to illustrate how to extract the data that we want using the OCR SDK. +![Invoice Splitter sample](https://github.com/mindee/client-lib-test-data/blob/main/products/invoice_splitter/default_sample.pdf?raw=true) # Quick-Start - -> **⚠️ Important:** This API only works **asynchronously**, which means that documents have to be sent and retrieved in a specific way: - ```py -from mindee import Client, product -from time import sleep -from mindee.parsing.common import AsyncPredictResponse +from mindee import Client, product, AsyncPredictResponse # Init a new client mindee_client = Client(api_key="my-api-key") @@ -31,66 +27,80 @@ result: AsyncPredictResponse = mindee_client.enqueue_and_parse( # Print a brief summary of the parsed data print(result.document) + ``` **Output (RST):** - ```rst ######## Document ######## -:Mindee ID: 8c25cc63-212b-4537-9c9b-3fbd3bd0ee20 -:Filename: default_sample.jpg +:Mindee ID: 15ad7a19-7b75-43d0-b0c6-9a641a12b49b +:Filename: default_sample.pdf Inference ######### -:Product: mindee/carte_vitale v1.0 -:Rotation applied: Yes +:Product: mindee/invoice_splitter v1.1 +:Rotation applied: No Prediction ========== -:Given Name(s): NATHALIE -:Surname: DURAND -:Social Security Number: 269054958815780 -:Issuance Date: 2007-01-01 +:Invoice Page Groups: + :Page indexes: 0 + :Page indexes: 1 Page Predictions ================ Page 0 ------ -:Given Name(s): NATHALIE -:Surname: DURAND -:Social Security Number: 269054958815780 -:Issuance Date: 2007-01-01 +:Invoice Page Groups: + +Page 1 +------ +:Invoice Page Groups: ``` # Field Types +## Standard Fields +These fields are generic and used in several 
products. -## Specific Fields +### BaseField +Each prediction object contains a set of fields that inherit from the generic `BaseField` class. +A typical `BaseField` object will have the following attributes: -### Page Group +* **value** (`Union[float, str]`): corresponds to the field value. Can be `None` if no value was extracted. +* **confidence** (`float`): the confidence score of the field prediction. +* **bounding_box** (`[Point, Point, Point, Point]`): contains exactly 4 relative vertices (points) coordinates of a right rectangle containing the field in the document. +* **polygon** (`List[Point]`): contains the relative vertices coordinates (`Point`) of a polygon containing the field in the image. +* **page_id** (`int`): the ID of the page, always `None` when at document-level. +* **reconstructed** (`bool`): indicates whether an object was reconstructed (not extracted as the API gave it). -List of page group indexes. +> **Note:** A `Point` simply refers to a List of two numbers (`[float, float]`). -An `InvoiceSplitterV1PageGroup` implements the following attributes: -- **page_indexes** (`float`\[]): List of indexes of the pages of a single invoice. -- **confidence** (`float`): The confidence of the prediction. +Aside from the previous attributes, all basic fields have access to a custom `__str__` method that can be used to print their value as a string. -# Attributes +## Specific Fields +Fields which are specific to this product; they are not used in any other product. + +### Invoice Page Groups Field +List of page groups. Each group represents a single invoice within a multi-invoice document. + +An `InvoiceSplitterV1InvoicePageGroup` implements the following attributes: + +* **page_indexes** (`List[int]`): List of page indexes that belong to the same invoice (group). 
+# Attributes The following fields are extracted for Invoice Splitter V1: ## Invoice Page Groups - -**invoice_page_groups** ([InvoiceSplitterV1PageGroup](#invoice-splitter-v1-page-group)\[]): List of page indexes that belong to the same invoice in the PDF. +**invoice_page_groups** (List[[InvoiceSplitterV1InvoicePageGroup](#invoice-page-groups-field)]): List of page groups. Each group represents a single invoice within a multi-invoice document. ```py -for invoice_page_groups_elem in page.prediction.invoice_page_groups): - print(invoice_page_groups_elem) +for invoice_page_groups_elem in result.document.inference.prediction.invoice_page_groups: + print(invoice_page_groups_elem.page_indexes) ``` # Questions? - [Join our Slack](https://join.slack.com/t/mindee-community/shared_invite/zt-2d0ds7dtz-DPAF81ZqTy20chsYpQBW5g) diff --git a/docs/product/invoice_splitter_v1.rst b/docs/product/invoice_splitter_v1.rst index 3d35a647..3c26f8d7 100644 --- a/docs/product/invoice_splitter_v1.rst +++ b/docs/product/invoice_splitter_v1.rst @@ -13,3 +13,7 @@ Invoice Splitter V1 .. autoclass:: mindee.product.invoice_splitter.invoice_splitter_v1_document.InvoiceSplitterV1Document :members: :inherited-members: + +.. 
autoclass:: mindee.product.invoice_splitter.invoice_splitter_v1_invoice_page_group.InvoiceSplitterV1InvoicePageGroup + :members: + :inherited-members: diff --git a/mindee/product/invoice_splitter/__init__.py b/mindee/product/invoice_splitter/__init__.py index 0ffda80c..a2232ab2 100644 --- a/mindee/product/invoice_splitter/__init__.py +++ b/mindee/product/invoice_splitter/__init__.py @@ -2,6 +2,9 @@ from mindee.product.invoice_splitter.invoice_splitter_v1_document import ( InvoiceSplitterV1Document, ) +from mindee.product.invoice_splitter.invoice_splitter_v1_invoice_page_group import ( + InvoiceSplitterV1InvoicePageGroup, +) from mindee.product.invoice_splitter.invoice_splitter_v1_page_group import ( InvoiceSplitterV1PageGroup, ) diff --git a/mindee/product/invoice_splitter/invoice_splitter_v1.py b/mindee/product/invoice_splitter/invoice_splitter_v1.py index 478abd09..8140bc51 100644 --- a/mindee/product/invoice_splitter/invoice_splitter_v1.py +++ b/mindee/product/invoice_splitter/invoice_splitter_v1.py @@ -9,7 +9,7 @@ class InvoiceSplitterV1(Inference): - """Inference prediction for Invoice Splitter, API version 1.""" + """Invoice Splitter API version 1 inference prediction.""" prediction: InvoiceSplitterV1Document """Document-level prediction.""" @@ -20,14 +20,20 @@ class InvoiceSplitterV1(Inference): endpoint_version = "1" """Version of the endpoint.""" - def __init__(self, raw_prediction: StringDict) -> None: + def __init__(self, raw_prediction: StringDict): """ Invoice Splitter v1 inference. :param raw_prediction: Raw prediction from the HTTP response. 
""" super().__init__(raw_prediction) + self.prediction = InvoiceSplitterV1Document(raw_prediction["prediction"]) self.pages = [] for page in raw_prediction["pages"]: - self.pages.append(Page(InvoiceSplitterV1Document, page)) + try: + page_prediction = page["prediction"] + except KeyError: + continue + if page_prediction: + self.pages.append(Page(InvoiceSplitterV1Document, page)) diff --git a/mindee/product/invoice_splitter/invoice_splitter_v1_document.py b/mindee/product/invoice_splitter/invoice_splitter_v1_document.py index 502c725d..a3bbdc3e 100644 --- a/mindee/product/invoice_splitter/invoice_splitter_v1_document.py +++ b/mindee/product/invoice_splitter/invoice_splitter_v1_document.py @@ -1,38 +1,57 @@ -from typing import List +from typing import List, Optional from mindee.parsing.common.prediction import Prediction from mindee.parsing.common.string_dict import StringDict from mindee.parsing.common.summary_helper import clean_out_string -from mindee.product.invoice_splitter.invoice_splitter_v1_page_group import ( - InvoiceSplitterV1PageGroup, +from mindee.product.invoice_splitter.invoice_splitter_v1_invoice_page_group import ( + InvoiceSplitterV1InvoicePageGroup, ) class InvoiceSplitterV1Document(Prediction): - """Document data for Invoice Splitter, API version 1.""" + """Invoice Splitter API version 1.2 document data.""" - invoice_page_groups: List[InvoiceSplitterV1PageGroup] - """Page groups linked to an invoice.""" + invoice_page_groups: List[InvoiceSplitterV1InvoicePageGroup] + """List of page groups. Each group represents a single invoice within a multi-invoice document.""" - def __init__(self, raw_prediction: StringDict) -> None: + def __init__( + self, + raw_prediction: StringDict, + page_id: Optional[int] = None, + ): """ Invoice Splitter document. 
:param raw_prediction: Raw prediction from HTTP response + :param page_id: Page number for multi pages pdf input """ - super().__init__(raw_prediction) - - invoice_page_groups = [] - if ( - "invoice_page_groups" in raw_prediction - and len(raw_prediction["invoice_page_groups"]) > 0 - ): - for prediction in raw_prediction["invoice_page_groups"]: - invoice_page_groups.append(InvoiceSplitterV1PageGroup(prediction)) - self.invoice_page_groups = invoice_page_groups + super().__init__(raw_prediction, page_id) + self.invoice_page_groups = [ + InvoiceSplitterV1InvoicePageGroup(prediction, page_id=page_id) + for prediction in raw_prediction["invoice_page_groups"] + ] + + @staticmethod + def _invoice_page_groups_separator(char: str) -> str: + out_str = " " + out_str += f"+{char * 74}" + return out_str + "+" + + def _invoice_page_groups_to_str(self) -> str: + if not self.invoice_page_groups: + return "" + + lines = f"\n{self._invoice_page_groups_separator('-')}\n ".join( + [item.to_table_line() for item in self.invoice_page_groups] + ) + out_str = "" + out_str += f"\n{self._invoice_page_groups_separator('-')}\n " + out_str += " | Page Indexes " + out_str += f" |\n{self._invoice_page_groups_separator('=')}" + out_str += f"\n {lines}" + out_str += f"\n{self._invoice_page_groups_separator('-')}" + return out_str def __str__(self) -> str: - page_group_str = ":Invoice Page Groups:" - for page_group in self.invoice_page_groups: - page_group_str += f"\n {str(page_group)}" - return clean_out_string(page_group_str) + out_str: str = f":Invoice Page Groups: {self._invoice_page_groups_to_str()}\n" + return clean_out_string(out_str) diff --git a/mindee/product/invoice_splitter/invoice_splitter_v1_invoice_page_group.py b/mindee/product/invoice_splitter/invoice_splitter_v1_invoice_page_group.py new file mode 100644 index 00000000..8bb8c2c7 --- /dev/null +++ b/mindee/product/invoice_splitter/invoice_splitter_v1_invoice_page_group.py @@ -0,0 +1,56 @@ +from typing import Dict, List, Optional + 
+from mindee.parsing.common.string_dict import StringDict +from mindee.parsing.common.summary_helper import clean_out_string +from mindee.parsing.standard.base import FieldConfidenceMixin, FieldPositionMixin + + +class InvoiceSplitterV1InvoicePageGroup(FieldPositionMixin, FieldConfidenceMixin): + """List of page groups. Each group represents a single invoice within a multi-invoice document.""" + + page_indexes: List[int] + """List of page indexes that belong to the same invoice (group).""" + page_n: int + """The document page on which the information was found.""" + + def __init__( + self, + raw_prediction: StringDict, + page_id: Optional[int] = None, + ): + self._set_confidence(raw_prediction) + self._set_position(raw_prediction) + + if page_id is None: + try: + self.page_n = raw_prediction["page_id"] + except KeyError: + pass + else: + self.page_n = page_id + + self.page_indexes = raw_prediction["page_indexes"] + + def _printable_values(self) -> Dict[str, str]: + """Return values for printing.""" + out_dict: Dict[str, str] = {} + out_dict["page_indexes"] = ", ".join([str(elem) for elem in self.page_indexes]) + return out_dict + + def _table_printable_values(self) -> Dict[str, str]: + """Return values for printing inside an RST table.""" + out_dict: Dict[str, str] = {} + out_dict["page_indexes"] = ", ".join([str(elem) for elem in self.page_indexes]) + return out_dict + + def to_table_line(self) -> str: + """Output in a format suitable for inclusion in an rST table.""" + printable = self._table_printable_values() + out_str: str = f"| {printable['page_indexes']:<72} | " + return clean_out_string(out_str) + + def __str__(self) -> str: + """Default string representation.""" + printable = self._printable_values() + out_str: str = f"Page Indexes: {printable['page_indexes']}, \n" + return clean_out_string(out_str) diff --git a/tests/product/invoice_splitter/test_invoice_splitter_v1.py b/tests/product/invoice_splitter/test_invoice_splitter_v1.py index f229afee..78c63607 
100644 --- a/tests/product/invoice_splitter/test_invoice_splitter_v1.py +++ b/tests/product/invoice_splitter/test_invoice_splitter_v1.py @@ -10,40 +10,37 @@ ) from tests.product import PRODUCT_DATA_DIR +RESPONSE_DIR = PRODUCT_DATA_DIR / "invoice_splitter" / "response_v1" + +InvoiceSplitterV1DocumentType = Document[ + InvoiceSplitterV1Document, + Page[InvoiceSplitterV1Document], +] + @pytest.fixture -def complete_doc() -> ( - Document[InvoiceSplitterV1Document, Page[InvoiceSplitterV1Document]] -): - json_data = json.load( - open(PRODUCT_DATA_DIR / "invoice_splitter" / "response_v1" / "complete.json") - ) +def complete_doc() -> InvoiceSplitterV1DocumentType: + file_path = RESPONSE_DIR / "complete.json" + with open(file_path, "r", encoding="utf-8") as open_file: + json_data = json.load(open_file) return Document(InvoiceSplitterV1, json_data["document"]) @pytest.fixture -def empty_doc() -> Document[InvoiceSplitterV1Document, Page[InvoiceSplitterV1Document]]: - json_data = json.load( - open(PRODUCT_DATA_DIR / "invoice_splitter" / "response_v1" / "empty.json") - ) +def empty_doc() -> InvoiceSplitterV1DocumentType: + file_path = RESPONSE_DIR / "empty.json" + with open(file_path, "r", encoding="utf-8") as open_file: + json_data = json.load(open_file) return Document(InvoiceSplitterV1, json_data["document"]) -def test_complete_doc( - complete_doc: Document[InvoiceSplitterV1Document, Page[InvoiceSplitterV1Document]], -): - reference_str = open( - PRODUCT_DATA_DIR / "invoice_splitter" / "response_v1" / "summary_full.rst", - "r", - encoding="utf-8", - ).read() - assert len(complete_doc.inference.prediction.invoice_page_groups) == 3 - assert complete_doc.inference.prediction.invoice_page_groups[0].confidence == 1 - assert complete_doc.inference.prediction.invoice_page_groups[2].confidence == 0 +def test_complete_doc(complete_doc: InvoiceSplitterV1DocumentType): + file_path = RESPONSE_DIR / "summary_full.rst" + with open(file_path, "r", encoding="utf-8") as open_file: + 
reference_str = open_file.read() assert str(complete_doc) == reference_str -def test_empty_doc( - empty_doc: Document[InvoiceSplitterV1Document, Page[InvoiceSplitterV1Document]] -): - assert len(empty_doc.inference.prediction.invoice_page_groups) == 0 +def test_empty_doc(empty_doc: InvoiceSplitterV1DocumentType): + prediction = empty_doc.inference.prediction + assert len(prediction.invoice_page_groups) == 0 From 0506ae02be96540a490b3c5c0e93d5bd58976622 Mon Sep 17 00:00:00 2001 From: sebastianMindee Date: Fri, 7 Mar 2025 18:04:33 +0100 Subject: [PATCH 2/2] fix tests --- .../extraction/pdf_extractor/pdf_extractor.py | 10 ++++----- mindee/product/invoice_splitter/__init__.py | 3 --- .../invoice_splitter_v1_page_group.py | 22 ------------------- tests/data | 2 +- 4 files changed, 5 insertions(+), 32 deletions(-) delete mode 100644 mindee/product/invoice_splitter/invoice_splitter_v1_page_group.py diff --git a/mindee/extraction/pdf_extractor/pdf_extractor.py b/mindee/extraction/pdf_extractor/pdf_extractor.py index 5d5f2e19..7f9baf7f 100644 --- a/mindee/extraction/pdf_extractor/pdf_extractor.py +++ b/mindee/extraction/pdf_extractor/pdf_extractor.py @@ -8,9 +8,7 @@ from mindee.error.mindee_error import MindeeError from mindee.extraction.pdf_extractor.extracted_pdf import ExtractedPdf from mindee.input.sources.local_input_source import LocalInputSource -from mindee.product.invoice_splitter.invoice_splitter_v1_page_group import ( - InvoiceSplitterV1PageGroup, -) +from mindee.product.invoice_splitter import InvoiceSplitterV1InvoicePageGroup class PdfExtractor: @@ -76,7 +74,7 @@ def extract_sub_documents( def extract_invoices( self, - page_indexes: List[Union[InvoiceSplitterV1PageGroup, List[int]]], + page_indexes: List[Union[InvoiceSplitterV1InvoicePageGroup, List[int]]], strict: bool = False, ) -> List[ExtractedPdf]: """ @@ -88,7 +86,7 @@ def extract_invoices( """ if len(page_indexes) < 1: raise MindeeError("No indexes provided.") - if not isinstance(page_indexes[0], 
InvoiceSplitterV1PageGroup): + if not isinstance(page_indexes[0], InvoiceSplitterV1InvoicePageGroup): return self.extract_sub_documents(page_indexes) # type: ignore if not strict: indexes_as_list = [page_index.page_indexes for page_index in page_indexes] # type: ignore @@ -97,7 +95,7 @@ def extract_invoices( current_list: List[int] = [] previous_confidence: Optional[float] = None for i, page_index in enumerate(page_indexes): - assert isinstance(page_index, InvoiceSplitterV1PageGroup) + assert isinstance(page_index, InvoiceSplitterV1InvoicePageGroup) confidence = page_index.confidence page_list = page_index.page_indexes diff --git a/mindee/product/invoice_splitter/__init__.py b/mindee/product/invoice_splitter/__init__.py index a2232ab2..5fe68245 100644 --- a/mindee/product/invoice_splitter/__init__.py +++ b/mindee/product/invoice_splitter/__init__.py @@ -5,6 +5,3 @@ from mindee.product.invoice_splitter.invoice_splitter_v1_invoice_page_group import ( InvoiceSplitterV1InvoicePageGroup, ) -from mindee.product.invoice_splitter.invoice_splitter_v1_page_group import ( - InvoiceSplitterV1PageGroup, -) diff --git a/mindee/product/invoice_splitter/invoice_splitter_v1_page_group.py b/mindee/product/invoice_splitter/invoice_splitter_v1_page_group.py deleted file mode 100644 index 003aec57..00000000 --- a/mindee/product/invoice_splitter/invoice_splitter_v1_page_group.py +++ /dev/null @@ -1,22 +0,0 @@ -from typing import List - -from mindee.parsing.common.string_dict import StringDict - - -class InvoiceSplitterV1PageGroup: - """Pages indexes in a group for Invoice Splitter V1.""" - - page_indexes: List[int] - """Index of each page""" - confidence: float - """Confidence score""" - - def __init__(self, raw_prediction: StringDict) -> None: - self.page_indexes = raw_prediction["page_indexes"] - try: - self.confidence = float(raw_prediction["confidence"]) - except (KeyError, TypeError): - self.confidence = 0.0 - - def __str__(self) -> str: - return f":Page indexes: {', 
'.join([str(page_index) for page_index in self.page_indexes])}" diff --git a/tests/data b/tests/data index 64a81052..415f6bf4 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 64a810523d8b06ffb1682934f53f7f2eca2429d7 +Subproject commit 415f6bf4a13f38af2776cbe2222fdfab92f41ee5