🐛 fix full text ocr extra not using the proper syntax; add integration tests (#260)

sebastianMindee · web-flow · commit 08c3c76fbad2 · 2024-09-11T17:17:12.000+02:00
diff --git a/.github/workflows/test-integration.yml b/.github/workflows/test-integration.yml
@@ -0,0 +1,55 @@
+#
+# Run the pytest suite marked "integration" with a MINDEE_API_KEY secret.
+#
+name: Integration Test
+
+on:
+  pull_request:
+  workflow_run:  # also run once the "Test Code Samples" workflow completes
+    workflows: ["Test Code Samples"]
+    branches:
+      - '**'  # all branches; a single '*' does not match names containing '/'
+    types:
+      - completed
+
+jobs:
+  pytest:
+    name: Run Integration Tests
+    timeout-minutes: 30
+    strategy:
+      max-parallel: 2  # run at most two of the matrix jobs at the same time
+      matrix:
+        os:
+          - "ubuntu-latest"
+          - "windows-latest"
+        python-version:
+          - "3.7"
+          - "3.12"
+    runs-on: ${{ matrix.os }}
+    steps:
+    - uses: actions/checkout@v4
+      with:
+        submodules: recursive
+
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+
+    - name: Cache dependencies
+      uses: actions/cache@v3
+      with:
+        path: ~/.cache/pip
+        key: ${{ runner.os }}-dev-${{ hashFiles('setup.cfg') }}
+        restore-keys: |
+          ${{ runner.os }}-dev-
+
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -e .[dev]
+    - name: Run Integration Testing
+      env:
+        MINDEE_API_KEY: ${{ secrets.MINDEE_API_KEY_SE_TESTS }}
+      run: |
+        pytest -m integration
diff --git a/mindee/mindee_http/endpoint.py b/mindee/mindee_http/endpoint.py
@@ -85,10 +85,9 @@ def _custom_request(
         if include_words:
             data["include_mvision"] = "true"
 
-        if full_text:
-            data["full_text_ocr"] = "true"
-
         params = {}
+        if full_text:
+            params["full_text_ocr"] = "true"
         if cropper:
             params["cropper"] = "true"
 
diff --git a/mindee/parsing/common/document.py b/mindee/parsing/common/document.py
@@ -29,7 +29,7 @@ class Document(Generic[TypePrediction, TypePage]):
     """Result of the base inference"""
     id: str
     """Id of the document as sent back by the server"""
-    extras: Extras
+    extras: Optional[Extras]
     """Potential Extras fields sent back along the prediction"""
     ocr: Optional[Ocr]
     """Potential raw text results read by the OCR (limited feature)"""
@@ -45,7 +45,7 @@ def __init__(
         self.filename = raw_response.get("name", "")
         if "ocr" in raw_response and raw_response["ocr"]:
             self.ocr = Ocr(raw_response["ocr"])
-        if "extras" in raw_response and raw_response["extras"]:
+        if "extras" in raw_response and raw_response["inference"]["extras"]:
             self.extras = Extras(raw_response["extras"])
         self._inject_full_text_ocr(raw_response)
         self.inference = inference_type(raw_response["inference"])
@@ -77,7 +77,7 @@ def _inject_full_text_ocr(self, raw_prediction: StringDict) -> None:
 
         artificial_text_obj = {"content": full_text_content}
 
-        if not hasattr(self, "extras"):
+        if not hasattr(self, "extras") or not self.extras:
             self.extras = Extras({"full_text_ocr": artificial_text_obj})
         else:
             self.extras.add_artificial_extra({"full_text_ocr": artificial_text_obj})
diff --git a/pyproject.toml b/pyproject.toml
@@ -37,12 +37,13 @@ safe_licenses = [
 ]
 
 [tool.pytest.ini_options]
-addopts = "--pyargs --cov mindee --cov-report term:skip-covered --cov-report term-missing -m 'not regression'"
+addopts = "--pyargs --cov mindee --cov-report term:skip-covered --cov-report term-missing -m 'not regression and not integration'"
 python_files = "test*.py"
 junit_family = "xunit2"
 markers = [
   "regression: marks tests as regression tests - select with '-m regression'",
-  "lineitems: debug line items"
+  "lineitems: debug line items",
+  "integration: integration tests that send calls to the API - select with '-m integration'"
 ]
 testpaths = [
   "tests",
diff --git a/tests/extraction/test_invoice_splitter_auto_extraction.py b/tests/extraction/test_invoice_splitter_auto_extraction.py
@@ -26,7 +26,7 @@ def prepare_invoice_return(rst_file_path: Path, invoice_prediction: Document):
     return rst_content
 
 
-@pytest.mark.regression
+@pytest.mark.integration
 def test_pdf_should_extract_invoices_strict():
     client = Client()
     invoice_splitter_input = PathInput(
diff --git a/tests/extras/test_extras_integration.py b/tests/extras/test_extras_integration.py
@@ -0,0 +1,31 @@
+import json
+
+import pytest
+
+from mindee import AsyncPredictResponse, Client
+from mindee.product import InternationalIdV2, InvoiceV4
+from tests.product import PRODUCT_DATA_DIR
+
+
+@pytest.fixture
+def client():
+    """Provide a Mindee ``Client`` built with default arguments."""
+    return Client()
+
+
+@pytest.mark.integration
+def test_send_cropper_extra(client):
+    """Parsing with ``cropper=True`` yields a cropper extra on the first page."""
+    invoice_path = PRODUCT_DATA_DIR / "invoices" / "default_sample.jpg"
+    source = client.source_from_path(invoice_path)
+    response = client.parse(InvoiceV4, source, cropper=True)
+    assert response.document.inference.pages[0].extras.cropper
+
+
+@pytest.mark.integration
+def test_send_full_text_ocr_extra(client):
+    """Parsing with ``full_text=True`` yields a document-level full-text OCR extra."""
+    id_path = PRODUCT_DATA_DIR / "international_id" / "default_sample.jpg"
+    source = client.source_from_path(id_path)
+    response = client.enqueue_and_parse(InternationalIdV2, source, full_text=True)
+    assert response.document.extras.full_text_ocr
diff --git a/tests/extras/test_full_text_ocr.py b/tests/extras/test_full_text_ocr.py
@@ -14,7 +14,6 @@
 # def load_pages():
 #     with open(EXTRAS_DIR / "full_text_ocr/complete.json", "r") as file:
 #         prediction_data = json.load(file)
-#     print("PData", AsyncPredictResponse(InternationalIdV2, prediction_data).document.inference.pages)
 #     return AsyncPredictResponse(InternationalIdV2, prediction_data).document.inference.pages