Skip to content

New presence of None values in document.inference.pages[].extras.full_text_ocr causes TypeError in client #312

Closed
@JakobGM

Description

@JakobGM

Our entire receipt parsing system is currently down, so excuse the terse error report, but I will quickly post this here while I try to work around the issue on our side. I will come back and clean up this issue as soon as I have extinguished some fires on our end.

Here is a (really simplified) example of how we use the Mindee SDK:

from __future__ import annotations

from typing import BinaryIO

from mindee import Client
from mindee.product import InvoiceV4

from some_place import MINDEE_API_KEY


def parse(file: BinaryIO):
    """Send the contents of ``file`` to Mindee and return the InvoiceV4 parse result.

    The stream is read in full to build the input source and is rewound to
    position 0 before the parse request is issued.
    """
    api_client = Client(api_key=MINDEE_API_KEY)
    source = api_client.source_from_bytes(input_bytes=file.read(), filename=file.name)
    # Rewind so the stream can be read again after this function returns.
    file.seek(0)
    # <-- This is the call that fails
    result = api_client.parse(product_class=InvoiceV4, input_source=source)
    return result

Here is a screenshot that shows Mindee's JSON API response:

Image

This response is not gracefully handled by the Python SDK client for mindee:

File ~/crdbrd/hub/src/hub/receipts/mindee.py:108, in parse(file, media_type)
    103     raise exceptions.ReceiptParsingError from exc
    105 file.seek(0)
    106 parser_result = cast(
    107     PredictResponse[InvoiceV4],
--> 108     client.parse(  # pyright: ignore[reportUnknownMemberType]
    109         product_class=InvoiceV4,
    110         input_source=input_source,
    111     ),
    112 )
    113 document = cast(
    114     Document[InvoiceV4Document, Page[InvoiceV4Document]] | None,
    115     parser_result.document,
    116 )
    117 if document is None:
File ~/crdbrd/hub/.venv/lib/python3.13/site-packages/mindee/client.py:126, in Client.parse(self, product_class, input_source, include_words, close_file, page_options, cropper, endpoint, full_text)
    120     if page_options and input_source.is_pdf():
    121         input_source.process_pdf(
    122             page_options.operation,
    123             page_options.on_min_pages,
    124             page_options.page_indexes,
    125         )
--> 126 return self._make_request(
    127     product_class,
    128     input_source,
    129     endpoint,
    130     include_words,
    131     close_file,
    132     cropper,
    133     full_text,
    134 )
File ~/crdbrd/hub/.venv/lib/python3.13/site-packages/mindee/client.py:427, in Client._make_request(self, product_class, input_source, endpoint, include_words, close_file, cropper, full_text)
    421     clean_response = clean_request_json(response)
    422     raise handle_error(
    423         str(product_class.endpoint_name),
    424         clean_response,
    425     )
--> 427 return PredictResponse(product_class, dict_response)
File ~/crdbrd/hub/.venv/lib/python3.13/site-packages/mindee/parsing/common/predict_response.py:28, in PredictResponse.__init__(self, inference_type, raw_response)
     21 """
     22 Container for the raw API response and the parsed document.
     23 
     24 :param inference_type: Type of the inference.
     25 :param raw_response: json response from HTTP call.
     26 """
     27 super().__init__(raw_response)
---> 28 self.document = Document(inference_type, raw_response["document"])
File ~/crdbrd/hub/.venv/lib/python3.13/site-packages/mindee/parsing/common/document.py:50, in Document.__init__(self, inference_type, raw_response)
     48 if "extras" in raw_response and raw_response["inference"]["extras"]:
     49     self.extras = Extras(raw_response["extras"])
---> 50 self._inject_full_text_ocr(raw_response)
     51 self.inference = inference_type(raw_response["inference"])
     52 self.n_pages = raw_response["n_pages"]
File ~/crdbrd/hub/.venv/lib/python3.13/site-packages/mindee/parsing/common/document.py:72, in Document._inject_full_text_ocr(self, raw_prediction)
     65 if (
     66     not pages
     67     or "extras" not in pages[0]
     68     or "full_text_ocr" not in pages[0]["extras"]
     69 ):
     70     return
---> 72 full_text_content = "\n".join(
     73     page["extras"]["full_text_ocr"]["content"]
     74     for page in pages
     75     if "extras" in page and "full_text_ocr" in page["extras"]
     76 )
     78 artificial_text_obj = {"content": full_text_content}
     80 if not hasattr(self, "extras") or not self.extras:
File ~/crdbrd/hub/.venv/lib/python3.13/site-packages/mindee/parsing/common/document.py:73, in <genexpr>(.0)
     65 if (
     66     not pages
     67     or "extras" not in pages[0]
     68     or "full_text_ocr" not in pages[0]["extras"]
     69 ):
     70     return
     72 full_text_content = "\n".join(
---> 73     page["extras"]["full_text_ocr"]["content"]
     74     for page in pages
     75     if "extras" in page and "full_text_ocr" in page["extras"]
     76 )
     78 artificial_text_obj = {"content": full_text_content}
     80 if not hasattr(self, "extras") or not self.extras:
TypeError: 'NoneType' object is not subscriptable

Either these keys should not have been included in the response, or the client should check for None before processing the data.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions