diff --git a/CHANGELOG.md b/CHANGELOG.md index 2acd6f27..5fd5e3ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +## 0.0.88 +* Return 422 HTTP code when PDF can't be processed + ## 0.0.87 * Patch various CVEs * Enable pytest concurrency diff --git a/prepline_general/api/__version__.py b/prepline_general/api/__version__.py index 52a2b406..59542b61 100644 --- a/prepline_general/api/__version__.py +++ b/prepline_general/api/__version__.py @@ -1 +1 @@ -__version__ = "0.0.87" # pragma: no cover +__version__ = "0.0.88" # pragma: no cover diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index 1aef18b3..61d996d2 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -470,20 +470,38 @@ def _check_free_memory(): def _check_pdf(file: IO[bytes]): - """Check if the PDF file is encrypted, otherwise assume it is not a valid PDF.""" + """ + Check if PDF is: + - Encrypted + - Has corrupted pages + - Has corrupted root object + + Throws: + - HTTPException 442 UNPROCESSABLE ENTITY if file is encrypted or corrupted + """ try: pdf = PdfReader(file) # This will raise if the file is encrypted pdf.metadata + + # This will raise if the file's root object is corrupted + pdf.root_object + + # This will raise if the file's pages are corrupted + list(pdf.pages) + return pdf except FileNotDecryptedError: raise HTTPException( - status_code=400, + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail="File is encrypted. Please decrypt it with password.", ) - except PdfReadError: - raise HTTPException(status_code=422, detail="File does not appear to be a valid PDF") + except PdfReadError as e: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f"File does not appear to be a valid PDF. Error: {e}", + ) def _validate_strategy(strategy: str) -> str: diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml index a7c2dac9..3a1a681d 100644 --- a/preprocessing-pipeline-family.yaml +++ b/preprocessing-pipeline-family.yaml @@ -1,2 +1,2 @@ name: general -version: 0.0.87 +version: 0.0.88 diff --git a/sample-docs/failing-encrypted.pdf b/sample-docs/failing-encrypted.pdf new file mode 100644 index 00000000..f207fbaa Binary files /dev/null and b/sample-docs/failing-encrypted.pdf differ diff --git a/sample-docs/failing-invalid.pdf b/sample-docs/failing-invalid.pdf new file mode 100644 index 00000000..a8ca9eae Binary files /dev/null and b/sample-docs/failing-invalid.pdf differ diff --git a/sample-docs/failing-missing-pages.pdf b/sample-docs/failing-missing-pages.pdf new file mode 100644 index 00000000..b36599d1 Binary files /dev/null and b/sample-docs/failing-missing-pages.pdf differ diff --git a/sample-docs/failing-missing-root.pdf b/sample-docs/failing-missing-root.pdf new file mode 100644 index 00000000..1cdf801c Binary files /dev/null and b/sample-docs/failing-missing-root.pdf differ diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py index bdc8c1aa..bc41708b 100644 --- a/test_general/api/test_app.py +++ b/test_general/api/test_app.py @@ -495,7 +495,7 @@ def test_general_api_returns_422_bad_pdf(): response = client.post( MAIN_API_ROUTE, files=[("files", (str(tmp.name), open(tmp.name, "rb"), "application/pdf"))] ) - assert response.json() == {"detail": "File does not appear to be a valid PDF"} + assert "File does not appear to be a valid PDF" in response.json()["detail"] assert response.status_code == 422 tmp.close() @@ -506,10 +506,56 @@ def test_general_api_returns_422_bad_pdf(): files=[("files", (str(test_file), open(test_file, "rb"), "application/pdf"))], ) - assert response.json() == {"detail": "File does not appear to be a valid PDF"} + assert "File does not appear to be a valid PDF" in response.json()["detail"] assert response.status_code == 422 +@pytest.mark.parametrize( + ("pdf_name", "expected_error_message"), + [ + ( + "failing-invalid.pdf", + "File does not appear to be a valid PDF. Error: Stream has ended unexpectedly", + ), + ( + "failing-missing-root.pdf", + "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf", + ), + ( + "failing-missing-pages.pdf", + "File does not appear to be a valid PDF. Error: Invalid object in /Pages", + ), + ], +) +@pytest.mark.parametrize( + "strategy", + [ + "auto", + "fast", + "hi_res", + "ocr_only", + ], +) +def test_general_api_returns_422_invalid_pdf( + pdf_name: str, expected_error_message: str, strategy: str +): + """ + Verify that we get a 422 with the correct error message for invalid PDF files + """ + client = TestClient(app) + test_file = Path(__file__).parent.parent.parent / "sample-docs" / pdf_name + + with open(test_file, "rb") as f: + response = client.post( + MAIN_API_ROUTE, + files=[("files", (str(test_file), f))], + data={"strategy": strategy}, + ) + + assert response.status_code == 422 + assert expected_error_message == str(response.json()["detail"]) + + def test_general_api_returns_503(monkeypatch): """ When available memory is below the minimum. return a 503, unless our origin ip is 10.{4,5}.x.x @@ -939,13 +985,13 @@ def test_encrypted_pdf(): writer.encrypt(user_password="password123") writer.write(temp_file.name) - # Response should be 400 + # Response should be 422 response = client.post( MAIN_API_ROUTE, files=[("files", (str(temp_file.name), open(temp_file.name, "rb"), "application/pdf"))], ) assert response.json() == {"detail": "File is encrypted. Please decrypt it with password."} - assert response.status_code == 400 + assert response.status_code == 422 # This file is owner encrypted, i.e. readable with edit restrictions writer = PdfWriter() diff --git a/test_general/api/test_general.py b/test_general/api/test_general.py new file mode 100644 index 00000000..834048ab --- /dev/null +++ b/test_general/api/test_general.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +import io +from pathlib import Path + +import pytest +from fastapi import HTTPException +from pypdf import PdfReader + +from prepline_general.api.general import _check_pdf + +TEST_ASSETS_DIR = Path(__file__).parent.parent.parent / "sample-docs" + + +def _open_pdf(pdf_path: str) -> io.BytesIO: + with open(pdf_path, "rb") as f: + pdf_content = f.read() + return io.BytesIO(pdf_content) + + +def test_check_pdf_with_valid_pdf(): + pdf_path = str(TEST_ASSETS_DIR / "list-item-example.pdf") + pdf = _open_pdf(pdf_path) + + result = _check_pdf(pdf) + assert isinstance(result, PdfReader) + + +@pytest.mark.parametrize( + ("pdf_name", "expected_error_message"), + [ + ("failing-encrypted.pdf", "File is encrypted. Please decrypt it with password."), + ( + "failing-invalid.pdf", + "File does not appear to be a valid PDF. Error: Stream has ended unexpectedly", + ), + ( + "failing-missing-root.pdf", + "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf", + ), + ( + "failing-missing-pages.pdf", + "File does not appear to be a valid PDF. Error: Invalid object in /Pages", + ), + ], +) +def test_check_pdf_with_invalid_pdf(pdf_name: str, expected_error_message: str): + pdf_path = str(TEST_ASSETS_DIR / pdf_name) + pdf = _open_pdf(pdf_path) + + with pytest.raises(HTTPException) as exc_info: + _check_pdf(pdf) + + assert exc_info.value.status_code == 422 + assert expected_error_message == str(exc_info.value.detail)