Skip to content

fix: Return 422 for failing pdfs #518

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Jun 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
## 0.0.88
* Return 422 HTTP code when PDF can't be processed

## 0.0.87
* Patch various CVEs
* Enable pytest concurrency
Expand Down
2 changes: 1 addition & 1 deletion prepline_general/api/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.87" # pragma: no cover
__version__ = "0.0.88" # pragma: no cover
26 changes: 22 additions & 4 deletions prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -470,20 +470,38 @@ def _check_free_memory():


def _check_pdf(file: IO[bytes]):
"""Check if the PDF file is encrypted, otherwise assume it is not a valid PDF."""
"""
Check if PDF is:
- Encrypted
- Has corrupted pages
- Has corrupted root object

Throws:
- HTTPException 422 UNPROCESSABLE ENTITY if file is encrypted or corrupted
"""
try:
pdf = PdfReader(file)

# This will raise if the file is encrypted
pdf.metadata

# This will raise if the file's root object is corrupted
pdf.root_object

# This will raise if the file's pages are corrupted
list(pdf.pages)

return pdf
except FileNotDecryptedError:
raise HTTPException(
status_code=400,
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
detail="File is encrypted. Please decrypt it with password.",
)
except PdfReadError:
raise HTTPException(status_code=422, detail="File does not appear to be a valid PDF")
except PdfReadError as e:
raise HTTPException(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
detail=f"File does not appear to be a valid PDF. Error: {e}",
)


def _validate_strategy(strategy: str) -> str:
Expand Down
2 changes: 1 addition & 1 deletion preprocessing-pipeline-family.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
name: general
version: 0.0.87
version: 0.0.88
Binary file added sample-docs/failing-encrypted.pdf
Binary file not shown.
Binary file added sample-docs/failing-invalid.pdf
Binary file not shown.
Binary file added sample-docs/failing-missing-pages.pdf
Binary file not shown.
Binary file added sample-docs/failing-missing-root.pdf
Binary file not shown.
54 changes: 50 additions & 4 deletions test_general/api/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -495,7 +495,7 @@ def test_general_api_returns_422_bad_pdf():
response = client.post(
MAIN_API_ROUTE, files=[("files", (str(tmp.name), open(tmp.name, "rb"), "application/pdf"))]
)
assert response.json() == {"detail": "File does not appear to be a valid PDF"}
assert "File does not appear to be a valid PDF" in response.json()["detail"]
assert response.status_code == 422
tmp.close()

Expand All @@ -506,10 +506,56 @@ def test_general_api_returns_422_bad_pdf():
files=[("files", (str(test_file), open(test_file, "rb"), "application/pdf"))],
)

assert response.json() == {"detail": "File does not appear to be a valid PDF"}
assert "File does not appear to be a valid PDF" in response.json()["detail"]
assert response.status_code == 422


@pytest.mark.parametrize(
    ("pdf_name", "expected_error_message"),
    [
        (
            "failing-invalid.pdf",
            "File does not appear to be a valid PDF. Error: Stream has ended unexpectedly",
        ),
        (
            "failing-missing-root.pdf",
            "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf",
        ),
        (
            "failing-missing-pages.pdf",
            "File does not appear to be a valid PDF. Error: Invalid object in /Pages",
        ),
    ],
)
@pytest.mark.parametrize("strategy", ["auto", "fast", "hi_res", "ocr_only"])
def test_general_api_returns_422_invalid_pdf(
    pdf_name: str, expected_error_message: str, strategy: str
):
    """
    Post each broken sample PDF under every partitioning strategy and check
    that the API answers 422 with the expected ``detail`` message.
    """
    sample_path = Path(__file__).parent.parent.parent / "sample-docs" / pdf_name
    api_client = TestClient(app)

    # Keep the upload handle open only for the duration of the request.
    with open(sample_path, "rb") as upload:
        upload_field = ("files", (str(sample_path), upload))
        response = api_client.post(
            MAIN_API_ROUTE,
            files=[upload_field],
            data={"strategy": strategy},
        )

    assert response.status_code == 422
    detail = response.json()["detail"]
    assert str(detail) == expected_error_message


def test_general_api_returns_503(monkeypatch):
"""
When available memory is below the minimum, return a 503, unless our origin IP is 10.{4,5}.x.x
Expand Down Expand Up @@ -939,13 +985,13 @@ def test_encrypted_pdf():
writer.encrypt(user_password="password123")
writer.write(temp_file.name)

# Response should be 400
# Response should be 422
response = client.post(
MAIN_API_ROUTE,
files=[("files", (str(temp_file.name), open(temp_file.name, "rb"), "application/pdf"))],
)
assert response.json() == {"detail": "File is encrypted. Please decrypt it with password."}
assert response.status_code == 400
assert response.status_code == 422

# This file is owner encrypted, i.e. readable with edit restrictions
writer = PdfWriter()
Expand Down
55 changes: 55 additions & 0 deletions test_general/api/test_general.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from __future__ import annotations

import io
from pathlib import Path

import pytest
from fastapi import HTTPException
from pypdf import PdfReader

from prepline_general.api.general import _check_pdf

# Directory of sample PDF fixtures: "sample-docs", three levels up from this file.
TEST_ASSETS_DIR = Path(__file__).parent.parent.parent / "sample-docs"


def _open_pdf(pdf_path: str) -> io.BytesIO:
    """Load the file at *pdf_path* fully into memory and return it as a ``BytesIO``.

    The returned stream is positioned at offset 0 and is independent of the
    file on disk, so no file handle stays open after this call.
    """
    return io.BytesIO(Path(pdf_path).read_bytes())


def test_check_pdf_with_valid_pdf():
    """``_check_pdf`` returns a ``PdfReader`` for the ``list-item-example.pdf`` sample."""
    sample = TEST_ASSETS_DIR / "list-item-example.pdf"
    stream = _open_pdf(str(sample))

    assert isinstance(_check_pdf(stream), PdfReader)


@pytest.mark.parametrize(
    ("pdf_name", "expected_error_message"),
    [
        ("failing-encrypted.pdf", "File is encrypted. Please decrypt it with password."),
        (
            "failing-invalid.pdf",
            "File does not appear to be a valid PDF. Error: Stream has ended unexpectedly",
        ),
        (
            "failing-missing-root.pdf",
            "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf",
        ),
        (
            "failing-missing-pages.pdf",
            "File does not appear to be a valid PDF. Error: Invalid object in /Pages",
        ),
    ],
)
def test_check_pdf_with_invalid_pdf(pdf_name: str, expected_error_message: str):
    """Each failing sample PDF makes ``_check_pdf`` raise a 422 ``HTTPException``
    whose ``detail`` equals the expected message."""
    stream = _open_pdf(str(TEST_ASSETS_DIR / pdf_name))

    with pytest.raises(HTTPException) as raised:
        _check_pdf(stream)

    # Inspect the captured exception only after the context manager has exited.
    error = raised.value
    assert error.status_code == 422
    assert str(error.detail) == expected_error_message
Loading