From 5b6d1c47918bf1444d5ff9770961b5cc11d28de6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Po=C5=82om?= Date: Tue, 17 Jun 2025 11:07:45 +0200 Subject: [PATCH 1/7] fix: return 422 http code for pdfs that can't be processed --- prepline_general/api/general.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index 17619702..50073073 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -51,7 +51,9 @@ def is_compatible_response_type(media_type: str, response_type: type) -> bool: return ( False if media_type == "application/json" and response_type not in [dict, list] - else False if media_type == "text/csv" and response_type != str else True + else False + if media_type == "text/csv" and response_type != str + else True ) @@ -469,20 +471,38 @@ def _check_free_memory(): def _check_pdf(file: IO[bytes]): - """Check if the PDF file is encrypted, otherwise assume it is not a valid PDF.""" + """ + Check if PDF is: + - Encrypted + - Has corrupted pages + - Has corrupted root object + + Throws: + - HTTPException 422 UNPROCESSABLE ENTITY if file is encrypted or corrupted + """ try: pdf = PdfReader(file) # This will raise if the file is encrypted pdf.metadata + + # This will raise if the file's root object is corrupted + pdf.root_object + + # This will raise if the file's pages are corrupted + list(pdf.pages) + return pdf except FileNotDecryptedError: raise HTTPException( - status_code=400, + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, detail="File is encrypted. Please decrypt it with password.", ) - except PdfReadError: - raise HTTPException(status_code=422, detail="File does not appear to be a valid PDF") + except PdfReadError as e: + raise HTTPException( + status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, + detail=f"File does not appear to be a valid PDF. 
Error: {e}", + ) def _validate_strategy(strategy: str) -> str: From c1637cb4f0c63f33a0c06fa795ac437edd2f442b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Po=C5=82om?= Date: Tue, 17 Jun 2025 11:07:52 +0200 Subject: [PATCH 2/7] test: unit and integration test pdf check thrown error code and message --- sample-docs/failing-encrypted.pdf | Bin 0 -> 936 bytes sample-docs/failing-invalid.pdf | Bin 0 -> 70 bytes sample-docs/failing-missing-pages.pdf | Bin 0 -> 160 bytes sample-docs/failing-missing-root.pdf | Bin 0 -> 4858 bytes test_general/api/test_app.py | 42 ++++++++++++++++++++ test_general/api/test_general.py | 55 ++++++++++++++++++++++++++ 6 files changed, 97 insertions(+) create mode 100644 sample-docs/failing-encrypted.pdf create mode 100644 sample-docs/failing-invalid.pdf create mode 100644 sample-docs/failing-missing-pages.pdf create mode 100644 sample-docs/failing-missing-root.pdf create mode 100644 test_general/api/test_general.py diff --git a/sample-docs/failing-encrypted.pdf b/sample-docs/failing-encrypted.pdf new file mode 100644 index 0000000000000000000000000000000000000000..f207fbaa22ee6787379dad655bd9cb5e67541b67 GIT binary patch literal 936 zcmb_bKWh|06fYwqq|id_Ug4E-=l|@$a)n6_L?y?a+K9#M&g^+|?uNYyBw}x8VJC>6 zLGTOMDOgy<%2E-6jUT|`e0yG!#cQ3%GBfYb@4bERH@)G`V2fRsr1$On=VQWvLY+S* zSw@m!Q~QG=G?1l^389oI@;>SJNvM45!#lorkM;v3LpKR?l5E!pRRl(oTT?%W2O!aL z1P?o*Vl8yrMK`M_n6<@#7JJkQXdzDPI?qLtJHb!gjrtI?P!!EL7|Ssi7E#4e#W|=Y z6yi2`T?1~dr@*&EHHnWviZ<~)Ho-k1S7tA-j^Dkw_TkwkTzLKQOaIOO<;ADP&+OvK z#;ac&C%=DOTD;wSe*E?QR@;|V)T%?e1AZSkNk$+@vP#2vihm1|TWPi77R`g{EMmWt zadee;jVH+-WYoLB%YqrMEOU;PnwBQDDfK2TJ$F_Y!Wx_BxeQD@?F#3mGw8@eE)k7)#f6+N|lg!e;)1Cu9dNHf}l#&62s}>C=GaC8N5I z&~j}=lABdoLrd@Wyn}qnv=$scrIpY^;?4zI?x;ga1=VN}R0v`I3VP!S{V0m={|^Xc e>TXpuNBa?4wk$b6kFJT!CM%5(v$wl9Ab$WW+Vr3R literal 0 HcmV?d00001 diff --git a/sample-docs/failing-invalid.pdf b/sample-docs/failing-invalid.pdf new file mode 100644 index 
0000000000000000000000000000000000000000..a8ca9eae865d739f06c073d00a7e5fce6ab1cec3 GIT binary patch literal 70 zcmWH^$ShU>qP+YPg+zq_7dM5p%$!t(L?ACGM^C}4w75i}ASW?1PoX5W0>~^%ElJHw S$;?YvD9Kky0xAUQ(E|WRUl)G> literal 0 HcmV?d00001 diff --git a/sample-docs/failing-missing-pages.pdf b/sample-docs/failing-missing-pages.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b36599d1025da79b2f25508a2c8b4c4d4a33e614 GIT binary patch literal 160 zcmYL?u?~VT7=-tGid#Br0j<%5u&AsO^aX1CArL}M>7YKn&_Qp!`|eIvr*f69w@8`L z-FxIY^zHo=>Z1*Ia4mFYTbY>6K#ih+jE@Dx%S(fV8Iqs4GiGDh@b4WMQ;tONwj0F9 bi);6hxvFdD0+;VNJq7DS+%g)0jEK2af3)aZz72)ZqAW20jR!9?0C;Fkd2Xd>`(<$|n-@#=sxKCn zv1FPz8Qv7;7*r?h1;Qf%n~4WM2h*Qd zR}P-$k{I3Bj07#t>3br{;I&iF^3Ffrf8$R-p5=UzH!EnCuW+7IZ|HF_j^JL3q=i3x zs|TG!`?(~1aPsqC-iF{(pZfT~Qlwi;Gw@&KeKsG-!5rhAb7{2KfiD6VxoiXtIU<=3 zJ)xXLk4AgI!$1zZBaSoAtla1aw-*>Hb-%`+xDHzKR>d_*rT*;e3tb2MFMwTqsq07r zE#Sd#i$`e1v%Cvh19m-q+2?{`vTus>i!|oN36M!3SuAEZP7R`cq3;_nlW*?5_4#+- zM;iex47ed*zk4&#L`OToh3?_kE`!?u(gMlLbD%H148q|EnvZ_HL7NfoE`?hdKVjg} znc8|5>9mQ(*1S&#lTN23rmdzOh|`&DE|)u*v1-n|=M1Oa+Jcy?BSM{eRV*yvd^rBv z@E8KWS;6#H-KSwqW&~%ZD$YN`l8E{fvGf#Ucz%4zUwhF5Y8|q zFt2SlDTn5r_ZDmqZW!8h!wH5YR(0U)wV@P_xD7$E)o9f0WlD2fjy}yE-+}hxLHe7t zmTmD--ba$CHkqo31s|JIN!6=0n@M7d{O_}bE`@vHvtb3DjMW&ko$3;Qlv=6II19)t zx@w3M)++DFiKHAXIumNbN3-E5nxhC;UBr+;Cmp29VO=(Z1(28xm6k=uBp)J`*$PvF z{cpQ1$kAKhILhK;dP1CZy=AK=RxBoO8A7T2%Z6l9+lEk$*g7N#h0RDQc`ZArJ%MZU z%C;cS?-=AWPCZp`I#JNa0!3D_=Cqi1yr!R7vOLgh~S5+g=E#qqHYcrFkmqrojFAr^3J~E*apB zu->yTQlTm_HcpBgDoAIR3^G=@LtkK4ygpMtzW)}8)Uyk89Z1Xn>)jtvwL6d<0)33k zxbvGEq!7WO%y?+s-M2q^Qy8LJH$FHy{`CEyfun)}xO0=Ypz9URQpH-Q3MV8;()-Hc zzW(3;>d*@x-P&HWx%MBa(i{iM$=y^*0pA49tXlUz;N4D|hv;?S#+6-EF0asKK?jC4 z^WzU&UFdxU_~~~@OhqlcP=Mob6ix3ih;)?*kjfzkVU}wAS~z5}eXj_&@2@}MdX;Mj z9=ti0Luag?iXYXw2z>I;l)Sr@Mti&z$p!My7_j#O(TCp~3SEiLhVt1hAyj>M3xel? 
zUjv?Qk3UTzGFF}LTK^&E27LyRW_0xhhy*cB43Hqhd+#Z^p*gFE+vCRXfvN$1_9K2K`v6P zQcN8XKmbC2|6gwBpIPI*J$pZ;*)nuNJXhs1qVN5P^L>$sVhhlso$rLyt6a7oJt8Lo zC^E%JxU_Wge0UP7bb$6XGN7c;ad8SR1C$c*sZeIRl$c{oEi*+iz$nQw(~?ONnPq0E z^iWISsX?4UGTj7cYS2rl;BXfKYwBPXokp7)u}KAU2g8-5LCcV?VuHm!gH<(>1`4W% zX%#Jyny`dRHJVp7X)#SERwE0NI)(+tv@cshRpKucyu{`uOr-y+Wtqg8mGXJV7WXoqXTp@DV zY|cm~WL?puw3*8#G(%GpvYA!Y)X0}>(B)&t(;~~p_R~?h($2gT!vNMrr}k1eNF8|M z?#=K}_fS6BRXjD&b2SW3%F?qGF(-_PC;P&jO(&yw&;G8i9vK|r*M}}XiRK9);JIFv zGn~@;fB*g)pa-Ap-A~C3K<9G2Uz9EkQYB+RN;r=d@4wgGy~}9s%XUWOs%I|>67C!V z5<>8kb;Oyj>J0Kt5$>?`WA#muj4NAAq~Jam6KQ70z+@6zNWW+NjyIrJ6 z9a>5vz9M!%ZkMbl;zq#u4kMBhzAv_|s7N}3c$-KfQy<}+4pFjG%j^&-_^%-5XhrT6 zDV-v1l>;NtDmD+we1+N!L@ysAoy8uOPbX)`eroa_93x81<6yKQIdaEZHkasjx<*T!$+x3p^FG KHYqFI+W!EY#k1c4 literal 0 HcmV?d00001 diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py index bdc8c1aa..c4c4b51c 100644 --- a/test_general/api/test_app.py +++ b/test_general/api/test_app.py @@ -1155,3 +1155,45 @@ def test_include_slide_notes(monkeypatch, test_default, include_slide_notes, tes assert "Here are important notes" == df["text"][0] else: assert "Here are important notes" != df["text"][0] + + +@pytest.mark.parametrize( + ("pdf_name", "expected_error_message"), + [ + ("failing-encrypted.pdf", "File is encrypted. Please decrypt it with password."), + ( + "failing-invalid.pdf", + "File does not appear to be a valid PDF. Error: Stream has ended unexpectedly", + ), + ( + "failing-missing-root.pdf", + "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf", + ), + ( + "failing-missing-pages.pdf", + "File does not appear to be a valid PDF. 
Error: Invalid object in /Pages", + ), + ], +) +@pytest.mark.parametrize( + "strategy", + [ + "auto", + "fast", + "hi_res", + "ocr_only", + ], +) +def test_failing_pdfs_return_422(pdf_name: str, expected_error_message: str, strategy: str): + client = TestClient(app) + test_file = Path(__file__).parent.parent.parent / "sample-docs" / pdf_name + + with open(test_file, "rb") as f: + response = client.post( + MAIN_API_ROUTE, + files=[("files", (str(test_file), f))], + data={"strategy": strategy}, + ) + + assert response.status_code == 422 + assert expected_error_message == str(response.json()["detail"]) diff --git a/test_general/api/test_general.py b/test_general/api/test_general.py new file mode 100644 index 00000000..834048ab --- /dev/null +++ b/test_general/api/test_general.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +import io +from pathlib import Path + +import pytest +from fastapi import HTTPException +from pypdf import PdfReader + +from prepline_general.api.general import _check_pdf + +TEST_ASSETS_DIR = Path(__file__).parent.parent.parent / "sample-docs" + + +def _open_pdf(pdf_path: str) -> io.BytesIO: + with open(pdf_path, "rb") as f: + pdf_content = f.read() + return io.BytesIO(pdf_content) + + +def test_check_pdf_with_valid_pdf(): + pdf_path = str(TEST_ASSETS_DIR / "list-item-example.pdf") + pdf = _open_pdf(pdf_path) + + result = _check_pdf(pdf) + assert isinstance(result, PdfReader) + + +@pytest.mark.parametrize( + ("pdf_name", "expected_error_message"), + [ + ("failing-encrypted.pdf", "File is encrypted. Please decrypt it with password."), + ( + "failing-invalid.pdf", + "File does not appear to be a valid PDF. Error: Stream has ended unexpectedly", + ), + ( + "failing-missing-root.pdf", + "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf", + ), + ( + "failing-missing-pages.pdf", + "File does not appear to be a valid PDF. 
Error: Invalid object in /Pages", + ), + ], +) +def test_check_pdf_with_invalid_pdf(pdf_name: str, expected_error_message: str): + pdf_path = str(TEST_ASSETS_DIR / pdf_name) + pdf = _open_pdf(pdf_path) + + with pytest.raises(HTTPException) as exc_info: + _check_pdf(pdf) + + assert exc_info.value.status_code == 422 + assert expected_error_message == str(exc_info.value.detail) From 4b564118ac6da3e91fc731703ea624f9b65c2313 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Po=C5=82om?= Date: Tue, 17 Jun 2025 11:14:49 +0200 Subject: [PATCH 3/7] fix: black formatting --- prepline_general/api/general.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index 50073073..d67c3e0b 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -51,9 +51,7 @@ def is_compatible_response_type(media_type: str, response_type: type) -> bool: return ( False if media_type == "application/json" and response_type not in [dict, list] - else False - if media_type == "text/csv" and response_type != str - else True + else False if media_type == "text/csv" and response_type != str else True ) From e96d58398cf197ff6265eec0a9b6ce3e43127b13 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Po=C5=82om?= Date: Tue, 17 Jun 2025 11:20:31 +0200 Subject: [PATCH 4/7] chore: version bump to 0.0.87 --- CHANGELOG.md | 3 +++ prepline_general/api/app.py | 2 +- prepline_general/api/general.py | 4 ++-- preprocessing-pipeline-family.yaml | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cd37b437..c28d4e4a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,6 @@ +## 0.0.87 +* Return 422 HTTP code when PDF can't be processed + ## 0.0.86 * Patch various CVEs diff --git a/prepline_general/api/app.py b/prepline_general/api/app.py index 26380bfb..5868797d 100644 --- a/prepline_general/api/app.py +++ b/prepline_general/api/app.py @@ -13,7 +13,7 @@ app = 
FastAPI( title="Unstructured Pipeline API", summary="Partition documents with the Unstructured library", - version="0.0.86", + version="0.0.87", docs_url="/general/docs", openapi_url="/general/openapi.json", servers=[ diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index d67c3e0b..d5a1b3e3 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -618,7 +618,7 @@ def return_content_type(filename: str): @router.get("/general/v0/general", include_in_schema=False) -@router.get("/general/v0.0.86/general", include_in_schema=False) +@router.get("/general/v0.0.87/general", include_in_schema=False) async def handle_invalid_get_request(): raise HTTPException( status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported." @@ -633,7 +633,7 @@ async def handle_invalid_get_request(): description="Description", operation_id="partition_parameters", ) -@router.post("/general/v0.0.86/general", include_in_schema=False) +@router.post("/general/v0.0.87/general", include_in_schema=False) def general_partition( request: Request, # cannot use annotated type here because of a bug described here: diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml index 039aef55..a7c2dac9 100644 --- a/preprocessing-pipeline-family.yaml +++ b/preprocessing-pipeline-family.yaml @@ -1,2 +1,2 @@ name: general -version: 0.0.86 +version: 0.0.87 From 53e5b705a7d1e66d0efc4bce624c0f367a305e2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Po=C5=82om?= Date: Tue, 17 Jun 2025 11:37:45 +0200 Subject: [PATCH 5/7] chore: version bump to 0.0.88 --- CHANGELOG.md | 4 ++-- prepline_general/api/__version__.py | 2 +- preprocessing-pipeline-family.yaml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2f83af61..5fd5e3ad 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,7 @@ -## 0.0.87 +## 0.0.88 * Return 422 HTTP code when PDF can't be processed -## 
0.0.86 +## 0.0.87 * Patch various CVEs * Enable pytest concurrency * Enable Claude Code diff --git a/prepline_general/api/__version__.py b/prepline_general/api/__version__.py index 52a2b406..59542b61 100644 --- a/prepline_general/api/__version__.py +++ b/prepline_general/api/__version__.py @@ -1 +1 @@ -__version__ = "0.0.87" # pragma: no cover +__version__ = "0.0.88" # pragma: no cover diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml index a7c2dac9..3a1a681d 100644 --- a/preprocessing-pipeline-family.yaml +++ b/preprocessing-pipeline-family.yaml @@ -1,2 +1,2 @@ name: general -version: 0.0.87 +version: 0.0.88 From 91f335bfd7771939837711716188c79fe6b5b0e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Po=C5=82om?= Date: Tue, 17 Jun 2025 12:35:08 +0200 Subject: [PATCH 6/7] chore: clean up tests for invalid pdfs --- test_general/api/test_app.py | 100 +++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 46 deletions(-) diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py index c4c4b51c..58b088ad 100644 --- a/test_general/api/test_app.py +++ b/test_general/api/test_app.py @@ -495,7 +495,9 @@ def test_general_api_returns_422_bad_pdf(): response = client.post( MAIN_API_ROUTE, files=[("files", (str(tmp.name), open(tmp.name, "rb"), "application/pdf"))] ) - assert response.json() == {"detail": "File does not appear to be a valid PDF"} + assert response.json() == { + "detail": "File does not appear to be a valid PDF. Error: Cannot read an empty file" + } assert response.status_code == 422 tmp.close() @@ -506,10 +508,58 @@ def test_general_api_returns_422_bad_pdf(): files=[("files", (str(test_file), open(test_file, "rb"), "application/pdf"))], ) - assert response.json() == {"detail": "File does not appear to be a valid PDF"} + assert response.json() == { + "detail": "File does not appear to be a valid PDF. 
Error: Cannot read an empty file" + } assert response.status_code == 422 +@pytest.mark.parametrize( + ("pdf_name", "expected_error_message"), + [ + ( + "failing-invalid.pdf", + "File does not appear to be a valid PDF. Error: Stream has ended unexpectedly", + ), + ( + "failing-missing-root.pdf", + "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf", + ), + ( + "failing-missing-pages.pdf", + "File does not appear to be a valid PDF. Error: Invalid object in /Pages", + ), + ], +) +@pytest.mark.parametrize( + "strategy", + [ + "auto", + "fast", + "hi_res", + "ocr_only", + ], +) +def test_general_api_returns_422_invalid_pdf( + pdf_name: str, expected_error_message: str, strategy: str +): + """ + Verify that we get a 422 with the correct error message for invalid PDF files + """ + client = TestClient(app) + test_file = Path(__file__).parent.parent.parent / "sample-docs" / pdf_name + + with open(test_file, "rb") as f: + response = client.post( + MAIN_API_ROUTE, + files=[("files", (str(test_file), f))], + data={"strategy": strategy}, + ) + + assert response.status_code == 422 + assert expected_error_message == str(response.json()["detail"]) + + def test_general_api_returns_503(monkeypatch): """ When available memory is below the minimum. return a 503, unless our origin ip is 10.{4,5}.x.x @@ -939,13 +989,13 @@ def test_encrypted_pdf(): writer.encrypt(user_password="password123") writer.write(temp_file.name) - # Response should be 400 + # Response should be 422 response = client.post( MAIN_API_ROUTE, files=[("files", (str(temp_file.name), open(temp_file.name, "rb"), "application/pdf"))], ) assert response.json() == {"detail": "File is encrypted. Please decrypt it with password."} - assert response.status_code == 400 + assert response.status_code == 422 # This file is owner encrypted, i.e. 
readable with edit restrictions writer = PdfWriter() @@ -1155,45 +1205,3 @@ def test_include_slide_notes(monkeypatch, test_default, include_slide_notes, tes assert "Here are important notes" == df["text"][0] else: assert "Here are important notes" != df["text"][0] - - -@pytest.mark.parametrize( - ("pdf_name", "expected_error_message"), - [ - ("failing-encrypted.pdf", "File is encrypted. Please decrypt it with password."), - ( - "failing-invalid.pdf", - "File does not appear to be a valid PDF. Error: Stream has ended unexpectedly", - ), - ( - "failing-missing-root.pdf", - "File does not appear to be a valid PDF. Error: Cannot find Root object in pdf", - ), - ( - "failing-missing-pages.pdf", - "File does not appear to be a valid PDF. Error: Invalid object in /Pages", - ), - ], -) -@pytest.mark.parametrize( - "strategy", - [ - "auto", - "fast", - "hi_res", - "ocr_only", - ], -) -def test_failing_pdfs_return_422(pdf_name: str, expected_error_message: str, strategy: str): - client = TestClient(app) - test_file = Path(__file__).parent.parent.parent / "sample-docs" / pdf_name - - with open(test_file, "rb") as f: - response = client.post( - MAIN_API_ROUTE, - files=[("files", (str(test_file), f))], - data={"strategy": strategy}, - ) - - assert response.status_code == 422 - assert expected_error_message == str(response.json()["detail"]) From ca1f3e4a69991a7173b8e74cea016698eb265aed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Po=C5=82om?= Date: Tue, 17 Jun 2025 12:56:49 +0200 Subject: [PATCH 7/7] fix: test_general_api_returns_422_bad_pdf error message assert fix --- test_general/api/test_app.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py index 58b088ad..bc41708b 100644 --- a/test_general/api/test_app.py +++ b/test_general/api/test_app.py @@ -495,9 +495,7 @@ def test_general_api_returns_422_bad_pdf(): response = client.post( MAIN_API_ROUTE, files=[("files", (str(tmp.name), 
open(tmp.name, "rb"), "application/pdf"))] ) - assert response.json() == { - "detail": "File does not appear to be a valid PDF. Error: Cannot read an empty file" - } + assert "File does not appear to be a valid PDF" in response.json()["detail"] assert response.status_code == 422 tmp.close() @@ -508,9 +506,7 @@ def test_general_api_returns_422_bad_pdf(): files=[("files", (str(test_file), open(test_file, "rb"), "application/pdf"))], ) - assert response.json() == { - "detail": "File does not appear to be a valid PDF. Error: Cannot read an empty file" - } + assert "File does not appear to be a valid PDF" in response.json()["detail"] assert response.status_code == 422