Skip to content

Commit e779b3e

Browse files
committed
Fix file data serialization and add unit tests
1 parent ac790ac commit e779b3e

File tree

11 files changed

+175
-21
lines changed

11 files changed

+175
-21
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.0.15
2+
3+
* **Bugfix for file data serialization**
4+
15
## 0.0.14
26

37
* **Add support for batch file data**

requirements/cli.txt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ idna==3.10
3232
# via anyio
3333
importlib-metadata==8.5.0
3434
# via opentelemetry-api
35-
marshmallow==3.23.1
35+
marshmallow==3.23.2
3636
# via dataclasses-json
3737
mypy-extensions==1.0.0
3838
# via typing-inspect
@@ -84,15 +84,15 @@ packaging==24.2
8484
# opentelemetry-instrumentation
8585
pandas==2.2.3
8686
# via unstructured-ingest
87-
protobuf==5.29.1
87+
protobuf==5.29.2
8888
# via
8989
# googleapis-common-protos
9090
# opentelemetry-proto
91-
pydantic==2.10.3
91+
pydantic==2.10.4
9292
# via
9393
# fastapi
9494
# unstructured-ingest
95-
pydantic-core==2.27.1
95+
pydantic-core==2.27.2
9696
# via pydantic
9797
python-dateutil==2.9.0.post0
9898
# via
@@ -122,7 +122,7 @@ typing-inspect==0.9.0
122122
# via dataclasses-json
123123
tzdata==2024.2
124124
# via pandas
125-
unstructured-ingest==0.3.10
125+
unstructured-ingest==0.3.11
126126
# via -r ./cli.in
127127
uvicorn==0.34.0
128128
# via -r ./cli.in

requirements/constraints.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
unstructured-ingest>=0.3.10
1+
unstructured-ingest>=0.3.1

requirements/lint.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ pyflakes==3.2.0
3434
# via
3535
# autoflake
3636
# flake8
37-
ruff==0.8.3
37+
ruff==0.8.4
3838
# via -r ./lint.in
3939
tomli==2.2.1
4040
# via

test/api/__init__.py

Whitespace-only changes.

test/api/test_api.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
from pathlib import Path
2+
from typing import Any, Optional
3+
4+
import pytest
5+
from fastapi.testclient import TestClient
6+
from pydantic import BaseModel
7+
from unstructured_ingest.v2.interfaces import BatchFileData, BatchItem, FileData, SourceIdentifiers
8+
9+
from unstructured_platform_plugins.etl_uvicorn.api_generator import (
10+
EtlApiException,
11+
UsageData,
12+
wrap_in_fastapi,
13+
)
14+
from unstructured_platform_plugins.schema.filedata_meta import FileDataMeta
15+
16+
17+
class InvokeResponse(BaseModel):
18+
usage: list[UsageData]
19+
status_code: int
20+
filedata_meta: FileDataMeta
21+
status_code_text: Optional[str] = None
22+
output: Optional[Any] = None
23+
24+
def generic_validation(self):
25+
assert self.status_code == 200
26+
assert not self.status_code_text
27+
28+
29+
mock_file_data = [
30+
FileData(
31+
identifier="mock file data",
32+
connector_type="CON",
33+
source_identifiers=SourceIdentifiers(filename="n", fullpath="n"),
34+
),
35+
BatchFileData(
36+
identifier="mock file data", connector_type="CON", batch_items=[BatchItem(identifier="bid")]
37+
),
38+
]
39+
40+
41+
@pytest.mark.parametrize(
42+
"file_data", mock_file_data, ids=[type(fd).__name__ for fd in mock_file_data]
43+
)
44+
def test_async_sample_function(file_data):
45+
from test.assets.async_typed_dict_response import async_sample_function as test_fn
46+
47+
client = TestClient(wrap_in_fastapi(func=test_fn, plugin_id="mock_plugin"))
48+
49+
post_body = {"file_data": file_data.model_dump(), "content": {"a": 1, "b": 2}}
50+
resp = client.post("/invoke", json=post_body)
51+
resp_content = resp.json()
52+
invoke_response = InvokeResponse.model_validate(resp_content)
53+
invoke_response.generic_validation()
54+
output = invoke_response.output
55+
assert isinstance(output, dict)
56+
assert output == {"response": {"a_out": 1, "b_out": 2}}
57+
58+
59+
@pytest.mark.parametrize(
60+
"file_data", mock_file_data, ids=[type(fd).__name__ for fd in mock_file_data]
61+
)
62+
def test_dataclass_response(file_data):
63+
from test.assets.dataclass_response import sample_function_with_path as test_fn
64+
65+
client = TestClient(wrap_in_fastapi(func=test_fn, plugin_id="mock_plugin"))
66+
current_path = Path(__file__)
67+
68+
post_body = {"file_data": file_data.model_dump(), "c": 1, "b": "2", "a": str(current_path)}
69+
resp = client.post("/invoke", json=post_body)
70+
resp_content = resp.json()
71+
invoke_response = InvokeResponse.model_validate(resp_content)
72+
invoke_response.generic_validation()
73+
output = invoke_response.output
74+
assert isinstance(output, dict)
75+
assert output == {
76+
"t": "PosixPath",
77+
"exists": True,
78+
"resolved": str(current_path.resolve()),
79+
"b": "2",
80+
"c": 1,
81+
}
82+
83+
84+
@pytest.mark.parametrize(
85+
"file_data", mock_file_data, ids=[type(fd).__name__ for fd in mock_file_data]
86+
)
87+
def test_empty_input_and_output(file_data):
88+
from test.assets.empty_input_and_response import SampleClass as TestClass
89+
90+
test_class = TestClass()
91+
client = TestClient(wrap_in_fastapi(func=test_class.do_nothing, plugin_id="mock_plugin"))
92+
93+
resp = client.post("/invoke", json={"file_data": file_data.model_dump()})
94+
resp_content = resp.json()
95+
invoke_response = InvokeResponse.model_validate(resp_content)
96+
invoke_response.generic_validation()
97+
output = invoke_response.output
98+
assert not output
99+
100+
101+
@pytest.mark.parametrize(
102+
"file_data", mock_file_data, ids=[type(fd).__name__ for fd in mock_file_data]
103+
)
104+
def test_filedata_meta(file_data):
105+
from test.assets.filedata_meta import Input
106+
from test.assets.filedata_meta import process_input as test_fn
107+
108+
client = TestClient(wrap_in_fastapi(func=test_fn, plugin_id="mock_plugin"))
109+
110+
post_body = {"file_data": file_data.model_dump(), "i": Input(m=15).model_dump()}
111+
resp = client.post("/invoke", json=post_body)
112+
resp_content = resp.json()
113+
invoke_response = InvokeResponse.model_validate(resp_content)
114+
invoke_response.generic_validation()
115+
filedata_meta = invoke_response.filedata_meta
116+
assert len(filedata_meta.new_records) == 15
117+
assert filedata_meta.terminate_current
118+
assert not invoke_response.output
119+
120+
121+
def test_improper_function():
122+
from test.assets.improper_function import sample_improper_function as test_fn
123+
124+
with pytest.raises(EtlApiException):
125+
TestClient(wrap_in_fastapi(func=test_fn, plugin_id="mock_plugin"))

test/assets/async_typed_dict_response.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
from typing import Any, TypedDict
1+
from typing import Any
2+
3+
from typing_extensions import TypedDict
24

35

46
class SampleFunctionResponse(TypedDict):

test/assets/filedata_meta.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import math
2-
from typing import Optional
2+
from typing import Optional, Union
33

44
from pydantic import BaseModel
5-
from unstructured_ingest.v2.interfaces import FileData
5+
from unstructured_ingest.v2.interfaces import BatchFileData, FileData, SourceIdentifiers
66

77
from unstructured_platform_plugins.schema import FileDataMeta, NewRecord
88

@@ -15,17 +15,21 @@ class Output(BaseModel):
1515
n: float
1616

1717

18-
def process_input(i: Input, file_data: FileData, filedata_meta: FileDataMeta) -> Optional[Output]:
18+
def process_input(
19+
i: Input, file_data: Union[FileData, BatchFileData], filedata_meta: FileDataMeta
20+
) -> Optional[Output]:
1921
if i.m > 10:
2022
filedata_meta.terminate_current = True
2123
new_content = [
2224
NewRecord(
2325
file_data=FileData(
24-
identifier=str(i.m + x), connector_type=file_data.connector_type
26+
identifier=str(i.m + x),
27+
connector_type=file_data.connector_type,
28+
source_identifiers=SourceIdentifiers(filename=f"{x}.txt", fullpath=f"{x}.txt"),
2529
),
2630
contents=Output(n=float(i.m + x)),
2731
)
28-
for x in range(5)
32+
for x in range(i.m)
2933
]
3034
filedata_meta.new_records.extend(new_content)
3135
return None
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.0.14" # pragma: no cover
1+
__version__ = "0.0.15" # pragma: no cover

unstructured_platform_plugins/etl_uvicorn/api_generator.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
1212
from pydantic import BaseModel, Field, create_model
1313
from starlette.responses import RedirectResponse
14-
from unstructured_ingest.v2.interfaces import FileData
1514
from uvicorn.config import LOG_LEVELS
1615
from uvicorn.importer import import_from_string
1716

@@ -30,6 +29,11 @@
3029
schema_to_base_model,
3130
)
3231

32+
33+
class EtlApiException(Exception):
34+
pass
35+
36+
3337
logger = logging.getLogger("uvicorn.error")
3438

3539

@@ -100,7 +104,19 @@ def wrap_in_fastapi(
100104
func: Callable,
101105
plugin_id: str,
102106
precheck_func: Optional[Callable] = None,
103-
):
107+
) -> FastAPI:
108+
try:
109+
return _wrap_in_fastapi(func=func, plugin_id=plugin_id, precheck_func=precheck_func)
110+
except Exception as e:
111+
logger.error(f"failed to wrap function in FastAPI: {e}", exc_info=True)
112+
raise EtlApiException(e) from e
113+
114+
115+
def _wrap_in_fastapi(
116+
func: Callable,
117+
plugin_id: str,
118+
precheck_func: Optional[Callable] = None,
119+
) -> FastAPI:
104120
if precheck_func is not None:
105121
check_precheck_func(precheck_func=precheck_func)
106122

@@ -184,11 +200,6 @@ async def run_job(request: input_schema_model) -> ResponseType:
184200
log_func_and_body(func=func, body=request.json())
185201
# Create dictionary from pydantic model while preserving underlying types
186202
request_dict = {f: getattr(request, f) for f in request.model_fields}
187-
# Map FileData back to original dataclass if present
188-
if "file_data" in request_dict:
189-
request_dict["file_data"] = FileData.from_dict(
190-
request_dict["file_data"].model_dump()
191-
)
192203
map_inputs(func=func, raw_inputs=request_dict)
193204
if logger.level == LOG_LEVELS.get("trace", logging.NOTSET):
194205
logger.log(level=logger.level, msg=f"passing inputs to function: {request_dict}")

unstructured_platform_plugins/etl_uvicorn/utils.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,11 +101,19 @@ def map_inputs(func: Callable, raw_inputs: dict[str, Any]) -> dict[str, Any]:
101101
field_value = raw_inputs[field_name]
102102
try:
103103
if (
104+
hasattr(type_data, "__origin__")
105+
and type_data.__origin__ is dict
106+
and isinstance(raw_inputs[field_name], dict)
107+
):
108+
# Expects a dict and value is already a dict
109+
continue
110+
elif (
104111
inspect.isclass(type_data)
105112
and issubclass(type_data, DataClassJsonMixin)
106113
and isinstance(field_value, dict)
107114
):
108115
raw_inputs[field_name] = type_data.from_dict(raw_inputs[field_name])
116+
109117
elif is_dataclass(type_data) and isinstance(field_value, dict):
110118
raw_inputs[field_name] = type_data(**raw_inputs[field_name])
111119
elif isinstance(type_data, EnumMeta):

0 commit comments

Comments
 (0)