From e779b3e9937553fb60bd765f0dffdc741af75224 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Thu, 19 Dec 2024 10:59:04 -0500 Subject: [PATCH 1/3] Fix file data serialization and add unit tests --- CHANGELOG.md | 4 + requirements/cli.txt | 10 +- requirements/constraints.txt | 2 +- requirements/lint.txt | 2 +- test/api/__init__.py | 0 test/api/test_api.py | 125 ++++++++++++++++++ test/assets/async_typed_dict_response.py | 4 +- test/assets/filedata_meta.py | 14 +- unstructured_platform_plugins/__version__.py | 2 +- .../etl_uvicorn/api_generator.py | 25 +++- .../etl_uvicorn/utils.py | 8 ++ 11 files changed, 175 insertions(+), 21 deletions(-) create mode 100644 test/api/__init__.py create mode 100644 test/api/test_api.py diff --git a/CHANGELOG.md b/CHANGELOG.md index e91fcdd..caee205 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.0.15 + +* **Bugfix for file data serialization** + ## 0.0.14 * **Add support for batch file data** diff --git a/requirements/cli.txt b/requirements/cli.txt index 3f40021..747a429 100644 --- a/requirements/cli.txt +++ b/requirements/cli.txt @@ -32,7 +32,7 @@ idna==3.10 # via anyio importlib-metadata==8.5.0 # via opentelemetry-api -marshmallow==3.23.1 +marshmallow==3.23.2 # via dataclasses-json mypy-extensions==1.0.0 # via typing-inspect @@ -84,15 +84,15 @@ packaging==24.2 # opentelemetry-instrumentation pandas==2.2.3 # via unstructured-ingest -protobuf==5.29.1 +protobuf==5.29.2 # via # googleapis-common-protos # opentelemetry-proto -pydantic==2.10.3 +pydantic==2.10.4 # via # fastapi # unstructured-ingest -pydantic-core==2.27.1 +pydantic-core==2.27.2 # via pydantic python-dateutil==2.9.0.post0 # via @@ -122,7 +122,7 @@ typing-inspect==0.9.0 # via dataclasses-json tzdata==2024.2 # via pandas -unstructured-ingest==0.3.10 +unstructured-ingest==0.3.11 # via -r ./cli.in uvicorn==0.34.0 # via -r ./cli.in diff --git a/requirements/constraints.txt b/requirements/constraints.txt index 743fbed..d7a9a12 100644 --- a/requirements/constraints.txt +++ b/requirements/constraints.txt @@ -1 +1 @@ -unstructured-ingest>=0.3.10 +unstructured-ingest>=0.3.1 diff --git a/requirements/lint.txt b/requirements/lint.txt index a29def1..ed26d42 100644 --- a/requirements/lint.txt +++ b/requirements/lint.txt @@ -34,7 +34,7 @@ pyflakes==3.2.0 # via # autoflake # flake8 -ruff==0.8.3 +ruff==0.8.4 # via -r ./lint.in tomli==2.2.1 # via diff --git a/test/api/__init__.py b/test/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/api/test_api.py b/test/api/test_api.py new file mode 100644 index 0000000..9792bce --- /dev/null +++ b/test/api/test_api.py @@ -0,0 +1,125 @@ +from pathlib import Path +from typing import Any, Optional + +import pytest +from fastapi.testclient import TestClient +from pydantic import BaseModel +from unstructured_ingest.v2.interfaces import BatchFileData, BatchItem, FileData, SourceIdentifiers + +from unstructured_platform_plugins.etl_uvicorn.api_generator import ( + EtlApiException, + UsageData, + wrap_in_fastapi, +) +from unstructured_platform_plugins.schema.filedata_meta import FileDataMeta + + +class InvokeResponse(BaseModel): + usage: list[UsageData] + status_code: int + filedata_meta: FileDataMeta + status_code_text: Optional[str] = None + output: Optional[Any] = None + + def generic_validation(self): + assert self.status_code == 200 + assert not self.status_code_text + + +mock_file_data = [ + FileData( + identifier="mock file data", + connector_type="CON", + source_identifiers=SourceIdentifiers(filename="n", fullpath="n"), + ), + BatchFileData( + identifier="mock file data", connector_type="CON", batch_items=[BatchItem(identifier="bid")] + ), +] + + +@pytest.mark.parametrize( + "file_data", mock_file_data, ids=[type(fd).__name__ for fd in mock_file_data] +) +def test_async_sample_function(file_data): + from test.assets.async_typed_dict_response import async_sample_function as test_fn + + client = TestClient(wrap_in_fastapi(func=test_fn, plugin_id="mock_plugin")) + + post_body = {"file_data": file_data.model_dump(), "content": {"a": 1, "b": 2}} + resp = client.post("/invoke", json=post_body) + resp_content = resp.json() + invoke_response = InvokeResponse.model_validate(resp_content) + invoke_response.generic_validation() + output = invoke_response.output + assert isinstance(output, dict) + assert output == {"response": {"a_out": 1, "b_out": 2}} + + +@pytest.mark.parametrize( + "file_data", mock_file_data, ids=[type(fd).__name__ for fd in mock_file_data] +) +def test_dataclass_response(file_data): + from test.assets.dataclass_response import sample_function_with_path as test_fn + + client = TestClient(wrap_in_fastapi(func=test_fn, plugin_id="mock_plugin")) + current_path = Path(__file__) + + post_body = {"file_data": file_data.model_dump(), "c": 1, "b": "2", "a": str(current_path)} + resp = client.post("/invoke", json=post_body) + resp_content = resp.json() + invoke_response = InvokeResponse.model_validate(resp_content) + invoke_response.generic_validation() + output = invoke_response.output + assert isinstance(output, dict) + assert output == { + "t": "PosixPath", + "exists": True, + "resolved": str(current_path.resolve()), + "b": "2", + "c": 1, + } + + +@pytest.mark.parametrize( + "file_data", mock_file_data, ids=[type(fd).__name__ for fd in mock_file_data] +) +def test_empty_input_and_output(file_data): + from test.assets.empty_input_and_response import SampleClass as TestClass + + test_class = TestClass() + client = TestClient(wrap_in_fastapi(func=test_class.do_nothing, plugin_id="mock_plugin")) + + resp = client.post("/invoke", json={"file_data": file_data.model_dump()}) + resp_content = resp.json() + invoke_response = InvokeResponse.model_validate(resp_content) + invoke_response.generic_validation() + output = invoke_response.output + assert not output + + +@pytest.mark.parametrize( + "file_data", mock_file_data, ids=[type(fd).__name__ for fd in mock_file_data] +) +def test_filedata_meta(file_data): + from test.assets.filedata_meta import Input + from test.assets.filedata_meta import process_input as test_fn + + client = TestClient(wrap_in_fastapi(func=test_fn, plugin_id="mock_plugin")) + + post_body = {"file_data": file_data.model_dump(), "i": Input(m=15).model_dump()} + resp = client.post("/invoke", json=post_body) + resp_content = resp.json() + invoke_response = InvokeResponse.model_validate(resp_content) + invoke_response.generic_validation() + filedata_meta = invoke_response.filedata_meta + assert len(filedata_meta.new_records) == 15 + assert filedata_meta.terminate_current + assert not invoke_response.output + + +def test_improper_function(): + from test.assets.improper_function import sample_improper_function as test_fn + + with pytest.raises(EtlApiException): + TestClient(wrap_in_fastapi(func=test_fn, plugin_id="mock_plugin")) diff --git a/test/assets/async_typed_dict_response.py b/test/assets/async_typed_dict_response.py index fa1db71..e8d9877 100644 --- a/test/assets/async_typed_dict_response.py +++ b/test/assets/async_typed_dict_response.py @@ -1,4 +1,6 @@ -from typing import Any, TypedDict +from typing import Any + +from typing_extensions import TypedDict class SampleFunctionResponse(TypedDict): diff --git a/test/assets/filedata_meta.py b/test/assets/filedata_meta.py index 55ebd0a..d1808db 100644 --- a/test/assets/filedata_meta.py +++ b/test/assets/filedata_meta.py @@ -1,8 +1,8 @@ import math -from typing import Optional +from typing import Optional, Union from pydantic import BaseModel -from unstructured_ingest.v2.interfaces import FileData +from unstructured_ingest.v2.interfaces import BatchFileData, FileData, SourceIdentifiers from unstructured_platform_plugins.schema import FileDataMeta, NewRecord @@ -15,17 +15,21 @@ class Output(BaseModel): n: float -def process_input(i: Input, file_data: FileData, filedata_meta: FileDataMeta) -> Optional[Output]: +def process_input( + i: Input, file_data: Union[FileData, BatchFileData], filedata_meta: FileDataMeta +) -> Optional[Output]: if i.m > 10: filedata_meta.terminate_current = True new_content = [ NewRecord( file_data=FileData( - identifier=str(i.m + x), connector_type=file_data.connector_type + identifier=str(i.m + x), + connector_type=file_data.connector_type, + source_identifiers=SourceIdentifiers(filename=f"{x}.txt", fullpath=f"{x}.txt"), ), contents=Output(n=float(i.m + x)), ) - for x in range(5) + for x in range(i.m) ] filedata_meta.new_records.extend(new_content) return None diff --git a/unstructured_platform_plugins/__version__.py b/unstructured_platform_plugins/__version__.py index b3c929d..5864351 100644 --- a/unstructured_platform_plugins/__version__.py +++ b/unstructured_platform_plugins/__version__.py @@ -1 +1 @@ -__version__ = "0.0.14" # pragma: no cover +__version__ = "0.0.15" # pragma: no cover diff --git a/unstructured_platform_plugins/etl_uvicorn/api_generator.py b/unstructured_platform_plugins/etl_uvicorn/api_generator.py index a9dd8b0..b95e009 100644 --- a/unstructured_platform_plugins/etl_uvicorn/api_generator.py +++ b/unstructured_platform_plugins/etl_uvicorn/api_generator.py @@ -11,7 +11,6 @@ from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor from pydantic import BaseModel, Field, create_model from starlette.responses import RedirectResponse -from unstructured_ingest.v2.interfaces import FileData from uvicorn.config import LOG_LEVELS from uvicorn.importer import import_from_string @@ -30,6 +29,11 @@ schema_to_base_model, ) + +class EtlApiException(Exception): + pass + + logger = logging.getLogger("uvicorn.error") @@ -100,7 +104,19 @@ def wrap_in_fastapi( func: Callable, plugin_id: str, precheck_func: Optional[Callable] = None, -): +) -> FastAPI: + try: + return _wrap_in_fastapi(func=func, plugin_id=plugin_id, precheck_func=precheck_func) + except Exception as e: + logger.error(f"failed to wrap function in FastAPI: {e}", exc_info=True) + raise EtlApiException(e) from e + + +def _wrap_in_fastapi( + func: Callable, + plugin_id: str, + precheck_func: Optional[Callable] = None, +) -> FastAPI: if precheck_func is not None: check_precheck_func(precheck_func=precheck_func) @@ -184,11 +200,6 @@ async def run_job(request: input_schema_model) -> ResponseType: log_func_and_body(func=func, body=request.json()) # Create dictionary from pydantic model while preserving underlying types request_dict = {f: getattr(request, f) for f in request.model_fields} - # Map FileData back to original dataclass if present - if "file_data" in request_dict: - request_dict["file_data"] = FileData.from_dict( - request_dict["file_data"].model_dump() - ) map_inputs(func=func, raw_inputs=request_dict) if logger.level == LOG_LEVELS.get("trace", logging.NOTSET): logger.log(level=logger.level, msg=f"passing inputs to function: {request_dict}") diff --git a/unstructured_platform_plugins/etl_uvicorn/utils.py b/unstructured_platform_plugins/etl_uvicorn/utils.py index d391d82..4f43ec9 100644 --- a/unstructured_platform_plugins/etl_uvicorn/utils.py +++ b/unstructured_platform_plugins/etl_uvicorn/utils.py @@ -101,11 +101,19 @@ def map_inputs(func: Callable, raw_inputs: dict[str, Any]) -> dict[str, Any]: field_value = raw_inputs[field_name] try: if ( + hasattr(type_data, "__origin__") + and type_data.__origin__ is dict + and isinstance(raw_inputs[field_name], dict) + ): + # Expects a dict and value is already a dict + continue + elif ( inspect.isclass(type_data) and issubclass(type_data, DataClassJsonMixin) and isinstance(field_value, dict) ): raw_inputs[field_name] = type_data.from_dict(raw_inputs[field_name]) + elif is_dataclass(type_data) and isinstance(field_value, dict): raw_inputs[field_name] = type_data(**raw_inputs[field_name]) elif isinstance(type_data, EnumMeta): From 934316a542b098cbfed7f2c70417a805d8070e7d Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Thu, 19 Dec 2024 11:05:21 -0500 Subject: [PATCH 2/3] fix unit test --- test/test_schema.py | 64 ++++++++++++++++++++++----------------------- test/test_utils.py | 3 ++- 2 files changed, 34 insertions(+), 33 deletions(-) diff --git a/test/test_schema.py b/test/test_schema.py index f23a073..9c7ac06 100644 --- a/test/test_schema.py +++ b/test/test_schema.py @@ -449,22 +449,16 @@ def fn(a: FileData) -> list[FileData]: "identifier": {"type": "string"}, "connector_type": {"type": "string"}, "source_identifiers": { - "anyOf": [ - { - "type": "object", - "properties": { - "filename": {"type": "string"}, - "fullpath": {"type": "string"}, - "rel_path": { - "anyOf": [{"type": "string"}, {"type": "null"}], - "default": None, - }, - }, - "required": ["filename", "fullpath"], + "type": "object", + "properties": { + "filename": {"type": "string"}, + "fullpath": {"type": "string"}, + "rel_path": { + "anyOf": [{"type": "string"}, {"type": "null"}], + "default": None, }, - {"type": "null"}, - ], - "default": None, + }, + "required": ["filename", "fullpath"], }, "metadata": { "type": "object", @@ -533,7 +527,13 @@ def fn(a: FileData) -> list[FileData]: "default": None, }, }, - "required": ["identifier", "connector_type", "metadata", "additional_metadata"], + "required": [ + "identifier", + "connector_type", + "source_identifiers", + "metadata", + "additional_metadata", + ], } }, } @@ -551,22 +551,16 @@ def fn(a: FileData) -> list[FileData]: "identifier": {"type": "string"}, "connector_type": {"type": "string"}, "source_identifiers": { - "anyOf": [ - { - "type": "object", - "properties": { - "filename": {"type": "string"}, - "fullpath": {"type": "string"}, - "rel_path": { - "anyOf": [{"type": "string"}, {"type": "null"}], - "default": None, - }, - }, - "required": ["filename", "fullpath"], + "type": "object", + "properties": { + "filename": {"type": "string"}, + "fullpath": {"type": "string"}, + "rel_path": { + "anyOf": [{"type": "string"}, {"type": "null"}], + "default": None, }, - {"type": "null"}, - ], - "default": None, + }, + "required": ["filename", "fullpath"], }, "metadata": { "type": "object", @@ -629,7 +623,13 @@ def fn(a: FileData) -> list[FileData]: }, "display_name": {"anyOf": [{"type": "string"}, {"type": "null"}], "default": None}, }, - "required": ["identifier", "connector_type", "metadata", "additional_metadata"], + "required": [ + "identifier", + "connector_type", + "source_identifiers", + "metadata", + "additional_metadata", + ], }, } diff --git a/test/test_utils.py b/test/test_utils.py index 1d1c944..c4f8ab2 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,7 +4,7 @@ import pytest from pydantic import BaseModel -from unstructured_ingest.v2.interfaces import FileData +from unstructured_ingest.v2.interfaces import FileData, SourceIdentifiers from uvicorn.importer import import_from_string from unstructured_platform_plugins.etl_uvicorn import utils @@ -110,6 +110,7 @@ def fn(a: A, b: B, c: MyEnum, d: list, e: FileData) -> None: identifier="custom_file_data", connector_type="mock_connector", additional_metadata={"additional": "metadata"}, + source_identifiers=SourceIdentifiers(filename="file.txt", fullpath="file.txt"), ) inputs = { "a": {"b": 4, "c": 5.6}, From 660738a0143661abca969e506d9760db0fc99bd9 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Thu, 19 Dec 2024 11:14:40 -0500 Subject: [PATCH 3/3] add httpx dep to tests --- requirements/test.in | 1 + requirements/test.txt | 24 +++++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/requirements/test.in b/requirements/test.in index e079f8a..5769ad4 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -1 +1,2 @@ pytest +httpx diff --git a/requirements/test.txt b/requirements/test.txt index 67d51f9..516b2f0 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,7 +1,25 @@ # This file was autogenerated by uv via the following command: # uv pip compile ./test.in --output-file ./test.txt --no-strip-extras --python-version 3.10 +anyio==4.7.0 + # via httpx +certifi==2024.12.14 + # via + # httpcore + # httpx exceptiongroup==1.2.2 - # via pytest + # via + # anyio + # pytest +h11==0.14.0 + # via httpcore +httpcore==1.0.7 + # via httpx +httpx==0.28.1 + # via -r ./test.in +idna==3.10 + # via + # anyio + # httpx iniconfig==2.0.0 # via pytest packaging==24.2 @@ -10,5 +28,9 @@ pluggy==1.5.0 # via pytest pytest==8.3.4 # via -r ./test.in +sniffio==1.3.1 + # via anyio tomli==2.2.1 # via pytest +typing-extensions==4.12.2 + # via anyio