From bb9bd5442ef37a2bc829f86d866504ca476468a8 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Wed, 18 Dec 2024 10:29:14 -0500 Subject: [PATCH 1/2] update ingest dep and add support for batch file data --- requirements/cli.txt | 76 ++++---- requirements/constraints.txt | 2 +- requirements/lint.txt | 18 +- requirements/release.txt | 36 ++-- requirements/test.txt | 14 +- requirements/validate.txt | 12 +- scripts/pip-compile.sh | 32 ++-- test/test_schema.py | 162 +----------------- test/test_utils.py | 2 +- .../schema/json_schema.py | 10 +- 10 files changed, 100 insertions(+), 264 deletions(-) diff --git a/requirements/cli.txt b/requirements/cli.txt index 2d12542..3f40021 100644 --- a/requirements/cli.txt +++ b/requirements/cli.txt @@ -1,12 +1,8 @@ -# -# This file is autogenerated by pip-compile with Python 3.10 -# by the following command: -# -# pip-compile ./cli.in -# +# This file was autogenerated by uv via the following command: +# uv pip compile ./cli.in --output-file ./cli.txt --no-strip-extras --python-version 3.10 annotated-types==0.7.0 # via pydantic -anyio==4.6.0 +anyio==4.7.0 # via starlette asgiref==3.8.1 # via opentelemetry-instrumentation-asgi @@ -17,32 +13,34 @@ click==8.1.7 # uvicorn dataclasses-json==0.6.7 # via unstructured-ingest -deprecated==1.2.14 +deprecated==1.2.15 # via # opentelemetry-api # opentelemetry-exporter-otlp-proto-grpc # opentelemetry-semantic-conventions exceptiongroup==1.2.2 # via anyio -fastapi==0.115.0 +fastapi==0.115.6 # via -r ./cli.in -googleapis-common-protos==1.65.0 +googleapis-common-protos==1.66.0 # via opentelemetry-exporter-otlp-proto-grpc -grpcio==1.66.1 +grpcio==1.68.1 # via opentelemetry-exporter-otlp-proto-grpc h11==0.14.0 # via uvicorn idna==3.10 # via anyio -importlib-metadata==8.4.0 +importlib-metadata==8.5.0 # via opentelemetry-api -marshmallow==3.22.0 +marshmallow==3.23.1 # via dataclasses-json mypy-extensions==1.0.0 # via typing-inspect -numpy==2.1.1 +ndjson==0.3.1 + # via unstructured-ingest +numpy==2.2.0 # via pandas -opentelemetry-api==1.27.0 +opentelemetry-api==1.29.0 # via # opentelemetry-exporter-otlp-proto-grpc # opentelemetry-instrumentation @@ -50,48 +48,51 @@ opentelemetry-api==1.27.0 # opentelemetry-instrumentation-fastapi # opentelemetry-sdk # opentelemetry-semantic-conventions -opentelemetry-exporter-otlp-proto-common==1.27.0 +opentelemetry-exporter-otlp-proto-common==1.29.0 # via opentelemetry-exporter-otlp-proto-grpc -opentelemetry-exporter-otlp-proto-grpc==1.27.0 +opentelemetry-exporter-otlp-proto-grpc==1.29.0 # via -r ./cli.in -opentelemetry-instrumentation==0.48b0 +opentelemetry-instrumentation==0.50b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi -opentelemetry-instrumentation-asgi==0.48b0 +opentelemetry-instrumentation-asgi==0.50b0 # via opentelemetry-instrumentation-fastapi -opentelemetry-instrumentation-fastapi==0.48b0 +opentelemetry-instrumentation-fastapi==0.50b0 # via -r ./cli.in -opentelemetry-proto==1.27.0 +opentelemetry-proto==1.29.0 # via # opentelemetry-exporter-otlp-proto-common # opentelemetry-exporter-otlp-proto-grpc -opentelemetry-sdk==1.27.0 +opentelemetry-sdk==1.29.0 # via # opentelemetry-exporter-otlp-proto-grpc # unstructured-ingest -opentelemetry-semantic-conventions==0.48b0 +opentelemetry-semantic-conventions==0.50b0 # via + # opentelemetry-instrumentation # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi # opentelemetry-sdk -opentelemetry-util-http==0.48b0 +opentelemetry-util-http==0.50b0 # via # opentelemetry-instrumentation-asgi # opentelemetry-instrumentation-fastapi -packaging==24.1 - # via marshmallow +packaging==24.2 + # via + # marshmallow + # opentelemetry-instrumentation pandas==2.2.3 # via unstructured-ingest -protobuf==4.25.5 +protobuf==5.29.1 # via # googleapis-common-protos # opentelemetry-proto -pydantic==2.9.2 +pydantic==2.10.3 # via # fastapi # unstructured-ingest -pydantic-core==2.23.4 +pydantic-core==2.27.1 # via pydantic python-dateutil==2.9.0.post0 # via @@ -99,13 +100,13 @@ python-dateutil==2.9.0.post0 # unstructured-ingest pytz==2024.2 # via pandas -six==1.16.0 +six==1.17.0 # via python-dateutil sniffio==1.3.1 # via anyio -starlette==0.38.6 +starlette==0.41.3 # via fastapi -tqdm==4.66.5 +tqdm==4.67.1 # via unstructured-ingest typing-extensions==4.12.2 # via @@ -121,16 +122,13 @@ typing-inspect==0.9.0 # via dataclasses-json tzdata==2024.2 # via pandas -unstructured-ingest==0.0.18 +unstructured-ingest==0.3.10 # via -r ./cli.in -uvicorn==0.30.6 +uvicorn==0.34.0 # via -r ./cli.in -wrapt==1.16.0 +wrapt==1.17.0 # via # deprecated # opentelemetry-instrumentation -zipp==3.20.2 +zipp==3.21.0 # via importlib-metadata - -# The following packages are considered to be unsafe in a requirements file: -# setuptools diff --git a/requirements/constraints.txt b/requirements/constraints.txt index 747a2b2..743fbed 100644 --- a/requirements/constraints.txt +++ b/requirements/constraints.txt @@ -1 +1 @@ -unstructured-ingest>=0.0.18 +unstructured-ingest>=0.3.10 diff --git a/requirements/lint.txt b/requirements/lint.txt index 908d069..a29def1 100644 --- a/requirements/lint.txt +++ b/requirements/lint.txt @@ -1,12 +1,8 @@ -# -# This file is autogenerated by pip-compile with Python 3.10 -# by the following command: -# -# pip-compile ./lint.in -# +# This file was autogenerated by uv via the following command: +# uv pip compile ./lint.in --output-file ./lint.txt --no-strip-extras --python-version 3.10 autoflake==2.3.1 # via -r ./lint.in -black==24.8.0 +black==24.10.0 # via -r ./lint.in click==8.1.7 # via black @@ -18,13 +14,13 @@ flake8-print==5.0.0 # via -r ./lint.in mccabe==0.7.0 # via flake8 -mypy==1.11.2 +mypy==1.13.0 # via -r ./lint.in mypy-extensions==1.0.0 # via # black # mypy -packaging==24.1 +packaging==24.2 # via black pathspec==0.12.1 # via black @@ -38,9 +34,9 @@ pyflakes==3.2.0 # via # autoflake # flake8 -ruff==0.6.7 +ruff==0.8.3 # via -r ./lint.in -tomli==2.0.1 +tomli==2.2.1 # via # autoflake # black diff --git a/requirements/release.txt b/requirements/release.txt index 5a880bd..e579ff2 100644 --- a/requirements/release.txt +++ b/requirements/release.txt @@ -1,30 +1,24 @@ -# -# This file is autogenerated by pip-compile with Python 3.10 -# by the following command: -# -# pip-compile ./release.in -# +# This file was autogenerated by uv via the following command: +# uv pip compile ./release.in --output-file ./release.txt --no-strip-extras --python-version 3.10 backports-tarfile==1.2.0 # via jaraco-context -certifi==2024.8.30 +certifi==2024.12.14 # via requests -charset-normalizer==3.3.2 +charset-normalizer==3.4.0 # via requests docutils==0.21.2 # via readme-renderer idna==3.10 # via requests importlib-metadata==8.5.0 - # via - # keyring - # twine + # via keyring jaraco-classes==3.4.0 # via keyring jaraco-context==6.0.1 # via keyring -jaraco-functools==4.0.2 +jaraco-functools==4.1.0 # via keyring -keyring==25.4.1 +keyring==25.5.0 # via twine markdown-it-py==3.0.0 # via rich @@ -34,9 +28,11 @@ more-itertools==10.5.0 # via # jaraco-classes # jaraco-functools -nh3==0.2.18 +nh3==0.2.20 # via readme-renderer -pkginfo==1.10.0 +packaging==24.2 + # via twine +pkginfo==1.12.0 # via twine pygments==2.18.0 # via @@ -52,15 +48,17 @@ requests-toolbelt==1.0.0 # via twine rfc3986==2.0.0 # via twine -rich==13.8.1 +rich==13.9.4 # via twine -twine==5.1.1 +twine==6.0.1 # via -r ./release.in +typing-extensions==4.12.2 + # via rich urllib3==2.2.3 # via # requests # twine -wheel==0.44.0 +wheel==0.45.1 # via -r ./release.in -zipp==3.20.2 +zipp==3.21.0 # via importlib-metadata diff --git a/requirements/test.txt b/requirements/test.txt index 36a77e0..67d51f9 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,18 +1,14 @@ -# -# This file is autogenerated by pip-compile with Python 3.10 -# by the following command: -# -# pip-compile ./test.in -# +# This file was autogenerated by uv via the following command: +# uv pip compile ./test.in --output-file ./test.txt --no-strip-extras --python-version 3.10 exceptiongroup==1.2.2 # via pytest iniconfig==2.0.0 # via pytest -packaging==24.1 +packaging==24.2 # via pytest pluggy==1.5.0 # via pytest -pytest==8.3.3 +pytest==8.3.4 # via -r ./test.in -tomli==2.0.1 +tomli==2.2.1 # via pytest diff --git a/requirements/validate.txt b/requirements/validate.txt index 6ac5fc8..75b551d 100644 --- a/requirements/validate.txt +++ b/requirements/validate.txt @@ -1,12 +1,8 @@ -# -# This file is autogenerated by pip-compile with Python 3.10 -# by the following command: -# -# pip-compile ./validate.in -# -certifi==2024.8.30 +# This file was autogenerated by uv via the following command: +# uv pip compile ./validate.in --output-file ./validate.txt --no-strip-extras --python-version 3.10 +certifi==2024.12.14 # via requests -charset-normalizer==3.3.2 +charset-normalizer==3.4.0 # via requests click==8.1.7 # via -r ./validate.in diff --git a/scripts/pip-compile.sh b/scripts/pip-compile.sh index 80626d0..ac24cd3 100755 --- a/scripts/pip-compile.sh +++ b/scripts/pip-compile.sh @@ -1,27 +1,23 @@ #!/usr/bin/env bash -pushd ./requirements || exit - -find . -type f -name "*.txt" ! -name "constraints.txt" -exec rm '{}' ';' -find . -type f -name "*.in" -maxdepth 1 -exec pip-compile --upgrade '{}' ';' - -popd || exit +set -e -# Check python version # python version must match lowest supported (3.10) -major=3 -minor=10 +python_version=${UV_PYTHON_VERSION:-"3.10"} -versions=$(cat requirements/* | grep "This file is autogenerated by pip-compile with Python" | awk '{print $NF}' | sort | uniq) -if [[ $(echo $versions | wc -w) -ne 1 ]]; then - echo "Files generated with multiple python version: $versions" +# if major and minor python version (x.y) is not equal to current python_version, error out +if [[ $(python --version | cut -d ' ' -f 2 | cut -d '.' -f 1-2) != $(echo "$python_version" | cut -d '.' -f 1-2) ]]; then + echo "Python version must be $python_version (lowest supported) to be able to pip-compile." exit 1 fi -version_major=$(echo $versions | awk -F"." '{print $1}') -version_minor=$(echo $versions | awk -F"." '{print $2}') +pushd ./requirements || exit -if [[ $major -ne $version_major || $minor -ne $version_minor ]]; then - echo "python version not equal to expected $major.$minor: $versions" - exit 1 -fi +find . -type f -name "*.txt" ! -name "constraints.txt" -exec rm '{}' ';' +find . -type f -name "*.in" -print0 | while read -r -d $'\0' in_file; do + echo "compiling $in_file" + # remove .in extension and add .txt extension + txt_file="${in_file%.in}.txt" + uv pip compile --upgrade "$in_file" --output-file "$txt_file" --no-strip-extras --python-version "$python_version" +done +popd || exit diff --git a/test/test_schema.py b/test/test_schema.py index fef8951..f23a073 100644 --- a/test/test_schema.py +++ b/test/test_schema.py @@ -1,7 +1,6 @@ import inspect from dataclasses import dataclass from enum import Enum -from pathlib import Path from typing import Any, Optional, Union import pytest @@ -10,7 +9,6 @@ from unstructured_ingest.v2.interfaces import FileData import unstructured_platform_plugins.schema.json_schema as js -from unstructured_platform_plugins.etl_uvicorn.utils import get_input_schema from unstructured_platform_plugins.schema.model import is_valid_input_dict, is_valid_response_dict from unstructured_platform_plugins.schema.utils import get_typed_parameters from unstructured_platform_plugins.type_hints import get_type_hints @@ -395,44 +393,6 @@ def fn(q: InputC) -> None: assert is_valid_input_dict(input_schema) -def test_schema_to_base_model(): - class g_enum(Enum): - FIRST = "first" - SECOND = "second" - THIRD = "third" - - def fn( - a: int, - b: float | int = 4, - c: str | None = "my_string", - d: bool = False, - e: Optional[dict[str, Any]] = None, - f: list[float] = None, - g: Optional[g_enum] = None, - h: FileData | None = None, - i: Path | None = None, - ) -> None: - pass - - class ExpectedInputModel(BaseModel): - a: int - b: Union[float, int] = 4 - c: str | None = "my_string" - d: bool = False - e: Optional[dict[str, Any]] = None - f: list[float] = None - g: Optional[g_enum] = None - h: FileData | None = None - i: Path | None = None - - input_schema = get_input_schema(fn) - input_model = js.schema_to_base_model(schema=input_schema) - input_model_schema = input_model.model_json_schema() - expected_model_schema = ExpectedInputModel.model_json_schema() - expected_model_schema["title"] = "reconstructed_model" - assert input_model_schema == expected_model_schema - - # These need to be defined outside the code of test_forward_reference_typing # for references to resolve: @dataclass @@ -485,7 +445,6 @@ def fn(a: FileData) -> list[FileData]: "properties": { "a": { "type": "object", - "is_file_data": True, "properties": { "identifier": {"type": "string"}, "connector_type": {"type": "string"}, @@ -507,7 +466,6 @@ def fn(a: FileData) -> list[FileData]: ], "default": None, }, - "doc_type": {"type": "string", "default": "file"}, "metadata": { "type": "object", "properties": { @@ -560,72 +518,22 @@ def fn(a: FileData) -> list[FileData]: }, }, "required": [], - "default": { - "type": "object", - "properties": { - "url": { - "anyOf": [{"type": "string"}, {"type": "null"}], - "default": None, - }, - "version": { - "anyOf": [{"type": "string"}, {"type": "null"}], - "default": None, - }, - "record_locator": { - "anyOf": [ - { - "type": "object", - "items": {"key": {"type": "string"}, "value": {}}, - }, - {"type": "null"}, - ], - "default": None, - }, - "date_created": { - "anyOf": [{"type": "string"}, {"type": "null"}], - "default": None, - }, - "date_modified": { - "anyOf": [{"type": "string"}, {"type": "null"}], - "default": None, - }, - "date_processed": { - "anyOf": [{"type": "string"}, {"type": "null"}], - "default": None, - }, - "permissions_data": { - "anyOf": [ - { - "type": "array", - "items": { - "type": "object", - "items": {"key": {"type": "string"}, "value": {}}, - }, - }, - {"type": "null"}, - ], - "default": None, - }, - "filesize_bytes": { - "anyOf": [{"type": "integer"}, {"type": "null"}], - "default": None, - }, - }, - "required": [], - }, }, "additional_metadata": { "type": "object", "items": {"key": {"type": "string"}, "value": {}}, - "default": {}, }, "reprocess": {"type": "boolean", "default": False}, "local_download_path": { "anyOf": [{"type": "string"}, {"type": "null"}], "default": None, }, + "display_name": { + "anyOf": [{"type": "string"}, {"type": "null"}], + "default": None, + }, }, - "required": ["identifier", "connector_type"], + "required": ["identifier", "connector_type", "metadata", "additional_metadata"], } }, } @@ -639,7 +547,6 @@ def fn(a: FileData) -> list[FileData]: "type": "array", "items": { "type": "object", - "is_file_data": True, "properties": { "identifier": {"type": "string"}, "connector_type": {"type": "string"}, @@ -661,7 +568,6 @@ def fn(a: FileData) -> list[FileData]: ], "default": None, }, - "doc_type": {"type": "string", "default": "file"}, "metadata": { "type": "object", "properties": { @@ -711,73 +617,21 @@ def fn(a: FileData) -> list[FileData]: }, }, "required": [], - "default": { - "type": "object", - "properties": { - "url": { - "anyOf": [{"type": "string"}, {"type": "null"}], - "default": None, - }, - "version": { - "anyOf": [{"type": "string"}, {"type": "null"}], - "default": None, - }, - "record_locator": { - "anyOf": [ - { - "type": "object", - "items": {"key": {"type": "string"}, "value": {}}, - }, - {"type": "null"}, - ], - "default": None, - }, - "date_created": { - "anyOf": [{"type": "string"}, {"type": "null"}], - "default": None, - }, - "date_modified": { - "anyOf": [{"type": "string"}, {"type": "null"}], - "default": None, - }, - "date_processed": { - "anyOf": [{"type": "string"}, {"type": "null"}], - "default": None, - }, - "permissions_data": { - "anyOf": [ - { - "type": "array", - "items": { - "type": "object", - "items": {"key": {"type": "string"}, "value": {}}, - }, - }, - {"type": "null"}, - ], - "default": None, - }, - "filesize_bytes": { - "anyOf": [{"type": "integer"}, {"type": "null"}], - "default": None, - }, - }, - "required": [], - }, }, "additional_metadata": { "type": "object", "items": {"key": {"type": "string"}, "value": {}}, - "default": {}, }, "reprocess": {"type": "boolean", "default": False}, "local_download_path": { "anyOf": [{"type": "string"}, {"type": "null"}], "default": None, }, + "display_name": {"anyOf": [{"type": "string"}, {"type": "null"}], "default": None}, }, - "required": ["identifier", "connector_type"], + "required": ["identifier", "connector_type", "metadata", "additional_metadata"], }, } + assert output_schema == expected_output_schema assert is_valid_response_dict(output_schema) diff --git a/test/test_utils.py b/test/test_utils.py index bd48316..1d1c944 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -116,7 +116,7 @@ def fn(a: A, b: B, c: MyEnum, d: list, e: FileData) -> None: "b": {"d": True, "e": {"key": "value"}}, "c": MyEnum.VALUE.value, "d": [1, 2, 3], - "e": file_data.to_dict(), + "e": file_data.model_dump(), } mapped_inputs = utils.map_inputs(func=fn, raw_inputs=inputs) diff --git a/unstructured_platform_plugins/schema/json_schema.py b/unstructured_platform_plugins/schema/json_schema.py index f581d19..cba6062 100644 --- a/unstructured_platform_plugins/schema/json_schema.py +++ b/unstructured_platform_plugins/schema/json_schema.py @@ -8,7 +8,7 @@ from pydantic import BaseModel, create_model from pydantic.fields import FieldInfo, PydanticUndefined -from unstructured_ingest.v2.interfaces import FileData +from unstructured_ingest.v2.interfaces import BatchFileData, FileData from unstructured_platform_plugins.schema.utils import TypedParameter from unstructured_platform_plugins.type_hints import get_type_hints @@ -98,9 +98,9 @@ def union_type_to_json_schema(t: UnionType) -> dict: def dataclass_to_json_schema(class_or_instance: Any) -> dict: resp = {"type": "object"} - is_filedata_instance = isinstance(class_or_instance, FileData) - is_filedata_class = class_or_instance is FileData - if is_filedata_instance or is_filedata_class: + if isinstance(class_or_instance, BatchFileData) or class_or_instance is BatchFileData: + resp["is_batch_file_data"] = True + elif isinstance(class_or_instance, FileData) or class_or_instance is FileData: resp["is_file_data"] = True fs = fields(class_or_instance) if not fs: @@ -279,6 +279,8 @@ def schema_to_base_model_type(json_type_name, name: str, type_info: dict) -> Typ t = typed_map_reverse[json_type_name] if t is dict and type_info.get("is_file_data", False): return FileData + if t is dict and type_info.get("is_batch_file_data", False): + return BatchFileData if t is str and type_info.get("is_path", False): return Path if t is dict and "properties" in type_info: From b596077f5782f279a9e4fc16336ef44a1b0fdda7 Mon Sep 17 00:00:00 2001 From: Roman Isecke Date: Wed, 18 Dec 2024 10:29:40 -0500 Subject: [PATCH 2/2] bump changelog --- CHANGELOG.md | 4 ++++ unstructured_platform_plugins/__version__.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cebd389..e91fcdd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.0.14 + +* **Add support for batch file data** + ## 0.0.13 * **Conform to PEP-625 compliance for project naming** diff --git a/unstructured_platform_plugins/__version__.py b/unstructured_platform_plugins/__version__.py index 0128d6e..b3c929d 100644 --- a/unstructured_platform_plugins/__version__.py +++ b/unstructured_platform_plugins/__version__.py @@ -1 +1 @@ -__version__ = "0.0.13" # pragma: no cover +__version__ = "0.0.14" # pragma: no cover