From f8a67fdfb5c68ef4a8a3e14307f4cb8f9314486c Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Tue, 29 Apr 2025 13:03:21 +0200 Subject: [PATCH 1/5] chore: update project structure and add test files for admin and document extractor backends --- .gitignore | 4 + .vscode/launch.json | 17 +++- .vscode/settings.json | 13 +++ admin-backend/Dockerfile | 2 +- admin-backend/pyproject.toml | 84 +++++++++---------- .../tests/{dummy_test.py => dummy1_test.py} | 2 +- conftest.py | 15 ++++ .../tests/{dummy_test.py => dummy3_test.py} | 0 rag-backend/pyproject.toml | 10 +-- .../tests/{dummy_test.py => dummy2_test.py} | 2 +- rag-core-library | 2 +- rag-infrastructure | 2 +- 12 files changed, 97 insertions(+), 56 deletions(-) rename admin-backend/tests/{dummy_test.py => dummy1_test.py} (60%) create mode 100644 conftest.py rename document-extractor/tests/{dummy_test.py => dummy3_test.py} (100%) rename rag-backend/tests/{dummy_test.py => dummy2_test.py} (66%) diff --git a/.gitignore b/.gitignore index 5b32331..48241d4 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,10 @@ pyrightconfig.json notes*.md notes.md +# macOS Finder metadata +.DS_Store +**/.DS_Store + # Node Modules node_modules/ diff --git a/.vscode/launch.json b/.vscode/launch.json index 64d934d..80125b8 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -1,6 +1,21 @@ { "version": "0.2.0", "configurations": [ + { + "name": "Debug Pytest (current file)", + "type": "python", + "request": "launch", + // run pytest as a module + "module": "pytest", + "args": [ + "--maxfail=1", + "--disable-warnings", + "-q", + "${file}" + ], + "console": "integratedTerminal", + "justMyCode": false, + }, { "name": "rag_backend", "type": "python", @@ -94,7 +109,5 @@ } ] } - - ] } diff --git a/.vscode/settings.json b/.vscode/settings.json index 94ff2e2..151bbe2 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -6,6 +6,8 @@ "./rag-core-library/rag-core-api/src", "./rag-core-library/rag-core-lib/src", "./rag-core-library/extractor-api-lib/src", + "./admin-backend", + "./rag-backend" ], "[yaml]": { "editor.tabSize": 2, @@ -13,4 +15,15 @@ "editor.formatOnType": true, "editor.autoIndent": "advanced" }, + "python.testing.pytestEnabled": true, + "python.testing.unittestEnabled": false, + "python.testing.pytestArgs": ["--import-mode","importlib"], + "python.testing.autoTestDiscoverOnSaveEnabled": true, + "python.envFile": "${workspaceFolder}/.env", + "python-envs.defaultEnvManager": "ms-python.python:venv", + "python-envs.defaultPackageManager": "ms-python.python:pip", + "python-envs.pythonProjects": [], } + + + diff --git a/admin-backend/Dockerfile b/admin-backend/Dockerfile index 35aa741..4f4b1c5 100644 --- a/admin-backend/Dockerfile +++ b/admin-backend/Dockerfile @@ -19,7 +19,7 @@ COPY admin-backend/pyproject.toml admin-backend/poetry.lock ./ RUN mkdir log && chmod 700 log RUN touch /app/admin-backend/log/logfile.log && chmod 600 /app/admin-backend/log/logfile.log -RUN poetry config virtualenvs.create false &&\ +RUN poetry config virtualenvs.create false && \ if [ "$dev" = "1" ]; then \ poetry install --no-interaction --no-ansi --no-root --with dev; \ else \ diff --git a/admin-backend/pyproject.toml b/admin-backend/pyproject.toml index 0098e8b..338e57b 100644 --- a/admin-backend/pyproject.toml +++ b/admin-backend/pyproject.toml @@ -1,3 +1,45 @@ +[tool.poetry] +name = "admin_backend" +version = "1.0.0" +description = "The admin backend is responsible for the document management. This includes deletion, upload and getting particular documents or document lists." +authors = ["STACKIT Data and AI Consulting "] +readme = "README.md" + +[tool.poetry.dependencies] +python = "^3.11" +admin-api-lib = {path = "../rag-core-library/admin-api-lib", develop = true} + +[tool.poetry.group.dev.dependencies] +debugpy = "^1.8.1" +pytest = "^8.2.1" +flake8 = "^7.1.0" +flake8-black = "^0.3.6" +flake8-pyproject = "^1.2.3" +coverage = "^7.5.4" +flake8-quotes = "^3.4.0" +flake8-return = "^1.2.0" +flake8-annotations-complexity = "^0.0.8" +flake8-bandit = "^4.1.1" +flake8-bugbear = "^24.8.19" +flake8-builtins = "^2.5.0" +flake8-comprehensions = "^3.15.0" +flake8-eradicate = "^1.5.0" +flake8-expression-complexity = "^0.0.11" +# flake8-logging-format = "^2024.24.12" +# flake8-docstrings = "^1.7.0" +flake8-pytest-style = "^2.0.0" +pep8-naming = "^0.14.1" +flake8-eol = "^0.0.8" +flake8-exceptions = "^0.0.1a0" +flake8-simplify = "^0.21.0" +flake8-wot = "^0.2.0" +flake8-function-order = "^0.0.5" +flake8-tidy-imports = "^4.10.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" + [tool.flake8] exclude= [".eggs", "./rag-core-library/*", "./src/admin_backend/models/*", "./src/admin_backend/rag_backend_client/*", "./src/admin_backend/document_extractor_client/*", ".git", ".hg", ".mypy_cache", ".tox", ".venv", ".devcontainer", "venv", "_build", "buck-out", "build", "dist"] statistics = true @@ -49,45 +91,3 @@ skip_gitignore = true [tool.pylint] max-line-length = 120 - -[tool.poetry] -name = "admin_backend" -version = "0.0.1" -description = "The admin backend is responsible for the document management. This includes deletion, upload and getting particular documents or document lists." -authors = ["STACKIT Data and AI Consulting "] -readme = "README.md" - -[tool.poetry.group.dev.dependencies] -debugpy = "^1.8.1" -pytest = "^8.2.1" -flake8 = "^7.1.0" -flake8-black = "^0.3.6" -flake8-pyproject = "^1.2.3" -coverage = "^7.5.4" -flake8-quotes = "^3.4.0" -flake8-return = "^1.2.0" -flake8-annotations-complexity = "^0.0.8" -flake8-bandit = "^4.1.1" -flake8-bugbear = "^24.8.19" -flake8-builtins = "^2.5.0" -flake8-comprehensions = "^3.15.0" -flake8-eradicate = "^1.5.0" -flake8-expression-complexity = "^0.0.11" -# flake8-logging-format = "^2024.24.12" -# flake8-docstrings = "^1.7.0" -flake8-pytest-style = "^2.0.0" -pep8-naming = "^0.14.1" -flake8-eol = "^0.0.8" -flake8-exceptions = "^0.0.1a0" -flake8-simplify = "^0.21.0" -flake8-wot = "^0.2.0" -flake8-function-order = "^0.0.5" -flake8-tidy-imports = "^4.10.0" - -[build-system] -requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" - -[tool.poetry.dependencies] -python = "^3.11" -admin-api-lib = {path = "../rag-core-library/admin-api-lib", develop = true} diff --git a/admin-backend/tests/dummy_test.py b/admin-backend/tests/dummy1_test.py similarity index 60% rename from admin-backend/tests/dummy_test.py rename to admin-backend/tests/dummy1_test.py index 1428394..7ca7f1b 100644 --- a/admin-backend/tests/dummy_test.py +++ b/admin-backend/tests/dummy1_test.py @@ -1,3 +1,3 @@ -def test_dummy() -> None: +def test_dummy1() -> None: print("Dummy test.") assert True diff --git a/conftest.py b/conftest.py new file mode 100644 index 0000000..e2772f2 --- /dev/null +++ b/conftest.py @@ -0,0 +1,15 @@ +import sys +import os +from pathlib import Path + +# Add project root and specific directories to Python path +project_root = Path(__file__).parent +sys.path.insert(0, str(project_root)) +sys.path.insert(0, str(project_root / "admin-backend")) +sys.path.insert(0, str(project_root / "rag-backend")) +sys.path.insert(0, str(project_root / "document-extractor")) + +# point at each rag-core library's src folder so their packages (admin_api_lib, rag_core_api, etc.) are importable +lib_root = project_root / "rag-core-library" +for lib in ["admin-api-lib", "rag-core-api", "rag-core-lib", "extractor-api-lib"]: + sys.path.insert(0, str(lib_root / lib / "src")) diff --git a/document-extractor/tests/dummy_test.py b/document-extractor/tests/dummy3_test.py similarity index 100% rename from document-extractor/tests/dummy_test.py rename to document-extractor/tests/dummy3_test.py diff --git a/rag-backend/pyproject.toml b/rag-backend/pyproject.toml index 98ef321..7dcba24 100644 --- a/rag-backend/pyproject.toml +++ b/rag-backend/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] -name = "rag-usecase-example" -version = "0.1.0" -description = "" +name = "rag-backend" +version = "1.0.0" +description = "The RAG backend is responsible for handling the interaction with the RAG system." authors = ["STACKIT Data and AI Consulting "] [tool.poetry.dependencies] @@ -39,7 +39,6 @@ flake8-tidy-imports = "^4.10.0" requires = ["poetry-core"] build-backend = "poetry.core.masonry.api" - [tool.flake8] exclude= [".eggs", "./rag-core-library/*", ".git", ".hg", ".mypy_cache", ".tox", ".venv", ".devcontainer", "venv", "_build", "buck-out", "build", "dist"] statistics = true @@ -58,7 +57,6 @@ per-file-ignores = """ ./tests/*: S101, """ - [tool.black] line-length = 120 exclude = """ @@ -80,7 +78,6 @@ exclude = """ )/ """ - [tool.isort] profile = "black" skip = ['.eggs', '.git', '.hg', '.mypy_cache', '.nox', '.pants.d', '.tox', '.venv', '_build', 'buck-out', 'build', 'dist', 'node_modules', 'venv'] @@ -88,4 +85,3 @@ skip_gitignore = true [tool.pylint] max-line-length = 120 - diff --git a/rag-backend/tests/dummy_test.py b/rag-backend/tests/dummy2_test.py similarity index 66% rename from rag-backend/tests/dummy_test.py rename to rag-backend/tests/dummy2_test.py index 6e09897..7d002b3 100644 --- a/rag-backend/tests/dummy_test.py +++ b/rag-backend/tests/dummy2_test.py @@ -1,3 +1,3 @@ -def test_dummy() -> None: +def test_dummy2() -> None: print("Dummy test.") assert True # noqa S101 diff --git a/rag-core-library b/rag-core-library index cf78325..f6eb00b 160000 --- a/rag-core-library +++ b/rag-core-library @@ -1 +1 @@ -Subproject commit cf78325c59920fd31170c20b1244092b572d469d +Subproject commit f6eb00b542b0384345737d21b790ce989c8200af diff --git a/rag-infrastructure b/rag-infrastructure index 1dc80f9..24e755e 160000 --- a/rag-infrastructure +++ b/rag-infrastructure @@ -1 +1 @@ -Subproject commit 1dc80f916c67224901e4fa9dc82a744b065335c2 +Subproject commit 24e755ec4961752a01de4a868df2c7f390194b5d From 0d7de1c4a541a5fac34a59898bf8b85f1df43e05 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Wed, 30 Apr 2025 13:29:15 +0200 Subject: [PATCH 2/5] chore: add Dockerfile, Makefile, and .gitignore; implement initial tests and update README --- confluence-updater/.gitignore | 140 ++++++++++++++++++++++++ confluence-updater/Dockerfile | 70 ++++++++++++ confluence-updater/Makefile | 12 ++ confluence-updater/README.md | 11 ++ confluence-updater/main.py | 1 + confluence-updater/pyproject.toml | 88 +++++++++++++++ confluence-updater/tests/__init__.py | 0 confluence-updater/tests/dummy6_test.py | 3 + rag-core-library | 2 +- 9 files changed, 326 insertions(+), 1 deletion(-) create mode 100644 confluence-updater/.gitignore create mode 100644 confluence-updater/Dockerfile create mode 100644 confluence-updater/Makefile create mode 100644 confluence-updater/README.md create mode 100644 confluence-updater/main.py create mode 100644 confluence-updater/pyproject.toml create mode 100644 confluence-updater/tests/__init__.py create mode 100644 confluence-updater/tests/dummy6_test.py diff --git a/confluence-updater/.gitignore b/confluence-updater/.gitignore new file mode 100644 index 0000000..a77bf09 --- /dev/null +++ b/confluence-updater/.gitignore @@ -0,0 +1,140 @@ +.openapi-generator/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ diff --git a/confluence-updater/Dockerfile b/confluence-updater/Dockerfile new file mode 100644 index 0000000..7415a84 --- /dev/null +++ b/confluence-updater/Dockerfile @@ -0,0 +1,70 @@ +FROM --platform=linux/amd64 python:3.11.7-bookworm AS build + +ARG dev=0 +ENV POETRY_VIRTUALENVS_PATH=/app/document-extractor/.venv +ENV POETRY_VERSION=1.8.3 + +RUN DEBIAN_FRONTEND=noninteractive apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential --no-install-recommends make \ + ffmpeg \ + poppler-utils \ + tesseract-ocr \ + tesseract-ocr-deu \ + tesseract-ocr-eng && \ + python3 -m venv "${POETRY_VIRTUALENVS_PATH}" \ + && $POETRY_VIRTUALENVS_PATH/bin/pip install "poetry==${POETRY_VERSION}" +ENV PATH="${POETRY_VIRTUALENVS_PATH}/bin:$PATH" + +COPY rag-core-library/extractor-api-lib /app/rag-core-library/extractor-api-lib + +WORKDIR /app/document-extractor +COPY document-extractor/pyproject.toml document-extractor/poetry.lock ./ + +RUN mkdir log && chmod 700 log +RUN touch /app/document-extractor/log/logfile.log && chmod 600 /app/document-extractor/log/logfile.log + +RUN poetry config virtualenvs.create false &&\ + if [ "$dev" = "1" ]; then \ + poetry install --no-interaction --no-ansi --no-root --with dev; \ + else \ + poetry install --no-interaction --no-ansi --no-root; \ + fi + +FROM --platform=linux/amd64 python:3.11.7-bookworm +ARG dev=0 + +RUN adduser --disabled-password --gecos "" --uid 65532 nonroot + +ENV POETRY_VIRTUALENVS_PATH=/app/document-extractor/.venv +COPY --from=build --chown=nonroot:nonroot ${POETRY_VIRTUALENVS_PATH} ${POETRY_VIRTUALENVS_PATH} +COPY --from=build /usr/local/bin/ /usr/local/bin/ +COPY --from=build /usr/bin/ /usr/bin/ +COPY --from=build /usr/local/lib/ /usr/local/lib/ +COPY --from=build /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/ +COPY --from=build /usr/share/tesseract-ocr/5/tessdata /usr/share/tesseract-ocr/5/tessdata + +COPY --chown=nonroot:nonroot rag-core-library/extractor-api-lib /app/rag-core-library/extractor-api-lib + +WORKDIR /app/document-extractor + +COPY --chown=nonroot:nonroot document-extractor . + + +# cleanup +RUN apt-get clean autoclean +RUN apt-get autoremove --yes + +RUN if [ "$dev" = "0" ]; then \ + while read -r shell; do rm -f "$shell"; done < /etc/shells; \ + rm -rf /var/lib/{apt,dpkg,cache,log}/ \ + else \ + echo "POETRY_VIRTUALENVS_PATH=/app/document-extractor/.venv" >> /etc/environment;\ + export POETRY_VIRTUALENVS_PATH=/app/document-extractor/.venv;\ + export PATH="${POETRY_VIRTUALENVS_PATH}/bin:$PATH";\ + fi + + +USER nonroot +COPY --from=build --chown=nonroot:nonroot /app/document-extractor/log /app/document-extractor/log + +ENV PATH="${POETRY_VIRTUALENVS_PATH}/bin:${PATH}" diff --git a/confluence-updater/Makefile b/confluence-updater/Makefile new file mode 100644 index 0000000..0daff64 --- /dev/null +++ b/confluence-updater/Makefile @@ -0,0 +1,12 @@ +.PHONY: lint coverage test + +lint: + poetry run flake8 . + +test: + poetry run python -m pytest tests + +coverage: + poetry run coverage run --omit *.pyc --omit *__init__.py --source src/rag_core -m pytest tests + poetry run coverage report -m + poetry run coverage html diff --git a/confluence-updater/README.md b/confluence-updater/README.md new file mode 100644 index 0000000..237f031 --- /dev/null +++ b/confluence-updater/README.md @@ -0,0 +1,11 @@ +# Confluence Updater + + +# Requirements + + +# Deployment +A detailed explanation of the deployment can be found in the [Readme](../README.md) of the project. +The *helm-chart* used for the deployment can be found [here](../helm-chart/charts/admin-backend/). + + diff --git a/confluence-updater/main.py b/confluence-updater/main.py new file mode 100644 index 0000000..3c85496 --- /dev/null +++ b/confluence-updater/main.py @@ -0,0 +1 @@ +from extractor_api_lib.main import app as perfect_extractor_app # noqa: F401 diff --git a/confluence-updater/pyproject.toml b/confluence-updater/pyproject.toml new file mode 100644 index 0000000..d20071d --- /dev/null +++ b/confluence-updater/pyproject.toml @@ -0,0 +1,88 @@ +[tool.flake8] +exclude= [".eggs", "./src/openapi_server/models/*", ".git", ".hg", ".mypy_cache", ".tox", ".venv", ".devcontainer", "venv", "_build", "buck-out", "build", "dist"] +statistics = true +show-source = false +max-complexity = 10 +max-annotations-complexity = 3 +docstring-convent = 'numpy' +max-line-length = 120 +ignore = ["E203", "W503", "E704"] +inline-quotes = '"' +docstring-quotes = '"""' +multiline-quotes = '"""' +dictionaries = ["en_US", "python", "technical", "pandas"] +ban-relative-imports = true + +[tool.black] +line-length = 120 +exclude = """ +/( + | .eggs + | .git + | .hg + | .mypy_cache + | .nox + | .pants.d + | .tox + | .venv + | _build + | buck-out + | build + | dist + | node_modules + | venv +)/ +""" + +[tool.isort] +profile = "black" +skip = ['.eggs', '.git', '.hg', '.mypy_cache', '.nox', '.pants.d', '.tox', '.venv', '_build', 'buck-out', 'build', 'dist', 'node_modules', 'venv'] +skip_gitignore = true + +[tool.pylint] +max-line-length = 120 + +[tool.poetry] +name = "pdfextractor_server" +version = "0.0.0" +description = "Extracts the content of pdf documents." +authors = ["STACKIT Data and AI Consulting "] +readme = "README.md" + +[tool.poetry.dependencies] +python = ">=3.11,<3.12" +extractor-api-lib = {path = "../rag-core-library/extractor-api-lib", develop = true} + +[tool.poetry.group.dev.dependencies] +flake8 = "^7.0.0" +pytest = "^8.0.1" +black = "^24.2.0" +flake8-black = "^0.3.6" +flake8-pyproject = "^1.2.3" +coverage = "^7.5.4" +flake8-quotes = "^3.4.0" +flake8-return = "^1.2.0" +flake8-annotations-complexity = "^0.0.8" +flake8-bandit = "^4.1.1" +flake8-bugbear = "^24.8.19" +flake8-builtins = "^2.5.0" +flake8-comprehensions = "^3.15.0" +flake8-eradicate = "^1.5.0" +flake8-expression-complexity = "^0.0.11" +flake8-pytest-style = "^2.0.0" +pep8-naming = "^0.14.1" +flake8-eol = "^0.0.8" +flake8-exceptions = "^0.0.1a0" +flake8-simplify = "^0.21.0" +flake8-wot = "^0.2.0" +flake8-function-order = "^0.0.5" +flake8-tidy-imports = "^4.10.0" +# flake8-logging-format = "^2024.24.12" +# flake8-docstrings = "^1.7.0" + +[tool.poetry.group.tests.dependencies] +httpx = "^0.26.0" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/confluence-updater/tests/__init__.py b/confluence-updater/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/confluence-updater/tests/dummy6_test.py b/confluence-updater/tests/dummy6_test.py new file mode 100644 index 0000000..6e09897 --- /dev/null +++ b/confluence-updater/tests/dummy6_test.py @@ -0,0 +1,3 @@ +def test_dummy() -> None: + print("Dummy test.") + assert True # noqa S101 diff --git a/rag-core-library b/rag-core-library index f6eb00b..a5d45b9 160000 --- a/rag-core-library +++ b/rag-core-library @@ -1 +1 @@ -Subproject commit f6eb00b542b0384345737d21b790ce989c8200af +Subproject commit a5d45b9ae0018bdaabc4f4c2169cd737f664900c From e98f145c31f702e924a01879b648797f101266d9 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 2 May 2025 09:55:53 +0200 Subject: [PATCH 3/5] chore: remove unused files including Dockerfile, Makefile, README, and .gitignore --- confluence-updater/.gitignore | 140 ------------------------ confluence-updater/Dockerfile | 70 ------------ confluence-updater/Makefile | 12 -- confluence-updater/README.md | 11 -- confluence-updater/main.py | 1 - confluence-updater/pyproject.toml | 88 --------------- confluence-updater/tests/__init__.py | 0 confluence-updater/tests/dummy6_test.py | 3 - 8 files changed, 325 deletions(-) delete mode 100644 confluence-updater/.gitignore delete mode 100644 confluence-updater/Dockerfile delete mode 100644 confluence-updater/Makefile delete mode 100644 confluence-updater/README.md delete mode 100644 confluence-updater/main.py delete mode 100644 confluence-updater/pyproject.toml delete mode 100644 confluence-updater/tests/__init__.py delete mode 100644 confluence-updater/tests/dummy6_test.py diff --git a/confluence-updater/.gitignore b/confluence-updater/.gitignore deleted file mode 100644 index a77bf09..0000000 --- a/confluence-updater/.gitignore +++ /dev/null @@ -1,140 +0,0 @@ -.openapi-generator/ - -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ diff --git a/confluence-updater/Dockerfile b/confluence-updater/Dockerfile deleted file mode 100644 index 7415a84..0000000 --- a/confluence-updater/Dockerfile +++ /dev/null @@ -1,70 +0,0 @@ -FROM --platform=linux/amd64 python:3.11.7-bookworm AS build - -ARG dev=0 -ENV POETRY_VIRTUALENVS_PATH=/app/document-extractor/.venv -ENV POETRY_VERSION=1.8.3 - -RUN DEBIAN_FRONTEND=noninteractive apt-get update \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y build-essential --no-install-recommends make \ - ffmpeg \ - poppler-utils \ - tesseract-ocr \ - tesseract-ocr-deu \ - tesseract-ocr-eng && \ - python3 -m venv "${POETRY_VIRTUALENVS_PATH}" \ - && $POETRY_VIRTUALENVS_PATH/bin/pip install "poetry==${POETRY_VERSION}" -ENV PATH="${POETRY_VIRTUALENVS_PATH}/bin:$PATH" - -COPY rag-core-library/extractor-api-lib /app/rag-core-library/extractor-api-lib - -WORKDIR /app/document-extractor -COPY document-extractor/pyproject.toml document-extractor/poetry.lock ./ - -RUN mkdir log && chmod 700 log -RUN touch /app/document-extractor/log/logfile.log && chmod 600 /app/document-extractor/log/logfile.log - -RUN poetry config virtualenvs.create false &&\ - if [ "$dev" = "1" ]; then \ - poetry install --no-interaction --no-ansi --no-root --with dev; \ - else \ - poetry install --no-interaction --no-ansi --no-root; \ - fi - -FROM --platform=linux/amd64 python:3.11.7-bookworm -ARG dev=0 - -RUN adduser --disabled-password --gecos "" --uid 65532 nonroot - -ENV POETRY_VIRTUALENVS_PATH=/app/document-extractor/.venv -COPY --from=build --chown=nonroot:nonroot ${POETRY_VIRTUALENVS_PATH} ${POETRY_VIRTUALENVS_PATH} -COPY --from=build /usr/local/bin/ /usr/local/bin/ -COPY --from=build /usr/bin/ /usr/bin/ -COPY --from=build /usr/local/lib/ /usr/local/lib/ -COPY --from=build /usr/lib/x86_64-linux-gnu/ /usr/lib/x86_64-linux-gnu/ -COPY --from=build /usr/share/tesseract-ocr/5/tessdata /usr/share/tesseract-ocr/5/tessdata - -COPY --chown=nonroot:nonroot rag-core-library/extractor-api-lib /app/rag-core-library/extractor-api-lib - -WORKDIR /app/document-extractor - -COPY --chown=nonroot:nonroot document-extractor . - - -# cleanup -RUN apt-get clean autoclean -RUN apt-get autoremove --yes - -RUN if [ "$dev" = "0" ]; then \ - while read -r shell; do rm -f "$shell"; done < /etc/shells; \ - rm -rf /var/lib/{apt,dpkg,cache,log}/ \ - else \ - echo "POETRY_VIRTUALENVS_PATH=/app/document-extractor/.venv" >> /etc/environment;\ - export POETRY_VIRTUALENVS_PATH=/app/document-extractor/.venv;\ - export PATH="${POETRY_VIRTUALENVS_PATH}/bin:$PATH";\ - fi - - -USER nonroot -COPY --from=build --chown=nonroot:nonroot /app/document-extractor/log /app/document-extractor/log - -ENV PATH="${POETRY_VIRTUALENVS_PATH}/bin:${PATH}" diff --git a/confluence-updater/Makefile b/confluence-updater/Makefile deleted file mode 100644 index 0daff64..0000000 --- a/confluence-updater/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -.PHONY: lint coverage test - -lint: - poetry run flake8 . - -test: - poetry run python -m pytest tests - -coverage: - poetry run coverage run --omit *.pyc --omit *__init__.py --source src/rag_core -m pytest tests - poetry run coverage report -m - poetry run coverage html diff --git a/confluence-updater/README.md b/confluence-updater/README.md deleted file mode 100644 index 237f031..0000000 --- a/confluence-updater/README.md +++ /dev/null @@ -1,11 +0,0 @@ -# Confluence Updater - - -# Requirements - - -# Deployment -A detailed explanation of the deployment can be found in the [Readme](../README.md) of the project. -The *helm-chart* used for the deployment can be found [here](../helm-chart/charts/admin-backend/). - - diff --git a/confluence-updater/main.py b/confluence-updater/main.py deleted file mode 100644 index 3c85496..0000000 --- a/confluence-updater/main.py +++ /dev/null @@ -1 +0,0 @@ -from extractor_api_lib.main import app as perfect_extractor_app # noqa: F401 diff --git a/confluence-updater/pyproject.toml b/confluence-updater/pyproject.toml deleted file mode 100644 index d20071d..0000000 --- a/confluence-updater/pyproject.toml +++ /dev/null @@ -1,88 +0,0 @@ -[tool.flake8] -exclude= [".eggs", "./src/openapi_server/models/*", ".git", ".hg", ".mypy_cache", ".tox", ".venv", ".devcontainer", "venv", "_build", "buck-out", "build", "dist"] -statistics = true -show-source = false -max-complexity = 10 -max-annotations-complexity = 3 -docstring-convent = 'numpy' -max-line-length = 120 -ignore = ["E203", "W503", "E704"] -inline-quotes = '"' -docstring-quotes = '"""' -multiline-quotes = '"""' -dictionaries = ["en_US", "python", "technical", "pandas"] -ban-relative-imports = true - -[tool.black] -line-length = 120 -exclude = """ -/( - | .eggs - | .git - | .hg - | .mypy_cache - | .nox - | .pants.d - | .tox - | .venv - | _build - | buck-out - | build - | dist - | node_modules - | venv -)/ -""" - -[tool.isort] -profile = "black" -skip = ['.eggs', '.git', '.hg', '.mypy_cache', '.nox', '.pants.d', '.tox', '.venv', '_build', 'buck-out', 'build', 'dist', 'node_modules', 'venv'] -skip_gitignore = true - -[tool.pylint] -max-line-length = 120 - -[tool.poetry] -name = "pdfextractor_server" -version = "0.0.0" -description = "Extracts the content of pdf documents." -authors = ["STACKIT Data and AI Consulting "] -readme = "README.md" - -[tool.poetry.dependencies] -python = ">=3.11,<3.12" -extractor-api-lib = {path = "../rag-core-library/extractor-api-lib", develop = true} - -[tool.poetry.group.dev.dependencies] -flake8 = "^7.0.0" -pytest = "^8.0.1" -black = "^24.2.0" -flake8-black = "^0.3.6" -flake8-pyproject = "^1.2.3" -coverage = "^7.5.4" -flake8-quotes = "^3.4.0" -flake8-return = "^1.2.0" -flake8-annotations-complexity = "^0.0.8" -flake8-bandit = "^4.1.1" -flake8-bugbear = "^24.8.19" -flake8-builtins = "^2.5.0" -flake8-comprehensions = "^3.15.0" -flake8-eradicate = "^1.5.0" -flake8-expression-complexity = "^0.0.11" -flake8-pytest-style = "^2.0.0" -pep8-naming = "^0.14.1" -flake8-eol = "^0.0.8" -flake8-exceptions = "^0.0.1a0" -flake8-simplify = "^0.21.0" -flake8-wot = "^0.2.0" -flake8-function-order = "^0.0.5" -flake8-tidy-imports = "^4.10.0" -# flake8-logging-format = "^2024.24.12" -# flake8-docstrings = "^1.7.0" - -[tool.poetry.group.tests.dependencies] -httpx = "^0.26.0" - -[build-system] -requires = ["poetry-core"] -build-backend = "poetry.core.masonry.api" diff --git a/confluence-updater/tests/__init__.py b/confluence-updater/tests/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/confluence-updater/tests/dummy6_test.py b/confluence-updater/tests/dummy6_test.py deleted file mode 100644 index 6e09897..0000000 --- a/confluence-updater/tests/dummy6_test.py +++ /dev/null @@ -1,3 +0,0 @@ -def test_dummy() -> None: - print("Dummy test.") - assert True # noqa S101 From 403501b415235e483558c7ac6cb9c7715b7abc03 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Fri, 2 May 2025 08:00:24 +0000 Subject: [PATCH 4/5] chore: update submodules to latest main --- rag-core-library | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rag-core-library b/rag-core-library index a5d45b9..cf78325 160000 --- a/rag-core-library +++ b/rag-core-library @@ -1 +1 @@ -Subproject commit a5d45b9ae0018bdaabc4f4c2169cd737f664900c +Subproject commit cf78325c59920fd31170c20b1244092b572d469d From 9738cdb9486f8d5d88e3ab301e1fbfe041a05415 Mon Sep 17 00:00:00 2001 From: Andreas Klos Date: Fri, 23 May 2025 07:18:52 +0200 Subject: [PATCH 5/5] chore: add CONFLUENCE_MAX_PAGES environment variable support and update subproject commits --- .gitignore | 1 + Tiltfile | 6 ++++-- rag-core-library | 2 +- rag-infrastructure | 2 +- 4 files changed, 7 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 48241d4..546cf9a 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ helm-chart/charts/rag-0.0.1.tgz pyrightconfig.json notes*.md notes.md +*todo*.md # macOS Finder metadata .DS_Store diff --git a/Tiltfile b/Tiltfile index 93b57ae..de6360c 100644 --- a/Tiltfile +++ b/Tiltfile @@ -132,7 +132,6 @@ local_resource( allow_parallel=True, ) - ######################################################################################################################## ################################## build backend_rag image and do live update ########################################## ######################################################################################################################## @@ -355,6 +354,9 @@ if has_confluence_config(): if os.environ.get("CONFLUENCE_DOCUMENT_NAME"): document_names = os.environ["CONFLUENCE_DOCUMENT_NAME"].replace(",", "\\,") confluence_settings.append("adminBackend.envs.confluenceLoader.CONFLUENCE_DOCUMENT_NAME=%s" % document_names) + if os.environ.get("CONFLUENCE_MAX_PAGES"): + max_pages = os.environ["CONFLUENCE_MAX_PAGES"].replace(",", "\\,") + confluence_settings.append("adminBackend.envs.confluenceLoader.CONFLUENCE_MAX_PAGES=%s" % max_pages) value_override.extend(confluence_settings) if os.environ.get("STACKIT_VLLM_API_KEY", False): @@ -373,7 +375,7 @@ if os.environ.get("STACKIT_EMBEDDER_API_KEY", False): yaml = helm( "./rag-infrastructure/rag", name="rag", - namespace="rag", + namespace=namespace, values=[ "./rag-infrastructure/rag/values.yaml", ], diff --git a/rag-core-library b/rag-core-library index a5d45b9..74f45e7 160000 --- a/rag-core-library +++ b/rag-core-library @@ -1 +1 @@ -Subproject commit a5d45b9ae0018bdaabc4f4c2169cd737f664900c +Subproject commit 74f45e729d71627008d19903874238af2e3aca35 diff --git a/rag-infrastructure b/rag-infrastructure index 24e755e..29b455d 160000 --- a/rag-infrastructure +++ b/rag-infrastructure @@ -1 +1 @@ -Subproject commit 24e755ec4961752a01de4a868df2c7f390194b5d +Subproject commit 29b455d8a2d1b433c5076881751b7e7df9f9c7f2