Commit 88a552b (parent: 27c218a)

ML/LlamaIndex: Add software tests and CI configuration

6 files changed: 180 additions, 0 deletions


.github/dependabot.yml

Lines changed: 5 additions & 0 deletions

@@ -114,6 +114,11 @@ updates:
     schedule:
       interval: "daily"

+  - directory: "/topic/machine-learning/llama-index"
+    package-ecosystem: "pip"
+    schedule:
+      interval: "daily"
+
   - directory: "/topic/machine-learning/mlops-mlflow"
     package-ecosystem: "pip"
     schedule:

.github/workflows/ml-llamaindex.yml

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
name: LlamaIndex

on:
  pull_request:
    branches: ~
    paths:
    - '.github/workflows/ml-llamaindex.yml'
    - 'topic/machine-learning/llama-index/**'
    - '/requirements.txt'
  push:
    branches: [ main ]
    paths:
    - '.github/workflows/ml-llamaindex.yml'
    - 'topic/machine-learning/llama-index/**'
    - '/requirements.txt'

  # Allow job to be triggered manually.
  workflow_dispatch:

  # Run job each night after CrateDB nightly has been published.
  schedule:
    - cron: '0 3 * * *'

# Cancel in-progress jobs when pushing to the same branch.
concurrency:
  cancel-in-progress: true
  group: ${{ github.workflow }}-${{ github.ref }}

jobs:
  test:
    name: "
      Python: ${{ matrix.python-version }}
      CrateDB: ${{ matrix.cratedb-version }}
      on ${{ matrix.os }}"
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [
          'ubuntu-latest',
        ]
        python-version: [
          '3.8',
          '3.13',
        ]
        cratedb-version: [ 'nightly' ]

    services:
      cratedb:
        image: crate/crate:${{ matrix.cratedb-version }}
        ports:
          - 4200:4200
          - 5432:5432
        env:
          CRATE_HEAP_SIZE: 4g

    env:
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}

    steps:

      - name: Acquire sources
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          architecture: x64
          cache: 'pip'
          cache-dependency-path: |
            requirements.txt
            topic/machine-learning/llama-index/requirements.txt
            topic/machine-learning/llama-index/requirements-dev.txt

      - name: Install utilities
        run: |
          pip install -r requirements.txt

      - name: Validate topic/machine-learning/llama-index
        run: |
          ngr test --accept-no-venv topic/machine-learning/llama-index
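
Note: the job relies on the CrateDB service container answering on localhost:4200 before `ngr test` starts, and no health check is configured for the service container here. When reproducing the setup locally, a minimal readiness probe along these lines can help; this is only a sketch using the standard library, and the URL and timeout are assumptions, not part of this commit:

import time
import urllib.request


def wait_for_cratedb(url="http://localhost:4200/", timeout=60):
    """Poll CrateDB's HTTP endpoint until it responds, or give up."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        try:
            with urllib.request.urlopen(url, timeout=5) as response:
                if response.status == 200:
                    return
        except OSError:
            time.sleep(2)
    raise TimeoutError(f"CrateDB did not become available at {url}")


if __name__ == "__main__":
    wait_for_cratedb()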
topic/machine-learning/llama-index/init.sql

Lines changed: 23 additions & 0 deletions

@@ -0,0 +1,23 @@
CREATE TABLE IF NOT EXISTS time_series_data (
    timestamp TIMESTAMP,
    value DOUBLE,
    location STRING,
    sensor_id INT
);

INSERT INTO time_series_data (timestamp, value, location, sensor_id)
VALUES
    ('2023-09-14T00:00:00', 10.5, 'Sensor A', 1),
    ('2023-09-14T01:00:00', 15.2, 'Sensor A', 1),
    ('2023-09-14T02:00:00', 18.9, 'Sensor A', 1),
    ('2023-09-14T03:00:00', 12.7, 'Sensor B', 2),
    ('2023-09-14T04:00:00', 17.3, 'Sensor B', 2),
    ('2023-09-14T05:00:00', 20.1, 'Sensor B', 2),
    ('2023-09-14T06:00:00', 22.5, 'Sensor A', 1),
    ('2023-09-14T07:00:00', 18.3, 'Sensor A', 1),
    ('2023-09-14T08:00:00', 16.8, 'Sensor A', 1),
    ('2023-09-14T09:00:00', 14.6, 'Sensor B', 2),
    ('2023-09-14T10:00:00', 13.2, 'Sensor B', 2),
    ('2023-09-14T11:00:00', 11.7, 'Sensor B', 2);

REFRESH TABLE time_series_data;
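
For reference, the seed data is consistent with the assertion in the accompanying test: sensor 1 contributes the values 10.5, 15.2, 18.9, 22.5, 18.3, and 16.8, whose mean is 102.2 / 6 ≈ 17.03. A small sketch for cross-checking that figure against a locally running CrateDB, assuming the `crate` Python driver is installed (it is not listed explicitly in this commit):

from crate import client

# Connect to the CrateDB instance exposed on the default HTTP port.
connection = client.connect("http://localhost:4200", username="crate")
cursor = connection.cursor()

# Compute the same aggregate the example workload asks about.
cursor.execute("SELECT AVG(value) FROM time_series_data WHERE sensor_id = 1")
average = cursor.fetchone()[0]
print(f"Average value for sensor 1: {average:.2f}")  # expected: 17.03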
topic/machine-learning/llama-index/pyproject.toml

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
[tool.pytest.ini_options]
minversion = "2.0"
addopts = """
-rfEX -p pytester --strict-markers --verbosity=3 --capture=no
--cov=. --cov-report=term-missing --cov-report=xml
"""

#log_level = "DEBUG"
#log_cli_level = "DEBUG"

testpaths = [
    "*.py",
]
xfail_strict = true
markers = [
]

[tool.coverage.run]
branch = false

[tool.coverage.report]
fail_under = 0
show_missing = true
omit = [
    "conftest.py",
    "test*.py",
]
topic/machine-learning/llama-index/requirements-dev.txt

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
cratedb-toolkit
pueblo[testing]
sqlparse
Lines changed: 40 additions & 0 deletions

@@ -0,0 +1,40 @@
from pathlib import Path

import pytest

from cratedb_toolkit.io.sql import DatabaseAdapter
from dotenv import load_dotenv

HERE = Path(__file__).parent


@pytest.fixture()
def cratedb() -> DatabaseAdapter:
    return DatabaseAdapter(dburi="crate://crate@localhost:4200")


@pytest.fixture(scope="function", autouse=True)
def init_database(cratedb):
    """
    Initialize database.
    """
    cratedb.run_sql("DROP TABLE IF EXISTS time_series_data;")
    cratedb.run_sql((HERE / "init.sql").read_text())


def test_main(cratedb, capsys):
    """
    Execute `main.py` and verify outcome.
    """

    # Load the standalone configuration also for software testing.
    # On CI, `OPENAI_API_KEY` will need to be supplied externally.
    load_dotenv("env.standalone")

    # Invoke the workload, in-process.
    from main import main
    main()

    # Verify the outcome.
    out = capsys.readouterr().out
    assert "Answer was: The average value for sensor 1 is approximately 17.03." in out
