Merge pull request #41 from bedrock-engineer/test-kaitak-hk-with-file-input

JoostGevaert · web-flow · commit 826ff0e2c709 · 2025-05-26T17:39:19.000+02:00
Test kaitak hk marimo notebook
diff --git a/examples/hk_kaitak_ags3/hk_kaitak_ags3_to_brgi_geodb.py b/examples/hk_kaitak_ags3/hk_kaitak_ags3_to_brgi_geodb.py
diff --git a/pyproject.toml b/pyproject.toml
@@ -72,20 +72,16 @@ Tracker = "https://github.com/bedrock-engineer/bedrock-ge/issues"
 
 [dependency-groups]
 dev = [
-    "duckdb>=1.2.2",
     "frictionless[excel]>=4.40.8",
-    "jupyter>=1.1.1",
-    "marimo>=0.12.5",
+    "marimo[recommended]>=0.13.11",
     "mypy>=1.11.2",
-    "nbconvert>=7.16.6",
     "pandas-stubs>=2.2.2.240807",
-    "ruff>=0.6.7",
-    "sqlglot>=26.12.1",
 ]
 
 tests = [
     "folium>=0.17.0",
     "mapclassify>=2.8.1",
+    "marimo>=0.13.11",
     "matplotlib>=3.9.2",
     "pytest>=8.3.3",
 ]
diff --git a/sandbox/data_validation/try_pandera.ipynb b/sandbox/data_validation/try_pandera.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "efd86e9f",
    "metadata": {},
    "outputs": [],
@@ -12,7 +12,7 @@
     "from pprint import pprint\n",
     "\n",
     "import pandas as pd\n",
-    "import pandera as pa\n",
+    "import pandera.pandas as pa\n",
     "from pandera.typing import DataFrame, Series"
    ]
   },
diff --git a/src/bedrock_ge/gi/ags/read.py b/src/bedrock_ge/gi/ags/read.py
@@ -1,8 +1,8 @@
 from __future__ import annotations
 
+import codecs
 import io
 from contextlib import contextmanager, nullcontext
-from io import TextIOBase
 from pathlib import Path
 from typing import IO, Any, ContextManager, Dict, List
 
@@ -20,21 +20,20 @@ def detect_encoding(source: str | Path | IO[str] | IO[bytes] | bytes) -> str:
 
     Args:
         source (str | Path | IO[str] | IO[bytes] | bytes): The source to detect encoding from.
-            - str: Treated as a file path if it exists, otherwise as text (returns `DEFAULT_ENCODING`)
-            - Path: File path to read and detect encoding
+            - str or Path: File path.
             - IO[str]: Already decoded text stream (returns `DEFAULT_ENCODING`)
             - IO[bytes]: Binary stream to detect encoding from
             - bytes: Binary data to detect encoding from
 
     Returns:
-        str: The detected encoding name (e.g., 'utf-8', 'iso-8859-1', etc.)
+        str: The detected encoding name (e.g., 'utf-8', 'iso-8859-1', 'ascii', etc.)
 
     Raises:
         TypeError: If the source type is unsupported
         FileNotFoundError: If a file path doesn't exist
     """
     # Set number of bytes to read for detection and required confidence
-    SAMPLE_SIZE = 10000
+    SAMPLE_SIZE = 1_000_000
     REQUIRED_CONFIDENCE = 0.7
 
     def _detect_from_bytes(data: bytes) -> str:
@@ -47,6 +46,9 @@ def _detect_from_bytes(data: bytes) -> str:
         if not encoding or confidence < REQUIRED_CONFIDENCE:
             return DEFAULT_ENCODING
 
+        if encoding.lower() == "ascii":
+            return "utf-8"
+
         return encoding
 
     def _read_from_path(path: Path):
@@ -79,18 +81,17 @@ def _read_from_path(path: Path):
     # IO[str] object
     if hasattr(source, "encoding"):
         if source.encoding:
-            # Could be `None`
+            # The attribute can be `None` (e.g. on io.StringIO), hence the truthiness check above.
             return source.encoding
         else:
             return DEFAULT_ENCODING
 
     # IO[bytes]
-    if isinstance(source, io.BytesIO):
-        original_position = source.tell()
+    if isinstance(source, io.BufferedIOBase):
         try:
+            original_position = source.tell()
             source.seek(0)
             sample = source.read(SAMPLE_SIZE)
-            encoding = _detect_from_bytes(sample)
             if isinstance(sample, bytes):
                 encoding = _detect_from_bytes(sample)
             else:
@@ -105,9 +106,9 @@ def _read_from_path(path: Path):
     raise TypeError(f"Unsupported input type for encoding detection: {type(source)}")
 
 
-def read_ags_source(
+def open_ags_source(
     source: str | Path | IO[str] | IO[bytes] | bytes, encoding=None
-) -> ContextManager[TextIOBase]:
+) -> ContextManager[io.TextIOBase]:
     """Opens or wraps a given source for reading AGS (text-based) data.
 
     Args:
@@ -124,41 +125,42 @@ def read_ags_source(
     Raises:
         TypeError: If the source type is unsupported or binary streams are not decoded.
     """
+    try:
+        codecs.lookup(encoding)
+    except LookupError:
+        raise ValueError(f"Unsupported encoding: {encoding}")
 
     @contextmanager
-    def string_source(content: str):
-        string_io = io.StringIO(content)
+    def _bytes_source(bytes_content: bytes):
+        string_io = io.StringIO(bytes_content.decode(encoding))
         try:
             yield string_io
         finally:
             string_io.close()
 
-    if isinstance(source, str):
+    if isinstance(source, (str, Path)):
         path = Path(source)
         if path.exists() and path.is_file():
             return open(path, "r", encoding=encoding)
         raise FileNotFoundError(f"Path does not exist or is not a file: {source}")
 
-    elif isinstance(source, Path):
-        if source.exists() and source.is_file():
-            return open(source, "r", encoding=encoding)
-        raise FileNotFoundError(f"Path does not exist or is not a file: {source}")
-
-    elif isinstance(source, bytes):
-        return string_source(source.decode(encoding))
+    elif isinstance(source, io.TextIOBase):
+        source.seek(0)
+        return nullcontext(source)
 
-    elif isinstance(source, io.BytesIO):
-        return string_source(source.getvalue().decode(encoding))
+    elif isinstance(source, io.BufferedIOBase):
+        text_stream = io.TextIOWrapper(source, encoding=encoding)
+        text_stream.seek(0)
+        return nullcontext(text_stream)
 
-    elif hasattr(source, "read"):
-        # reset the cursor to the beginning
-        try:
-            source.seek(0)
-        except (AttributeError, io.UnsupportedOperation):
-            pass
-        return nullcontext(source)
+    elif isinstance(source, bytes):
+        return _bytes_source(source)
 
-    raise TypeError(f"Unsupported input type: {type(source)}")
+    else:
+        raise TypeError(
+            f"Unsupported source type: {type(source)}. "
+            "Expected str, Path, IO[str], IO[bytes], or bytes."
+        )
 
 
 def ags_to_dfs(
@@ -179,15 +181,11 @@ def ags_to_dfs(
         Dict[str, pd.DataFrame]: A dictionary where keys represent AGS group
             names and values are the DataFrames holding the corresponding group data.
     """
-    # if bytes are provided, convert to IO[bytes] to be file-like
-    if isinstance(source, bytes):
-        source = io.BytesIO(source)
-
     if not encoding:
         encoding = detect_encoding(source)
 
     # Get first non-blank line, `None` if all lines are blank
-    with read_ags_source(source, encoding=encoding) as f:
+    with open_ags_source(source, encoding=encoding) as f:
         first_line = next((line.strip() for line in f if line.strip()), None)
 
     if first_line:
@@ -239,7 +237,7 @@ def ags3_to_dfs(
     headers: List[str] = ["", "", ""]
     group_data: List[List[Any]] = [[], [], []]
 
-    with read_ags_source(source, encoding=encoding) as file:
+    with open_ags_source(source, encoding=encoding) as file:
         for i, line in enumerate(file):
             line = line.strip()
             last_line_type = line_type
@@ -333,7 +331,7 @@ def ags4_to_dfs(
             object that represents an AGS4 file.
 
     Returns:
-        Dict[str, pd.DataFrame]: A dictionary of pandas DataFrames, where each key 
+        Dict[str, pd.DataFrame]: A dictionary of pandas DataFrames, where each key
             represents a group name from AGS 4 data, and the corresponding value is a
             pandas DataFrame containing the data for that group.
     """
diff --git a/src/bedrock_ge/gi/ags/schemas.py b/src/bedrock_ge/gi/ags/schemas.py
@@ -1,4 +1,4 @@
-import pandera as pa
+import pandera.pandas as pa
 from pandera.typing import Series
 
 
diff --git a/src/bedrock_ge/gi/ags/transform.py b/src/bedrock_ge/gi/ags/transform.py
@@ -3,7 +3,7 @@
 from typing import Dict
 
 import pandas as pd
-import pandera as pa
+import pandera.pandas as pa
 from pandera.typing import DataFrame
 from pyproj import CRS
 
diff --git a/src/bedrock_ge/gi/gis_geometry.py b/src/bedrock_ge/gi/gis_geometry.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 from typing import Dict, Tuple, Union
 
 import geopandas as gpd
diff --git a/src/bedrock_ge/gi/schemas.py b/src/bedrock_ge/gi/schemas.py
@@ -2,7 +2,7 @@
 
 from typing import Optional
 
-import pandera as pa
+import pandera.pandas as pa
 from pandera.typing import Series
 from pandera.typing.geopandas import GeoSeries
 
diff --git a/tests/test_bedrock_ge/gi/test_ags.py b/tests/test_bedrock_ge/gi/test_ags.py
@@ -52,15 +52,15 @@ def test_detect_encoding():
     ags4_bio = io.BytesIO(ags4_byte)
 
     sources = {
-        ags3: ags3_encoding,
+        ags3: default_encoding,
         ags4: ags4_encoding,
-        ags3_path: ags3_encoding,
+        ags3_path: default_encoding,
         ags4_path: ags4_encoding,
-        ags3_byte: ags3_encoding,
+        ags3_byte: default_encoding,
         ags4_byte: ags4_encoding,
         ags3_sio: default_encoding,
         ags4_sio: default_encoding,
-        ags3_bio: ags3_encoding,
+        ags3_bio: default_encoding,
         ags4_bio: ags4_encoding,
     }
     for source, expected in sources.items():
diff --git a/tests/test_examples/test_hk_kaitak_ags3_to_brgi_geodb.py b/tests/test_examples/test_hk_kaitak_ags3_to_brgi_geodb.py
@@ -2,6 +2,7 @@
 import shutil
 import sqlite3
 import subprocess
+import sys
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
@@ -39,13 +40,16 @@ def test_kaitak_ags3_notebook_runs_and_creates_gpkg(examples_dir):
         # TODO: implement logging
         # NOTE: The env (environment variables) and encoding are required for running
         # the notebook as a script on both Windows and Linux; otherwise it fails with a UnicodeDecodeError.
-        # NOTE: `uvx uv run` runs the marimo notebook as a script in a temporary environment,
+        # NOTE: `(uvx) uv run` runs the marimo notebook as a script in a temporary environment,
         # with the Python version and dependencies specified in the PEP 723 inline script metadata.
+        # The issue with this approach is that it uses the latest version of bedrock-ge,
+        # rather than the current code in this repo.
         env = os.environ.copy()
         env["PYTHONIOENCODING"] = "utf-8"
         result = subprocess.run(
             # ["uvx", "uv", "run", "--no-project", "--no-cache", str(notebook_path)],
-            ["uv", "run", str(notebook_path)],
+            # ["uv", "run", str(notebook_path)],
+            [sys.executable, str(notebook_path)],
             check=False,
             capture_output=True,
             text=True,
@@ -55,7 +59,7 @@ def test_kaitak_ags3_notebook_runs_and_creates_gpkg(examples_dir):
 
         # Check that the script ran successfully
         assert result.returncode == 0, (
-            f"📛 Running `uvx run marimo notebook.py` failed with code {result.returncode}\n"
+            f"📛 Running `python notebook.py` failed with code {result.returncode}\n"
             f"📄 STDOUT:\n{result.stdout}\n"
             f"⚠️ STDERR:\n{result.stderr}"
         )
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-import pandera as pa`
	`1`	`+import pandera.pandas as pa`
`2`	`2`	`from pandera.typing import Series`
`3`	`3`
`4`	`4`
Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,5 @@`
	`1`	`+from __future__ import annotations`
	`2`	`+`
`1`	`3`	`from typing import Dict, Tuple, Union`
`2`	`4`
`3`	`5`	`import geopandas as gpd`