Skip to content

Commit 826ff0e

Browse files
authored
Merge pull request #41 from bedrock-engineer/test-kaitak-hk-with-file-input
Test kaitak hk marimo notebook
2 parents f88feff + 042a8c8 commit 826ff0e

File tree

11 files changed

+551
-1423
lines changed

11 files changed

+551
-1423
lines changed

examples/hk_kaitak_ags3/hk_kaitak_ags3_to_brgi_geodb.py

Lines changed: 212 additions & 224 deletions
Large diffs are not rendered by default.

pyproject.toml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -72,20 +72,16 @@ Tracker = "https://github.com/bedrock-engineer/bedrock-ge/issues"
7272

7373
[dependency-groups]
7474
dev = [
75-
"duckdb>=1.2.2",
7675
"frictionless[excel]>=4.40.8",
77-
"jupyter>=1.1.1",
78-
"marimo>=0.12.5",
76+
"marimo[recommended]>=0.13.11",
7977
"mypy>=1.11.2",
80-
"nbconvert>=7.16.6",
8178
"pandas-stubs>=2.2.2.240807",
82-
"ruff>=0.6.7",
83-
"sqlglot>=26.12.1",
8479
]
8580

8681
tests = [
8782
"folium>=0.17.0",
8883
"mapclassify>=2.8.1",
84+
"marimo>=0.13.11",
8985
"matplotlib>=3.9.2",
9086
"pytest>=8.3.3",
9187
]

sandbox/data_validation/try_pandera.ipynb

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
"cells": [
33
{
44
"cell_type": "code",
5-
"execution_count": 1,
5+
"execution_count": null,
66
"id": "efd86e9f",
77
"metadata": {},
88
"outputs": [],
@@ -12,7 +12,7 @@
1212
"from pprint import pprint\n",
1313
"\n",
1414
"import pandas as pd\n",
15-
"import pandera as pa\n",
15+
"import pandera.pandas as pa\n",
1616
"from pandera.typing import DataFrame, Series"
1717
]
1818
},

src/bedrock_ge/gi/ags/read.py

Lines changed: 36 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from __future__ import annotations
22

3+
import codecs
34
import io
45
from contextlib import contextmanager, nullcontext
5-
from io import TextIOBase
66
from pathlib import Path
77
from typing import IO, Any, ContextManager, Dict, List
88

@@ -20,21 +20,20 @@ def detect_encoding(source: str | Path | IO[str] | IO[bytes] | bytes) -> str:
2020
2121
Args:
2222
source (str | Path | IO[str] | IO[bytes] | bytes): The source to detect encoding from.
23-
- str: Treated as a file path if it exists, otherwise as text (returns `DEFAULT_ENCODING`)
24-
- Path: File path to read and detect encoding
23+
- str or Path: File path.
2524
- IO[str]: Already decoded text stream (returns `DEFAULT_ENCODING`)
2625
- IO[bytes]: Binary stream to detect encoding from
2726
- bytes: Binary data to detect encoding from
2827
2928
Returns:
30-
str: The detected encoding name (e.g., 'utf-8', 'iso-8859-1', etc.)
29+
str: The detected encoding name (e.g., 'utf-8', 'iso-8859-1', 'ascii', etc.)
3130
3231
Raises:
3332
TypeError: If the source type is unsupported
3433
FileNotFoundError: If a file path doesn't exist
3534
"""
3635
# Set number of bytes to read for detection and required confidence
37-
SAMPLE_SIZE = 10000
36+
SAMPLE_SIZE = 1_000_000
3837
REQUIRED_CONFIDENCE = 0.7
3938

4039
def _detect_from_bytes(data: bytes) -> str:
@@ -47,6 +46,9 @@ def _detect_from_bytes(data: bytes) -> str:
4746
if not encoding or confidence < REQUIRED_CONFIDENCE:
4847
return DEFAULT_ENCODING
4948

49+
if encoding.lower() == "ascii":
50+
return "utf-8"
51+
5052
return encoding
5153

5254
def _read_from_path(path: Path):
@@ -79,18 +81,17 @@ def _read_from_path(path: Path):
7981
# IO[str] object
8082
if hasattr(source, "encoding"):
8183
if source.encoding:
82-
# Could be `None`
84+
# Could be `None`, e.g. io.StringIO has an encoding attribute which is None.
8385
return source.encoding
8486
else:
8587
return DEFAULT_ENCODING
8688

8789
# IO[bytes]
88-
if isinstance(source, io.BytesIO):
89-
original_position = source.tell()
90+
if isinstance(source, io.BufferedIOBase):
9091
try:
92+
original_position = source.tell()
9193
source.seek(0)
9294
sample = source.read(SAMPLE_SIZE)
93-
encoding = _detect_from_bytes(sample)
9495
if isinstance(sample, bytes):
9596
encoding = _detect_from_bytes(sample)
9697
else:
@@ -105,9 +106,9 @@ def _read_from_path(path: Path):
105106
raise TypeError(f"Unsupported input type for encoding detection: {type(source)}")
106107

107108

108-
def read_ags_source(
109+
def open_ags_source(
109110
source: str | Path | IO[str] | IO[bytes] | bytes, encoding=None
110-
) -> ContextManager[TextIOBase]:
111+
) -> ContextManager[io.TextIOBase]:
111112
"""Opens or wraps a given source for reading AGS (text-based) data.
112113
113114
Args:
@@ -124,41 +125,42 @@ def read_ags_source(
124125
Raises:
125126
TypeError: If the source type is unsupported or binary streams are not decoded.
126127
"""
128+
try:
129+
codecs.lookup(encoding)
130+
except LookupError:
131+
raise ValueError(f"Unsupported encoding: {encoding}")
127132

128133
@contextmanager
129-
def string_source(content: str):
130-
string_io = io.StringIO(content)
134+
def _bytes_source(bytes_content: bytes):
135+
string_io = io.StringIO(bytes_content.decode(encoding))
131136
try:
132137
yield string_io
133138
finally:
134139
string_io.close()
135140

136-
if isinstance(source, str):
141+
if isinstance(source, (str, Path)):
137142
path = Path(source)
138143
if path.exists() and path.is_file():
139144
return open(path, "r", encoding=encoding)
140145
raise FileNotFoundError(f"Path does not exist or is not a file: {source}")
141146

142-
elif isinstance(source, Path):
143-
if source.exists() and source.is_file():
144-
return open(source, "r", encoding=encoding)
145-
raise FileNotFoundError(f"Path does not exist or is not a file: {source}")
146-
147-
elif isinstance(source, bytes):
148-
return string_source(source.decode(encoding))
147+
elif isinstance(source, io.TextIOBase):
148+
source.seek(0)
149+
return nullcontext(source)
149150

150-
elif isinstance(source, io.BytesIO):
151-
return string_source(source.getvalue().decode(encoding))
151+
elif isinstance(source, io.BufferedIOBase):
152+
text_stream = io.TextIOWrapper(source, encoding=encoding)
153+
text_stream.seek(0)
154+
return nullcontext(text_stream)
152155

153-
elif hasattr(source, "read"):
154-
# reset the cursor to the beginning
155-
try:
156-
source.seek(0)
157-
except (AttributeError, io.UnsupportedOperation):
158-
pass
159-
return nullcontext(source)
156+
elif isinstance(source, bytes):
157+
return _bytes_source(source)
160158

161-
raise TypeError(f"Unsupported input type: {type(source)}")
159+
else:
160+
raise TypeError(
161+
f"Unsupported source type: {type(source)}. "
162+
"Expected str, Path, IO[str], IO[bytes], or bytes."
163+
)
162164

163165

164166
def ags_to_dfs(
@@ -179,15 +181,11 @@ def ags_to_dfs(
179181
Dict[str, pd.DataFrame]]: A dictionary where keys represent AGS group
180182
names with corresponding DataFrames for the corresponding group data.
181183
"""
182-
# if bytes are provided, convert to IO[bytes] to be file-like
183-
if isinstance(source, bytes):
184-
source = io.BytesIO(source)
185-
186184
if not encoding:
187185
encoding = detect_encoding(source)
188186

189187
# Get first non-blank line, `None` if all lines are blank
190-
with read_ags_source(source, encoding=encoding) as f:
188+
with open_ags_source(source, encoding=encoding) as f:
191189
first_line = next((line.strip() for line in f if line.strip()), None)
192190

193191
if first_line:
@@ -239,7 +237,7 @@ def ags3_to_dfs(
239237
headers: List[str] = ["", "", ""]
240238
group_data: List[List[Any]] = [[], [], []]
241239

242-
with read_ags_source(source, encoding=encoding) as file:
240+
with open_ags_source(source, encoding=encoding) as file:
243241
for i, line in enumerate(file):
244242
line = line.strip()
245243
last_line_type = line_type
@@ -333,7 +331,7 @@ def ags4_to_dfs(
333331
object that represents an AGS4 file.
334332
335333
Returns:
336-
Dict[str, pd.DataFrame]: A dictionary of pandas DataFrames, where each key
334+
Dict[str, pd.DataFrame]: A dictionary of pandas DataFrames, where each key
337335
represents a group name from AGS 4 data, and the corresponding value is a
338336
pandas DataFrame containing the data for that group.
339337
"""

src/bedrock_ge/gi/ags/schemas.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import pandera as pa
1+
import pandera.pandas as pa
22
from pandera.typing import Series
33

44

src/bedrock_ge/gi/ags/transform.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from typing import Dict
44

55
import pandas as pd
6-
import pandera as pa
6+
import pandera.pandas as pa
77
from pandera.typing import DataFrame
88
from pyproj import CRS
99

src/bedrock_ge/gi/gis_geometry.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from __future__ import annotations
2+
13
from typing import Dict, Tuple, Union
24

35
import geopandas as gpd

src/bedrock_ge/gi/schemas.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from typing import Optional
44

5-
import pandera as pa
5+
import pandera.pandas as pa
66
from pandera.typing import Series
77
from pandera.typing.geopandas import GeoSeries
88

tests/test_bedrock_ge/gi/test_ags.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,15 +52,15 @@ def test_detect_encoding():
5252
ags4_bio = io.BytesIO(ags4_byte)
5353

5454
sources = {
55-
ags3: ags3_encoding,
55+
ags3: default_encoding,
5656
ags4: ags4_encoding,
57-
ags3_path: ags3_encoding,
57+
ags3_path: default_encoding,
5858
ags4_path: ags4_encoding,
59-
ags3_byte: ags3_encoding,
59+
ags3_byte: default_encoding,
6060
ags4_byte: ags4_encoding,
6161
ags3_sio: default_encoding,
6262
ags4_sio: default_encoding,
63-
ags3_bio: ags3_encoding,
63+
ags3_bio: default_encoding,
6464
ags4_bio: ags4_encoding,
6565
}
6666
for source, expected in sources.items():

tests/test_examples/test_hk_kaitak_ags3_to_brgi_geodb.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import shutil
33
import sqlite3
44
import subprocess
5+
import sys
56
from pathlib import Path
67
from tempfile import TemporaryDirectory
78

@@ -39,13 +40,16 @@ def test_kaitak_ags3_notebook_runs_and_creates_gpkg(examples_dir):
3940
# TODO: implement logging
4041
# NOTE: The env (environment variables) and encoding are required for running
4142
# the notebook as a script from both Windows and Linux. Without => UnicodeDecodeError
42-
# NOTE: `uvx uv run` runs the marimo notebook as a script in a temporary environment,
43+
# NOTE: `(uvx) uv run` runs the marimo notebook as a script in a temporary environment,
4344
# with the Python version and dependencies specified in the PEP 723 inline script metadata.
45+
# The issue with this approach is that it uses the latest version of bedrock-ge,
46+
# rather than the current code in this repo.
4447
env = os.environ.copy()
4548
env["PYTHONIOENCODING"] = "utf-8"
4649
result = subprocess.run(
4750
# ["uvx", "uv", "run", "--no-project", "--no-cache", str(notebook_path)],
48-
["uv", "run", str(notebook_path)],
51+
# ["uv", "run", str(notebook_path)],
52+
[sys.executable, str(notebook_path)],
4953
check=False,
5054
capture_output=True,
5155
text=True,
@@ -55,7 +59,7 @@ def test_kaitak_ags3_notebook_runs_and_creates_gpkg(examples_dir):
5559

5660
# Check that the script ran successfully
5761
assert result.returncode == 0, (
58-
f"📛 Running `uvx run marimo notebook.py` failed with code {result.returncode}\n"
62+
f"📛 Running `python notebook.py` failed with code {result.returncode}\n"
5963
f"📄 STDOUT:\n{result.stdout}\n"
6064
f"⚠️ STDERR:\n{result.stderr}"
6165
)

0 commit comments

Comments
 (0)