Skip to content

Commit 61dd66f

Browse files
mwip and RaczeQ
authored
Add support for exporting to duckdb (via parquet) (#157)
* Add support for exporting to duckdb (via parquet) This patch adds functionality to export directly to a DuckDB database via the --duckdb flag or using a ".duckdb" or ".db" file. Optionally one can change the table name into which data will be imported. Documentation was mostly copied from existing functions but doctests were updated and checked for consistency with the results. Closes #94 * Fix open mypy issues * Fix doctest errors; fix filter_osm_ids typing properly This patch fixes remaining doctest errors that occurred during #157. Meanwhile, a remaining bug was discovered around the typing of filter_osm_ids. It was solved, too. * Resolve refurb: immut. tuple over list for suffix * Fix typo in test_cli.py * Fix test_cli.py for duckdb: correct file, split args * chore: add new test case for increased coverage * chore: add author to the changelog * feat: add automatic directory generation for duckdb export * chore: refactor cli codebase * fix: remove commas * chore: change working directory test --------- Co-authored-by: Kamil Raczycki <raczyckikamil@gmail.com>
1 parent 201a199 commit 61dd66f

File tree

7 files changed

+903
-15
lines changed

7 files changed

+903
-15
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Added
11+
12+
- Option to export to DuckDB database [#94](https://github.com/kraina-ai/quackosm/issues/94) (implemented by [@mwip](https://github.com/mwip))
13+
1014
## [0.11.0] - 2024-09-24
1115

1216
### Changed

quackosm/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,13 @@
66
"""
77

88
from quackosm.functions import (
9+
convert_geometry_to_duckdb,
910
convert_geometry_to_geodataframe,
1011
convert_geometry_to_parquet,
12+
convert_osm_extract_to_duckdb,
1113
convert_osm_extract_to_geodataframe,
1214
convert_osm_extract_to_parquet,
15+
convert_pbf_to_duckdb,
1316
convert_pbf_to_geodataframe,
1417
convert_pbf_to_parquet,
1518
)
@@ -23,8 +26,11 @@
2326
__all__ = [
2427
"PbfFileReader",
2528
"convert_pbf_to_parquet",
29+
"convert_pbf_to_duckdb",
2630
"convert_geometry_to_parquet",
31+
"convert_geometry_to_duckdb",
2732
"convert_osm_extract_to_parquet",
33+
"convert_osm_extract_to_duckdb",
2834
"convert_pbf_to_geodataframe",
2935
"convert_geometry_to_geodataframe",
3036
"convert_osm_extract_to_geodataframe",

quackosm/cli.py

Lines changed: 119 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -500,11 +500,31 @@ def main(
500500
"--output",
501501
"-o",
502502
help=(
503-
"Path where to save final geoparquet file. If not provided, it will be generated"
503+
"Path where to save final result file. If not provided, it will be generated"
504504
" automatically based on the input pbf file name."
505+
" Can be [bold green].parquet[/bold green] or"
506+
" [bold green].db[/bold green] or [bold green].duckdb[/bold green] extension."
505507
),
506508
),
507509
] = None,
510+
duckdb: Annotated[
511+
bool,
512+
typer.Option(
513+
"--duckdb",
514+
help=(
515+
"Export to duckdb database. If not provided, data can still be exported if"
516+
" [bold bright_cyan]output[/bold bright_cyan] has [bold green].db[/bold green]"
517+
" or [bold green].duckdb[/bold green] extension."
518+
),
519+
),
520+
] = False,
521+
duckdb_table_name: Annotated[
522+
Optional[str],
523+
typer.Option(
524+
"--duckdb-table-name",
525+
help="Table name which the data will be imported into in the DuckDB database.",
526+
),
527+
] = "quackosm",
508528
ignore_cache: Annotated[
509529
bool,
510530
typer.Option(
@@ -687,11 +707,21 @@ def main(
687707
verbosity_mode = "silent"
688708

689709
logging.disable(logging.CRITICAL)
690-
if pbf_file:
710+
711+
is_duckdb = (result_file_path and result_file_path.suffix in (".duckdb", ".db")) or duckdb
712+
713+
pbf_file_parquet = pbf_file and not is_duckdb
714+
pbf_file_duckdb = pbf_file and is_duckdb
715+
osm_extract_parquet = osm_extract_query and not is_duckdb
716+
osm_extract_duckdb = osm_extract_query and is_duckdb
717+
geometry_parquet = not pbf_file and not osm_extract_query and not is_duckdb
718+
geometry_duckdb = not pbf_file and not osm_extract_query and is_duckdb
719+
720+
if pbf_file_parquet:
691721
from quackosm.functions import convert_pbf_to_parquet
692722

693-
geoparquet_path = convert_pbf_to_parquet(
694-
pbf_path=pbf_file,
723+
result_path = convert_pbf_to_parquet(
724+
pbf_path=cast(str, pbf_file),
695725
tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore
696726
keep_all_tags=keep_all_tags,
697727
geometry_filter=geometry_filter_value,
@@ -708,13 +738,34 @@ def main(
708738
save_as_wkt=wkt_result,
709739
verbosity_mode=verbosity_mode,
710740
)
711-
elif osm_extract_query:
741+
elif pbf_file_duckdb:
742+
from quackosm.functions import convert_pbf_to_duckdb
743+
744+
result_path = convert_pbf_to_duckdb(
745+
pbf_path=cast(str, pbf_file),
746+
tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore
747+
keep_all_tags=keep_all_tags,
748+
geometry_filter=geometry_filter_value,
749+
explode_tags=explode_tags,
750+
ignore_cache=ignore_cache,
751+
working_directory=working_directory,
752+
result_file_path=result_file_path,
753+
osm_way_polygon_features_config=(
754+
json.loads(Path(osm_way_polygon_features_config).read_text())
755+
if osm_way_polygon_features_config
756+
else None
757+
),
758+
filter_osm_ids=filter_osm_ids, # type: ignore
759+
duckdb_table_name=duckdb_table_name or "quackosm",
760+
verbosity_mode=verbosity_mode,
761+
)
762+
elif osm_extract_parquet:
712763
from quackosm._exceptions import OsmExtractSearchError
713764
from quackosm.functions import convert_osm_extract_to_parquet
714765

715766
try:
716-
geoparquet_path = convert_osm_extract_to_parquet(
717-
osm_extract_query=osm_extract_query,
767+
result_path = convert_osm_extract_to_parquet(
768+
osm_extract_query=cast(str, osm_extract_query),
718769
osm_extract_source=osm_extract_source,
719770
tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore
720771
keep_all_tags=keep_all_tags,
@@ -738,10 +789,64 @@ def main(
738789
err_console = Console(stderr=True)
739790
err_console.print(ex)
740791
raise typer.Exit(code=1) from None
741-
else:
792+
elif osm_extract_duckdb:
793+
from quackosm._exceptions import OsmExtractSearchError
794+
from quackosm.functions import convert_osm_extract_to_duckdb
795+
796+
try:
797+
result_path = convert_osm_extract_to_duckdb(
798+
osm_extract_query=cast(str, osm_extract_query),
799+
osm_extract_source=osm_extract_source,
800+
tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore
801+
keep_all_tags=keep_all_tags,
802+
geometry_filter=geometry_filter_value,
803+
explode_tags=explode_tags,
804+
ignore_cache=ignore_cache,
805+
working_directory=working_directory,
806+
result_file_path=result_file_path,
807+
osm_way_polygon_features_config=(
808+
json.loads(Path(osm_way_polygon_features_config).read_text())
809+
if osm_way_polygon_features_config
810+
else None
811+
),
812+
filter_osm_ids=filter_osm_ids, # type: ignore
813+
duckdb_table_name=duckdb_table_name or "quackosm",
814+
save_as_wkt=wkt_result,
815+
verbosity_mode=verbosity_mode,
816+
)
817+
except OsmExtractSearchError as ex:
818+
from rich.console import Console
819+
820+
err_console = Console(stderr=True)
821+
err_console.print(ex)
822+
raise typer.Exit(code=1) from None
823+
elif geometry_parquet:
742824
from quackosm.functions import convert_geometry_to_parquet
743825

744-
geoparquet_path = convert_geometry_to_parquet(
826+
result_path = convert_geometry_to_parquet(
827+
geometry_filter=geometry_filter_value,
828+
osm_extract_source=osm_extract_source,
829+
tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore
830+
keep_all_tags=keep_all_tags,
831+
explode_tags=explode_tags,
832+
ignore_cache=ignore_cache,
833+
working_directory=working_directory,
834+
result_file_path=result_file_path,
835+
osm_way_polygon_features_config=(
836+
json.loads(Path(osm_way_polygon_features_config).read_text())
837+
if osm_way_polygon_features_config
838+
else None
839+
),
840+
filter_osm_ids=filter_osm_ids, # type: ignore
841+
save_as_wkt=wkt_result,
842+
verbosity_mode=verbosity_mode,
843+
geometry_coverage_iou_threshold=geometry_coverage_iou_threshold,
844+
allow_uncovered_geometry=allow_uncovered_geometry,
845+
)
846+
elif geometry_duckdb:
847+
from quackosm.functions import convert_geometry_to_duckdb
848+
849+
result_path = convert_geometry_to_duckdb(
745850
geometry_filter=geometry_filter_value,
746851
osm_extract_source=osm_extract_source,
747852
tags_filter=osm_tags_filter or osm_tags_filter_file, # type: ignore
@@ -756,9 +861,13 @@ def main(
756861
else None
757862
),
758863
filter_osm_ids=filter_osm_ids, # type: ignore
864+
duckdb_table_name=duckdb_table_name or "quackosm",
759865
save_as_wkt=wkt_result,
760866
verbosity_mode=verbosity_mode,
761867
geometry_coverage_iou_threshold=geometry_coverage_iou_threshold,
762868
allow_uncovered_geometry=allow_uncovered_geometry,
763869
)
764-
typer.secho(geoparquet_path, fg="green")
870+
else:
871+
raise RuntimeError("Unknown operation mode")
872+
873+
typer.secho(result_path, fg="green")

0 commit comments

Comments
 (0)