Skip to content

Commit 7e4c793

Browse files
authored
feat: add option to keep metadata tags (#210)
1 parent e1834fa commit 7e4c793

File tree

7 files changed

+154
-46
lines changed

7 files changed

+154
-46
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Added
11+
12+
- Option to skip the filtering of metadata tags (the list of filtered tags is based on the default GDAL configuration)
13+
1014
## [0.14.0] - 2025-05-17
1115

1216
### Added

quackosm/_constants.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,20 @@
1111

1212
FEATURES_INDEX = "feature_id"
1313

14+
METADATA_TAGS_TO_IGNORE = [
15+
"area",
16+
"created_by",
17+
"converted_by",
18+
"source",
19+
"time",
20+
"ele",
21+
"note",
22+
"todo",
23+
"fixme",
24+
"FIXME",
25+
"openGeoDB:",
26+
]
27+
1428
__all__ = [
1529
"FEATURES_INDEX",
1630
"GEOMETRY_COLUMN",

quackosm/cli.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -608,6 +608,14 @@ def main(
608608
show_default=True,
609609
),
610610
] = True,
611+
ignore_metadata_tags: Annotated[
612+
bool,
613+
typer.Option(
614+
"--ignore-metadata-tags/--keep-metadata-tags",
615+
help="Whether to remove metadata tags, based on the default GDAL config.",
616+
show_default=True,
617+
),
618+
] = True,
611619
wkt_result: Annotated[
612620
bool,
613621
typer.Option(
@@ -775,6 +783,7 @@ def main(
775783
if osm_way_polygon_features_config
776784
else None
777785
),
786+
ignore_metadata_tags=ignore_metadata_tags,
778787
filter_osm_ids=filter_osm_ids, # type: ignore
779788
custom_sql_filter=custom_sql_filter,
780789
sort_result=sort_result,
@@ -798,6 +807,7 @@ def main(
798807
if osm_way_polygon_features_config
799808
else None
800809
),
810+
ignore_metadata_tags=ignore_metadata_tags,
801811
filter_osm_ids=filter_osm_ids, # type: ignore
802812
custom_sql_filter=custom_sql_filter,
803813
sort_result=sort_result,
@@ -824,6 +834,7 @@ def main(
824834
if osm_way_polygon_features_config
825835
else None
826836
),
837+
ignore_metadata_tags=ignore_metadata_tags,
827838
filter_osm_ids=filter_osm_ids, # type: ignore
828839
custom_sql_filter=custom_sql_filter,
829840
sort_result=sort_result,
@@ -856,6 +867,7 @@ def main(
856867
if osm_way_polygon_features_config
857868
else None
858869
),
870+
ignore_metadata_tags=ignore_metadata_tags,
859871
filter_osm_ids=filter_osm_ids, # type: ignore
860872
custom_sql_filter=custom_sql_filter,
861873
sort_result=sort_result,
@@ -886,6 +898,7 @@ def main(
886898
if osm_way_polygon_features_config
887899
else None
888900
),
901+
ignore_metadata_tags=ignore_metadata_tags,
889902
filter_osm_ids=filter_osm_ids, # type: ignore
890903
custom_sql_filter=custom_sql_filter,
891904
sort_result=sort_result,
@@ -911,6 +924,7 @@ def main(
911924
if osm_way_polygon_features_config
912925
else None
913926
),
927+
ignore_metadata_tags=ignore_metadata_tags,
914928
filter_osm_ids=filter_osm_ids, # type: ignore
915929
custom_sql_filter=custom_sql_filter,
916930
duckdb_table_name=duckdb_table_name or "quackosm",

quackosm/functions.py

Lines changed: 37 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@
2626
__all__ = [
2727
"convert_pbf_to_parquet",
2828
"convert_pbf_to_duckdb",
29+
"convert_pbf_to_geodataframe",
2930
"convert_geometry_to_parquet",
3031
"convert_geometry_to_duckdb",
31-
"convert_pbf_to_geodataframe",
3232
"convert_geometry_to_geodataframe",
3333
"convert_osm_extract_to_parquet",
3434
"convert_osm_extract_to_duckdb",
@@ -47,6 +47,7 @@ def convert_pbf_to_duckdb(
4747
compression: str = PARQUET_COMPRESSION,
4848
compression_level: int = PARQUET_COMPRESSION_LEVEL,
4949
row_group_size: int = PARQUET_ROW_GROUP_SIZE,
50+
ignore_metadata_tags: bool = True,
5051
ignore_cache: bool = False,
5152
filter_osm_ids: Optional[list[str]] = None,
5253
custom_sql_filter: Optional[str] = None,
@@ -97,6 +98,8 @@ def convert_pbf_to_duckdb(
9798
Defaults to 3.
9899
row_group_size (int, optional): Approximate number of rows per row group in the final
99100
parquet file. Defaults to 100_000.
101+
ignore_metadata_tags (bool, optional): Remove metadata tags, based on the default GDAL
102+
config. Defaults to `True`.
100103
ignore_cache (bool, optional): Whether to ignore precalculated geoparquet files or not.
101104
Defaults to False.
102105
filter_osm_ids: (list[str], optional): List of OSM features ids to read from the file.
@@ -271,6 +274,7 @@ def convert_pbf_to_duckdb(
271274
compression=compression,
272275
compression_level=compression_level,
273276
row_group_size=row_group_size,
277+
ignore_metadata_tags=ignore_metadata_tags,
274278
verbosity_mode=verbosity_mode,
275279
debug_memory=debug_memory,
276280
debug_times=debug_times,
@@ -298,6 +302,7 @@ def convert_geometry_to_duckdb(
298302
compression: str = PARQUET_COMPRESSION,
299303
compression_level: int = PARQUET_COMPRESSION_LEVEL,
300304
row_group_size: int = PARQUET_ROW_GROUP_SIZE,
305+
ignore_metadata_tags: bool = True,
301306
ignore_cache: bool = False,
302307
filter_osm_ids: Optional[list[str]] = None,
303308
custom_sql_filter: Optional[str] = None,
@@ -353,6 +358,8 @@ def convert_geometry_to_duckdb(
353358
Defaults to 3.
354359
row_group_size (int, optional): Approximate number of rows per row group in the final
355360
parquet file. Defaults to 100_000.
361+
ignore_metadata_tags (bool, optional): Remove metadata tags, based on the default GDAL
362+
config. Defaults to `True`.
356363
ignore_cache: (bool, optional): Whether to ignore precalculated geoparquet files or not.
357364
Defaults to False.
358365
filter_osm_ids: (list[str], optional): List of OSM features ids to read from the file.
@@ -491,6 +498,7 @@ def convert_geometry_to_duckdb(
491498
compression_level=compression_level,
492499
row_group_size=row_group_size,
493500
osm_extract_source=osm_extract_source,
501+
ignore_metadata_tags=ignore_metadata_tags,
494502
verbosity_mode=verbosity_mode,
495503
geometry_coverage_iou_threshold=geometry_coverage_iou_threshold,
496504
allow_uncovered_geometry=allow_uncovered_geometry,
@@ -520,6 +528,7 @@ def convert_osm_extract_to_duckdb(
520528
compression: str = PARQUET_COMPRESSION,
521529
compression_level: int = PARQUET_COMPRESSION_LEVEL,
522530
row_group_size: int = PARQUET_ROW_GROUP_SIZE,
531+
ignore_metadata_tags: bool = True,
523532
ignore_cache: bool = False,
524533
filter_osm_ids: Optional[list[str]] = None,
525534
custom_sql_filter: Optional[str] = None,
@@ -573,6 +582,8 @@ def convert_osm_extract_to_duckdb(
573582
Defaults to 3.
574583
row_group_size (int, optional): Approximate number of rows per row group in the final
575584
parquet file. Defaults to 100_000.
585+
ignore_metadata_tags (bool, optional): Remove metadata tags, based on the default GDAL
586+
config. Defaults to `True`.
576587
ignore_cache (bool, optional): Whether to ignore precalculated geoparquet files or not.
577588
Defaults to False.
578589
filter_osm_ids: (list[str], optional): List of OSM features ids to read from the file.
@@ -666,6 +677,7 @@ def convert_osm_extract_to_duckdb(
666677
compression=compression,
667678
compression_level=compression_level,
668679
row_group_size=row_group_size,
680+
ignore_metadata_tags=ignore_metadata_tags,
669681
verbosity_mode=verbosity_mode,
670682
debug_memory=debug_memory,
671683
debug_times=debug_times,
@@ -693,6 +705,7 @@ def convert_pbf_to_parquet(
693705
compression: str = PARQUET_COMPRESSION,
694706
compression_level: int = PARQUET_COMPRESSION_LEVEL,
695707
row_group_size: int = PARQUET_ROW_GROUP_SIZE,
708+
ignore_metadata_tags: bool = True,
696709
ignore_cache: bool = False,
697710
filter_osm_ids: Optional[list[str]] = None,
698711
custom_sql_filter: Optional[str] = None,
@@ -743,6 +756,8 @@ def convert_pbf_to_parquet(
743756
Defaults to 3.
744757
row_group_size (int, optional): Approximate number of rows per row group in the final
745758
parquet file. Defaults to 100_000.
759+
ignore_metadata_tags (bool, optional): Remove metadata tags, based on the default GDAL
760+
config. Defaults to `True`.
746761
ignore_cache (bool, optional): Whether to ignore precalculated geoparquet files or not.
747762
Defaults to False.
748763
filter_osm_ids: (list[str], optional): List of OSM features ids to read from the file.
@@ -917,6 +932,7 @@ def convert_pbf_to_parquet(
917932
compression=compression,
918933
compression_level=compression_level,
919934
row_group_size=row_group_size,
935+
ignore_metadata_tags=ignore_metadata_tags,
920936
verbosity_mode=verbosity_mode,
921937
debug_memory=debug_memory,
922938
debug_times=debug_times,
@@ -944,6 +960,7 @@ def convert_geometry_to_parquet(
944960
compression: str = PARQUET_COMPRESSION,
945961
compression_level: int = PARQUET_COMPRESSION_LEVEL,
946962
row_group_size: int = PARQUET_ROW_GROUP_SIZE,
963+
ignore_metadata_tags: bool = True,
947964
ignore_cache: bool = False,
948965
filter_osm_ids: Optional[list[str]] = None,
949966
custom_sql_filter: Optional[str] = None,
@@ -999,6 +1016,8 @@ def convert_geometry_to_parquet(
9991016
Defaults to 3.
10001017
row_group_size (int, optional): Approximate number of rows per row group in the final
10011018
parquet file. Defaults to 100_000.
1019+
ignore_metadata_tags (bool, optional): Remove metadata tags, based on the default GDAL
1020+
config. Defaults to `True`.
10021021
ignore_cache: (bool, optional): Whether to ignore precalculated geoparquet files or not.
10031022
Defaults to False.
10041023
filter_osm_ids: (list[str], optional): List of OSM features ids to read from the file.
@@ -1136,6 +1155,7 @@ def convert_geometry_to_parquet(
11361155
compression_level=compression_level,
11371156
row_group_size=row_group_size,
11381157
osm_extract_source=osm_extract_source,
1158+
ignore_metadata_tags=ignore_metadata_tags,
11391159
verbosity_mode=verbosity_mode,
11401160
geometry_coverage_iou_threshold=geometry_coverage_iou_threshold,
11411161
allow_uncovered_geometry=allow_uncovered_geometry,
@@ -1165,6 +1185,7 @@ def convert_osm_extract_to_parquet(
11651185
compression: str = PARQUET_COMPRESSION,
11661186
compression_level: int = PARQUET_COMPRESSION_LEVEL,
11671187
row_group_size: int = PARQUET_ROW_GROUP_SIZE,
1188+
ignore_metadata_tags: bool = True,
11681189
ignore_cache: bool = False,
11691190
filter_osm_ids: Optional[list[str]] = None,
11701191
custom_sql_filter: Optional[str] = None,
@@ -1218,6 +1239,8 @@ def convert_osm_extract_to_parquet(
12181239
Defaults to 3.
12191240
row_group_size (int, optional): Approximate number of rows per row group in the final
12201241
parquet file. Defaults to 100_000.
1242+
ignore_metadata_tags (bool, optional): Remove metadata tags, based on the default GDAL
1243+
config. Defaults to `True`.
12211244
ignore_cache (bool, optional): Whether to ignore precalculated geoparquet files or not.
12221245
Defaults to False.
12231246
filter_osm_ids: (list[str], optional): List of OSM features ids to read from the file.
@@ -1312,6 +1335,7 @@ def convert_osm_extract_to_parquet(
13121335
compression=compression,
13131336
compression_level=compression_level,
13141337
row_group_size=row_group_size,
1338+
ignore_metadata_tags=ignore_metadata_tags,
13151339
verbosity_mode=verbosity_mode,
13161340
debug_memory=debug_memory,
13171341
debug_times=debug_times,
@@ -1339,6 +1363,7 @@ def convert_pbf_to_geodataframe(
13391363
compression: str = PARQUET_COMPRESSION,
13401364
compression_level: int = PARQUET_COMPRESSION_LEVEL,
13411365
row_group_size: int = PARQUET_ROW_GROUP_SIZE,
1366+
ignore_metadata_tags: bool = True,
13421367
ignore_cache: bool = False,
13431368
filter_osm_ids: Optional[list[str]] = None,
13441369
custom_sql_filter: Optional[str] = None,
@@ -1388,6 +1413,8 @@ def convert_pbf_to_geodataframe(
13881413
Defaults to 3.
13891414
row_group_size (int, optional): Approximate number of rows per row group in the final
13901415
parquet file. Defaults to 100_000.
1416+
ignore_metadata_tags (bool, optional): Remove metadata tags, based on the default GDAL
1417+
config. Defaults to `True`.
13911418
ignore_cache: (bool, optional): Whether to ignore precalculated geoparquet files or not.
13921419
Defaults to False.
13931420
filter_osm_ids: (list[str], optional): List of OSM features ids to read from the file.
@@ -1536,6 +1563,7 @@ def convert_pbf_to_geodataframe(
15361563
compression=compression,
15371564
compression_level=compression_level,
15381565
row_group_size=row_group_size,
1566+
ignore_metadata_tags=ignore_metadata_tags,
15391567
verbosity_mode=verbosity_mode,
15401568
debug_memory=debug_memory,
15411569
debug_times=debug_times,
@@ -1559,6 +1587,7 @@ def convert_geometry_to_geodataframe(
15591587
compression: str = PARQUET_COMPRESSION,
15601588
compression_level: int = PARQUET_COMPRESSION_LEVEL,
15611589
row_group_size: int = PARQUET_ROW_GROUP_SIZE,
1590+
ignore_metadata_tags: bool = True,
15621591
ignore_cache: bool = False,
15631592
filter_osm_ids: Optional[list[str]] = None,
15641593
custom_sql_filter: Optional[str] = None,
@@ -1610,6 +1639,8 @@ def convert_geometry_to_geodataframe(
16101639
Defaults to 3.
16111640
row_group_size (int, optional): Approximate number of rows per row group in the final
16121641
parquet file. Defaults to 100_000.
1642+
ignore_metadata_tags (bool, optional): Remove metadata tags, based on the default GDAL
1643+
config. Defaults to `True`.
16131644
ignore_cache: (bool, optional): Whether to ignore precalculated geoparquet files or not.
16141645
Defaults to False.
16151646
filter_osm_ids: (list[str], optional): List of OSM features ids to read from the file.
@@ -1704,6 +1735,7 @@ def convert_geometry_to_geodataframe(
17041735
compression_level=compression_level,
17051736
row_group_size=row_group_size,
17061737
osm_extract_source=osm_extract_source,
1738+
ignore_metadata_tags=ignore_metadata_tags,
17071739
verbosity_mode=verbosity_mode,
17081740
geometry_coverage_iou_threshold=geometry_coverage_iou_threshold,
17091741
allow_uncovered_geometry=allow_uncovered_geometry,
@@ -1729,6 +1761,7 @@ def convert_osm_extract_to_geodataframe(
17291761
compression: str = PARQUET_COMPRESSION,
17301762
compression_level: int = PARQUET_COMPRESSION_LEVEL,
17311763
row_group_size: int = PARQUET_ROW_GROUP_SIZE,
1764+
ignore_metadata_tags: bool = True,
17321765
ignore_cache: bool = False,
17331766
filter_osm_ids: Optional[list[str]] = None,
17341767
custom_sql_filter: Optional[str] = None,
@@ -1778,6 +1811,8 @@ def convert_osm_extract_to_geodataframe(
17781811
Defaults to 3.
17791812
row_group_size (int, optional): Approximate number of rows per row group in the final
17801813
parquet file. Defaults to 100_000.
1814+
ignore_metadata_tags (bool, optional): Remove metadata tags, based on the default GDAL
1815+
config. Defaults to `True`.
17811816
ignore_cache (bool, optional): Whether to ignore precalculated geoparquet files or not.
17821817
Defaults to False.
17831818
filter_osm_ids: (list[str], optional): List of OSM features ids to read from the file.
@@ -1862,6 +1897,7 @@ def convert_osm_extract_to_geodataframe(
18621897
compression=compression,
18631898
compression_level=compression_level,
18641899
row_group_size=row_group_size,
1900+
ignore_metadata_tags=ignore_metadata_tags,
18651901
verbosity_mode=verbosity_mode,
18661902
debug_memory=debug_memory,
18671903
debug_times=debug_times,

0 commit comments

Comments
 (0)