Skip to content

Commit d7ec49c

Browse files
authored
feat: shorten cache file path hashes (#189)
* feat: shorten cache file path hashes
* chore: add licensecheck exception
* chore: change docstring examples
* chore: add exception to licensecheck
1 parent e3dd64e commit d7ec49c

File tree

5 files changed

+80
-82
lines changed

5 files changed

+80
-82
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
### Changed
11+
12+
- Shortened the hashes in cache file paths from the default 64 characters to 8 [#188](https://github.com/kraina-ai/quackosm/issues/188)
13+
1014
## [0.12.1] - 2025-01-03
1115

1216
### Added

pyproject.toml

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -202,7 +202,9 @@ using = "requirements"
202202
zero = false
203203
ignore_licenses = ["UNKNOWN"]
204204
ignore_packages = [
205-
'docformatter', # uses MIT license, has mismatched license in analysis
206-
'mkdocs-jupyter', # uses Apache-2.0 license, has mismatched license in analysis
207-
'python-geohash', # uses both MIT and Apache-2.0 licenses, has mismatched license in analysis
205+
'docformatter', # uses MIT license, has mismatched license in analysis
206+
'mkdocs-jupyter', # uses Apache-2.0 license, has mismatched license in analysis
207+
'python-geohash', # uses both MIT and Apache-2.0 licenses, has mismatched license in analysis
208+
'mkdocs-autorefs', # uses ISC License, has mismatched license in analysis
209+
'referencing', # uses MIT license, has mismatched license in analysis
208210
]

quackosm/functions.py

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
"convert_osm_extract_to_geodataframe",
3030
]
3131

32+
3233
def convert_pbf_to_duckdb(
3334
pbf_path: Union[str, Path, Iterable[Union[str, Path]]],
3435
tags_filter: Optional[Union[OsmTagsFilter, GroupedOsmTagsFilter]] = None,
@@ -154,7 +155,7 @@ def convert_pbf_to_duckdb(
154155
... monaco_pbf_path, tags_filter={"building": True, "amenity": True, "highway": True}
155156
... ) # doctest: +IGNORE_RESULT
156157
>>> ddb_path.as_posix()
157-
'files/monaco_6593ca69098459d039054bc5fe0a87c56681e29a5f59d38ce3485c03cb0e9374_noclip_compact.duckdb'
158+
'files/monaco_6593ca69_noclip_compact.duckdb'
158159
159160
Get features for Malé - the capital city of Maldives
160161
@@ -204,7 +205,7 @@ def convert_pbf_to_duckdb(
204205
... )
205206
... ) # doctest: +IGNORE_RESULT
206207
>>> ddb_path.as_posix()
207-
'files/maldives_nofilter_4eeabb20ccd8aefeaa80b9a46a202ab985fd454760823b7012cc7778498a085b_compact.duckdb'
208+
'files/maldives_nofilter_4eeabb20_compact.duckdb'
208209
209210
>>> with duckdb.connect(str(ddb_path)) as con:
210211
... con.load_extension('spatial')
@@ -259,6 +260,7 @@ def convert_pbf_to_duckdb(
259260
duckdb_table_name=duckdb_table_name,
260261
)
261262

263+
262264
def convert_geometry_to_duckdb(
263265
geometry_filter: BaseGeometry = None,
264266
osm_extract_source: Union[OsmExtractSource, str] = OsmExtractSource.any,
@@ -356,7 +358,7 @@ def convert_geometry_to_duckdb(
356358
... )
357359
>>> ddb_path = qosm.convert_geometry_to_duckdb(from_wkt(wkt)) # doctest: +IGNORE_RESULT
358360
>>> ddb_path.as_posix()
359-
'files/bf4b33debfd6d3e605555340606df6ce7eea934958c1f3477aca0ccf79e7929f_nofilter_compact.duckdb'
361+
'files/bf4b33de_nofilter_compact.duckdb'
360362
361363
Inspect the file with duckdb
362364
>>> import duckdb
@@ -401,7 +403,7 @@ def convert_geometry_to_duckdb(
401403
... osm_extract_source='Geofabrik',
402404
... ) # doctest: +IGNORE_RESULT
403405
>>> ddb_path.as_posix()
404-
'files/bf4b33debfd6d3e605555340606df6ce7eea934958c1f3477aca0ccf79e7929f_nofilter_compact.duckdb'
406+
'files/bf4b33de_nofilter_compact.duckdb'
405407
406408
Inspect the file with duckdb
407409
>>> with duckdb.connect(str(ddb_path)) as con:
@@ -459,6 +461,7 @@ def convert_geometry_to_duckdb(
459461
duckdb_table_name=duckdb_table_name,
460462
)
461463

464+
462465
def convert_osm_extract_to_duckdb(
463466
osm_extract_query: str,
464467
osm_extract_source: Union[OsmExtractSource, str] = OsmExtractSource.any,
@@ -612,6 +615,7 @@ def convert_osm_extract_to_duckdb(
612615
duckdb_table_name=duckdb_table_name,
613616
)
614617

618+
615619
def convert_pbf_to_parquet(
616620
pbf_path: Union[str, Path, Iterable[Union[str, Path]]],
617621
tags_filter: Optional[Union[OsmTagsFilter, GroupedOsmTagsFilter]] = None,
@@ -739,7 +743,7 @@ def convert_pbf_to_parquet(
739743
... tags_filter={"building": True, "amenity": True, "highway": True}
740744
... ) # doctest: +IGNORE_RESULT
741745
>>> gpq_path.as_posix()
742-
'files/monaco_6593ca69098459d039054bc5fe0a87c56681e29a5f59d38ce3485c03cb0e9374_noclip_exploded.parquet'
746+
'files/monaco_6593ca69_noclip_exploded.parquet'
743747
744748
Inspect the file with duckdb
745749
>>> duckdb.read_parquet(str(gpq_path)).order("feature_id") # doctest: +SKIP
@@ -788,7 +792,7 @@ def convert_pbf_to_parquet(
788792
... )
789793
... ) # doctest: +IGNORE_RESULT
790794
>>> gpq_path.as_posix()
791-
'files/maldives_nofilter_4eeabb20ccd8aefeaa80b9a46a202ab985fd454760823b7012cc7778498a085b_compact.parquet'
795+
'files/maldives_nofilter_4eeabb20_compact.parquet'
792796
793797
Inspect the file with duckdb
794798
>>> duckdb.read_parquet(str(gpq_path)).order("feature_id") # doctest: +SKIP
@@ -942,7 +946,7 @@ def convert_geometry_to_parquet(
942946
... )
943947
>>> gpq_path = qosm.convert_geometry_to_parquet(from_wkt(wkt)) # doctest: +IGNORE_RESULT
944948
>>> gpq_path.as_posix()
945-
'files/bf4b33debfd6d3e605555340606df6ce7eea934958c1f3477aca0ccf79e7929f_nofilter_compact.parquet'
949+
'files/bf4b33de_nofilter_compact.parquet'
946950
947951
Inspect the file with duckdb
948952
>>> import duckdb
@@ -986,7 +990,7 @@ def convert_geometry_to_parquet(
986990
... osm_extract_source='Geofabrik',
987991
... ) # doctest: +IGNORE_RESULT
988992
>>> gpq_path.as_posix()
989-
'files/bf4b33debfd6d3e605555340606df6ce7eea934958c1f3477aca0ccf79e7929f_nofilter_compact.parquet'
993+
'files/bf4b33de_nofilter_compact.parquet'
990994
991995
Inspect the file with duckdb
992996
>>> duckdb.read_parquet(str(gpq_path)).order("feature_id") # doctest: +SKIP

quackosm/pbf_file_reader.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1117,7 +1117,7 @@ def _generate_result_file_path(
11171117
h.update(
11181118
(json.dumps(self.tags_filter or {}) + str(self.custom_sql_filter or "")).encode()
11191119
)
1120-
osm_filter_tags_hash_part = f"{h.hexdigest()}{keep_all_tags_part}"
1120+
osm_filter_tags_hash_part = f"{h.hexdigest()[:8]}{keep_all_tags_part}"
11211121

11221122
clipping_geometry_hash_part = self._generate_geometry_hash()
11231123

@@ -1127,7 +1127,7 @@ def _generate_result_file_path(
11271127
if filter_osm_ids:
11281128
h = hashlib.new("sha256")
11291129
h.update(json.dumps(sorted(set(filter_osm_ids))).encode())
1130-
filter_osm_ids_hash_part = f"_{h.hexdigest()}"
1130+
filter_osm_ids_hash_part = f"_{h.hexdigest()[:8]}"
11311131

11321132
if save_as_wkt:
11331133
result_file_name = (
@@ -1151,7 +1151,7 @@ def _generate_result_file_path_from_geometry(
11511151
h.update(
11521152
(json.dumps(self.tags_filter or {}) + str(self.custom_sql_filter or "")).encode()
11531153
)
1154-
osm_filter_tags_hash_part = f"{h.hexdigest()}{keep_all_tags_part}"
1154+
osm_filter_tags_hash_part = f"{h.hexdigest()[:8]}{keep_all_tags_part}"
11551155

11561156
clipping_geometry_hash_part = self._generate_geometry_hash()
11571157

@@ -1161,7 +1161,7 @@ def _generate_result_file_path_from_geometry(
11611161
if filter_osm_ids:
11621162
h = hashlib.new("sha256")
11631163
h.update(json.dumps(sorted(set(filter_osm_ids))).encode())
1164-
filter_osm_ids_hash_part = f"_{h.hexdigest()}"
1164+
filter_osm_ids_hash_part = f"_{h.hexdigest()[:8]}"
11651165

11661166
if save_as_wkt:
11671167
result_file_name = (
@@ -1200,7 +1200,7 @@ def _generate_geometry_hash(self) -> str:
12001200
if oriented_geometry is not None:
12011201
h = hashlib.new("sha256")
12021202
h.update(wktlib.dumps(oriented_geometry).encode())
1203-
clipping_geometry_hash_part = h.hexdigest()
1203+
clipping_geometry_hash_part = h.hexdigest()[:8]
12041204

12051205
return clipping_geometry_hash_part
12061206

@@ -2989,11 +2989,7 @@ def _group_ways_with_polars(current_ways_group_path: Path, current_destination_p
29892989
hive_partitioning=False,
29902990
).group_by("id").agg(pl.col("point").sort_by(pl.col("ref_idx"))).rename(
29912991
{"point": "linestring"}
2992-
).collect(
2993-
streaming=True
2994-
).write_parquet(
2995-
current_destination_path
2996-
)
2992+
).collect(streaming=True).write_parquet(current_destination_path)
29972993

29982994

29992995
def _drop_duplicates_in_pyarrow_table(

0 commit comments

Comments
 (0)