Skip to content

Commit e8eaa04

Browse files
authored
Merge pull request #8 from geoarrow/integration-test-parquet
Add tests for row group statistics of Parquet example files
2 parents 3feaaa0 + a97175e commit e8eaa04

File tree

64 files changed

+688
-501
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

64 files changed

+688
-501
lines changed

collect_parquet_builtin.py

Lines changed: 4 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,10 @@
11
from pathlib import Path
22
import json
33

4-
# Currently requires a special branch of pyarrow with extra GeoArrow features
5-
# https://github.com/apache/arrow/compare/main...paleolimbot:arrow:parquet-geo-write-files-from-geoarrow
4+
# Currently requires nightly pyarrow
65
import pyarrow as pa
76
from pyarrow import parquet
87
import geoarrow.pyarrow as ga
9-
from geoarrow.pyarrow import io
108

119
here = Path(__file__).parent
1210

@@ -26,45 +24,22 @@ def list_wkb_files():
2624
return wkb_files
2725

2826

29-
def convert_arrow_wkb_to_parquet(
30-
src, dst, compression, write_geoparquet_metadata=False
31-
):
32-
# Calculate the basic GeoParquet metadata to add to the file for readers that
33-
# don't support this type (but can return the storage of an unknown logical type)
34-
with pa.ipc.open_stream(src) as reader:
35-
schema = reader.schema
36-
37-
if write_geoparquet_metadata:
38-
columns = io._geoparquet_columns_from_schema(
39-
schema,
40-
geometry_columns=["geometry"],
41-
primary_geometry_column=["geometry"],
42-
add_geometry_types=False,
43-
)
44-
geo_metadata = {
45-
"version": "1.0.0",
46-
"primary_column": "geometry",
47-
"columns": columns,
48-
}
49-
27+
def convert_arrow_wkb_to_parquet(src, dst, compression):
5028
# Maintain chunking from IPC into Parquet so that the statistics
5129
# are theoretically the same.
5230
with (
5331
pa.ipc.open_stream(src) as reader,
5432
parquet.ParquetWriter(
5533
dst,
5634
reader.schema,
57-
store_schema=write_geoparquet_metadata,
35+
store_schema=False,
5836
compression=compression,
5937
) as writer,
6038
):
6139
print(f"Reading {src}")
6240
for batch in reader:
6341
writer.write_batch(batch)
6442

65-
if write_geoparquet_metadata:
66-
writer.add_key_value_metadata({"geo": json.dumps(geo_metadata)})
67-
6843
print(f"Wrote {dst}")
6944

7045

@@ -87,48 +62,18 @@ def check_parquet_file(src, dst):
8762
return True
8863

8964

90-
def generate_parquet_testing_files(wkb_files, parquet_testing_path):
91-
successful_checks = 0
92-
written_files = 0
93-
for path in wkb_files:
94-
# Skip big files + one CRS example that includes a non-PROJJSON value
95-
# on purpose (allowed in GeoArrow), which is rightly rejected
96-
# by Parquet
97-
name = path.name.replace("_wkb.arrows", "")
98-
if (
99-
"microsoft-buildings" in name
100-
or ("ns-water" in name and name != "ns-water_water-point")
101-
or "wkt2" in name
102-
):
103-
print(f"Skipping {name}")
104-
continue
105-
106-
dst = parquet_testing_path / f"{name}.parquet"
107-
convert_arrow_wkb_to_parquet(path, dst, compression="none")
108-
written_files += 1
109-
successful_checks += check_parquet_file(path, dst)
110-
111-
if successful_checks != written_files:
112-
raise ValueError("Some checks failed when generating testing files")
113-
114-
11565
def generate_geoarrow_data_parquet_files(wkb_files):
11666
successful_checks = 0
11767
written_files = 0
11868
for path in wkb_files:
11969
name = path.name.replace("_wkb.arrows", "")
120-
if "wkt2" in name:
121-
print(f"Skipping {name}")
122-
continue
12370
if name.startswith("ns-water") or name.startswith("microsoft"):
12471
compression = "zstd"
12572
else:
12673
compression = "none"
12774

12875
dst = path.parent / f"{name}.parquet"
129-
convert_arrow_wkb_to_parquet(
130-
path, dst, compression=compression, write_geoparquet_metadata=True
131-
)
76+
convert_arrow_wkb_to_parquet(path, dst, compression=compression)
13277
written_files += 1
13378
successful_checks += check_parquet_file(path, dst)
13479

@@ -139,5 +84,4 @@ def generate_geoarrow_data_parquet_files(wkb_files):
13984
if __name__ == "__main__":
14085
parquet_testing_path = here.parent / "parquet-testing" / "data" / "geospatial"
14186
wkb_files = list_wkb_files()
142-
generate_parquet_testing_files(wkb_files, parquet_testing_path)
14387
generate_geoarrow_data_parquet_files(wkb_files)

example-crs/README.md

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@ All versions of Natural Earth map data redistributed from this repository are in
1717

1818
## Files
1919

20-
- vermont-crs84 ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-crs84_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-crs84_geo.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-crs84.fgb), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-crs84.parquet))
21-
- vermont-4326 ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-4326_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-4326_geo.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-4326.fgb), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-4326.parquet))
22-
- vermont-utm ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-utm_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-utm_geo.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-utm.fgb), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-utm.parquet))
23-
- vermont-custom ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-custom_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-custom_geo.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-custom.fgb), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-custom.parquet))
24-
- vermont-crs84-wkt2 ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-crs84-wkt2_wkb.arrows))
25-
- vermont-crs84-auth-code ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-crs84-auth-code_wkb.arrows), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-crs84-auth-code.parquet))
26-
- vermont-crs84-unknown ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-crs84-unknown_wkb.arrows), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0-rc6/example-crs/files/example-crs_vermont-crs84-unknown.parquet))
20+
- vermont-crs84 ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-crs84_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-crs84_geo.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-crs84.fgb), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-crs84.parquet))
21+
- vermont-4326 ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-4326_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-4326_geo.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-4326.fgb), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-4326.parquet))
22+
- vermont-utm ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-utm_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-utm_geo.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-utm.fgb), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-utm.parquet))
23+
- vermont-custom ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-custom_wkb.arrows), [geoparquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-custom_geo.parquet), [fgb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-custom.fgb), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-custom.parquet))
24+
- vermont-crs84-wkt2 ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-crs84-wkt2_wkb.arrows), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-crs84-wkt2.parquet))
25+
- vermont-crs84-auth-code ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-crs84-auth-code_wkb.arrows), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-crs84-auth-code.parquet))
26+
- vermont-crs84-unknown ([arrows/wkb](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-crs84-unknown_wkb.arrows), [parquet](https://raw.githubusercontent.com/geoarrow/geoarrow-data/v0.2.0/example-crs/files/example-crs_vermont-crs84-unknown.parquet))
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
-6.12 KB
Binary file not shown.

example-crs/manifest.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ files:
1111
- name: vermont-utm
1212
- name: vermont-custom
1313
- name: vermont-crs84-wkt2
14-
skip_format: [geoparquet, fgb, parquet]
14+
skip_format: [geoparquet, fgb]
1515
- name: vermont-crs84-auth-code
1616
skip_format: [geoparquet, fgb]
1717
- name: vermont-crs84-unknown

0 commit comments

Comments
 (0)