1
1
from pathlib import Path
2
2
import json
3
3
4
- # Currently requires a special branch of pyarrow with extra GeoArrow features
5
- # https://github.com/apache/arrow/compare/main...paleolimbot:arrow:parquet-geo-write-files-from-geoarrow
4
+ # Currently requires nightly pyarrow
6
5
import pyarrow as pa
7
6
from pyarrow import parquet
8
7
import geoarrow .pyarrow as ga
9
- from geoarrow .pyarrow import io
10
8
11
9
here = Path (__file__ ).parent
12
10
@@ -26,45 +24,22 @@ def list_wkb_files():
26
24
return wkb_files
27
25
28
26
29
def convert_arrow_wkb_to_parquet(src, dst, compression):
    """Copy an Arrow IPC stream into a Parquet file, batch by batch.

    Args:
        src: Arrow IPC stream to read (anything ``pa.ipc.open_stream``
            accepts).
        dst: Destination handed to ``parquet.ParquetWriter``.
        compression: Codec name forwarded to the writer; callers in this
            script pass ``"zstd"`` or ``"none"``.
    """
    # Maintain chunking from IPC into Parquet so that the statistics
    # are theoretically the same.
    with (
        pa.ipc.open_stream(src) as reader,
        parquet.ParquetWriter(
            dst,
            reader.schema,
            # Do not embed the serialized Arrow schema in the file metadata.
            store_schema=False,
            compression=compression,
        ) as writer,
    ):
        print(f"Reading {src}")
        # One write per incoming record batch keeps the source chunking.
        for batch in reader:
            writer.write_batch(batch)

    # Reported once the writer context has exited.
    print(f"Wrote {dst}")
69
44
70
45
@@ -87,48 +62,18 @@ def check_parquet_file(src, dst):
87
62
return True
88
63
89
64
90
- def generate_parquet_testing_files (wkb_files , parquet_testing_path ):
91
- successful_checks = 0
92
- written_files = 0
93
- for path in wkb_files :
94
- # Skip big files + one CRS example that includes a non-PROJJSON value
95
- # on purpose (allowed in GeoArrow), which is rightly rejected
96
- # by Parquet
97
- name = path .name .replace ("_wkb.arrows" , "" )
98
- if (
99
- "microsoft-buildings" in name
100
- or ("ns-water" in name and name != "ns-water_water-point" )
101
- or "wkt2" in name
102
- ):
103
- print (f"Skipping { name } " )
104
- continue
105
-
106
- dst = parquet_testing_path / f"{ name } .parquet"
107
- convert_arrow_wkb_to_parquet (path , dst , compression = "none" )
108
- written_files += 1
109
- successful_checks += check_parquet_file (path , dst )
110
-
111
- if successful_checks != written_files :
112
- raise ValueError ("Some checks failed when generating testing files" )
113
-
114
-
115
65
def generate_geoarrow_data_parquet_files (wkb_files ):
116
66
successful_checks = 0
117
67
written_files = 0
118
68
for path in wkb_files :
119
69
name = path .name .replace ("_wkb.arrows" , "" )
120
- if "wkt2" in name :
121
- print (f"Skipping { name } " )
122
- continue
123
70
if name .startswith ("ns-water" ) or name .startswith ("microsoft" ):
124
71
compression = "zstd"
125
72
else :
126
73
compression = "none"
127
74
128
75
dst = path .parent / f"{ name } .parquet"
129
- convert_arrow_wkb_to_parquet (
130
- path , dst , compression = compression , write_geoparquet_metadata = True
131
- )
76
+ convert_arrow_wkb_to_parquet (path , dst , compression = compression )
132
77
written_files += 1
133
78
successful_checks += check_parquet_file (path , dst )
134
79
@@ -139,5 +84,4 @@ def generate_geoarrow_data_parquet_files(wkb_files):
139
84
if __name__ == "__main__":
    # NOTE(review): parquet_testing_path is not used by the remaining call
    # below; it is kept so the name stays defined -- confirm before removing.
    parquet_testing_path = here.parent / "parquet-testing" / "data" / "geospatial"
    wkb_files = list_wkb_files()
    # Write a .parquet file next to each *_wkb.arrows source file.
    generate_geoarrow_data_parquet_files(wkb_files)
0 commit comments