5
5
6
6
import geopandas as gpd
7
7
import numpy as np
8
+ import sparse
8
9
import xarray as xr
9
10
from exactextract import exact_extract
10
11
from exactextract .raster import NumPyRasterSource
16
17
import dask_geopandas
17
18
18
19
MIN_CHUNK_SIZE = 2 # exactextract cannot handle arrays of size 1.
20
+ GEOM_AXIS = 0
21
+ Y_AXIS = 1
22
+ X_AXIS = 2
23
+
24
+ DEFAULT_STRATEGY = "feature-sequential"
25
+ Strategy = Literal ["feature-sequential" , "raster-sequential" , "raster-parallel" ]
26
+ CoverageWeights = Literal ["area_spherical_m2" , "area_cartesian" , "area_spherical_km2" , "fraction" , "none" ]
19
27
20
28
__all__ = [
21
29
"coverage" ,
22
30
]
23
31
24
32
25
- def get_dtype (coverage_weight , geometries ):
26
- if coverage_weight .lower () == "fraction" :
27
- dtype = "float64"
28
- elif coverage_weight .lower () == "none" :
29
- dtype = np .min_scalar_type (len (geometries ))
30
- else :
31
- raise NotImplementedError
32
- return dtype
33
-
34
-
35
- def np_coverage (
36
- x : np .ndarray ,
37
- y : np .ndarray ,
38
- * ,
39
- geometries : gpd .GeoDataFrame ,
40
- coverage_weight : Literal ["fraction" , "none" ] = "fraction" ,
41
- ) -> np .ndarray [Any , Any ]:
42
- """
43
- Parameters
44
- ----------
45
-
46
- """
33
+ def xy_to_raster_source (x : np .ndarray , y : np .ndarray , * , srs_wkt : str | None ) -> NumPyRasterSource :
47
34
assert x .ndim == 1
48
35
assert y .ndim == 1
49
36
50
- dtype = get_dtype (coverage_weight , geometries )
51
-
52
37
xsize = x .size
53
38
ysize = y .size
54
39
@@ -67,32 +52,90 @@ def np_coverage(
67
52
xmax = x .max () + dx1 ,
68
53
ymin = y .min () - dy0 ,
69
54
ymax = y .max () + dy1 ,
70
- srs_wkt = geometries . crs . to_wkt () ,
55
+ srs_wkt = srs_wkt ,
71
56
)
57
+
58
+ return raster
59
+
60
+
61
+ def get_dtype (coverage_weight : CoverageWeights , geometries ):
62
+ if coverage_weight .lower () == "none" :
63
+ dtype = np .uint8
64
+ else :
65
+ dtype = np .float64
66
+ return dtype
67
+
68
+
69
+ def np_coverage (
70
+ x : np .ndarray ,
71
+ y : np .ndarray ,
72
+ * ,
73
+ geometries : gpd .GeoDataFrame ,
74
+ strategy : Strategy = DEFAULT_STRATEGY ,
75
+ coverage_weight : CoverageWeights = "fraction" ,
76
+ ) -> np .ndarray [Any , Any ]:
77
+ """
78
+ Parameters
79
+ ----------
80
+
81
+ """
82
+ dtype = get_dtype (coverage_weight , geometries )
83
+
84
+ if len (geometries .columns ) > 1 :
85
+ raise ValueError ("Require a single geometries column or a GeoSeries." )
86
+
87
+ shape = (y .size , x .size )
88
+ raster = xy_to_raster_source (x , y , srs_wkt = geometries .crs .to_wkt ())
72
89
result = exact_extract (
73
90
rast = raster ,
74
91
vec = geometries ,
75
92
ops = ["cell_id" , f"coverage(coverage_weight={ coverage_weight } )" ],
76
93
output = "pandas" ,
77
94
# max_cells_in_memory=2*x.size * y.size
78
95
)
79
- out = np .zeros ((len (geometries ), * shape ), dtype = dtype )
80
- # TODO: vectorized assignment?
96
+
97
+ lens = np .vectorize (len )(result .cell_id .values )
98
+ nnz = np .sum (lens )
99
+
100
+ # Notes on GCXS vs COO, For N data points in 263 geoms by 4000 x by 4000 y
101
+ # 1. GCXS cannot compress _all_ axes. This is relevant here.
102
+ # 2. GCXS: indptr is 4000*4000 + 1, N per indices & N per data
103
+ # 3. COO: 4*N
104
+ # It is not obvious that there is much improvement to GCXS at least currently
105
+ geom_idxs = np .empty ((nnz ,), dtype = np .int64 )
106
+ xy_idxs = np .empty ((nnz ,), dtype = np .int64 )
107
+ data = np .empty ((nnz ,), dtype = dtype )
108
+
109
+ off = 0
81
110
for i in range (len (geometries )):
82
- res = result .loc [i ]
83
- # indices = np.unravel_index(res.cell_id, shape=shape)
84
- # out[(i, *indices)] = offset + i + 1 # 0 is the fill value
85
- out [i , ...].flat [res .cell_id ] = res .coverage
86
- return out
111
+ cell_id = result .cell_id .values [i ]
112
+ if cell_id .size == 0 :
113
+ continue
114
+ geom_idxs [off : off + cell_id .size ] = i
115
+ xy_idxs [off : off + cell_id .size ] = cell_id
116
+ data [off : off + cell_id .size ] = result .coverage .values [i ]
117
+ off += cell_id .size
118
+ return sparse .COO (
119
+ (geom_idxs , * np .unravel_index (xy_idxs , shape = shape )),
120
+ data = data ,
121
+ sorted = True ,
122
+ fill_value = 0 ,
123
+ shape = (len (geometries ), * shape ),
124
+ )
87
125
88
126
89
127
def coverage_np_dask_wrapper (
90
- x : np .ndarray , y : np .ndarray , geom_array : np .ndarray , coverage_weight , crs
128
+ geom_array : np .ndarray ,
129
+ x : np .ndarray ,
130
+ y : np .ndarray ,
131
+ coverage_weight : CoverageWeights ,
132
+ strategy : Strategy ,
133
+ crs ,
91
134
) -> np .ndarray :
92
135
return np_coverage (
93
- x = x ,
94
- y = y ,
95
- geometries = gpd .GeoDataFrame (geometry = geom_array , crs = crs ),
136
+ x = x . squeeze ( axis = ( GEOM_AXIS , Y_AXIS )) ,
137
+ y = y . squeeze ( axis = ( GEOM_AXIS , X_AXIS )) ,
138
+ geometries = gpd .GeoDataFrame (geometry = geom_array . squeeze ( axis = ( X_AXIS , Y_AXIS )) , crs = crs ),
96
139
coverage_weight = coverage_weight ,
97
140
)
98
141
@@ -102,27 +145,29 @@ def dask_coverage(
102
145
y : dask .array .Array ,
103
146
* ,
104
147
geom_array : dask .array .Array ,
105
- coverage_weight : Literal ["fraction" , "none" ] = "fraction" ,
148
+ coverage_weight : CoverageWeights = "fraction" ,
149
+ strategy : Strategy = DEFAULT_STRATEGY ,
106
150
crs : Any ,
107
151
) -> dask .array .Array :
108
152
import dask .array
109
153
110
- if any (c == 1 for c in x .chunks ) or any (c == 1 for c in y .chunks ):
154
+ if any (c == 1 for c in x .chunks [ 0 ] ) or any (c == 1 for c in y .chunks [ 0 ] ):
111
155
raise ValueError ("exactextract does not support a chunksize of 1. Please rechunk to avoid this" )
112
156
113
- return dask .array .blockwise (
157
+ out = dask .array .map_blocks (
114
158
coverage_np_dask_wrapper ,
115
- "gji" ,
116
- x ,
117
- "i" ,
118
- y ,
119
- "j" ,
120
- geom_array ,
121
- "g" ,
159
+ geom_array [:, np .newaxis , np .newaxis ],
160
+ x [np .newaxis , np .newaxis , :],
161
+ y [np .newaxis , :, np .newaxis ],
122
162
crs = crs ,
123
163
coverage_weight = coverage_weight ,
124
- dtype = get_dtype (coverage_weight , geom_array ),
164
+ strategy = strategy ,
165
+ chunks = (* geom_array .chunks , * y .chunks , * x .chunks ),
166
+ meta = sparse .COO (
167
+ [], data = np .array ([], dtype = get_dtype (coverage_weight , geom_array )), shape = (0 , 0 , 0 ), fill_value = 0
168
+ ),
125
169
)
170
+ return out
126
171
127
172
128
173
def coverage (
@@ -131,7 +176,8 @@ def coverage(
131
176
* ,
132
177
xdim = "x" ,
133
178
ydim = "y" ,
134
- coverage_weight = "fraction" ,
179
+ strategy : Strategy = "feature-sequential" ,
180
+ coverage_weight : CoverageWeights = "fraction" ,
135
181
) -> xr .DataArray :
136
182
"""
137
183
Returns "coverage" fractions for each pixel for each geometry calculated using exactextract.
@@ -163,35 +209,62 @@ def coverage(
163
209
y = obj [ydim ].data ,
164
210
geometries = geometries ,
165
211
coverage_weight = coverage_weight ,
212
+ strategy = strategy ,
166
213
)
167
214
geom_array = geometries .to_numpy ().squeeze (axis = 1 )
168
215
else :
169
- from dask .array import from_array
216
+ from dask .array import Array , from_array
170
217
171
218
geom_dask_array = geometries_as_dask_array (geometries )
219
+ if not isinstance (obj [xdim ].data , Array ):
220
+ dask_x = from_array (obj [xdim ].data , chunks = obj .chunksizes .get (xdim , - 1 ))
221
+ else :
222
+ dask_x = obj [xdim ].data
223
+
224
+ if not isinstance (obj [ydim ].data , Array ):
225
+ dask_y = from_array (obj [ydim ].data , chunks = obj .chunksizes .get (ydim , - 1 ))
226
+ else :
227
+ dask_y = obj [ydim ].data
228
+
172
229
out = dask_coverage (
173
- x = from_array ( obj [ xdim ]. data , chunks = obj . chunksizes . get ( xdim , - 1 )) ,
174
- y = from_array ( obj [ ydim ]. data , chunks = obj . chunksizes . get ( ydim , - 1 )) ,
230
+ x = dask_x ,
231
+ y = dask_y ,
175
232
geom_array = geom_dask_array ,
176
233
crs = geometries .crs ,
177
234
coverage_weight = coverage_weight ,
235
+ strategy = strategy ,
178
236
)
179
237
if isinstance (geometries , gpd .GeoDataFrame ):
180
238
geom_array = geometries .to_numpy ().squeeze (axis = 1 )
181
239
else :
182
240
geom_array = geom_dask_array
183
241
184
- coverage = xr .DataArray (
185
- dims = ("geometry" , ydim , xdim ),
186
- data = out ,
187
- coords = xr .Coordinates (
188
- coords = {
189
- xdim : obj .coords [xdim ],
190
- ydim : obj .coords [ydim ],
191
- "spatial_ref" : obj .spatial_ref ,
192
- "geometry" : geom_array ,
193
- },
194
- indexes = {xdim : obj .xindexes [xdim ], ydim : obj .xindexes [ydim ]},
195
- ),
242
+ name = "coverage"
243
+ attrs = {}
244
+ if "area" in coverage_weight :
245
+ name = "area"
246
+ if "_m2" in coverage_weight or coverage_weight == "area_cartesian" :
247
+ attrs ["long_name" ] = coverage_weight .removesuffix ("_m2" )
248
+ attrs ["units" ] = "m2"
249
+ elif "_km2" in coverage_weight :
250
+ attrs ["long_name" ] = coverage_weight .removesuffix ("_km2" )
251
+ attrs ["units" ] = "km2"
252
+
253
+ xy_coords = [
254
+ xr .Coordinates .from_xindex (obj .xindexes .get (dim ))
255
+ for dim in (xdim , ydim )
256
+ if obj .xindexes .get (dim ) is not None
257
+ ]
258
+ coords = xr .Coordinates (
259
+ coords = {
260
+ "spatial_ref" : obj .spatial_ref ,
261
+ "geometry" : geom_array ,
262
+ },
263
+ indexes = {},
196
264
)
265
+ if xy_coords :
266
+ for c in xy_coords :
267
+ coords = coords .merge (c )
268
+ coords = coords .coords
269
+ coverage = xr .DataArray (dims = ("geometry" , ydim , xdim ), data = out , coords = coords , attrs = attrs , name = name )
197
270
return coverage
0 commit comments