Skip to content

Commit d577a98

Browse files
RubenImhoffdnerini
andauthored
Add resample distributions function for probability matching (#390)
* feat: first attempt to add resample distributions function for probability matching * fix: remove erroneous import * feat: add resampling option to probability matching step * fix: make sure there are no nans in the resampling function * fix: fill up space outside radar domain with model data and change function names * docs: add info about the weights * fix: add requested changes from review * fix: add tests and make sure probability stays within bounds * Better docstrings * Minor code improvements * Add test * Run black * Handle nans outside of resampling method --------- Co-authored-by: Daniele Nerini <daniele.nerini@gmail.com>
1 parent 3e433ff commit d577a98

File tree

4 files changed

+180
-40
lines changed

4 files changed

+180
-40
lines changed

pysteps/blending/steps.py

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ def forecast(
9090
conditional=False,
9191
probmatching_method="cdf",
9292
mask_method="incremental",
93+
resample_distribution=True,
9394
smooth_radar_mask_range=0,
9495
callback=None,
9596
return_output=True,
@@ -205,25 +206,32 @@ def forecast(
205206
If set to True, compute the statistics of the precipitation field
206207
conditionally by excluding pixels where the values are below the threshold
207208
precip_thr.
209+
probmatching_method: {'cdf','mean',None}, optional
210+
Method for matching the statistics of the forecast field with those of
211+
the most recently observed one. 'cdf'=map the forecast CDF to the observed
212+
one, 'mean'=adjust only the conditional mean value of the forecast field
213+
in precipitation areas, None=no matching applied. Using 'mean' requires
214+
that mask_method is not None.
208215
mask_method: {'obs','incremental',None}, optional
209216
The method to use for masking no precipitation areas in the forecast field.
210217
The masked pixels are set to the minimum value of the observations.
211218
'obs' = apply precip_thr to the most recently observed precipitation intensity
212219
field, 'incremental' = iteratively buffer the mask with a certain rate
213220
(currently it is 1 km/min), None=no masking.
221+
resample_distribution: bool, optional
222+
Method to resample the distribution from the extrapolation and NWP cascade as input
223+
for the probability matching. Not resampling these distributions may lead to losing
224+
some extremes when the weight of both the extrapolation and NWP cascade is similar.
225+
Defaults to True.
214226
smooth_radar_mask_range: int, Default is 0.
215227
Method to smooth the transition between the radar-NWP-noise blend and the NWP-noise
216228
blend near the edge of the radar domain (radar mask), where the radar data is either
217-
not present anymore or is not reliable. If set to 0 (grid cells), this generates a normal forecast without smoothing. To create a smooth mask, this range
218-
should be a positive value, representing a buffer band of a number of pixels
219-
by which the mask is cropped and smoothed. The smooth radar mask removes
220-
the hard edges between NWP and radar in the final blended product. Typically, a value between 50 and 100 km can be used. 80 km generally gives good results.
221-
probmatching_method: {'cdf','mean',None}, optional
222-
Method for matching the statistics of the forecast field with those of
223-
the most recently observed one. 'cdf'=map the forecast CDF to the observed
224-
one, 'mean'=adjust only the conditional mean value of the forecast field
225-
in precipitation areas, None=no matching applied. Using 'mean' requires
226-
that mask_method is not None.
229+
not present anymore or is not reliable. If set to 0 (grid cells), this generates a
230+
normal forecast without smoothing. To create a smooth mask, this range should be a
231+
positive value, representing a buffer band of a number of pixels by which the mask
232+
is cropped and smoothed. The smooth radar mask removes the hard edges between NWP
233+
and radar in the final blended product. Typically, a value between 50 and 100 km
234+
can be used. 80 km generally gives good results.
227235
callback: function, optional
228236
Optional function that is called after computation of each time step of
229237
the nowcast. The function takes one argument: a three-dimensional array
@@ -1404,7 +1412,6 @@ def worker(j):
14041412
# latest extrapolated radar rainfall field blended with the
14051413
# nwp model(s) rainfall forecast fields as 'benchmark'.
14061414

1407-
# TODO: Check probability matching method
14081415
# 8.7.1 first blend the extrapolated rainfall field (the field
14091416
# that is only used for post-processing steps) with the NWP
14101417
# rainfall forecast for this time step using the weights
@@ -1538,19 +1545,39 @@ def worker(j):
15381545
# Set to min value outside of mask
15391546
R_f_new[~MASK_prec_] = R_cmin
15401547

1548+
# If probmatching_method is not None, resample the distribution from
1549+
# both the extrapolation cascade and the model (NWP) cascade and use
1550+
# that for the probability matching
1551+
if probmatching_method is not None and resample_distribution:
1552+
# deal with missing values
1553+
arr1 = R_pm_ep[t_index]
1554+
arr2 = precip_models_pm_temp[j]
1555+
arr2 = np.where(np.isnan(arr2), np.nanmin(arr2), arr2)
1556+
arr1 = np.where(np.isnan(arr1), arr2, arr1)
1557+
# resample weights based on cascade level 2
1558+
R_pm_resampled = probmatching.resample_distributions(
1559+
first_array=arr1,
1560+
second_array=arr2,
1561+
probability_first_array=weights_pm_normalized[0],
1562+
)
1563+
else:
1564+
R_pm_resampled = R_pm_blended.copy()
1565+
15411566
if probmatching_method == "cdf":
15421567
# Adjust the CDF of the forecast to match the most recent
15431568
# benchmark rainfall field (R_pm_blended). If the forecast
15441569
if np.any(np.isfinite(R_f_new)):
15451570
R_f_new = probmatching.nonparam_match_empirical_cdf(
1546-
R_f_new, R_pm_blended
1571+
R_f_new, R_pm_resampled
15471572
)
1573+
R_pm_resampled = None
15481574
elif probmatching_method == "mean":
15491575
# Use R_pm_blended as benchmark field and
1550-
mu_0 = np.mean(R_pm_blended[R_pm_blended >= precip_thr])
1576+
mu_0 = np.mean(R_pm_resampled[R_pm_resampled >= precip_thr])
15511577
MASK = R_f_new >= precip_thr
15521578
mu_fct = np.mean(R_f_new[MASK])
15531579
R_f_new[MASK] = R_f_new[MASK] - mu_fct + mu_0
1580+
R_pm_resampled = None
15541581

15551582
R_f_out.append(R_f_new)
15561583

pysteps/postprocessing/probmatching.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
pmm_init
1414
pmm_compute
1515
shift_scale
16+
resample_distributions
1617
"""
1718

1819
import numpy as np
@@ -259,6 +260,56 @@ def _get_error(scale):
259260
return shift, scale, R.reshape(shape)
260261

261262

263+
def resample_distributions(first_array, second_array, probability_first_array):
264+
"""
265+
Merges two distributions (e.g., from the extrapolation nowcast and NWP in the blending module)
266+
to effectively combine two distributions for probability matching without losing extremes.
267+
268+
Parameters
269+
----------
270+
first_array: array_like
271+
One of the two arrays from which the distribution should be sampled (e.g., the extrapolation
272+
cascade). It must be of the same shape as `second_array`. Input must not contain NaNs.
273+
second_array: array_like
274+
One of the two arrays from which the distribution should be sampled (e.g., the NWP (model)
275+
cascade). It must be of the same shape as `first_array`.. Input must not contain NaNs.
276+
probability_first_array: float
277+
The weight that `first_array` should get (a value between 0 and 1). This determines the
278+
likelihood of selecting elements from `first_array` over `second_array`.
279+
280+
Returns
281+
-------
282+
csort: array_like
283+
The combined output distribution. This is an array of the same shape as the input arrays,
284+
where each element is chosen from either `first_array` or `second_array` based on the specified
285+
probability, and then sorted in descending order.
286+
287+
Raises
288+
------
289+
ValueError
290+
If `first_array` and `second_array` do not have the same shape or if inputs contain NaNs.
291+
"""
292+
293+
# Valide inputs
294+
if first_array.shape != second_array.shape:
295+
raise ValueError("first_array and second_array must have the same shape")
296+
if np.isnan(first_array).any() or np.isnan(second_array).any():
297+
raise ValueError("Input arrays must not contain NaNs")
298+
probability_first_array = np.clip(probability_first_array, 0.0, 1.0)
299+
300+
# Flatten and sort the arrays
301+
asort = np.sort(first_array, axis=None)[::-1]
302+
bsort = np.sort(second_array, axis=None)[::-1]
303+
n = asort.shape[0]
304+
305+
# Resample the distributions
306+
idxsamples = np.random.binomial(1, probability_first_array, n).astype(bool)
307+
csort = np.where(idxsamples, asort, bsort)
308+
csort = np.sort(csort)[::-1]
309+
310+
return csort
311+
312+
262313
def _invfunc(y, fx, fy):
263314
if len(y) == 0:
264315
return np.array([])

pysteps/tests/test_blending_steps.py

Lines changed: 32 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -8,37 +8,40 @@
88

99

1010
steps_arg_values = [
11-
(1, 3, 4, 8, None, None, False, "spn", True, 4, False, False, 0),
12-
(1, 3, 4, 8, "obs", None, False, "spn", True, 4, False, False, 0),
13-
(1, 3, 4, 8, "incremental", None, False, "spn", True, 4, False, False, 0),
14-
(1, 3, 4, 8, None, "mean", False, "spn", True, 4, False, False, 0),
15-
(1, 3, 4, 8, None, "cdf", False, "spn", True, 4, False, False, 0),
16-
(1, 3, 4, 8, "incremental", "cdf", False, "spn", True, 4, False, False, 0),
17-
(1, 3, 4, 6, "incremental", "cdf", False, "bps", True, 4, False, False, 0),
18-
(1, 3, 4, 6, "incremental", "cdf", False, "bps", False, 4, False, False, 0),
19-
(1, 3, 4, 9, "incremental", "cdf", False, "spn", True, 4, False, False, 0),
20-
(2, 3, 10, 8, "incremental", "cdf", False, "spn", True, 10, False, False, 0),
21-
(5, 3, 5, 8, "incremental", "cdf", False, "spn", True, 5, False, False, 0),
22-
(1, 10, 1, 8, "incremental", "cdf", False, "spn", True, 1, False, False, 0),
23-
(2, 3, 2, 8, "incremental", "cdf", True, "spn", True, 2, False, False, 0),
24-
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, False, 0),
11+
(1, 3, 4, 8, None, None, False, "spn", True, 4, False, False, 0, False),
12+
(1, 3, 4, 8, "obs", None, False, "spn", True, 4, False, False, 0, False),
13+
(1, 3, 4, 8, "incremental", None, False, "spn", True, 4, False, False, 0, False),
14+
(1, 3, 4, 8, None, "mean", False, "spn", True, 4, False, False, 0, False),
15+
(1, 3, 4, 8, None, "mean", False, "spn", True, 4, False, False, 0, True),
16+
(1, 3, 4, 8, None, "cdf", False, "spn", True, 4, False, False, 0, False),
17+
(1, 3, 4, 8, "incremental", "cdf", False, "spn", True, 4, False, False, 0, False),
18+
(1, 3, 4, 6, "incremental", "cdf", False, "bps", True, 4, False, False, 0, False),
19+
(1, 3, 4, 6, "incremental", "cdf", False, "bps", False, 4, False, False, 0, False),
20+
(1, 3, 4, 6, "incremental", "cdf", False, "bps", False, 4, False, False, 0, True),
21+
(1, 3, 4, 9, "incremental", "cdf", False, "spn", True, 4, False, False, 0, False),
22+
(2, 3, 10, 8, "incremental", "cdf", False, "spn", True, 10, False, False, 0, False),
23+
(5, 3, 5, 8, "incremental", "cdf", False, "spn", True, 5, False, False, 0, False),
24+
(1, 10, 1, 8, "incremental", "cdf", False, "spn", True, 1, False, False, 0, False),
25+
(2, 3, 2, 8, "incremental", "cdf", True, "spn", True, 2, False, False, 0, False),
26+
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, False, 0, False),
2527
# Test the case where the radar image contains no rain.
26-
(1, 3, 6, 8, None, None, False, "spn", True, 6, True, False, 0),
27-
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 0),
28+
(1, 3, 6, 8, None, None, False, "spn", True, 6, True, False, 0, False),
29+
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 0, False),
30+
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 0, True),
2831
# Test the case where the NWP fields contain no rain.
29-
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, True, 0),
30-
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, False, True, 0),
32+
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, True, 0, False),
33+
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, False, True, 0, True),
3134
# Test the case where both the radar image and the NWP fields contain no rain.
32-
(1, 3, 6, 8, None, None, False, "spn", True, 6, True, True, 0),
33-
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, True, 0),
34-
(5, 3, 5, 6, "obs", "mean", True, "spn", True, 5, True, True, 0),
35+
(1, 3, 6, 8, None, None, False, "spn", True, 6, True, True, 0, False),
36+
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, True, 0, False),
37+
(5, 3, 5, 6, "obs", "mean", True, "spn", True, 5, True, True, 0, False),
3538
# Test for smooth radar mask
36-
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, False, 80),
37-
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, False, False, 80),
38-
(5, 3, 5, 6, "obs", "mean", False, "spn", False, 5, False, False, 80),
39-
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, True, 80),
40-
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 80),
41-
(5, 3, 5, 6, "obs", "mean", False, "spn", False, 5, True, True, 80),
39+
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, False, 80, False),
40+
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, False, False, 80, False),
41+
(5, 3, 5, 6, "obs", "mean", False, "spn", False, 5, False, False, 80, False),
42+
(1, 3, 6, 8, None, None, False, "spn", True, 6, False, True, 80, False),
43+
(5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 80, True),
44+
(5, 3, 5, 6, "obs", "mean", False, "spn", False, 5, True, True, 80, False),
4245
]
4346

4447
steps_arg_names = (
@@ -55,6 +58,7 @@
5558
"zero_radar",
5659
"zero_nwp",
5760
"smooth_radar_mask_range",
61+
"resample_distribution",
5862
)
5963

6064

@@ -73,6 +77,7 @@ def test_steps_blending(
7377
zero_radar,
7478
zero_nwp,
7579
smooth_radar_mask_range,
80+
resample_distribution,
7681
):
7782
pytest.importorskip("cv2")
7883

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
import numpy as np
2+
import pytest
3+
from pysteps.postprocessing.probmatching import resample_distributions
4+
5+
6+
class TestResampleDistributions:
7+
8+
@pytest.fixture(autouse=True)
9+
def setup(self):
10+
# Set the seed for reproducibility
11+
np.random.seed(42)
12+
13+
def test_valid_inputs(self):
14+
first_array = np.array([1, 3, 5, 7, 9])
15+
second_array = np.array([2, 4, 6, 8, 10])
16+
probability_first_array = 0.6
17+
result = resample_distributions(
18+
first_array, second_array, probability_first_array
19+
)
20+
expected_result = np.array([9, 8, 6, 3, 1]) # Expected result based on the seed
21+
assert result.shape == first_array.shape
22+
assert np.array_equal(result, expected_result)
23+
24+
def test_probability_zero(self):
25+
first_array = np.array([1, 3, 5, 7, 9])
26+
second_array = np.array([2, 4, 6, 8, 10])
27+
probability_first_array = 0.0
28+
result = resample_distributions(
29+
first_array, second_array, probability_first_array
30+
)
31+
assert np.array_equal(result, np.sort(second_array)[::-1])
32+
33+
def test_probability_one(self):
34+
first_array = np.array([1, 3, 5, 7, 9])
35+
second_array = np.array([2, 4, 6, 8, 10])
36+
probability_first_array = 1.0
37+
result = resample_distributions(
38+
first_array, second_array, probability_first_array
39+
)
40+
assert np.array_equal(result, np.sort(first_array)[::-1])
41+
42+
def test_nan_in_inputs(self):
43+
array_with_nan = np.array([1, 3, np.nan, 7, 9])
44+
array_without_nan = np.array([2, 4, 6, 8, 10])
45+
probability_first_array = 0.6
46+
with pytest.raises(ValueError, match="Input arrays must not contain NaNs"):
47+
resample_distributions(
48+
array_with_nan, array_without_nan, probability_first_array
49+
)
50+
with pytest.raises(ValueError, match="Input arrays must not contain NaNs"):
51+
resample_distributions(
52+
array_without_nan, array_with_nan, probability_first_array
53+
)
54+
with pytest.raises(ValueError, match="Input arrays must not contain NaNs"):
55+
resample_distributions(
56+
array_with_nan, array_with_nan, probability_first_array
57+
)

0 commit comments

Comments
 (0)