Skip to content

Commit b665ca9

Browse files
authored
Modify probability matching code to allow ignoring a part of the domain (#428)
* Try not to include out-of-radar-mask values in the probability matching Error out if mask doesn't cover all nans * Fix resample_distributions in case of integers (test case) and add test. * Add extensive tests for resampling and probability matching Especially with nans / ignore mask. * Update documentation about ignore_indices. * Don't copy arrays ad nauseam in nonparam_match_empirical_cdf
1 parent b4b41a5 commit b665ca9

File tree

3 files changed

+215
-49
lines changed

3 files changed

+215
-49
lines changed

pysteps/blending/steps.py

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1538,14 +1538,12 @@ def worker(j):
15381538

15391539
# If probmatching_method is not None, resample the distribution from
15401540
# both the extrapolation cascade and the model (NWP) cascade and use
1541-
# that for the probability matching
1541+
# that for the probability matching.
15421542
if probmatching_method is not None and resample_distribution:
1543-
# deal with missing values
15441543
arr1 = R_pm_ep[t_index]
15451544
arr2 = precip_models_pm_temp[j]
1546-
arr2 = np.where(np.isnan(arr2), np.nanmin(arr2), arr2)
1547-
arr1 = np.where(np.isnan(arr1), arr2, arr1)
1548-
# resample weights based on cascade level 2
1545+
# resample weights based on cascade level 2.
1546+
# Areas where one of the fields is nan are not included.
15491547
R_pm_resampled = probmatching.resample_distributions(
15501548
first_array=arr1,
15511549
second_array=arr2,
@@ -1555,11 +1553,14 @@ def worker(j):
15551553
R_pm_resampled = R_pm_blended.copy()
15561554

15571555
if probmatching_method == "cdf":
1558-
# Adjust the CDF of the forecast to match the most recent
1559-
# benchmark rainfall field (R_pm_blended). If the forecast
1556+
# nan indices in the extrapolation nowcast
1557+
nan_indices = np.isnan(R_pm_ep[t_index])
1558+
# Adjust the CDF of the forecast to match the resampled distribution combined from
1559+
# extrapolation and model fields.
1560+
# Rainfall outside the pure extrapolation domain is not taken into account.
15601561
if np.any(np.isfinite(R_f_new)):
15611562
R_f_new = probmatching.nonparam_match_empirical_cdf(
1562-
R_f_new, R_pm_resampled
1563+
R_f_new, R_pm_resampled, nan_indices
15631564
)
15641565
R_pm_resampled = None
15651566
elif probmatching_method == "mean":

pysteps/postprocessing/probmatching.py

Lines changed: 49 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def compute_empirical_cdf(bin_edges, hist):
5252
return cdf
5353

5454

55-
def nonparam_match_empirical_cdf(initial_array, target_array):
55+
def nonparam_match_empirical_cdf(initial_array, target_array, ignore_indices=None):
5656
"""
5757
Matches the empirical CDF of the initial array with the empirical CDF
5858
of a target array. Initial ranks are conserved, but empirical distribution
@@ -64,55 +64,66 @@ def nonparam_match_empirical_cdf(initial_array, target_array):
6464
initial_array: array_like
6565
The initial array whose CDF is to be matched with the target.
6666
target_array: array_like
67-
The target array.
67+
The target array
68+
ignore_indices: array_like, optional
69+
Indices of pixels in the initial_array which are to be ignored (not
70+
rescaled) or an array of booleans with True at the pixel locations to
71+
be ignored in initial_array and False elsewhere.
72+
6873
6974
Returns
7075
-------
7176
output_array: ndarray
7277
The matched array of the same shape as the initial array.
7378
"""
7479

75-
if np.any(~np.isfinite(initial_array)):
76-
raise ValueError("initial array contains non-finite values")
77-
if np.any(~np.isfinite(target_array)):
78-
raise ValueError("target array contains non-finite values")
80+
if np.all(np.isnan(initial_array)):
81+
raise ValueError("Initial array contains only nans.")
7982
if initial_array.size != target_array.size:
8083
raise ValueError(
8184
"dimension mismatch between initial_array and target_array: "
8285
f"initial_array.shape={initial_array.shape}, target_array.shape={target_array.shape}"
8386
)
8487

85-
initial_array = np.array(initial_array)
86-
target_array = np.array(target_array)
88+
initial_array_copy = np.array(initial_array, dtype=float)
89+
target_array = np.array(target_array, dtype=float)
8790

88-
# zeros in initial array
89-
zvalue = initial_array.min()
90-
idxzeros = initial_array == zvalue
91+
# Determine zero in initial array and set nans to zero
92+
zvalue = np.nanmin(initial_array_copy)
93+
if ignore_indices is not None:
94+
initial_array_copy[ignore_indices] = zvalue
95+
# Check if there are still nans left after setting the values at ignore_indices to zero.
96+
if np.any(~np.isfinite(initial_array_copy)):
97+
raise ValueError(
98+
"Initial array contains non-finite values outside ignore_indices mask."
99+
)
91100

92-
# zeros in the target array
93-
zvalue_trg = target_array.min()
101+
idxzeros = initial_array_copy == zvalue
102+
103+
# Determine zero of target_array and set nans to zero.
104+
zvalue_trg = np.nanmin(target_array)
105+
target_array = np.where(np.isnan(target_array), zvalue_trg, target_array)
94106

95107
# adjust the fraction of rain in target distribution if the number of
96-
# nonzeros is greater than in the initial array
97-
if np.sum(target_array > zvalue_trg) > np.sum(initial_array > zvalue):
98-
war = np.sum(initial_array > zvalue) / initial_array.size
108+
# nonzeros is greater than in the initial array (the lowest values will be set to zero)
109+
if np.sum(target_array > zvalue_trg) > np.sum(initial_array_copy > zvalue):
110+
war = np.sum(initial_array_copy > zvalue) / initial_array_copy.size
99111
p = np.percentile(target_array, 100 * (1 - war))
100-
target_array = target_array.copy()
101112
target_array[target_array < p] = zvalue_trg
102113

103-
# flatten the arrays
104-
arrayshape = initial_array.shape
105-
target_array = target_array.flatten()
106-
initial_array = initial_array.flatten()
114+
# flatten the arrays without copying them
115+
arrayshape = initial_array_copy.shape
116+
target_array = target_array.reshape(-1)
117+
initial_array_copy = initial_array_copy.reshape(-1)
107118

108119
# rank target values
109120
order = target_array.argsort()
110121
ranked = target_array[order]
111122

112123
# rank initial values order
113-
orderin = initial_array.argsort()
114-
ranks = np.empty(len(initial_array), int)
115-
ranks[orderin] = np.arange(len(initial_array))
124+
orderin = initial_array_copy.argsort()
125+
ranks = np.empty(len(initial_array_copy), int)
126+
ranks[orderin] = np.arange(len(initial_array_copy))
116127

117128
# get ranked values from target and rearrange with the initial order
118129
output_array = ranked[ranks]
@@ -123,6 +134,9 @@ def nonparam_match_empirical_cdf(initial_array, target_array):
123134
# read original zeros
124135
output_array[idxzeros] = zvalue_trg
125136

137+
# Put back the original values outside the nan-mask of the target array.
138+
if ignore_indices is not None:
139+
output_array[ignore_indices] = initial_array[ignore_indices]
126140
return output_array
127141

128142

@@ -264,6 +278,7 @@ def resample_distributions(first_array, second_array, probability_first_array):
264278
"""
265279
Merges two distributions (e.g., from the extrapolation nowcast and NWP in the blending module)
266280
to effectively combine two distributions for probability matching without losing extremes.
281+
Entries for which one array has a nan will not be included from the other array either.
267282
268283
Parameters
269284
----------
@@ -287,16 +302,22 @@ def resample_distributions(first_array, second_array, probability_first_array):
287302
Raises
288303
------
289304
ValueError
290-
If `first_array` and `second_array` do not have the same shape or if inputs contain NaNs.
305+
If `first_array` and `second_array` do not have the same shape.
291306
"""
292307

293308
# Valide inputs
294309
if first_array.shape != second_array.shape:
295310
raise ValueError("first_array and second_array must have the same shape")
296-
if np.isnan(first_array).any() or np.isnan(second_array).any():
297-
raise ValueError("Input arrays must not contain NaNs")
298311
probability_first_array = np.clip(probability_first_array, 0.0, 1.0)
299312

313+
# Propagate the NaN values of the arrays to each other if there are any; convert to float to make sure this works.
314+
nanmask = np.isnan(first_array) | np.isnan(second_array)
315+
if np.any(nanmask):
316+
first_array = first_array.astype(float)
317+
first_array[nanmask] = np.nan
318+
second_array = second_array.astype(float)
319+
second_array[nanmask] = np.nan
320+
300321
# Flatten and sort the arrays
301322
asort = np.sort(first_array, axis=None)[::-1]
302323
bsort = np.sort(second_array, axis=None)[::-1]
@@ -307,6 +328,7 @@ def resample_distributions(first_array, second_array, probability_first_array):
307328
csort = np.where(idxsamples, asort, bsort)
308329
csort = np.sort(csort)[::-1]
309330

331+
# Return the resampled array in descending order (starting with the nan values)
310332
return csort
311333

312334

pysteps/tests/test_postprocessing_probmatching.py

Lines changed: 157 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import numpy as np
22
import pytest
33
from pysteps.postprocessing.probmatching import resample_distributions
4+
from pysteps.postprocessing.probmatching import nonparam_match_empirical_cdf
45

56

67
class TestResampleDistributions:
@@ -39,19 +40,161 @@ def test_probability_one(self):
3940
)
4041
assert np.array_equal(result, np.sort(first_array)[::-1])
4142

42-
def test_nan_in_inputs(self):
43+
def test_nan_in_arr1_prob_1(self):
44+
array_with_nan = np.array([1, 3, np.nan, 7, 9])
45+
array_without_nan = np.array([2.0, 4, 6, 8, 10])
46+
probability_first_array = 1.0
47+
result = resample_distributions(
48+
array_with_nan, array_without_nan, probability_first_array
49+
)
50+
expected_result = np.array([np.nan, 9, 7, 3, 1], dtype=float)
51+
assert np.allclose(result, expected_result, equal_nan=True)
52+
53+
def test_nan_in_arr1_prob_0(self):
4354
array_with_nan = np.array([1, 3, np.nan, 7, 9])
4455
array_without_nan = np.array([2, 4, 6, 8, 10])
45-
probability_first_array = 0.6
46-
with pytest.raises(ValueError, match="Input arrays must not contain NaNs"):
47-
resample_distributions(
48-
array_with_nan, array_without_nan, probability_first_array
49-
)
50-
with pytest.raises(ValueError, match="Input arrays must not contain NaNs"):
51-
resample_distributions(
52-
array_without_nan, array_with_nan, probability_first_array
53-
)
54-
with pytest.raises(ValueError, match="Input arrays must not contain NaNs"):
55-
resample_distributions(
56-
array_with_nan, array_with_nan, probability_first_array
57-
)
56+
probability_first_array = 0.0
57+
result = resample_distributions(
58+
array_with_nan, array_without_nan, probability_first_array
59+
)
60+
expected_result = np.array([np.nan, 10, 8, 4, 2], dtype=float)
61+
assert np.allclose(result, expected_result, equal_nan=True)
62+
63+
def test_nan_in_arr2_prob_1(self):
64+
array_without_nan = np.array([1, 3, 5, 7, 9])
65+
array_with_nan = np.array([2.0, 4, 6, np.nan, 10])
66+
probability_first_array = 1.0
67+
result = resample_distributions(
68+
array_without_nan, array_with_nan, probability_first_array
69+
)
70+
expected_result = np.array([np.nan, 9, 5, 3, 1], dtype=float)
71+
assert np.allclose(result, expected_result, equal_nan=True)
72+
73+
def test_nan_in_arr2_prob_0(self):
74+
array_without_nan = np.array([1, 3, 5, 7, 9])
75+
array_with_nan = np.array([2, 4, 6, np.nan, 10])
76+
probability_first_array = 0.0
77+
result = resample_distributions(
78+
array_without_nan, array_with_nan, probability_first_array
79+
)
80+
expected_result = np.array([np.nan, 10, 6, 4, 2], dtype=float)
81+
assert np.allclose(result, expected_result, equal_nan=True)
82+
83+
def test_nan_in_both_prob_1(self):
84+
array1_with_nan = np.array([1, np.nan, np.nan, 7, 9])
85+
array2_with_nan = np.array([2.0, 4, np.nan, np.nan, 10])
86+
probability_first_array = 1.0
87+
result = resample_distributions(
88+
array1_with_nan, array2_with_nan, probability_first_array
89+
)
90+
expected_result = np.array([np.nan, np.nan, np.nan, 9, 1], dtype=float)
91+
assert np.allclose(result, expected_result, equal_nan=True)
92+
93+
def test_nan_in_both_prob_0(self):
94+
array1_with_nan = np.array([1, np.nan, np.nan, 7, 9])
95+
array2_with_nan = np.array([2.0, 4, np.nan, np.nan, 10])
96+
probability_first_array = 0.0
97+
result = resample_distributions(
98+
array1_with_nan, array2_with_nan, probability_first_array
99+
)
100+
expected_result = np.array([np.nan, np.nan, np.nan, 10, 2], dtype=float)
101+
assert np.allclose(result, expected_result, equal_nan=True)
102+
103+
104+
class TestNonparamMatchEmpiricalCDF:
105+
@pytest.fixture(autouse=True)
106+
def setup(self):
107+
# Set the seed for reproducibility
108+
np.random.seed(42)
109+
110+
def test_ignore_indices_with_nans_both(self):
111+
initial_array = np.array([np.nan, np.nan, 6, 2, 0, 0, 0, 0, 0, 0])
112+
target_array = np.array([np.nan, np.nan, 9, 5, 4, 0, 0, 0, 0, 0])
113+
result = nonparam_match_empirical_cdf(
114+
initial_array, target_array, ignore_indices=np.isnan(initial_array)
115+
)
116+
expected_result = np.array([np.nan, np.nan, 9, 5, 0, 0, 0, 0, 0, 0])
117+
assert np.allclose(result, expected_result, equal_nan=True)
118+
119+
def test_zeroes_initial(self):
120+
initial_array = np.zeros(10)
121+
target_array = np.array([0, 2, 3, 4, 5, 6, 7, 8, 9, 10])
122+
result = nonparam_match_empirical_cdf(initial_array, target_array)
123+
expected_result = np.zeros(10)
124+
assert np.allclose(result, expected_result)
125+
126+
def test_nans_initial(self):
127+
initial_array = np.array(
128+
[0, 1, 2, 3, 4, np.nan, np.nan, np.nan, np.nan, np.nan]
129+
)
130+
target_array = np.array([0, 2, 3, 4, 5, 6, 7, 8, 9, 10])
131+
with pytest.raises(
132+
ValueError,
133+
match="Initial array contains non-finite values outside ignore_indices mask.",
134+
):
135+
nonparam_match_empirical_cdf(initial_array, target_array)
136+
137+
def test_all_nans_initial(self):
138+
initial_array = np.full(10, np.nan)
139+
target_array = np.array([0, 2, 3, 4, 5, 6, 7, 8, 9, 10])
140+
with pytest.raises(ValueError, match="Initial array contains only nans."):
141+
nonparam_match_empirical_cdf(initial_array, target_array)
142+
143+
def test_ignore_indices_nans_initial(self):
144+
initial_array = np.array(
145+
[0, 1, 2, 3, 4, np.nan, np.nan, np.nan, np.nan, np.nan]
146+
)
147+
target_array = np.array([0, 2, 3, 4, 5, 6, 7, 8, 9, 10])
148+
result = nonparam_match_empirical_cdf(
149+
initial_array, target_array, ignore_indices=np.isnan(initial_array)
150+
)
151+
expected_result = np.array(
152+
[0, 7, 8, 9, 10, np.nan, np.nan, np.nan, np.nan, np.nan]
153+
)
154+
assert np.allclose(result, expected_result, equal_nan=True)
155+
156+
def test_ignore_indices_nans_target(self):
157+
# We expect the initial_array values for which ignore_indices is true to be conserved as-is.
158+
initial_array = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
159+
target_array = np.array(
160+
[0, 2, 3, 4, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
161+
)
162+
result = nonparam_match_empirical_cdf(
163+
initial_array, target_array, ignore_indices=np.isnan(target_array)
164+
)
165+
expected_result = np.array([0, 2, 3, 4, 4, 5, 6, 7, 8, 9])
166+
assert np.allclose(result, expected_result, equal_nan=True)
167+
168+
def test_more_zeroes_in_initial(self):
169+
initial_array = np.array([1, 4, 0, 0, 0, 0, 0, 0, 0, 0])
170+
target_array = np.array([10, 8, 6, 4, 2, 0, 0, 0, 0, 0])
171+
result = nonparam_match_empirical_cdf(
172+
initial_array, target_array, ignore_indices=np.isnan(initial_array)
173+
)
174+
expected_result = np.array([8, 10, 0, 0, 0, 0, 0, 0, 0, 0])
175+
assert np.allclose(result, expected_result, equal_nan=True)
176+
177+
def test_more_zeroes_in_initial_unsrt(self):
178+
initial_array = np.array([1, 4, 0, 0, 0, 0, 0, 0, 0, 0])
179+
target_array = np.array([6, 4, 2, 0, 0, 0, 0, 0, 10, 8])
180+
result = nonparam_match_empirical_cdf(
181+
initial_array, target_array, ignore_indices=np.isnan(initial_array)
182+
)
183+
expected_result = np.array([8, 10, 0, 0, 0, 0, 0, 0, 0, 0])
184+
assert np.allclose(result, expected_result, equal_nan=True)
185+
186+
def test_more_zeroes_in_target(self):
187+
initial_array = np.array([1, 3, 7, 5, 0, 0, 0, 0, 0, 0])
188+
target_array = np.array([10, 8, 0, 0, 0, 0, 0, 0, 0, 0])
189+
result = nonparam_match_empirical_cdf(
190+
initial_array, target_array, ignore_indices=np.isnan(initial_array)
191+
)
192+
expected_result = np.array([0, 0, 10, 8, 0, 0, 0, 0, 0, 0])
193+
assert np.allclose(result, expected_result, equal_nan=True)
194+
195+
def test_2dim_array(self):
196+
initial_array = np.array([[1, 3, 5], [11, 9, 7]])
197+
target_array = np.array([[2, 4, 6], [8, 10, 12]])
198+
result = nonparam_match_empirical_cdf(initial_array, target_array)
199+
expected_result = np.array([[2, 4, 6], [12, 10, 8]])
200+
assert np.allclose(result, expected_result, equal_nan=True)

0 commit comments

Comments
 (0)