Add resample distributions function for probability matching (#390)

RubenImhoff · dnerini · web-flow · commit d577a9816b20 · 2024-07-22T13:51:18.000+02:00
* feat: first attempt to add resample distributions function for probability matching

* fix: remove erroneous import

* feat: add resampling option to probability matching step

* fix: make sure there are no nans in the resampling function

* fix: fill up space outside radar domain with model data and change function names

* docs: add info about the weights

* fix: add requested changes from review

* fix: add tests and make sure probability stays within bounds

* Better docstrings

* Minor code improvements

* Add test

* Run black

* Handle nans outside of resampling method

---------

Co-authored-by: Daniele Nerini &lt;daniele.nerini@gmail.com&gt;
diff --git a/pysteps/blending/steps.py b/pysteps/blending/steps.py
@@ -90,6 +90,7 @@ def forecast(
     conditional=False,
     probmatching_method="cdf",
     mask_method="incremental",
+    resample_distribution=True,
     smooth_radar_mask_range=0,
     callback=None,
     return_output=True,
@@ -205,25 +206,32 @@ def forecast(
       If set to True, compute the statistics of the precipitation field
       conditionally by excluding pixels where the values are below the threshold
       precip_thr.
+    probmatching_method: {'cdf','mean',None}, optional
+      Method for matching the statistics of the forecast field with those of
+      the most recently observed one. 'cdf'=map the forecast CDF to the observed
+      one, 'mean'=adjust only the conditional mean value of the forecast field
+      in precipitation areas, None=no matching applied. Using 'mean' requires
+      that mask_method is not None.
     mask_method: {'obs','incremental',None}, optional
       The method to use for masking no precipitation areas in the forecast field.
       The masked pixels are set to the minimum value of the observations.
       'obs' = apply precip_thr to the most recently observed precipitation intensity
       field, 'incremental' = iteratively buffer the mask with a certain rate
       (currently it is 1 km/min), None=no masking.
+    resample_distribution: bool, optional
+        Method to resample the distribution from the extrapolation and NWP cascade as input
+        for the probability matching. Not resampling these distributions may lead to losing
+        some extremes when the weight of both the extrapolation and NWP cascade is similar.
+        Defaults to True.
     smooth_radar_mask_range: int, Default is 0.
       Method to smooth the transition between the radar-NWP-noise blend and the NWP-noise
       blend near the edge of the radar domain (radar mask), where the radar data is either
-      not present anymore or is not reliable. If set to 0 (grid cells), this generates a normal forecast without smoothing. To create a smooth mask, this range
-      should be a positive value, representing a buffer band of a number of pixels
-      by which the mask is cropped and smoothed. The smooth radar mask removes
-      the hard edges between NWP and radar in the final blended product. Typically, a value between 50 and 100 km can be used. 80 km generally gives good results.
-    probmatching_method: {'cdf','mean',None}, optional
-      Method for matching the statistics of the forecast field with those of
-      the most recently observed one. 'cdf'=map the forecast CDF to the observed
-      one, 'mean'=adjust only the conditional mean value of the forecast field
-      in precipitation areas, None=no matching applied. Using 'mean' requires
-      that mask_method is not None.
+      not present anymore or is not reliable. If set to 0 (grid cells), this generates a
+      normal forecast without smoothing. To create a smooth mask, this range should be a
+      positive value, representing a buffer band of a number of pixels by which the mask
+      is cropped and smoothed. The smooth radar mask removes the hard edges between NWP
+      and radar in the final blended product. Typically, a value between 50 and 100 km
+      can be used. 80 km generally gives good results.
     callback: function, optional
       Optional function that is called after computation of each time step of
       the nowcast. The function takes one argument: a three-dimensional array
@@ -1404,7 +1412,6 @@ def worker(j):
                         # latest extrapolated radar rainfall field blended with the
                         # nwp model(s) rainfall forecast fields as 'benchmark'.
 
-                        # TODO: Check probability matching method
                         # 8.7.1 first blend the extrapolated rainfall field (the field
                         # that is only used for post-processing steps) with the NWP
                         # rainfall forecast for this time step using the weights
@@ -1538,19 +1545,39 @@ def worker(j):
                             # Set to min value outside of mask
                             R_f_new[~MASK_prec_] = R_cmin
 
+                        # If probmatching_method is not None, resample the distribution from
+                        # both the extrapolation cascade and the model (NWP) cascade and use
+                        # that for the probability matching
+                        if probmatching_method is not None and resample_distribution:
+                            # deal with missing values
+                            arr1 = R_pm_ep[t_index]
+                            arr2 = precip_models_pm_temp[j]
+                            arr2 = np.where(np.isnan(arr2), np.nanmin(arr2), arr2)
+                            arr1 = np.where(np.isnan(arr1), arr2, arr1)
+                            # resample weights based on cascade level 2
+                            R_pm_resampled = probmatching.resample_distributions(
+                                first_array=arr1,
+                                second_array=arr2,
+                                probability_first_array=weights_pm_normalized[0],
+                            )
+                        else:
+                            R_pm_resampled = R_pm_blended.copy()
+
                         if probmatching_method == "cdf":
                             # Adjust the CDF of the forecast to match the most recent
                             # benchmark rainfall field (R_pm_blended). If the forecast
                             if np.any(np.isfinite(R_f_new)):
                                 R_f_new = probmatching.nonparam_match_empirical_cdf(
-                                    R_f_new, R_pm_blended
+                                    R_f_new, R_pm_resampled
                                 )
+                                R_pm_resampled = None
                         elif probmatching_method == "mean":
                             # Use R_pm_blended as benchmark field and
-                            mu_0 = np.mean(R_pm_blended[R_pm_blended >= precip_thr])
+                            mu_0 = np.mean(R_pm_resampled[R_pm_resampled >= precip_thr])
                             MASK = R_f_new >= precip_thr
                             mu_fct = np.mean(R_f_new[MASK])
                             R_f_new[MASK] = R_f_new[MASK] - mu_fct + mu_0
+                            R_pm_resampled = None
 
                         R_f_out.append(R_f_new)
 
diff --git a/pysteps/postprocessing/probmatching.py b/pysteps/postprocessing/probmatching.py
@@ -13,6 +13,7 @@
     pmm_init
     pmm_compute
     shift_scale
+    resample_distributions
 """
 
 import numpy as np
@@ -259,6 +260,56 @@ def _get_error(scale):
     return shift, scale, R.reshape(shape)
 
 
+def resample_distributions(first_array, second_array, probability_first_array):
+    """
+    Merges two distributions (e.g., from the extrapolation nowcast and NWP in the blending module)
+    to effectively combine two distributions for probability matching without losing extremes.
+
+    Parameters
+    ----------
+    first_array: array_like
+        One of the two arrays from which the distribution should be sampled (e.g., the extrapolation
+        cascade). It must be of the same shape as `second_array`. Input must not contain NaNs.
+    second_array: array_like
+        One of the two arrays from which the distribution should be sampled (e.g., the NWP (model)
+        cascade). It must be of the same shape as `first_array`.. Input must not contain NaNs.
+    probability_first_array: float
+        The weight that `first_array` should get (a value between 0 and 1). This determines the
+        likelihood of selecting elements from `first_array` over `second_array`.
+
+    Returns
+    -------
+    csort: array_like
+        The combined output distribution. This is an array of the same shape as the input arrays,
+        where each element is chosen from either `first_array` or `second_array` based on the specified
+        probability, and then sorted in descending order.
+
+    Raises
+    ------
+    ValueError
+        If `first_array` and `second_array` do not have the same shape or if inputs contain NaNs.
+    """
+
+    # Valide inputs
+    if first_array.shape != second_array.shape:
+        raise ValueError("first_array and second_array must have the same shape")
+    if np.isnan(first_array).any() or np.isnan(second_array).any():
+        raise ValueError("Input arrays must not contain NaNs")
+    probability_first_array = np.clip(probability_first_array, 0.0, 1.0)
+
+    # Flatten and sort the arrays
+    asort = np.sort(first_array, axis=None)[::-1]
+    bsort = np.sort(second_array, axis=None)[::-1]
+    n = asort.shape[0]
+
+    # Resample the distributions
+    idxsamples = np.random.binomial(1, probability_first_array, n).astype(bool)
+    csort = np.where(idxsamples, asort, bsort)
+    csort = np.sort(csort)[::-1]
+
+    return csort
+
+
 def _invfunc(y, fx, fy):
     if len(y) == 0:
         return np.array([])
diff --git a/pysteps/tests/test_blending_steps.py b/pysteps/tests/test_blending_steps.py
@@ -8,37 +8,40 @@
 
 
 steps_arg_values = [
-    (1, 3, 4, 8, None, None, False, "spn", True, 4, False, False, 0),
-    (1, 3, 4, 8, "obs", None, False, "spn", True, 4, False, False, 0),
-    (1, 3, 4, 8, "incremental", None, False, "spn", True, 4, False, False, 0),
-    (1, 3, 4, 8, None, "mean", False, "spn", True, 4, False, False, 0),
-    (1, 3, 4, 8, None, "cdf", False, "spn", True, 4, False, False, 0),
-    (1, 3, 4, 8, "incremental", "cdf", False, "spn", True, 4, False, False, 0),
-    (1, 3, 4, 6, "incremental", "cdf", False, "bps", True, 4, False, False, 0),
-    (1, 3, 4, 6, "incremental", "cdf", False, "bps", False, 4, False, False, 0),
-    (1, 3, 4, 9, "incremental", "cdf", False, "spn", True, 4, False, False, 0),
-    (2, 3, 10, 8, "incremental", "cdf", False, "spn", True, 10, False, False, 0),
-    (5, 3, 5, 8, "incremental", "cdf", False, "spn", True, 5, False, False, 0),
-    (1, 10, 1, 8, "incremental", "cdf", False, "spn", True, 1, False, False, 0),
-    (2, 3, 2, 8, "incremental", "cdf", True, "spn", True, 2, False, False, 0),
-    (1, 3, 6, 8, None, None, False, "spn", True, 6, False, False, 0),
+    (1, 3, 4, 8, None, None, False, "spn", True, 4, False, False, 0, False),
+    (1, 3, 4, 8, "obs", None, False, "spn", True, 4, False, False, 0, False),
+    (1, 3, 4, 8, "incremental", None, False, "spn", True, 4, False, False, 0, False),
+    (1, 3, 4, 8, None, "mean", False, "spn", True, 4, False, False, 0, False),
+    (1, 3, 4, 8, None, "mean", False, "spn", True, 4, False, False, 0, True),
+    (1, 3, 4, 8, None, "cdf", False, "spn", True, 4, False, False, 0, False),
+    (1, 3, 4, 8, "incremental", "cdf", False, "spn", True, 4, False, False, 0, False),
+    (1, 3, 4, 6, "incremental", "cdf", False, "bps", True, 4, False, False, 0, False),
+    (1, 3, 4, 6, "incremental", "cdf", False, "bps", False, 4, False, False, 0, False),
+    (1, 3, 4, 6, "incremental", "cdf", False, "bps", False, 4, False, False, 0, True),
+    (1, 3, 4, 9, "incremental", "cdf", False, "spn", True, 4, False, False, 0, False),
+    (2, 3, 10, 8, "incremental", "cdf", False, "spn", True, 10, False, False, 0, False),
+    (5, 3, 5, 8, "incremental", "cdf", False, "spn", True, 5, False, False, 0, False),
+    (1, 10, 1, 8, "incremental", "cdf", False, "spn", True, 1, False, False, 0, False),
+    (2, 3, 2, 8, "incremental", "cdf", True, "spn", True, 2, False, False, 0, False),
+    (1, 3, 6, 8, None, None, False, "spn", True, 6, False, False, 0, False),
     #    Test the case where the radar image contains no rain.
-    (1, 3, 6, 8, None, None, False, "spn", True, 6, True, False, 0),
-    (5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 0),
+    (1, 3, 6, 8, None, None, False, "spn", True, 6, True, False, 0, False),
+    (5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 0, False),
+    (5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 0, True),
     #   Test the case where the NWP fields contain no rain.
-    (1, 3, 6, 8, None, None, False, "spn", True, 6, False, True, 0),
-    (5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, False, True, 0),
+    (1, 3, 6, 8, None, None, False, "spn", True, 6, False, True, 0, False),
+    (5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, False, True, 0, True),
     # Test the case where both the radar image and the NWP fields contain no rain.
-    (1, 3, 6, 8, None, None, False, "spn", True, 6, True, True, 0),
-    (5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, True, 0),
-    (5, 3, 5, 6, "obs", "mean", True, "spn", True, 5, True, True, 0),
+    (1, 3, 6, 8, None, None, False, "spn", True, 6, True, True, 0, False),
+    (5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, True, 0, False),
+    (5, 3, 5, 6, "obs", "mean", True, "spn", True, 5, True, True, 0, False),
     # Test for smooth radar mask
-    (1, 3, 6, 8, None, None, False, "spn", True, 6, False, False, 80),
-    (5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, False, False, 80),
-    (5, 3, 5, 6, "obs", "mean", False, "spn", False, 5, False, False, 80),
-    (1, 3, 6, 8, None, None, False, "spn", True, 6, False, True, 80),
-    (5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 80),
-    (5, 3, 5, 6, "obs", "mean", False, "spn", False, 5, True, True, 80),
+    (1, 3, 6, 8, None, None, False, "spn", True, 6, False, False, 80, False),
+    (5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, False, False, 80, False),
+    (5, 3, 5, 6, "obs", "mean", False, "spn", False, 5, False, False, 80, False),
+    (1, 3, 6, 8, None, None, False, "spn", True, 6, False, True, 80, False),
+    (5, 3, 5, 6, "incremental", "cdf", False, "spn", False, 5, True, False, 80, True),
+    (5, 3, 5, 6, "obs", "mean", False, "spn", False, 5, True, True, 80, False),
 ]
 
 steps_arg_names = (
@@ -55,6 +58,7 @@
     "zero_radar",
     "zero_nwp",
     "smooth_radar_mask_range",
+    "resample_distribution",
 )
 
 
@@ -73,6 +77,7 @@ def test_steps_blending(
     zero_radar,
     zero_nwp,
     smooth_radar_mask_range,
+    resample_distribution,
 ):
     pytest.importorskip("cv2")
 
diff --git a/pysteps/tests/test_postprocessing_probmatching.py b/pysteps/tests/test_postprocessing_probmatching.py
@@ -0,0 +1,57 @@
+import numpy as np
+import pytest
+from pysteps.postprocessing.probmatching import resample_distributions
+
+
+class TestResampleDistributions:
+
+    @pytest.fixture(autouse=True)
+    def setup(self):
+        # Set the seed for reproducibility
+        np.random.seed(42)
+
+    def test_valid_inputs(self):
+        first_array = np.array([1, 3, 5, 7, 9])
+        second_array = np.array([2, 4, 6, 8, 10])
+        probability_first_array = 0.6
+        result = resample_distributions(
+            first_array, second_array, probability_first_array
+        )
+        expected_result = np.array([9, 8, 6, 3, 1])  # Expected result based on the seed
+        assert result.shape == first_array.shape
+        assert np.array_equal(result, expected_result)
+
+    def test_probability_zero(self):
+        first_array = np.array([1, 3, 5, 7, 9])
+        second_array = np.array([2, 4, 6, 8, 10])
+        probability_first_array = 0.0
+        result = resample_distributions(
+            first_array, second_array, probability_first_array
+        )
+        assert np.array_equal(result, np.sort(second_array)[::-1])
+
+    def test_probability_one(self):
+        first_array = np.array([1, 3, 5, 7, 9])
+        second_array = np.array([2, 4, 6, 8, 10])
+        probability_first_array = 1.0
+        result = resample_distributions(
+            first_array, second_array, probability_first_array
+        )
+        assert np.array_equal(result, np.sort(first_array)[::-1])
+
+    def test_nan_in_inputs(self):
+        array_with_nan = np.array([1, 3, np.nan, 7, 9])
+        array_without_nan = np.array([2, 4, 6, 8, 10])
+        probability_first_array = 0.6
+        with pytest.raises(ValueError, match="Input arrays must not contain NaNs"):
+            resample_distributions(
+                array_with_nan, array_without_nan, probability_first_array
+            )
+        with pytest.raises(ValueError, match="Input arrays must not contain NaNs"):
+            resample_distributions(
+                array_without_nan, array_with_nan, probability_first_array
+            )
+        with pytest.raises(ValueError, match="Input arrays must not contain NaNs"):
+            resample_distributions(
+                array_with_nan, array_with_nan, probability_first_array
+            )