Add percentage valid points in get_stats() (#644)

vschaffn · web-flow · commit 0c18d9e17cfe · 2025-02-26T16:46:08.000+01:00
diff --git a/doc/source/api.md b/doc/source/api.md
@@ -107,6 +107,15 @@ documentation.
     Raster.plot
 ```
 
+### Get statistics
+
+```{eval-rst}
+.. autosummary::
+    :toctree: gen_modules/
+
+    Raster.get_stats
+```
+
 ### Get or update data methods
 
 ```{eval-rst}
diff --git a/doc/source/raster_class.md b/doc/source/raster_class.md
@@ -260,14 +260,14 @@ The {func}`~geoutils.Raster.icrop` function accepts only a bounding box in pixel
 By default, {func}`~geoutils.Raster.crop` and {func}`~geoutils.Raster.icrop` return a new Raster unless the inplace parameter is set to True, in which case the cropping operation is performed directly on the original raster object.
 For more details, see the {ref}`specific section and function descriptions in the API<api-geo-handle>`.
 
-### Example for `crop`
+### Example for {func}`~geoutils.Raster.crop`
 ```{code-cell} ipython3
 # Crop raster to smaller bounds
 rast_crop = rast.crop(bbox=(0.3, 0.3, 1, 1))
 print(rast_crop.bounds)
 ```
 
-### Example for `icrop`
+### Example for {func}`~geoutils.Raster.icrop`
 ```{code-cell} ipython3
 # Crop raster using pixel coordinates
 rast_icrop = rast.icrop(bbox=(2, 2, 6, 6))
@@ -360,8 +360,31 @@ rast_reproj.to_xarray()
 ```
 
 ## Obtain Statistics
-The `get_stats()` method allows to extract key statistical information from a raster in a dictionary.
-Supported statistics are : mean, median, max, mean, sum, sum of squares, 90th percentile, nmad, rmse, std.
+The {func}`~geoutils.Raster.get_stats` method allows to extract key statistical information from a raster in a dictionary.
+Supported statistics are :
+- **Mean:** arithmetic mean of the data, ignoring masked values.
+- **Median:** middle value when the valid data points are sorted in increasing order, ignoring masked values.
+- **Max:** maximum value among the data, ignoring masked values.
+- **Min:** minimum value among the data, ignoring masked values.
+- **Sum:** sum of all data, ignoring masked values.
+- **Sum of squares:** sum of the squares of all data, ignoring masked values.
+- **90th percentile:** point below which 90% of the data falls, ignoring masked values.
+- **LE90 (Linear Error with 90% confidence):** Difference between the 95th and 5th percentiles of a dataset, representing the range within which 90% of the data points lie. Ignore masked values.
+- **NMAD (Normalized Median Absolute Deviation):** robust measure of variability in the data, less sensitive to outliers compared to standard deviation. Ignore masked values.
+- **RMSE (Root Mean Square Error):** commonly used to express the magnitude of errors or variability and can give insight into the spread of the data. Only relevant when the raster represents a difference of two objects. Ignore masked values.
+- **Std (Standard deviation):** measures the spread or dispersion of the data around the mean, ignoring masked values.
+- **Valid count:** number of finite data points in the array. It counts the non-masked elements.
+- **Total count:** total size of the raster.
+- **Percentage valid points:** ratio between **Valid count** and **Total count**.
+
+
+If an inlier mask is passed:
+- **Total inlier count:** number of data points in the inlier mask.
+- **Valid inlier count:** number of unmasked data points in the array after applying the inlier mask.
+- **Percentage inlier points:** ratio between **Valid inlier count** and **Valid count**. Useful for classification statistics.
+- **Percentage valid inlier points:** ratio between **Valid inlier count** and **Total inlier count**.
+
+
 Callable functions are supported as well.
 
 ### Usage Examples:
@@ -377,7 +400,7 @@ rast.get_stats("mean")
 
 - Get multiple statistics in a dict:
 ```{code-cell} ipython3
-rast.get_stats(["mean", "max", "rmse"])
+rast.get_stats(["mean", "max", "std"])
 ```
 
 - Using a custom callable statistic:
@@ -386,3 +409,11 @@ def custom_stat(data):
     return np.nansum(data > 100)  # Count the number of pixels above 100
 rast.get_stats(custom_stat)
 ```
+
+- Passing an inlier mask:
+```{code-cell} ipython3
+inlier_mask = np.array([[False, False , True],
+                        [False, True , False],
+                        [True, False , True]])
+rast.get_stats(inlier_mask=inlier_mask)
+```
diff --git a/geoutils/raster/raster.py b/geoutils/raster/raster.py
@@ -89,7 +89,7 @@
     decode_sensor_metadata,
     parse_and_convert_metadata_from_filename,
 )
-from geoutils.stats import nmad
+from geoutils.stats import linear_error, nmad
 from geoutils.vector.vector import Vector
 
 # If python38 or above, Literal is builtin. Otherwise, use typing_extensions
@@ -1890,102 +1890,114 @@ def set_mask(self, mask: NDArrayBool | Mask) -> None:
         else:
             self.data[mask_arr > 0] = np.ma.masked
 
-    def _statistics(self, band: int = 1) -> dict[str, np.floating[Any]]:
+    def _statistics(self, band: int = 1, counts: tuple[int, int] | None = None) -> dict[str, np.floating[Any]]:
         """
         Calculate common statistics for a specified band in the raster.
 
         :param band: The index of the band for which to compute statistics. Default is 1.
+        :param counts: (number of finite data points in the array, number of valid points in inlier_mask).
 
-        :returns: A dictionary containing the calculated statistics for the selected band, including mean, median, max,
-        min, sum, sum of squares, 90th percentile, NMAD, RMSE, and standard deviation.
+        :returns: A dictionary containing the calculated statistics for the selected band.
         """
+
         if self.count == 1:
             data = self.data
         else:
             data = self.data[band - 1]
 
-        # If data is a MaskedArray, use the compressed version (without masked values)
-        if isinstance(data, np.ma.MaskedArray):
-            data = data.compressed()
-
         # Compute the statistics
+        mdata = np.ma.filled(data.astype(float), np.nan)
+        valid_count = np.count_nonzero(~self.get_mask()) if counts is None else counts[0]
         stats_dict = {
-            "Mean": np.nanmean(data),
-            "Median": np.nanmedian(data),
-            "Max": np.nanmax(data),
-            "Min": np.nanmin(data),
-            "Sum": np.nansum(data),
-            "Sum of squares": np.nansum(np.square(data)),
-            "90th percentile": np.nanpercentile(data, 90),
+            "Mean": np.ma.mean(data),
+            "Median": np.ma.median(data),
+            "Max": np.ma.max(data),
+            "Min": np.ma.min(data),
+            "Sum": np.ma.sum(data),
+            "Sum of squares": np.ma.sum(np.square(data)),
+            "90th percentile": np.nanpercentile(mdata, 90),
+            "LE90": linear_error(mdata, interval=90),
             "NMAD": nmad(data),
-            "RMSE": np.sqrt(np.nanmean(np.square(data - np.nanmean(data)))),
-            "Standard deviation": np.nanstd(data),
+            "RMSE": np.sqrt(np.ma.mean(np.square(data))),
+            "Standard deviation": np.ma.std(data),
+            "Valid count": valid_count,
+            "Total count": data.size,
+            "Percentage valid points": (valid_count / data.size) * 100,
         }
+
+        if counts is not None:
+            valid_inlier_count = np.count_nonzero(~self.get_mask())
+            stats_dict.update(
+                {
+                    "Valid inlier count": valid_inlier_count,
+                    "Total inlier count": counts[1],
+                    "Percentage inlier points": (valid_inlier_count / counts[0]) * 100,
+                    "Percentage valid inlier points": (valid_inlier_count / counts[1]) * 100,
+                }
+            )
+
         return stats_dict
 
     @overload
     def get_stats(
         self,
-        stats_name: (
-            Literal["mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std"]
-            | Callable[[NDArrayNum], np.floating[Any]]
-        ),
+        stats_name: str | Callable[[NDArrayNum], np.floating[Any]],
+        inlier_mask: Mask | NDArrayBool | None = None,
         band: int = 1,
+        counts: tuple[int, int] | None = None,
     ) -> np.floating[Any]: ...
 
     @overload
     def get_stats(
         self,
-        stats_name: (
-            list[
-                Literal[
-                    "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std"
-                ]
-                | Callable[[NDArrayNum], np.floating[Any]]
-            ]
-            | None
-        ) = None,
+        stats_name: list[str | Callable[[NDArrayNum], np.floating[Any]]] | None = None,
+        inlier_mask: Mask | NDArrayBool | None = None,
         band: int = 1,
+        counts: tuple[int, int] | None = None,
     ) -> dict[str, np.floating[Any]]: ...
 
     def get_stats(
         self,
         stats_name: (
-            Literal["mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std"]
-            | Callable[[NDArrayNum], np.floating[Any]]
-            | list[
-                Literal[
-                    "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std"
-                ]
-                | Callable[[NDArrayNum], np.floating[Any]]
-            ]
-            | None
+            str | Callable[[NDArrayNum], np.floating[Any]] | list[str | Callable[[NDArrayNum], np.floating[Any]]] | None
         ) = None,
+        inlier_mask: Mask | NDArrayBool | None = None,
         band: int = 1,
+        counts: tuple[int, int] | None = None,
     ) -> np.floating[Any] | dict[str, np.floating[Any]]:
         """
         Retrieve specified statistics or all available statistics for the raster data. Allows passing custom callables
         to calculate custom stats.
 
         :param stats_name: Name or list of names of the statistics to retrieve. If None, all statistics are returned.
-                   Accepted names include:
-                   - "mean", "median", "max", "min", "sum", "sum of squares", "90th percentile", "nmad", "rmse", "std"
-                   You can also use common aliases for these names (e.g., "average", "maximum", "minimum", etc.).
-                   Custom callables can also be provided.
+            Accepted names include:
+            `mean`, `median`, `max`, `min`, `sum`, `sum of squares`, `90th percentile`, `LE90`, `nmad`, `rmse`,
+            `std`, `valid count`, `total count`, `percentage valid points` and if an inlier mask is passed :
+            `valid inlier count`, `total inlier count`, `percentage inlier point`, `percentage valid inlier points`.
+            Custom callables can also be provided.
+        :param inlier_mask: A boolean mask to filter values for statistical calculations.
         :param band: The index of the band for which to compute statistics. Default is 1.
-
+        :param counts: (number of finite data points in the array, number of valid points in inlier_mask). DO NOT USE.
         :returns: The requested statistic or a dictionary of statistics if multiple or all are requested.
         """
         if not self.is_loaded:
             self.load()
-        stats_dict = self._statistics(band=band)
+        if inlier_mask is not None:
+            valid_points = np.count_nonzero(~self.get_mask())
+            if isinstance(inlier_mask, Mask):
+                inlier_points = np.count_nonzero(~inlier_mask.data)
+            else:
+                inlier_points = np.count_nonzero(~inlier_mask)
+            dem_masked = self.copy()
+            dem_masked.set_mask(inlier_mask)
+            return dem_masked.get_stats(stats_name=stats_name, band=band, counts=(valid_points, inlier_points))
+        stats_dict = self._statistics(band=band, counts=counts)
         if stats_name is None:
             return stats_dict
 
         # Define the metric aliases and their actual names
         stats_aliases = {
             "mean": "Mean",
-            "average": "Mean",
             "median": "Median",
             "max": "Max",
             "maximum": "Max",
@@ -1994,17 +2006,28 @@ def get_stats(
             "sum": "Sum",
             "sumofsquares": "Sum of squares",
             "sum2": "Sum of squares",
-            "percentile": "90th percentile",
             "90thpercentile": "90th percentile",
             "90percentile": "90th percentile",
-            "percentile90": "90th percentile",
+            "le90": "LE90",
             "nmad": "NMAD",
             "rmse": "RMSE",
+            "rms": "RMSE",
             "std": "Standard deviation",
-            "stddev": "Standard deviation",
-            "standarddev": "Standard deviation",
             "standarddeviation": "Standard deviation",
+            "validcount": "Valid count",
+            "totalcount": "Total count",
+            "percentagevalidpoints": "Percentage valid points",
         }
+        if counts is not None:
+            stats_aliases.update(
+                {
+                    "validinliercount": "Valid inlier count",
+                    "totalinliercount": "Total inlier count",
+                    "percentagevalidinlierpoints": "Percentage valid inlier points",
+                    "percentageinlierpoints": "Percentage inlier points",
+                }
+            )
+
         if isinstance(stats_name, list):
             result = {}
             for name in stats_name:
diff --git a/geoutils/stats.py b/geoutils/stats.py
@@ -38,7 +38,31 @@ def nmad(data: NDArrayNum, nfact: float = 1.4826) -> np.floating[Any]:
     :returns nmad: (normalized) median absolute deviation of data.
     """
     if isinstance(data, np.ma.masked_array):
-        data_arr = data.compressed()
+        return nfact * np.ma.median(np.abs(data - np.ma.median(data)))
+    return nfact * np.nanmedian(np.abs(data - np.nanmedian(data)))
+
+
+def linear_error(data: NDArrayNum, interval: float = 90) -> np.floating[Any]:
+    """
+    Compute the linear error (LE) for a given dataset, representing the range of differences between the upper and
+    lower percentiles of the data. By default, this calculates the 90% confidence interval (LE90).
+
+    :param data: A numpy array or masked array of data, typically representing the differences (errors) in elevation or
+    another quantity.
+    :param interval: The confidence interval to compute, specified as a percentage. For example, an interval of 90 will
+    compute the range between the 5th and 95th percentiles (LE90). This value must be between 0 and 100.
+
+    return: The computed linear error, which is the difference between the upper and lower percentiles.
+
+    raises: ValueError if the `interval` is not between 0 and 100.
+    """
+    # Validate the interval
+    if not (0 < interval <= 100):
+        raise ValueError("Interval must be between 0 and 100")
+
+    if isinstance(data, np.ma.masked_array):
+        mdata = np.ma.filled(data.astype(float), np.nan)
     else:
-        data_arr = np.asarray(data)
-    return nfact * np.nanmedian(np.abs(data_arr - np.nanmedian(data_arr)))
+        mdata = data
+    le = np.nanpercentile(mdata, 50 + interval / 2) - np.nanpercentile(mdata, 50 - interval / 2)
+    return le
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,6 +1,6 @@
 """Configuration of pytest."""
 
-from pytest import DoctestItem
+from _pytest.doctest import DoctestItem
 
 
 # To order test modules logically during execution
diff --git a/tests/test_raster/test_raster.py b/tests/test_raster/test_raster.py
@@ -1985,8 +1985,6 @@ def test_split_bands(self) -> None:
     def test_stats(self, example: str, caplog) -> None:
         raster = gu.Raster(example)
 
-        # Full stats
-        stats = raster.get_stats()
         expected_stats = [
             "Mean",
             "Median",
@@ -1995,18 +1993,41 @@ def test_stats(self, example: str, caplog) -> None:
             "Sum",
             "Sum of squares",
             "90th percentile",
+            "LE90",
             "NMAD",
             "RMSE",
             "Standard deviation",
+            "Valid count",
+            "Total count",
+            "Percentage valid points",
         ]
+
+        # Full stats
+        stats = raster.get_stats()
         for name in expected_stats:
             assert name in stats
             assert stats.get(name) is not None
 
+        # With mask
+        inlier_mask = raster.get_mask()
+        stats_masked = raster.get_stats(inlier_mask=inlier_mask)
+        for name in [
+            "Valid inlier count",
+            "Total inlier count",
+            "Percentage inlier points",
+            "Percentage valid inlier points",
+        ]:
+            assert name in stats_masked
+            assert stats_masked.get(name) is not None
+            stats_masked.pop(name)
+        assert stats_masked == stats
+
         # Single stat
-        stat = raster.get_stats(stats_name="Average")
-        assert isinstance(stat, np.floating)
+        for name in expected_stats:
+            stat = raster.get_stats(stats_name=name)
+            assert np.isfinite(stat)
 
+        # Callable
         def percentile_95(data: NDArrayNum) -> np.floating[Any]:
             if isinstance(data, np.ma.MaskedArray):
                 data = data.compressed()
@@ -2016,8 +2037,8 @@ def percentile_95(data: NDArrayNum) -> np.floating[Any]:
         assert isinstance(stat, np.floating)
 
         # Selected stats and callable
-        stats_name = ["mean", "maximum", "std", "percentile_95"]
-        stats = raster.get_stats(stats_name=["mean", "maximum", "std", percentile_95])
+        stats_name = ["mean", "max", "std", "percentile_95"]
+        stats = raster.get_stats(stats_name=["mean", "max", "std", percentile_95])
         for name in stats_name:
             assert name in stats
             assert stats.get(name) is not None
diff --git a/tests/test_stats.py b/tests/test_stats.py