
Commit bb261bf

EmilyXinyi, ogrisel, and lucyleeow authored

Add array API support for _weighted_percentile (scikit-learn#29431)
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
Co-authored-by: Lucy Liu <jliu176@gmail.com>
1 parent a6efcaf commit bb261bf

File tree: 2 files changed (+170 −58 lines)

  sklearn/utils/stats.py
  sklearn/utils/tests/test_stats.py
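In practice, this change lets `_weighted_percentile` run on any array API compatible namespace (e.g. `array_api_strict` or PyTorch) when dispatch is enabled. Below is a minimal sketch of the behavior the commit enables, assuming `array_api_strict` is installed; note that `_weighted_percentile` is a private helper, so this is an illustration rather than public API:

    import array_api_strict as xp

    from sklearn import config_context
    from sklearn.utils.stats import _weighted_percentile

    x = xp.asarray([1.0, 2.0, 3.0, 4.0])
    w = xp.asarray([1.0, 1.0, 2.0, 1.0])

    # With dispatch enabled, the computation stays in the input's namespace and
    # device; the result is an array_api_strict array, not a NumPy one.
    with config_context(array_api_dispatch=True):
        median = _weighted_percentile(x, w, percentile_rank=50)  # 3.0 here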

sklearn/utils/stats.py

Lines changed: 57 additions & 42 deletions
@@ -1,12 +1,13 @@
 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause

-import numpy as np
+from ..utils._array_api import (
+    _find_matching_floating_dtype,
+    get_namespace_and_device,
+)

-from .extmath import stable_cumsum

-
-def _weighted_percentile(array, sample_weight, percentile_rank=50):
+def _weighted_percentile(array, sample_weight, percentile_rank=50, xp=None):
     """Compute the weighted percentile with method 'inverted_cdf'.

     When the percentile lies between two data points of `array`, the function returns
@@ -37,72 +38,86 @@ def _weighted_percentile(array, sample_weight, percentile_rank=50):
         The probability level of the percentile to compute, in percent. Must be between
         0 and 100.

+    xp : array_namespace, default=None
+        The standard-compatible namespace for `array`. Default: infer.
+
     Returns
     -------
-    percentile : int if `array` 1D, ndarray if `array` 2D
+    percentile : scalar or 0D array if `array` 1D (or 0D), array if `array` 2D
         Weighted percentile at the requested probability level.
     """
+    xp, _, device = get_namespace_and_device(array)
+    # `sample_weight` should follow `array` for dtypes
+    floating_dtype = _find_matching_floating_dtype(array, xp=xp)
+    array = xp.asarray(array, dtype=floating_dtype, device=device)
+    sample_weight = xp.asarray(sample_weight, dtype=floating_dtype, device=device)
+
     n_dim = array.ndim
     if n_dim == 0:
-        return array[()]
+        return array
     if array.ndim == 1:
-        array = array.reshape((-1, 1))
+        array = xp.reshape(array, (-1, 1))
     # When sample_weight 1D, repeat for each array.shape[1]
     if array.shape != sample_weight.shape and array.shape[0] == sample_weight.shape[0]:
-        sample_weight = np.tile(sample_weight, (array.shape[1], 1)).T
-
+        sample_weight = xp.tile(sample_weight, (array.shape[1], 1)).T
     # Sort `array` and `sample_weight` along axis=0:
-    sorted_idx = np.argsort(array, axis=0)
-    sorted_weights = np.take_along_axis(sample_weight, sorted_idx, axis=0)
+    sorted_idx = xp.argsort(array, axis=0)
+    sorted_weights = xp.take_along_axis(sample_weight, sorted_idx, axis=0)

-    # Set NaN values in `sample_weight` to 0. We only perform this operation if NaN
-    # values are present at all to avoid temporary allocations of size `(n_samples,
-    # n_features)`. If NaN values were present, they would sort to the end (which we can
-    # observe from `sorted_idx`).
+    # Set NaN values in `sample_weight` to 0. Only perform this if NaN values are
+    # present, to avoid temporary allocations of size `(n_samples, n_features)`.
     n_features = array.shape[1]
-    largest_value_per_column = array[sorted_idx[-1, ...], np.arange(n_features)]
-    if np.isnan(largest_value_per_column).any():
-        sorted_nan_mask = np.take_along_axis(np.isnan(array), sorted_idx, axis=0)
+    largest_value_per_column = array[
+        sorted_idx[-1, ...], xp.arange(n_features, device=device)
+    ]
+    # NaN values get sorted to the end (as the largest values)
+    if xp.any(xp.isnan(largest_value_per_column)):
+        sorted_nan_mask = xp.take_along_axis(xp.isnan(array), sorted_idx, axis=0)
         sorted_weights[sorted_nan_mask] = 0

     # Compute the weighted cumulative distribution function (CDF) based on
-    # sample_weight and scale percentile_rank along it:
-    weight_cdf = stable_cumsum(sorted_weights, axis=0)
-    adjusted_percentile_rank = percentile_rank / 100 * weight_cdf[-1]
-
-    # For percentile_rank=0, ignore leading observations with sample_weight=0; see
-    # PR #20528:
+    # `sample_weight` and scale `percentile_rank` along it.
+    #
+    # Note: we call `xp.cumulative_sum` on the transposed `sorted_weights` to
+    # ensure that the result has shape `(n_features, n_samples)`, so that the
+    # `xp.searchsorted` calls take contiguous inputs (for performance reasons).
+    weight_cdf = xp.cumulative_sum(sorted_weights.T, axis=1)
+    adjusted_percentile_rank = percentile_rank / 100 * weight_cdf[..., -1]
+
+    # Ignore leading `sample_weight=0` observations when `percentile_rank=0` (#20528)
     mask = adjusted_percentile_rank == 0
-    adjusted_percentile_rank[mask] = np.nextafter(
+    adjusted_percentile_rank[mask] = xp.nextafter(
         adjusted_percentile_rank[mask], adjusted_percentile_rank[mask] + 1
     )
-
-    # Find index (i) of `adjusted_percentile` in `weight_cdf`,
-    # such that weight_cdf[i-1] < percentile <= weight_cdf[i]
-    percentile_idx = np.array(
+    # For each feature with index j, find the sample index i of the scalar value
+    # `adjusted_percentile_rank[j]` in the 1D array `weight_cdf[j]`, such that:
+    # weight_cdf[j, i-1] < adjusted_percentile_rank[j] <= weight_cdf[j, i].
+    percentile_indices = xp.asarray(
         [
-            np.searchsorted(weight_cdf[:, i], adjusted_percentile_rank[i])
-            for i in range(weight_cdf.shape[1])
-        ]
+            xp.searchsorted(
+                weight_cdf[feature_idx, ...], adjusted_percentile_rank[feature_idx]
+            )
+            for feature_idx in range(weight_cdf.shape[0])
+        ],
+        device=device,
     )
-
-    # In rare cases, percentile_idx equals to sorted_idx.shape[0]:
+    # In rare cases, `percentile_indices` can equal `sorted_idx.shape[0]`
    max_idx = sorted_idx.shape[0] - 1
-    percentile_idx = np.apply_along_axis(
-        lambda x: np.clip(x, 0, max_idx), axis=0, arr=percentile_idx
-    )
+    percentile_indices = xp.clip(percentile_indices, 0, max_idx)
+
+    col_indices = xp.arange(array.shape[1], device=device)
+    percentile_in_sorted = sorted_idx[percentile_indices, col_indices]

-    col_indices = np.arange(array.shape[1])
-    percentile_in_sorted = sorted_idx[percentile_idx, col_indices]
     result = array[percentile_in_sorted, col_indices]

     return result[0] if n_dim == 1 else result


 # TODO: refactor to do the symmetrisation inside _weighted_percentile to avoid
 # sorting the input array twice.
-def _averaged_weighted_percentile(array, sample_weight, percentile_rank=50):
+def _averaged_weighted_percentile(array, sample_weight, percentile_rank=50, xp=None):
     return (
-        _weighted_percentile(array, sample_weight, percentile_rank)
-        - _weighted_percentile(-array, sample_weight, 100 - percentile_rank)
+        _weighted_percentile(array, sample_weight, percentile_rank, xp=xp)
+        - _weighted_percentile(-array, sample_weight, 100 - percentile_rank, xp=xp)
     ) / 2
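For readers unfamiliar with the 'inverted_cdf' method, the core of the function above boils down to the following 1D NumPy sketch. `weighted_percentile_1d` is an illustrative name, not scikit-learn code, and it omits the `percentile_rank=0` and NaN edge cases handled in the diff:

    import numpy as np

    def weighted_percentile_1d(values, weights, percentile_rank=50):
        # Sort the values and carry their weights along.
        order = np.argsort(values)
        sorted_values = values[order]
        cdf = np.cumsum(weights[order])            # weighted CDF
        target = percentile_rank / 100 * cdf[-1]   # scale the rank onto the CDF
        # Smallest index i such that cdf[i-1] < target <= cdf[i].
        idx = np.searchsorted(cdf, target)
        return sorted_values[min(idx, len(values) - 1)]

    x = np.array([1.0, 2.0, 3.0, 10.0])
    w = np.array([1.0, 1.0, 1.0, 0.0])
    print(weighted_percentile_1d(x, w, 50))  # 2.0: the zero-weight outlier is inert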

sklearn/utils/tests/test_stats.py

Lines changed: 113 additions & 16 deletions
@@ -3,6 +3,14 @@
 from numpy.testing import assert_allclose, assert_array_equal
 from pytest import approx

+from sklearn._config import config_context
+from sklearn.utils._array_api import (
+    _convert_to_numpy,
+    get_namespace,
+    yield_namespace_device_dtype_combinations,
+)
+from sklearn.utils._array_api import device as array_device
+from sklearn.utils.estimator_checks import _array_api_for_tests
 from sklearn.utils.fixes import np_version, parse_version
 from sklearn.utils.stats import _averaged_weighted_percentile, _weighted_percentile

@@ -39,6 +47,7 @@ def test_averaged_and_weighted_percentile():


 def test_weighted_percentile():
+    """Check `weighted_percentile` on artificial data with an obvious median."""
     y = np.empty(102, dtype=np.float64)
     y[:50] = 0
     y[-51:] = 2
@@ -51,15 +60,16 @@ def test_weighted_percentile():


 def test_weighted_percentile_equal():
+    """Check `weighted_percentile` with all weights equal to 1."""
     y = np.empty(102, dtype=np.float64)
     y.fill(0.0)
     sw = np.ones(102, dtype=np.float64)
-    sw[-1] = 0.0
-    value = _weighted_percentile(y, sw, 50)
-    assert value == 0
+    score = _weighted_percentile(y, sw, 50)
+    assert approx(score) == 0


 def test_weighted_percentile_zero_weight():
+    """Check `weighted_percentile` with all weights equal to 0."""
     y = np.empty(102, dtype=np.float64)
     y.fill(1.0)
     sw = np.ones(102, dtype=np.float64)
@@ -69,6 +79,11 @@ def test_weighted_percentile_zero_weight():


 def test_weighted_percentile_zero_weight_zero_percentile():
+    """Check `weighted_percentile(percentile_rank=0)` behaves correctly.
+
+    Ensures that (leading) zero-weight observations are ignored when
+    `percentile_rank=0`. See #20528 for details.
+    """
     y = np.array([0, 1, 2, 3, 4, 5])
     sw = np.array([0, 0, 1, 1, 1, 0])
     value = _weighted_percentile(y, sw, 0)
@@ -82,18 +97,18 @@ def test_weighted_percentile_zero_weight_zero_percentile():


 def test_weighted_median_equal_weights():
-    # Checks that `_weighted_percentile` and `np.median` (both at probability level=0.5
-    # and with `sample_weights` being all 1s) return the same percentiles if the number
-    # of the samples in the data is odd. In this special case, `_weighted_percentile`
-    # always falls on a precise value (not on the next lower value) and is thus equal to
-    # `np.median`.
-    # As discussed in #17370, a similar check with an even number of samples does not
-    # consistently hold, since then the lower of two percentiles might be selected,
-    # while the median might lie in between.
+    """Check `_weighted_percentile(percentile_rank=50)` is the same as `np.median`.
+
+    `sample_weights` are all 1s and the number of samples is odd.
+    When the number of samples is odd, `_weighted_percentile` always falls on a
+    single observation (not between 2 values, in which case the lower value would
+    be taken) and is thus equal to `np.median`.
+    For an even number of samples, this check will not always hold (note that for
+    some other percentile methods it will always hold). See #17370 for details.
+    """
     rng = np.random.RandomState(0)
     x = rng.randint(10, size=11)
     weights = np.ones(x.shape)
-
     median = np.median(x)
     w_median = _weighted_percentile(x, weights)
     assert median == approx(w_median)
@@ -106,10 +121,8 @@ def test_weighted_median_integer_weights():
     x = rng.randint(20, size=10)
     weights = rng.choice(5, size=10)
     x_manual = np.repeat(x, weights)
-
     median = np.median(x_manual)
     w_median = _weighted_percentile(x, weights)
-
     assert median == approx(w_median)


@@ -125,8 +138,7 @@ def test_weighted_percentile_2d():
     w_median = _weighted_percentile(x_2d, w1)
     p_axis_0 = [_weighted_percentile(x_2d[:, i], w1) for i in range(x_2d.shape[1])]
     assert_allclose(w_median, p_axis_0)
-
-    # Check when array and sample_weight boht 2D
+    # Check when array and sample_weight both 2D
     w2 = rng.choice(5, size=10)
     w_2d = np.vstack((w1, w2)).T

@@ -137,6 +149,91 @@ def test_weighted_percentile_2d():
     assert_allclose(w_median, p_axis_0)


+@pytest.mark.parametrize(
+    "array_namespace, device, dtype_name", yield_namespace_device_dtype_combinations()
+)
+@pytest.mark.parametrize(
+    "data, weights, percentile",
+    [
+        # NumPy scalar inputs (handled as 0D arrays on array API)
+        (np.float32(42), np.int32(1), 50),
+        # Random 1D array, constant weights
+        (lambda rng: rng.rand(50), np.ones(50).astype(np.int32), 50),
+        # Random 2D array and random 1D weights
+        (lambda rng: rng.rand(50, 3), lambda rng: rng.rand(50).astype(np.float32), 75),
+        # Random 2D array and random 2D weights
+        (
+            lambda rng: rng.rand(20, 3),
+            lambda rng: rng.rand(20, 3).astype(np.float32),
+            25,
+        ),
+        # zero weights and `percentile_rank=0` (#20528) (`sample_weight` dtype: int64)
+        (np.array([0, 1, 2, 3, 4, 5]), np.array([0, 0, 1, 1, 1, 0]), 0),
+        # np.nan's in data and some zero weights (`sample_weight` dtype: int64)
+        (np.array([np.nan, np.nan, 0, 3, 4, 5]), np.array([0, 1, 1, 1, 1, 0]), 0),
+        # `sample_weight` dtype: int32
+        (
+            np.array([0, 1, 2, 3, 4, 5]),
+            np.array([0, 1, 1, 1, 1, 0], dtype=np.int32),
+            25,
+        ),
+    ],
+)
+def test_weighted_percentile_array_api_consistency(
+    global_random_seed, array_namespace, device, dtype_name, data, weights, percentile
+):
+    """Check `_weighted_percentile` gives consistent results with the array API."""
+    if array_namespace == "array_api_strict":
+        try:
+            import array_api_strict
+        except ImportError:
+            pass
+        else:
+            if device == array_api_strict.Device("device1"):
+                # See https://github.com/data-apis/array-api-strict/issues/134
+                pytest.xfail(
+                    "array_api_strict has a bug when indexing with a tuple of "
+                    "arrays on non-'CPU_DEVICE' devices."
+                )
+
+    xp = _array_api_for_tests(array_namespace, device)
+
+    # Skip test for the percentile=0 edge case (#20528) on namespace/device
+    # combinations where xp.nextafter is broken. This is the case for torch
+    # with the MPS device: https://github.com/pytorch/pytorch/issues/150027
+    zero = xp.zeros(1, device=device)
+    one = xp.ones(1, device=device)
+    if percentile == 0 and xp.all(xp.nextafter(zero, one) == zero):
+        pytest.xfail(f"xp.nextafter is broken on {device}")
+
+    rng = np.random.RandomState(global_random_seed)
+    X_np = data(rng) if callable(data) else data
+    weights_np = weights(rng) if callable(weights) else weights
+    # Ensure `data` has the correct dtype
+    X_np = X_np.astype(dtype_name)
+
+    result_np = _weighted_percentile(X_np, weights_np, percentile)
+    # Convert to array API arrays
+    X_xp = xp.asarray(X_np, device=device)
+    weights_xp = xp.asarray(weights_np, device=device)
+
+    with config_context(array_api_dispatch=True):
+        result_xp = _weighted_percentile(X_xp, weights_xp, percentile)
+        assert array_device(result_xp) == array_device(X_xp)
+        assert get_namespace(result_xp)[0] == get_namespace(X_xp)[0]
+        result_xp_np = _convert_to_numpy(result_xp, xp=xp)
+
+    assert result_xp_np.dtype == result_np.dtype
+    assert result_xp_np.shape == result_np.shape
+    assert_allclose(result_np, result_xp_np)
+
+    # Check the dtype is correct (`sample_weight` should follow `array`)
+    if dtype_name == "float32":
+        assert result_xp_np.dtype == result_np.dtype == np.float32
+    else:
+        assert result_xp_np.dtype == np.float64
+
+
 @pytest.mark.parametrize("sample_weight_ndim", [1, 2])
 def test_weighted_percentile_nan_filtered(sample_weight_ndim):
     """Test that calling _weighted_percentile on an array with nan values returns
