MAINT Use scipy sparse nanmin/nanmax for scipy>=1.11 (scikit-learn#27492)

lesteve · thomasjpfan · ogrisel · web-flow · commit 3179ce31cce4 · 2023-10-09T09:07:26.000+02:00
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com>
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
diff --git a/sklearn/utils/fixes.py b/sklearn/utils/fixes.py
@@ -146,6 +146,103 @@ def _sparse_linalg_cg(A, b, **kwargs):
         return scipy.sparse.linalg.cg(A, b, **kwargs)
 
 
+# TODO: Fuse the modern implementations of _sparse_min_max and _sparse_nan_min_max
+# into the public min_max_axis function when Scipy 1.11 is the minimum supported
+# version and delete the backport in the else branch below.
+if sp_base_version >= parse_version("1.11.0"):
+
+    def _sparse_min_max(X, axis):
+        the_min = X.min(axis=axis)
+        the_max = X.max(axis=axis)
+
+        if axis is not None:
+            the_min = the_min.toarray().ravel()
+            the_max = the_max.toarray().ravel()
+
+        return the_min, the_max
+
+    def _sparse_nan_min_max(X, axis):
+        the_min = X.nanmin(axis=axis)
+        the_max = X.nanmax(axis=axis)
+
+        if axis is not None:
+            the_min = the_min.toarray().ravel()
+            the_max = the_max.toarray().ravel()
+
+        return the_min, the_max
+
+else:
+    # This code is mostly taken from scipy 0.14 and extended to handle nans, see
+    # https://github.com/scikit-learn/scikit-learn/pull/11196
+    def _minor_reduce(X, ufunc):
+        major_index = np.flatnonzero(np.diff(X.indptr))
+
+        # reduceat tries to cast X.indptr to intp, which errors
+        # if it is int64 on a 32 bit system.
+        # Reinitializing prevents this where possible, see #13737
+        X = type(X)((X.data, X.indices, X.indptr), shape=X.shape)
+        value = ufunc.reduceat(X.data, X.indptr[major_index])
+        return major_index, value
+
+    def _min_or_max_axis(X, axis, min_or_max):
+        N = X.shape[axis]
+        if N == 0:
+            raise ValueError("zero-size array to reduction operation")
+        M = X.shape[1 - axis]
+        mat = X.tocsc() if axis == 0 else X.tocsr()
+        mat.sum_duplicates()
+        major_index, value = _minor_reduce(mat, min_or_max)
+        not_full = np.diff(mat.indptr)[major_index] < N
+        value[not_full] = min_or_max(value[not_full], 0)
+        mask = value != 0
+        major_index = np.compress(mask, major_index)
+        value = np.compress(mask, value)
+
+        if axis == 0:
+            res = scipy.sparse.coo_matrix(
+                (value, (np.zeros(len(value)), major_index)),
+                dtype=X.dtype,
+                shape=(1, M),
+            )
+        else:
+            res = scipy.sparse.coo_matrix(
+                (value, (major_index, np.zeros(len(value)))),
+                dtype=X.dtype,
+                shape=(M, 1),
+            )
+        return res.A.ravel()
+
+    def _sparse_min_or_max(X, axis, min_or_max):
+        if axis is None:
+            if 0 in X.shape:
+                raise ValueError("zero-size array to reduction operation")
+            zero = X.dtype.type(0)
+            if X.nnz == 0:
+                return zero
+            m = min_or_max.reduce(X.data.ravel())
+            if X.nnz != np.prod(X.shape):
+                m = min_or_max(zero, m)
+            return m
+        if axis < 0:
+            axis += 2
+        if (axis == 0) or (axis == 1):
+            return _min_or_max_axis(X, axis, min_or_max)
+        else:
+            raise ValueError("invalid axis, use 0 for rows, or 1 for columns")
+
+    def _sparse_min_max(X, axis):
+        return (
+            _sparse_min_or_max(X, axis, np.minimum),
+            _sparse_min_or_max(X, axis, np.maximum),
+        )
+
+    def _sparse_nan_min_max(X, axis):
+        return (
+            _sparse_min_or_max(X, axis, np.fmin),
+            _sparse_min_or_max(X, axis, np.fmax),
+        )
+
+
 ###############################################################################
 # Backport of Python 3.9's importlib.resources
 # TODO: Remove when Python 3.9 is the minimum supported version
diff --git a/sklearn/utils/sparsefuncs.py b/sklearn/utils/sparsefuncs.py
@@ -11,6 +11,7 @@
 import numpy as np
 import scipy.sparse as sp
 
+from ..utils.fixes import _sparse_min_max, _sparse_nan_min_max
 from ..utils.validation import _check_sample_weight
 from .sparsefuncs_fast import (
     csc_mean_variance_axis0 as _csc_mean_var_axis0,
@@ -417,72 +418,6 @@ def inplace_swap_column(X, m, n):
         _raise_typeerror(X)
 
 
-def _minor_reduce(X, ufunc):
-    major_index = np.flatnonzero(np.diff(X.indptr))
-
-    # reduceat tries casts X.indptr to intp, which errors
-    # if it is int64 on a 32 bit system.
-    # Reinitializing prevents this where possible, see #13737
-    X = type(X)((X.data, X.indices, X.indptr), shape=X.shape)
-    value = ufunc.reduceat(X.data, X.indptr[major_index])
-    return major_index, value
-
-
-def _min_or_max_axis(X, axis, min_or_max):
-    N = X.shape[axis]
-    if N == 0:
-        raise ValueError("zero-size array to reduction operation")
-    M = X.shape[1 - axis]
-    mat = X.tocsc() if axis == 0 else X.tocsr()
-    mat.sum_duplicates()
-    major_index, value = _minor_reduce(mat, min_or_max)
-    not_full = np.diff(mat.indptr)[major_index] < N
-    value[not_full] = min_or_max(value[not_full], 0)
-    mask = value != 0
-    major_index = np.compress(mask, major_index)
-    value = np.compress(mask, value)
-
-    if axis == 0:
-        res = sp.coo_matrix(
-            (value, (np.zeros(len(value)), major_index)), dtype=X.dtype, shape=(1, M)
-        )
-    else:
-        res = sp.coo_matrix(
-            (value, (major_index, np.zeros(len(value)))), dtype=X.dtype, shape=(M, 1)
-        )
-    return res.A.ravel()
-
-
-def _sparse_min_or_max(X, axis, min_or_max):
-    if axis is None:
-        if 0 in X.shape:
-            raise ValueError("zero-size array to reduction operation")
-        zero = X.dtype.type(0)
-        if X.nnz == 0:
-            return zero
-        m = min_or_max.reduce(X.data.ravel())
-        if X.nnz != np.prod(X.shape):
-            m = min_or_max(zero, m)
-        return m
-    if axis < 0:
-        axis += 2
-    if (axis == 0) or (axis == 1):
-        return _min_or_max_axis(X, axis, min_or_max)
-    else:
-        raise ValueError("invalid axis, use 0 for rows, or 1 for columns")
-
-
-def _sparse_min_max(X, axis):
-    return (
-        _sparse_min_or_max(X, axis, np.minimum),
-        _sparse_min_or_max(X, axis, np.maximum),
-    )
-
-
-def _sparse_nan_min_max(X, axis):
-    return (_sparse_min_or_max(X, axis, np.fmin), _sparse_min_or_max(X, axis, np.fmax))
-
-
 def min_max_axis(X, axis, ignore_nan=False):
     """Compute minimum and maximum along an axis on a CSR or CSC matrix.
 
diff --git a/sklearn/utils/tests/test_sparsefuncs.py b/sklearn/utils/tests/test_sparsefuncs.py
@@ -6,7 +6,6 @@
 from scipy import linalg
 
 from sklearn.datasets import make_classification
-from sklearn.utils import _IS_WASM
 from sklearn.utils._testing import assert_allclose
 from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS
 from sklearn.utils.sparsefuncs import (
@@ -796,17 +795,6 @@ def test_min_max(
     )
     X_sparse = sparse_format(X)
 
-    if (
-        _IS_WASM and large_indices and isinstance(X_sparse, sp.sparray)
-    ):  # pragma: nocover
-        pytest.xfail(
-            reason=(
-                "temporary xfailing test until it is fixed in main, see"
-                " https://github.com/scikit-learn/scikit-learn/issues/27470 for more"
-                " details."
-            )
-        )
-
     if large_indices:
         X_sparse.indices = X_sparse.indices.astype("int64")
         X_sparse.indptr = X_sparse.indptr.astype("int64")