Skip to content

Commit 3179ce3

Browse files
lesteve, thomasjpfan, and ogrisel
authored
MAINT Use scipy sparse nanmin/nanmax for scipy>=1.11 (scikit-learn#27492)
Co-authored-by: Thomas J. Fan <thomasjpfan@gmail.com> Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
1 parent dda6337 commit 3179ce3

File tree

3 files changed

+98
-78
lines changed

3 files changed

+98
-78
lines changed

sklearn/utils/fixes.py

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,103 @@ def _sparse_linalg_cg(A, b, **kwargs):
146146
return scipy.sparse.linalg.cg(A, b, **kwargs)
147147

148148

149+
# TODO: Fuse the modern implementations of _sparse_min_max and _sparse_nan_min_max
150+
# into the public min_max_axis function when Scipy 1.11 is the minimum supported
151+
# version and delete the backport in the else branch below.
152+
if sp_base_version >= parse_version("1.11.0"):
153+
154+
def _sparse_min_max(X, axis):
155+
the_min = X.min(axis=axis)
156+
the_max = X.max(axis=axis)
157+
158+
if axis is not None:
159+
the_min = the_min.toarray().ravel()
160+
the_max = the_max.toarray().ravel()
161+
162+
return the_min, the_max
163+
164+
def _sparse_nan_min_max(X, axis):
165+
the_min = X.nanmin(axis=axis)
166+
the_max = X.nanmax(axis=axis)
167+
168+
if axis is not None:
169+
the_min = the_min.toarray().ravel()
170+
the_max = the_max.toarray().ravel()
171+
172+
return the_min, the_max
173+
174+
else:
175+
# This code is mostly taken from scipy 0.14 and extended to handle nans, see
176+
# https://github.com/scikit-learn/scikit-learn/pull/11196
177+
def _minor_reduce(X, ufunc):
178+
major_index = np.flatnonzero(np.diff(X.indptr))
179+
180+
# reduceat tries casts X.indptr to intp, which errors
181+
# if it is int64 on a 32 bit system.
182+
# Reinitializing prevents this where possible, see #13737
183+
X = type(X)((X.data, X.indices, X.indptr), shape=X.shape)
184+
value = ufunc.reduceat(X.data, X.indptr[major_index])
185+
return major_index, value
186+
187+
def _min_or_max_axis(X, axis, min_or_max):
    """Reduce sparse ``X`` along ``axis`` (0 or 1) with ``min_or_max``.

    Parameters
    ----------
    X : sparse container providing ``tocsc``/``tocsr``, ``shape`` and ``dtype``
    axis : {0, 1}
        Axis that is reduced away.
    min_or_max : NumPy ufunc
        Binary ufunc used for the reduction; it is also applied against 0 for
        slices that contain implicit zeros.

    Returns
    -------
    ndarray of shape (X.shape[1 - axis],) and dtype ``X.dtype``
        One reduced value per index of the axis that is kept.

    Raises
    ------
    ValueError
        If ``X`` has length zero along ``axis``.
    """
    N = X.shape[axis]
    if N == 0:
        raise ValueError("zero-size array to reduction operation")
    M = X.shape[1 - axis]

    # Put the reduced axis on the minor side so that each output entry
    # corresponds to one contiguous run of ``mat.data``.
    mat = X.tocsc() if axis == 0 else X.tocsr()
    mat.sum_duplicates()
    major_index, value = _minor_reduce(mat, min_or_max)

    # A slice with fewer than N stored entries also contains implicit zeros,
    # which must take part in the reduction.
    not_full = np.diff(mat.indptr)[major_index] < N
    value[not_full] = min_or_max(value[not_full], 0)

    # Slices without any stored entry reduce to 0; every other slice gets its
    # reduced value written at its own position.
    result = np.zeros(M, dtype=X.dtype)
    result[major_index] = value
    return result
214+
215+
def _sparse_min_or_max(X, axis, min_or_max):
216+
if axis is None:
217+
if 0 in X.shape:
218+
raise ValueError("zero-size array to reduction operation")
219+
zero = X.dtype.type(0)
220+
if X.nnz == 0:
221+
return zero
222+
m = min_or_max.reduce(X.data.ravel())
223+
if X.nnz != np.prod(X.shape):
224+
m = min_or_max(zero, m)
225+
return m
226+
if axis < 0:
227+
axis += 2
228+
if (axis == 0) or (axis == 1):
229+
return _min_or_max_axis(X, axis, min_or_max)
230+
else:
231+
raise ValueError("invalid axis, use 0 for rows, or 1 for columns")
232+
233+
def _sparse_min_max(X, axis):
    """Return ``(min, max)`` of sparse ``X`` along ``axis``.

    Both values come from ``_sparse_min_or_max``, called with ``np.minimum``
    and ``np.maximum`` respectively.
    """
    return (
        _sparse_min_or_max(X, axis, np.minimum),
        _sparse_min_or_max(X, axis, np.maximum),
    )
238+
239+
def _sparse_nan_min_max(X, axis):
    """Return ``(min, max)`` of sparse ``X`` along ``axis`` using fmin/fmax.

    Both values come from ``_sparse_min_or_max``, called with ``np.fmin`` and
    ``np.fmax`` respectively.
    """
    return (
        _sparse_min_or_max(X, axis, np.fmin),
        _sparse_min_or_max(X, axis, np.fmax),
    )
244+
245+
149246
###############################################################################
150247
# Backport of Python 3.9's importlib.resources
151248
# TODO: Remove when Python 3.9 is the minimum supported version

sklearn/utils/sparsefuncs.py

Lines changed: 1 addition & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import numpy as np
1212
import scipy.sparse as sp
1313

14+
from ..utils.fixes import _sparse_min_max, _sparse_nan_min_max
1415
from ..utils.validation import _check_sample_weight
1516
from .sparsefuncs_fast import (
1617
csc_mean_variance_axis0 as _csc_mean_var_axis0,
@@ -417,72 +418,6 @@ def inplace_swap_column(X, m, n):
417418
_raise_typeerror(X)
418419

419420

420-
def _minor_reduce(X, ufunc):
421-
major_index = np.flatnonzero(np.diff(X.indptr))
422-
423-
# reduceat tries casts X.indptr to intp, which errors
424-
# if it is int64 on a 32 bit system.
425-
# Reinitializing prevents this where possible, see #13737
426-
X = type(X)((X.data, X.indices, X.indptr), shape=X.shape)
427-
value = ufunc.reduceat(X.data, X.indptr[major_index])
428-
return major_index, value
429-
430-
431-
def _min_or_max_axis(X, axis, min_or_max):
    """Reduce sparse ``X`` along ``axis`` (0 or 1) with ``min_or_max``.

    Parameters
    ----------
    X : sparse container providing ``tocsc``/``tocsr``, ``shape`` and ``dtype``
    axis : {0, 1}
        Axis that is reduced away.
    min_or_max : NumPy ufunc
        Binary ufunc used for the reduction; it is also applied against 0 for
        slices that contain implicit zeros.

    Returns
    -------
    ndarray of shape (X.shape[1 - axis],) and dtype ``X.dtype``
        One reduced value per index of the axis that is kept.

    Raises
    ------
    ValueError
        If ``X`` has length zero along ``axis``.
    """
    N = X.shape[axis]
    if N == 0:
        raise ValueError("zero-size array to reduction operation")
    M = X.shape[1 - axis]

    # Put the reduced axis on the minor side so that each output entry
    # corresponds to one contiguous run of ``mat.data``.
    mat = X.tocsc() if axis == 0 else X.tocsr()
    mat.sum_duplicates()
    major_index, value = _minor_reduce(mat, min_or_max)

    # A slice with fewer than N stored entries also contains implicit zeros,
    # which must take part in the reduction.
    not_full = np.diff(mat.indptr)[major_index] < N
    value[not_full] = min_or_max(value[not_full], 0)

    # Slices without any stored entry reduce to 0; every other slice gets its
    # reduced value written at its own position.
    result = np.zeros(M, dtype=X.dtype)
    result[major_index] = value
    return result
454-
455-
456-
def _sparse_min_or_max(X, axis, min_or_max):
457-
if axis is None:
458-
if 0 in X.shape:
459-
raise ValueError("zero-size array to reduction operation")
460-
zero = X.dtype.type(0)
461-
if X.nnz == 0:
462-
return zero
463-
m = min_or_max.reduce(X.data.ravel())
464-
if X.nnz != np.prod(X.shape):
465-
m = min_or_max(zero, m)
466-
return m
467-
if axis < 0:
468-
axis += 2
469-
if (axis == 0) or (axis == 1):
470-
return _min_or_max_axis(X, axis, min_or_max)
471-
else:
472-
raise ValueError("invalid axis, use 0 for rows, or 1 for columns")
473-
474-
475-
def _sparse_min_max(X, axis):
    """Return ``(min, max)`` of sparse ``X`` along ``axis``.

    Both values come from ``_sparse_min_or_max``, called with ``np.minimum``
    and ``np.maximum`` respectively.
    """
    return (
        _sparse_min_or_max(X, axis, np.minimum),
        _sparse_min_or_max(X, axis, np.maximum),
    )
480-
481-
482-
def _sparse_nan_min_max(X, axis):
    """Return ``(min, max)`` of sparse ``X`` along ``axis`` using fmin/fmax.

    Both values come from ``_sparse_min_or_max``, called with ``np.fmin`` and
    ``np.fmax`` respectively.
    """
    return (_sparse_min_or_max(X, axis, np.fmin), _sparse_min_or_max(X, axis, np.fmax))
484-
485-
486421
def min_max_axis(X, axis, ignore_nan=False):
487422
"""Compute minimum and maximum along an axis on a CSR or CSC matrix.
488423

sklearn/utils/tests/test_sparsefuncs.py

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from scipy import linalg
77

88
from sklearn.datasets import make_classification
9-
from sklearn.utils import _IS_WASM
109
from sklearn.utils._testing import assert_allclose
1110
from sklearn.utils.fixes import CSC_CONTAINERS, CSR_CONTAINERS, LIL_CONTAINERS
1211
from sklearn.utils.sparsefuncs import (
@@ -796,17 +795,6 @@ def test_min_max(
796795
)
797796
X_sparse = sparse_format(X)
798797

799-
if (
800-
_IS_WASM and large_indices and isinstance(X_sparse, sp.sparray)
801-
): # pragma: nocover
802-
pytest.xfail(
803-
reason=(
804-
"temporary xfailing test until it is fixed in main, see"
805-
" https://github.com/scikit-learn/scikit-learn/issues/27470 for more"
806-
" details."
807-
)
808-
)
809-
810798
if large_indices:
811799
X_sparse.indices = X_sparse.indices.astype("int64")
812800
X_sparse.indptr = X_sparse.indptr.astype("int64")

0 commit comments

Comments
 (0)