Merge branch 'scikit-learn:main' into submodulev2

adam2392 · web-flow · commit 5a2ac9aae2b7 · 2023-06-13T15:18:49.000-04:00
diff --git a/doc/developers/develop.rst b/doc/developers/develop.rst
@@ -535,6 +535,9 @@ The current set of estimator tags are:
 allow_nan (default=False)
     whether the estimator supports data with missing values encoded as np.nan
 
+array_api_support (default=False)
+    whether the estimator supports Array API compatible inputs.
+
 binary_only (default=False)
     whether estimator supports binary classification but lacks multi-class
     classification support.
diff --git a/doc/modules/array_api.rst b/doc/modules/array_api.rst
@@ -93,3 +93,18 @@ Estimators with support for `Array API`-compatible inputs
 Coverage for more estimators is expected to grow over time. Please follow the
 dedicated `meta-issue on GitHub
 <https://github.com/scikit-learn/scikit-learn/issues/22352>`_ to track progress.
+
+Common estimator checks
+=======================
+
+Add the `array_api_support` tag to an estimator's set of tags to indicate that
+it supports the Array API. This will enable dedicated checks as part of the
+common tests to verify that the estimator's results are the same when using
+vanilla NumPy and Array API inputs.
+
+To run these checks you need to install
+`array_api_compat <https://github.com/data-apis/array-api-compat>`_ in your
+test environment. To run the full set of checks you need to install both
+`PyTorch <https://pytorch.org/>`_ and `CuPy <https://cupy.dev/>`_ and have
+a GPU. Checks that cannot be executed or have missing dependencies will be
+automatically skipped.
diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
@@ -628,6 +628,10 @@ Changelog
   The `sample_interval_` attribute is deprecated and will be removed in 1.5.
   :pr:`25190` by :user:`Vincent Maladière <Vincent-Maladiere>`.
 
+- |Fix| :class:`preprocessing.PowerTransformer` now correctly raises an error
+  when using `method="box-cox"` on data with a constant `np.nan` column.
+  :pr:`26400` by :user:`Yao Xiao <Charlie-XIAO>`.
+
 :mod:`sklearn.svm`
 ..................
 
diff --git a/sklearn/discriminant_analysis.py b/sklearn/discriminant_analysis.py
@@ -745,6 +745,9 @@ def decision_function(self, X):
         # Only override for the doc
         return super().decision_function(X)
 
+    def _more_tags(self):
+        return {"array_api_support": True}
+
 
 class QuadraticDiscriminantAnalysis(ClassifierMixin, BaseEstimator):
     """Quadratic Discriminant Analysis.
diff --git a/sklearn/preprocessing/_data.py b/sklearn/preprocessing/_data.py
@@ -3311,9 +3311,13 @@ def _box_cox_optimize(self, x):
 
         We here use scipy builtins which uses the brent optimizer.
         """
+        mask = np.isnan(x)
+        if np.all(mask):
+            raise ValueError("Column must not be all nan.")
+
         # the computation of lambda is influenced by NaNs so we need to
         # get rid of them
-        _, lmbda = stats.boxcox(x[~np.isnan(x)], lmbda=None)
+        _, lmbda = stats.boxcox(x[~mask], lmbda=None)
 
         return lmbda
 
diff --git a/sklearn/preprocessing/tests/test_data.py b/sklearn/preprocessing/tests/test_data.py
@@ -2527,6 +2527,21 @@ def test_power_transformer_copy_False(method, standardize):
     assert X_trans is X_inv_trans
 
 
+def test_power_transformer_box_cox_raise_all_nans_col():
+    """Check that box-cox raises an informative error when a column is all nans.
+
+    Non-regression test for gh-26303
+    """
+    X = rng.random_sample((4, 5))
+    X[:, 0] = np.nan
+
+    err_msg = "Column must not be all nan."
+
+    pt = PowerTransformer(method="box-cox")
+    with pytest.raises(ValueError, match=err_msg):
+        pt.fit_transform(X)
+
+
 @pytest.mark.parametrize(
     "X_2",
     [
diff --git a/sklearn/tests/test_discriminant_analysis.py b/sklearn/tests/test_discriminant_analysis.py
@@ -4,16 +4,12 @@
 
 from scipy import linalg
 
-from sklearn.base import clone
-from sklearn._config import config_context
 from sklearn.utils import check_random_state
 from sklearn.utils._testing import assert_array_equal
 from sklearn.utils._testing import assert_array_almost_equal
 from sklearn.utils._testing import assert_allclose
 from sklearn.utils._testing import assert_almost_equal
-from sklearn.utils._array_api import _convert_to_numpy
 from sklearn.utils._testing import _convert_container
-from sklearn.utils._testing import skip_if_array_api_compat_not_configured
 
 from sklearn.datasets import make_blobs
 from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
@@ -675,121 +671,3 @@ def test_get_feature_names_out():
         dtype=object,
     )
     assert_array_equal(names_out, expected_names_out)
-
-
-@skip_if_array_api_compat_not_configured
-@pytest.mark.parametrize("array_namespace", ["numpy.array_api", "cupy.array_api"])
-def test_lda_array_api(array_namespace):
-    """Check that the array_api Array gives the same results as ndarrays."""
-    xp = pytest.importorskip(array_namespace)
-
-    X_xp = xp.asarray(X)
-    y_xp = xp.asarray(y3)
-
-    lda = LinearDiscriminantAnalysis()
-    lda.fit(X, y3)
-
-    array_attributes = {
-        key: value for key, value in vars(lda).items() if isinstance(value, np.ndarray)
-    }
-
-    lda_xp = clone(lda)
-    with config_context(array_api_dispatch=True):
-        lda_xp.fit(X_xp, y_xp)
-
-    # Fitted-attributes which are arrays must have the same
-    # namespace than the one of the training data.
-    for key, attribute in array_attributes.items():
-        lda_xp_param = getattr(lda_xp, key)
-        assert hasattr(lda_xp_param, "__array_namespace__")
-
-        lda_xp_param_np = _convert_to_numpy(lda_xp_param, xp=xp)
-        assert_allclose(
-            attribute, lda_xp_param_np, err_msg=f"{key} not the same", atol=1e-3
-        )
-
-    # Check predictions are the same
-    methods = (
-        "decision_function",
-        "predict",
-        "predict_log_proba",
-        "predict_proba",
-        "transform",
-    )
-
-    for method in methods:
-        result = getattr(lda, method)(X)
-        with config_context(array_api_dispatch=True):
-            result_xp = getattr(lda_xp, method)(X_xp)
-        assert hasattr(
-            result_xp, "__array_namespace__"
-        ), f"{method} did not output an array_namespace"
-
-        result_xp_np = _convert_to_numpy(result_xp, xp=xp)
-
-        assert_allclose(
-            result,
-            result_xp_np,
-            err_msg=f"{method} did not the return the same result",
-            atol=1e-5,
-        )
-
-
-@skip_if_array_api_compat_not_configured
-@pytest.mark.parametrize("device", ["cuda", "cpu"])
-@pytest.mark.parametrize("dtype", ["float32", "float64"])
-def test_lda_array_torch(device, dtype):
-    """Check running on PyTorch Tensors gives the same results as NumPy"""
-    torch = pytest.importorskip("torch")
-    if device == "cuda" and not torch.has_cuda:
-        pytest.skip("test requires cuda")
-
-    lda = LinearDiscriminantAnalysis()
-    X_np = X6.astype(dtype)
-    y_np = y6.astype(dtype)
-    lda.fit(X_np, y_np)
-
-    X_torch = torch.asarray(X_np, device=device)
-    y_torch = torch.asarray(y_np, device=device)
-    lda_xp = clone(lda)
-    with config_context(array_api_dispatch=True):
-        lda_xp.fit(X_torch, y_torch)
-
-    array_attributes = {
-        key: value for key, value in vars(lda).items() if isinstance(value, np.ndarray)
-    }
-
-    for key, attribute in array_attributes.items():
-        lda_xp_param = getattr(lda_xp, key)
-        assert isinstance(lda_xp_param, torch.Tensor)
-        assert lda_xp_param.device.type == device
-
-        lda_xp_param_np = _convert_to_numpy(lda_xp_param, xp=torch)
-        assert_allclose(
-            attribute, lda_xp_param_np, err_msg=f"{key} not the same", atol=1e-3
-        )
-
-    # Check predictions are the same
-    methods = (
-        "decision_function",
-        "predict",
-        "predict_log_proba",
-        "predict_proba",
-        "transform",
-    )
-    for method in methods:
-        result = getattr(lda, method)(X_np)
-        with config_context(array_api_dispatch=True):
-            result_xp = getattr(lda_xp, method)(X_torch)
-
-        assert isinstance(result_xp, torch.Tensor)
-        assert result_xp.device.type == device
-
-        result_xp_np = _convert_to_numpy(result_xp, xp=torch)
-
-        assert_allclose(
-            result,
-            result_xp_np,
-            err_msg=f"{method} did not the return the same result",
-            atol=1e-6,
-        )
diff --git a/sklearn/utils/_tags.py b/sklearn/utils/_tags.py
@@ -1,6 +1,7 @@
 import numpy as np
 
 _DEFAULT_TAGS = {
+    "array_api_support": False,
     "non_deterministic": False,
     "requires_positive_X": False,
     "requires_positive_y": False,
diff --git a/sklearn/utils/estimator_checks.py b/sklearn/utils/estimator_checks.py
@@ -1,4 +1,6 @@
 import warnings
+import importlib
+import itertools
 import pickle
 import re
 from copy import deepcopy
@@ -58,6 +60,7 @@
 from ..utils.fixes import sp_version
 from ..utils.fixes import parse_version
 from ..utils.validation import check_is_fitted
+from ..utils._array_api import _convert_to_numpy, get_namespace, device as array_device
 from ..utils._param_validation import make_constraint
 from ..utils._param_validation import generate_invalid_param_val
 from ..utils._param_validation import InvalidParameterError
@@ -73,6 +76,7 @@
 from ..datasets import (
     load_iris,
     make_blobs,
+    make_classification,
     make_multilabel_classification,
     make_regression,
 )
@@ -133,6 +137,21 @@ def _yield_checks(estimator):
 
     yield check_estimator_get_tags_default_keys
 
+    if tags["array_api_support"]:
+        for array_namespace in ["numpy.array_api", "cupy.array_api", "cupy", "torch"]:
+            if array_namespace == "torch":
+                for device, dtype in itertools.product(
+                    ("cpu", "cuda"), ("float64", "float32")
+                ):
+                    yield partial(
+                        check_array_api_input,
+                        array_namespace=array_namespace,
+                        dtype=dtype,
+                        device=device,
+                    )
+            else:
+                yield partial(check_array_api_input, array_namespace=array_namespace)
+
 
 def _yield_classifier_checks(classifier):
     tags = _safe_tags(classifier)
@@ -831,6 +850,111 @@ def _generate_sparse_matrix(X_csr):
         yield sparse_format + "_64", X
 
 
+def check_array_api_input(
+    name, estimator_orig, *, array_namespace, device=None, dtype="float64"
+):
+    """Check that the array_api Array gives the same results as ndarrays."""
+    try:
+        array_mod = importlib.import_module(array_namespace)
+    except ModuleNotFoundError:
+        raise SkipTest(
+            f"{array_namespace} is not installed: not checking array_api input"
+        )
+    try:
+        import array_api_compat  # noqa
+    except ImportError:
+        raise SkipTest(
+            "array_api_compat is not installed: not checking array_api input"
+        )
+
+    # First create an array using the chosen array module and then get the
+    # corresponding (compatibility wrapped) array namespace based on it.
+    # This is because `cupy` is not the same as the compatibility wrapped
+    # namespace of a CuPy array.
+    xp = array_api_compat.get_namespace(array_mod.asarray(1))
+
+    if array_namespace == "torch" and device == "cuda" and not xp.has_cuda:
+        raise SkipTest("PyTorch test requires cuda, which is not available")
+    elif array_namespace in {"cupy", "cupy.array_api"}:  # pragma: nocover
+        import cupy
+
+        if cupy.cuda.runtime.getDeviceCount() == 0:
+            raise SkipTest("CuPy test requires cuda, which is not available")
+
+    X, y = make_classification(random_state=42)
+    X = X.astype(dtype, copy=False)
+
+    X = _enforce_estimator_tags_X(estimator_orig, X)
+    y = _enforce_estimator_tags_y(estimator_orig, y)
+
+    est = clone(estimator_orig)
+
+    X_xp = xp.asarray(X, device=device)
+    y_xp = xp.asarray(y, device=device)
+
+    est.fit(X, y)
+
+    array_attributes = {
+        key: value for key, value in vars(est).items() if isinstance(value, np.ndarray)
+    }
+
+    est_xp = clone(est)
+    with config_context(array_api_dispatch=True):
+        est_xp.fit(X_xp, y_xp)
+
+    # Fitted attributes which are arrays must have the same
+    # namespace as the one of the training data.
+    for key, attribute in array_attributes.items():
+        est_xp_param = getattr(est_xp, key)
+        assert (
+            get_namespace(est_xp_param)[0] == get_namespace(X_xp)[0]
+        ), f"'{key}' attribute is in wrong namespace"
+
+        assert array_device(est_xp_param) == array_device(X_xp)
+
+        est_xp_param_np = _convert_to_numpy(est_xp_param, xp=xp)
+        assert_allclose(
+            attribute,
+            est_xp_param_np,
+            err_msg=f"{key} not the same",
+            atol=np.finfo(X.dtype).eps * 100,
+        )
+
+    # Check estimator methods, if supported, give the same results
+    methods = (
+        "decision_function",
+        "predict",
+        "predict_log_proba",
+        "predict_proba",
+        "transform",
+        "inverse_transform",
+    )
+
+    for method_name in methods:
+        method = getattr(est, method_name, None)
+        if method is None:
+            continue
+
+        result = method(X)
+        with config_context(array_api_dispatch=True):
+            result_xp = getattr(est_xp, method_name)(X_xp)
+
+        assert (
+            get_namespace(result_xp)[0] == get_namespace(X_xp)[0]
+        ), f"'{method}' output is in wrong namespace"
+
+        assert array_device(result_xp) == array_device(X_xp)
+
+        result_xp_np = _convert_to_numpy(result_xp, xp=xp)
+
+        assert_allclose(
+            result,
+            result_xp_np,
+            err_msg=f"{method} did not the return the same result",
+            atol=np.finfo(X.dtype).eps * 100,
+        )
+
+
 def check_estimator_sparse_data(name, estimator_orig):
     rng = np.random.RandomState(0)
     X = rng.uniform(size=(40, 3))
diff --git a/sklearn/utils/tests/test_estimator_checks.py b/sklearn/utils/tests/test_estimator_checks.py