FIX accept multilabel-indicator in _get_response_values (scikit-learn#27002)

glemaitre · thomasjpfan · jeremiedbb · web-flow · commit 3c9495cc64b7 · 2023-09-17T15:15:47.000+02:00
Co-authored-by: Thomas J. Fan &lt;thomasjpfan@gmail.com&gt;
Co-authored-by: Jérémie du Boisberranger &lt;34657725+jeremiedbb@users.noreply.github.com&gt;
Co-authored-by: Stefanie Senger &lt;91849487+StefanieSenger@users.noreply.github.com&gt;
Co-authored-by: Vladimir Fokow &lt;57260995+VladimirFokow@users.noreply.github.com&gt;
Co-authored-by: jeremie du boisberranger &lt;jeremiedbb@yahoo.fr&gt;
Co-authored-by: Xuefeng Xu &lt;xuxf100@qq.com&gt;
Co-authored-by: Tim Head &lt;betatim@gmail.com&gt;
Co-authored-by: Raphael &lt;raphael1peer@gmail.com&gt;
Co-authored-by: Olivier Grisel &lt;olivier.grisel@ensta.org&gt;
Co-authored-by: Sayed Qaiser Ali &lt;66676360+sqali@users.noreply.github.com&gt;
Co-authored-by: Loïc Estève &lt;loic.esteve@ymail.com&gt;
Co-authored-by: Xiao Yuan &lt;yuanx749@gmail.com&gt;
diff --git a/doc/whats_new/v1.3.rst b/doc/whats_new/v1.3.rst
@@ -43,6 +43,13 @@ Changelog
   ``transform`` when ``add_indicator`` is set to ``True`` and missing values are observed
   during ``fit``. :pr:`26600` by :user:`Shreesha Kumar Bhat <Shreesha3112>`.
 
+:mod:`sklearn.metrics`
+.......................
+
+- |Fix| Scorers used with :func:`metrics.get_scorer` handle properly
+  multilabel-indicator matrix.
+  :pr:`27002` by :user:`Guillaume Lemaitre <glemaitre>`.
+
 :mod:`sklearn.neighbors`
 ........................
 
diff --git a/sklearn/metrics/tests/test_score_objects.py b/sklearn/metrics/tests/test_score_objects.py
@@ -542,12 +542,15 @@ def test_thresholded_scorers_multilabel_indicator_data():
 
     # Multi-output multi-class decision_function
     # TODO Is there any yet?
-    clf = DecisionTreeClassifier()
-    clf.fit(X_train, y_train)
-    clf._predict_proba = clf.predict_proba
-    clf.predict_proba = None
-    clf.decision_function = lambda X: [p[:, 1] for p in clf._predict_proba(X)]
+    class TreeWithDecisionFunction(DecisionTreeClassifier):
+        # disable predict_proba
+        predict_proba = None
+
+        def decision_function(self, X):
+            return [p[:, 1] for p in DecisionTreeClassifier.predict_proba(self, X)]
 
+    clf = TreeWithDecisionFunction()
+    clf.fit(X_train, y_train)
     y_proba = clf.decision_function(X_test)
     score1 = get_scorer("roc_auc")(clf, X_test, y_test)
     score2 = roc_auc_score(y_test, np.vstack([p for p in y_proba]).T)
@@ -800,7 +803,19 @@ def test_multimetric_scorer_calls_method_once(
     assert decision_function_func.call_count == expected_decision_func_count
 
 
-def test_multimetric_scorer_calls_method_once_classifier_no_decision():
+@pytest.mark.parametrize(
+    "scorers",
+    [
+        (["roc_auc", "neg_log_loss"]),
+        (
+            {
+                "roc_auc": make_scorer(roc_auc_score, needs_threshold=True),
+                "neg_log_loss": make_scorer(log_loss, needs_proba=True),
+            }
+        ),
+    ],
+)
+def test_multimetric_scorer_calls_method_once_classifier_no_decision(scorers):
     predict_proba_call_cnt = 0
 
     class MockKNeighborsClassifier(KNeighborsClassifier):
@@ -815,7 +830,6 @@ def predict_proba(self, X):
     clf = MockKNeighborsClassifier(n_neighbors=1)
     clf.fit(X, y)
 
-    scorers = ["roc_auc", "neg_log_loss"]
     scorer_dict = _check_multimetric_scoring(clf, scorers)
     scorer = _MultimetricScorer(scorers=scorer_dict)
     scorer(clf, X, y)
@@ -838,7 +852,7 @@ def predict(self, X):
     clf = MockDecisionTreeRegressor()
     clf.fit(X, y)
 
-    scorers = {"neg_mse": "neg_mean_squared_error", "r2": "roc_auc"}
+    scorers = {"neg_mse": "neg_mean_squared_error", "r2": "r2"}
     scorer_dict = _check_multimetric_scoring(clf, scorers)
     scorer = _MultimetricScorer(scorers=scorer_dict)
     scorer(clf, X, y)
@@ -1357,3 +1371,18 @@ def score(y_true, y_pred, param=None):
             ValueError, match="is only supported if enable_metadata_routing=True"
         ):
             scorer(clf, X, y, param="blah")
+
+
+def test_get_scorer_multilabel_indicator():
+    """Check that our scorer deal with multi-label indicator matrices.
+
+    Non-regression test for:
+    https://github.com/scikit-learn/scikit-learn/issues/26817
+    """
+    X, Y = make_multilabel_classification(n_samples=72, n_classes=3, random_state=0)
+    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)
+
+    estimator = KNeighborsClassifier().fit(X_train, Y_train)
+
+    score = get_scorer("average_precision")(estimator, X_test, Y_test)
+    assert score > 0.8
diff --git a/sklearn/utils/_response.py b/sklearn/utils/_response.py
@@ -5,9 +5,110 @@
 import numpy as np
 
 from ..base import is_classifier
+from .multiclass import type_of_target
 from .validation import _check_response_method, check_is_fitted
 
 
+def _process_predict_proba(*, y_pred, target_type, classes, pos_label):
+    """Get the response values when the response method is `predict_proba`.
+
+    This function process the `y_pred` array in the binary and multi-label cases.
+    In the binary case, it selects the column corresponding to the positive
+    class. In the multi-label case, it stacks the predictions if they are not
+    in the "compressed" format `(n_samples, n_outputs)`.
+
+    Parameters
+    ----------
+    y_pred : ndarray
+        Output of `estimator.predict_proba`. The shape depends on the target type:
+
+        - for binary classification, it is a 2d array of shape `(n_samples, 2)`;
+        - for multiclass classification, it is a 2d array of shape
+          `(n_samples, n_classes)`;
+        - for multilabel classification, it is either a list of 2d arrays of shape
+          `(n_samples, 2)` (e.g. `RandomForestClassifier` or `KNeighborsClassifier`) or
+          an array of shape `(n_samples, n_outputs)` (e.g. `MLPClassifier` or
+          `RidgeClassifier`).
+
+    target_type : {"binary", "multiclass", "multilabel-indicator"}
+        Type of the target.
+
+    classes : ndarray of shape (n_classes,) or list of such arrays
+        Class labels as reported by `estimator.classes_`.
+
+    pos_label : int, float, bool or str
+        Only used with binary and multiclass targets.
+
+    Returns
+    -------
+    y_pred : ndarray of shape (n_samples,), (n_samples, n_classes) or \
+            (n_samples, n_output)
+        Compressed predictions format as requested by the metrics.
+    """
+    if target_type == "binary" and y_pred.shape[1] < 2:
+        # We don't handle classifiers trained on a single class.
+        raise ValueError(
+            f"Got predict_proba of shape {y_pred.shape}, but need "
+            "classifier with two classes."
+        )
+
+    if target_type == "binary":
+        col_idx = np.flatnonzero(classes == pos_label)[0]
+        return y_pred[:, col_idx]
+    elif target_type == "multilabel-indicator":
+        # Use a compress format of shape `(n_samples, n_output)`.
+        # Only `MLPClassifier` and `RidgeClassifier` return an array of shape
+        # `(n_samples, n_outputs)`.
+        if isinstance(y_pred, list):
+            # list of arrays of shape `(n_samples, 2)`
+            return np.vstack([p[:, -1] for p in y_pred]).T
+        else:
+            # array of shape `(n_samples, n_outputs)`
+            return y_pred
+
+    return y_pred
+
+
+def _process_decision_function(*, y_pred, target_type, classes, pos_label):
+    """Get the response values when the response method is `decision_function`.
+
+    This function process the `y_pred` array in the binary and multi-label cases.
+    In the binary case, it inverts the sign of the score if the positive label
+    is not `classes[1]`. In the multi-label case, it stacks the predictions if
+    they are not in the "compressed" format `(n_samples, n_outputs)`.
+
+    Parameters
+    ----------
+    y_pred : ndarray
+        Output of `estimator.predict_proba`. The shape depends on the target type:
+
+        - for binary classification, it is a 1d array of shape `(n_samples,)` where the
+          sign is assuming that `classes[1]` is the positive class;
+        - for multiclass classification, it is a 2d array of shape
+          `(n_samples, n_classes)`;
+        - for multilabel classification, it is a 2d array of shape `(n_samples,
+          n_outputs)`.
+
+    target_type : {"binary", "multiclass", "multilabel-indicator"}
+        Type of the target.
+
+    classes : ndarray of shape (n_classes,) or list of such arrays
+        Class labels as reported by `estimator.classes_`.
+
+    pos_label : int, float, bool or str
+        Only used with binary and multiclass targets.
+
+    Returns
+    -------
+    y_pred : ndarray of shape (n_samples,), (n_samples, n_classes) or \
+            (n_samples, n_output)
+        Compressed predictions format as requested by the metrics.
+    """
+    if target_type == "binary" and pos_label == classes[0]:
+        return -1 * y_pred
+    return y_pred
+
+
 def _get_response_values(
     estimator,
     X,
@@ -16,12 +117,18 @@ def _get_response_values(
 ):
     """Compute the response values of a classifier or a regressor.
 
-    The response values are predictions, one scalar value for each sample in X
-    that depends on the specific choice of `response_method`.
+    The response values are predictions such that it follows the following shape:
+
+    - for binary classification, it is a 1d array of shape `(n_samples,)`;
+    - for multiclass classification, it is a 2d array of shape `(n_samples, n_classes)`;
+    - for multilabel classification, it is a 2d array of shape `(n_samples, n_outputs)`;
+    - for regression, it is a 1d array of shape `(n_samples,)`.
 
     If `estimator` is a binary classifier, also return the label for the
     effective positive class.
 
+    This utility is used primarily in the displays and the scikit-learn scorers.
+
     .. versionadded:: 1.3
 
     Parameters
@@ -51,8 +158,9 @@ def _get_response_values(
 
     Returns
     -------
-    y_pred : ndarray of shape (n_samples,)
-        Target scores calculated from the provided response_method
+    y_pred : ndarray of shape (n_samples,), (n_samples, n_classes) or \
+            (n_samples, n_outputs)
+        Target scores calculated from the provided `response_method`
         and `pos_label`.
 
     pos_label : int, float, bool, str or None
@@ -72,32 +180,33 @@ def _get_response_values(
     if is_classifier(estimator):
         prediction_method = _check_response_method(estimator, response_method)
         classes = estimator.classes_
-        target_type = "binary" if len(classes) <= 2 else "multiclass"
+        target_type = type_of_target(classes)
 
-        if pos_label is not None and pos_label not in classes.tolist():
-            raise ValueError(
-                f"pos_label={pos_label} is not a valid label: It should be "
-                f"one of {classes}"
-            )
-        elif pos_label is None and target_type == "binary":
-            pos_label = pos_label if pos_label is not None else classes[-1]
+        if target_type in ("binary", "multiclass"):
+            if pos_label is not None and pos_label not in classes.tolist():
+                raise ValueError(
+                    f"pos_label={pos_label} is not a valid label: It should be "
+                    f"one of {classes}"
+                )
+            elif pos_label is None and target_type == "binary":
+                pos_label = classes[-1]
 
         y_pred = prediction_method(X)
+
         if prediction_method.__name__ == "predict_proba":
-            if target_type == "binary" and y_pred.shape[1] <= 2:
-                if y_pred.shape[1] == 2:
-                    col_idx = np.flatnonzero(classes == pos_label)[0]
-                    y_pred = y_pred[:, col_idx]
-                else:
-                    err_msg = (
-                        f"Got predict_proba of shape {y_pred.shape}, but need "
-                        "classifier with two classes."
-                    )
-                    raise ValueError(err_msg)
+            y_pred = _process_predict_proba(
+                y_pred=y_pred,
+                target_type=target_type,
+                classes=classes,
+                pos_label=pos_label,
+            )
         elif prediction_method.__name__ == "decision_function":
-            if target_type == "binary":
-                if pos_label == classes[0]:
-                    y_pred *= -1
+            y_pred = _process_decision_function(
+                y_pred=y_pred,
+                target_type=target_type,
+                classes=classes,
+                pos_label=pos_label,
+            )
     else:  # estimator is a regressor
         if response_method != "predict":
             raise ValueError(
diff --git a/sklearn/utils/tests/test_response.py b/sklearn/utils/tests/test_response.py
@@ -1,11 +1,17 @@
 import numpy as np
 import pytest
 
-from sklearn.datasets import load_iris, make_classification, make_regression
+from sklearn.datasets import (
+    load_iris,
+    make_classification,
+    make_multilabel_classification,
+    make_regression,
+)
 from sklearn.linear_model import (
     LinearRegression,
     LogisticRegression,
 )
+from sklearn.multioutput import ClassifierChain
 from sklearn.preprocessing import scale
 from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
 from sklearn.utils._mocking import _MockEstimatorOnOffPrediction
@@ -230,3 +236,26 @@ def test_get_response_values_multiclass(estimator, response_method):
     assert predictions.shape == (X.shape[0], len(estimator.classes_))
     if response_method == "predict_proba":
         assert np.logical_and(predictions >= 0, predictions <= 1).all()
+
+
+@pytest.mark.parametrize(
+    "response_method", ["predict_proba", "decision_function", "predict"]
+)
+def test_get_response_values_multilabel_indicator(response_method):
+    X, Y = make_multilabel_classification(random_state=0)
+    estimator = ClassifierChain(LogisticRegression()).fit(X, Y)
+
+    y_pred, pos_label = _get_response_values(
+        estimator, X, response_method=response_method
+    )
+    assert pos_label is None
+    assert y_pred.shape == Y.shape
+
+    if response_method == "predict_proba":
+        assert np.logical_and(y_pred >= 0, y_pred <= 1).all()
+    elif response_method == "decision_function":
+        # values returned by `decision_function` are not bounded in [0, 1]
+        assert (y_pred < 0).sum() > 0
+        assert (y_pred > 1).sum() > 0
+    else:  # response_method == "predict"
+        assert np.logical_or(y_pred == 0, y_pred == 1).all()