PERF speedup classification_report by attaching unique values to dtype.metadata (scikit-learn#29738)

adrinjalali · glemaitre · web-flow · commit eb2920766d7b · 2024-09-05T20:35:07.000Z
Co-authored-by: Guillaume Lemaitre <guillaume@probabl.ai>
diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst
@@ -277,6 +277,10 @@ Changelog
   :pr:`29210` by :user:`Marc Torrellas Socastro <marctorsoc>` and
   :user:`Stefanie Senger <StefanieSenger>`.
 
+- |Efficiency| :func:`sklearn.metrics.classification_report` is now faster by caching
+  classification labels.
+  :pr:`29738` by `Adrin Jalali`_.
+
 - |API| scoring="neg_max_error" should be used instead of
   scoring="max_error" which is now deprecated.
   :pr:`29462` by :user:`Farid "Freddie" Taba <artificialfintelligence>`.
diff --git a/sklearn/metrics/_classification.py b/sklearn/metrics/_classification.py
@@ -41,6 +41,7 @@
     StrOptions,
     validate_params,
 )
+from ..utils._unique import attach_unique
 from ..utils.extmath import _nanaverage
 from ..utils.multiclass import type_of_target, unique_labels
 from ..utils.sparsefuncs import count_nonzero
@@ -216,6 +217,7 @@ def accuracy_score(y_true, y_pred, *, normalize=True, sample_weight=None):
     """
     xp, _, device = get_namespace_and_device(y_true, y_pred, sample_weight)
     # Compute accuracy for each possible representation
+    y_true, y_pred = attach_unique(y_true, y_pred)
     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
     check_consistent_length(y_true, y_pred, sample_weight)
     if y_type.startswith("multilabel"):
@@ -327,6 +329,7 @@ def confusion_matrix(
     >>> (tn, fp, fn, tp)
     (np.int64(0), np.int64(2), np.int64(1), np.int64(1))
     """
+    y_true, y_pred = attach_unique(y_true, y_pred)
     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
     if y_type not in ("binary", "multiclass"):
         raise ValueError("%s is not supported" % y_type)
@@ -516,6 +519,7 @@ def multilabel_confusion_matrix(
            [[2, 1],
             [1, 2]]])
     """
+    y_true, y_pred = attach_unique(y_true, y_pred)
     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
     if sample_weight is not None:
         sample_weight = column_or_1d(sample_weight)
@@ -1054,6 +1058,7 @@ def matthews_corrcoef(y_true, y_pred, *, sample_weight=None):
     >>> matthews_corrcoef(y_true, y_pred)
     np.float64(-0.33...)
     """
+    y_true, y_pred = attach_unique(y_true, y_pred)
     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
     check_consistent_length(y_true, y_pred, sample_weight)
     if y_type not in {"binary", "multiclass"}:
@@ -1612,6 +1617,7 @@ def _check_set_wise_labels(y_true, y_pred, average, labels, pos_label):
     if average not in average_options and average != "binary":
         raise ValueError("average has to be one of " + str(average_options))
 
+    y_true, y_pred = attach_unique(y_true, y_pred)
     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
     # Convert to Python primitive type to avoid NumPy type / Python str
     # comparison. See https://github.com/numpy/numpy/issues/6784
@@ -2031,7 +2037,7 @@ class after being classified as negative. This is the case when the
     >>> class_likelihood_ratios(y_true, y_pred, labels=["non-cat", "cat"])
     (np.float64(1.5), np.float64(0.75))
     """
-
+    y_true, y_pred = attach_unique(y_true, y_pred)
     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
     if y_type != "binary":
         raise ValueError(
@@ -2681,6 +2687,7 @@ class 2       1.00      0.67      0.80         3
     <BLANKLINE>
     """
 
+    y_true, y_pred = attach_unique(y_true, y_pred)
     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
 
     if labels is None:
@@ -2869,7 +2876,7 @@ def hamming_loss(y_true, y_pred, *, sample_weight=None):
     >>> hamming_loss(np.array([[0, 1], [1, 1]]), np.zeros((2, 2)))
     0.75
     """
-
+    y_true, y_pred = attach_unique(y_true, y_pred)
     y_type, y_true, y_pred = _check_targets(y_true, y_pred)
     check_consistent_length(y_true, y_pred, sample_weight)
 
diff --git a/sklearn/utils/_array_api.py b/sklearn/utils/_array_api.py
@@ -208,7 +208,11 @@ def _is_numpy_namespace(xp):
 
 def _union1d(a, b, xp):
     if _is_numpy_namespace(xp):
-        return xp.asarray(numpy.union1d(a, b))
+        # avoid circular import
+        from ._unique import cached_unique
+
+        a_unique, b_unique = cached_unique(a, b, xp=xp)
+        return xp.asarray(numpy.union1d(a_unique, b_unique))
     assert a.ndim == b.ndim == 1
     return xp.unique_values(xp.concat([xp.unique_values(a), xp.unique_values(b)]))
 
diff --git a/sklearn/utils/_unique.py b/sklearn/utils/_unique.py
@@ -0,0 +1,108 @@
+# Authors: The scikit-learn developers
+# SPDX-License-Identifier: BSD-3-Clause
+
+import numpy as np
+
+from sklearn.utils._array_api import get_namespace
+
+
+def _attach_unique(y):
+    """Return a view of y with its unique values stored in the dtype metadata.
+
+    The input y itself is not modified; non-ndarray inputs are returned as is.
+    """
+    if not isinstance(y, np.ndarray):
+        return y
+
+    # Avoid recalculating unique in nested calls. `dtype.metadata` is None when
+    # nothing is attached, so check explicitly instead of catching a TypeError.
+    metadata = y.dtype.metadata
+    if metadata is not None and "unique" in metadata:
+        return y
+
+    unique = np.unique(y)
+    unique_dtype = np.dtype(y.dtype, metadata={"unique": unique})
+    return y.view(dtype=unique_dtype)
+
+
+def attach_unique(*ys, return_tuple=False):
+    """Return views of ys carrying their unique values in the dtype metadata.
+
+    The input arrays themselves are left untouched: no metadata is added to them.
+
+    IMPORTANT: NEVER return the output of this function from a function.
+    Otherwise the cached values can silently go stale, as in this pattern:
+
+    .. code:: python
+
+        y = np.array([1, 2, 3])
+        y = attach_unique(y)
+        y[1] = -1
+        # now np.unique(y) will be different from cached_unique(y)
+
+    Parameters
+    ----------
+    *ys : sequence of array-like
+        Input data arrays. Inputs that are not numpy arrays are passed through.
+
+    return_tuple : bool, default=False
+        If True, always return a tuple even if there is only one array.
+
+    Returns
+    -------
+    ys : tuple of array-like or array-like
+        Input data with unique values attached.
+    """
+    attached = tuple(map(_attach_unique, ys))
+    if return_tuple or len(attached) != 1:
+        return attached
+    return attached[0]
+
+
+def _cached_unique(y, xp=None):
+    """Compute the unique values of y, reusing cached ones when available.
+
+    If "unique" is present in `y.dtype.metadata`, that cached value is returned
+    as is. Otherwise the unique values are computed with `xp.unique_values`.
+
+    y is never modified: nothing is cached here. Use `attach_unique` to obtain
+    an array that carries its unique values.
+    """
+    # Inputs without a `dtype` or `dtype.metadata` (i.e. not numpy arrays)
+    # simply have nothing cached.
+    metadata = getattr(getattr(y, "dtype", None), "metadata", None)
+    if metadata is not None and "unique" in metadata:
+        return metadata["unique"]
+
+    namespace, _ = get_namespace(y, xp=xp)
+    return namespace.unique_values(y)
+
+
+def cached_unique(*ys, xp=None):
+    """Compute the unique values of each of ys, reusing cached ones if present.
+
+    Values found under "unique" in `dtype.metadata` are used when present.
+
+    Nothing is cached by this function and ys are left unchanged. Call
+    `attach_unique` beforehand to attach the unique values to the arrays.
+
+    Parameters
+    ----------
+    *ys : sequence of array-like
+        Input data arrays.
+
+    xp : module, default=None
+        Precomputed array namespace module. When passed, typically from a caller
+        that has already performed inspection of its own inputs, skips array
+        namespace inspection.
+
+    Returns
+    -------
+    res : tuple of array-like or array-like
+        Unique values of ys: a single array when exactly one input is given, a
+        tuple with one entry per input otherwise.
+    """
+    uniques = tuple(_cached_unique(y, xp=xp) for y in ys)
+    if len(uniques) != 1:
+        return uniques
+    return uniques[0]
diff --git a/sklearn/utils/multiclass.py b/sklearn/utils/multiclass.py
@@ -12,19 +12,20 @@
 
 from ..utils._array_api import get_namespace
 from ..utils.fixes import VisibleDeprecationWarning
+from ._unique import attach_unique, cached_unique
 from .validation import _assert_all_finite, check_array
 
 
-def _unique_multiclass(y):
-    xp, is_array_api_compliant = get_namespace(y)
+def _unique_multiclass(y, xp=None):
+    xp, is_array_api_compliant = get_namespace(y, xp=xp)
     if hasattr(y, "__array__") or is_array_api_compliant:
-        return xp.unique_values(xp.asarray(y))
+        return cached_unique(xp.asarray(y), xp=xp)
     else:
         return set(y)
 
 
-def _unique_indicator(y):
-    xp, _ = get_namespace(y)
+def _unique_indicator(y, xp=None):
+    xp, _ = get_namespace(y, xp=xp)
     return xp.arange(
         check_array(y, input_name="y", accept_sparse=["csr", "csc", "coo"]).shape[1]
     )
@@ -69,8 +70,9 @@ def unique_labels(*ys):
     >>> unique_labels([1, 2, 10], [5, 11])
     array([ 1,  2,  5, 10, 11])
     """
+    ys = attach_unique(*ys, return_tuple=True)
     xp, is_array_api_compliant = get_namespace(*ys)
-    if not ys:
+    if len(ys) == 0:
         raise ValueError("No argument has been passed.")
     # Check that we don't mix label format
 
@@ -104,10 +106,12 @@ def unique_labels(*ys):
 
     if is_array_api_compliant:
         # array_api does not allow for mixed dtypes
-        unique_ys = xp.concat([_unique_labels(y) for y in ys])
+        unique_ys = xp.concat([_unique_labels(y, xp=xp) for y in ys])
         return xp.unique_values(unique_ys)
 
-    ys_labels = set(chain.from_iterable((i for i in _unique_labels(y)) for y in ys))
+    ys_labels = set(
+        chain.from_iterable((i for i in _unique_labels(y, xp=xp)) for y in ys)
+    )
     # Check that we don't mix string type with number type
     if len(set(isinstance(label, str) for label in ys_labels)) > 1:
         raise ValueError("Mix of label input types (string and number)")
@@ -187,7 +191,7 @@ def is_multilabel(y):
             and (y.dtype.kind in "biu" or _is_integral_float(labels))  # bool, int, uint
         )
     else:
-        labels = xp.unique_values(y)
+        labels = cached_unique(y, xp=xp)
 
         return labels.shape[0] < 3 and (
             xp.isdtype(y.dtype, ("bool", "signed integer", "unsigned integer"))
@@ -400,7 +404,7 @@ def type_of_target(y, input_name=""):
     # Check multiclass
     if issparse(first_row_or_val):
         first_row_or_val = first_row_or_val.data
-    if xp.unique_values(y).shape[0] > 2 or (y.ndim == 2 and len(first_row_or_val) > 1):
+    if cached_unique(y).shape[0] > 2 or (y.ndim == 2 and len(first_row_or_val) > 1):
         # [1, 2, 3] or [[1., 2., 3]] or [[1, 2]]
         return "multiclass" + suffix
     else:
diff --git a/sklearn/utils/tests/test_unique.py b/sklearn/utils/tests/test_unique.py
@@ -0,0 +1,54 @@
+import numpy as np
+from numpy.testing import assert_array_equal
+
+from sklearn.utils._unique import attach_unique, cached_unique
+from sklearn.utils.validation import check_array
+
+
+def test_attach_unique_attaches_unique_to_array():
+    """Test that the unique values are stored in the dtype metadata."""
+    arr = np.array([1, 2, 2, 3, 4, 4, 5])
+    assert_array_equal(attach_unique(arr), arr)
+    assert_array_equal(attach_unique(arr).dtype.metadata["unique"], [1, 2, 3, 4, 5])
+
+
+def test_cached_unique_returns_cached_unique():
+    stale_dtype = np.dtype(np.float64, metadata={"unique": np.array([1, 2])})
+    data = np.array([1, 2, 2, 3, 4, 4, 5], dtype=stale_dtype)  # true unique: 1..5
+    assert_array_equal(cached_unique(data), np.array([1, 2]))  # cache wins
+
+
+def test_attach_unique_not_ndarray():
+    """Test that a non-ndarray input is returned as the very same object."""
+    plain_list = [1, 2, 2, 3, 4, 4, 5]
+    returned = attach_unique(plain_list)
+    assert returned is plain_list
+
+
+def test_attach_unique_returns_view():
+    """Test that the output of attach_unique is a view on its input."""
+    source = np.array([1, 2, 2, 3, 4, 4, 5])
+    viewed = attach_unique(source)
+    assert viewed.base is source
+
+
+def test_attach_unique_return_tuple():
+    """Test that return_tuple controls whether a single array gets wrapped."""
+    arr = np.array([1, 2, 2, 3, 4, 4, 5])
+    wrapped = attach_unique(arr, return_tuple=True)
+    assert isinstance(wrapped, tuple)
+    assert len(wrapped) == 1
+    assert_array_equal(wrapped[0], arr)
+
+    bare = attach_unique(arr, return_tuple=False)
+    assert isinstance(bare, np.ndarray)
+    assert_array_equal(bare, arr)
+
+
+def test_check_array_keeps_unique():
+    """Test that the unique metadata survives a call to check_array."""
+    raw = np.array([[1, 2, 2, 3, 4, 4, 5]])
+    attached = attach_unique(raw)
+    validated = check_array(attached)
+    assert_array_equal(validated, raw)
+    assert_array_equal(validated.dtype.metadata["unique"], np.array([1, 2, 3, 4, 5]))