
Commit 329a1cf

StefanieSenger, glemaitre, and adrinjalali authored
ENH Add zero_division param to cohen_kappa_score (scikit-learn#29210)
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
Co-authored-by: Adrin Jalali <adrin.jalali@gmail.com>
1 parent 51c8e0e commit 329a1cf

File tree

4 files changed: +83 -10 lines changed


doc/modules/model_evaluation.rst

Lines changed: 4 additions & 4 deletions
@@ -605,7 +605,7 @@ The function :func:`cohen_kappa_score` computes `Cohen's kappa
 This measure is intended to compare labelings by different human annotators,
 not a classifier versus a ground truth.
 
-The kappa score (see docstring) is a number between -1 and 1.
+The kappa score is a number between -1 and 1.
 Scores above .8 are generally considered good agreement;
 zero or lower means no agreement (practically random labels).

@@ -614,9 +614,9 @@ but not for multilabel problems (except by manually computing a per-label score)
 and not for more than two annotators.
 
   >>> from sklearn.metrics import cohen_kappa_score
-  >>> y_true = [2, 0, 2, 2, 0, 1]
-  >>> y_pred = [0, 0, 2, 2, 0, 2]
-  >>> cohen_kappa_score(y_true, y_pred)
+  >>> labeling1 = [2, 0, 2, 2, 0, 1]
+  >>> labeling2 = [0, 0, 2, 2, 0, 2]
+  >>> cohen_kappa_score(labeling1, labeling2)
   0.4285714285714286
 
 .. _confusion_matrix:
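
As a cross-check on the doctest above, the 0.4285714285714286 value can be reproduced by hand from the contingency table of the two labelings via the standard definition kappa = (p_o - p_e) / (1 - p_e). A minimal sketch using only NumPy; the `p_o`/`p_e` names are illustrative, not from the commit:

    import numpy as np

    labeling1 = [2, 0, 2, 2, 0, 1]
    labeling2 = [0, 0, 2, 2, 0, 2]

    # 3x3 contingency table: rows index labeling1, columns index labeling2.
    confusion = np.zeros((3, 3))
    for a, b in zip(labeling1, labeling2):
        confusion[a, b] += 1

    n = confusion.sum()
    p_o = np.trace(confusion) / n                               # observed agreement: 4/6
    p_e = confusion.sum(axis=1) @ confusion.sum(axis=0) / n**2  # chance agreement: 15/36
    print((p_o - p_e) / (1 - p_e))  # ~0.4286, matching the doctest up to float rounding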

doc/whats_new/v1.6.rst

Lines changed: 5 additions & 0 deletions
@@ -133,6 +133,11 @@ Changelog
   whether to raise an exception if a subset of the scorers in multimetric scoring fails
   or to return an error code. :pr:`28992` by :user:`Stefanie Senger <StefanieSenger>`.
 
+- |Enhancement| Adds `zero_division` to :func:`cohen_kappa_score`. When there is a
+  division by zero, the metric is undefined and this value is returned.
+  :pr:`29210` by :user:`Marc Torrellas Socastro <marctorsoc>` and
+  :user:`Stefanie Senger <StefanieSenger>`.
+
 :mod:`sklearn.model_selection`
 ..............................
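
The behavior this entry describes can be seen directly on a degenerate input. A short sketch, assuming scikit-learn >= 1.6; the all-zero labelings are the illustrative case named in the new docstring:

    import numpy as np
    from sklearn.metrics import cohen_kappa_score

    y1 = [0, 0, 0, 0]  # both annotators use only class 0, so the
    y2 = [0, 0, 0, 0]  # chance-agreement denominator collapses to 0

    cohen_kappa_score(y1, y2)                        # 0.0, plus an UndefinedMetricWarning
    cohen_kappa_score(y1, y2, zero_division=np.nan)  # nan, no warning
    cohen_kappa_score(y1, y2, zero_division=1.0)     # 1.0, no warning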

sklearn/metrics/_classification.py

Lines changed: 72 additions & 6 deletions
@@ -610,17 +610,54 @@ def multilabel_confusion_matrix(
     return np.array([tn, fp, fn, tp]).T.reshape(-1, 2, 2)
 
 
+def _metric_handle_division(*, numerator, denominator, metric, zero_division):
+    """Helper to handle zero-division.
+
+    Parameters
+    ----------
+    numerator : numbers.Real
+        The numerator of the division.
+    denominator : numbers.Real
+        The denominator of the division.
+    metric : str
+        Name of the caller metric function.
+    zero_division : {0.0, 1.0, np.nan, "warn"}
+        The strategy to use when encountering a 0-denominator.
+
+    Returns
+    -------
+    result : numbers.Real
+        The result of the division.
+    is_zero_division : bool
+        Whether or not we encountered a zero division. This value could be
+        required to early return `result` in the "caller" function.
+    """
+    if np.isclose(denominator, 0):
+        if zero_division == "warn":
+            msg = (f"{metric} is ill-defined and set to 0.0. Use the "
+                   "`zero_division` param to control this behavior.")
+            warnings.warn(msg, UndefinedMetricWarning, stacklevel=2)
+        return _check_zero_division(zero_division), True
+    return numerator / denominator, False
+
+
 @validate_params(
     {
         "y1": ["array-like"],
         "y2": ["array-like"],
         "labels": ["array-like", None],
         "weights": [StrOptions({"linear", "quadratic"}), None],
         "sample_weight": ["array-like", None],
+        "zero_division": [
+            StrOptions({"warn"}),
+            Options(Real, {0.0, 1.0, np.nan}),
+        ],
     },
     prefer_skip_nested_validation=True,
 )
-def cohen_kappa_score(y1, y2, *, labels=None, weights=None, sample_weight=None):
+def cohen_kappa_score(
+    y1, y2, *, labels=None, weights=None, sample_weight=None, zero_division="warn"
+):
     r"""Compute Cohen's kappa: a statistic that measures inter-annotator agreement.
 
     This function computes Cohen's kappa [1]_, a score that expresses the level

@@ -653,12 +690,20 @@ class labels [2]_.
         ``y1`` or ``y2`` are used.
 
     weights : {'linear', 'quadratic'}, default=None
-        Weighting type to calculate the score. `None` means no weighted;
-        "linear" means linear weighted; "quadratic" means quadratic weighted.
+        Weighting type to calculate the score. `None` means not weighted;
+        "linear" means linear weighting; "quadratic" means quadratic weighting.
 
     sample_weight : array-like of shape (n_samples,), default=None
         Sample weights.
 
+    zero_division : {"warn", 0.0, 1.0, np.nan}, default="warn"
+        Sets the return value when there is a zero division. This is the case
+        when both labelings `y1` and `y2` exclusively contain the 0 class
+        (e.g. `[0, 0, 0, 0]`) or when both are empty. If set to "warn",
+        returns `0.0`, but a warning is also raised.
+
+        .. versionadded:: 1.6
+
     Returns
     -------
     kappa : float

@@ -688,7 +733,18 @@ class labels [2]_.
     n_classes = confusion.shape[0]
     sum0 = np.sum(confusion, axis=0)
     sum1 = np.sum(confusion, axis=1)
-    expected = np.outer(sum0, sum1) / np.sum(sum0)
+
+    numerator = np.outer(sum0, sum1)
+    denominator = np.sum(sum0)
+    expected, is_zero_division = _metric_handle_division(
+        numerator=numerator,
+        denominator=denominator,
+        metric="cohen_kappa_score()",
+        zero_division=zero_division,
+    )
+
+    if is_zero_division:
+        return expected
 
     if weights is None:
         w_mat = np.ones([n_classes, n_classes], dtype=int)

@@ -701,8 +757,18 @@ class labels [2]_.
     else:
         w_mat = (w_mat - w_mat.T) ** 2
 
-    k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)
-    return 1 - k
+    numerator = np.sum(w_mat * confusion)
+    denominator = np.sum(w_mat * expected)
+    score, is_zero_division = _metric_handle_division(
+        numerator=numerator,
+        denominator=denominator,
+        metric="cohen_kappa_score()",
+        zero_division=zero_division,
+    )
+
+    if is_zero_division:
+        return score
+    return 1 - score
 
 
 @validate_params(
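
The helper's `(result, is_zero_division)` return contract is what lets `cohen_kappa_score` short-circuit at either of its two divisions. A standalone sketch of the same pattern; `safe_divide` is a hypothetical stand-in for the private helper, not sklearn API:

    import warnings
    import numpy as np

    def safe_divide(numerator, denominator, zero_division="warn"):
        # Return (result, is_zero_division), mirroring the helper's contract.
        if np.isclose(denominator, 0):
            if zero_division == "warn":
                warnings.warn("division is ill-defined; returning 0.0")
                return 0.0, True
            return zero_division, True  # caller should early-return this value
        return numerator / denominator, False

    ratio, undefined = safe_divide(3.0, 0.0, zero_division=np.nan)
    if undefined:
        print("metric undefined:", ratio)  # metric undefined: nan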

sklearn/metrics/tests/test_classification.py

Lines changed: 2 additions & 0 deletions
@@ -810,6 +810,7 @@ def test_matthews_corrcoef_nan():
         partial(fbeta_score, beta=1),
         precision_score,
         recall_score,
+        partial(cohen_kappa_score, labels=[0, 1]),
     ],
 )
 def test_zero_division_nan_no_warning(metric, y_true, y_pred, zero_division):

@@ -834,6 +835,7 @@ def test_zero_division_nan_warning(metric, y_true, y_pred):
         partial(fbeta_score, beta=1),
         precision_score,
         recall_score,
+        cohen_kappa_score,
     ],
 )
 def test_zero_division_nan_warning(metric, y_true, y_pred):
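
A condensed sketch of what the first of these parametrizations exercises, simplified relative to the shared test bodies: with a non-"warn" fill value, the metric must return that value silently when the division is undefined.

    import warnings
    import numpy as np
    from sklearn.metrics import cohen_kappa_score

    with warnings.catch_warnings():
        warnings.simplefilter("error")  # any warning would raise here
        result = cohen_kappa_score([0, 0], [0, 0], labels=[0, 1], zero_division=np.nan)
    assert np.isnan(result)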
