MAINT Added parameter validation for sklearn.utils.class_weight.compute_sample_weight (scikit-learn#26564)

rand0wn · glemaitre · web-flow · commit a5cc3ab55f3f · 2023-06-29T12:05:57.000Z
Co-authored-by: Guillaume Lemaitre <g.lemaitre58@gmail.com>
diff --git a/sklearn/tests/test_public_functions.py b/sklearn/tests/test_public_functions.py
@@ -306,6 +306,7 @@ def _check_function_param_validation(
     "sklearn.tree.export_graphviz",
     "sklearn.tree.export_text",
     "sklearn.tree.plot_tree",
+    "sklearn.utils.class_weight.compute_sample_weight",
     "sklearn.utils.gen_batches",
     "sklearn.utils.gen_even_slices",
     "sklearn.utils.graph.single_source_shortest_path_length",
diff --git a/sklearn/utils/class_weight.py b/sklearn/utils/class_weight.py
@@ -5,6 +5,8 @@
 import numpy as np
 from scipy import sparse
 
+from ._param_validation import StrOptions, validate_params
+
 
 def compute_class_weight(class_weight, *, classes, y):
     """Estimate class weights for unbalanced datasets.
@@ -75,26 +77,34 @@ def compute_class_weight(class_weight, *, classes, y):
     return weight
 
 
+@validate_params(
+    {
+        "class_weight": [dict, list, StrOptions({"balanced"}), None],
+        "y": ["array-like", "sparse matrix"],
+        "indices": ["array-like", None],
+    },
+    prefer_skip_nested_validation=True,
+)
 def compute_sample_weight(class_weight, y, *, indices=None):
     """Estimate sample weights by class for unbalanced datasets.
 
     Parameters
     ----------
     class_weight : dict, list of dicts, "balanced", or None
-        Weights associated with classes in the form ``{class_label: weight}``.
+        Weights associated with classes in the form `{class_label: weight}`.
         If not given, all classes are supposed to have weight one. For
         multi-output problems, a list of dicts can be provided in the same
         order as the columns of y.
 
         Note that for multioutput (including multilabel) weights should be
         defined for each class of every column in its own dict. For example,
         for four-class multilabel classification weights should be
-        [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
-        [{1:1}, {2:5}, {3:1}, {4:1}].
+        `[{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}]` instead of
+        `[{1:1}, {2:5}, {3:1}, {4:1}]`.
 
-        The "balanced" mode uses the values of y to automatically adjust
+        The `"balanced"` mode uses the values of y to automatically adjust
         weights inversely proportional to class frequencies in the input data:
-        ``n_samples / (n_classes * np.bincount(y))``.
+        `n_samples / (n_classes * np.bincount(y))`.
 
         For multi-output, the weights of each column of y will be multiplied.
 
@@ -103,15 +113,15 @@ def compute_sample_weight(class_weight, y, *, indices=None):
 
     indices : array-like of shape (n_subsample,), default=None
         Array of indices to be used in a subsample. Can be of length less than
-        n_samples in the case of a subsample, or equal to n_samples in the
-        case of a bootstrap subsample with repeated indices. If None, the
-        sample weight will be calculated over the full sample. Only "balanced"
-        is supported for class_weight if this is provided.
+        `n_samples` in the case of a subsample, or equal to `n_samples` in the
+        case of a bootstrap subsample with repeated indices. If `None`, the
+        sample weight will be calculated over the full sample. Only `"balanced"`
+        is supported for `class_weight` if this is provided.
 
     Returns
     -------
     sample_weight_vect : ndarray of shape (n_samples,)
-        Array with sample weights as applied to the original y.
+        Array with sample weights as applied to the original `y`.
     """
 
     # Ensure y is 2D. Sparse matrices are already 2D.
@@ -121,27 +131,22 @@ def compute_sample_weight(class_weight, y, *, indices=None):
             y = np.reshape(y, (-1, 1))
     n_outputs = y.shape[1]
 
-    if isinstance(class_weight, str):
-        if class_weight not in ["balanced"]:
-            raise ValueError(
-                'The only valid preset for class_weight is "balanced". Given "%s".'
-                % class_weight
-            )
-    elif indices is not None and not isinstance(class_weight, str):
+    if indices is not None and class_weight != "balanced":
         raise ValueError(
-            'The only valid class_weight for subsampling is "balanced". Given "%s".'
-            % class_weight
+            "The only valid class_weight for subsampling is 'balanced'. "
+            f"Given {class_weight}."
         )
     elif n_outputs > 1:
-        if not hasattr(class_weight, "__iter__") or isinstance(class_weight, dict):
+        if class_weight is None or isinstance(class_weight, dict):
             raise ValueError(
-                "For multi-output, class_weight should be a "
-                "list of dicts, or a valid string."
+                "For multi-output, class_weight should be a list of dicts, or the "
+                "string 'balanced'."
             )
-        if len(class_weight) != n_outputs:
+        elif isinstance(class_weight, list) and len(class_weight) != n_outputs:
             raise ValueError(
-                "For multi-output, number of elements in "
-                "class_weight should match number of outputs."
+                "For multi-output, number of elements in class_weight should match "
+                f"number of outputs. Got {len(class_weight)} element(s) while having "
+                f"{n_outputs} outputs."
             )
 
     expanded_class_weight = []
diff --git a/sklearn/utils/tests/test_class_weight.py b/sklearn/utils/tests/test_class_weight.py
@@ -235,32 +235,38 @@ def test_compute_sample_weight_with_subsample():
     assert_array_almost_equal(sample_weight, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0])
 
 
-def test_compute_sample_weight_errors():
+@pytest.mark.parametrize(
+    "y_type, class_weight, indices, err_msg",
+    [
+        (
+            "single-output",
+            {1: 2, 2: 1},
+            range(4),
+            "The only valid class_weight for subsampling is 'balanced'.",
+        ),
+        (
+            "multi-output",
+            {1: 2, 2: 1},
+            None,
+            "For multi-output, class_weight should be a list of dicts, or the string",
+        ),
+        (
+            "multi-output",
+            [{1: 2, 2: 1}],
+            None,
+            r"Got 1 element\(s\) while having 2 outputs",
+        ),
+    ],
+)
+def test_compute_sample_weight_errors(y_type, class_weight, indices, err_msg):
     # Test compute_sample_weight raises errors expected.
-    # Invalid preset string
+    # Invalid preset strings are now rejected by @validate_params instead.
-    y = np.asarray([1, 1, 1, 2, 2, 2])
-    y_ = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
-
-    with pytest.raises(ValueError):
-        compute_sample_weight("ni", y)
-    with pytest.raises(ValueError):
-        compute_sample_weight("ni", y, indices=range(4))
-    with pytest.raises(ValueError):
-        compute_sample_weight("ni", y_)
-    with pytest.raises(ValueError):
-        compute_sample_weight("ni", y_, indices=range(4))
+    y_single_output = np.asarray([1, 1, 1, 2, 2, 2])
+    y_multi_output = np.asarray([[1, 0], [1, 0], [1, 0], [2, 1], [2, 1], [2, 1]])
 
-    # Not "balanced" for subsample
-    with pytest.raises(ValueError):
-        compute_sample_weight({1: 2, 2: 1}, y, indices=range(4))
-
-    # Not a list or preset for multi-output
-    with pytest.raises(ValueError):
-        compute_sample_weight({1: 2, 2: 1}, y_)
-
-    # Incorrect length list for multi-output
-    with pytest.raises(ValueError):
-        compute_sample_weight([{1: 2, 2: 1}], y_)
+    y = y_single_output if y_type == "single-output" else y_multi_output
+    with pytest.raises(ValueError, match=err_msg):
+        compute_sample_weight(class_weight, y, indices=indices)
 
 
 def test_compute_sample_weight_more_than_32():