FIX Improve error message when RepeatedStratifiedKFold.split is called without a y argument (scikit-learn#29402)

Anurag-Varma · lesteve · lucyleeow · web-flow · commit 20c7bd0248a0 · 2024-07-11T07:55:15.000Z
Co-authored-by: Loïc Estève <loic.esteve@ymail.com>
Co-authored-by: Lucy Liu <jliu176@gmail.com>
diff --git a/doc/whats_new/v1.6.rst b/doc/whats_new/v1.6.rst
@@ -210,6 +210,9 @@ Changelog
   estimator without re-fitting it.
   :pr:`29067` by :user:`Guillaume Lemaitre <glemaitre>`.
 
+- |Fix| Improve error message when :meth:`model_selection.RepeatedStratifiedKFold.split` is called without a `y` argument.
+  :pr:`29402` by :user:`Anurag Varma <Anurag-Varma>`.
+
 :mod:`sklearn.neighbors`
 ........................
 
diff --git a/sklearn/model_selection/_split.py b/sklearn/model_selection/_split.py
@@ -1769,6 +1769,43 @@ def __init__(self, *, n_splits=5, n_repeats=10, random_state=None):
             n_splits=n_splits,
         )
 
+    def split(self, X, y, groups=None):
+        """Generate indices to split data into training and test set.
+
+        Parameters
+        ----------
+        X : array-like of shape (n_samples, n_features)
+            Training data, where `n_samples` is the number of samples
+            and `n_features` is the number of features.
+
+            Note that providing ``y`` is sufficient to generate the splits and
+            hence ``np.zeros(n_samples)`` may be used as a placeholder for
+            ``X`` instead of actual training data.
+
+        y : array-like of shape (n_samples,)
+            The target variable for supervised learning problems.
+            Stratification is done based on the y labels.
+
+        groups : object
+            Always ignored, exists for compatibility.
+
+        Yields
+        ------
+        train : ndarray
+            The training set indices for that split.
+
+        test : ndarray
+            The testing set indices for that split.
+
+        Notes
+        -----
+        Randomized CV splitters may return different results for each call of
+        split. You can make the results identical by setting `random_state`
+        to an integer.
+        """
+        y = check_array(y, input_name="y", ensure_2d=False, dtype=None)
+        return super().split(X, y, groups=groups)
+
 
 class BaseShuffleSplit(_MetadataRequester, metaclass=ABCMeta):
     """Base class for *ShuffleSplit.
diff --git a/sklearn/model_selection/tests/test_split.py b/sklearn/model_selection/tests/test_split.py
@@ -86,6 +86,12 @@
 
 ALL_SPLITTERS = NO_GROUP_SPLITTERS + GROUP_SPLITTERS  # type: ignore
 
+SPLITTERS_REQUIRING_TARGET = [
+    StratifiedKFold(),
+    StratifiedShuffleSplit(),
+    RepeatedStratifiedKFold(),
+]
+
 X = np.ones(10)
 y = np.arange(10) // 2
 test_groups = (
@@ -2054,3 +2060,12 @@ def test_no_group_splitters_warns_with_groups(cv):
 
     with pytest.warns(UserWarning, match=msg):
         cv.split(X, y, groups=groups)
+
+
+@pytest.mark.parametrize(
+    "cv", SPLITTERS_REQUIRING_TARGET, ids=[str(cv) for cv in SPLITTERS_REQUIRING_TARGET]
+)
+def test_stratified_splitter_without_y(cv):
+    msg = "missing 1 required positional argument: 'y'"
+    with pytest.raises(TypeError, match=msg):
+        cv.split(X)