Commit 46727ef

ENH add X_val and y_val to HGBT.fit (scikit-learn#27124)
1 parent 1eff92b commit 46727ef

File tree

3 files changed: +193, -18 lines changed
Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+- :class:`ensemble.HistGradientBoostingClassifier` and
+  :class:`ensemble.HistGradientBoostingRegressor` allow for more control over the
+  validation set used for early stopping. You can now pass data to be used for
+  validation directly to `fit` via the arguments `X_val`, `y_val` and
+  `sample_weight_val`.
+  By :user:`Christian Lorentzen <lorentzenchr>`.
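
For orientation, here is a minimal usage sketch of the new `fit` arguments described in this changelog entry (not part of the commit; the dataset and parameter values are illustrative). The validation set is passed explicitly instead of being carved out of the training data via `validation_fraction`, and `early_stopping` must be left at `'auto'` or set to `True`:

    from sklearn.datasets import make_regression
    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.model_selection import train_test_split

    X, y = make_regression(n_samples=1_000, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

    # Early stopping is scored on the user-provided validation set; the internal
    # train/validation split is skipped, so validation_fraction is ignored.
    model = HistGradientBoostingRegressor(early_stopping=True)
    model.fit(X_train, y_train, X_val=X_val, y_val=y_val)
    print(model.n_iter_)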

sklearn/ensemble/_hist_gradient_boosting/gradient_boosting.py

Lines changed: 92 additions & 17 deletions
@@ -421,8 +421,8 @@ def _check_categorical_features(self, X):
             )

         n_features = X.shape[1]
-        # At this point `_validate_data` was not called yet because we want to use the
-        # dtypes are used to discover the categorical features. Thus `feature_names_in_`
+        # At this point `validate_data` was not called yet because we use the original
+        # dtypes to discover the categorical features. Thus `feature_names_in_`
         # is not defined yet.
         feature_names_in_ = getattr(X, "columns", None)

@@ -508,7 +508,16 @@ def _check_interaction_cst(self, n_features):
         return constraints

     @_fit_context(prefer_skip_nested_validation=True)
-    def fit(self, X, y, sample_weight=None):
+    def fit(
+        self,
+        X,
+        y,
+        sample_weight=None,
+        *,
+        X_val=None,
+        y_val=None,
+        sample_weight_val=None,
+    ):
         """Fit the gradient boosting model.

         Parameters
@@ -524,6 +533,23 @@ def fit(self, X, y, sample_weight=None):

             .. versionadded:: 0.23

+        X_val : array-like of shape (n_val, n_features)
+            Additional sample of features for validation used in early stopping.
+            In a `Pipeline`, `X_val` can be transformed the same way as `X` with
+            `Pipeline(..., transform_input=["X_val"])`.
+
+            .. versionadded:: 1.7
+
+        y_val : array-like of shape (n_samples,)
+            Additional sample of target values for validation used in early stopping.
+
+            .. versionadded:: 1.7
+
+        sample_weight_val : array-like of shape (n_samples,) default=None
+            Additional weights for validation used in early stopping.
+
+            .. versionadded:: 1.7
+
         Returns
         -------
         self : object
@@ -548,6 +574,30 @@ def fit(self, X, y, sample_weight=None):

         sample_weight = self._finalize_sample_weight(sample_weight, y)

+        validation_data_provided = X_val is not None or y_val is not None
+        if validation_data_provided:
+            if y_val is None:
+                raise ValueError("X_val is provided, but y_val was not provided.")
+            if X_val is None:
+                raise ValueError("y_val is provided, but X_val was not provided.")
+            X_val = self._preprocess_X(X_val, reset=False)
+            y_val = _check_y(y_val, estimator=self)
+            y_val = self._encode_y_val(y_val)
+            check_consistent_length(X_val, y_val)
+            if sample_weight_val is not None:
+                sample_weight_val = _check_sample_weight(
+                    sample_weight_val, X_val, dtype=np.float64
+                )
+            if self.early_stopping is False:
+                raise ValueError(
+                    "X_val and y_val are passed to fit while at the same time "
+                    "early_stopping is False. When passing X_val and y_val to fit, "
+                    "early_stopping should be set to either 'auto' or True."
+                )
+
+        # Note: At this point, we could delete self._label_encoder if it exists.
+        # But we don't, to keep the code even simpler.
+
         rng = check_random_state(self.random_state)

         # When warm starting, we want to reuse the same seed that was used
@@ -598,13 +648,19 @@ def fit(self, X, y, sample_weight=None):
             self._loss = self.loss

         if self.early_stopping == "auto":
-            self.do_early_stopping_ = n_samples > 10000
+            self.do_early_stopping_ = n_samples > 10_000
         else:
             self.do_early_stopping_ = self.early_stopping

         # create validation data if needed
-        self._use_validation_data = self.validation_fraction is not None
-        if self.do_early_stopping_ and self._use_validation_data:
+        self._use_validation_data = (
+            self.validation_fraction is not None or validation_data_provided
+        )
+        if (
+            self.do_early_stopping_
+            and self._use_validation_data
+            and not validation_data_provided
+        ):
             # stratify for classification
             # instead of checking predict_proba, loss.n_classes >= 2 would also work
             stratify = y if hasattr(self._loss, "predict_proba") else None
@@ -642,7 +698,8 @@ def fit(self, X, y, sample_weight=None):
                 )
         else:
             X_train, y_train, sample_weight_train = X, y, sample_weight
-            X_val = y_val = sample_weight_val = None
+            if not validation_data_provided:
+                X_val = y_val = sample_weight_val = None

         # Bin the data
         # For ease of use of the API, the user-facing GBDT classes accept the
@@ -1397,7 +1454,11 @@ def _get_loss(self, sample_weight):

     @abstractmethod
     def _encode_y(self, y=None):
-        pass
+        pass  # pragma: no cover
+
+    @abstractmethod
+    def _encode_y_val(self, y=None):
+        pass  # pragma: no cover

     @property
     def n_iter_(self):
@@ -1574,8 +1635,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
         See :term:`the Glossary <warm_start>`.
     early_stopping : 'auto' or bool, default='auto'
         If 'auto', early stopping is enabled if the sample size is larger than
-        10000. If True, early stopping is enabled, otherwise early stopping is
-        disabled.
+        10000 or if `X_val` and `y_val` are passed to `fit`. If True, early stopping
+        is enabled, otherwise early stopping is disabled.

         .. versionadded:: 0.23

@@ -1593,7 +1654,9 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
     validation_fraction : int or float or None, default=0.1
         Proportion (or absolute size) of training data to set aside as
         validation data for early stopping. If None, early stopping is done on
-        the training data. Only used if early stopping is performed.
+        the training data.
+        The value is ignored if either early stopping is not performed, e.g.
+        `early_stopping=False`, or if `X_val` and `y_val` are passed to fit.
     n_iter_no_change : int, default=10
         Used to determine when to "early stop". The fitting process is
         stopped when none of the last ``n_iter_no_change`` scores are better
@@ -1795,6 +1858,9 @@ def _encode_y(self, y):
                 )
         return y

+    def _encode_y_val(self, y=None):
+        return self._encode_y(y)
+
     def _get_loss(self, sample_weight):
         if self.loss == "quantile":
             return _LOSSES[self.loss](
@@ -1963,8 +2029,8 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
         See :term:`the Glossary <warm_start>`.
     early_stopping : 'auto' or bool, default='auto'
         If 'auto', early stopping is enabled if the sample size is larger than
-        10000. If True, early stopping is enabled, otherwise early stopping is
-        disabled.
+        10000 or if `X_val` and `y_val` are passed to `fit`. If True, early stopping
+        is enabled, otherwise early stopping is disabled.

         .. versionadded:: 0.23

@@ -1981,7 +2047,9 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
     validation_fraction : int or float or None, default=0.1
         Proportion (or absolute size) of training data to set aside as
         validation data for early stopping. If None, early stopping is done on
-        the training data. Only used if early stopping is performed.
+        the training data.
+        The value is ignored if either early stopping is not performed, e.g.
+        `early_stopping=False`, or if `X_val` and `y_val` are passed to fit.
     n_iter_no_change : int, default=10
         Used to determine when to "early stop". The fitting process is
         stopped when none of the last ``n_iter_no_change`` scores are better
@@ -2272,20 +2340,27 @@ def staged_decision_function(self, X):
             yield staged_decision

     def _encode_y(self, y):
+        """Create self._label_encoder and encode y correspondingly."""
         # encode classes into 0 ... n_classes - 1 and sets attributes classes_
         # and n_trees_per_iteration_
         check_classification_targets(y)

-        label_encoder = LabelEncoder()
-        encoded_y = label_encoder.fit_transform(y)
-        self.classes_ = label_encoder.classes_
+        # We need to store the label encoder in case y_val needs to be label encoded,
+        # too.
+        self._label_encoder = LabelEncoder()
+        encoded_y = self._label_encoder.fit_transform(y)
+        self.classes_ = self._label_encoder.classes_
         n_classes = self.classes_.shape[0]
         # only 1 tree for binary classification. For multiclass classification,
         # we build 1 tree per class.
         self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes
         encoded_y = encoded_y.astype(Y_DTYPE, copy=False)
         return encoded_y

+    def _encode_y_val(self, y):
+        encoded_y = self._label_encoder.transform(y)
+        return encoded_y.astype(Y_DTYPE, copy=False)
+
     def _get_loss(self, sample_weight):
         # At this point self.loss == "log_loss"
         if self.n_trees_per_iteration_ == 1:
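
The updated docstring above notes that, inside a `Pipeline`, `X_val` can be transformed like `X` via `Pipeline(..., transform_input=["X_val"])`. Below is a hedged sketch of how that could look; the metadata-routing details (`sklearn.set_config(enable_metadata_routing=True)` and `set_fit_request(X_val=True, y_val=True)`) are assumptions about the surrounding scikit-learn machinery, not something established by this commit:

    import sklearn
    from sklearn.datasets import make_regression
    from sklearn.ensemble import HistGradientBoostingRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    # Assumption: routing X_val/y_val through a Pipeline requires metadata routing.
    sklearn.set_config(enable_metadata_routing=True)

    X, y = make_regression(n_samples=1_000, random_state=0)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

    # Assumption: the estimator must request X_val/y_val for them to be routed to fit.
    hgbt = HistGradientBoostingRegressor(early_stopping=True).set_fit_request(
        X_val=True, y_val=True
    )
    pipe = Pipeline(
        [("scale", StandardScaler()), ("hgbt", hgbt)],
        # X_val is transformed by the earlier steps before reaching the final fit.
        transform_input=["X_val"],
    )
    pipe.fit(X_train, y_train, X_val=X_val, y_val=y_val)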

sklearn/ensemble/_hist_gradient_boosting/tests/test_gradient_boosting.py

Lines changed: 95 additions & 1 deletion
@@ -35,7 +35,7 @@
 from sklearn.model_selection import cross_val_score, train_test_split
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, OneHotEncoder
-from sklearn.utils import shuffle
+from sklearn.utils import check_random_state, shuffle
 from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
 from sklearn.utils._testing import _convert_container
 from sklearn.utils.fixes import _IS_32BIT
@@ -1450,6 +1450,100 @@ def test_unknown_category_that_are_negative():
     assert_allclose(hist.predict(X_test_neg), hist.predict(X_test_nan))


+@pytest.mark.parametrize(
+    ("GradientBoosting", "make_X_y"),
+    [
+        (HistGradientBoostingClassifier, make_classification),
+        (HistGradientBoostingRegressor, make_regression),
+    ],
+)
+@pytest.mark.parametrize("sample_weight", [False, True])
+def test_X_val_in_fit(GradientBoosting, make_X_y, sample_weight, global_random_seed):
+    """Test that passing X_val, y_val in fit is same as validation fraction."""
+    rng = np.random.RandomState(42)
+    n_samples = 100
+    X, y = make_X_y(n_samples=n_samples, random_state=rng)
+    if sample_weight:
+        sample_weight = np.abs(rng.normal(size=n_samples))
+        data = (X, y, sample_weight)
+    else:
+        sample_weight = None
+        data = (X, y)
+    rng_seed = global_random_seed
+
+    # Fit with validation fraction and early stopping.
+    m1 = GradientBoosting(
+        early_stopping=True,
+        validation_fraction=0.5,
+        random_state=rng_seed,
+    )
+    m1.fit(X, y, sample_weight)
+
+    # Do train-test split ourselves.
+    rng = check_random_state(rng_seed)
+    # We do the same as in the fit method.
+    stratify = y if isinstance(m1, HistGradientBoostingClassifier) else None
+    random_seed = rng.randint(np.iinfo(np.uint32).max, dtype="u8")
+    X_train, X_val, y_train, y_val, *sw = train_test_split(
+        *data,
+        test_size=0.5,
+        stratify=stratify,
+        random_state=random_seed,
+    )
+    if sample_weight is not None:
+        sample_weight_train = sw[0]
+        sample_weight_val = sw[1]
+    else:
+        sample_weight_train = None
+        sample_weight_val = None
+    m2 = GradientBoosting(
+        early_stopping=True,
+        random_state=rng_seed,
+    )
+    m2.fit(
+        X_train,
+        y_train,
+        sample_weight=sample_weight_train,
+        X_val=X_val,
+        y_val=y_val,
+        sample_weight_val=sample_weight_val,
+    )
+
+    assert_allclose(m2.n_iter_, m1.n_iter_)
+    assert_allclose(m2.predict(X), m1.predict(X))
+
+
+def test_X_val_raises_missing_y_val():
+    """Test that an error is raised if X_val given but y_val None."""
+    X, y = make_classification(n_samples=4)
+    X, X_val = X[:2], X[2:]
+    y, y_val = y[:2], y[2:]
+    with pytest.raises(
+        ValueError,
+        match="X_val is provided, but y_val was not provided",
+    ):
+        HistGradientBoostingClassifier().fit(X, y, X_val=X_val)
+    with pytest.raises(
+        ValueError,
+        match="y_val is provided, but X_val was not provided",
+    ):
+        HistGradientBoostingClassifier().fit(X, y, y_val=y_val)
+
+
+def test_X_val_raises_with_early_stopping_false():
+    """Test that an error is raised if X_val given but early_stopping is False."""
+    X, y = make_regression(n_samples=4)
+    X, X_val = X[:2], X[2:]
+    y, y_val = y[:2], y[2:]
+    with pytest.raises(
+        ValueError,
+        match="X_val and y_val are passed to fit while at the same time",
+    ):
+        HistGradientBoostingRegressor(early_stopping=False).fit(
+            X, y, X_val=X_val, y_val=y_val
+        )
+
+
 @pytest.mark.parametrize("dataframe_lib", ["pandas", "polars"])
 @pytest.mark.parametrize(
     "HistGradientBoosting",

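One consequence of `_encode_y_val` reusing the label encoder fitted on the training target (see the classifier changes above): classes that appear only in `y_val` cannot be encoded. A small illustrative sketch, not part of the commit; the data is made up and the exact error text would come from `LabelEncoder`, not from this change:

    import numpy as np
    from sklearn.ensemble import HistGradientBoostingClassifier

    X_train = np.array([[0.0], [1.0], [2.0], [3.0]])
    y_train = np.array([0, 0, 1, 1])
    X_val = np.array([[4.0]])
    y_val = np.array([2])  # class 2 was never seen during training

    clf = HistGradientBoostingClassifier(early_stopping=True)
    # _encode_y_val applies the label encoder fitted on y_train, so an unseen
    # class in y_val is expected to raise a ValueError from LabelEncoder.transform.
    clf.fit(X_train, y_train, X_val=X_val, y_val=y_val)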