@@ -421,8 +421,8 @@ def _check_categorical_features(self, X):
         )

         n_features = X.shape[1]
-        # At this point `_validate_data` was not called yet because we want to use the
-        # dtypes are used to discover the categorical features. Thus `feature_names_in_`
+        # At this point `validate_data` was not called yet because we use the original
+        # dtypes to discover the categorical features. Thus `feature_names_in_`
         # is not defined yet.
         feature_names_in_ = getattr(X, "columns", None)

@@ -508,7 +508,16 @@ def _check_interaction_cst(self, n_features):
         return constraints

     @_fit_context(prefer_skip_nested_validation=True)
-    def fit(self, X, y, sample_weight=None):
+    def fit(
+        self,
+        X,
+        y,
+        sample_weight=None,
+        *,
+        X_val=None,
+        y_val=None,
+        sample_weight_val=None,
+    ):
         """Fit the gradient boosting model.

         Parameters
@@ -524,6 +533,23 @@ def fit(self, X, y, sample_weight=None):

             .. versionadded:: 0.23

+        X_val : array-like of shape (n_val, n_features)
+            Additional sample of features for validation used in early stopping.
+            In a `Pipeline`, `X_val` can be transformed the same way as `X` with
+            `Pipeline(..., transform_input=["X_val"])`.
+
+            .. versionadded:: 1.7
+
+        y_val : array-like of shape (n_val,)
+            Additional sample of target values for validation used in early stopping.
+
+            .. versionadded:: 1.7
+
+        sample_weight_val : array-like of shape (n_val,), default=None
+            Additional weights for validation used in early stopping.
+
+            .. versionadded:: 1.7
+
         Returns
         -------
         self : object
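For readers of this hunk, here is a minimal usage sketch of the new keyword-only validation parameters. The data, split, and hyperparameters below are illustrative assumptions, not part of the diff, and require a scikit-learn version that contains this change (1.7 per the `versionadded` tags).

```python
# Sketch: pass an explicit validation set to fit() for early stopping.
import numpy as np
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split

rng = np.random.RandomState(0)
X = rng.uniform(size=(1_000, 5))
y = (X[:, 0] + X[:, 1] > 1.0).astype(int)

X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

clf = HistGradientBoostingClassifier(early_stopping=True, n_iter_no_change=5)
# X_val / y_val are keyword-only, matching the signature added above.
clf.fit(X_train, y_train, X_val=X_val, y_val=y_val)
print(clf.n_iter_)  # number of boosting iterations actually run
```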
@@ -548,6 +574,30 @@ def fit(self, X, y, sample_weight=None):

         sample_weight = self._finalize_sample_weight(sample_weight, y)

+        validation_data_provided = X_val is not None or y_val is not None
+        if validation_data_provided:
+            if y_val is None:
+                raise ValueError("X_val is provided, but y_val was not provided.")
+            if X_val is None:
+                raise ValueError("y_val is provided, but X_val was not provided.")
+            X_val = self._preprocess_X(X_val, reset=False)
+            y_val = _check_y(y_val, estimator=self)
+            y_val = self._encode_y_val(y_val)
+            check_consistent_length(X_val, y_val)
+            if sample_weight_val is not None:
+                sample_weight_val = _check_sample_weight(
+                    sample_weight_val, X_val, dtype=np.float64
+                )
+            if self.early_stopping is False:
+                raise ValueError(
+                    "X_val and y_val are passed to fit while at the same time "
+                    "early_stopping is False. When passing X_val and y_val to fit, "
+                    "early_stopping should be set to either 'auto' or True."
+                )
+
+        # Note: At this point, we could delete self._label_encoder if it exists.
+        # But we don't, to keep the code simpler.
+
         rng = check_random_state(self.random_state)

         # When warm starting, we want to reuse the same seed that was used
@@ -598,13 +648,19 @@ def fit(self, X, y, sample_weight=None):
             self._loss = self.loss

         if self.early_stopping == "auto":
-            self.do_early_stopping_ = n_samples > 10000
+            self.do_early_stopping_ = n_samples > 10_000
         else:
             self.do_early_stopping_ = self.early_stopping

         # create validation data if needed
-        self._use_validation_data = self.validation_fraction is not None
-        if self.do_early_stopping_ and self._use_validation_data:
+        self._use_validation_data = (
+            self.validation_fraction is not None or validation_data_provided
+        )
+        if (
+            self.do_early_stopping_
+            and self._use_validation_data
+            and not validation_data_provided
+        ):
             # stratify for classification
             # instead of checking predict_proba, loss.n_classes >= 2 would also work
             stratify = y if hasattr(self._loss, "predict_proba") else None
@@ -642,7 +698,8 @@ def fit(self, X, y, sample_weight=None):
                 )
         else:
             X_train, y_train, sample_weight_train = X, y, sample_weight
-            X_val = y_val = sample_weight_val = None
+            if not validation_data_provided:
+                X_val = y_val = sample_weight_val = None

         # Bin the data
         # For ease of use of the API, the user-facing GBDT classes accept the
@@ -1397,7 +1454,11 @@ def _get_loss(self, sample_weight):

     @abstractmethod
     def _encode_y(self, y=None):
-        pass
+        pass  # pragma: no cover
+
+    @abstractmethod
+    def _encode_y_val(self, y=None):
+        pass  # pragma: no cover

     @property
     def n_iter_(self):
@@ -1574,8 +1635,8 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
         See :term:`the Glossary <warm_start>`.
     early_stopping : 'auto' or bool, default='auto'
         If 'auto', early stopping is enabled if the sample size is larger than
-        10000. If True, early stopping is enabled, otherwise early stopping is
-        disabled.
+        10000 or if `X_val` and `y_val` are passed to `fit`. If True, early stopping
+        is enabled, otherwise early stopping is disabled.

         .. versionadded:: 0.23

@@ -1593,7 +1654,9 @@ class HistGradientBoostingRegressor(RegressorMixin, BaseHistGradientBoosting):
     validation_fraction : int or float or None, default=0.1
         Proportion (or absolute size) of training data to set aside as
         validation data for early stopping. If None, early stopping is done on
-        the training data. Only used if early stopping is performed.
+        the training data.
+        The value is ignored if early stopping is not performed, e.g. when
+        `early_stopping=False`, or if `X_val` and `y_val` are passed to `fit`.
     n_iter_no_change : int, default=10
         Used to determine when to "early stop". The fitting process is
         stopped when none of the last ``n_iter_no_change`` scores are better
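To make the updated precedence concrete, here is a small sketch contrasting the internal split driven by `validation_fraction` with an explicitly passed validation set. The data and split are illustrative assumptions; the behavior shown is the one described in the docstring above.

```python
# Sketch: validation_fraction drives an internal split unless X_val/y_val are given.
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X = rng.normal(size=(500, 3))
y = X @ np.array([1.0, 2.0, -1.0]) + rng.normal(scale=0.1, size=500)
X_train, X_val, y_train, y_val = X[:400], X[400:], y[:400], y[400:]

# Internal split: 10% of the training data is held out for the early-stopping score.
HistGradientBoostingRegressor(early_stopping=True, validation_fraction=0.1).fit(X_train, y_train)

# Explicit validation set: validation_fraction is ignored, X_val/y_val are scored instead.
HistGradientBoostingRegressor(early_stopping=True, validation_fraction=0.1).fit(
    X_train, y_train, X_val=X_val, y_val=y_val
)
```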
@@ -1795,6 +1858,9 @@ def _encode_y(self, y):
                 )
         return y

+    def _encode_y_val(self, y=None):
+        return self._encode_y(y)
+
     def _get_loss(self, sample_weight):
         if self.loss == "quantile":
             return _LOSSES[self.loss](
@@ -1963,8 +2029,8 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
         See :term:`the Glossary <warm_start>`.
     early_stopping : 'auto' or bool, default='auto'
         If 'auto', early stopping is enabled if the sample size is larger than
-        10000. If True, early stopping is enabled, otherwise early stopping is
-        disabled.
+        10000 or if `X_val` and `y_val` are passed to `fit`. If True, early stopping
+        is enabled, otherwise early stopping is disabled.

         .. versionadded:: 0.23

@@ -1981,7 +2047,9 @@ class HistGradientBoostingClassifier(ClassifierMixin, BaseHistGradientBoosting):
     validation_fraction : int or float or None, default=0.1
         Proportion (or absolute size) of training data to set aside as
         validation data for early stopping. If None, early stopping is done on
-        the training data. Only used if early stopping is performed.
+        the training data.
+        The value is ignored if early stopping is not performed, e.g. when
+        `early_stopping=False`, or if `X_val` and `y_val` are passed to `fit`.
     n_iter_no_change : int, default=10
         Used to determine when to "early stop". The fitting process is
         stopped when none of the last ``n_iter_no_change`` scores are better
@@ -2272,20 +2340,27 @@ def staged_decision_function(self, X):
             yield staged_decision

     def _encode_y(self, y):
+        """Create self._label_encoder and encode y correspondingly."""
         # encode classes into 0 ... n_classes - 1 and sets attributes classes_
         # and n_trees_per_iteration_
         check_classification_targets(y)

-        label_encoder = LabelEncoder()
-        encoded_y = label_encoder.fit_transform(y)
-        self.classes_ = label_encoder.classes_
+        # We need to store the label encoder in case y_val needs to be label encoded,
+        # too.
+        self._label_encoder = LabelEncoder()
+        encoded_y = self._label_encoder.fit_transform(y)
+        self.classes_ = self._label_encoder.classes_
         n_classes = self.classes_.shape[0]
         # only 1 tree for binary classification. For multiclass classification,
         # we build 1 tree per class.
         self.n_trees_per_iteration_ = 1 if n_classes <= 2 else n_classes
         encoded_y = encoded_y.astype(Y_DTYPE, copy=False)
         return encoded_y

+    def _encode_y_val(self, y):
+        encoded_y = self._label_encoder.transform(y)
+        return encoded_y.astype(Y_DTYPE, copy=False)
+
     def _get_loss(self, sample_weight):
         # At this point self.loss == "log_loss"
         if self.n_trees_per_iteration_ == 1:
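The reason `_encode_y` now stores the fitted `LabelEncoder` on the estimator is that `y_val` must be encoded with exactly the same class-to-integer mapping as `y`. A standalone sketch of that invariant, with illustrative labels only:

```python
# Sketch: the encoder fitted on y is reused to transform y_val consistently.
import numpy as np
from sklearn.preprocessing import LabelEncoder

y_train = np.array(["cat", "dog", "dog", "bird"])
y_val = np.array(["dog", "bird"])

le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)  # learns the mapping bird=0, cat=1, dog=2
y_val_enc = le.transform(y_val)          # reuses the same mapping for validation targets

print(le.classes_)   # ['bird' 'cat' 'dog']
print(y_train_enc)   # [1 2 2 0]
print(y_val_enc)     # [2 0]
```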