@@ -72,6 +72,7 @@ def _generate_bagging_indices(
72
72
n_samples ,
73
73
max_features ,
74
74
max_samples ,
75
+ sample_weight ,
75
76
):
76
77
"""Randomly draw feature and sample indices."""
77
78
# Get valid random state
@@ -81,18 +82,37 @@ def _generate_bagging_indices(
81
82
feature_indices = _generate_indices (
82
83
random_state , bootstrap_features , n_features , max_features
83
84
)
84
- sample_indices = _generate_indices (
85
- random_state , bootstrap_samples , n_samples , max_samples
86
- )
85
+ if sample_weight is None :
86
+ sample_indices = _generate_indices (
87
+ random_state , bootstrap_samples , n_samples , max_samples
88
+ )
89
+ else :
90
+ normalized_sample_weight = sample_weight / np .sum (sample_weight )
91
+ sample_indices = random_state .choice (
92
+ n_samples ,
93
+ max_samples ,
94
+ replace = bootstrap_samples ,
95
+ p = normalized_sample_weight ,
96
+ )
87
97
88
98
return feature_indices , sample_indices
89
99
90
100
101
def _consumes_sample_weight(estimator):
    """Return whether `estimator` accepts `sample_weight` in its `fit` method.

    When metadata routing is enabled, the decision is delegated to the
    routing machinery (`get_routing_for_object(...).consumes`); otherwise
    the `fit` signature is inspected directly via `has_fit_parameter`.

    Parameters
    ----------
    estimator : estimator instance
        The (base) estimator to inspect.

    Returns
    -------
    bool
        True if `sample_weight` would be consumed by `estimator.fit`.
    """
    if not _routing_enabled():
        return has_fit_parameter(estimator, "sample_weight")
    router = get_routing_for_object(estimator)
    return router.consumes("fit", ("sample_weight",))
108
+
109
+
91
110
def _parallel_build_estimators (
92
111
n_estimators ,
93
112
ensemble ,
94
113
X ,
95
114
y ,
115
+ sample_weight ,
96
116
seeds ,
97
117
total_n_estimators ,
98
118
verbose ,
@@ -108,22 +128,12 @@ def _parallel_build_estimators(
108
128
bootstrap_features = ensemble .bootstrap_features
109
129
has_check_input = has_fit_parameter (ensemble .estimator_ , "check_input" )
110
130
requires_feature_indexing = bootstrap_features or max_features != n_features
131
+ consumes_sample_weight = _consumes_sample_weight (ensemble .estimator_ )
111
132
112
133
# Build estimators
113
134
estimators = []
114
135
estimators_features = []
115
136
116
- # TODO: (slep6) remove if condition for unrouted sample_weight when metadata
117
- # routing can't be disabled.
118
- support_sample_weight = has_fit_parameter (ensemble .estimator_ , "sample_weight" )
119
- if not _routing_enabled () and (
120
- not support_sample_weight and fit_params .get ("sample_weight" ) is not None
121
- ):
122
- raise ValueError (
123
- "The base estimator doesn't support sample weight, but sample_weight is "
124
- "passed to the fit method."
125
- )
126
-
127
137
for i in range (n_estimators ):
128
138
if verbose > 1 :
129
139
print (
@@ -139,7 +149,8 @@ def _parallel_build_estimators(
139
149
else :
140
150
estimator_fit = estimator .fit
141
151
142
- # Draw random feature, sample indices
152
+ # Draw random feature, sample indices (using normalized sample_weight
153
+ # as probabilities if provided ).
143
154
features , indices = _generate_bagging_indices (
144
155
random_state ,
145
156
bootstrap_features ,
@@ -148,45 +159,22 @@ def _parallel_build_estimators(
148
159
n_samples ,
149
160
max_features ,
150
161
max_samples ,
162
+ sample_weight ,
151
163
)
152
164
153
165
fit_params_ = fit_params .copy ()
154
166
155
- # TODO(SLEP6): remove if condition for unrouted sample_weight when metadata
156
- # routing can't be disabled.
157
- # 1. If routing is enabled, we will check if the routing supports sample
158
- # weight and use it if it does.
159
- # 2. If routing is not enabled, we will check if the base
160
- # estimator supports sample_weight and use it if it does.
161
-
162
167
# Note: Row sampling can be achieved either through setting sample_weight or
163
- # by indexing. The former is more efficient. Therefore, use this method
168
+ # by indexing. The former is more memory efficient. Therefore, use this method
164
169
# if possible, otherwise use indexing.
165
- if _routing_enabled ():
166
- request_or_router = get_routing_for_object (ensemble .estimator_ )
167
- consumes_sample_weight = request_or_router .consumes (
168
- "fit" , ("sample_weight" ,)
169
- )
170
- else :
171
- consumes_sample_weight = support_sample_weight
172
170
if consumes_sample_weight :
173
- # Draw sub samples, using sample weights, and then fit
174
- curr_sample_weight = _check_sample_weight (
175
- fit_params_ .pop ("sample_weight" , None ), X
176
- ).copy ()
177
-
178
- if bootstrap :
179
- sample_counts = np .bincount (indices , minlength = n_samples )
180
- curr_sample_weight *= sample_counts
181
- else :
182
- not_indices_mask = ~ indices_to_mask (indices , n_samples )
183
- curr_sample_weight [not_indices_mask ] = 0
184
-
185
- fit_params_ ["sample_weight" ] = curr_sample_weight
171
+ # Row sampling by setting sample_weight
172
+ indices_as_sample_weight = np .bincount (indices , minlength = n_samples )
173
+ fit_params_ ["sample_weight" ] = indices_as_sample_weight
186
174
X_ = X [:, features ] if requires_feature_indexing else X
187
175
estimator_fit (X_ , y , ** fit_params_ )
188
176
else :
189
- # cannot use sample_weight, so use indexing
177
+ # Row sampling by indexing
190
178
y_ = _safe_indexing (y , indices )
191
179
X_ = _safe_indexing (X , indices )
192
180
fit_params_ = _check_method_params (X , params = fit_params_ , indices = indices )
@@ -354,9 +342,11 @@ def fit(self, X, y, sample_weight=None, **fit_params):
354
342
regression).
355
343
356
344
sample_weight : array-like of shape (n_samples,), default=None
357
- Sample weights. If None, then samples are equally weighted.
358
- Note that this is supported only if the base estimator supports
359
- sample weighting.
345
+ Sample weights. If None, then samples are equally weighted. Used as
346
+ probabilities to sample the training set. Note that the expected
347
+ frequency semantics for the `sample_weight` parameter are only
348
+ fulfilled when sampling with replacement (`bootstrap=True` ).
349
+
360
350
**fit_params : dict
361
351
Parameters to pass to the underlying estimators.
362
352
@@ -386,6 +376,15 @@ def fit(self, X, y, sample_weight=None, **fit_params):
386
376
multi_output = True ,
387
377
)
388
378
379
+ if sample_weight is not None :
380
+ sample_weight = _check_sample_weight (sample_weight , X , dtype = None )
381
+
382
+ if not self .bootstrap :
383
+ warn (
384
+ f"When fitting { self .__class__ .__name__ } with sample_weight "
385
+ f"it is recommended to use bootstrap=True, got { self .bootstrap } ."
386
+ )
387
+
389
388
return self ._fit (
390
389
X ,
391
390
y ,
@@ -435,8 +434,6 @@ def _fit(
435
434
436
435
sample_weight : array-like of shape (n_samples,), default=None
437
436
Sample weights. If None, then samples are equally weighted.
438
- Note that this is supported only if the base estimator supports
439
- sample weighting.
440
437
441
438
**fit_params : dict, default=None
442
439
Parameters to pass to the :term:`fit` method of the underlying
@@ -457,30 +454,38 @@ def _fit(
457
454
# Check parameters
458
455
self ._validate_estimator (self ._get_estimator ())
459
456
460
- if sample_weight is not None :
461
- fit_params ["sample_weight" ] = sample_weight
462
-
463
457
if _routing_enabled ():
464
458
routed_params = process_routing (self , "fit" , ** fit_params )
465
459
else :
466
460
routed_params = Bunch ()
467
461
routed_params .estimator = Bunch (fit = fit_params )
468
- if "sample_weight" in fit_params :
469
- routed_params .estimator .fit ["sample_weight" ] = fit_params [
470
- "sample_weight"
471
- ]
472
462
473
463
if max_depth is not None :
474
464
self .estimator_ .max_depth = max_depth
475
465
476
466
# Validate max_samples
477
467
if max_samples is None :
478
468
max_samples = self .max_samples
479
- elif not isinstance (max_samples , numbers .Integral ):
480
- max_samples = int (max_samples * X .shape [0 ])
481
469
482
- if max_samples > X .shape [0 ]:
483
- raise ValueError ("max_samples must be <= n_samples" )
470
+ if not isinstance (max_samples , numbers .Integral ):
471
+ if sample_weight is None :
472
+ max_samples = max (int (max_samples * X .shape [0 ]), 1 )
473
+ else :
474
+ sw_sum = np .sum (sample_weight )
475
+ if sw_sum <= 1 :
476
+ raise ValueError (
477
+ f"The total sum of sample weights is { sw_sum } , which prevents "
478
+ "resampling with a fractional value for max_samples="
479
+ f"{ max_samples } . Either pass max_samples as an integer or "
480
+ "use a larger sample_weight."
481
+ )
482
+ max_samples = max (int (max_samples * sw_sum ), 1 )
483
+
484
+ if not self .bootstrap and max_samples > X .shape [0 ]:
485
+ raise ValueError (
486
+ f"Effective max_samples={ max_samples } must be <= n_samples="
487
+ f"{ X .shape [0 ]} to be able to sample without replacement."
488
+ )
484
489
485
490
# Store validated integer row sampling value
486
491
self ._max_samples = max_samples
@@ -499,6 +504,11 @@ def _fit(
499
504
# Store validated integer feature sampling value
500
505
self ._max_features = max_features
501
506
507
+ # Store sample_weight (needed in _get_estimators_indices). Note that
508
+ # we intentionally do not materialize `sample_weight=None` as an array
509
+ # of ones to avoid unnecessarily cluttering trained estimator pickles.
510
+ self ._sample_weight = sample_weight
511
+
502
512
# Other checks
503
513
if not self .bootstrap and self .oob_score :
504
514
raise ValueError ("Out of bag estimation only available if bootstrap=True" )
@@ -552,6 +562,7 @@ def _fit(
552
562
self ,
553
563
X ,
554
564
y ,
565
+ sample_weight ,
555
566
seeds [starts [i ] : starts [i + 1 ]],
556
567
total_n_estimators ,
557
568
verbose = self .verbose ,
@@ -596,6 +607,7 @@ def _get_estimators_indices(self):
596
607
self ._n_samples ,
597
608
self ._max_features ,
598
609
self ._max_samples ,
610
+ self ._sample_weight ,
599
611
)
600
612
601
613
yield feature_indices , sample_indices
@@ -726,7 +738,8 @@ class BaggingClassifier(ClassifierMixin, BaseBagging):
726
738
replacement by default, see `bootstrap` for more details).
727
739
728
740
- If int, then draw `max_samples` samples.
729
- - If float, then draw `max_samples * X.shape[0]` samples.
741
+ - If float, then draw `max_samples * X.shape[0]` unweighted samples
742
+ or `max_samples * sample_weight.sum()` weighted samples.
730
743
731
744
max_features : int or float, default=1.0
732
745
The number of features to draw from X to train each base estimator (
@@ -737,8 +750,10 @@ class BaggingClassifier(ClassifierMixin, BaseBagging):
737
750
- If float, then draw `max(1, int(max_features * n_features_in_))` features.
738
751
739
752
bootstrap : bool, default=True
740
- Whether samples are drawn with replacement. If False, sampling
741
- without replacement is performed.
753
+ Whether samples are drawn with replacement. If False, sampling without
754
+ replacement is performed. If fitting with `sample_weight`, it is
755
+ strongly recommended to choose True, as only drawing with replacement
756
+ will ensure the expected frequency semantics of `sample_weight`.
742
757
743
758
bootstrap_features : bool, default=False
744
759
Whether features are drawn with replacement.
@@ -1245,8 +1260,10 @@ class BaggingRegressor(RegressorMixin, BaseBagging):
1245
1260
- If float, then draw `max(1, int(max_features * n_features_in_))` features.
1246
1261
1247
1262
bootstrap : bool, default=True
1248
- Whether samples are drawn with replacement. If False, sampling
1249
- without replacement is performed.
1263
+ Whether samples are drawn with replacement. If False, sampling without
1264
+ replacement is performed. If fitting with `sample_weight`, it is
1265
+ strongly recommended to choose True, as only drawing with replacement
1266
+ will ensure the expected frequency semantics of `sample_weight`.
1250
1267
1251
1268
bootstrap_features : bool, default=False
1252
1269
Whether features are drawn with replacement.
0 commit comments