
Commit 2107404

lorentzenchr authored
ENH make initial binning in HGBT parallel (scikit-learn#28064)
1 parent 4bc61a0 commit 2107404

File tree

2 files changed: +27 −9 lines changed

  doc/whats_new/v1.6.rst
  sklearn/ensemble/_hist_gradient_boosting/binning.py

doc/whats_new/v1.6.rst

Lines changed: 8 additions & 0 deletions
@@ -122,6 +122,14 @@ Changelog
   can be silenced using the `reg_param` attribute.
   :pr:`19731` by :user:`Alihan Zihna <azihna>`.
 
+:mod:`sklearn.ensemble`
+.......................
+
+- |Efficiency| Small runtime improvement of fitting
+  :class:`ensemble.HistGradientBoostingClassifier` and :class:`ensemble.HistGradientBoostingRegressor`
+  by parallelizing the initial search for bin thresholds
+  :pr:`28064` by :user:`Christian Lorentzen <lorentzenchr>`.
+
 :mod:`sklearn.impute`
 .....................
 
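For context, the bin thresholds mentioned in the changelog entry above are computed once at the start of fit(). A minimal usage sketch on synthetic data (illustrative only, not part of this commit):

# The initial bin-threshold search runs at the start of fit(); this is the
# step that the change in binning.py below parallelizes.
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor

rng = np.random.RandomState(0)
X = rng.normal(size=(1_000, 10))
y = X[:, 0] + 0.1 * rng.normal(size=1_000)

# Numerical features are binned into at most max_bins (default 255) bins
# before boosting starts.
est = HistGradientBoostingRegressor(max_iter=50).fit(X, y)
print(est.score(X, y))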
sklearn/ensemble/_hist_gradient_boosting/binning.py

Lines changed: 19 additions & 9 deletions
@@ -7,6 +7,7 @@
 """

 # Author: Nicolas Hug
+import concurrent.futures

 import numpy as np

@@ -226,22 +227,31 @@ def fit(self, X, y=None):

         self.missing_values_bin_idx_ = self.n_bins - 1

-        self.bin_thresholds_ = []
-        n_bins_non_missing = []
+        self.bin_thresholds_ = [None] * n_features
+        n_bins_non_missing = [None] * n_features
+
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=self.n_threads
+        ) as executor:
+            future_to_f_idx = {
+                executor.submit(_find_binning_thresholds, X[:, f_idx], max_bins): f_idx
+                for f_idx in range(n_features)
+                if not self.is_categorical_[f_idx]
+            }
+            for future in concurrent.futures.as_completed(future_to_f_idx):
+                f_idx = future_to_f_idx[future]
+                self.bin_thresholds_[f_idx] = future.result()
+                n_bins_non_missing[f_idx] = self.bin_thresholds_[f_idx].shape[0] + 1

         for f_idx in range(n_features):
-            if not self.is_categorical_[f_idx]:
-                thresholds = _find_binning_thresholds(X[:, f_idx], max_bins)
-                n_bins_non_missing.append(thresholds.shape[0] + 1)
-            else:
+            if self.is_categorical_[f_idx]:
                 # Since categories are assumed to be encoded in
                 # [0, n_cats] and since n_cats <= max_bins,
                 # the thresholds *are* the unique categorical values. This will
                 # lead to the correct mapping in transform()
                 thresholds = known_categories[f_idx]
-                n_bins_non_missing.append(thresholds.shape[0])
-
-                self.bin_thresholds_.append(thresholds)
+                n_bins_non_missing[f_idx] = thresholds.shape[0]
+                self.bin_thresholds_[f_idx] = thresholds

         self.n_bins_non_missing_ = np.array(n_bins_non_missing, dtype=np.uint32)
         return self
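
To make the pattern above easier to follow outside of scikit-learn, here is a self-contained sketch of the same ThreadPoolExecutor idiom: submit one task per non-categorical column and write each result back under its feature index. find_thresholds and the toy data are hypothetical stand-ins, not the private _find_binning_thresholds helper.

# Standalone sketch of the parallelization pattern from _BinMapper.fit above.
# find_thresholds() is a simplified stand-in for scikit-learn's private helper.
import concurrent.futures

import numpy as np


def find_thresholds(col, max_bins):
    # Stand-in logic: bin thresholds as midpoints between column quantiles.
    quantiles = np.percentile(col, np.linspace(0, 100, max_bins + 1))
    return (quantiles[:-1] + quantiles[1:]) / 2


rng = np.random.RandomState(0)
X = rng.normal(size=(10_000, 8))
is_categorical = np.zeros(X.shape[1], dtype=bool)
max_bins = 255
n_threads = 4

thresholds = [None] * X.shape[1]
with concurrent.futures.ThreadPoolExecutor(max_workers=n_threads) as executor:
    # One future per non-categorical feature, mapped back to its column index
    # so results land in feature order regardless of completion order.
    future_to_f_idx = {
        executor.submit(find_thresholds, X[:, f_idx], max_bins): f_idx
        for f_idx in range(X.shape[1])
        if not is_categorical[f_idx]
    }
    for future in concurrent.futures.as_completed(future_to_f_idx):
        thresholds[future_to_f_idx[future]] = future.result()

print([t.shape[0] for t in thresholds])

Threads rather than processes keep memory sharing cheap, and results are written into a shared list indexed by feature; since the per-column work is NumPy-heavy, the speedup is modest, consistent with the changelog's "small runtime improvement" wording.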
