Commit 9f86681

5nizza authored
DOC Scale data before using k-neighbours regression (scikit-learn#31201)
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
Co-authored-by: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Co-authored-by: Virgil Chan <virchan.math@gmail.com>
Co-authored-by: Tim Head <betatim@gmail.com>
1 parent d03054b commit 9f86681
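For context on the commit title: k-nearest-neighbours regression ranks neighbours by raw Euclidean distance, so features with very different numeric ranges (e.g. Population vs. AveBedrms in the California housing data) can dominate the neighbour search unless the data is scaled first. The snippet below is a minimal illustrative sketch, not part of this commit, comparing an unscaled KNeighborsRegressor against a RobustScaler pipeline on the same subsampled dataset; the n_neighbors and cv values are arbitrary choices for illustration.

# Illustrative sketch (not part of this commit): why scaling matters for k-NN.
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

X, y = fetch_california_housing(return_X_y=True)
X, y = X[::10], y[::10]  # subsample for speed, as the example itself does

models = {
    "k-NN, unscaled": KNeighborsRegressor(n_neighbors=10),
    "k-NN, scaled": make_pipeline(RobustScaler(), KNeighborsRegressor(n_neighbors=10)),
}
for name, model in models.items():
    # Negated because scikit-learn returns negative MSE for "greater is better" scoring.
    mse = -cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=5)
    print(f"{name}: mean MSE = {mse.mean():.3f}")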

File tree

2 files changed (+121, −142 lines)


examples/impute/plot_iterative_imputer_variants_comparison.py

Lines changed: 50 additions & 35 deletions
@@ -13,7 +13,7 @@
 imputation with :class:`~impute.IterativeImputer`:
 
 * :class:`~linear_model.BayesianRidge`: regularized linear regression
-* :class:`~ensemble.RandomForestRegressor`: Forests of randomized trees regression
+* :class:`~ensemble.RandomForestRegressor`: forests of randomized trees regression
 * :func:`~pipeline.make_pipeline` (:class:`~kernel_approximation.Nystroem`,
   :class:`~linear_model.Ridge`): a pipeline with the expansion of a degree 2
   polynomial kernel and regularized linear regression
@@ -62,28 +62,39 @@
 from sklearn.model_selection import cross_val_score
 from sklearn.neighbors import KNeighborsRegressor
 from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import RobustScaler
 
 N_SPLITS = 5
 
-rng = np.random.RandomState(0)
-
 X_full, y_full = fetch_california_housing(return_X_y=True)
 # ~2k samples is enough for the purpose of the example.
 # Remove the following two lines for a slower run with different error bars.
 X_full = X_full[::10]
 y_full = y_full[::10]
 n_samples, n_features = X_full.shape
 
+
+def compute_score_for(X, y, imputer=None):
+    # We scale data before imputation and training a target estimator,
+    # because our target estimator and some of the imputers assume
+    # that the features have similar scales.
+    if imputer is None:
+        estimator = make_pipeline(RobustScaler(), BayesianRidge())
+    else:
+        estimator = make_pipeline(RobustScaler(), imputer, BayesianRidge())
+    return cross_val_score(
+        estimator, X, y, scoring="neg_mean_squared_error", cv=N_SPLITS
+    )
+
+
 # Estimate the score on the entire dataset, with no missing values
-br_estimator = BayesianRidge()
 score_full_data = pd.DataFrame(
-    cross_val_score(
-        br_estimator, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS
-    ),
+    compute_score_for(X_full, y_full),
     columns=["Full Data"],
 )
 
 # Add a single missing value to each row
+rng = np.random.RandomState(0)
 X_missing = X_full.copy()
 y_missing = y_full
 missing_samples = np.arange(n_samples)
@@ -93,48 +104,52 @@
 # Estimate the score after imputation (mean and median strategies)
 score_simple_imputer = pd.DataFrame()
 for strategy in ("mean", "median"):
-    estimator = make_pipeline(
-        SimpleImputer(missing_values=np.nan, strategy=strategy), br_estimator
-    )
-    score_simple_imputer[strategy] = cross_val_score(
-        estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS
+    score_simple_imputer[strategy] = compute_score_for(
+        X_missing, y_missing, SimpleImputer(strategy=strategy)
     )
 
 # Estimate the score after iterative imputation of the missing values
 # with different estimators
-estimators = [
-    BayesianRidge(),
-    RandomForestRegressor(
-        # We tuned the hyperparameters of the RandomForestRegressor to get a good
-        # enough predictive performance for a restricted execution time.
-        n_estimators=4,
-        max_depth=10,
-        bootstrap=True,
-        max_samples=0.5,
-        n_jobs=2,
-        random_state=0,
+named_estimators = [
+    ("Bayesian Ridge", BayesianRidge()),
+    (
+        "Random Forest",
+        RandomForestRegressor(
+            # We tuned the hyperparameters of the RandomForestRegressor to get a good
+            # enough predictive performance for a restricted execution time.
+            n_estimators=5,
+            max_depth=10,
+            bootstrap=True,
+            max_samples=0.5,
+            n_jobs=2,
+            random_state=0,
+        ),
     ),
-    make_pipeline(
-        Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3)
+    (
+        "Nystroem + Ridge",
+        make_pipeline(
+            Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e4)
+        ),
+    ),
+    (
+        "k-NN",
+        KNeighborsRegressor(n_neighbors=10),
     ),
-    KNeighborsRegressor(n_neighbors=15),
 ]
 score_iterative_imputer = pd.DataFrame()
-# iterative imputer is sensible to the tolerance and
+# Iterative imputer is sensitive to the tolerance and
 # dependent on the estimator used internally.
-# we tuned the tolerance to keep this example run with limited computational
+# We tuned the tolerance to keep this example run with limited computational
 # resources while not changing the results too much compared to keeping the
 # stricter default value for the tolerance parameter.
 tolerances = (1e-3, 1e-1, 1e-1, 1e-2)
-for impute_estimator, tol in zip(estimators, tolerances):
-    estimator = make_pipeline(
+for (name, impute_estimator), tol in zip(named_estimators, tolerances):
+    score_iterative_imputer[name] = compute_score_for(
+        X_missing,
+        y_missing,
         IterativeImputer(
-            random_state=0, estimator=impute_estimator, max_iter=25, tol=tol
+            random_state=0, estimator=impute_estimator, max_iter=40, tol=tol
         ),
-        br_estimator,
-    )
-    score_iterative_imputer[impute_estimator.__class__.__name__] = cross_val_score(
-        estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS
     )
 
 scores = pd.concat(

0 commit comments