@@ -13,7 +13,7 @@
 imputation with :class:`~impute.IterativeImputer`:

 * :class:`~linear_model.BayesianRidge`: regularized linear regression
-* :class:`~ensemble.RandomForestRegressor`: Forests of randomized trees regression
+* :class:`~ensemble.RandomForestRegressor`: forests of randomized trees regression
 * :func:`~pipeline.make_pipeline` (:class:`~kernel_approximation.Nystroem`,
   :class:`~linear_model.Ridge`): a pipeline with the expansion of a degree 2
   polynomial kernel and regularized linear regression
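For context on the list above: each of these regressors is passed to IterativeImputer as its estimator, which then predicts every missing entry from the other features. A minimal sketch, not part of this example (the toy array is made up):

import numpy as np
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge

# Each missing entry is modelled as a regression on the other features,
# iterating round-robin over columns until convergence.
X = np.array([[1.0, 2.0], [3.0, np.nan], [5.0, 6.0], [np.nan, 8.0]])
imputer = IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=0)
print(imputer.fit_transform(X))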
@@ -62,28 +62,39 @@
 from sklearn.model_selection import cross_val_score
 from sklearn.neighbors import KNeighborsRegressor
 from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import RobustScaler

 N_SPLITS = 5

-rng = np.random.RandomState(0)
-
 X_full, y_full = fetch_california_housing(return_X_y=True)
 # ~2k samples is enough for the purpose of the example.
 # Remove the following two lines for a slower run with different error bars.
 X_full = X_full[::10]
 y_full = y_full[::10]
 n_samples, n_features = X_full.shape

+
+def compute_score_for(X, y, imputer=None):
+    # We scale data before imputation and training a target estimator,
+    # because our target estimator and some of the imputers assume
+    # that the features have similar scales.
+    if imputer is None:
+        estimator = make_pipeline(RobustScaler(), BayesianRidge())
+    else:
+        estimator = make_pipeline(RobustScaler(), imputer, BayesianRidge())
+    return cross_val_score(
+        estimator, X, y, scoring="neg_mean_squared_error", cv=N_SPLITS
+    )
+
+
 # Estimate the score on the entire dataset, with no missing values
-br_estimator = BayesianRidge()
 score_full_data = pd.DataFrame(
-    cross_val_score(
-        br_estimator, X_full, y_full, scoring="neg_mean_squared_error", cv=N_SPLITS
-    ),
+    compute_score_for(X_full, y_full),
     columns=["Full Data"],
 )

 # Add a single missing value to each row
+rng = np.random.RandomState(0)
 X_missing = X_full.copy()
 y_missing = y_full
 missing_samples = np.arange(n_samples)
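The new compute_score_for helper scales the features with RobustScaler before imputation and regression, since BayesianRidge and some imputers work best when features share a similar scale (the comment in the hunk says as much). A quick sketch of what that scaler does, with made-up numbers:

import numpy as np
from sklearn.preprocessing import RobustScaler

# RobustScaler subtracts the per-feature median and divides by the interquartile
# range, so a single outlier barely shifts the bulk of the transformed values.
X = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])
print(RobustScaler().fit_transform(X).ravel())
# median = 3, IQR = 2  ->  [-1.0, -0.5, 0.0, 0.5, 48.5]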
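The hunk ends right after missing_samples is defined, before the missing entries are actually written. A hedged sketch of how one missing value per row can be injected; the random column choice below is illustrative, not taken from this patch:

import numpy as np

rng = np.random.RandomState(0)
n_samples, n_features = 5, 3
X_missing = rng.rand(n_samples, n_features)

# For every row, pick one feature at random and blank it out with NaN.
missing_samples = np.arange(n_samples)
missing_features = rng.choice(n_features, n_samples, replace=True)
X_missing[missing_samples, missing_features] = np.nan
print(X_missing)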
@@ -93,48 +104,52 @@
 # Estimate the score after imputation (mean and median strategies)
 score_simple_imputer = pd.DataFrame()
 for strategy in ("mean", "median"):
-    estimator = make_pipeline(
-        SimpleImputer(missing_values=np.nan, strategy=strategy), br_estimator
-    )
-    score_simple_imputer[strategy] = cross_val_score(
-        estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS
+    score_simple_imputer[strategy] = compute_score_for(
+        X_missing, y_missing, SimpleImputer(strategy=strategy)
     )

 # Estimate the score after iterative imputation of the missing values
 # with different estimators
-estimators = [
-    BayesianRidge(),
-    RandomForestRegressor(
-        # We tuned the hyperparameters of the RandomForestRegressor to get a good
-        # enough predictive performance for a restricted execution time.
-        n_estimators=4,
-        max_depth=10,
-        bootstrap=True,
-        max_samples=0.5,
-        n_jobs=2,
-        random_state=0,
+named_estimators = [
+    ("Bayesian Ridge", BayesianRidge()),
+    (
+        "Random Forest",
+        RandomForestRegressor(
+            # We tuned the hyperparameters of the RandomForestRegressor to get a good
+            # enough predictive performance for a restricted execution time.
+            n_estimators=5,
+            max_depth=10,
+            bootstrap=True,
+            max_samples=0.5,
+            n_jobs=2,
+            random_state=0,
+        ),
     ),
-    make_pipeline(
-        Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e3)
+    (
+        "Nystroem + Ridge",
+        make_pipeline(
+            Nystroem(kernel="polynomial", degree=2, random_state=0), Ridge(alpha=1e4)
+        ),
+    ),
+    (
+        "k-NN",
+        KNeighborsRegressor(n_neighbors=10),
     ),
-    KNeighborsRegressor(n_neighbors=15),
 ]
 score_iterative_imputer = pd.DataFrame()
-# iterative imputer is sensible to the tolerance and
+# Iterative imputer is sensitive to the tolerance and
 # dependent on the estimator used internally.
-# we tuned the tolerance to keep this example run with limited computational
+# We tuned the tolerance to keep this example run with limited computational
 # resources while not changing the results too much compared to keeping the
 # stricter default value for the tolerance parameter.
 tolerances = (1e-3, 1e-1, 1e-1, 1e-2)
-for impute_estimator, tol in zip(estimators, tolerances):
-    estimator = make_pipeline(
+for (name, impute_estimator), tol in zip(named_estimators, tolerances):
+    score_iterative_imputer[name] = compute_score_for(
+        X_missing,
+        y_missing,
         IterativeImputer(
-            random_state=0, estimator=impute_estimator, max_iter=25, tol=tol
+            random_state=0, estimator=impute_estimator, max_iter=40, tol=tol
         ),
-        br_estimator,
-    )
-    score_iterative_imputer[impute_estimator.__class__.__name__] = cross_val_score(
-        estimator, X_missing, y_missing, scoring="neg_mean_squared_error", cv=N_SPLITS
     )

 scores = pd.concat(
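The excerpt stops inside the final pd.concat call. The three score DataFrames hold negative mean squared errors from cross_val_score (one row per fold, one column per method); a hedged sketch of how such results are commonly combined and plotted, with stand-in data and made-up column names rather than the patch's actual keys:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Stand-in for the concatenated cross-validation results: values are negative MSE,
# as returned by scoring="neg_mean_squared_error".
rng = np.random.RandomState(0)
scores = pd.DataFrame(
    -rng.uniform(0.5, 1.5, size=(5, 3)),
    columns=["Full Data", "SimpleImputer (mean)", "IterativeImputer (Bayesian Ridge)"],
)

# Negate back to MSE and draw horizontal bars with the fold-to-fold spread.
fig, ax = plt.subplots(figsize=(10, 4))
(-scores.mean()).plot.barh(xerr=scores.std(), ax=ax)
ax.set_xlabel("MSE (smaller is better)")
fig.tight_layout()
plt.show()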