"""
==========================================
Target Encoder's Internal Cross Validation
==========================================

.. currentmodule:: sklearn.preprocessing

The :class:`TargetEncoder` replaces each category of a categorical feature with
the mean of the target variable for that category. This method is useful
in cases where there is a strong relationship between the categorical feature
and the target. To prevent overfitting, :meth:`TargetEncoder.fit_transform` uses
internal cross validation to encode the training data to be used by a downstream
model. In this example, we demonstrate the importance of the cross validation
procedure to prevent overfitting.
"""
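# %%
# As a quick illustration of the idea (a minimal sketch on a toy table, not part of
# the dataset built below), target encoding replaces each category by the mean of
# the target values observed for that category:
import pandas as pd

toy = pd.DataFrame(
    {"cat": ["a", "a", "b", "b", "c"], "target": [1.0, 3.0, 0.0, 2.0, 5.0]}
)
# Mean target per category: "a" -> 2.0, "b" -> 1.0, "c" -> 5.0
print(toy.groupby("cat")["target"].mean())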
# %%
# Create Synthetic Dataset
# ========================
# For this example, we build a dataset with three categorical features: an informative
# feature with medium cardinality, an uninformative feature with medium cardinality,
# and an uninformative feature with high cardinality. First, we generate the
# informative feature:
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np

n_samples = 50_000

rng = np.random.RandomState(42)
y = rng.randn(n_samples)
noise = 0.5 * rng.randn(n_samples)
n_categories = 100

kbins = KBinsDiscretizer(
    n_bins=n_categories, encode="ordinal", strategy="uniform", random_state=rng
)
X_informative = kbins.fit_transform((y + noise).reshape(-1, 1))

# Remove the linear relationship between y and the bin index by permuting the values of
# X_informative
permuted_categories = rng.permutation(n_categories)
X_informative = permuted_categories[X_informative.astype(np.int32)]
# %%
# The uninformative feature with medium cardinality is generated by permuting the
# informative feature and removing the relationship with the target:
X_shuffled = rng.permutation(X_informative)
# %%
# The uninformative feature with high cardinality is generated so that it is
# independent of the target variable. We will show that target encoding without cross
# validation will cause catastrophic overfitting for the downstream regressor. These
# high cardinality features are basically unique identifiers for samples, which should
# generally be removed from machine learning datasets. In this example, we generate
# them to show how :class:`TargetEncoder`'s default cross validation behavior
# mitigates the overfitting issue automatically.
X_near_unique_categories = rng.choice(
    int(0.9 * n_samples), size=n_samples, replace=True
).reshape(-1, 1)
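# %%
# A quick check (illustrative only): the high cardinality feature has tens of
# thousands of distinct values for the 50,000 samples, so most categories contain only
# one or two samples and the feature behaves almost like a row identifier.
print("number of samples:", n_samples)
print(
    "number of distinct near-unique categories:",
    np.unique(X_near_unique_categories).size,
)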
# %%
# Finally, we assemble the dataset and perform a train test split:
from sklearn.model_selection import train_test_split
import pandas as pd

X = pd.DataFrame(
    np.concatenate(
        [X_informative, X_shuffled, X_near_unique_categories],
        axis=1,
    ),
    columns=["informative", "shuffled", "near_unique"],
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# %%
# Training a Ridge Regressor
# ==========================
# In this section, we train a ridge regressor on the dataset with and without
# encoding and explore the influence of the target encoder with and without the
# internal cross validation. First, we see that the Ridge model trained on the
# raw features has low performance, because the order of the informative
# feature is not informative:
from sklearn.linear_model import Ridge
import sklearn

# Configure transformers to always output DataFrames
sklearn.set_config(transform_output="pandas")

ridge = Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False)

raw_model = ridge.fit(X_train, y_train)
print("Raw Model score on training set: ", raw_model.score(X_train, y_train))
print("Raw Model score on test set: ", raw_model.score(X_test, y_test))
# %%
# Next, we create a pipeline with the target encoder and ridge model. The pipeline
# uses :meth:`TargetEncoder.fit_transform` which uses cross validation. We see that
# the model fits the data well and generalizes to the test set:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import TargetEncoder

model_with_cv = make_pipeline(TargetEncoder(random_state=0), ridge)
model_with_cv.fit(X_train, y_train)
print("Model with CV on training set: ", model_with_cv.score(X_train, y_train))
print("Model with CV on test set: ", model_with_cv.score(X_test, y_test))
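# %%
# The number of folds used by this internal cross validation can be set through the
# encoder's ``cv`` parameter, which defaults to 5. The line below only sketches the
# construction; the rest of this example keeps the default:
encoder_more_folds = TargetEncoder(cv=10, random_state=0)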
# %%
# The coefficients of the linear model show that most of the weight is on the
# feature at column index 0, which is the ``informative`` feature:
import matplotlib.pyplot as plt

plt.rcParams["figure.constrained_layout.use"] = True

coefs_cv = pd.Series(
    model_with_cv[-1].coef_, index=model_with_cv[-1].feature_names_in_
).sort_values()
_ = coefs_cv.plot(kind="barh")
# %%
# While :meth:`TargetEncoder.fit_transform` uses internal cross validation,
# :meth:`TargetEncoder.transform` itself does not perform any cross validation.
# It uses the aggregation of the complete training set to transform the categorical
# features. Thus, we can use :meth:`TargetEncoder.fit` followed by
# :meth:`TargetEncoder.transform` to disable the cross validation. This encoding
# is then passed to the ridge model.
target_encoder = TargetEncoder(random_state=0)
target_encoder.fit(X_train, y_train)
X_train_no_cv_encoding = target_encoder.transform(X_train)
X_test_no_cv_encoding = target_encoder.transform(X_test)

model_no_cv = ridge.fit(X_train_no_cv_encoding, y_train)
# %%
# We evaluate the model on the non-cross validated encoding and see that it overfits:
print(
    "Model without CV on training set: ",
    model_no_cv.score(X_train_no_cv_encoding, y_train),
)
print(
    "Model without CV on test set: ", model_no_cv.score(X_test_no_cv_encoding, y_test)
)
# %%
# The ridge model overfits because it assigns more weight to the extremely high
# cardinality feature relative to the informative feature.
coefs_no_cv = pd.Series(
    model_no_cv.coef_, index=model_no_cv.feature_names_in_
).sort_values()
_ = coefs_no_cv.plot(kind="barh")
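# %%
# A numeric view of the same effect (illustrative): printing the coefficients shows
# how strongly the ``near_unique`` feature dominates the ``informative`` one when the
# encoding is produced without cross validation.
print(coefs_no_cv)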
# %%
# Conclusion
# ==========
# This example demonstrates the importance of :class:`TargetEncoder`'s internal cross
# validation. It is important to use :meth:`TargetEncoder.fit_transform` to encode
# training data before passing it to a machine learning model. When a
# :class:`TargetEncoder` is a part of a :class:`~sklearn.pipeline.Pipeline` and the
# pipeline is fitted, the pipeline will correctly call
# :meth:`TargetEncoder.fit_transform` and pass the encoding along.
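# %%
# As a final illustration (a minimal sketch, not part of the analysis above), nesting
# the encoder and the model in one pipeline also keeps model evaluation free of target
# leakage: every fit performed inside :func:`~sklearn.model_selection.cross_val_score`
# calls :meth:`TargetEncoder.fit_transform` on the corresponding training fold only.
from sklearn.model_selection import cross_val_score

pipeline = make_pipeline(
    TargetEncoder(random_state=0),
    Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False),
)
print("Cross-validated R^2 scores:", cross_val_score(pipeline, X, y, cv=3))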