Commit b948fdb

DOC Improve target encoder User Guide (scikit-learn#26643)

1 parent 7af0a18 · commit b948fdb
4 files changed: +45 −39 lines

doc/modules/preprocessing.rst
Lines changed: 26 additions & 20 deletions
@@ -886,10 +886,11 @@ binary classification target, the target encoding is given by:
     S_i = \lambda_i\frac{n_{iY}}{n_i} + (1 - \lambda_i)\frac{n_Y}{n}
 
 where :math:`S_i` is the encoding for category :math:`i`, :math:`n_{iY}` is the
-number of observations with :math:`Y=1` with category :math:`i`, :math:`n_i` is
+number of observations with :math:`Y=1` and category :math:`i`, :math:`n_i` is
 the number of observations with category :math:`i`, :math:`n_Y` is the number of
 observations with :math:`Y=1`, :math:`n` is the number of observations, and
-:math:`\lambda_i` is a shrinkage factor. The shrinkage factor is given by:
+:math:`\lambda_i` is a shrinkage factor for category :math:`i`. The shrinkage
+factor is given by:
 
 .. math::
     \lambda_i = \frac{n_i}{m + n_i}
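
Editor's aside (not part of the commit): a quick numeric sanity check of the two formulas above, with made-up counts — a category `i` seen `n_i = 30` times, of which `n_iY = 24` have `Y=1`, in a dataset of `n = 1000` observations with `n_Y = 500` positives, and smoothing parameter `m = 10`:

```python
# Hand-computing the shrunk encoding S_i from the User Guide's formulas.
# All counts below are invented for illustration.
n_i, n_iY = 30, 24      # observations of category i, and those with Y=1
n, n_Y = 1000, 500      # total observations, and those with Y=1
m = 10                  # smoothing parameter

lam = n_i / (m + n_i)                              # shrinkage factor = 0.75
S_i = lam * (n_iY / n_i) + (1 - lam) * (n_Y / n)   # 0.75*0.8 + 0.25*0.5
print(lam, S_i)                                    # 0.75 0.725
```

The rarer the category (small `n_i`), the smaller :math:`\lambda_i`, so the encoding is pulled further toward the global mean :math:`n_Y/n`.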
@@ -906,31 +907,36 @@ For continuous targets, the formulation is similar to binary classification:
 .. math::
     S_i = \lambda_i\frac{\sum_{k\in L_i}Y_k}{n_i} + (1 - \lambda_i)\frac{\sum_{k=1}^{n}Y_k}{n}
 
-where :math:`L_i` is the set of observations for which :math:`X=X_i` and
-:math:`n_i` is the cardinality of :math:`L_i`.
-
-:meth:`~TargetEncoder.fit_transform` internally relies on a cross validation
-scheme to prevent information from the target from leaking into the train-time
-representation for non-informative high-cardinality categorical variables and
-help prevent the downstream model to overfit spurious correlations. Note that
-as a result, `fit(X, y).transform(X)` does not equal `fit_transform(X, y)`. In
-:meth:`~TargetEncoder.fit_transform`, the training data is split into multiple
-folds and encodes each fold by using the encodings trained on the other folds.
-After cross validation is complete in :meth:`~TargetEncoder.fit_transform`, the
-target encoder learns one final encoding on the whole training set. This final
-encoding is used to encode categories in :meth:`~TargetEncoder.transform`. The
-following diagram shows the cross validation scheme in
-:meth:`~TargetEncoder.fit_transform` with the default `cv=5`:
+where :math:`L_i` is the set of observations with category :math:`i` and
+:math:`n_i` is the number of observations with category :math:`i`.
+
+:meth:`~TargetEncoder.fit_transform` internally relies on a cross fitting
+scheme to prevent target information from leaking into the train-time
+representation, especially for non-informative high-cardinality categorical
+variables, and help prevent the downstream model from overfitting spurious
+correlations. Note that as a result, `fit(X, y).transform(X)` does not equal
+`fit_transform(X, y)`. In :meth:`~TargetEncoder.fit_transform`, the training
+data is split into *k* folds (determined by the `cv` parameter) and encodes each
+fold using the encodings trained on the other *k-1* folds. The following diagram
+shows the cross fitting scheme in :meth:`~TargetEncoder.fit_transform` with
+the default `cv=5`:
 
 .. image:: ../images/target_encoder_cross_validation.svg
    :width: 600
    :align: center
 
-The :meth:`~TargetEncoder.fit` method does **not** use any cross validation
+:meth:`~TargetEncoder.fit_transform` also learns a 'full data' encoding using
+the whole training set. This is never used in
+:meth:`~TargetEncoder.fit_transform` but is saved to the attribute `encodings_`,
+for use when :meth:`~TargetEncoder.transform` is called. Note that the encodings
+learned for each fold during the cross fitting scheme are not saved to an
+attribute.
+
+The :meth:`~TargetEncoder.fit` method does **not** use any cross fitting
 schemes and learns one encoding on the entire training set, which is used to
 encode categories in :meth:`~TargetEncoder.transform`.
-:meth:`~TargetEncoder.fit`'s one encoding is the same as the final encoding
-learned in :meth:`~TargetEncoder.fit_transform`.
+This encoding is the same as the 'full data'
+encoding learned in :meth:`~TargetEncoder.fit_transform`.
 
 .. note::
     :class:`TargetEncoder` considers missing values, such as `np.nan` or `None`,
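
Editor's aside: the behavioral difference this hunk documents is easy to observe directly. A minimal sketch (toy data made up, not code from the commit) showing that `fit_transform(X, y)` (cross fitting) and the 'full data' encoding applied by `transform` generally disagree on the training data:

```python
import numpy as np
from sklearn.preprocessing import TargetEncoder

rng = np.random.default_rng(0)
X = rng.choice(["a", "b", "c"], size=(100, 1))  # one categorical feature
y = rng.integers(0, 2, size=100)                # binary target

enc = TargetEncoder(cv=5, random_state=0)
X_cross_fit = enc.fit_transform(X, y)   # per-fold encodings (cross fitting)
X_full = enc.transform(X)               # 'full data' encoding from encodings_

print(np.allclose(X_cross_fit, X_full))  # typically False
print(enc.encodings_)                     # the saved 'full data' encodings
```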

examples/preprocessing/plot_target_encoder.py
Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@
 
 .. note::
     `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
-    cross-validation scheme is used in `fit_transform` for encoding. See the
+    cross fitting scheme is used in `fit_transform` for encoding. See the
     :ref:`User Guide <target_encoder>`. for details.
 """

examples/preprocessing/plot_target_encoder_cross_val.py
Lines changed: 14 additions & 14 deletions
@@ -1,16 +1,16 @@
 """
-==========================================
-Target Encoder's Internal Cross Validation
-==========================================
+=======================================
+Target Encoder's Internal Cross fitting
+=======================================
 
 .. currentmodule:: sklearn.preprocessing
 
 The :class:`TargetEnocoder` replaces each category of a categorical feature with
 the mean of the target variable for that category. This method is useful
 in cases where there is a strong relationship between the categorical feature
 and the target. To prevent overfitting, :meth:`TargetEncoder.fit_transform` uses
-interval cross validation to encode the training data to be used by a downstream
-model. In this example, we demonstrate the importance of the cross validation
+an internal cross fitting scheme to encode the training data to be used by a
+downstream model. In this example, we demonstrate the importance of the cross fitting
 procedure to prevent overfitting.
 """
@@ -49,11 +49,11 @@
 
 # %%
 # The uninformative feature with high cardinality is generated so that is independent of
-# the target variable. We will show that target encoding without cross validation will
+# the target variable. We will show that target encoding without cross fitting will
 # cause catastrophic overfitting for the downstream regressor. These high cardinality
 # features are basically unique identifiers for samples which should generally be
 # removed from machine learning dataset. In this example, we generate them to show how
-# :class:`TargetEncoder`'s default cross validation behavior mitigates the overfitting
+# :class:`TargetEncoder`'s default cross fitting behavior mitigates the overfitting
 # issue automatically.
 X_near_unique_categories = rng.choice(
     int(0.9 * n_samples), size=n_samples, replace=True
@@ -79,7 +79,7 @@
 # ==========================
 # In this section, we train a ridge regressor on the dataset with and without
 # encoding and explore the influence of target encoder with and without the
-# interval cross validation. First, we see the Ridge model trained on the
+# internal cross fitting. First, we see the Ridge model trained on the
 # raw features will have low performance, because the order of the informative
 # feature is not informative:
 import sklearn
@@ -96,7 +96,7 @@
 
 # %%
 # Next, we create a pipeline with the target encoder and ridge model. The pipeline
-# uses :meth:`TargetEncoder.fit_transform` which uses cross validation. We see that
+# uses :meth:`TargetEncoder.fit_transform` which uses cross fitting. We see that
 # the model fits the data well and generalizes to the test set:
 from sklearn.pipeline import make_pipeline
 from sklearn.preprocessing import TargetEncoder
@@ -120,11 +120,11 @@
 _ = coefs_cv.plot(kind="barh")
 
 # %%
-# While :meth:`TargetEncoder.fit_transform` uses an interval cross validation,
-# :meth:`TargetEncoder.transform` itself does not perform any cross validation.
+# While :meth:`TargetEncoder.fit_transform` uses an internal cross fitting scheme,
+# :meth:`TargetEncoder.transform` itself does not perform any cross fitting.
 # It uses the aggregation of the complete training set to transform the categorical
 # features. Thus, we can use :meth:`TargetEncoder.fit` followed by
-# :meth:`TargetEncoder.transform` to disable the cross validation. This encoding
+# :meth:`TargetEncoder.transform` to disable the cross fitting. This encoding
 # is then passed to the ridge model.
 target_encoder = TargetEncoder(random_state=0)
 target_encoder.fit(X_train, y_train)
@@ -154,8 +154,8 @@
 # %%
 # Conclusion
 # ==========
-# This example demonstrates the importance of :class:`TargetEncoder`'s interval cross
-# validation. It is important to use :meth:`TargetEncoder.fit_transform` to encode
+# This example demonstrates the importance of :class:`TargetEncoder`'s internal cross
+# fitting. It is important to use :meth:`TargetEncoder.fit_transform` to encode
 # training data before passing it to a machine learning model. When a
 # :class:`TargetEncoder` is a part of a :class:`~sklearn.pipeline.Pipeline` and the
 # pipeline is fitted, the pipeline will correctly call
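
Editor's aside: for readers skimming the diff, the pipeline pattern the conclusion refers to looks roughly like this (a sketch with synthetic data, not code from the commit) — fitting the pipeline calls `TargetEncoder.fit_transform`, so the Ridge model is trained on cross-fitted encodings:

```python
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import TargetEncoder

rng = np.random.default_rng(0)
X = rng.choice(list("abcdefgh"), size=(500, 1))  # categorical feature
y = rng.normal(size=500)                         # continuous target

# Pipeline.fit calls TargetEncoder.fit_transform (cross fitting) on the
# training data before the encoded values reach Ridge.
model = make_pipeline(TargetEncoder(random_state=0), Ridge())
model.fit(X, y)
print(model.score(X, y))
```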

sklearn/preprocessing/_target_encoder.py
Lines changed: 4 additions & 4 deletions
@@ -27,7 +27,7 @@ class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
 
     .. note::
         `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
-        cross-validation scheme is used in `fit_transform` for encoding. See the
+        cross fitting scheme is used in `fit_transform` for encoding. See the
         :ref:`User Guide <target_encoder>` for details.
 
     .. versionadded:: 1.3
@@ -68,7 +68,7 @@ class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
         If `"auto"`, then `smooth` is set to an empirical Bayes estimate.
 
     cv : int, default=5
-        Determines the number of folds in the cross-validation strategy used in
+        Determines the number of folds in the cross fitting strategy used in
        :meth:`fit_transform`. For classification targets, `StratifiedKFold` is used
        and for continuous targets, `KFold` is used.
 
@@ -204,7 +204,7 @@ def fit_transform(self, X, y):
 
        .. note::
            `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
-            cross-validation scheme is used in `fit_transform` for encoding. See the
+            cross fitting scheme is used in `fit_transform` for encoding. See the
            :ref:`User Guide <target_encoder>`. for details.
 
        Parameters
@@ -260,7 +260,7 @@ def transform(self, X):
 
        .. note::
            `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
-            cross-validation scheme is used in `fit_transform` for encoding. See the
+            cross fitting scheme is used in `fit_transform` for encoding. See the
            :ref:`User Guide <target_encoder>`. for details.
 
        Parameters
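
Editor's aside: a small usage sketch of the `cv` parameter described in this docstring (illustrative only) — with a classification target the folds are stratified (`StratifiedKFold`), with a continuous target plain `KFold` is used:

```python
from sklearn.preprocessing import TargetEncoder

# 10-fold cross fitting instead of the default cv=5; shuffle and random_state
# control how the folds are drawn in fit_transform.
enc = TargetEncoder(cv=10, shuffle=True, random_state=0)
```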
