Commit 8ccaf0d

StefanieSenger, ArturoAmorQ, adrinjalali, and ogrisel authored
DOC Add links to preprocessing examples in docstrings and userguide (scikit-learn#26877)
Co-authored-by: Arturo Amor <86408019+ArturoAmorQ@users.noreply.github.com>
Co-authored-by: Adrin Jalali <adrin.jalali@gmail.com>
Co-authored-by: Olivier Grisel <olivier.grisel@ensta.org>
1 parent 4906ce5 commit 8ccaf0d

File tree

6 files changed: +78 −50 lines changed

doc/modules/preprocessing.rst
examples/preprocessing/plot_all_scaling.py
sklearn/preprocessing/_data.py
sklearn/preprocessing/_discretization.py
sklearn/preprocessing/_encoders.py
sklearn/preprocessing/_target_encoder.py

doc/modules/preprocessing.rst

Lines changed: 4 additions & 3 deletions
@@ -10,9 +10,10 @@ The ``sklearn.preprocessing`` package provides several common
 utility functions and transformer classes to change raw feature vectors
 into a representation that is more suitable for the downstream estimators.
 
-In general, learning algorithms benefit from standardization of the data set. If
-some outliers are present in the set, robust scalers or transformers are more
-appropriate. The behaviors of the different scalers, transformers, and
+In general, many learning algorithms such as linear models benefit from standardization of the data set
+(see :ref:`sphx_glr_auto_examples_preprocessing_plot_scaling_importance.py`).
+If some outliers are present in the set, robust scalers or other transformers can
+be more appropriate. The behaviors of the different scalers, transformers, and
 normalizers on a dataset containing marginal outliers is highlighted in
 :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.

examples/preprocessing/plot_all_scaling.py

Lines changed: 14 additions & 0 deletions
@@ -265,6 +265,8 @@ def make_plot(item_idx):
 make_plot(0)
 
 # %%
+# .. _plot_all_scaling_standard_scaler_section:
+#
 # StandardScaler
 # --------------
 #
@@ -285,6 +287,8 @@ def make_plot(item_idx):
 make_plot(1)
 
 # %%
+# .. _plot_all_scaling_minmax_scaler_section:
+#
 # MinMaxScaler
 # ------------
 #
@@ -301,6 +305,8 @@ def make_plot(item_idx):
 make_plot(2)
 
 # %%
+# .. _plot_all_scaling_max_abs_scaler_section:
+#
 # MaxAbsScaler
 # ------------
 #
@@ -318,6 +324,8 @@ def make_plot(item_idx):
 make_plot(3)
 
 # %%
+# .. _plot_all_scaling_robust_scaler_section:
+#
 # RobustScaler
 # ------------
 #
@@ -335,6 +343,8 @@ def make_plot(item_idx):
 make_plot(4)
 
 # %%
+# .. _plot_all_scaling_power_transformer_section:
+#
 # PowerTransformer
 # ----------------
 #
@@ -353,6 +363,8 @@ def make_plot(item_idx):
 make_plot(6)
 
 # %%
+# .. _plot_all_scaling_quantile_transformer_section:
+#
 # QuantileTransformer (uniform output)
 # ------------------------------------
 #
@@ -384,6 +396,8 @@ def make_plot(item_idx):
 make_plot(8)
 
 # %%
+# .. _plot_all_scaling_normalizer_section:
+#
 # Normalizer
 # ----------
 #
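Editor's note: the labels added above let docstrings deep-link into individual sections of this example. What those sections visualize can be reproduced in miniature; the toy column below, with one marginal outlier, is an illustrative assumption:

import numpy as np
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

X = np.array([[1.0], [2.0], [3.0], [4.0], [100.0]])  # one marginal outlier
for scaler in (StandardScaler(), MinMaxScaler(), RobustScaler()):
    print(type(scaler).__name__, scaler.fit_transform(X).ravel().round(2))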

sklearn/preprocessing/_data.py

Lines changed: 41 additions & 46 deletions
@@ -191,8 +191,7 @@ def scale(X, *, axis=0, with_mean=True, with_std=True, copy=True):
 affect model performance.
 
 For a comparison of the different scalers, transformers, and normalizers,
-see :ref:`examples/preprocessing/plot_all_scaling.py
-<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
+see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
 
 .. warning:: Risk of data leak
@@ -294,6 +293,12 @@ class MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 This transformation is often used as an alternative to zero mean,
 unit variance scaling.
 
+`MinMaxScaler` doesn't reduce the effect of outliers, but it linearly
+scales them down into a fixed range, where the largest occurring data point
+corresponds to the maximum value and the smallest one corresponds to the
+minimum value. For an example visualization, refer to :ref:`Compare
+MinMaxScaler with other scalers <plot_all_scaling_minmax_scaler_section>`.
+
 Read more in the :ref:`User Guide <preprocessing_scaler>`.
 
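Editor's note: a minimal sketch of the behavior the new paragraph describes — the outlier becomes the maximum (1.0) and the inliers are squeezed toward 0. The toy column is an illustrative assumption:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

X = np.array([[1.0], [2.0], [3.0], [1000.0]])
print(MinMaxScaler().fit_transform(X).ravel().round(3))
# [0.    0.001 0.002 1.   ]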
Parameters
@@ -367,10 +372,6 @@ class MinMaxScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 NaNs are treated as missing values: disregarded in fit, and maintained in
 transform.
 
-For a comparison of the different scalers, transformers, and normalizers,
-see :ref:`examples/preprocessing/plot_all_scaling.py
-<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
-
 Examples
 --------
 >>> from sklearn.preprocessing import MinMaxScaler
@@ -641,8 +642,7 @@ def minmax_scale(X, feature_range=(0, 1), *, axis=0, copy=True):
 Notes
 -----
 For a comparison of the different scalers, transformers, and normalizers,
-see :ref:`examples/preprocessing/plot_all_scaling.py
-<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
+see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
 """
 # Unlike the scaler object, this function allows 1d input.
 # If copy is required, it will be done inside the scaler object.
@@ -695,6 +695,11 @@ class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 than others, it might dominate the objective function and make the
 estimator unable to learn from other features correctly as expected.
 
+`StandardScaler` is sensitive to outliers, and the features may scale
+differently from each other in the presence of outliers. For an example
+visualization, refer to :ref:`Compare StandardScaler with other scalers
+<plot_all_scaling_standard_scaler_section>`.
+
 This scaler can also be applied to sparse CSR or CSC matrices by passing
 `with_mean=False` to avoid breaking the sparsity structure of the data.
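Editor's note: a sketch of the outlier sensitivity the new paragraph warns about — a single extreme value drags both the fitted mean and the scale. The toy column is an illustrative assumption:

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[1.0], [2.0], [3.0], [1000.0]])
scaler = StandardScaler().fit(X)
print(scaler.mean_, scaler.scale_)            # both dominated by 1000.0
print(scaler.transform(X).ravel().round(2))   # inliers end up compressed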
@@ -776,10 +781,6 @@ class StandardScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 `numpy.std(x, ddof=0)`. Note that the choice of `ddof` is unlikely to
 affect model performance.
 
-For a comparison of the different scalers, transformers, and normalizers,
-see :ref:`examples/preprocessing/plot_all_scaling.py
-<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
-
 Examples
 --------
 >>> from sklearn.preprocessing import StandardScaler
@@ -1093,6 +1094,10 @@ class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 
 This scaler can also be applied to sparse CSR or CSC matrices.
 
+`MaxAbsScaler` doesn't reduce the effect of outliers; it only linearly
+scales them down. For an example visualization, refer to :ref:`Compare
+MaxAbsScaler with other scalers <plot_all_scaling_max_abs_scaler_section>`.
+
 .. versionadded:: 0.17
 
 Parameters
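Editor's note: a sketch of the new paragraph — each column is divided by its maximum absolute value, so outliers are only linearly scaled down and zeros stay zeros (sparsity is preserved). The toy matrix is an illustrative assumption:

import numpy as np
from sklearn.preprocessing import MaxAbsScaler

X = np.array([[-2.0, 0.0], [1.0, 0.0], [4.0, 50.0]])
print(MaxAbsScaler().fit_transform(X))  # column 0 divided by 4, column 1 by 50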
@@ -1136,10 +1141,6 @@ class MaxAbsScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 NaNs are treated as missing values: disregarded in fit, and maintained in
 transform.
 
-For a comparison of the different scalers, transformers, and normalizers,
-see :ref:`examples/preprocessing/plot_all_scaling.py
-<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
-
 Examples
 --------
 >>> from sklearn.preprocessing import MaxAbsScaler
@@ -1367,8 +1368,7 @@ def maxabs_scale(X, *, axis=0, copy=True):
 and maintained during the data transformation.
 
 For a comparison of the different scalers, transformers, and normalizers,
-see :ref:`examples/preprocessing/plot_all_scaling.py
-<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
+see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
 """
 # Unlike the scaler object, this function allows 1d input.

@@ -1411,11 +1411,13 @@ class RobustScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 set. Median and interquartile range are then stored to be used on
 later data using the :meth:`transform` method.
 
-Standardization of a dataset is a common requirement for many
-machine learning estimators. Typically this is done by removing the mean
-and scaling to unit variance. However, outliers can often influence the
-sample mean / variance in a negative way. In such cases, the median and
-the interquartile range often give better results.
+Standardization of a dataset is a common preprocessing step for many machine
+learning estimators. Typically this is done by removing the mean and
+scaling to unit variance. However, outliers can often influence the sample
+mean / variance in a negative way. In such cases, using the median and the
+interquartile range often gives better results. For an example visualization
+and comparison to other scalers, refer to :ref:`Compare RobustScaler with
+other scalers <plot_all_scaling_robust_scaler_section>`.
 
 .. versionadded:: 0.17
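Editor's note: a sketch of the rewritten paragraph — centering on the median and scaling by the interquartile range keeps the inliers on a usable scale even with an extreme value present. The data and the side-by-side comparison are illustrative assumptions:

import numpy as np
from sklearn.preprocessing import RobustScaler, StandardScaler

X = np.array([[1.0], [2.0], [3.0], [4.0], [1000.0]])
print("robust:  ", RobustScaler().fit_transform(X).ravel().round(2))
print("standard:", StandardScaler().fit_transform(X).ravel().round(2))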
@@ -1486,9 +1488,6 @@ class RobustScaler(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 
 Notes
 -----
-For a comparison of the different scalers, transformers, and normalizers,
-see :ref:`examples/preprocessing/plot_all_scaling.py
-<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
 
 https://en.wikipedia.org/wiki/Median
 https://en.wikipedia.org/wiki/Interquartile_range
@@ -1751,8 +1750,7 @@ def robust_scale(
 To avoid memory copy the caller should pass a CSR matrix.
 
 For a comparison of the different scalers, transformers, and normalizers,
-see :ref:`examples/preprocessing/plot_all_scaling.py
-<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
+see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
 
 .. warning:: Risk of data leak
@@ -1853,8 +1851,7 @@ def normalize(X, norm="l2", *, axis=1, copy=True, return_norm=False):
 Notes
 -----
 For a comparison of the different scalers, transformers, and normalizers,
-see :ref:`examples/preprocessing/plot_all_scaling.py
-<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
+see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
 """
 if axis == 0:
     sparse_format = "csc"
@@ -1924,6 +1921,9 @@ class Normalizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 of the vectors and is the base similarity metric for the Vector
 Space Model commonly used by the Information Retrieval community.
 
+For an example visualization, refer to :ref:`Compare Normalizer with other
+scalers <plot_all_scaling_normalizer_section>`.
+
 Read more in the :ref:`User Guide <preprocessing_normalization>`.
 
 Parameters
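Editor's note: unlike the scalers above, `Normalizer` works per sample (row), not per feature, which is why its `fit` is stateless. A minimal sketch with an assumed toy matrix:

import numpy as np
from sklearn.preprocessing import Normalizer

X = np.array([[3.0, 4.0], [1.0, 0.0]])
print(Normalizer(norm="l2").fit_transform(X))  # each row rescaled to unit norm
# [[0.6 0.8]
#  [1.  0. ]]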
@@ -1962,10 +1962,6 @@ class Normalizer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 :meth:`transform`, as parameter validation is only performed in
 :meth:`fit`.
 
-For a comparison of the different scalers, transformers, and normalizers,
-see :ref:`examples/preprocessing/plot_all_scaling.py
-<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
-
 Examples
 --------
 >>> from sklearn.preprocessing import Normalizer
@@ -2459,6 +2455,9 @@ class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 correlations between variables measured at the same scale but renders
 variables measured at different scales more directly comparable.
 
+For example visualizations, refer to :ref:`Compare QuantileTransformer with
+other scalers <plot_all_scaling_quantile_transformer_section>`.
+
 Read more in the :ref:`User Guide <preprocessing_transformer>`.
 
 .. versionadded:: 0.19
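Editor's note: a sketch of the non-linear mapping this docstring describes — values go through the empirical CDF, so ranks are preserved while the marginal distribution becomes uniform. Sample size, seed, and quantile count are illustrative assumptions:

import numpy as np
from sklearn.preprocessing import QuantileTransformer

rng = np.random.RandomState(0)
X = rng.lognormal(size=(1000, 1))  # heavily right-skewed input
qt = QuantileTransformer(n_quantiles=100, output_distribution="uniform")
Xt = qt.fit_transform(X)
print(Xt.min(), Xt.max())          # output spread over [0, 1]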
@@ -2536,10 +2535,6 @@ class QuantileTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 NaNs are treated as missing values: disregarded in fit, and maintained in
 transform.
 
-For a comparison of the different scalers, transformers, and normalizers,
-see :ref:`examples/preprocessing/plot_all_scaling.py
-<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
-
 Examples
 --------
 >>> import numpy as np
@@ -2988,8 +2983,7 @@ def quantile_transform(
 LogisticRegression())`.
 
 For a comparison of the different scalers, transformers, and normalizers,
-see :ref:`examples/preprocessing/plot_all_scaling.py
-<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
+see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
 
 Examples
 --------
@@ -3033,6 +3027,12 @@ class PowerTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 By default, zero-mean, unit-variance normalization is applied to the
 transformed data.
 
+For an example visualization, refer to :ref:`Compare PowerTransformer with
+other scalers <plot_all_scaling_power_transformer_section>`. To see the
+effect of Box-Cox and Yeo-Johnson transformations on different
+distributions, see:
+:ref:`sphx_glr_auto_examples_preprocessing_plot_map_data_to_normal.py`.
+
 Read more in the :ref:`User Guide <preprocessing_transformer>`.
 
 .. versionadded:: 0.20
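Editor's note: a sketch contrasting the two methods named in the new text — Box-Cox requires strictly positive input while Yeo-Johnson does not; both fit a per-feature lambda. The data is an illustrative assumption:

import numpy as np
from sklearn.preprocessing import PowerTransformer

rng = np.random.RandomState(0)
X = rng.lognormal(size=(1000, 1))  # strictly positive, skewed
for method in ("box-cox", "yeo-johnson"):
    pt = PowerTransformer(method=method).fit(X)
    print(method, "fitted lambda:", pt.lambdas_.round(3))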
@@ -3080,10 +3080,6 @@ class PowerTransformer(OneToOneFeatureMixin, TransformerMixin, BaseEstimator):
 NaNs are treated as missing values: disregarded in ``fit``, and maintained
 in ``transform``.
 
-For a comparison of the different scalers, transformers, and normalizers,
-see :ref:`examples/preprocessing/plot_all_scaling.py
-<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
-
 References
 ----------
@@ -3500,8 +3496,7 @@ def power_transform(X, method="yeo-johnson", *, standardize=True, copy=True):
 in ``transform``.
 
 For a comparison of the different scalers, transformers, and normalizers,
-see :ref:`examples/preprocessing/plot_all_scaling.py
-<sphx_glr_auto_examples_preprocessing_plot_all_scaling.py>`.
+see: :ref:`sphx_glr_auto_examples_preprocessing_plot_all_scaling.py`.
 
 References
 ----------

sklearn/preprocessing/_discretization.py

Lines changed: 9 additions & 0 deletions
@@ -55,6 +55,9 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator):
 - 'kmeans': Values in each bin have the same nearest center of a 1D
   k-means cluster.
 
+For an example of the different strategies, see:
+:ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_strategies.py`.
+
 dtype : {np.float32, np.float64}, default=None
     The desired data-type for the output. If None, output dtype is
     consistent with input dtype. Only np.float32 and np.float64 are
@@ -117,6 +120,12 @@ class KBinsDiscretizer(TransformerMixin, BaseEstimator):
 
 Notes
 -----
+
+For a visualization of discretization on different datasets, refer to
+:ref:`sphx_glr_auto_examples_preprocessing_plot_discretization_classification.py`.
+For the effect of discretization on linear models, see:
+:ref:`sphx_glr_auto_examples_preprocessing_plot_discretization.py`.
+
 In bin edges for feature ``i``, the first and last values are used only for
 ``inverse_transform``. During transform, bin edges are extended to::
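Editor's note: a sketch of the three binning strategies the docstring lists; the bin count and data are illustrative assumptions:

import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

X = np.array([[-3.0], [-1.0], [0.0], [0.5], [4.0]])
for strategy in ("uniform", "quantile", "kmeans"):
    est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy=strategy)
    print(strategy, est.fit_transform(X).ravel())  # bin index per sample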

sklearn/preprocessing/_encoders.py

Lines changed: 4 additions & 0 deletions
@@ -463,6 +463,8 @@ class OneHotEncoder(_BaseEncoder):
 instead.
 
 Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
+For a comparison of different encoders, refer to:
+:ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`.
 
 Parameters
 ----------
@@ -1243,6 +1245,8 @@ class OrdinalEncoder(OneToOneFeatureMixin, _BaseEncoder):
 a single column of integers (0 to n_categories - 1) per feature.
 
 Read more in the :ref:`User Guide <preprocessing_categorical_features>`.
+For a comparison of different encoders, refer to:
+:ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`.
 
 .. versionadded:: 0.20
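Editor's note: a sketch of the two encoders that now link to the comparison example; the toy categories are illustrative assumptions, and `sparse_output` assumes scikit-learn >= 1.2:

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

X = [["red"], ["green"], ["blue"], ["green"]]
print(OrdinalEncoder().fit_transform(X).ravel())             # one integer per category
print(OneHotEncoder(sparse_output=False).fit_transform(X))   # one column per category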

sklearn/preprocessing/_target_encoder.py

Lines changed: 6 additions & 1 deletion
@@ -23,7 +23,12 @@ class TargetEncoder(OneToOneFeatureMixin, _BaseEncoder):
 that are not seen during :meth:`fit` are encoded with the target mean, i.e.
 `target_mean_`.
 
-Read more in the :ref:`User Guide <target_encoder>`.
+For a demo of the importance of the `TargetEncoder` internal cross-fitting,
+see
+:ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder_cross_val.py`.
+For a comparison of different encoders, refer to
+:ref:`sphx_glr_auto_examples_preprocessing_plot_target_encoder.py`. Read
+more in the :ref:`User Guide <target_encoder>`.
 
 .. note::
     `fit(X, y).transform(X)` does not equal `fit_transform(X, y)` because a
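Editor's note: a sketch of the cross-fitting point made in both the new link and the note above — `fit_transform(X, y)` encodes each row with statistics from the other folds, so it differs from `fit(X, y).transform(X)`. Data, seed, and sizes are illustrative assumptions (requires scikit-learn >= 1.3 for `TargetEncoder`):

import numpy as np
from sklearn.preprocessing import TargetEncoder

rng = np.random.RandomState(0)
X = rng.choice(["a", "b", "c"], size=(100, 1))
y = rng.normal(size=100)

enc = TargetEncoder(random_state=0)
X_cross_fitted = enc.fit_transform(X, y)     # per-fold (cross-fitted) encodings
X_refit = enc.fit(X, y).transform(X)         # full-data encodings
print(np.allclose(X_cross_fitted, X_refit))  # False, by design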
