"""
==========================================
Target Encoder's Internal Cross Validation
==========================================

.. currentmodule:: sklearn.preprocessing

The :class:`TargetEncoder` replaces each category of a categorical feature with
the mean of the target variable for that category. This method is useful
in cases where there is a strong relationship between the categorical feature
and the target. To prevent overfitting, :meth:`TargetEncoder.fit_transform` uses
internal cross validation to encode the training data to be used by a downstream
model. In this example, we demonstrate the importance of the cross validation
procedure to prevent overfitting.
"""
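# %%
# As a quick illustration of the idea (a minimal sketch on a toy table, not part of
# the dataset built below), target encoding replaces each category by the mean of
# the target values observed for that category:
import pandas as pd

toy = pd.DataFrame(
    {"cat": ["a", "a", "b", "b", "c"], "target": [1.0, 3.0, 0.0, 2.0, 5.0]}
)
# Mean target per category: "a" -> 2.0, "b" -> 1.0, "c" -> 5.0
print(toy.groupby("cat")["target"].mean())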
# %%
# Create Synthetic Dataset
# ========================
# For this example, we build a dataset with three categorical features: an informative
# feature with medium cardinality, an uninformative feature with medium cardinality,
# and an uninformative feature with high cardinality. First, we generate the
# informative feature:
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np

n_samples = 50_000

rng = np.random.RandomState(42)
y = rng.randn(n_samples)
noise = 0.5 * rng.randn(n_samples)
n_categories = 100

kbins = KBinsDiscretizer(
    n_bins=n_categories, encode="ordinal", strategy="uniform", random_state=rng
)
X_informative = kbins.fit_transform((y + noise).reshape(-1, 1))

# Remove the linear relationship between y and the bin index by permuting the values of
# X_informative
permuted_categories = rng.permutation(n_categories)
X_informative = permuted_categories[X_informative.astype(np.int32)]
# %%
# The uninformative feature with medium cardinality is generated by permuting the
# informative feature and removing the relationship with the target:
X_shuffled = rng.permutation(X_informative)
# %%
# The uninformative feature with high cardinality is generated so that it is
# independent of the target variable. We will show that target encoding without cross
# validation will cause catastrophic overfitting for the downstream regressor. These
# high cardinality features are basically unique identifiers for samples, which should
# generally be removed from machine learning datasets. In this example, we generate
# them to show how :class:`TargetEncoder`'s default cross validation behavior
# mitigates the overfitting issue automatically.
X_near_unique_categories = rng.choice(
    int(0.9 * n_samples), size=n_samples, replace=True
).reshape(-1, 1)
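# %%
# A quick check (illustrative only): the high cardinality feature has tens of
# thousands of distinct values for the 50,000 samples, so most categories contain only
# one or two samples and the feature behaves almost like a row identifier.
print("number of samples:", n_samples)
print(
    "number of distinct near-unique categories:",
    np.unique(X_near_unique_categories).size,
)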
# %%
# Finally, we assemble the dataset and perform a train test split:
from sklearn.model_selection import train_test_split
import pandas as pd

X = pd.DataFrame(
    np.concatenate(
        [X_informative, X_shuffled, X_near_unique_categories],
        axis=1,
    ),
    columns=["informative", "shuffled", "near_unique"],
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
# %%
# Training a Ridge Regressor
# ==========================
# In this section, we train a ridge regressor on the dataset with and without
# encoding and explore the influence of the target encoder with and without the
# internal cross validation. First, we see that the Ridge model trained on the
# raw features has low performance, because the order of the informative
# feature is not informative:
from sklearn.linear_model import Ridge
import sklearn

# Configure transformers to always output DataFrames
sklearn.set_config(transform_output="pandas")

ridge = Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False)

raw_model = ridge.fit(X_train, y_train)
print("Raw Model score on training set: ", raw_model.score(X_train, y_train))
print("Raw Model score on test set: ", raw_model.score(X_test, y_test))
# %%
# Next, we create a pipeline with the target encoder and ridge model. The pipeline
# uses :meth:`TargetEncoder.fit_transform` which uses cross validation. We see that
# the model fits the data well and generalizes to the test set:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import TargetEncoder

model_with_cv = make_pipeline(TargetEncoder(random_state=0), ridge)
model_with_cv.fit(X_train, y_train)
print("Model with CV on training set: ", model_with_cv.score(X_train, y_train))
print("Model with CV on test set: ", model_with_cv.score(X_test, y_test))
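# %%
# The number of folds used by this internal cross validation can be set through the
# encoder's ``cv`` parameter, which defaults to 5. The line below only sketches the
# construction; the rest of this example keeps the default:
encoder_more_folds = TargetEncoder(cv=10, random_state=0)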
# %%
# The coefficients of the linear model show that most of the weight is on the
# feature at column index 0, which is the ``informative`` feature:
import matplotlib.pyplot as plt

plt.rcParams["figure.constrained_layout.use"] = True

coefs_cv = pd.Series(
    model_with_cv[-1].coef_, index=model_with_cv[-1].feature_names_in_
).sort_values()
_ = coefs_cv.plot(kind="barh")
# %%
# While :meth:`TargetEncoder.fit_transform` uses internal cross validation,
# :meth:`TargetEncoder.transform` itself does not perform any cross validation.
# It uses the aggregation of the complete training set to transform the categorical
# features. Thus, we can use :meth:`TargetEncoder.fit` followed by
# :meth:`TargetEncoder.transform` to disable the cross validation. This encoding
# is then passed to the ridge model.
target_encoder = TargetEncoder(random_state=0)
target_encoder.fit(X_train, y_train)
X_train_no_cv_encoding = target_encoder.transform(X_train)
X_test_no_cv_encoding = target_encoder.transform(X_test)

model_no_cv = ridge.fit(X_train_no_cv_encoding, y_train)
# %%
# We evaluate the model on the non-cross validated encoding and see that it overfits:
print(
    "Model without CV on training set: ",
    model_no_cv.score(X_train_no_cv_encoding, y_train),
)
print(
    "Model without CV on test set: ", model_no_cv.score(X_test_no_cv_encoding, y_test)
)
# %%
# The ridge model overfits because it assigns more weight to the extremely high
# cardinality feature relative to the informative feature.
coefs_no_cv = pd.Series(
    model_no_cv.coef_, index=model_no_cv.feature_names_in_
).sort_values()
_ = coefs_no_cv.plot(kind="barh")
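# %%
# A numeric view of the same effect (illustrative): printing the coefficients shows
# how strongly the ``near_unique`` feature dominates the ``informative`` one when the
# encoding is produced without cross validation.
print(coefs_no_cv)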
# %%
# Conclusion
# ==========
# This example demonstrates the importance of :class:`TargetEncoder`'s internal cross
# validation. It is important to use :meth:`TargetEncoder.fit_transform` to encode
# training data before passing it to a machine learning model. When a
# :class:`TargetEncoder` is a part of a :class:`~sklearn.pipeline.Pipeline` and the
# pipeline is fitted, the pipeline will correctly call
# :meth:`TargetEncoder.fit_transform` and pass the encoding along.
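# %%
# As a final illustration (a minimal sketch, not part of the analysis above), nesting
# the encoder and the model in one pipeline also keeps model evaluation free of target
# leakage: every fit performed inside :func:`~sklearn.model_selection.cross_val_score`
# calls :meth:`TargetEncoder.fit_transform` on the corresponding training fold only.
from sklearn.model_selection import cross_val_score

pipeline = make_pipeline(
    TargetEncoder(random_state=0),
    Ridge(alpha=1e-6, solver="lsqr", fit_intercept=False),
)
print("Cross-validated R^2 scores:", cross_val_score(pipeline, X, y, cv=3))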