 """
-==================================================
-Plot the decision boundaries of a VotingClassifier
-==================================================
+===============================================================
+Visualizing the probabilistic predictions of a VotingClassifier
+===============================================================

 .. currentmodule:: sklearn

-Plot the decision boundaries of a :class:`~ensemble.VotingClassifier` for two
-features of the Iris dataset.
+Plot the class probabilities predicted on a toy dataset by three different
+classifiers and averaged by the :class:`~ensemble.VotingClassifier`.

-Plot the class probabilities of the first sample in a toy dataset predicted by
-three different classifiers and averaged by the
-:class:`~ensemble.VotingClassifier`.
+First, three linear classifiers are initialized. Two are spline models with
+interaction terms, one using constant extrapolation and the other using periodic
+extrapolation. The third classifier is a :class:`~kernel_approximation.Nystroem`
+with the default "rbf" kernel.

-First, three exemplary classifiers are initialized
-(:class:`~tree.DecisionTreeClassifier`,
-:class:`~neighbors.KNeighborsClassifier`, and :class:`~svm.SVC`) and used to
-initialize a soft-voting :class:`~ensemble.VotingClassifier` with weights `[2,
-1, 2]`, which means that the predicted probabilities of the
-:class:`~tree.DecisionTreeClassifier` and :class:`~svm.SVC` each count 2 times
-as much as the weights of the :class:`~neighbors.KNeighborsClassifier`
-classifier when the averaged probability is calculated.
+In the first part of this example, these three classifiers are used to
+demonstrate soft-voting using :class:`~ensemble.VotingClassifier` with a
+weighted average. We set `weights=[2, 1, 3]`, meaning the constant extrapolation
+spline model's predictions are weighted twice as much as the periodic spline
+model's, and the Nystroem model's predictions are weighted three times as much
+as the periodic spline model's.
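+
+Concretely (with notation introduced here only for illustration), for each
+class the averaged probability is
+`(2 * p_constant + 1 * p_periodic + 3 * p_nystroem) / 6`, where `p_constant`,
+`p_periodic` and `p_nystroem` are the probabilities predicted by the three
+models above.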
+
+The second part demonstrates how soft predictions can be converted into hard
+predictions.

 """

 # Authors: The scikit-learn developers
 # SPDX-License-Identifier: BSD-3-Clause

-from itertools import product
+# %%
+# We first generate a noisy XOR dataset, which is a binary classification task.

 import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+from matplotlib.colors import ListedColormap
+
+n_samples = 500
+rng = np.random.default_rng(0)
+feature_names = ["Feature #0", "Feature #1"]
+common_scatter_plot_params = dict(
+    cmap=ListedColormap(["tab:red", "tab:blue"]),
+    edgecolor="white",
+    linewidth=1,
+)
+
+xor = pd.DataFrame(
+    np.random.RandomState(0).uniform(low=-1, high=1, size=(n_samples, 2)),
+    columns=feature_names,
+)
+noise = rng.normal(loc=0, scale=0.1, size=(n_samples, 2))
+target_xor = np.logical_xor(
+    xor["Feature #0"] + noise[:, 0] > 0, xor["Feature #1"] + noise[:, 1] > 0
+)
+
+X = xor[feature_names]
+y = target_xor.astype(np.int32)
+
+fig, ax = plt.subplots()
+ax.scatter(X["Feature #0"], X["Feature #1"], c=y, **common_scatter_plot_params)
+ax.set_title("The XOR dataset")
+plt.show()
+
+# %%
+# Because the XOR dataset is not linearly separable, tree-based models would
+# often be preferred. However, appropriate feature engineering combined with a
+# linear model can yield effective results, with the added benefit of producing
+# better-calibrated probabilities for samples located in the transition regions
+# affected by noise.
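+#
+# As a rough illustration of the first claim (an optional sketch, reusing the
+# shallow decision tree from the replaced version of this example and not part
+# of the comparison below), we can cross-validate such a baseline on this data:
+
+from sklearn.model_selection import cross_val_score
+from sklearn.tree import DecisionTreeClassifier
+
+tree_cv_scores = cross_val_score(
+    DecisionTreeClassifier(max_depth=4, random_state=0), X, y, cv=5
+)
+print(f"Decision tree mean CV accuracy: {tree_cv_scores.mean():.3f}")
+
+# %%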
+#
+# We define and fit the models on the whole dataset.

-from sklearn import datasets
 from sklearn.ensemble import VotingClassifier
-from sklearn.inspection import DecisionBoundaryDisplay
-from sklearn.neighbors import KNeighborsClassifier
-from sklearn.svm import SVC
-from sklearn.tree import DecisionTreeClassifier
-
-# Loading some example data
-iris = datasets.load_iris()
-X = iris.data[:, [0, 2]]
-y = iris.target
-
-# Training classifiers
-clf1 = DecisionTreeClassifier(max_depth=4)
-clf2 = KNeighborsClassifier(n_neighbors=7)
-clf3 = SVC(gamma=0.1, kernel="rbf", probability=True)
+from sklearn.kernel_approximation import Nystroem
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import make_pipeline
+from sklearn.preprocessing import PolynomialFeatures, SplineTransformer, StandardScaler
+
+clf1 = make_pipeline(
+    SplineTransformer(degree=2, n_knots=2),
+    PolynomialFeatures(interaction_only=True),
+    LogisticRegression(C=10),
+)
+clf2 = make_pipeline(
+    SplineTransformer(
+        degree=2,
+        n_knots=4,
+        extrapolation="periodic",
+        include_bias=True,
+    ),
+    PolynomialFeatures(interaction_only=True),
+    LogisticRegression(C=10),
+)
+clf3 = make_pipeline(
+    StandardScaler(),
+    Nystroem(gamma=2, random_state=0),
+    LogisticRegression(C=10),
+)
+weights = [2, 1, 3]
 eclf = VotingClassifier(
-    estimators=[("dt", clf1), ("knn", clf2), ("svc", clf3)],
+    estimators=[
+        ("constant splines model", clf1),
+        ("periodic splines model", clf2),
+        ("nystroem model", clf3),
+    ],
     voting="soft",
-    weights=[2, 1, 2],
+    weights=weights,
 )

 clf1.fit(X, y)
 clf2.fit(X, y)
 clf3.fit(X, y)
 eclf.fit(X, y)

-# Plotting decision regions
-f, axarr = plt.subplots(2, 2, sharex="col", sharey="row", figsize=(10, 8))
-for idx, clf, tt in zip(
+# %%
+# Finally we use :class:`~inspection.DecisionBoundaryDisplay` to plot the
+# predicted probabilities. By using a diverging colormap (such as `"RdBu"`), we
+# can ensure that darker colors correspond to `predict_proba` close to either 0
+# or 1, and white corresponds to `predict_proba` of 0.5.
+
+from itertools import product
+
+from sklearn.inspection import DecisionBoundaryDisplay
+
+fig, axarr = plt.subplots(2, 2, sharex="col", sharey="row", figsize=(10, 8))
+for idx, clf, title in zip(
     product([0, 1], [0, 1]),
     [clf1, clf2, clf3, eclf],
-    ["Decision Tree (depth=4)", "KNN (k=7)", "Kernel SVM", "Soft Voting"],
+    [
+        "Splines with\nconstant extrapolation",
+        "Splines with\nperiodic extrapolation",
+        "RBF Nystroem",
+        "Soft Voting",
+    ],
 ):
-    DecisionBoundaryDisplay.from_estimator(
-        clf, X, alpha=0.4, ax=axarr[idx[0], idx[1]], response_method="predict"
+    disp = DecisionBoundaryDisplay.from_estimator(
+        clf,
+        X,
+        response_method="predict_proba",
+        plot_method="pcolormesh",
+        cmap="RdBu",
+        alpha=0.8,
+        ax=axarr[idx[0], idx[1]],
+    )
+    axarr[idx[0], idx[1]].scatter(
+        X["Feature #0"],
+        X["Feature #1"],
+        c=y,
+        **common_scatter_plot_params,
     )
-    axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
-    axarr[idx[0], idx[1]].set_title(tt)
+    axarr[idx[0], idx[1]].set_title(title)
+    fig.colorbar(disp.surface_, ax=axarr[idx[0], idx[1]], label="Probability estimate")

 plt.show()
+
+# %%
+# As a sanity check, we can verify for a given sample that the probability
+# predicted by the :class:`~ensemble.VotingClassifier` is indeed the weighted
+# average of the individual classifiers' soft-predictions.
+#
+# In the case of binary classification such as in the present example, the
+# :term:`predict_proba` arrays contain the probability of belonging to class 0
+# (here in red) as the first entry, and the probability of belonging to class 1
+# (here in blue) as the second entry.
+
+test_sample = pd.DataFrame({"Feature #0": [-0.5], "Feature #1": [1.5]})
+predict_probas = [est.predict_proba(test_sample).ravel() for est in eclf.estimators_]
+for (est_name, _), est_probas in zip(eclf.estimators, predict_probas):
+    print(f"{est_name}'s predicted probabilities: {est_probas}")
+
+# %%
+print(
+    "Weighted average of soft-predictions: "
+    f"{np.dot(weights, predict_probas) / np.sum(weights)}"
+)
+
+# %%
+# We can see that the manual calculation of predicted probabilities above
+# matches the one produced by the `VotingClassifier`:
+
+print(
+    "Predicted probability of VotingClassifier: "
+    f"{eclf.predict_proba(test_sample).ravel()}"
+)
+
+# %%
+# To convert soft predictions into hard predictions when weights are provided,
+# the weighted average predicted probabilities are computed for each class. The
+# final class label is then the one with the highest average probability, which
+# corresponds to the default threshold of `predict_proba=0.5` in the case of
+# binary classification.
+
+print(
+    "Class with the highest weighted average of soft-predictions: "
+    f"{np.argmax(np.dot(weights, predict_probas) / np.sum(weights))}"
+)
+
+# %%
+# This is equivalent to the output of `VotingClassifier`'s `predict` method:
+
+print(f"Predicted class of VotingClassifier: {eclf.predict(test_sample).ravel()}")
+
+# %%
+# Soft votes can be thresholded as for any other probabilistic classifier. This
+# allows you to set a threshold probability at which the positive class will be
+# predicted, instead of simply selecting the class with the highest predicted
+# probability.
+
+from sklearn.model_selection import FixedThresholdClassifier
+
+eclf_other_threshold = FixedThresholdClassifier(
+    eclf, threshold=0.7, response_method="predict_proba"
+).fit(X, y)
+print(
+    "Predicted class of thresholded VotingClassifier: "
+    f"{eclf_other_threshold.predict(test_sample)}"
+)
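+
+# %%
+# As a short usage note (a sketch reusing the objects fitted above), printing
+# the averaged probability of the positive class next to both hard predictions
+# makes the effect of raising the threshold from the default 0.5 to 0.7
+# explicit: the positive class is only predicted when its averaged probability
+# clears the stricter cut-off.
+
+proba_positive_class = eclf.predict_proba(test_sample)[:, 1]
+print(f"Averaged probability of the positive class: {proba_positive_class}")
+print(f"Prediction with the default 0.5 threshold: {eclf.predict(test_sample)}")
+print(
+    "Prediction with the 0.7 threshold: "
+    f"{eclf_other_threshold.predict(test_sample)}"
+)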