 estimation of the data structure, yet accurate to some extent.
 The One-Class SVM does not assume any parametric form of the data distribution
 and can therefore model the complex shape of the data much better.
-
-First example
---------------
-The first example illustrates how the Minimum Covariance Determinant
-robust estimator can help concentrate on a relevant cluster when outlying
-points exist. Here the empirical covariance estimation is skewed by points
-outside of the main cluster. Of course, some screening tools would have pointed
-out the presence of two clusters (Support Vector Machines, Gaussian Mixture
-Models, univariate outlier detection, ...). But had it been a high-dimensional
-example, none of these could be applied that easily.
-
 """

 # Author: Virgile Fritsch <virgile.fritsch@inria.fr>
 # License: BSD 3 clause

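
Aside (not part of the diff): a minimal sketch of the contrast the docstring describes, fitting an elliptic frontier and a One-Class SVM on synthetic bimodal data. The data and parameter values below are illustrative assumptions, not taken from the example.

# Illustrative sketch: a single ellipse cannot hug two separate blobs,
# while the kernelized One-Class SVM can follow a more complex shape.
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.randn(100, 2), rng.randn(100, 2) + [6, 6]])  # two blobs

for est in (EllipticEnvelope(contamination=0.1), OneClassSVM(nu=0.1, gamma=0.5)):
    est.fit(X_demo)
    # predict() returns +1 for inliers and -1 for outliers
    print(type(est).__name__, (est.predict(X_demo) == -1).mean())
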
-import matplotlib.font_manager
-import matplotlib.pyplot as plt
-import numpy as np
-
+# %%
+# First example
+# -------------
+#
+# The first example illustrates how the Minimum Covariance Determinant
+# robust estimator can help concentrate on a relevant cluster when outlying
+# points exist. Here the empirical covariance estimation is skewed by points
+# outside of the main cluster. Of course, some screening tools would have pointed
+# out the presence of two clusters (Support Vector Machines, Gaussian Mixture
+# Models, univariate outlier detection, ...). But had it been a high-dimensional
+# example, none of these could be applied that easily.
 from sklearn.covariance import EllipticEnvelope
-from sklearn.datasets import load_wine
+from sklearn.inspection import DecisionBoundaryDisplay
 from sklearn.svm import OneClassSVM

-# Define "classifiers" to be used
-classifiers = {
+estimators = {
     "Empirical Covariance": EllipticEnvelope(support_fraction=1.0, contamination=0.25),
     "Robust Covariance (Minimum Covariance Determinant)": EllipticEnvelope(
         contamination=0.25
     ),
     "OCSVM": OneClassSVM(nu=0.25, gamma=0.35),
 }
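
Aside (not part of the diff): all three estimators above are configured to treat the same share of points as outliers. `contamination=0.25` sets the expected outlier fraction for EllipticEnvelope, and `nu=0.25` upper-bounds the fraction of training errors for OneClassSVM. A sketch on illustrative random data:

# Illustrative sketch: fit_predict labels roughly a quarter of the
# points -1 (outlier) when contamination=0.25, by construction.
import numpy as np
from sklearn.covariance import EllipticEnvelope

X_demo = np.random.RandomState(42).randn(200, 2)
labels = EllipticEnvelope(contamination=0.25).fit_predict(X_demo)
print((labels == -1).mean())  # close to 0.25
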
-colors = ["m", "g", "b"]
-legend1 = {}
-legend2 = {}
 
-# Get data
-X1 = load_wine()["data"][:, [1, 2]]  # two clusters
+# %%
+import matplotlib.lines as mlines
+import matplotlib.pyplot as plt
 
+from sklearn.datasets import load_wine
+
+X = load_wine()["data"][:, [1, 2]]  # two clusters
+
+fig, ax = plt.subplots()
+colors = ["tab:blue", "tab:orange", "tab:red"]
 # Learn a frontier for outlier detection with several classifiers
-xx1, yy1 = np.meshgrid(np.linspace(0, 6, 500), np.linspace(1, 4.5, 500))
-for i, (clf_name, clf) in enumerate(classifiers.items()):
-    plt.figure(1)
-    clf.fit(X1)
-    Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
-    Z1 = Z1.reshape(xx1.shape)
-    legend1[clf_name] = plt.contour(
-        xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i]
+legend_lines = []
+for color, (name, estimator) in zip(colors, estimators.items()):
+    estimator.fit(X)
+    DecisionBoundaryDisplay.from_estimator(
+        estimator,
+        X,
+        response_method="decision_function",
+        plot_method="contour",
+        levels=[0],
+        colors=color,
+        ax=ax,
     )
+    legend_lines.append(mlines.Line2D([], [], color=color, label=name))
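
Aside (not part of the diff): `DecisionBoundaryDisplay.from_estimator` absorbs the grid bookkeeping that the removed lines did by hand. Conceptually it is close to the following sketch (an illustration of the equivalence, not the library's actual internals):

# Sketch: build a grid over the data range, evaluate decision_function,
# and draw the zero-level contour, i.e. the learned inlier/outlier frontier.
import numpy as np

def sketch_frontier(estimator, X, ax, color, grid_resolution=100):
    x0 = np.linspace(X[:, 0].min(), X[:, 0].max(), grid_resolution)
    x1 = np.linspace(X[:, 1].min(), X[:, 1].max(), grid_resolution)
    xx, yy = np.meshgrid(x0, x1)
    Z = estimator.decision_function(np.c_[xx.ravel(), yy.ravel()])
    ax.contour(xx, yy, Z.reshape(xx.shape), levels=[0], colors=color)
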
 
-legend1_values_list = list(legend1.values())
-legend1_keys_list = list(legend1.keys())
 
-# Plot the results (= shape of the data points cloud)
-plt.figure(1)  # two clusters
-plt.title("Outlier detection on a real data set (wine recognition)")
-plt.scatter(X1[:, 0], X1[:, 1], color="black")
+ax.scatter(X[:, 0], X[:, 1], color="black")
 bbox_args = dict(boxstyle="round", fc="0.8")
 arrow_args = dict(arrowstyle="->")
-plt.annotate(
+ax.annotate(
     "outlying points",
     xy=(4, 2),
     xycoords="data",
     textcoords="data",
     xytext=(3, 1.25),
     bbox=bbox_args,
     arrowprops=arrow_args,
 )
-plt.xlim((xx1.min(), xx1.max()))
-plt.ylim((yy1.min(), yy1.max()))
-plt.legend(
-    (
-        legend1_values_list[0].collections[0],
-        legend1_values_list[1].collections[0],
-        legend1_values_list[2].collections[0],
-    ),
-    (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]),
-    loc="upper center",
-    prop=matplotlib.font_manager.FontProperties(size=11),
+ax.legend(handles=legend_lines, loc="upper center")
+_ = ax.set(
+    xlabel="malic_acid",
+    ylabel="ash",
+    title="Outlier detection on a real data set (wine recognition)",
 )
-plt.ylabel("ash")
-plt.xlabel("malic_acid")
-
-plt.show()
 
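
Aside (not part of the diff): the removed legend code reached into each contour set via `.collections[0]` to obtain a legend handle; `ContourSet.collections` is deprecated in newer Matplotlib, so the new code builds proxy artists instead. The pattern in isolation:

# Sketch of the proxy-artist pattern: an empty Line2D carries only a
# color and a label, and stands in for the contour line in the legend.
import matplotlib.lines as mlines
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
handle = mlines.Line2D([], [], color="tab:blue", label="frontier")
ax.legend(handles=[handle], loc="upper center")
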
 # %%
 # Second example
 # --------------
+#
 # The second example shows the ability of the Minimum Covariance Determinant
 # robust estimator of covariance to concentrate on the main mode of the data
 # distribution: the location seems to be well estimated, although the
 # covariance is hard to estimate due to the banana-shaped distribution. Anyway,
 # we can get rid of some outlying observations. The One-Class SVM is able to
 # capture the real data structure, but the difficulty is to adjust its kernel
 # bandwidth parameter so as to obtain a good compromise between the shape of
 # the data scatter matrix and the risk of over-fitting the data.
+X = load_wine()["data"][:, [6, 9]]  # "banana"-shaped
 
-# Get data
-X2 = load_wine()["data"][:, [6, 9]]  # "banana"-shaped
-
+fig, ax = plt.subplots()
+colors = ["tab:blue", "tab:orange", "tab:red"]
 # Learn a frontier for outlier detection with several classifiers
-xx2, yy2 = np.meshgrid(np.linspace(-1, 5.5, 500), np.linspace(-2.5, 19, 500))
-for i, (clf_name, clf) in enumerate(classifiers.items()):
-    plt.figure(2)
-    clf.fit(X2)
-    Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()])
-    Z2 = Z2.reshape(xx2.shape)
-    legend2[clf_name] = plt.contour(
-        xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i]
+legend_lines = []
+for color, (name, estimator) in zip(colors, estimators.items()):
+    estimator.fit(X)
+    DecisionBoundaryDisplay.from_estimator(
+        estimator,
+        X,
+        response_method="decision_function",
+        plot_method="contour",
+        levels=[0],
+        colors=color,
+        ax=ax,
     )
+    legend_lines.append(mlines.Line2D([], [], color=color, label=name))
 
-legend2_values_list = list(legend2.values())
-legend2_keys_list = list(legend2.keys())
-
-# Plot the results (= shape of the data points cloud)
-plt.figure(2)  # "banana" shape
-plt.title("Outlier detection on a real data set (wine recognition)")
-plt.scatter(X2[:, 0], X2[:, 1], color="black")
-plt.xlim((xx2.min(), xx2.max()))
-plt.ylim((yy2.min(), yy2.max()))
-plt.legend(
-    (
-        legend2_values_list[0].collections[0],
-        legend2_values_list[1].collections[0],
-        legend2_values_list[2].collections[0],
-    ),
-    (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]),
-    loc="upper center",
-    prop=matplotlib.font_manager.FontProperties(size=11),
+
+ax.scatter(X[:, 0], X[:, 1], color="black")
+ax.legend(handles=legend_lines, loc="upper center")
+ax.set(
+    xlabel="flavanoids",
+    ylabel="color_intensity",
+    title="Outlier detection on a real data set (wine recognition)",
 )
-plt.ylabel("color_intensity")
-plt.xlabel("flavanoids")
 
 plt.show()
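
Aside (not part of the diff): the comment block above stresses that the One-Class SVM result hinges on its kernel bandwidth. A sketch of that sensitivity on the same two features (the gamma values are illustrative; only 0.35 comes from the example):

# Illustrative sketch: a larger RBF bandwidth parameter gamma yields a
# wigglier frontier that needs more support vectors, a sign of over-fitting.
from sklearn.datasets import load_wine
from sklearn.svm import OneClassSVM

X_banana = load_wine()["data"][:, [6, 9]]
for gamma in (0.01, 0.35, 10.0):
    est = OneClassSVM(nu=0.25, gamma=gamma).fit(X_banana)
    print(f"gamma={gamma}: {len(est.support_)} support vectors")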