
Commit a3a1604

DOC simplify and remove warning in plot_outlier_detection_wine (scikit-learn#27443)
1 parent e31ba77 commit a3a1604

File tree: 1 file changed (+64 additions, -84 deletions)

examples/applications/plot_outlier_detection_wine.py

Lines changed: 64 additions & 84 deletions
@@ -21,66 +21,64 @@
 estimation of the data structure, but yet accurate to some extent.
 The One-Class SVM does not assume any parametric form of the data distribution
 and can therefore model the complex shape of the data much better.
-
-First example
--------------
-The first example illustrates how the Minimum Covariance Determinant
-robust estimator can help concentrate on a relevant cluster when outlying
-points exist. Here the empirical covariance estimation is skewed by points
-outside of the main cluster. Of course, some screening tools would have pointed
-out the presence of two clusters (Support Vector Machines, Gaussian Mixture
-Models, univariate outlier detection, ...). But had it been a high-dimensional
-example, none of these could be applied that easily.
-
 """
 
 # Author: Virgile Fritsch <virgile.fritsch@inria.fr>
 # License: BSD 3 clause
 
-import matplotlib.font_manager
-import matplotlib.pyplot as plt
-import numpy as np
-
+# %%
+# First example
+# -------------
+#
+# The first example illustrates how the Minimum Covariance Determinant
+# robust estimator can help concentrate on a relevant cluster when outlying
+# points exist. Here the empirical covariance estimation is skewed by points
+# outside of the main cluster. Of course, some screening tools would have pointed
+# out the presence of two clusters (Support Vector Machines, Gaussian Mixture
+# Models, univariate outlier detection, ...). But had it been a high-dimensional
+# example, none of these could be applied that easily.
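The skew described in that comment block is easy to check by hand. The following sketch is not part of the commit; it compares the non-robust and robust (MCD) location estimates on the same two features, using the real EmpiricalCovariance and MinCovDet estimators from sklearn.covariance:

# Aside (not in this commit): compare plain vs. robust location estimates.
from sklearn.covariance import EmpiricalCovariance, MinCovDet
from sklearn.datasets import load_wine

X = load_wine()["data"][:, [1, 2]]  # same two features as the first example

emp = EmpiricalCovariance().fit(X)
mcd = MinCovDet(random_state=0).fit(X)

# The empirical mean is pulled towards the outlying points; the MCD location
# stays on the main cluster, and support_ marks the samples MCD retained.
print("empirical location:", emp.location_)
print("robust (MCD) location:", mcd.location_)
print("samples kept by MCD:", int(mcd.support_.sum()), "of", len(X))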
 from sklearn.covariance import EllipticEnvelope
-from sklearn.datasets import load_wine
+from sklearn.inspection import DecisionBoundaryDisplay
 from sklearn.svm import OneClassSVM
 
-# Define "classifiers" to be used
-classifiers = {
+estimators = {
     "Empirical Covariance": EllipticEnvelope(support_fraction=1.0, contamination=0.25),
     "Robust Covariance (Minimum Covariance Determinant)": EllipticEnvelope(
         contamination=0.25
     ),
     "OCSVM": OneClassSVM(nu=0.25, gamma=0.35),
 }
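A note on the configuration kept above: contamination=0.25 asks EllipticEnvelope to place its frontier so that roughly a quarter of the training samples fall outside it, and support_fraction=1.0 forces the fit to use all samples, which is what turns the first entry into the non-robust empirical estimate. A minimal sketch (not from the commit) of the resulting +1/-1 labelling:

# Sketch: fitted outlier detectors label inliers +1 and outliers -1.
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import load_wine

X = load_wine()["data"][:, [1, 2]]
labels = EllipticEnvelope(contamination=0.25).fit(X).predict(X)
print("fraction flagged as outliers:", (labels == -1).mean())  # roughly 0.25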
-colors = ["m", "g", "b"]
-legend1 = {}
-legend2 = {}
 
-# Get data
-X1 = load_wine()["data"][:, [1, 2]]  # two clusters
+# %%
+import matplotlib.lines as mlines
+import matplotlib.pyplot as plt
 
+from sklearn.datasets import load_wine
+
+X = load_wine()["data"][:, [1, 2]]  # two clusters
+
+fig, ax = plt.subplots()
+colors = ["tab:blue", "tab:orange", "tab:red"]
 # Learn a frontier for outlier detection with several classifiers
-xx1, yy1 = np.meshgrid(np.linspace(0, 6, 500), np.linspace(1, 4.5, 500))
-for i, (clf_name, clf) in enumerate(classifiers.items()):
-    plt.figure(1)
-    clf.fit(X1)
-    Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
-    Z1 = Z1.reshape(xx1.shape)
-    legend1[clf_name] = plt.contour(
-        xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i]
+legend_lines = []
+for color, (name, estimator) in zip(colors, estimators.items()):
+    estimator.fit(X)
+    DecisionBoundaryDisplay.from_estimator(
+        estimator,
+        X,
+        response_method="decision_function",
+        plot_method="contour",
+        levels=[0],
+        colors=color,
+        ax=ax,
     )
+    legend_lines.append(mlines.Line2D([], [], color=color, label=name))
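DecisionBoundaryDisplay.from_estimator replaces the meshgrid bookkeeping the deleted lines did by hand: the levels=[0] contour of decision_function is the learned frontier, positive inside and negative outside. A rough hand-written equivalent of one display call (grid bounds picked manually, as in the old code):

# Sketch: what one from_estimator call does, spelled out with a meshgrid.
import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_wine
from sklearn.svm import OneClassSVM

X = load_wine()["data"][:, [1, 2]]
est = OneClassSVM(nu=0.25, gamma=0.35).fit(X)

xx, yy = np.meshgrid(np.linspace(0, 6, 200), np.linspace(1, 4.5, 200))
Z = est.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contour(xx, yy, Z, levels=[0])  # the frontier: decision_function == 0
plt.scatter(X[:, 0], X[:, 1], color="black")
plt.show()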
 
-legend1_values_list = list(legend1.values())
-legend1_keys_list = list(legend1.keys())
 
-# Plot the results (= shape of the data points cloud)
-plt.figure(1)  # two clusters
-plt.title("Outlier detection on a real data set (wine recognition)")
-plt.scatter(X1[:, 0], X1[:, 1], color="black")
+ax.scatter(X[:, 0], X[:, 1], color="black")
 bbox_args = dict(boxstyle="round", fc="0.8")
 arrow_args = dict(arrowstyle="->")
-plt.annotate(
+ax.annotate(
     "outlying points",
     xy=(4, 2),
     xycoords="data",
@@ -89,26 +87,17 @@
     bbox=bbox_args,
     arrowprops=arrow_args,
 )
-plt.xlim((xx1.min(), xx1.max()))
-plt.ylim((yy1.min(), yy1.max()))
-plt.legend(
-    (
-        legend1_values_list[0].collections[0],
-        legend1_values_list[1].collections[0],
-        legend1_values_list[2].collections[0],
-    ),
-    (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]),
-    loc="upper center",
-    prop=matplotlib.font_manager.FontProperties(size=11),
+ax.legend(handles=legend_lines, loc="upper center")
+_ = ax.set(
+    xlabel="ash",
+    ylabel="malic_acid",
+    title="Outlier detection on a real data set (wine recognition)",
 )
-plt.ylabel("ash")
-plt.xlabel("malic_acid")
-
-plt.show()
 
 # %%
 # Second example
 # --------------
+#
 # The second example shows the ability of the Minimum Covariance Determinant
 # robust estimator of covariance to concentrate on the main mode of the data
 # distribution: the location seems to be well estimated, although the
@@ -117,41 +106,32 @@
 # capture the real data structure, but the difficulty is to adjust its kernel
 # bandwidth parameter so as to obtain a good compromise between the shape of
 # the data scatter matrix and the risk of over-fitting the data.
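The bandwidth trade-off mentioned in that comment can be seen by refitting the One-Class SVM with a few values of gamma; the values below are arbitrary illustrations, not from the commit, and the support-vector count is only a rough proxy for how wiggly (over-fitted) the frontier gets:

# Sketch: frontier complexity as the RBF bandwidth parameter gamma grows.
from sklearn.datasets import load_wine
from sklearn.svm import OneClassSVM

X = load_wine()["data"][:, [6, 9]]  # "banana"-shaped
for gamma in (0.05, 0.35, 2.0):
    est = OneClassSVM(nu=0.25, gamma=gamma).fit(X)
    print(f"gamma={gamma}: {len(est.support_vectors_)} support vectors")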
+X = load_wine()["data"][:, [6, 9]]  # "banana"-shaped
 
-# Get data
-X2 = load_wine()["data"][:, [6, 9]]  # "banana"-shaped
-
+fig, ax = plt.subplots()
+colors = ["tab:blue", "tab:orange", "tab:red"]
 # Learn a frontier for outlier detection with several classifiers
-xx2, yy2 = np.meshgrid(np.linspace(-1, 5.5, 500), np.linspace(-2.5, 19, 500))
-for i, (clf_name, clf) in enumerate(classifiers.items()):
-    plt.figure(2)
-    clf.fit(X2)
-    Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()])
-    Z2 = Z2.reshape(xx2.shape)
-    legend2[clf_name] = plt.contour(
-        xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i]
+legend_lines = []
+for color, (name, estimator) in zip(colors, estimators.items()):
+    estimator.fit(X)
+    DecisionBoundaryDisplay.from_estimator(
+        estimator,
+        X,
+        response_method="decision_function",
+        plot_method="contour",
+        levels=[0],
+        colors=color,
+        ax=ax,
     )
+    legend_lines.append(mlines.Line2D([], [], color=color, label=name))
 
-legend2_values_list = list(legend2.values())
-legend2_keys_list = list(legend2.keys())
-
-# Plot the results (= shape of the data points cloud)
-plt.figure(2)  # "banana" shape
-plt.title("Outlier detection on a real data set (wine recognition)")
-plt.scatter(X2[:, 0], X2[:, 1], color="black")
-plt.xlim((xx2.min(), xx2.max()))
-plt.ylim((yy2.min(), yy2.max()))
-plt.legend(
-    (
-        legend2_values_list[0].collections[0],
-        legend2_values_list[1].collections[0],
-        legend2_values_list[2].collections[0],
-    ),
-    (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]),
-    loc="upper center",
-    prop=matplotlib.font_manager.FontProperties(size=11),
+
+ax.scatter(X[:, 0], X[:, 1], color="black")
+ax.legend(handles=legend_lines, loc="upper center")
+ax.set(
+    xlabel="flavanoids",
+    ylabel="color_intensity",
+    title="Outlier detection on a real data set (wine recognition)",
 )
-plt.ylabel("color_intensity")
-plt.xlabel("flavanoids")
 
 plt.show()
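Since the example stops at the plots, a short follow-on sketch (not part of the commit) showing how the same three estimators give hard labels on the banana-shaped data:

# Sketch: hard outlier labels from the same estimators on the second dataset.
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import load_wine
from sklearn.svm import OneClassSVM

X = load_wine()["data"][:, [6, 9]]
estimators = {
    "Empirical Covariance": EllipticEnvelope(support_fraction=1.0, contamination=0.25),
    "Robust Covariance": EllipticEnvelope(contamination=0.25),
    "OCSVM": OneClassSVM(nu=0.25, gamma=0.35),
}
for name, estimator in estimators.items():
    n_out = (estimator.fit(X).predict(X) == -1).sum()
    print(f"{name}: {n_out} of {len(X)} samples flagged as outliers")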

0 commit comments