 estimation of the data structure, yet accurate to some extent.
 The One-Class SVM does not assume any parametric form of the data distribution
 and can therefore model the complex shape of the data much better.
-
-First example
---------------
-The first example illustrates how the Minimum Covariance Determinant
-robust estimator can help concentrate on a relevant cluster when outlying
-points exist. Here the empirical covariance estimation is skewed by points
-outside of the main cluster. Of course, some screening tools would have pointed
-out the presence of two clusters (Support Vector Machines, Gaussian Mixture
-Models, univariate outlier detection, ...). But had it been a high-dimensional
-example, none of these could be applied that easily.
-
 """

 # Author: Virgile Fritsch <virgile.fritsch@inria.fr>
 # License: BSD 3 clause

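
Aside (not part of the diff): a minimal sketch of the contrast the docstring describes, fitting an elliptic frontier and a One-Class SVM on synthetic bimodal data. The data and parameter values below are illustrative assumptions, not taken from the example.

# Illustrative sketch: a single ellipse cannot hug two separate blobs,
# while the kernelized One-Class SVM can follow a more complex shape.
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM

rng = np.random.RandomState(0)
X_demo = np.vstack([rng.randn(100, 2), rng.randn(100, 2) + [6, 6]])  # two blobs

for est in (EllipticEnvelope(contamination=0.1), OneClassSVM(nu=0.1, gamma=0.5)):
    est.fit(X_demo)
    # predict() returns +1 for inliers and -1 for outliers
    print(type(est).__name__, (est.predict(X_demo) == -1).mean())
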
-import matplotlib.font_manager
-import matplotlib.pyplot as plt
-import numpy as np
-
+# %%
+# First example
+# -------------
+#
+# The first example illustrates how the Minimum Covariance Determinant
+# robust estimator can help concentrate on a relevant cluster when outlying
+# points exist. Here the empirical covariance estimation is skewed by points
+# outside of the main cluster. Of course, some screening tools would have pointed
+# out the presence of two clusters (Support Vector Machines, Gaussian Mixture
+# Models, univariate outlier detection, ...). But had it been a high-dimensional
+# example, none of these could be applied that easily.
 from sklearn.covariance import EllipticEnvelope
-from sklearn.datasets import load_wine
+from sklearn.inspection import DecisionBoundaryDisplay
 from sklearn.svm import OneClassSVM

-# Define "classifiers" to be used
-classifiers = {
+estimators = {
     "Empirical Covariance": EllipticEnvelope(support_fraction=1.0, contamination=0.25),
     "Robust Covariance (Minimum Covariance Determinant)": EllipticEnvelope(
         contamination=0.25
     ),
     "OCSVM": OneClassSVM(nu=0.25, gamma=0.35),
 }
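
Aside (not part of the diff): all three estimators above are configured to treat the same share of points as outliers. `contamination=0.25` sets the expected outlier fraction for EllipticEnvelope, and `nu=0.25` upper-bounds the fraction of training errors for OneClassSVM. A sketch on illustrative random data:

# Illustrative sketch: fit_predict labels roughly a quarter of the
# points -1 (outlier) when contamination=0.25, by construction.
import numpy as np
from sklearn.covariance import EllipticEnvelope

X_demo = np.random.RandomState(42).randn(200, 2)
labels = EllipticEnvelope(contamination=0.25).fit_predict(X_demo)
print((labels == -1).mean())  # close to 0.25
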
-colors = ["m", "g", "b"]
-legend1 = {}
-legend2 = {}
 
-# Get data
-X1 = load_wine()["data"][:, [1, 2]]  # two clusters
+# %%
+import matplotlib.lines as mlines
+import matplotlib.pyplot as plt
 
+from sklearn.datasets import load_wine
+
+X = load_wine()["data"][:, [1, 2]]  # two clusters
+
+fig, ax = plt.subplots()
+colors = ["tab:blue", "tab:orange", "tab:red"]
 # Learn a frontier for outlier detection with several classifiers
-xx1, yy1 = np.meshgrid(np.linspace(0, 6, 500), np.linspace(1, 4.5, 500))
-for i, (clf_name, clf) in enumerate(classifiers.items()):
-    plt.figure(1)
-    clf.fit(X1)
-    Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
-    Z1 = Z1.reshape(xx1.shape)
-    legend1[clf_name] = plt.contour(
-        xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[i]
+legend_lines = []
+for color, (name, estimator) in zip(colors, estimators.items()):
+    estimator.fit(X)
+    DecisionBoundaryDisplay.from_estimator(
+        estimator,
+        X,
+        response_method="decision_function",
+        plot_method="contour",
+        levels=[0],
+        colors=color,
+        ax=ax,
     )
+    legend_lines.append(mlines.Line2D([], [], color=color, label=name))
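
Aside (not part of the diff): `DecisionBoundaryDisplay.from_estimator` absorbs the grid bookkeeping that the removed lines did by hand. Conceptually it is close to the following sketch (an illustration of the equivalence, not the library's actual internals):

# Sketch: build a grid over the data range, evaluate decision_function,
# and draw the zero-level contour, i.e. the learned inlier/outlier frontier.
import numpy as np

def sketch_frontier(estimator, X, ax, color, grid_resolution=100):
    x0 = np.linspace(X[:, 0].min(), X[:, 0].max(), grid_resolution)
    x1 = np.linspace(X[:, 1].min(), X[:, 1].max(), grid_resolution)
    xx, yy = np.meshgrid(x0, x1)
    Z = estimator.decision_function(np.c_[xx.ravel(), yy.ravel()])
    ax.contour(xx, yy, Z.reshape(xx.shape), levels=[0], colors=color)
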
 
-legend1_values_list = list(legend1.values())
-legend1_keys_list = list(legend1.keys())
 
-# Plot the results (= shape of the data points cloud)
-plt.figure(1)  # two clusters
-plt.title("Outlier detection on a real data set (wine recognition)")
-plt.scatter(X1[:, 0], X1[:, 1], color="black")
+ax.scatter(X[:, 0], X[:, 1], color="black")
 bbox_args = dict(boxstyle="round", fc="0.8")
 arrow_args = dict(arrowstyle="->")
-plt.annotate(
+ax.annotate(
     "outlying points",
     xy=(4, 2),
     xycoords="data",
     textcoords="data",
     xytext=(3, 1.25),
     bbox=bbox_args,
     arrowprops=arrow_args,
 )
-plt.xlim((xx1.min(), xx1.max()))
-plt.ylim((yy1.min(), yy1.max()))
-plt.legend(
-    (
-        legend1_values_list[0].collections[0],
-        legend1_values_list[1].collections[0],
-        legend1_values_list[2].collections[0],
-    ),
-    (legend1_keys_list[0], legend1_keys_list[1], legend1_keys_list[2]),
-    loc="upper center",
-    prop=matplotlib.font_manager.FontProperties(size=11),
+ax.legend(handles=legend_lines, loc="upper center")
+_ = ax.set(
+    xlabel="malic_acid",
+    ylabel="ash",
+    title="Outlier detection on a real data set (wine recognition)",
 )
-plt.ylabel("ash")
-plt.xlabel("malic_acid")
-
-plt.show()
 
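
Aside (not part of the diff): the removed legend code reached into each contour set via `.collections[0]` to obtain a legend handle; `ContourSet.collections` is deprecated in newer Matplotlib, so the new code builds proxy artists instead. The pattern in isolation:

# Sketch of the proxy-artist pattern: an empty Line2D carries only a
# color and a label, and stands in for the contour line in the legend.
import matplotlib.lines as mlines
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
handle = mlines.Line2D([], [], color="tab:blue", label="frontier")
ax.legend(handles=[handle], loc="upper center")
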
 # %%
 # Second example
 # --------------
+#
 # The second example shows the ability of the Minimum Covariance Determinant
 # robust estimator of covariance to concentrate on the main mode of the data
 # distribution: the location seems to be well estimated, although the
 # covariance is hard to estimate due to the banana-shaped distribution. Anyway,
 # we can get rid of some outlying observations. The One-Class SVM is able to
 # capture the real data structure, but the difficulty is to adjust its kernel
 # bandwidth parameter so as to obtain a good compromise between the shape of
 # the data scatter matrix and the risk of over-fitting the data.
+X = load_wine()["data"][:, [6, 9]]  # "banana"-shaped
 
-# Get data
-X2 = load_wine()["data"][:, [6, 9]]  # "banana"-shaped
-
+fig, ax = plt.subplots()
+colors = ["tab:blue", "tab:orange", "tab:red"]
 # Learn a frontier for outlier detection with several classifiers
-xx2, yy2 = np.meshgrid(np.linspace(-1, 5.5, 500), np.linspace(-2.5, 19, 500))
-for i, (clf_name, clf) in enumerate(classifiers.items()):
-    plt.figure(2)
-    clf.fit(X2)
-    Z2 = clf.decision_function(np.c_[xx2.ravel(), yy2.ravel()])
-    Z2 = Z2.reshape(xx2.shape)
-    legend2[clf_name] = plt.contour(
-        xx2, yy2, Z2, levels=[0], linewidths=2, colors=colors[i]
+legend_lines = []
+for color, (name, estimator) in zip(colors, estimators.items()):
+    estimator.fit(X)
+    DecisionBoundaryDisplay.from_estimator(
+        estimator,
+        X,
+        response_method="decision_function",
+        plot_method="contour",
+        levels=[0],
+        colors=color,
+        ax=ax,
     )
+    legend_lines.append(mlines.Line2D([], [], color=color, label=name))
 
-legend2_values_list = list(legend2.values())
-legend2_keys_list = list(legend2.keys())
-
-# Plot the results (= shape of the data points cloud)
-plt.figure(2)  # "banana" shape
-plt.title("Outlier detection on a real data set (wine recognition)")
-plt.scatter(X2[:, 0], X2[:, 1], color="black")
-plt.xlim((xx2.min(), xx2.max()))
-plt.ylim((yy2.min(), yy2.max()))
-plt.legend(
-    (
-        legend2_values_list[0].collections[0],
-        legend2_values_list[1].collections[0],
-        legend2_values_list[2].collections[0],
-    ),
-    (legend2_keys_list[0], legend2_keys_list[1], legend2_keys_list[2]),
-    loc="upper center",
-    prop=matplotlib.font_manager.FontProperties(size=11),
+
+ax.scatter(X[:, 0], X[:, 1], color="black")
+ax.legend(handles=legend_lines, loc="upper center")
+ax.set(
+    xlabel="flavanoids",
+    ylabel="color_intensity",
+    title="Outlier detection on a real data set (wine recognition)",
 )
-plt.ylabel("color_intensity")
-plt.xlabel("flavanoids")
 
 plt.show()
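
Aside (not part of the diff): the comment block above stresses that the One-Class SVM result hinges on its kernel bandwidth. A sketch of that sensitivity on the same two features (the gamma values are illustrative; only 0.35 comes from the example):

# Illustrative sketch: a larger RBF bandwidth parameter gamma yields a
# wigglier frontier that needs more support vectors, a sign of over-fitting.
from sklearn.datasets import load_wine
from sklearn.svm import OneClassSVM

X_banana = load_wine()["data"][:, [6, 9]]
for gamma in (0.01, 0.35, 10.0):
    est = OneClassSVM(nu=0.25, gamma=gamma).fit(X_banana)
    print(f"gamma={gamma}: {len(est.support_)} support vectors")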