Skip to content

Commit 3de7da3

Browse files
glemaitre authored and lesteve committed
EXA improve examples (scikit-learn#8866)
1 parent 1b2a928 commit 3de7da3

File tree

2 files changed

+12
-43
lines changed

2 files changed

+12
-43
lines changed

examples/bicluster/plot_bicluster_newsgroups.py

Lines changed: 3 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -21,48 +21,9 @@
2121
MiniBatchKMeans. The document clusters derived from the biclusters
2222
achieve a better V-measure than clusters found by MiniBatchKMeans.
2323
24-
Output::
25-
26-
Vectorizing...
27-
Coclustering...
28-
Done in 9.53s. V-measure: 0.4455
29-
MiniBatchKMeans...
30-
Done in 12.00s. V-measure: 0.3309
31-
32-
Best biclusters:
33-
----------------
34-
bicluster 0 : 1951 documents, 4373 words
35-
categories : 23% talk.politics.guns, 19% talk.politics.misc, 14% sci.med
36-
words : gun, guns, geb, banks, firearms, drugs, gordon, clinton,
37-
cdt, amendment
38-
39-
bicluster 1 : 1165 documents, 3304 words
40-
categories : 29% talk.politics.mideast, 26% soc.religion.christian,
41-
25% alt.atheism
42-
words : god, jesus, christians, atheists, kent, sin, morality,
43-
belief, resurrection, marriage
44-
45-
bicluster 2 : 2219 documents, 2830 words
46-
categories : 18% comp.sys.mac.hardware, 16% comp.sys.ibm.pc.hardware,
47-
16% comp.graphics
48-
words : voltage, dsp, board, receiver, circuit, shipping, packages,
49-
stereo, compression, package
50-
51-
bicluster 3 : 1860 documents, 2745 words
52-
categories : 26% rec.motorcycles, 23% rec.autos, 13% misc.forsale
53-
words : bike, car, dod, engine, motorcycle, ride, honda, cars, bmw,
54-
bikes
55-
56-
bicluster 4 : 12 documents, 155 words
57-
categories : 100% rec.sport.hockey
58-
words : scorer, unassisted, reichel, semak, sweeney, kovalenko,
59-
ricci, audette, momesso, nedved
60-
6124
"""
6225
from __future__ import print_function
6326

64-
print(__doc__)
65-
6627
from collections import defaultdict
6728
import operator
6829
import re
@@ -77,6 +38,8 @@
7738
from sklearn.feature_extraction.text import TfidfVectorizer
7839
from sklearn.metrics.cluster import v_measure_score
7940

41+
print(__doc__)
42+
8043

8144
def number_aware_tokenizer(doc):
8245
""" Tokenizer that maps all numeric tokens to a placeholder.
@@ -91,6 +54,7 @@ def number_aware_tokenizer(doc):
9154
for token in tokens]
9255
return tokens
9356

57+
9458
# exclude 'comp.os.ms-windows.misc'
9559
categories = ['alt.atheism', 'comp.graphics',
9660
'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',

examples/feature_selection/plot_feature_selection_pipeline.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,24 +6,29 @@
66
Simple usage of Pipeline that runs successively a univariate
77
feature selection with anova and then a C-SVM of the selected features.
88
"""
9-
print(__doc__)
10-
119
from sklearn import svm
1210
from sklearn.datasets import samples_generator
1311
from sklearn.feature_selection import SelectKBest, f_regression
1412
from sklearn.pipeline import make_pipeline
13+
from sklearn.model_selection import train_test_split
14+
from sklearn.metrics import classification_report
15+
16+
print(__doc__)
1517

1618
# import some data to play with
1719
X, y = samples_generator.make_classification(
1820
n_features=20, n_informative=3, n_redundant=0, n_classes=4,
1921
n_clusters_per_class=2)
2022

23+
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
24+
2125
# ANOVA SVM-C
2226
# 1) anova filter, take 3 best ranked features
2327
anova_filter = SelectKBest(f_regression, k=3)
2428
# 2) svm
2529
clf = svm.SVC(kernel='linear')
2630

2731
anova_svm = make_pipeline(anova_filter, clf)
28-
anova_svm.fit(X, y)
29-
anova_svm.predict(X)
32+
anova_svm.fit(X_train, y_train)
33+
y_pred = anova_svm.predict(X_test)
34+
print(classification_report(y_test, y_pred))

0 commit comments

Comments
 (0)