EXA improve examples (scikit-learn#8866)

glemaitre · lesteve · commit 3de7da329fb3 · 2017-05-12T14:21:40.000+02:00
diff --git a/examples/bicluster/plot_bicluster_newsgroups.py b/examples/bicluster/plot_bicluster_newsgroups.py
@@ -21,48 +21,9 @@
 MiniBatchKMeans. The document clusters derived from the biclusters
 achieve a better V-measure than clusters found by MiniBatchKMeans.
 
-Output::
-
-    Vectorizing...
-    Coclustering...
-    Done in 9.53s. V-measure: 0.4455
-    MiniBatchKMeans...
-    Done in 12.00s. V-measure: 0.3309
-
-    Best biclusters:
-    ----------------
-    bicluster 0 : 1951 documents, 4373 words
-    categories   : 23% talk.politics.guns, 19% talk.politics.misc, 14% sci.med
-    words        : gun, guns, geb, banks, firearms, drugs, gordon, clinton,
-                   cdt, amendment
-
-    bicluster 1 : 1165 documents, 3304 words
-    categories   : 29% talk.politics.mideast, 26% soc.religion.christian,
-                   25% alt.atheism
-    words        : god, jesus, christians, atheists, kent, sin, morality,
-                   belief, resurrection, marriage
-
-    bicluster 2 : 2219 documents, 2830 words
-    categories   : 18% comp.sys.mac.hardware, 16% comp.sys.ibm.pc.hardware,
-                   16% comp.graphics
-    words        : voltage, dsp, board, receiver, circuit, shipping, packages,
-                   stereo, compression, package
-
-    bicluster 3 : 1860 documents, 2745 words
-    categories   : 26% rec.motorcycles, 23% rec.autos, 13% misc.forsale
-    words        : bike, car, dod, engine, motorcycle, ride, honda, cars, bmw,
-                   bikes
-
-    bicluster 4 : 12 documents, 155 words
-    categories   : 100% rec.sport.hockey
-    words        : scorer, unassisted, reichel, semak, sweeney, kovalenko,
-                   ricci, audette, momesso, nedved
-
 """
 from __future__ import print_function
 
-print(__doc__)
-
 from collections import defaultdict
 import operator
 import re
@@ -77,6 +38,8 @@
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.cluster import v_measure_score
 
+print(__doc__)
+
 
 def number_aware_tokenizer(doc):
     """ Tokenizer that maps all numeric tokens to a placeholder.
@@ -91,6 +54,7 @@ def number_aware_tokenizer(doc):
               for token in tokens]
     return tokens
 
+
 # exclude 'comp.os.ms-windows.misc'
 categories = ['alt.atheism', 'comp.graphics',
               'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
diff --git a/examples/feature_selection/plot_feature_selection_pipeline.py b/examples/feature_selection/plot_feature_selection_pipeline.py
@@ -6,24 +6,31 @@
 Simple usage of Pipeline that runs successively a univariate
 feature selection with anova and then a C-SVM of the selected features.
 """
-print(__doc__)
-
 from sklearn import svm
 from sklearn.datasets import samples_generator
-from sklearn.feature_selection import SelectKBest, f_regression
+from sklearn.feature_selection import SelectKBest, f_classif
 from sklearn.pipeline import make_pipeline
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import classification_report
+
+print(__doc__)
 
 # import some data to play with
 X, y = samples_generator.make_classification(
     n_features=20, n_informative=3, n_redundant=0, n_classes=4,
     n_clusters_per_class=2)
 
+# hold out a test split so the report is computed on unseen samples
+X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
+
 # ANOVA SVM-C
-# 1) anova filter, take 3 best ranked features
-anova_filter = SelectKBest(f_regression, k=3)
+# 1) anova filter, take 3 best ranked features; y holds class labels, so
+#    score the features with the classification F-test (f_classif)
+anova_filter = SelectKBest(f_classif, k=3)
 # 2) svm
 clf = svm.SVC(kernel='linear')
 
 anova_svm = make_pipeline(anova_filter, clf)
-anova_svm.fit(X, y)
-anova_svm.predict(X)
+anova_svm.fit(X_train, y_train)
+y_pred = anova_svm.predict(X_test)
+print(classification_report(y_test, y_pred))