21
21
MiniBatchKMeans. The document clusters derived from the biclusters
22
22
achieve a better V-measure than clusters found by MiniBatchKMeans.
23
23
24
- Output::
25
-
26
- Vectorizing...
27
- Coclustering...
28
- Done in 9.53s. V-measure: 0.4455
29
- MiniBatchKMeans...
30
- Done in 12.00s. V-measure: 0.3309
31
-
32
- Best biclusters:
33
- ----------------
34
- bicluster 0 : 1951 documents, 4373 words
35
- categories : 23% talk.politics.guns, 19% talk.politics.misc, 14% sci.med
36
- words : gun, guns, geb, banks, firearms, drugs, gordon, clinton,
37
- cdt, amendment
38
-
39
- bicluster 1 : 1165 documents, 3304 words
40
- categories : 29% talk.politics.mideast, 26% soc.religion.christian,
41
- 25% alt.atheism
42
- words : god, jesus, christians, atheists, kent, sin, morality,
43
- belief, resurrection, marriage
44
-
45
- bicluster 2 : 2219 documents, 2830 words
46
- categories : 18% comp.sys.mac.hardware, 16% comp.sys.ibm.pc.hardware,
47
- 16% comp.graphics
48
- words : voltage, dsp, board, receiver, circuit, shipping, packages,
49
- stereo, compression, package
50
-
51
- bicluster 3 : 1860 documents, 2745 words
52
- categories : 26% rec.motorcycles, 23% rec.autos, 13% misc.forsale
53
- words : bike, car, dod, engine, motorcycle, ride, honda, cars, bmw,
54
- bikes
55
-
56
- bicluster 4 : 12 documents, 155 words
57
- categories : 100% rec.sport.hockey
58
- words : scorer, unassisted, reichel, semak, sweeney, kovalenko,
59
- ricci, audette, momesso, nedved
60
-
61
24
"""
62
25
from __future__ import print_function
63
26
64
- print (__doc__ )
65
-
66
27
from collections import defaultdict
67
28
import operator
68
29
import re
77
38
from sklearn .feature_extraction .text import TfidfVectorizer
78
39
from sklearn .metrics .cluster import v_measure_score
79
40
41
+ print (__doc__ )
42
+
80
43
81
44
def number_aware_tokenizer (doc ):
82
45
""" Tokenizer that maps all numeric tokens to a placeholder.
@@ -91,6 +54,7 @@ def number_aware_tokenizer(doc):
91
54
for token in tokens ]
92
55
return tokens
93
56
57
+
94
58
# exclude 'comp.os.ms-windows.misc'
95
59
categories = ['alt.atheism' , 'comp.graphics' ,
96
60
'comp.sys.ibm.pc.hardware' , 'comp.sys.mac.hardware' ,
0 commit comments