import os
import random
import statistics
import time

from pybloom import BloomFilter

from lazynlp.utils import *


def build_ngram_from_tokens(tokens, n):
    """ Create a dictionary of n-grams from the list of tokens
    """
    count = {}
    curr = tokens[:n]
    count[' '.join(curr)] = 1
    for token in tokens[n:]:
        curr = curr[1:] + [token]
        string = ' '.join(curr)
        if string not in count:
            count[string] = 0
        count[string] += 1
    return count
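
# Example (illustrative): build_ngram_from_tokens turns a token list into a
# dictionary mapping each sliding n-gram to its count.
#   tokens = 'the quick brown fox jumps over the lazy dog'.split()
#   grams = build_ngram_from_tokens(tokens, 3)
#   grams['the quick brown']  # -> 1
#   len(grams)                # -> 7 distinct trigrams over 9 tokens
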
def build_ngram(file, outfile=None, bf=None, gran='word', n=10, uncase=True, alphanumeric=True, interval=100000):
    """
    gran: granularity of the token. It can be 'word' or 'char'.
    bf: BloomFilter to update with the n-grams seen. Use when the file is too large to store a dictionary count.
    alphanumeric: whether to keep only alphanumeric characters and spaces.
    outfile: if outfile is specified, build a dictionary of n-grams and write it to outfile.
    interval: how often to report progress.
    """
    if gran not in ('word', 'char'):
        raise ValueError("gran has to be 'word' or 'char'")
    count = {}
    f = open(file, 'r')
    i = 1
    line = f.readline()
    start = time.time()
    while line:
        line = line.strip()
        if line:
            if uncase:
                line = line.lower()

            if gran == 'word':
                if alphanumeric:
                    line = remove_non_alphanumeric(line)
            else:
                line = remove_non_alpha(line)
            line = collapse_white_spaces(line)
            tokens = line.split()
            line_count = build_ngram_from_tokens(tokens, n)

            if outfile:
                count.update(line_count)

            if bf is not None:
                for key in line_count:
                    bf.add(key)

        if interval > 0 and i % interval == 0:
            print('Process line: {}. Time: {}'.format(i, time.time() - start))
            start = time.time()

        i += 1
        line = f.readline()

    f.close()

    if outfile:
        outfold = outfile[:outfile.rfind('/')]
        os.makedirs(outfold, exist_ok=True)
        dict_sorted_2_file(count, outfile.format(n))

    if bf:
        return bf

    return count
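
# Example (illustrative sketch; the paths below are hypothetical): building a
# word 10-gram dictionary for one file and writing it to disk. Since the output
# path is passed through outfile.format(n), a '{}' placeholder is filled with n.
#   counts = build_ngram('data/page_00.txt', outfile='ngrams/word_{}.txt',
#                        gran='word', n=10)
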
def build_word_ngram(file, outfile, n=10, alphanumeric=True, norm=True, interval=100000):
    """ Build word n-grams and store them in outfile,
    one n-gram per line in the format:
    [n-gram][tab][count]

    If alphanumeric, exclude all the words that contain non-alphanumeric characters.
    norm: whether to lowercase lines before counting.
    """
    return build_ngram(file, outfile=outfile, n=n, gran='word', uncase=norm, alphanumeric=alphanumeric, interval=interval)
def build_char_ngram(file, outfile, n=10, interval=100000):
    """
    Build character n-grams and store them in outfile.
    """
    return build_ngram(file, outfile=outfile, n=n, gran='char', interval=interval)
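
# Example (illustrative; hypothetical paths): the two wrappers above only differ
# in the granularity they forward to build_ngram.
#   build_word_ngram('data/page_00.txt', 'ngrams/word_{}.txt', n=10)
#   build_char_ngram('data/page_00.txt', 'ngrams/char_{}.txt', n=10)
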
def estimate_overlap(source_files, target_files, gran='word', n=8, capacity=10000, error_rate=1e-5, header=0, interval=100000):
    """ Estimate the overlap of target_files with source_files using n-grams.
    gran: granularity of the token. It can be 'word' or 'char'.
    header: number of lines to skip at the start of each file. In our format, the first line is the url.
    """
    if gran not in ('word', 'char'):
        raise ValueError("gran has to be 'word' or 'char'")
    if isinstance(source_files, str):
        source_files = [source_files]
    if isinstance(target_files, str):
        target_files = [target_files]

    bf = BloomFilter(capacity=capacity, error_rate=error_rate)
    for source_file in source_files:
        bf = build_ngram(file=source_file, bf=bf, gran=gran, n=n, uncase=True, alphanumeric=True, interval=interval)

    results = []
    for file in target_files:
        print(file)
        results.append(estimate_overlap_bf(bf, file, gran=gran, n=n, header=header))
    return results
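
# Example (illustrative; hypothetical paths and capacity): estimating what
# fraction of word 8-grams in each target file also appears in the source corpus.
#   overlaps = estimate_overlap('data/train.txt',
#                               ['data/valid.txt', 'data/test.txt'],
#                               gran='word', n=8, capacity=1000000)
#   # overlaps is a list with one seen/total ratio per target file
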
def estimate_overlap_bf(bf, target_file, gran='word', n=8, header=0):
    """ Estimate the overlap of target_file with an existing BloomFilter.
    gran: granularity of the token. It can be 'word' or 'char'.
    """
    if gran not in ('word', 'char'):
        raise ValueError("gran has to be 'word' or 'char'")

    f = open(target_file, 'r')
    for _ in range(header + 1):
        line = f.readline()

    total, seen = 0, 0
    while line:
        line = line.strip().lower()

        if gran == 'word':
            line = remove_non_alphanumeric(line)
        else:
            line = remove_non_alpha(line)
        line = collapse_white_spaces(line)
        tokens = line.split()
        line_count = build_ngram_from_tokens(tokens, n)

        for key in line_count:
            if key in bf:
                seen += 1
            total += 1

        line = f.readline()

    f.close()
    result = seen / total
    print('{} seen out of {}: {}'.format(seen, total, result))
    return result
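
# Example (illustrative; hypothetical paths): reusing a single BloomFilter built
# from a source corpus to score several target files without rebuilding it.
#   bf = BloomFilter(capacity=1000000, error_rate=1e-5)
#   bf = build_ngram(file='data/train.txt', bf=bf, gran='word', n=8)
#   ratio = estimate_overlap_bf(bf, 'data/test.txt', gran='word', n=8, header=1)
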
def file_stats(file):
    """ Return statistics about line lengths and average characters per word
    """
    line_lengths, token_lengths = [], []
    with open(file, 'r') as f:
        line = f.readline()
        while line:
            tokens = line.split()
            line_lengths.append(len(tokens))
            if tokens:
                line_token_lengths = [len(token) for token in tokens]
                token_lengths.append([len(tokens), sum(line_token_lengths) / len(tokens)])
            line = f.readline()

    total_tokens = sum([pair[0] for pair in token_lengths])
    total_chars = sum([pair[0] * pair[1] for pair in token_lengths])
    average_chars = total_chars / total_tokens
    print("Character per word: average = {}.".format(average_chars))

    report = "Word count per line: average = {}, median = {}, max = {}, min = {}, stddev = {}."
    print(report.format(statistics.mean(line_lengths), statistics.median(line_lengths),
                        max(line_lengths), min(line_lengths),
                        statistics.stdev(line_lengths)))
    return statistics.mean(line_lengths), average_chars
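
# Example (illustrative; hypothetical path): file_stats prints word-count and
# character-per-word statistics and returns (mean words per line, average chars per word).
#   avg_words_per_line, avg_chars_per_word = file_stats('data/page_00.txt')
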
def estimate_entropy(file, gran='word', max_n=10):
    pass