@@ -187,17 +187,19 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
         **Note**: Due to the gap shrinking, the same word may have a different
         word id before and after the call to this function!
         """
-        no_above_abs = int(no_above * self.num_docs)  # convert fractional threshold to absolute threshold
+        no_above_abs = int(no_above * self.num_docs)  # convert fractional threshold to absolute threshold
 
         # determine which tokens to keep
-        good_ids = (v for v in itervalues(self.token2id)
-                    if no_below <= self.dfs.get(v, 0) <= no_above_abs)
+        good_ids = (
+            v for v in itervalues(self.token2id)
+            if no_below <= self.dfs.get(v, 0) <= no_above_abs)
         good_ids = sorted(good_ids, key=self.dfs.get, reverse=True)
         if keep_n is not None:
             good_ids = good_ids[:keep_n]
         bad_words = [(self[id], self.dfs.get(id, 0)) for id in set(self).difference(good_ids)]
         logger.info("discarding %i tokens: %s...", len(self) - len(good_ids), bad_words[:10])
-        logger.info("keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
+        logger.info(
+            "keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
             len(good_ids), no_below, no_above_abs, 100.0 * no_above)
 
         # do the actual filtering, then rebuild dictionary to remove gaps in ids
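For orientation, `no_below` in the hunk above is an absolute document count, while `no_above` is a corpus fraction that gets converted to the absolute `no_above_abs` before filtering; `keep_n` then truncates the survivors to the most frequent ids. A minimal usage sketch against the public `Dictionary` API (the toy corpus is illustrative only):

from gensim.corpora import Dictionary

texts = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response"],
    ["eps", "user", "interface", "system"],
]
dct = Dictionary(texts)
# drop tokens appearing in fewer than 2 documents or in more than 50% of documents,
# then keep at most the 100000 most frequent of the remaining tokens
dct.filter_extremes(no_below=2, no_above=0.5, keep_n=100000)
print(dct.token2id)  # ids may have shifted after the call, as the docstring warns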
@@ -256,7 +258,7 @@ def save_as_text(self, fname, sort_by_word=True):
         Note: text format should be use for corpus inspection. Use `save`/`load`
         to store in binary format (pickle) for improved performance.
         """
-        logger.info("saving dictionary mapping to %s" % fname)
+        logger.info("saving dictionary mapping to %s", fname)
         with utils.smart_open(fname, 'wb') as fout:
             if sort_by_word:
                 for token, tokenid in sorted(iteritems(self.token2id)):
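The docstring note in this hunk separates the two persistence paths: `save_as_text` writes a plain-text token mapping meant for manual inspection, while `save`/`load` pickle the whole object and are the faster way to round-trip it. A short sketch (file paths are placeholders):

dct.save_as_text('/tmp/dict.txt')        # human-readable mapping, for inspection only
dct.save('/tmp/dict.pkl')                # binary pickle, preferred for performance
dct2 = Dictionary.load('/tmp/dict.pkl')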
@@ -354,7 +356,7 @@ def from_corpus(corpus, id2word=None):
         max_id = -1
         for docno, document in enumerate(corpus):
             if docno % 10000 == 0:
-                logger.info("adding document #%i to %s" % (docno, result))
+                logger.info("adding document #%i to %s", docno, result)
             result.num_docs += 1
             result.num_nnz += len(document)
             for wordid, word_freq in document:
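The change in this hunk is the same logging fix applied throughout the commit: the format string and its arguments are passed to `logger.info` separately instead of being interpolated with `%` at the call site, so the string is only built if an INFO record is actually emitted. A minimal illustration (the values are made up for the example):

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger(__name__)

docno, result = 10000, "Dictionary(42 unique tokens)"
logger.info("adding document #%i to %s" % (docno, result))  # string is built even though INFO is disabled
logger.info("adding document #%i to %s", docno, result)     # lazy: formatted only when a handler emits the record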
@@ -372,6 +374,7 @@ def from_corpus(corpus, id2word=None):
             # make sure all token ids have a valid `dfs` entry
             result.dfs[id] = result.dfs.get(id, 0)
 
-        logger.info("built %s from %i documents (total %i corpus positions)" %
-                    (result, result.num_docs, result.num_pos))
+        logger.info(
+            "built %s from %i documents (total %i corpus positions)",
+            result, result.num_docs, result.num_pos)
         return result
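`from_corpus` itself is a static constructor that infers document frequencies and corpus statistics from an already-vectorized bag-of-words stream; when no `id2word` mapping is supplied, the token ids are simply stringified. A small sketch with a hand-built corpus (values are illustrative):

from gensim.corpora import Dictionary

corpus = [
    [(0, 1), (1, 2)],        # document 0: word 0 once, word 1 twice
    [(1, 1), (2, 3)],        # document 1: word 1 once, word 2 three times
]
dct = Dictionary.from_corpus(corpus)
print(dct.num_docs, dct.num_pos, dct.dfs)  # 2 documents, 7 corpus positions, per-id document frequencies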