Skip to content

Commit 3ade404

Browse files
committed
Merge branch 'release-0.12.4' with #596
2 parents b61287a + 4d8dd20 commit 3ade404

File tree

5 files changed

+743
-723
lines changed

5 files changed

+743
-723
lines changed

CHANGELOG.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ Changes
55

66
* Word2vec in line with original word2vec.c (Andrey Kutuzov, #538)
77
- Same default values. See diff https://github.com/akutuzov/gensim/commit/6456cbcd75e6f8720451766ba31cc046b4463ae2
8-
- Standalone script with command line arguments matching those of original C tool.
8+
- Standalone script with command line arguments matching those of original C tool.
99
Usage ./word2vec_standalone.py -train data.txt -output trained_vec.txt -size 200 -window 2 -sample 1e-4
1010
* load_word2vec_format() performance (@svenkreiss, #555)
1111
- Remove `init_sims()` call for performance improvements when normalized vectors are not needed.

gensim/corpora/dictionary.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -187,17 +187,19 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
187187
**Note**: Due to the gap shrinking, the same word may have a different
188188
word id before and after the call to this function!
189189
"""
190-
no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold
190+
no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold
191191

192192
# determine which tokens to keep
193-
good_ids = (v for v in itervalues(self.token2id)
194-
if no_below <= self.dfs.get(v, 0) <= no_above_abs)
193+
good_ids = (
194+
v for v in itervalues(self.token2id)
195+
if no_below <= self.dfs.get(v, 0) <= no_above_abs)
195196
good_ids = sorted(good_ids, key=self.dfs.get, reverse=True)
196197
if keep_n is not None:
197198
good_ids = good_ids[:keep_n]
198199
bad_words = [(self[id], self.dfs.get(id, 0)) for id in set(self).difference(good_ids)]
199200
logger.info("discarding %i tokens: %s...", len(self) - len(good_ids), bad_words[:10])
200-
logger.info("keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
201+
logger.info(
202+
"keeping %i tokens which were in no less than %i and no more than %i (=%.1f%%) documents",
201203
len(good_ids), no_below, no_above_abs, 100.0 * no_above)
202204

203205
# do the actual filtering, then rebuild dictionary to remove gaps in ids
@@ -256,7 +258,7 @@ def save_as_text(self, fname, sort_by_word=True):
256258
Note: text format should be used for corpus inspection. Use `save`/`load`
257259
to store in binary format (pickle) for improved performance.
258260
"""
259-
logger.info("saving dictionary mapping to %s" % fname)
261+
logger.info("saving dictionary mapping to %s", fname)
260262
with utils.smart_open(fname, 'wb') as fout:
261263
if sort_by_word:
262264
for token, tokenid in sorted(iteritems(self.token2id)):
@@ -354,7 +356,7 @@ def from_corpus(corpus, id2word=None):
354356
max_id = -1
355357
for docno, document in enumerate(corpus):
356358
if docno % 10000 == 0:
357-
logger.info("adding document #%i to %s" % (docno, result))
359+
logger.info("adding document #%i to %s", docno, result)
358360
result.num_docs += 1
359361
result.num_nnz += len(document)
360362
for wordid, word_freq in document:
@@ -372,6 +374,7 @@ def from_corpus(corpus, id2word=None):
372374
# make sure all token ids have a valid `dfs` entry
373375
result.dfs[id] = result.dfs.get(id, 0)
374376

375-
logger.info("built %s from %i documents (total %i corpus positions)" %
376-
(result, result.num_docs, result.num_pos))
377+
logger.info(
378+
"built %s from %i documents (total %i corpus positions)",
379+
result, result.num_docs, result.num_pos)
377380
return result

0 commit comments

Comments (0)