|
1 | 1 | Changes |
2 | 2 | =========== |
| 3 | +## 3.7.0, 2019-01-18 |
| 4 | + |
| 5 | +### :star2: New features |
| 6 | + |
| 7 | +* Fast Online NMF (__[@anotherbugmaster](https://github.com/anotherbugmaster)__, [#2007](https://github.com/RaRe-Technologies/gensim/pull/2007)) |
| 8 | + - Benchmark `wiki-english-20171001` |
| 9 | + |
| 10 | + | Model | Perplexity | Coherence | L2 norm | Train time (minutes) | |
| 11 | + |-------|------------|-----------|---------|----------------------| |
| 12 | + | LDA | 4727.07 | -2.514 | 7.372 | 138 | |
| 13 | + | NMF | **975.74** | -2.814 | **7.265** | **73** | |
| 14 | + | NMF (with regularization) | 985.57 | **-2.436** | 7.269 | 441 | |
| 15 | + |
| 16 | + - Simple to use (same interface as `LdaModel`) |
| 17 | + ```python |
| 18 | + from gensim.models.nmf import Nmf |
| 19 | + from gensim.corpora import Dictionary |
| 20 | + import gensim.downloader as api |
| 21 | + |
| 22 | + text8 = api.load('text8') |
| 23 | + |
| 24 | + dictionary = Dictionary(text8) |
| 25 | + dictionary.filter_extremes() |
| 26 | + |
| 27 | + corpus = [ |
| 28 | + dictionary.doc2bow(doc) for doc in text8 |
| 29 | + ] |
| 30 | + |
| 31 | + nmf = Nmf( |
| 32 | + corpus=corpus, |
| 33 | + num_topics=5, |
| 34 | + id2word=dictionary, |
| 35 | + chunksize=2000, |
| 36 | + passes=5, |
| 37 | + random_state=42, |
| 38 | + ) |
| 39 | + |
| 40 | + nmf.show_topics() |
| 41 | + """ |
| 42 | + [(0, '0.007*"km" + 0.006*"est" + 0.006*"islands" + 0.004*"league" + 0.004*"rate" + 0.004*"female" + 0.004*"economy" + 0.003*"male" + 0.003*"team" + 0.003*"elections"'), |
| 43 | + (1, '0.006*"actor" + 0.006*"player" + 0.004*"bwv" + 0.004*"writer" + 0.004*"actress" + 0.004*"singer" + 0.003*"emperor" + 0.003*"jewish" + 0.003*"italian" + 0.003*"prize"'), |
| 44 | + (2, '0.036*"college" + 0.007*"institute" + 0.004*"jewish" + 0.004*"universidad" + 0.003*"engineering" + 0.003*"colleges" + 0.003*"connecticut" + 0.003*"technical" + 0.003*"jews" + 0.003*"universities"'), |
| 45 | + (3, '0.016*"import" + 0.008*"insubstantial" + 0.007*"y" + 0.006*"soviet" + 0.004*"energy" + 0.004*"info" + 0.003*"duplicate" + 0.003*"function" + 0.003*"z" + 0.003*"jargon"'), |
| 46 | + (4, '0.005*"software" + 0.004*"games" + 0.004*"windows" + 0.003*"microsoft" + 0.003*"films" + 0.003*"apple" + 0.003*"video" + 0.002*"album" + 0.002*"fiction" + 0.002*"characters"')] |
| 47 | + """ |
| 48 | + ``` |
| 49 | + - See also: |
| 50 | + - [NMF tutorial](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/nmf_tutorial.ipynb) |
| 51 | + - [Full NMF Benchmark](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/nmf_wikipedia.ipynb) |
| 52 | + |
| 53 | +* Massive improvement of `FastText` compatibility (__[@mpenkov](https://github.com/mpenkov)__, [#2313](https://github.com/RaRe-Technologies/gensim/pull/2313)) |
| 54 | + ```python |
| 55 | + from gensim.models import FastText |
| 56 | + |
| 57 | + # 'cc.ru.300.bin' - Russian Facebook FT model trained on Common Crawl |
| 58 | + # Can be downloaded from https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.ru.300.bin.gz |
| 59 | + |
| 60 | + model = FastText.load_fasttext_format("cc.ru.300.bin") |
| 61 | + |
| 62 | + # The fixed hash function now produces the same output as FB FastText and works correctly for non-Latin languages (for example, Russian) |
| 63 | + assert "мяу" in model.wv.vocab # 'мяу' - vocab word |
| 64 | + model.wv.most_similar("мяу") |
| 65 | + """ |
| 66 | + [('Мяу', 0.6820122003555298), |
| 67 | + ('МЯУ', 0.6373013257980347), |
| 68 | + ('мяу-мяу', 0.593108594417572), |
| 69 | + ('кис-кис', 0.5899622440338135), |
| 70 | + ('гав', 0.5866007804870605), |
| 71 | + ('Кис-кис', 0.5798211097717285), |
| 72 | + ('Кис-кис-кис', 0.5742273330688477), |
| 73 | + ('Мяу-мяу', 0.5699705481529236), |
| 74 | + ('хрю-хрю', 0.5508339405059814), |
| 75 | + ('ав-ав', 0.5479759573936462)] |
| 76 | + """ |
| 77 | + |
| 78 | + assert "котогород" not in model.wv.vocab # 'котогород' - out-of-vocab word |
| 79 | + model.wv.most_similar("котогород", topn=3) |
| 80 | + """ |
| 81 | + [('автогород', 0.5463314652442932), |
| 82 | + ('ТагилНовокузнецкНовомосковскНовороссийскНовосибирскНовотроицкНовочеркасскНовошахтинскНовый', |
| 83 | + 0.5423436164855957), |
| 84 | + ('областьНовосибирскБарабинскБердскБолотноеИскитимКарасукКаргатКуйбышевКупиноОбьТатарскТогучинЧерепаново', |
| 85 | + 0.5377570390701294)] |
| 86 | + """ |
| 87 | + |
| 88 | + # The full model is loaded now, so we can continue training |
| 89 | + |
| 90 | + from gensim.test.utils import datapath |
| 91 | + from smart_open import smart_open |
| 92 | + |
| 93 | + with smart_open(datapath("crime-and-punishment.txt"), encoding="utf-8") as infile: # russian text |
| 94 | + corpus = [line.strip().split() for line in infile] |
| 95 | + |
| 96 | + model.train(corpus, total_examples=len(corpus), epochs=5) |
| 97 | + ``` |
| 98 | + |
| 99 | +* Similarity search improvements (__[@Witiko](https://github.com/Witiko)__, [#2016](https://github.com/RaRe-Technologies/gensim/pull/2016)) |
| 100 | + - Add similarity search using the Levenshtein distance in `gensim.similarities.LevenshteinSimilarityIndex` |
| 101 | + - Performance optimizations to `gensim.similarities.SoftCosineSimilarity` ([full benchmark](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_benchmark.ipynb)) |
| 102 | + |
| 103 | + | dictionary size | corpus size | speed | |
| 104 | + |-----------------|-------------|--------------:| |
| 105 | + | 1000 | 100 | 1.0× | |
| 106 | + | 1000 | 1000 | **53.4×** | |
| 107 | + | 1000 | 100000 | **156784.8×** | |
| 108 | + | 100000 | 100 | **3.8×** | |
| 109 | + | 100000 | 1000 | **405.8×** | |
| 110 | + | 100000 | 100000 | **66262.0×** | |
| 111 | + |
| 112 | + - See [updated soft-cosine tutorial](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/soft_cosine_tutorial.ipynb) for more information and usage examples |
| 113 | + |
| 114 | +* Add `python3.7` support (__[@menshikh-iv](https://github.com/menshikh-iv)__, [#2211](https://github.com/RaRe-Technologies/gensim/pull/2211)) |
| 115 | + - Wheels for Windows, OSX and Linux platforms (__[@menshikh-iv](https://github.com/menshikh-iv)__, [MacPython/gensim-wheels/#12](https://github.com/MacPython/gensim-wheels/pull/12)) |
| 116 | + - Faster installation |
| 117 | + |
| 118 | + |
| 119 | +### :+1: Improvements |
| 120 | + |
| 121 | +##### Optimizations |
| 122 | +* Reduce `Phraser` memory usage (drop frequencies) (__[@jenishah](https://github.com/jenishah)__, [#2208](https://github.com/RaRe-Technologies/gensim/pull/2208)) |
| 123 | +* Reduce memory consumption of summarizer (__[@horpto](https://github.com/horpto)__, [#2298](https://github.com/RaRe-Technologies/gensim/pull/2298)) |
| 124 | +* Replace slow inline equivalent of `mean_absolute_difference` with the fast implementation (__[@horpto](https://github.com/horpto)__, [#2284](https://github.com/RaRe-Technologies/gensim/pull/2284)) |
| 125 | +* Reuse precalculated updated prior in `ldamodel.update_dir_prior` (__[@horpto](https://github.com/horpto)__, [#2274](https://github.com/RaRe-Technologies/gensim/pull/2274)) |
| 126 | +* Improve `KeyedVector.wmdistance` (__[@horpto](https://github.com/horpto)__, [#2326](https://github.com/RaRe-Technologies/gensim/pull/2326)) |
| 127 | +* Optimize `remove_unreachable_nodes` in `gensim.summarization` (__[@horpto](https://github.com/horpto)__, [#2263](https://github.com/RaRe-Technologies/gensim/pull/2263)) |
| 128 | +* Optimize `mz_entropy` from `gensim.summarization` (__[@horpto](https://github.com/horpto)__, [#2267](https://github.com/RaRe-Technologies/gensim/pull/2267)) |
| 129 | +* Improve `filter_extremes` methods in `Dictionary` and `HashDictionary` (__[@horpto](https://github.com/horpto)__, [#2303](https://github.com/RaRe-Technologies/gensim/pull/2303)) |
| 130 | + |
| 131 | +##### Additions |
| 132 | +* Add `KeyedVectors.relative_cosine_similarity` (__[@rsdel2007](https://github.com/rsdel2007)__, [#2307](https://github.com/RaRe-Technologies/gensim/pull/2307)) |
| 133 | +* Add `random_seed` to `LdaMallet` (__[@Zohaggie](https://github.com/Zohaggie)__ & __[@menshikh-iv](https://github.com/menshikh-iv)__, [#2153](https://github.com/RaRe-Technologies/gensim/pull/2153)) |
| 134 | +* Add `common_terms` parameter to `sklearn_api.PhrasesTransformer` (__[@pmlk](https://github.com/pmlk)__, [#2074](https://github.com/RaRe-Technologies/gensim/pull/2074)) |
| 135 | +* Add method to patch `corpora.Dictionary` based on special tokens (__[@Froskekongen](https://github.com/Froskekongen)__, [#2200](https://github.com/RaRe-Technologies/gensim/pull/2200)) |
| 136 | + |
| 137 | +##### Cleanup |
| 138 | +* Improve `six` usage (`xrange`, `map`, `zip`) (__[@horpto](https://github.com/horpto)__, [#2264](https://github.com/RaRe-Technologies/gensim/pull/2264)) |
| 139 | +* Refactor `line2doc` methods of `LowCorpus` and `MalletCorpus` (__[@horpto](https://github.com/horpto)__, [#2269](https://github.com/RaRe-Technologies/gensim/pull/2269)) |
| 140 | +* Get rid of most warnings in testing (__[@menshikh-iv](https://github.com/menshikh-iv)__, [#2191](https://github.com/RaRe-Technologies/gensim/pull/2191)) |
| 141 | +* Fix non-deterministic test failures (pin `PYTHONHASHSEED`) (__[@menshikh-iv](https://github.com/menshikh-iv)__, [#2196](https://github.com/RaRe-Technologies/gensim/pull/2196)) |
| 142 | +* Fix "aliasing chunkize to chunkize_serial" warning on Windows (__[@aquatiko](https://github.com/aquatiko)__, [#2202](https://github.com/RaRe-Technologies/gensim/pull/2202)) |
| 143 | +* Remove `__getitem__` code duplication in `gensim.models.phrases` (__[@jenishah](https://github.com/jenishah)__, [#2206](https://github.com/RaRe-Technologies/gensim/pull/2206)) |
| 144 | +* Add `flake8-rst` for docstring code examples (__[@kataev](https://github.com/kataev)__, [#2192](https://github.com/RaRe-Technologies/gensim/pull/2192)) |
| 145 | +* Get rid of `py26` stuff (__[@menshikh-iv](https://github.com/menshikh-iv)__, [#2214](https://github.com/RaRe-Technologies/gensim/pull/2214)) |
| 146 | +* Use `itertools.chain` instead of `sum` to concatenate lists (__[@Stigjb](https://github.com/Stigjb)__, [#2212](https://github.com/RaRe-Technologies/gensim/pull/2212)) |
| 147 | +* Fix flake8 warnings W605, W504 (__[@horpto](https://github.com/horpto)__, [#2256](https://github.com/RaRe-Technologies/gensim/pull/2256)) |
| 148 | +* Remove unnecessary list creations (__[@horpto](https://github.com/horpto)__, [#2261](https://github.com/RaRe-Technologies/gensim/pull/2261)) |
| 149 | +* Fix extra list creation in `utils.get_max_id` (__[@horpto](https://github.com/horpto)__, [#2254](https://github.com/RaRe-Technologies/gensim/pull/2254)) |
| 150 | +* Fix deprecation warning `np.sum(generator)` (__[@rsdel2007](https://github.com/rsdel2007)__, [#2296](https://github.com/RaRe-Technologies/gensim/pull/2296)) |
| 151 | +* Refactor `BM25` (__[@horpto](https://github.com/horpto)__, [#2275](https://github.com/RaRe-Technologies/gensim/pull/2275)) |
| 152 | +* Fix pyemd import (__[@ramprakash-94](https://github.com/ramprakash-94)__, [#2240](https://github.com/RaRe-Technologies/gensim/pull/2240)) |
| 153 | +* Set `metadata=True` for `make_wikicorpus` script by default (__[@Xinyi2016](https://github.com/Xinyi2016)__, [#2245](https://github.com/RaRe-Technologies/gensim/pull/2245)) |
| 154 | +* Remove unimportant warning from `Phrases` (__[@rsdel2007](https://github.com/rsdel2007)__, [#2331](https://github.com/RaRe-Technologies/gensim/pull/2331)) |
| 155 | +* Replace `open()` by `smart_open()` in `gensim.models.fasttext._load_fasttext_format` (__[@rsdel2007](https://github.com/rsdel2007)__, [#2335](https://github.com/RaRe-Technologies/gensim/pull/2335)) |
| 156 | + |
| 157 | + |
| 158 | +### :red_circle: Bug fixes |
| 159 | +* Fix overflow error for `*Vec` corpusfile-based training (__[@bm371613](https://github.com/bm371613)__, [#2239](https://github.com/RaRe-Technologies/gensim/pull/2239)) |
| 160 | +* Fix `malletmodel2ldamodel` conversion (__[@horpto](https://github.com/horpto)__, [#2288](https://github.com/RaRe-Technologies/gensim/pull/2288)) |
| 161 | +* Replace custom epsilons with numpy equivalent in `LdaModel` (__[@horpto](https://github.com/horpto)__, [#2308](https://github.com/RaRe-Technologies/gensim/pull/2308)) |
| 162 | +* Add missing content to tarball (__[@menshikh-iv](https://github.com/menshikh-iv)__, [#2194](https://github.com/RaRe-Technologies/gensim/pull/2194)) |
| 163 | +* Fix division by zero when `w_star_count==0` (__[@allenyllee](https://github.com/allenyllee)__, [#2259](https://github.com/RaRe-Technologies/gensim/pull/2259)) |
| 164 | +* Fix check for callbacks (__[@allenyllee](https://github.com/allenyllee)__, [#2251](https://github.com/RaRe-Technologies/gensim/pull/2251)) |
| 165 | +* Fix `SvmLightCorpus.serialize` if `labels` instance of numpy.ndarray (__[@aquatiko](https://github.com/aquatiko)__, [#2243](https://github.com/RaRe-Technologies/gensim/pull/2243)) |
| 166 | +* Fix poincare viz incompatibility with `plotly>=3.0.0` (__[@jenishah](https://github.com/jenishah)__, [#2226](https://github.com/RaRe-Technologies/gensim/pull/2226)) |
| 167 | +* Fix `keep_n` behavior for `Dictionary.filter_extremes` (__[@johann-petrak](https://github.com/johann-petrak)__, [#2232](https://github.com/RaRe-Technologies/gensim/pull/2232)) |
| 168 | +* Fix for `sphinx==1.8.1` (__[@menshikh-iv](https://github.com/menshikh-iv)__) |
| 169 | +* Fix `np.issubdtype` warnings (__[@marioyc](https://github.com/marioyc)__, [#2210](https://github.com/RaRe-Technologies/gensim/pull/2210)) |
| 170 | +* Drop wrong key `-c` from `gensim.downloader` description (__[@horpto](https://github.com/horpto)__, [#2262](https://github.com/RaRe-Technologies/gensim/pull/2262)) |
| 171 | +* Fix gensim build (docs & pyemd issues) (__[@menshikh-iv](https://github.com/menshikh-iv)__, [#2318](https://github.com/RaRe-Technologies/gensim/pull/2318)) |
| 172 | +* Limit visdom version (avoid py2 issue from the latest visdom release) (__[@menshikh-iv](https://github.com/menshikh-iv)__, [#2334](https://github.com/RaRe-Technologies/gensim/pull/2334)) |
| 173 | +* Fix visdom integration (using `viz.line()` instead of `viz.updatetrace()`) (__[@allenyllee](https://github.com/allenyllee)__, [#2252](https://github.com/RaRe-Technologies/gensim/pull/2252)) |
| 174 | + |
| 175 | + |
| 176 | +### :books: Tutorial and doc improvements |
| 177 | + |
| 178 | +* Add gensim-data repo to `gensim.downloader` & fix rendering of code examples (__[@menshikh-iv](https://github.com/menshikh-iv)__, [#2327](https://github.com/RaRe-Technologies/gensim/pull/2327)) |
| 179 | +* Fix typos in `gensim.models` (__[@rsdel2007](https://github.com/rsdel2007)__, [#2323](https://github.com/RaRe-Technologies/gensim/pull/2323)) |
| 180 | +* Fixed typos in notebooks (__[@rsdel2007](https://github.com/rsdel2007)__, [#2322](https://github.com/RaRe-Technologies/gensim/pull/2322)) |
| 181 | +* Update `Doc2Vec` documentation: how tags are assigned in `corpus_file` mode (__[@persiyanov](https://github.com/persiyanov)__, [#2320](https://github.com/RaRe-Technologies/gensim/pull/2320)) |
| 182 | +* Fix typos in `gensim/models/keyedvectors.py` (__[@rsdel2007](https://github.com/rsdel2007)__, [#2290](https://github.com/RaRe-Technologies/gensim/pull/2290)) |
| 183 | +* Add documentation about ranges to scoring functions for `Phrases` (__[@jenishah](https://github.com/jenishah)__, [#2242](https://github.com/RaRe-Technologies/gensim/pull/2242)) |
| 184 | +* Update return sections for `KeyedVectors.evaluate_word_*` (__[@Stigjb](https://github.com/Stigjb)__, [#2205](https://github.com/RaRe-Technologies/gensim/pull/2205)) |
| 185 | +* Fix return type in `KeyedVector.evaluate_word_analogies` (__[@Stigjb](https://github.com/Stigjb)__, [#2207](https://github.com/RaRe-Technologies/gensim/pull/2207)) |
| 186 | +* Fix `WmdSimilarity` documentation (__[@jagmoreira](https://github.com/jagmoreira)__, [#2217](https://github.com/RaRe-Technologies/gensim/pull/2217)) |
| 187 | +* Replace `fify -> fifty` in `gensim.parsing.preprocessing.STOPWORDS` (__[@coderwassananmol](https://github.com/coderwassananmol)__, [#2220](https://github.com/RaRe-Technologies/gensim/pull/2220)) |
| 188 | +* Remove `alpha="auto"` from `LdaMulticore` (not supported yet) (__[@johann-petrak](https://github.com/johann-petrak)__, [#2225](https://github.com/RaRe-Technologies/gensim/pull/2225)) |
| 189 | +* Update Adopters in README (__[@piskvorky](https://github.com/piskvorky)__, [#2234](https://github.com/RaRe-Technologies/gensim/pull/2234)) |
| 190 | +* Fix broken link in `tutorials.md` (__[@rsdel2007](https://github.com/rsdel2007)__, [#2302](https://github.com/RaRe-Technologies/gensim/pull/2302)) |
| 191 | + |
| 192 | + |
| 193 | +### :warning: Deprecations (will be removed in the next major release) |
| 194 | + |
| 195 | +* Remove |
| 196 | + - `gensim.models.wrappers.fasttext` (obsoleted by the new native `gensim.models.fasttext` implementation) |
| 197 | + - `gensim.examples` |
| 198 | + - `gensim.nosy` |
| 199 | + - `gensim.scripts.word2vec_standalone` |
| 200 | + - `gensim.scripts.make_wiki_lemma` |
| 201 | + - `gensim.scripts.make_wiki_online` |
| 202 | + - `gensim.scripts.make_wiki_online_lemma` |
| 203 | + - `gensim.scripts.make_wiki_online_nodebug` |
| 204 | + - `gensim.scripts.make_wiki` (all of these obsoleted by the new native `gensim.scripts.segment_wiki` implementation) |
| 205 | + - "deprecated" functions and attributes |
| 206 | + |
| 207 | +* Move |
| 208 | + - `gensim.scripts.make_wikicorpus` ➡ `gensim.scripts.make_wiki.py` |
| 209 | + - `gensim.summarization` ➡ `gensim.models.summarization` |
| 210 | + - `gensim.topic_coherence` ➡ `gensim.models._coherence` |
| 211 | + - `gensim.utils` ➡ `gensim.utils.utils` (old imports will continue to work) |
| 212 | + - `gensim.parsing.*` ➡ `gensim.utils.text_utils` |
| 213 | + |
| 214 | + |
3 | 215 | ## 3.6.0, 2018-09-20 |
4 | 216 |
|
5 | 217 | ### :star2: New features |
|
0 commit comments