Skip to content

Commit 8420768

Browse files
committed
Merge branch 'release-3.8.1'
2 parents fdc0195 + 526b6b4 commit 8420768

File tree

15 files changed: +157 −26 lines changed

CHANGELOG.md

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,46 @@
Changes
=======

4+
## 3.8.1, 2019-09-23
5+
6+
### :red_circle: Bug fixes
7+
8+
* Fix usage of base_dir instead of BASE_DIR in _load_info in downloader. (__[movb](https://github.com/movb)__, [#2605](https://github.com/RaRe-Technologies/gensim/pull/2605))
9+
* Update the version of smart_open in the setup.py file (__[AMR-KELEG](https://github.com/AMR-KELEG)__, [#2582](https://github.com/RaRe-Technologies/gensim/pull/2582))
10+
* Properly handle unicode_errors arg parameter when loading a vocab file (__[wmtzk](https://github.com/wmtzk)__, [#2570](https://github.com/RaRe-Technologies/gensim/pull/2570))
11+
* Catch loading older TfidfModels without smartirs (__[bnomis](https://github.com/bnomis)__, [#2559](https://github.com/RaRe-Technologies/gensim/pull/2559))
12+
* Fix bug where a module import set up logging, pin doctools for Py2 (__[piskvorky](https://github.com/piskvorky)__, [#2552](https://github.com/RaRe-Technologies/gensim/pull/2552))
13+
14+
### :books: Tutorial and doc improvements
15+
16+
* Fix usage example in phrases.py (__[piskvorky](https://github.com/piskvorky)__, [#2575](https://github.com/RaRe-Technologies/gensim/pull/2575))
17+
18+
### :+1: Improvements
19+
20+
* Optimize Poincare model training (__[koiizukag](https://github.com/koiizukag)__, [#2589](https://github.com/RaRe-Technologies/gensim/pull/2589))
21+
22+
### :warning: Deprecations (will be removed in the next major release)
23+
24+
* Remove
25+
- `gensim.models.FastText.load_fasttext_format`: use load_facebook_vectors to load embeddings only (faster, less CPU/memory usage, does not support training continuation) and load_facebook_model to load full model (slower, more CPU/memory intensive, supports training continuation)
26+
- `gensim.models.wrappers.fasttext` (obsoleted by the new native `gensim.models.fasttext` implementation)
27+
- `gensim.examples`
28+
- `gensim.nosy`
29+
- `gensim.scripts.word2vec_standalone`
30+
- `gensim.scripts.make_wiki_lemma`
31+
- `gensim.scripts.make_wiki_online`
32+
- `gensim.scripts.make_wiki_online_lemma`
33+
- `gensim.scripts.make_wiki_online_nodebug`
34+
- `gensim.scripts.make_wiki` (all of these obsoleted by the new native `gensim.scripts.segment_wiki` implementation)
35+
- "deprecated" functions and attributes
36+
37+
* Move
38+
- `gensim.scripts.make_wikicorpus` ➡ `gensim.scripts.make_wiki.py`
39+
- `gensim.summarization` ➡ `gensim.models.summarization`
40+
- `gensim.topic_coherence` ➡ `gensim.models._coherence`
41+
- `gensim.utils` ➡ `gensim.utils.utils` (old imports will continue to work)
42+
- `gensim.parsing.*` ➡ `gensim.utils.text_utils`
43+
444
## 3.8.0, 2019-07-08
545

646
## :warning: 3.8.x will be the last gensim version to support Py2.7. Starting with 4.0.0, gensim will only support Py3.5 and above

docs/src/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@
5757
# The short X.Y version.
5858
version = '3.8'
5959
# The full version, including alpha/beta/rc tags.
60-
release = '3.8.0'
60+
release = '3.8.1'
6161

6262
# The language for content autogenerated by Sphinx. Refer to documentation
6363
# for a list of supported languages.

gensim/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from gensim import parsing, corpora, matutils, interfaces, models, similarities, summarization, utils # noqa:F401
66
import logging
77

8-
__version__ = '3.8.0'
8+
__version__ = '3.8.1'
99

1010

1111
logger = logging.getLogger('gensim')

gensim/downloader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ def _load_info(url=DATA_LIST_URL, encoding='utf-8'):
191191
If the network access fails, fall back to a local cache. This cache gets
192192
updated each time a network request _succeeds_.
193193
"""
194-
cache_path = os.path.join(base_dir, 'information.json')
194+
cache_path = os.path.join(BASE_DIR, 'information.json')
195195
_create_base_dir()
196196

197197
try:

gensim/models/phrases.py

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
#!/usr/bin/env python
22
# -*- coding: utf-8 -*-
3+
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
34
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
45

5-
"""Automatically detect common phrases -- multi-word expressions / word n-grams -- from a stream of sentences.
6+
"""
7+
Automatically detect common phrases -- aka multi-word expressions, word n-gram collocations -- from
8+
a stream of sentences.
69
710
Inspired by:
811
@@ -20,19 +23,38 @@
2023
>>> from gensim.models.word2vec import Text8Corpus
2124
>>> from gensim.models.phrases import Phrases, Phraser
2225
>>>
26+
>>> # Load training data.
2327
>>> sentences = Text8Corpus(datapath('testcorpus.txt'))
24-
>>> phrases = Phrases(sentences, min_count=1, threshold=1) # train model
25-
>>> phrases[[u'trees', u'graph', u'minors']] # apply model to sentence
26-
[u'trees_graph', u'minors']
28+
>>> # The training corpus must be a sequence (stream, generator) of sentences,
29+
>>> # with each sentence a list of tokens:
30+
>>> print(list(sentences)[0][:10])
31+
['computer', 'human', 'interface', 'computer', 'response', 'survey', 'system', 'time', 'user', 'interface']
32+
>>>
33+
>>> # Train a toy bigram model.
34+
>>> phrases = Phrases(sentences, min_count=1, threshold=1)
35+
>>> # Apply the trained phrases model to a new, unseen sentence.
36+
>>> phrases[['trees', 'graph', 'minors']]
37+
['trees_graph', 'minors']
38+
>>> # The toy model considered "trees graph" a single phrase => joined the two
39+
>>> # tokens into a single token, `trees_graph`.
2740
>>>
28-
>>> phrases.add_vocab([["hello", "world"], ["meow"]]) # update model with new sentences
41+
>>> # Update the model with two new sentences on the fly.
42+
>>> phrases.add_vocab([["hello", "world"], ["meow"]])
2943
>>>
30-
>>> bigram = Phraser(phrases) # construct faster model (this is only an wrapper)
31-
>>> bigram[[u'trees', u'graph', u'minors']] # apply model to sentence
32-
[u'trees_graph', u'minors']
44+
>>> # Export the trained model = use less RAM, faster processing. Model updates no longer possible.
45+
>>> bigram = Phraser(phrases)
46+
>>> bigram[['trees', 'graph', 'minors']] # apply the exported model to a sentence
47+
['trees_graph', 'minors']
3348
>>>
34-
>>> for sent in bigram[sentences]: # apply model to text corpus
49+
>>> # Apply the exported model to each sentence of a corpus:
50+
>>> for sent in bigram[sentences]:
3551
... pass
52+
>>>
53+
>>> # Save / load an exported collocation model.
54+
>>> bigram.save("/tmp/my_bigram_model.pkl")
55+
>>> bigram_reloaded = Phraser.load("/tmp/my_bigram_model.pkl")
56+
>>> bigram_reloaded[['trees', 'graph', 'minors']] # apply the exported model to a sentence
57+
['trees_graph', 'minors']
3658
3759
"""
3860

gensim/models/poincare.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -567,10 +567,13 @@ def _handle_duplicates(vector_updates, node_indices):
567567
568568
"""
569569
counts = Counter(node_indices)
570+
node_dict = defaultdict(list)
571+
for i, node_index in enumerate(node_indices):
572+
node_dict[node_index].append(i)
570573
for node_index, count in counts.items():
571574
if count == 1:
572575
continue
573-
positions = [i for i, index in enumerate(node_indices) if index == node_index]
576+
positions = node_dict[node_index]
574577
# Move all updates to the same node to the last such update, zeroing all the others
575578
vector_updates[positions[-1]] = vector_updates[positions].sum(axis=0)
576579
vector_updates[positions[:-1]] = 0

gensim/models/tfidfmodel.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -433,6 +433,10 @@ def load(cls, *args, **kwargs):
433433
model.slope = 0.65
434434
logger.info('older version of %s loaded without slope arg', cls.__name__)
435435
logger.info('Setting slope to %s.', model.slope)
436+
if not hasattr(model, 'smartirs'):
437+
model.smartirs = None
438+
logger.info('older version of %s loaded without smartirs arg', cls.__name__)
439+
logger.info('Setting smartirs to %s.', model.smartirs)
436440
return model
437441

438442
def __str__(self):

gensim/models/utils_any2vec.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,7 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8'
335335
counts = {}
336336
with utils.open(fvocab, 'rb') as fin:
337337
for line in fin:
338-
word, count = utils.to_unicode(line).strip().split()
338+
word, count = utils.to_unicode(line, errors=unicode_errors).strip().split()
339339
counts[word] = int(count)
340340

341341
logger.info("loading projection weights from %s", fname)

gensim/summarization/textcleaner.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -22,21 +22,16 @@
2222

2323
from gensim.summarization.syntactic_unit import SyntacticUnit
2424
from gensim.parsing.preprocessing import preprocess_documents
25-
from gensim.utils import tokenize
25+
from gensim.utils import tokenize, has_pattern
2626
from six.moves import range
2727
import re
2828
import logging
2929

3030
logger = logging.getLogger(__name__)
3131

32-
try:
32+
HAS_PATTERN = has_pattern()
33+
if HAS_PATTERN:
3334
from pattern.en import tag
34-
logger.info("'pattern' package found; tag filters are available for English")
35-
HAS_PATTERN = True
36-
except ImportError:
37-
logger.info("'pattern' package not found; tag filters are not available for English")
38-
HAS_PATTERN = False
39-
4035

4136
SEPARATOR = r'@'
4237
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
592 Bytes
Binary file not shown.

0 commit comments

Comments (0)