 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
+# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
-"""Automatically detect common phrases -- multi-word expressions / word n-grams -- from a stream of sentences.
+"""
+Automatically detect common phrases -- aka multi-word expressions, word n-gram collocations -- from
+a stream of sentences.
 
 Inspired by:
 

 >>> from gensim.models.word2vec import Text8Corpus
 >>> from gensim.models.phrases import Phrases, Phraser
 >>>
+>>> # Load training data.
 >>> sentences = Text8Corpus(datapath('testcorpus.txt'))
->>> phrases = Phrases(sentences, min_count=1, threshold=1) # train model
->>> phrases[[u'trees', u'graph', u'minors']] # apply model to sentence
-[u'trees_graph', u'minors']
+>>> # The training corpus must be a sequence (stream, generator) of sentences,
+>>> # with each sentence a list of tokens:
+>>> print(list(sentences)[0][:10])
+['computer', 'human', 'interface', 'computer', 'response', 'survey', 'system', 'time', 'user', 'interface']
+>>>
+>>> # Train a toy bigram model.
+>>> phrases = Phrases(sentences, min_count=1, threshold=1)
+>>> # Apply the trained phrases model to a new, unseen sentence.
+>>> phrases[['trees', 'graph', 'minors']]
+['trees_graph', 'minors']
+>>> # The toy model considered "trees graph" a single phrase, so it joined the
+>>> # two tokens into a single token, `trees_graph`.
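+>>> # Detected phrases and their scores can also be inspected directly. A minimal
+>>> # sketch (not part of the original example), assuming this module's
+>>> # `export_phrases` helper:
+>>> for phrase, score in phrases.export_phrases(sentences):
+...     pass  # `phrase` is a detected bigram, `score` its collocation score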
 >>>
->>> phrases.add_vocab([["hello", "world"], ["meow"]]) # update model with new sentences
+>>> # Update the model with two new sentences on the fly.
+>>> phrases.add_vocab([["hello", "world"], ["meow"]])
 >>>
->>> bigram = Phraser(phrases) # construct faster model (this is only an wrapper)
->>> bigram[[u'trees', u'graph', u'minors']] # apply model to sentence
-[u'trees_graph', u'minors']
+>>> # Export the trained model for less RAM and faster processing; the exported
+>>> # model can no longer be updated.
+>>> bigram = Phraser(phrases)
+>>> bigram[['trees', 'graph', 'minors']]  # apply the exported model to a sentence
+['trees_graph', 'minors']
 >>>
->>> for sent in bigram[sentences]: # apply model to text corpus
+>>> # Apply the exported model to each sentence of a corpus:
+>>> for sent in bigram[sentences]:
 ...     pass
+>>>
+>>> # Save / load an exported collocation model.
+>>> bigram.save("/tmp/my_bigram_model.pkl")
+>>> bigram_reloaded = Phraser.load("/tmp/my_bigram_model.pkl")
+>>> bigram_reloaded[['trees', 'graph', 'minors']]  # apply the exported model to a sentence
+['trees_graph', 'minors']
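+>>>
+>>> # Longer n-grams can be detected by stacking: a sketch (not part of the
+>>> # original example) that trains a second Phrases model on the already-bigrammed
+>>> # corpus, so frequent bigram-plus-unigram sequences merge into trigrams:
+>>> trigram = Phrases(bigram[sentences], min_count=1, threshold=1)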
 
 """
 