Skip to content

Commit 4a241f0

Browse files
committed
Merge branch 'release-4.0.0.rc1'
2 parents 8624aa2 + a8c0001 commit 4a241f0

File tree

164 files changed

+3005
-233489
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

164 files changed

+3005
-233489
lines changed

.github/FUNDING.yml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# These are supported funding model platforms
2+
3+
github: [piskvorky] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
4+
patreon: # Replace with a single Patreon username
5+
open_collective: # Replace with a single Open Collective username
6+
ko_fi: # Replace with a single Ko-fi username
7+
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
8+
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
9+
liberapay: # Replace with a single Liberapay username
10+
issuehunt: # Replace with a single IssueHunt username
11+
otechie: # Replace with a single Otechie username
12+
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
13+

.github/workflows/build-wheels.yml

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
name: Build wheels
2+
3+
on:
4+
push:
5+
branches: [ develop ]
6+
pull_request:
7+
branches: [ develop ]
8+
schedule:
9+
- cron: '0 0 * * sun,wed'
10+
11+
jobs:
12+
build:
13+
runs-on: ${{ matrix.os }}
14+
defaults:
15+
run:
16+
shell: bash
17+
strategy:
18+
fail-fast: false  # let the remaining matrix jobs finish when one fails
19+
matrix:
20+
python-version: ['3.6', '3.7', '3.8']  # quoted so versions stay strings (an unquoted 3.10 would load as 3.1)
21+
os: [ubuntu-latest, macos-latest]
22+
platform: [x64]
23+
include:
24+
- os: ubuntu-latest
25+
python-version: '3.7'  # quoted to match the matrix value above, so this extends the existing job
26+
skip-network-tests: 1
27+
- os: ubuntu-latest
28+
python-version: '3.8'  # quoted to match the matrix value above, so this extends the existing job
29+
skip-network-tests: 1
30+
- os: macos-latest
31+
travis-os-name: osx # For multibuild
32+
skip-network-tests: 1
33+
env:  # environment variables for the wheel build; several are echoed by the "Print environment variables" step
34+
PKG_NAME: gensim
35+
REPO_DIR: gensim
36+
BUILD_COMMIT: HEAD
37+
PLAT: x86_64
38+
UNICODE_WIDTH: 32
39+
MB_PYTHON_VERSION: ${{ matrix.python-version }} # MB_PYTHON_VERSION is needed by Multibuild
40+
TEST_DEPENDS: Morfessor==2.0.2a4 python-levenshtein==0.12.0 visdom==0.1.8.9 pytest mock cython nmslib pyemd testfixtures scikit-learn  # each package listed once
41+
DOCKER_TEST_IMAGE: multibuild/xenial_x86_64
42+
TRAVIS_OS_NAME: ${{ matrix.travis-os-name }}  # only the macos-latest matrix entry defines travis-os-name
43+
SKIP_NETWORK_TESTS: ${{ matrix.skip-network-tests }}
44+
45+
steps:
46+
- uses: actions/checkout@v2
47+
with:
48+
submodules: recursive
49+
fetch-depth: 0
50+
- name: Print environment variables
51+
run: |
52+
echo "PLAT: ${PLAT}"
53+
echo "DOCKER_TEST_IMAGE: ${DOCKER_TEST_IMAGE}"
54+
echo "TEST_DEPENDS: ${TEST_DEPENDS}"
55+
echo "TRAVIS_OS_NAME: ${TRAVIS_OS_NAME}"
56+
echo "SKIP_NETWORK_TESTS: ${SKIP_NETWORK_TESTS}"
57+
- name: Set up Python ${{ matrix.python-version }}
58+
uses: actions/setup-python@v2
59+
with:
60+
python-version: ${{ matrix.python-version }}
61+
- name: Install dependencies
62+
run: |
63+
python -m pip install --upgrade pip
64+
pip install virtualenv
65+
# Builds the wheel and then installs/tests it using the multibuild shell functions.
# The platform is read from the PLAT environment variable: the job matrix defines
# `platform`, not `PLAT`, so `${{ matrix.PLAT }}` would expand to an empty string.
- name: Build and Install Wheels
66+
run: |
67+
echo ::group::Set up Multibuild
68+
source multibuild/common_utils.sh
69+
source multibuild/travis_steps.sh
70+
source config.sh
71+
echo ::endgroup::
72+
echo ::group::Before install
73+
before_install
74+
echo ::endgroup::
75+
echo ::group::Build wheel
76+
build_wheel $REPO_DIR $PLAT
77+
echo ::endgroup::
78+
echo ::group::Install run
79+
install_run $PLAT
80+
echo ::endgroup::
81+
- name: Upload wheels to s3://gensim-wheels  # lists the built wheels, then uploads the wheelhouse/ folder
82+
if: always()  # run this step even when an earlier step in the job failed
83+
run: |  # NOTE(review): the upload command passes --no-ssl-check; confirm this is intentional
84+
pip install wheelhouse-uploader
85+
ls wheelhouse/*.whl
86+
python -m wheelhouse_uploader upload --local-folder wheelhouse/ --no-ssl-check gensim-wheels --provider S3 --no-enable-cdn

.github/workflows/tests.yml

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
name: Tests
2+
on:
3+
push:
4+
branches: [ develop ]
5+
pull_request:
6+
branches: [ develop ]
7+
8+
jobs:
9+
tests:
10+
name: ${{ matrix.name }}
11+
runs-on: ${{ matrix.os }}
12+
defaults:
13+
run:
14+
shell: bash
15+
strategy:
16+
fail-fast: false
17+
matrix:
18+
include:  # python versions are quoted so they stay strings (an unquoted 3.10 would load as 3.1)
19+
- {name: Linux, python: '3.6', os: ubuntu-20.04, tox: 'flake8,flake8-docs'}
20+
- {name: Linux, python: '3.6', os: ubuntu-20.04, tox: 'py36-linux'}
21+
- {name: Linux, python: '3.7', os: ubuntu-20.04, tox: 'py37-linux'}
22+
- {name: Linux, python: '3.8', os: ubuntu-20.04, tox: 'py38-linux'}
23+
env:
24+
TOX_PARALLEL_NO_SPINNER: 1
25+
26+
steps:
27+
- uses: actions/checkout@v2
28+
- name: Set up Python ${{ matrix.python }}  # installs the interpreter selected by the matrix entry
29+
uses: actions/setup-python@v2
30+
with:
31+
python-version: ${{ matrix.python }}
32+
- name: Update pip
33+
run: python -m pip install -U pip
34+
35+
#
36+
# Work-around mysterious build problem
37+
# https://github.com/RaRe-Technologies/gensim/pull/3078/checks?check_run_id=2117914443
38+
# https://www.scala-sbt.org/1.x/docs/Installing-sbt-on-Linux.html
39+
#
40+
- name: Update sbt
41+
run: |
42+
echo "deb https://dl.bintray.com/sbt/debian /" | sudo tee -a /etc/apt/sources.list.d/sbt.list
43+
curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | sudo apt-key add
44+
sudo apt-get update -y
45+
sudo apt-get install -y sbt
46+
- name: Install tox, gdb
47+
run: |
48+
pip install tox
49+
sudo apt-get update -y
50+
sudo apt-get install -y gdb
51+
# Each `run` step starts its own shell, so a ulimit raised in a separate step
52+
# would not reach tox; enable core dumps in the same shell that launches it.
53+
- name: Run tox tests
54+
run: ulimit -c unlimited -S && tox -e ${{ matrix.tox }}
55+
- name: Collect corefile
56+
if: ${{ failure() }}
57+
run: |
58+
pwd
59+
COREFILE=$(find . -maxdepth 1 -name "core*" | head -n 1)
60+
if [[ -f "$COREFILE" ]]; then EXECFILE=$(gdb -c "$COREFILE" -batch | grep "Core was generated" | tr -d "\`" | cut -d' ' -f5); file "$COREFILE"; gdb -c "$COREFILE" "$EXECFILE" -x continuous_integration/debug.gdb -batch; fi

.gitignore

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,3 +76,18 @@ data
7676
*.inv
7777
*.js
7878
docs/_images/
79+
80+
#
81+
# Generated by Cython
82+
#
83+
gensim/_matutils.c
84+
gensim/corpora/_mmreader.c
85+
gensim/models/doc2vec_corpusfile.cpp
86+
gensim/models/doc2vec_inner.cpp
87+
gensim/models/fasttext_corpusfile.cpp
88+
gensim/models/fasttext_inner.c
89+
gensim/models/nmf_pgd.c
90+
gensim/models/word2vec_corpusfile.cpp
91+
gensim/models/word2vec_inner.c
92+
93+
.ipynb_checkpoints

.gitmodules

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
[submodule "multibuild"]
2+
path = multibuild
3+
url = https://github.com/matthew-brett/multibuild.git

.travis.yml

Lines changed: 45 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,52 @@
1-
sudo: false
2-
3-
cache:
4-
apt: true
5-
directories:
6-
- $HOME/.cache/pip
7-
- $HOME/.ccache
8-
- $HOME/.pip-cache
9-
dist: trusty
1+
branches:
2+
only:
3+
- /v\d+\.\d+\.\d+/
104
language: python
5+
arch: arm64-graviton2
6+
dist: focal
7+
virt: vm
8+
group: edge
9+
services: docker
1110
env:
12-
TOX_PARALLEL_NO_SPINNER: 1
13-
11+
global:
12+
- REPO_DIR=gensim
13+
- BUILD_COMMIT=HEAD
14+
- UNICODE_WIDTH=32
15+
- PLAT=aarch64
16+
- MB_ML_VER=2014
17+
- SKIP_NETWORK_TESTS=1
18+
- DOCKER_TEST_IMAGE=multibuild/xenial_arm64v8
19+
- BUILD_DEPENDS="numpy==1.19.2 scipy==1.5.3"
20+
- TEST_DEPENDS="pytest mock cython nmslib pyemd testfixtures Morfessor==2.0.2a4 python-levenshtein==0.12.0 visdom==0.1.8.9 scikit-learn"
1421

1522
matrix:
16-
include:
17-
- python: '3.6'
18-
env: TOXENV="flake8,flake8-docs"
19-
20-
- python: '3.8'
23+
- os: linux
2124
env:
22-
- TOXENV="py38-linux"
23-
dist: bionic
24-
25-
- python: '3.7'
25+
- MB_PYTHON_VERSION=3.6
26+
- os: linux
2627
env:
27-
- TOXENV="py37-linux"
28-
# The following two lines used to be necessary because Travis left files lying around in ~/.aws/,
29-
# messing up our tests. Now fixed since https://github.com/travis-ci/travis-ci/issues/7940
30-
# - BOTO_CONFIG="/dev/null"
31-
#sudo: true
32-
dist: xenial
33-
34-
- python: '3.6'
35-
env: TOXENV="py36-linux"
36-
37-
28+
- MB_PYTHON_VERSION=3.7
29+
- os: linux
30+
env:
31+
- MB_PYTHON_VERSION=3.8
32+
- os: linux
33+
env:
34+
- MB_PYTHON_VERSION=3.9
35+
before_install:
36+
- source multibuild/common_utils.sh
37+
- source multibuild/travis_steps.sh
38+
- before_install
3839
install:
39-
- pip install tox
40-
- sudo apt-get install -y gdb
41-
42-
43-
before_script:
44-
- ulimit -c unlimited -S # enable core dumps
45-
46-
47-
script: tox -vv
48-
49-
50-
after_failure:
51-
- pwd
52-
- COREFILE=$(find . -maxdepth 1 -name "core*" | head -n 1)
53-
- if [[ -f "$COREFILE" ]]; then EXECFILE=$(gdb -c "$COREFILE" -batch | grep "Core was generated" | tr -d "\`" | cut -d' ' -f5); file "$COREFILE"; gdb -c "$COREFILE" "$EXECFILE" -x continuous_integration/debug.gdb -batch; fi
40+
- build_wheel $REPO_DIR $PLAT
41+
script:
42+
- install_run $PLAT
43+
after_script:
44+
- ls -laht ${TRAVIS_BUILD_DIR}/wheelhouse/
45+
- pip install wheelhouse-uploader
46+
- python -m wheelhouse_uploader upload --local-folder ${TRAVIS_BUILD_DIR}/wheelhouse/ --no-ssl-check gensim-wheels --provider S3 --no-enable-cdn
47+
48+
notifications:
49+
email:
50+
- penkov+gensimwheels@pm.me
51+
on_success: always
52+
on_failure: always

CHANGELOG.md

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,87 @@
11
Changes
22
=======
33

4+
## 4.0.0.rc1, 2021-03-19
5+
6+
**⚠️ Gensim 4.0 contains breaking API changes! See the [Migration guide](https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4) to update your existing Gensim 3.x code and models.**
7+
8+
Gensim 4.0 is a major release with lots of performance & robustness improvements and a new website.
9+
10+
### Main highlights (see also *👍 Improvements* below)
11+
12+
* Massively optimized popular algorithms the community has grown to love: [fastText](https://radimrehurek.com/gensim/models/fasttext.html), [word2vec](https://radimrehurek.com/gensim/models/word2vec.html), [doc2vec](https://radimrehurek.com/gensim/models/doc2vec.html), [phrases](https://radimrehurek.com/gensim/models/phrases.html):
13+
14+
a. **Efficiency**
15+
16+
| model | 3.8.3: wall time / peak RAM / throughput | 4.0.0: wall time / peak RAM / throughput |
17+
|----------|------------|--------|
18+
| fastText | 2.9h / 4.11 GB / 822k words/s | 2.3h / **1.26 GB** / 914k words/s |
19+
| word2vec | 1.7h / 0.36 GB / 1685k words/s | **1.2h** / 0.33 GB / 1762k words/s |
20+
21+
In other words, fastText now needs 3x less RAM (and is faster); word2vec has 2x faster init (and needs less RAM, and is faster); detecting collocation phrases is 2x faster. ([4.0 benchmarks](https://github.com/RaRe-Technologies/gensim/issues/2887#issuecomment-711097334))
22+
23+
b. **Robustness**. We fixed a bunch of long-standing bugs by refactoring the internal code structure (see 🔴 Bug fixes below)
24+
25+
c. **Simplified OOP model** for easier model exports and integration with TensorFlow, PyTorch &co.
26+
27+
These improvements come to you transparently aka "for free", but see [Migration guide](https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4) for some changes that break the old Gensim 3.x API. **Update your code accordingly**.
28+
29+
* Dropped a bunch of externally contributed modules: summarization, pivoted TFIDF normalization, FIXME.
30+
- Code quality was not up to our standards. Also there was no one to maintain them, answer user questions, support these modules.
31+
32+
So rather than let them rot, we took the hard decision of removing these contributed modules from Gensim. If anyone's interested in maintaining them please fork into your own repo, they can live happily outside of Gensim.
33+
34+
* Dropped Python 2. Gensim 4.0 is Py3.6+. Read our [Python version support policy](https://github.com/RaRe-Technologies/gensim/wiki/Gensim-And-Compatibility).
35+
- If you still need Python 2 for some reason, stay at [Gensim 3.8.3](https://github.com/RaRe-Technologies/gensim/releases/tag/3.8.3).
36+
37+
* A new [Gensim website](https://radimrehurek.com/gensim_4.0.0) – finally! 🙃
38+
39+
So, a major clean-up release overall. We're happy with this **tighter, leaner and faster Gensim**.
40+
41+
This is the direction we'll keep going forward: less kitchen-sink of "latest academic algorithms", more focus on robust engineering, targeting common concrete NLP & document similarity use-cases.
42+
43+
### :star2: New Features
44+
45+
* Default to pickle protocol 4 when saving models (__[piskvorky](https://github.com/piskvorky)__, [#3065](https://github.com/RaRe-Technologies/gensim/pull/3065))
46+
* Record lifecycle events in Gensim models (__[piskvorky](https://github.com/piskvorky)__, [#3060](https://github.com/RaRe-Technologies/gensim/pull/3060))
47+
* Make WMD normalization optional (__[piskvorky](https://github.com/piskvorky)__, [#3073](https://github.com/RaRe-Technologies/gensim/pull/3073))
48+
49+
### :red_circle: Bug fixes
50+
51+
* fix RuntimeError in export_phrases (change defaultdict to dict) (__[thalishsajeed](https://github.com/thalishsajeed)__, [#3041](https://github.com/RaRe-Technologies/gensim/pull/3041))
52+
53+
### :books: Tutorial and doc improvements
54+
55+
* fix various documentation warnings (__[mpenkov](https://github.com/mpenkov)__, [#3077](https://github.com/RaRe-Technologies/gensim/pull/3077))
56+
* Fix broken link in run_doc how-to (__[sezanzeb](https://github.com/sezanzeb)__, [#2991](https://github.com/RaRe-Technologies/gensim/pull/2991))
57+
* Point WordEmbeddingSimilarityIndex documentation to gensim.similarities (__[Witiko](https://github.com/Witiko)__, [#3003](https://github.com/RaRe-Technologies/gensim/pull/3003))
58+
* Make the link to the Gensim 3.8.3 documentation dynamic (__[Witiko](https://github.com/Witiko)__, [#2996](https://github.com/RaRe-Technologies/gensim/pull/2996))
59+
60+
### :+1: Improvements
61+
62+
### :warning: Removed functionality
63+
64+
* remove on_batch_begin and on_batch_end callbacks (__[mpenkov](https://github.com/mpenkov)__, [#3078](https://github.com/RaRe-Technologies/gensim/pull/3078))
65+
* remove pattern dependency (__[mpenkov](https://github.com/mpenkov)__, [#3012](https://github.com/RaRe-Technologies/gensim/pull/3012))
66+
* rm gensim.viz submodule (__[mpenkov](https://github.com/mpenkov)__, [#3055](https://github.com/RaRe-Technologies/gensim/pull/3055))
67+
68+
### :warning: Deprecations (will be removed in the next major release)
69+
70+
### ??? Misc
71+
72+
**FIXME** This is a list of PRs that I couldn't find an appropriate section for.
73+
We could make some other section for them or remove them from the changelog entirely.
74+
This is probably OK as-is for the release candidate, but we should clean this up for the proper, final release.
75+
76+
* [MRG] Add Github sponsor + donation nags (__[piskvorky](https://github.com/piskvorky)__, [#3069](https://github.com/RaRe-Technologies/gensim/pull/3069))
77+
* Update URLs (__[jonaschn](https://github.com/jonaschn)__, [#3063](https://github.com/RaRe-Technologies/gensim/pull/3063))
78+
* Fix race condition in FastText tests (__[sleepy-owl](https://github.com/sleepy-owl)__, [#3059](https://github.com/RaRe-Technologies/gensim/pull/3059))
79+
* Add py39 wheels to travis/azure (__[FredHappyface](https://github.com/FredHappyface)__, [#3058](https://github.com/RaRe-Technologies/gensim/pull/3058))
80+
* Update repos before trying to install gdb (__[janaknat](https://github.com/janaknat)__, [#3035](https://github.com/RaRe-Technologies/gensim/pull/3035))
81+
* transformed camelCase to snake_case test names (__[sezanzeb](https://github.com/sezanzeb)__, [#3033](https://github.com/RaRe-Technologies/gensim/pull/3033))
82+
* move x86 tests from Travis to GHA, add aarch64 wheel build to Travis (__[janaknat](https://github.com/janaknat)__, [#3026](https://github.com/RaRe-Technologies/gensim/pull/3026))
83+
* Add Github Actions x86 and mac jobs to build python wheels (__[janaknat](https://github.com/janaknat)__, [#3024](https://github.com/RaRe-Technologies/gensim/pull/3024))
84+
485
## 4.0.0beta, 2020-10-31
586

687
**⚠️ Gensim 4.0 contains breaking API changes! See the [Migration guide](https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4) to update your existing Gensim 3.x code and models.**
@@ -104,6 +185,7 @@ Production stability is important to Gensim, so we're improving the process of *
104185
* [#2926](https://github.com/RaRe-Technologies/gensim/pull/2926): Rename `num_words` to `topn` in dtm_coherence, by [@MeganStodel](https://github.com/MeganStodel)
105186
* [#2937](https://github.com/RaRe-Technologies/gensim/pull/2937): Remove Keras dependency, by [@piskvorky](https://github.com/piskvorky)
106187
* Removed all code, methods, attributes and functions marked as deprecated in [Gensim 3.8.3](https://github.com/RaRe-Technologies/gensim/releases/tag/3.8.3).
188+
* Removed pattern dependency (PR [#3012](https://github.com/RaRe-Technologies/gensim/pull/3012), [@mpenkov](https://github.com/mpenkov)). If you need to lemmatize, do it prior to passing the corpus to gensim.
107189

108190
---
109191

ISSUE_TEMPLATE.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ What are you trying to achieve? What is the expected result? What are you seeing
1515

1616
Include full tracebacks, logs and datasets if necessary. Please keep the examples minimal ("minimal reproducible example").
1717

18+
If your problem is with a specific Gensim model (word2vec, lsimodel, doc2vec, fasttext, ldamodel etc), include the following:
19+
20+
```python
21+
print(my_model.lifecycle_events)
22+
```
23+
1824
#### Versions
1925

2026
Please provide the output of:

0 commit comments

Comments
 (0)