
Commit 006a6ee

Merge branch 'release-3.5.0'
2 parents 885430d + 6ecd261

File tree

153 files changed (+24965 / -14517 lines)


CHANGELOG.md

Lines changed: 202 additions & 0 deletions
Large diffs are not rendered by default.

docs/notebooks/Corpora_and_Vector_Spaces.ipynb

Lines changed: 4 additions & 2 deletions
@@ -279,9 +279,10 @@
 },
 "outputs": [],
 "source": [
+"from smart_open import smart_open\n",
 "class MyCorpus(object):\n",
 " def __iter__(self):\n",
-" for line in open('datasets/mycorpus.txt'):\n",
+" for line in smart_open('datasets/mycorpus.txt', 'rb'):\n",
 " # assume there's one document per line, tokens separated by whitespace\n",
 " yield dictionary.doc2bow(line.lower().split())"
 ]
@@ -374,9 +375,10 @@
 ],
 "source": [
 "from six import iteritems\n",
+"from smart_open import smart_open\n",
 "\n",
 "# collect statistics about all tokens\n",
-"dictionary = corpora.Dictionary(line.lower().split() for line in open('datasets/mycorpus.txt'))\n",
+"dictionary = corpora.Dictionary(line.lower().split() for line in smart_open('datasets/mycorpus.txt', 'rb'))\n",
 "\n",
 "# remove stop words and words that appear only once\n",
 "stop_ids = [dictionary.token2id[stopword] for stopword in stoplist \n",

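Every notebook change in this commit follows the same pattern: the built-in open() call is replaced with smart_open() from the smart_open package, so the tutorial code can transparently read and write compressed files or remote URIs (S3, HTTP, etc.) as well as local paths. Below is a minimal, self-contained sketch of the read-side pattern, assuming smart_open is installed and a local datasets/mycorpus.txt exists; unlike the actual notebook cell, the iterator here only tokenizes and does not build a gensim dictionary. The `from smart_open import smart_open` form matches what these notebooks use; newer smart_open releases expose `open` instead.

# Sketch of the open() -> smart_open() swap used throughout this commit.
# The same call also accepts e.g. 'mycorpus.txt.gz' or an 's3://bucket/key' URI.
from smart_open import smart_open


class MyCorpus(object):
    """Stream one tokenized document per line, without loading the file into RAM."""

    def __iter__(self):
        for line in smart_open('datasets/mycorpus.txt', 'rb'):  # binary mode: yields bytes
            yield line.lower().split()


for tokens in MyCorpus():
    print(tokens)
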
docs/notebooks/Poincare Evaluation.ipynb

Lines changed: 8 additions & 8 deletions
@@ -697,7 +697,7 @@
 " parts = first_line.rstrip().split(\"\\t\")\n",
 " model_size = len(parts) - 1\n",
 " vocab_size = len(lines)\n",
-" with open(output_file, 'w') as f:\n",
+" with smart_open(output_file, 'w') as f:\n",
 " f.write('%d %d\\n' % (vocab_size, model_size))\n",
 " for line in lines:\n",
 " f.write(line.replace('\\t', ' '))\n",
@@ -709,7+709,7 @@
 " \n",
 " model_size = random_embedding.shape[0]\n",
 " vocab_size = len(np_embeddings)\n",
-" with open(output_file, 'w') as f:\n",
+" with smart_open(output_file, 'w') as f:\n",
 " f.write('%d %d\\n' % (vocab_size, model_size))\n",
 " for key, vector in np_embeddings.items():\n",
 " vector_string = ' '.join('%.6f' % value for value in vector)\n",
@@ -1113,7 +1113,7 @@
 " test_line_candidates = []\n",
 " line_count = 0\n",
 " all_nodes = set()\n",
-" with open(data_file, 'rb') as f:\n",
+" with smart_open(data_file, 'rb') as f:\n",
 " for i, line in enumerate(f):\n",
 " node_1, node_2 = line.split()\n",
 " all_nodes.update([node_1, node_2])\n",
@@ -1135,9 +1135,9 @@
 " train_line_indices = set(l for l in range(line_count) if l not in test_line_indices)\n",
 " \n",
 " train_set_nodes = set()\n",
-" with open(data_file, 'rb') as f:\n",
-" train_file = open(train_filename, 'wb')\n",
-" test_file = open(test_filename, 'wb')\n",
+" with smart_open(data_file, 'rb') as f:\n",
+" train_file = smart_open(train_filename, 'wb')\n",
+" test_file = smart_open(test_filename, 'wb')\n",
 " for i, line in enumerate(f):\n",
 " if i in train_line_indices:\n",
 " train_set_nodes.update(line.split())\n",
@@ -1169,13 +1169,13 @@
 " \"\"\"\n",
 " root_candidates = set()\n",
 " leaf_candidates = set()\n",
-" with open(data_file, 'rb') as f:\n",
+" with smart_open(data_file, 'rb') as f:\n",
 " for line in f:\n",
 " nodes = line.split()\n",
 " root_candidates.update(nodes)\n",
 " leaf_candidates.update(nodes)\n",
 " \n",
-" with open(data_file, 'rb') as f:\n",
+" with smart_open(data_file, 'rb') as f:\n",
 " for line in f:\n",
 " node_1, node_2 = line.split()\n",
 " if node_1 == node_2:\n",

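The write-side changes in this notebook apply the same swap when dumping embeddings in word2vec text format. A rough, hypothetical sketch of that pattern follows; the embeddings dict, its random values, and the output filename are illustrative stand-ins, not taken from the commit.

# Hypothetical sketch: dump a dict of numpy vectors in word2vec text format
# through smart_open, mirroring the '%d %d\n' header plus one-vector-per-line
# pattern shown in the hunks above. All names here are illustrative.
import numpy as np
from smart_open import smart_open

np_embeddings = {
    'node_a': np.random.rand(50),
    'node_b': np.random.rand(50),
}

vocab_size = len(np_embeddings)
model_size = len(next(iter(np_embeddings.values())))

with smart_open('embeddings.txt', 'w') as f:
    f.write('%d %d\n' % (vocab_size, model_size))
    for key, vector in np_embeddings.items():
        vector_string = ' '.join('%.6f' % value for value in vector)
        f.write('%s %s\n' % (key, vector_string))
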
docs/notebooks/Tensorboard_visualizations.ipynb

Lines changed: 5 additions & 4 deletions
@@ -624,6 +624,7 @@
 "import pandas as pd\n",
 "import smart_open\n",
 "import random\n",
+"from smart_open import smart_open\n",
 "\n",
 "# read data\n",
 "dataframe = pd.read_csv('movie_plots.csv')\n",
@@ -803,7 +804,7 @@
 },
 "outputs": [],
 "source": [
-"with open('movie_plot_metadata.tsv','w') as w:\n",
+"with smart_open('movie_plot_metadata.tsv','w') as w:\n",
 " w.write('Titles\\tGenres\\n')\n",
 " for i,j in zip(dataframe.Titles, dataframe.Genres):\n",
 " w.write(\"%s\\t%s\\n\" % (i,j))"
@@ -1024,14 +1025,14 @@
 "outputs": [],
 "source": [
 "# create file for tensors\n",
-"with open('doc_lda_tensor.tsv','w') as w:\n",
+"with smart_open('doc_lda_tensor.tsv','w') as w:\n",
 " for doc_topics in all_topics:\n",
 " for topics in doc_topics:\n",
 " w.write(str(topics[1])+ \"\\t\")\n",
 " w.write(\"\\n\")\n",
 " \n",
 "# create file for metadata\n",
-"with open('doc_lda_metadata.tsv','w') as w:\n",
+"with smart_open('doc_lda_metadata.tsv','w') as w:\n",
 " w.write('Titles\\tGenres\\n')\n",
 " for j, k in zip(dataframe.Titles, dataframe.Genres):\n",
 " w.write(\"%s\\t%s\\n\" % (j, k))"
@@ -1084,7 +1085,7 @@
 "\n",
 "# overwrite metadata file\n",
 "i=0\n",
-"with open('doc_lda_metadata.tsv','w') as w:\n",
+"with smart_open('doc_lda_metadata.tsv','w') as w:\n",
 " w.write('Titles\\tGenres\\n')\n",
 " for j,k in zip(dataframe.Titles, dataframe.Genres):\n",
 " w.write(\"%s\\t%s\\n\" % (''.join((str(j), str(tensors[i]))),k))\n",

docs/notebooks/WMD_tutorial.ipynb

Lines changed: 2 additions & 1 deletion
@@ -302,6 +302,7 @@
 "start = time()\n",
 "\n",
 "import json\n",
+"from smart_open import smart_open\n",
 "\n",
 "# Business IDs of the restaurants.\n",
 "ids = ['4bEjOyTaDG24SY5TxsaUNQ', '2e2e7WgqU1BnpxmQL5jbfw', 'zt1TpTuJ6y9n551sw9TaEg',\n",
@@ -310,7 +311,7 @@
 "w2v_corpus = [] # Documents to train word2vec on (all 6 restaurants).\n",
 "wmd_corpus = [] # Documents to run queries against (only one restaurant).\n",
 "documents = [] # wmd_corpus, with no pre-processing (so we can see the original documents).\n",
-"with open('/data/yelp_academic_dataset_review.json') as data_file:\n",
+"with smart_open('/data/yelp_academic_dataset_review.json', 'rb') as data_file:\n",
 " for line in data_file:\n",
 " json_line = json.loads(line)\n",
 " \n",

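The same swap applies to the JSON-lines read in this tutorial. A small sketch follows, under the assumption that a local reviews.json exists with one JSON object per line and a 'text' field per record; the Yelp dataset path in the diff above is specific to that tutorial's environment.

# Sketch of reading a JSON-lines file through smart_open in binary mode,
# as in the hunk above. 'reviews.json' and the 'text' field are assumptions.
import json
from smart_open import smart_open

texts = []
with smart_open('reviews.json', 'rb') as data_file:
    for line in data_file:
        record = json.loads(line)   # json.loads accepts bytes on Python 3.6+
        texts.append(record['text'])

print(len(texts), 'documents loaded')
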
docs/notebooks/Word2Vec_FastText_Comparison.ipynb

Lines changed: 2 additions & 1 deletion
@@ -57,11 +57,12 @@
 ],
 "source": [
 "import nltk\n",
+"from smart_open import smart_open\n",
 "nltk.download('brown') \n",
 "# Only the brown corpus is needed in case you don't have it.\n",
 "\n",
 "# Generate brown corpus text file\n",
-"with open('brown_corp.txt', 'w+') as f:\n",
+"with smart_open('brown_corp.txt', 'w+') as f:\n",
 " for word in nltk.corpus.brown.words():\n",
 " f.write('{word} '.format(word=word))\n",
 "\n",

docs/notebooks/Wordrank_comparisons.ipynb

Lines changed: 5 additions & 3 deletions
@@ -38,20 +38,21 @@
 ],
 "source": [
 "import nltk\n",
+"from smart_open import smart_open\n",
 "from gensim.parsing.preprocessing import strip_punctuation, strip_multiple_whitespaces\n",
 "\n",
 "# Only the brown corpus is needed in case you don't have it.\n",
 "nltk.download('brown') \n",
 "\n",
 "# Generate brown corpus text file\n",
-"with open('brown_corp.txt', 'w+') as f:\n",
+"with smart_open('brown_corp.txt', 'w+') as f:\n",
 " for word in nltk.corpus.brown.words():\n",
 " f.write('{word} '.format(word=word))\n",
 " f.seek(0)\n",
 " brown = f.read()\n",
 "\n",
 "# Preprocess brown corpus\n",
-"with open('proc_brown_corp.txt', 'w') as f:\n",
+"with smart_open('proc_brown_corp.txt', 'w') as f:\n",
 " proc_brown = strip_punctuation(brown)\n",
 " proc_brown = strip_multiple_whitespaces(proc_brown).lower()\n",
 " f.write(proc_brown)\n",
@@ -1004,12 +1005,13 @@
 "import copy\n",
 "import multiprocessing\n",
 "import numpy as np\n",
+"from smart_open import smart_open\n",
 "\n",
 "\n",
 "def compute_accuracies(model, freq):\n",
 " # mean_freq will contain analogies together with the mean frequency of 4 words involved\n",
 " mean_freq = {}\n",
-" with open(word_analogies_file, 'r') as r:\n",
+" with smart_open(word_analogies_file, 'r') as r:\n",
 " for i, line in enumerate(r):\n",
 " if ':' not in line:\n",
 " analogy = tuple(line.split())\n",

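The Brown-corpus cells in the last two notebooks use a slightly different variant: the file is opened in 'w+' mode, written, then rewound with seek(0) and read back within the same block. A brief sketch of that pattern, assuming nltk and the Brown corpus download; note the rewind-and-read step only works when the target is a local, seekable file, so streaming destinations such as S3 would need a separate read.

# Sketch of the write-then-read-back ('w+') pattern from the Brown corpus cells.
import nltk
from smart_open import smart_open

nltk.download('brown')  # only needed the first time

with smart_open('brown_corp.txt', 'w+') as f:
    for word in nltk.corpus.brown.words():
        f.write('{word} '.format(word=word))
    f.seek(0)          # rewind so the freshly written text can be read back
    brown = f.read()

print(len(brown), 'characters written and read back')
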