
Commit 006a6ee

Merge branch 'release-3.5.0'
2 parents 885430d + 6ecd261

File tree

153 files changed (+24965 / -14517 lines)


CHANGELOG.md

Lines changed: 202 additions & 0 deletions
Large diffs are not rendered by default.

docs/notebooks/Corpora_and_Vector_Spaces.ipynb

Lines changed: 4 additions & 2 deletions
@@ -279,9 +279,10 @@
 },
 "outputs": [],
 "source": [
+"from smart_open import smart_open\n",
 "class MyCorpus(object):\n",
 " def __iter__(self):\n",
-" for line in open('datasets/mycorpus.txt'):\n",
+" for line in smart_open('datasets/mycorpus.txt', 'rb'):\n",
 " # assume there's one document per line, tokens separated by whitespace\n",
 " yield dictionary.doc2bow(line.lower().split())"
 ]
@@ -374,9 +375,10 @@
 ],
 "source": [
 "from six import iteritems\n",
+"from smart_open import smart_open\n",
 "\n",
 "# collect statistics about all tokens\n",
-"dictionary = corpora.Dictionary(line.lower().split() for line in open('datasets/mycorpus.txt'))\n",
+"dictionary = corpora.Dictionary(line.lower().split() for line in smart_open('datasets/mycorpus.txt', 'rb'))\n",
 "\n",
 "# remove stop words and words that appear only once\n",
 "stop_ids = [dictionary.token2id[stopword] for stopword in stoplist \n",

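Every notebook change in this commit follows the same pattern: the built-in open() call is replaced with smart_open() from the smart_open package, so the tutorial code can transparently read and write compressed files or remote URIs (S3, HTTP, etc.) as well as local paths. Below is a minimal, self-contained sketch of the read-side pattern, assuming smart_open is installed and a local datasets/mycorpus.txt exists; unlike the actual notebook cell, the iterator here only tokenizes and does not build a gensim dictionary. The `from smart_open import smart_open` form matches what these notebooks use; newer smart_open releases expose `open` instead.

# Sketch of the open() -> smart_open() swap used throughout this commit.
# The same call also accepts e.g. 'mycorpus.txt.gz' or an 's3://bucket/key' URI.
from smart_open import smart_open


class MyCorpus(object):
    """Stream one tokenized document per line, without loading the file into RAM."""

    def __iter__(self):
        for line in smart_open('datasets/mycorpus.txt', 'rb'):  # binary mode: yields bytes
            yield line.lower().split()


for tokens in MyCorpus():
    print(tokens)
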
docs/notebooks/Poincare Evaluation.ipynb

Lines changed: 8 additions & 8 deletions
@@ -697,7 +697,7 @@
 " parts = first_line.rstrip().split(\"\\t\")\n",
 " model_size = len(parts) - 1\n",
 " vocab_size = len(lines)\n",
-" with open(output_file, 'w') as f:\n",
+" with smart_open(output_file, 'w') as f:\n",
 " f.write('%d %d\\n' % (vocab_size, model_size))\n",
 " for line in lines:\n",
 " f.write(line.replace('\\t', ' '))\n",
@@ -709,7+709,7 @@
 " \n",
 " model_size = random_embedding.shape[0]\n",
 " vocab_size = len(np_embeddings)\n",
-" with open(output_file, 'w') as f:\n",
+" with smart_open(output_file, 'w') as f:\n",
 " f.write('%d %d\\n' % (vocab_size, model_size))\n",
 " for key, vector in np_embeddings.items():\n",
 " vector_string = ' '.join('%.6f' % value for value in vector)\n",
@@ -1113,7 +1113,7 @@
 " test_line_candidates = []\n",
 " line_count = 0\n",
 " all_nodes = set()\n",
-" with open(data_file, 'rb') as f:\n",
+" with smart_open(data_file, 'rb') as f:\n",
 " for i, line in enumerate(f):\n",
 " node_1, node_2 = line.split()\n",
 " all_nodes.update([node_1, node_2])\n",
@@ -1135,9 +1135,9 @@
 " train_line_indices = set(l for l in range(line_count) if l not in test_line_indices)\n",
 " \n",
 " train_set_nodes = set()\n",
-" with open(data_file, 'rb') as f:\n",
-" train_file = open(train_filename, 'wb')\n",
-" test_file = open(test_filename, 'wb')\n",
+" with smart_open(data_file, 'rb') as f:\n",
+" train_file = smart_open(train_filename, 'wb')\n",
+" test_file = smart_open(test_filename, 'wb')\n",
 " for i, line in enumerate(f):\n",
 " if i in train_line_indices:\n",
 " train_set_nodes.update(line.split())\n",
@@ -1169,13 +1169,13 @@
 " \"\"\"\n",
 " root_candidates = set()\n",
 " leaf_candidates = set()\n",
-" with open(data_file, 'rb') as f:\n",
+" with smart_open(data_file, 'rb') as f:\n",
 " for line in f:\n",
 " nodes = line.split()\n",
 " root_candidates.update(nodes)\n",
 " leaf_candidates.update(nodes)\n",
 " \n",
-" with open(data_file, 'rb') as f:\n",
+" with smart_open(data_file, 'rb') as f:\n",
 " for line in f:\n",
 " node_1, node_2 = line.split()\n",
 " if node_1 == node_2:\n",

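The write-side changes in this notebook apply the same swap when dumping embeddings in word2vec text format. A rough, hypothetical sketch of that pattern follows; the embeddings dict, its random values, and the output filename are illustrative stand-ins, not taken from the commit.

# Hypothetical sketch: dump a dict of numpy vectors in word2vec text format
# through smart_open, mirroring the '%d %d\n' header plus one-vector-per-line
# pattern shown in the hunks above. All names here are illustrative.
import numpy as np
from smart_open import smart_open

np_embeddings = {
    'node_a': np.random.rand(50),
    'node_b': np.random.rand(50),
}

vocab_size = len(np_embeddings)
model_size = len(next(iter(np_embeddings.values())))

with smart_open('embeddings.txt', 'w') as f:
    f.write('%d %d\n' % (vocab_size, model_size))
    for key, vector in np_embeddings.items():
        vector_string = ' '.join('%.6f' % value for value in vector)
        f.write('%s %s\n' % (key, vector_string))
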
docs/notebooks/Tensorboard_visualizations.ipynb

Lines changed: 5 additions & 4 deletions
@@ -624,6 +624,7 @@
 "import pandas as pd\n",
 "import smart_open\n",
 "import random\n",
+"from smart_open import smart_open\n",
 "\n",
 "# read data\n",
 "dataframe = pd.read_csv('movie_plots.csv')\n",
@@ -803,7 +804,7 @@
 },
 "outputs": [],
 "source": [
-"with open('movie_plot_metadata.tsv','w') as w:\n",
+"with smart_open('movie_plot_metadata.tsv','w') as w:\n",
 " w.write('Titles\\tGenres\\n')\n",
 " for i,j in zip(dataframe.Titles, dataframe.Genres):\n",
 " w.write(\"%s\\t%s\\n\" % (i,j))"
@@ -1024,14 +1025,14 @@
 "outputs": [],
 "source": [
 "# create file for tensors\n",
-"with open('doc_lda_tensor.tsv','w') as w:\n",
+"with smart_open('doc_lda_tensor.tsv','w') as w:\n",
 " for doc_topics in all_topics:\n",
 " for topics in doc_topics:\n",
 " w.write(str(topics[1])+ \"\\t\")\n",
 " w.write(\"\\n\")\n",
 " \n",
 "# create file for metadata\n",
-"with open('doc_lda_metadata.tsv','w') as w:\n",
+"with smart_open('doc_lda_metadata.tsv','w') as w:\n",
 " w.write('Titles\\tGenres\\n')\n",
 " for j, k in zip(dataframe.Titles, dataframe.Genres):\n",
 " w.write(\"%s\\t%s\\n\" % (j, k))"
@@ -1084,7 +1085,7 @@
 "\n",
 "# overwrite metadata file\n",
 "i=0\n",
-"with open('doc_lda_metadata.tsv','w') as w:\n",
+"with smart_open('doc_lda_metadata.tsv','w') as w:\n",
 " w.write('Titles\\tGenres\\n')\n",
 " for j,k in zip(dataframe.Titles, dataframe.Genres):\n",
 " w.write(\"%s\\t%s\\n\" % (''.join((str(j), str(tensors[i]))),k))\n",

docs/notebooks/WMD_tutorial.ipynb

Lines changed: 2 additions & 1 deletion
@@ -302,6 +302,7 @@
 "start = time()\n",
 "\n",
 "import json\n",
+"from smart_open import smart_open\n",
 "\n",
 "# Business IDs of the restaurants.\n",
 "ids = ['4bEjOyTaDG24SY5TxsaUNQ', '2e2e7WgqU1BnpxmQL5jbfw', 'zt1TpTuJ6y9n551sw9TaEg',\n",
@@ -310,7 +311,7 @@
 "w2v_corpus = [] # Documents to train word2vec on (all 6 restaurants).\n",
 "wmd_corpus = [] # Documents to run queries against (only one restaurant).\n",
 "documents = [] # wmd_corpus, with no pre-processing (so we can see the original documents).\n",
-"with open('/data/yelp_academic_dataset_review.json') as data_file:\n",
+"with smart_open('/data/yelp_academic_dataset_review.json', 'rb') as data_file:\n",
 " for line in data_file:\n",
 " json_line = json.loads(line)\n",
 " \n",

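The same swap applies to the JSON-lines read in this tutorial. A small sketch follows, under the assumption that a local reviews.json exists with one JSON object per line and a 'text' field per record; the Yelp dataset path in the diff above is specific to that tutorial's environment.

# Sketch of reading a JSON-lines file through smart_open in binary mode,
# as in the hunk above. 'reviews.json' and the 'text' field are assumptions.
import json
from smart_open import smart_open

texts = []
with smart_open('reviews.json', 'rb') as data_file:
    for line in data_file:
        record = json.loads(line)   # json.loads accepts bytes on Python 3.6+
        texts.append(record['text'])

print(len(texts), 'documents loaded')
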
docs/notebooks/Word2Vec_FastText_Comparison.ipynb

Lines changed: 2 additions & 1 deletion
@@ -57,11 +57,12 @@
 ],
 "source": [
 "import nltk\n",
+"from smart_open import smart_open\n",
 "nltk.download('brown') \n",
 "# Only the brown corpus is needed in case you don't have it.\n",
 "\n",
 "# Generate brown corpus text file\n",
-"with open('brown_corp.txt', 'w+') as f:\n",
+"with smart_open('brown_corp.txt', 'w+') as f:\n",
 " for word in nltk.corpus.brown.words():\n",
 " f.write('{word} '.format(word=word))\n",
 "\n",

docs/notebooks/Wordrank_comparisons.ipynb

Lines changed: 5 additions & 3 deletions
@@ -38,20 +38,21 @@
 ],
 "source": [
 "import nltk\n",
+"from smart_open import smart_open\n",
 "from gensim.parsing.preprocessing import strip_punctuation, strip_multiple_whitespaces\n",
 "\n",
 "# Only the brown corpus is needed in case you don't have it.\n",
 "nltk.download('brown') \n",
 "\n",
 "# Generate brown corpus text file\n",
-"with open('brown_corp.txt', 'w+') as f:\n",
+"with smart_open('brown_corp.txt', 'w+') as f:\n",
 " for word in nltk.corpus.brown.words():\n",
 " f.write('{word} '.format(word=word))\n",
 " f.seek(0)\n",
 " brown = f.read()\n",
 "\n",
 "# Preprocess brown corpus\n",
-"with open('proc_brown_corp.txt', 'w') as f:\n",
+"with smart_open('proc_brown_corp.txt', 'w') as f:\n",
 " proc_brown = strip_punctuation(brown)\n",
 " proc_brown = strip_multiple_whitespaces(proc_brown).lower()\n",
 " f.write(proc_brown)\n",
@@ -1004,12 +1005,13 @@
 "import copy\n",
 "import multiprocessing\n",
 "import numpy as np\n",
+"from smart_open import smart_open\n",
 "\n",
 "\n",
 "def compute_accuracies(model, freq):\n",
 " # mean_freq will contain analogies together with the mean frequency of 4 words involved\n",
 " mean_freq = {}\n",
-" with open(word_analogies_file, 'r') as r:\n",
+" with smart_open(word_analogies_file, 'r') as r:\n",
 " for i, line in enumerate(r):\n",
 " if ':' not in line:\n",
 " analogy = tuple(line.split())\n",

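The Brown-corpus cells in the last two notebooks use a slightly different variant: the file is opened in 'w+' mode, written, then rewound with seek(0) and read back within the same block. A brief sketch of that pattern, assuming nltk and the Brown corpus download; note the rewind-and-read step only works when the target is a local, seekable file, so streaming destinations such as S3 would need a separate read.

# Sketch of the write-then-read-back ('w+') pattern from the Brown corpus cells.
import nltk
from smart_open import smart_open

nltk.download('brown')  # only needed the first time

with smart_open('brown_corp.txt', 'w+') as f:
    for word in nltk.corpus.brown.words():
        f.write('{word} '.format(word=word))
    f.seek(0)          # rewind so the freshly written text can be read back
    brown = f.read()

print(len(brown), 'characters written and read back')
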