Skip to content

Commit fb3f303

Browse files
committed
Merge branch 'release-1.0.1'
2 parents 78da89a + b5744ba commit fb3f303

File tree

7 files changed

+190
-9
lines changed

7 files changed

+190
-9
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ New features:
1212
* Add Author-topic modeling (@olavurmortensen,[#893](https://github.com/RaRe-Technologies/gensim/pull/893))
1313
* Add FastText word embedding wrapper (@Jayantj,[#847](https://github.com/RaRe-Technologies/gensim/pull/847))
1414
* Add WordRank word embedding wrapper (@parulsethi,[#1066](https://github.com/RaRe-Technologies/gensim/pull/1066), [#1125](https://github.com/RaRe-Technologies/gensim/pull/1125))
15+
* Add VarEmbed word embedding wrapper (@anmol01gulati, [#1067](https://github.com/RaRe-Technologies/gensim/pull/1067))
1516
* Add sklearn wrapper for LDAModel (@AadityaJ,[#932](https://github.com/RaRe-Technologies/gensim/pull/932))
1617

1718
Deprecated features:
@@ -49,6 +50,7 @@ Tutorial and doc improvements:
4950
* Fix typos in Author-topic tutorial (@Fil,[#1102](https://github.com/RaRe-Technologies/gensim/pull/1102))
5051
* Address benchmark inconsistencies in Annoy tutorial (@droudy,[#1113](https://github.com/RaRe-Technologies/gensim/pull/1113))
5152
* Add note about Annoy speed depending on numpy BLAS setup in annoytutorial.ipynb (@greninja,[#1137](https://github.com/RaRe-Technologies/gensim/pull/1137))
53+
* Fix dependencies description on doc2vec-IMDB notebook (@luizcavalcanti, [#1132](https://github.com/RaRe-Technologies/gensim/pull/1132))
5254
* Add documentation for WikiCorpus metadata. (@kirit93, [#1163](https://github.com/RaRe-Technologies/gensim/pull/1163))
5355

5456

docs/src/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
# The short X.Y version.
5555
version = '1.0'
5656
# The full version, including alpha/beta/rc tags.
57-
release = '1.0.0'
57+
release = '1.0.1'
5858

5959
# The language for content autogenerated by Sphinx. Refer to documentation
6060
# for a list of supported languages.

gensim/models/wrappers/fasttext.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -277,14 +277,13 @@ def load_dict(self, file_handle):
277277
assert len(self.wv.vocab) == vocab_size, 'mismatch between vocab sizes'
278278
ntokens, = self.struct_unpack(file_handle, '@q')
279279
for i in range(nwords):
280-
word = ''
281-
char, = self.struct_unpack(file_handle, '@c')
282-
char = char.decode()
280+
word_bytes = b''
281+
char_byte = file_handle.read(1)
283282
# Read vocab word
284-
while char != '\x00':
285-
word += char
286-
char, = self.struct_unpack(file_handle, '@c')
287-
char = char.decode()
283+
while char_byte != b'\x00':
284+
word_bytes += char_byte
285+
char_byte = file_handle.read(1)
286+
word = word_bytes.decode('utf8')
288287
count, _ = self.struct_unpack(file_handle, '@ib')
289288
_ = self.struct_unpack(file_handle, '@i')
290289
assert self.wv.vocab[word].index == i, 'mismatch between gensim word index and fastText word index'
83.2 KB
Binary file not shown.
Lines changed: 172 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,172 @@
1+
171 2
2+
ji -1.5308 2.0551
3+
který -0.99211 1.4997
4+
jen -1.1228 1.3667
5+
podle -1.1469 1.4473
6+
zde -1.0191 1.4011
7+
už -0.91921 1.3531
8+
být -1.0086 1.4582
9+
více -1.1058 1.3376
10+
bude -1.2032 1.7383
11+
již -1.3136 1.4792
12+
než -1.0664 1.6635
13+
vás -1.1113 1.5703
14+
by -1.1698 1.966
15+
které -1.1295 1.6275
16+
co -0.93518 1.1776
17+
nebo -1.0791 1.5071
18+
ten -1.1881 1.415
19+
tak -1.4548 1.8457
20+
má -1.0658 1.5255
21+
při -1.3464 1.6107
22+
od -0.79486 1.5585
23+
po -1.2758 1.9186
24+
tipy -0.69335 1.0799
25+
ještě -0.87116 1.1618
26+
až -1.2688 1.6518
27+
bez -0.99627 1.423
28+
také -1.141 1.4808
29+
pouze -0.94181 1.4076
30+
první -1.1166 1.5035
31+
vaše -0.9672 1.4975
32+
která -1.1102 1.5806
33+
nás -1.1328 1.5253
34+
nový -0.85553 1.1462
35+
jsou -1.0792 1.8008
36+
pokud -1.0427 1.3178
37+
může -1.1269 1.419
38+
strana -0.84973 1.1957
39+
jeho -1.1644 1.5879
40+
své -1.0546 1.6185
41+
jiné -0.95046 1.2816
42+
zprávy -0.88762 1.3374
43+
nové -1.0588 1.619
44+
není -1.0321 1.5566
45+
tomu -1.0753 1.5211
46+
ona -1.21 1.6992
47+
ono -1.0733 1.6574
48+
oni -1.1153 1.643
49+
ony -1.0926 1.5244
50+
my -0.92689 1.6378
51+
vy -1.3708 1.8
52+
jí -1.205 1.6606
53+
mě -0.96436 1.4713
54+
mne -1.0956 1.6333
55+
jemu -1.1181 1.4661
56+
on -1.0062 1.4124
57+
těm -0.90732 1.2586
58+
těmu -0.90621 1.4096
59+
němu -1.0823 1.4396
60+
němuž -1.0786 1.3892
61+
jehož -1.1649 1.4418
62+
jíž -1.0574 1.6338
63+
jelikož -1.0449 1.3625
64+
jež -1.2657 1.7032
65+
jakož -1.3373 1.6112
66+
načež -1.0127 1.3696
67+
ze -1.1784 1.7095
68+
jak -1.2097 1.5224
69+
další -0.7288 0.96256
70+
ale -1.1029 1.4153
71+
si -1.1097 1.5884
72+
se -1.2981 1.7707
73+
ve -1.256 1.7985
74+
to -1.6894 2.2424
75+
jako -1.2333 1.5942
76+
za -1.0376 1.6162
77+
zpět -0.83657 1.354
78+
jejich -0.97548 1.4219
79+
do -0.93685 1.4001
80+
pro -1.4367 1.9498
81+
je -1.9446 2.5147
82+
na -1.5543 2.2901
83+
atd -0.98175 1.3697
84+
atp -0.83266 1.1085
85+
jakmile -1.0954 1.2764
86+
přičemž -1.0533 1.4279
87+
já -1.1496 1.4432
88+
nám -1.0246 1.6043
89+
jej -1.203 1.6252
90+
zda -0.93651 1.2363
91+
proč -0.90395 1.3144
92+
máte -0.99962 1.4802
93+
tato -1.3248 1.5575
94+
kam -0.63468 1.246
95+
tohoto -0.9737 1.3422
96+
kdo -0.88982 1.4152
97+
kteří -0.92973 1.4696
98+
mi -1.343 1.7217
99+
tyto -0.99375 1.3067
100+
tom -1.1636 1.608
101+
tomuto -1.0103 1.3488
102+
mít -1.1538 1.6326
103+
nic -0.76497 1.0685
104+
proto -1.1781 1.6367
105+
kterou -1.0561 1.563
106+
byla -0.9338 1.7033
107+
toho -1.1263 1.5702
108+
protože -1.1777 1.4984
109+
asi -1.0555 1.4401
110+
budeš -0.98208 1.5432
111+
s -1.3733 1.6447
112+
k -1.0223 1.6019
113+
o -1.4531 1.879
114+
i -1.0985 1.2956
115+
u -0.91038 1.6173
116+
v -1.2536 1.5998
117+
z -0.96962 1.7437
118+
dnes -0.92891 1.2478
119+
cz -0.84461 1.0881
120+
tímto -0.98475 1.3061
121+
ho -0.74774 1.4925
122+
budem -1.0178 1.4333
123+
byli -0.90776 1.4799
124+
jseš -1.0297 1.4975
125+
můj -0.891 1.2674
126+
svým -1.0586 1.5377
127+
ta -1.4932 2.0156
128+
tomto -1.1626 1.5135
129+
tohle -1.2215 1.6529
130+
tuto -1.0516 1.3583
131+
neg -0.94527 1.5529
132+
pod -1.0601 1.578
133+
téma -0.93273 1.3456
134+
mezi -0.96807 1.3465
135+
přes -1.1927 1.5099
136+
ty -1.3733 1.7374
137+
pak -1.0392 1.5592
138+
vám -0.89801 1.3586
139+
ani -1.2113 1.5634
140+
když -1.0124 1.5112
141+
však -0.75634 1.1299
142+
či -0.79489 1.2817
143+
jsem -1.0435 1.4903
144+
tento -1.0861 1.5053
145+
článku -0.93302 1.3758
146+
články -0.98897 1.4387
147+
aby -1.0874 1.6114
148+
jsme -1.0547 1.6846
149+
před -1.0538 1.5186
150+
pta -1.062 1.6063
151+
a -1.3116 2.0391
152+
aj -1.1578 1.5193
153+
naši -1.2075 1.3714
154+
napište -1.0436 1.4646
155+
re -1.3115 1.5453
156+
což -1.1731 1.3545
157+
tím -1.0296 1.5885
158+
takže -1.1014 1.3574
159+
svých -0.82606 1.1187
160+
její -1.1029 1.3696
161+
svými -1.1052 1.4953
162+
jste -1.1003 1.7465
163+
byl -0.89449 1.4131
164+
tu -1.1255 1.5505
165+
tedy -1.1693 1.6446
166+
teto -1.2134 1.546
167+
bylo -0.86091 1.3805
168+
kde -1.3468 1.7507
169+
ke -1.0699 1.6688
170+
pravé -0.9391 1.5172
171+
nad -1.3404 1.7661
172+
nejsou -0.85023 1.5033

gensim/test/test_fasttext_wrapper.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -120,6 +120,14 @@ def testLoadFastTextFormat(self):
120120
self.assertEqual(self.test_model.wv.syn0_all.shape, (self.test_model.num_ngram_vectors, model_size))
121121
self.model_sanity(model)
122122

123+
def testLoadModelWithNonAsciiVocab(self):
124+
model = fasttext.FastText.load_fasttext_format(datapath('non_ascii_fasttext'))
125+
self.assertTrue(u'který' in model)
126+
try:
127+
vector = model[u'který']
128+
except UnicodeDecodeError:
129+
self.fail('Unable to access vector for non-ascii word')
130+
123131
def testNSimilarity(self):
124132
"""Test n_similarity for in-vocab and out-of-vocab words"""
125133
# In vocab, sanity check

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ def finalize_options(self):
228228

229229
setup(
230230
name='gensim',
231-
version='1.0.0',
231+
version='1.0.1',
232232
description='Python framework for fast Vector Space Modelling',
233233
long_description=LONG_DESCRIPTION,
234234

0 commit comments

Comments
 (0)