Skip to content

Commit bf5b5b3

Browse files
committed
Add plot for collision probability with SimHash.
1 parent b108b65 commit bf5b5b3

File tree

3 files changed

+121
-0
lines changed

3 files changed

+121
-0
lines changed

docs/Manifest.toml

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,44 @@ git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c"
99
uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232"
1010
version = "0.5.8"
1111

12+
[[ColorTypes]]
13+
deps = ["FixedPointNumbers", "Random"]
14+
git-tree-sha1 = "b9de8dc6106e09c79f3f776c27c62360d30e5eb8"
15+
uuid = "3da002f7-5984-5a60-b8a6-cbb66c0b333f"
16+
version = "0.9.1"
17+
18+
[[Colors]]
19+
deps = ["ColorTypes", "FixedPointNumbers", "InteractiveUtils", "Printf", "Reexport"]
20+
git-tree-sha1 = "177d8b959d3c103a6d57574c38ee79c81059c31b"
21+
uuid = "5ae59095-9a9b-59fe-a467-6f913c188581"
22+
version = "0.11.2"
23+
24+
[[Compat]]
25+
deps = ["Base64", "Dates", "DelimitedFiles", "Distributed", "InteractiveUtils", "LibGit2", "Libdl", "LinearAlgebra", "Markdown", "Mmap", "Pkg", "Printf", "REPL", "Random", "Serialization", "SharedArrays", "Sockets", "SparseArrays", "Statistics", "Test", "UUIDs", "Unicode"]
26+
git-tree-sha1 = "ed2c4abadf84c53d9e58510b5fc48912c2336fbb"
27+
uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
28+
version = "2.2.0"
29+
30+
[[Conda]]
31+
deps = ["JSON", "VersionParsing"]
32+
git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032"
33+
uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d"
34+
version = "1.3.0"
35+
36+
[[DataStructures]]
37+
deps = ["InteractiveUtils", "OrderedCollections"]
38+
git-tree-sha1 = "b7720de347734f4716d1815b00ce5664ed6bbfd4"
39+
uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
40+
version = "0.17.9"
41+
1242
[[Dates]]
1343
deps = ["Printf"]
1444
uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
1545

46+
[[DelimitedFiles]]
47+
deps = ["Mmap"]
48+
uuid = "8bb1440f-4735-579b-a4ab-409b98df4dab"
49+
1650
[[Distributed]]
1751
deps = ["Random", "Serialization", "Sockets"]
1852
uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
@@ -38,6 +72,11 @@ version = "0.1.4"
3872
[[FileWatching]]
3973
uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
4074

75+
[[FixedPointNumbers]]
76+
git-tree-sha1 = "4aaea64dd0c30ad79037084f8ca2b94348e65eaa"
77+
uuid = "53c48c17-4a7d-5ca2-90c5-79b7896eea93"
78+
version = "0.7.1"
79+
4180
[[InteractiveUtils]]
4281
deps = ["Markdown"]
4382
uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
@@ -48,22 +87,44 @@ git-tree-sha1 = "b34d7cef7b337321e97d22242c3c2b91f476748e"
4887
uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
4988
version = "0.21.0"
5089

90+
[[LaTeXStrings]]
91+
deps = ["Compat"]
92+
git-tree-sha1 = "7ab9b8788cfab2bdde22adf9004bda7ad9954b6c"
93+
uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
94+
version = "1.0.3"
95+
5196
[[LibGit2]]
5297
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
5398

5499
[[Libdl]]
55100
uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
56101

102+
[[LinearAlgebra]]
103+
deps = ["Libdl"]
104+
uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
105+
57106
[[Logging]]
58107
uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
59108

109+
[[MacroTools]]
110+
deps = ["DataStructures", "Markdown", "Random"]
111+
git-tree-sha1 = "e2fc7a55bb2224e203bbd8b59f72b91323233458"
112+
uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
113+
version = "0.5.3"
114+
60115
[[Markdown]]
61116
deps = ["Base64"]
62117
uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
63118

64119
[[Mmap]]
65120
uuid = "a63ad114-7e13-5084-954f-fe012c677804"
66121

122+
[[OrderedCollections]]
123+
deps = ["Random", "Serialization", "Test"]
124+
git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1"
125+
uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
126+
version = "1.1.0"
127+
67128
[[Parsers]]
68129
deps = ["Dates", "Test"]
69130
git-tree-sha1 = "0139ba59ce9bc680e2925aec5b7db79065d60556"
@@ -78,6 +139,18 @@ uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
78139
deps = ["Unicode"]
79140
uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
80141

142+
[[PyCall]]
143+
deps = ["Conda", "Dates", "Libdl", "LinearAlgebra", "MacroTools", "Pkg", "Serialization", "Statistics", "Test", "VersionParsing"]
144+
git-tree-sha1 = "6e5bac1b1faf3575731a6a5b76f638f2389561d3"
145+
uuid = "438e738f-606a-5dbb-bf0a-cddfbfd45ab0"
146+
version = "1.91.2"
147+
148+
[[PyPlot]]
149+
deps = ["Colors", "LaTeXStrings", "PyCall", "Sockets", "Test", "VersionParsing"]
150+
git-tree-sha1 = "ccecc72cf5b41a5de686bd76999040050a8a3472"
151+
uuid = "d330b81b-6aea-500a-939a-2ce795aea3ee"
152+
version = "2.8.2"
153+
81154
[[REPL]]
82155
deps = ["InteractiveUtils", "Markdown", "Sockets"]
83156
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
@@ -86,6 +159,12 @@ uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
86159
deps = ["Serialization"]
87160
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
88161

162+
[[Reexport]]
163+
deps = ["Pkg"]
164+
git-tree-sha1 = "7b1d07f411bc8ddb7977ec7f377b97b158514fe0"
165+
uuid = "189a3867-3050-52da-a836-e630ba90ab69"
166+
version = "0.2.0"
167+
89168
[[SHA]]
90169
uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
91170

@@ -98,9 +177,21 @@ version = "0.1.0"
98177
[[Serialization]]
99178
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
100179

180+
[[SharedArrays]]
181+
deps = ["Distributed", "Mmap", "Random", "Serialization"]
182+
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
183+
101184
[[Sockets]]
102185
uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
103186

187+
[[SparseArrays]]
188+
deps = ["LinearAlgebra", "Random"]
189+
uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
190+
191+
[[Statistics]]
192+
deps = ["LinearAlgebra", "SparseArrays"]
193+
uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
194+
104195
[[Test]]
105196
deps = ["Distributed", "InteractiveUtils", "Logging", "Random"]
106197
uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
@@ -111,3 +202,8 @@ uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
111202

112203
[[Unicode]]
113204
uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
205+
206+
[[VersionParsing]]
207+
git-tree-sha1 = "80229be1f670524750d905f8fc8148e5a8c4537f"
208+
uuid = "81def892-9a0e-5fdd-b105-ffc91e053289"
209+
version = "1.2.0"

docs/Project.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
[deps]
22
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
33
DocumenterTools = "35a29f4d-8980-5a13-9543-d66fff28ecb8"
4+
PyPlot = "d330b81b-6aea-500a-939a-2ce795aea3ee"

docs/src/similarities/cosine.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,30 @@ julia> length(hashes)
9494
10
9595
```
9696

97+
The probability of a hash collision (for a single hash) is
98+
99+
```math
100+
Pr[h(x) = h(y)] = 1 - \frac{\theta}{\pi}
101+
```
102+
103+
where ``\theta = \text{arccos}(\text{cossim}(x,y))`` is the angle between ``x`` and ``y``. This collision probability is shown in the plot below.
104+
105+
```@eval
106+
using PyPlot, LSH
107+
hashfn = SimHash()
108+
x = range(-1, 1; length=1024)
109+
y = [LSH.single_hash_collision_probability(hashfn, xii) for xii in x]
110+
111+
plot(x, y)
112+
title("Probability of hash collision for SimHash")
113+
xlabel(raw"$cossim(x,y)$")
114+
ylabel(raw"$Pr[h(x) = h(y)]$")
115+
116+
savefig("simhash_collision_probability.svg")
117+
```
118+
119+
![Probability of collision for SimHash](simhash_collision_probability.svg)
120+
97121
### Footnotes
98122

99123
[^1]: Moses S. Charikar. *Similarity estimation techniques from rounding algorithms*. In Proceedings of the Thirty-Fourth Annual ACM Symposium on Theory of Computing, STOC '02, page 380–388, New York, NY, USA, 2002. Association for Computing Machinery. 10.1145/509907.509965.

0 commit comments

Comments
 (0)