Skip to content

Commit 9c29ee0

Browse files
committed
Implement weighted Jaccard similarity between sets. Along with 517e8a2 and e3a2652, fixes #23.
1 parent e3a2652 commit 9c29ee0

File tree

2 files changed

+75
-14
lines changed

2 files changed

+75
-14
lines changed

src/similarities.jl

Lines changed: 58 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -212,10 +212,7 @@ Computes the Jaccard similarity between sets ``A`` and ``B``, which is defined a
212212
``\text{Jaccard}(A,B) = \frac{\left|A \cap B\right|}{\left|A \cup B\right|}``
213213
214214
# Arguments
215-
- `A::Set`, `B::Set`: the two sets with which to compute Jaccard similarity.
216-
217-
# Returns
218-
`Float64`: the Jaccard similarity between sets `A` and `B`, which is between `0` and `1`.
215+
- `A::Set`, `B::Set`: two sets whose Jaccard similarity we would like to compute.
219216
220217
# Examples
221218
```jldoctest; setup = :(using LSHFunctions)
@@ -243,9 +240,9 @@ end
243240
@doc raw"""
244241
function jaccard(x::BitArray{1}, y::BitArray{1})
245242
246-
Computes the Jaccard similarity between a pair of binary vectors. Here, Jaccard similarity is defined as
243+
Computes the Jaccard similarity between a pair of binary vectors:
247244
248-
``J(x, y) = \\frac{\\sum_{i} \\min{(x_i,y_i)}}{\\sum_{i} \\max{(x_i,y_i)}}``
245+
``J(x, y) = \frac{\sum_{i} \min{(x_i,y_i)}}{\sum_{i} \max{(x_i,y_i)}}``
249246
250247
# Arguments
251248
- `x::BitArray{1}`, `y::BitArray{1}`: two binary vectors, in the form of `BitArray`s.
@@ -275,9 +272,9 @@ end
275272
@doc raw"""
276273
function jaccard(x::AbstractVector{<:Real}, y::AbstractVector{<:Real})
277274
278-
Computes the Jaccard similarity between a pair of vectors of real numbers. Here, Jaccard similarity is defined as
275+
Computes the Jaccard similarity between a pair of vectors of real numbers:
279276
280-
``J(x, y) = \\frac{\\sum_{i} \\min{(x_i,y_i)}}{\\sum_{i} \\max{(x_i,y_i)}}``
277+
``J(x, y) = \frac{\sum_{i} \min{(x_i,y_i)}}{\sum_{i} \max{(x_i,y_i)}}``
281278
282279
# Arguments
283280
- `x::AbstractVector{<:Real}`, `y::AbstractVector{<:Real}`: a pair of vectors containing real numbers (subtypes of `Real`).
@@ -292,7 +289,8 @@ julia> jaccard(x,y)
292289
0.5
293290
```
294291
"""
295-
function jaccard(x::AbstractVector{T}, y::AbstractVector{<:Real}) where {T <: Real}
292+
function jaccard(x::AbstractVector{T},
293+
y::AbstractVector) :: Float64 where {T <: Real}
296294
if length(x) != length(y)
297295
DimensionMismatch("dimensions must match") |> throw
298296
end
@@ -315,15 +313,64 @@ function jaccard(x::AbstractVector{T}, y::AbstractVector{<:Real}) where {T <: Re
315313
if union == T(0)
316314
# Use the convention that if x and y are full of zeros, their Jaccard
317315
# similarity is zero.
318-
T(union)
316+
Float64(0)
319317
else
320-
T(intersection / union)
318+
Float64(intersection / union)
321319
end
322320
end
323321

324322
# Mixed integer / floating-point input: forward to `jaccard` with the two
# arguments swapped, so that the floating-point vector is passed first.
function jaccard(x::AbstractVector{<:Integer},
                 y::AbstractVector{<:AbstractFloat})
    return jaccard(y, x)
end
326324

325+
@doc raw"""
    function jaccard(A::Set{<:K},
                     B::Set{<:K},
                     weights::Dict{K,V}) where {K,V<:Real}

Computes the weighted Jaccard similarity between two sets:

``J(A, B) = \frac{\sum_{x\in A\cap B} w_x}{\sum_{y\in A\cup B} w_y}``

# Arguments
- `A::Set`, `B::Set`: two sets whose Jaccard similarity we would like to compute.
- `weights::Dict`: a dictionary mapping symbols in the sets `A` and `B` to numerical weights. These weights must be non-negative.

# Returns
`Float64`: the weighted Jaccard similarity between `A` and `B`. By convention the result is `0` when the total weight of ``A \cup B`` is zero (in particular when both sets are empty).

# Throws
- `ErrorException`: if the weight of any element of ``A \cup B`` is negative.
- `KeyError`: if an element of ``A \cup B`` has no entry in `weights`.

# Examples
```jldoctest; setup = :(using LSHFunctions)
julia> A = Set(["a", "b", "c"]);

julia> B = Set(["b", "c", "d"]);

julia> W = Dict("a" => 0.2, "b" => 2.4, "c" => 0.6, "d" => 1.8);

julia> jaccard(A,B,W)
0.6
```
"""
function jaccard(A::Set{<:K},
                 B::Set{<:K},
                 weights::Dict{K,V}) :: Float64 where {K,V<:Real}

    union_weight = zero(V)
    intersection_weight = zero(V)

    # A single pass over A ∪ B validates every weight and accumulates both
    # totals. Accumulating by hand (instead of calling `sum` on a generator)
    # keeps the result well-defined when A ∩ B is empty.
    for el in A ∪ B
        w = weights[el]
        if w < 0
            ErrorException("weights must be non-negative") |> throw
        end
        union_weight += w

        # Elements present in both sets also count towards the numerator.
        if el in A && el in B
            intersection_weight += w
        end
    end

    # By convention, if A = B = ∅ (or every weight involved is zero), their
    # Jaccard similarity is zero. This also avoids dividing by zero.
    if union_weight == zero(V)
        Float64(0)
    else
        Float64(intersection_weight / union_weight)
    end
end
373+
327374
#====================
328375
Inner product and norms
329376
====================#

test/test_similarities.jl

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ end
246246
@test jaccard(x, y) == 0
247247
end
248248

249-
@testset "Compute weighted Jaccard between Real vectors" begin
249+
@testset "Compute weighted Jaccard similarity between Real vectors" begin
250250
x = [0.8, 0.1, 0.3, 0.4, 0.1]
251251
y = [1.0, 0.6, 0.0, 0.4, 0.5]
252252

@@ -258,8 +258,8 @@ end
258258
x = mod.(rand(Int32, 20), 10)
259259
y = mod.(rand(Int64, 20), 10)
260260
@test jaccard(Float64.(x), Float64.(y)) ≈ jaccard(x, y)
261-
@test jaccard(Float64.(x), Float64.(y)) ≈ jaccard(Float32.(x), y)
262-
@test jaccard(Float64.(x), Float64.(y)) ≈ jaccard(x, Float32.(y))
261+
@test isapprox(jaccard(Float64.(x), Float64.(y)), jaccard(Float32.(x), y), atol=1e-8)
262+
@test isapprox(jaccard(Float64.(x), Float64.(y)), jaccard(x, Float32.(y)), atol=1e-8)
263263
@test jaccard(Float64.(x), Float64.(y)) ≈ jaccard(Float32.(x), Float64.(y))
264264

265265
# Define the Jaccard similarity between pairs of Real vectors
@@ -273,6 +273,20 @@ end
273273
@test_throws(DimensionMismatch, jaccard(rand(5), rand(6)))
274274
@test_throws(ErrorException, jaccard(-ones(3), ones(3)))
275275
end
276+
277+
@testset "Compute weighted Jaccard similarity between Sets" begin
278+
A = Set(["a", "b", "c"])
279+
B = Set(["b", "c", "d"])
280+
W = Dict("a" => 0.2, "b" => 2.4, "c" => 0.6, "d" => 1.8)
281+
282+
@test jaccard(A, B, W) ≈
283+
jaccard(B, A, W) ≈
284+
(2.4 + 0.6) / (0.2 + 2.4 + 0.6 + 1.8)
285+
286+
# We should throw an error when any of the weights are negative
287+
W["a"] = -1.0
288+
@test_throws(ErrorException, jaccard(A, B, W))
289+
end
276290
end
277291

278292
@testset "Inner product similarity tests" begin

0 commit comments

Comments
 (0)