Skip to content

Commit e3a2652

Browse files
committed
Add an implementation of weighted Jaccard similarity for pairs of real vectors.
1 parent 517e8a2 commit e3a2652

File tree

2 files changed

+85
-4
lines changed

2 files changed

+85
-4
lines changed

src/similarities.jl

Lines changed: 56 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -231,8 +231,9 @@ true
231231
See also: [`MinHash`](@ref)
232232
"""
233233
function jaccard(A::Set, B::Set) :: Float64
234-
# To avoid corner cases where A and B are both empty
235234
if isempty(A)
235+
# Use the convention that if A = B = ∅, their Jaccard
236+
# similarity is zero.
236237
Float64(0)
237238
else
238239
length(A ∩ B) / length(A ∪ B)
@@ -262,20 +263,72 @@ julia> jaccard(x,y)
262263
function jaccard(x::BitArray{1}, y::BitArray{1}) :: Float64
263264
union = sum(x .| y)
264265
if union == 0
265-
# To avoid corner cases where x and y are both full of zeros
266+
# Use the convention that if x and y are full of zeros, their Jaccard
267+
# similarity is zero.
266268
Float64(0)
267269
else
268270
intersection = sum(x .& y)
269271
intersection / union
270272
end
271273
end
272274

275+
@doc raw"""
276+
function jaccard(x::AbstractVector{<:Real}, y::AbstractVector{<:Real})
277+
278+
Computes the Jaccard similarity between a pair of vectors of real numbers. Here, Jaccard similarity is defined as
279+
280+
``J(x, y) = \frac{\sum_{i} \min{(x_i,y_i)}}{\sum_{i} \max{(x_i,y_i)}}``
281+
282+
# Arguments
283+
- `x::AbstractVector{<:Real}`, `y::AbstractVector{<:Real}`: a pair of vectors containing real numbers (subtypes of `Real`).
284+
285+
# Examples
286+
```jldoctest; setup = :(using LSHFunctions)
287+
julia> x = [0.8, 0.1, 0.3, 0.4, 0.1];
288+
289+
julia> y = [1.0, 0.6, 0.0, 0.4, 0.5];
290+
291+
julia> jaccard(x,y)
292+
0.5
293+
```
294+
"""
295+
function jaccard(x::AbstractVector{T}, y::AbstractVector{<:Real}) where {T <: Real}
296+
if length(x) != length(y)
297+
DimensionMismatch("dimensions must match") |> throw
298+
end
299+
300+
intersection = T(0)
301+
union = T(0)
302+
303+
@inbounds @simd for ii = 1:length(x)
304+
if 0 ≤ x[ii] ≤ y[ii]
305+
intersection += x[ii]
306+
union += y[ii]
307+
elseif 0 ≤ y[ii] < x[ii]
308+
intersection += y[ii]
309+
union += x[ii]
310+
else
311+
ErrorException("vectors must have non-negative elements") |> throw
312+
end
313+
end
314+
315+
if union == T(0)
316+
# Use the convention that if x and y are full of zeros, their Jaccard
317+
# similarity is zero.
318+
T(union)
319+
else
320+
T(intersection / union)
321+
end
322+
end
323+
324+
jaccard(x::AbstractVector{<:Integer}, y::AbstractVector{<:AbstractFloat}) =
325+
jaccard(y, x)
326+
273327
#====================
274328
Inner product and norms
275329
====================#
276330

277331
### Inner products
278-
# TODO: docs
279332

280333
@doc raw"""
281334
inner_prod(x::AbstractVector, y::AbstractVector)

test/test_similarities.jl

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,12 +239,40 @@ end
239239

240240
@test jaccard(x, y) == jaccard(y, x) == 2 / 4
241241

242-
# When x and y are both full of false values, we define the
242+
# When x and y are both full of zero bits, we define the
243243
# Jaccard similarity between them to be zero.
244244
x = falses(5)
245245
y = falses(5)
246246
@test jaccard(x, y) == 0
247247
end
248+
249+
@testset "Compute weighted Jaccard between Real vectors" begin
250+
x = [0.8, 0.1, 0.3, 0.4, 0.1]
251+
y = [1.0, 0.6, 0.0, 0.4, 0.5]
252+
253+
@test jaccard(x, y) ==
254+
jaccard(y, x) ==
255+
(0.8+0.1+0.0+0.4+0.1) / (1.0+0.6+0.3+0.4+0.5)
256+
257+
# Test Jaccard similarity between vectors with different dtypes
258+
x = mod.(rand(Int32, 20), 10)
259+
y = mod.(rand(Int64, 20), 10)
260+
@test jaccard(Float64.(x), Float64.(y)) ≈ jaccard(x, y)
261+
@test jaccard(Float64.(x), Float64.(y)) ≈ jaccard(Float32.(x), y)
262+
@test jaccard(Float64.(x), Float64.(y)) ≈ jaccard(x, Float32.(y))
263+
@test jaccard(Float64.(x), Float64.(y)) ≈ jaccard(Float32.(x), Float64.(y))
264+
265+
# When x and y are both full of zeros, we define the Jaccard
266+
# similarity between them to be zero.
267+
x = zeros(10)
268+
y = zeros(10)
269+
@test jaccard(x, y) == 0
270+
271+
# Throw an error when any of the elements are negative, or when the
272+
# two vectors have different lengths.
273+
@test_throws(DimensionMismatch, jaccard(rand(5), rand(6)))
274+
@test_throws(ErrorException, jaccard(-ones(3), ones(3)))
275+
end
248276
end
249277

250278
@testset "Inner product similarity tests" begin

0 commit comments

Comments
 (0)