Implement collision_probability for LpHash when sim == 0. Fixes issue #13.

kernelmethod · kernelmethod · commit 2426638a8f1f · 2020-02-09T13:15:39.000-07:00
diff --git a/src/hashes/lphash.jl b/src/hashes/lphash.jl
@@ -185,19 +185,25 @@ n_hashes(h::LpHash) = length(h.shift)
 hashtype(::LpHash) = Int32
 
 # See Section 3.2 of the reference paper
-function single_hash_collision_probability(hashfn::LpHash, sim::Real)
+function single_hash_collision_probability(hashfn::LpHash, sim::T) where {T <: Real}
+    ### If sim ≈ 0, the integral below cannot be computed numerically (both the
+    ### integrand and the rescaling divide by sim), but the probability is one.
+    if sim ≈ T(0)
+        return T(1)
+    end
+
     ### Compute the collision probability for a single hash function
     distr, scale = hashfn.distr, hashfn.scale
-    integral, err = quadgk(x -> pdf(distr, x/sim) * (1 - x/scale),
-                           0, scale, rtol=1e-5)
+    integral, err = quadgk(x -> pdf(distr, x/sim) * (T(1) - x/scale),
+                           T(0), T(scale), rtol=1e-5)
     integral = integral ./ sim
 
     # Note that from the reference for the L^p LSH family, we're supposed to
     # integrate over the p.d.f. for the _absolute value_ of the underlying
     # random variable, rather than the raw p.d.f. Luckily, all of the
     # distributions we have to deal with here are symmetric and centered at
     # zero, so all we have to do is multiply the integral by two.
-    single_hash_prob = integral .* 2
+    single_hash_prob = T(integral .* 2)
 end
 
 function similarity(hashfn::LpHash)
diff --git a/test/hashes/test_lphash.jl b/test/hashes/test_lphash.jl
@@ -123,6 +123,20 @@ Tests
             @test size(hashfn.coeff) == (n_hashes, 16)
         end
     end
+
+    @testset "collision_probability works correctly" begin
+        hashfn = L1Hash()
+
+        # collision_probability should be 1 for two inputs of distance zero
+        x = rand(4)
+        @test collision_probability(hashfn, x, x) ≈ 1.0
+
+        # collision_probability with n_hashes=N should be the same as
+        # collision_probability with n_hashes=1, raised to the power N
+        y = rand(4)
+        @test collision_probability(hashfn, x, y; n_hashes=10) ≈
+              collision_probability(hashfn, x, y; n_hashes=1)^10
+    end
 end