Skip to content

Commit 8f8064f

Browse files
authored
Merge pull request #156 from dynatrace-oss/simhash
Simhash
2 parents 274d2fb + ca7a110 commit 8f8064f

File tree

13 files changed

+400
-50
lines changed

13 files changed

+400
-50
lines changed

README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,12 @@ To add a dependency on hash4j using Maven, use the following:
2525
<dependency>
2626
<groupId>com.dynatrace.hash4j</groupId>
2727
<artifactId>hash4j</artifactId>
28-
<version>0.12.0</version>
28+
<version>0.13.0</version>
2929
</dependency>
3030
```
3131
To add a dependency using Gradle:
3232
```gradle
33-
implementation 'com.dynatrace.hash4j:hash4j:0.12.0'
33+
implementation 'com.dynatrace.hash4j:hash4j:0.13.0'
3434
```
3535

3636
## Hash algorithms
@@ -74,7 +74,8 @@ More examples can be found in [HashingDemo.java](src/test/java/com/dynatrace/has
7474
Similarity hashing algorithms compute hash signatures of sets that allow estimating set similarity without access to the original sets. The following algorithms are currently available:
7575
* [MinHash](https://en.wikipedia.org/wiki/MinHash)
7676
* [SuperMinHash](https://arxiv.org/abs/1706.05698)
77-
* FastSimHash: A fast implementation of [SimHash](https://en.wikipedia.org/wiki/SimHash)
77+
* [SimHash](https://en.wikipedia.org/wiki/SimHash)
78+
* FastSimHash: A fast implementation of SimHash using a bit hack
7879

7980
### Usage
8081

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ java {
6868
}
6969

7070
group = 'com.dynatrace.hash4j'
71-
version = '0.12.0'
71+
version = '0.13.0'
7272

7373
spotless {
7474
ratchetFrom 'origin/main'
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
/*
2+
* Copyright 2022-2023 Dynatrace LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.dynatrace.hash4j.similarity;
17+
18+
import org.openjdk.jmh.annotations.Benchmark;
19+
import org.openjdk.jmh.annotations.BenchmarkMode;
20+
import org.openjdk.jmh.annotations.Mode;
21+
import org.openjdk.jmh.infra.Blackhole;
22+
23+
public class SimHash_v1PerformanceTest extends SimilarityHashingPerformanceTest {
24+
25+
public static class State_64 extends StateBase {
26+
public State_64() {
27+
super(new SimHashPolicy_v1(64, getPseudoRandomGeneratorProvider()).createHasher());
28+
}
29+
}
30+
31+
public static class State_256 extends StateBase {
32+
public State_256() {
33+
super(new SimHashPolicy_v1(256, getPseudoRandomGeneratorProvider()).createHasher());
34+
}
35+
}
36+
37+
public static class State_1024 extends StateBase {
38+
public State_1024() {
39+
super(new SimHashPolicy_v1(1024, getPseudoRandomGeneratorProvider()).createHasher());
40+
}
41+
}
42+
43+
public static class State_4096 extends StateBase {
44+
public State_4096() {
45+
super(new SimHashPolicy_v1(4096, getPseudoRandomGeneratorProvider()).createHasher());
46+
}
47+
}
48+
49+
@Benchmark
50+
@BenchmarkMode(Mode.AverageTime)
51+
public void testSimilarityHashing_64_1(State_64 state, Blackhole blackhole) {
52+
testSimilarityHashing(state, blackhole);
53+
}
54+
55+
@Benchmark
56+
@BenchmarkMode(Mode.AverageTime)
57+
public void testSimilarityHashing_256_1(State_256 state, Blackhole blackhole) {
58+
testSimilarityHashing(state, blackhole);
59+
}
60+
61+
@Benchmark
62+
@BenchmarkMode(Mode.AverageTime)
63+
public void testSimilarityHashing_1024_1(State_1024 state, Blackhole blackhole) {
64+
testSimilarityHashing(state, blackhole);
65+
}
66+
67+
@Benchmark
68+
@BenchmarkMode(Mode.AverageTime)
69+
public void testSimilarityHashing_4096_1(State_4096 state, Blackhole blackhole) {
70+
testSimilarityHashing(state, blackhole);
71+
}
72+
}

src/jmh/java/com/dynatrace/hash4j/similarity/SimilarityHashingPerformanceTest.java

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright 2022 Dynatrace LLC
2+
* Copyright 2022-2023 Dynatrace LLC
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -27,14 +27,14 @@ public static class StateBase implements ElementHashProvider {
2727
public final SplittableRandom random = new SplittableRandom();
2828
public final SimilarityHasher similarityHasher;
2929

30-
@Param({"1", "10", "100", "1000", "10000"})
30+
@Param({"1", "10", "100", "1000", "10000", "100000"})
3131
public int numElements;
3232

3333
public StateBase(SimilarityHasher similarityHasher) {
3434
this.similarityHasher = similarityHasher;
3535
}
3636

37-
public final long[] elementHashes = new long[10000]; // maximum number of elements
37+
public final long[] elementHashes = new long[100000]; // maximum number of elements
3838

3939
@Override
4040
public long getElementHash(int elementIndex) {
@@ -52,9 +52,7 @@ protected void testSimilarityHashing(StateBase state, Blackhole blackhole) {
5252
state.elementHashes[i] = state.random.nextLong();
5353
}
5454
byte[] signature = state.similarityHasher.compute(state);
55-
for (byte b : signature) {
56-
blackhole.consume(b);
57-
}
55+
blackhole.consume(signature);
5856
}
5957

6058
protected static PseudoRandomGeneratorProvider getPseudoRandomGeneratorProvider() {

src/main/java/com/dynatrace/hash4j/similarity/FastSimHashPolicy_v1.java

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -81,15 +81,16 @@ public byte[] compute(ElementHashProvider elementHashProvider) {
8181

8282
for (int h = 0; h < numTmpCountChunks; ++h) {
8383
long randomValue = pseudoRandomGenerator.nextLong();
84+
int off = h << (6 - BULK_CONSTANT);
8485
for (int j = 0; j < (1 << (6 - BULK_CONSTANT)); ++j) {
85-
tmpCounts[(h << (6 - BULK_CONSTANT)) + j] += (randomValue >>> j) & BULK_MASK;
86+
tmpCounts[off + j] += (randomValue >>> j) & BULK_MASK;
8687
}
8788
}
8889
if (numTmpCountRemaining > 0) {
8990
long randomValue = pseudoRandomGenerator.nextLong();
91+
int off = numTmpCountChunks << (6 - BULK_CONSTANT);
9092
for (int j = 0; j < numTmpCountRemaining; ++j) {
91-
tmpCounts[(numTmpCountChunks << (6 - BULK_CONSTANT)) + j] +=
92-
(randomValue >>> j) & BULK_MASK;
93+
tmpCounts[off + j] += (randomValue >>> j) & BULK_MASK;
9394
}
9495
}
9596
c += 1;
@@ -99,16 +100,18 @@ public byte[] compute(ElementHashProvider elementHashProvider) {
99100
for (int h = 0; h < (counts.length >>> BULK_CONSTANT); ++h) {
100101
long tmp = tmpCounts[h];
101102
tmpCounts[h] = 0;
103+
int off = h << BULK_CONSTANT;
102104
for (int g = 0; g < (1 << BULK_CONSTANT); ++g) {
103-
counts[g + (h << BULK_CONSTANT)] +=
105+
counts[off + g] +=
104106
(int) ((tmp >>> (g << (6 - BULK_CONSTANT))) & TEMPORARY_COUNTER_LIMIT);
105107
}
106108
}
107109
for (int h = (counts.length >>> BULK_CONSTANT); h < tmpCounts.length; ++h) {
108110
long tmp = tmpCounts[h];
109111
tmpCounts[h] = 0;
112+
int off = h << BULK_CONSTANT;
110113
for (int g = 0; g < counts.length - (h << BULK_CONSTANT); ++g) {
111-
counts[g + (h << BULK_CONSTANT)] +=
114+
counts[off + g] +=
112115
(int) ((tmp >>> (g << (6 - BULK_CONSTANT))) & TEMPORARY_COUNTER_LIMIT);
113116
}
114117
}
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Copyright 2022-2023 Dynatrace LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.dynatrace.hash4j.similarity;
17+
18+
import static com.dynatrace.hash4j.util.Preconditions.checkArgument;
19+
import static java.util.Objects.requireNonNull;
20+
21+
import com.dynatrace.hash4j.random.PseudoRandomGenerator;
22+
import com.dynatrace.hash4j.random.PseudoRandomGeneratorProvider;
23+
import java.util.Arrays;
24+
25+
final class SimHashPolicy_v1 extends AbstractSimilarityHashPolicy {
26+
27+
public SimHashPolicy_v1(
28+
int numberOfComponents, PseudoRandomGeneratorProvider pseudoRandomGeneratorProvider) {
29+
super(numberOfComponents, 1, pseudoRandomGeneratorProvider);
30+
}
31+
32+
@Override
33+
public SimilarityHasher createHasher() {
34+
return new Hasher();
35+
}
36+
37+
private class Hasher implements SimilarityHasher {
38+
39+
private final int[] counts = new int[numberOfComponents];
40+
41+
private final PseudoRandomGenerator pseudoRandomGenerator =
42+
pseudoRandomGeneratorProvider.create();
43+
44+
public byte[] compute(ElementHashProvider elementHashProvider) {
45+
46+
requireNonNull(elementHashProvider);
47+
int numberOfElements = elementHashProvider.getNumberOfElements();
48+
checkArgument(numberOfElements > 0, "Number of elements must be positive!");
49+
50+
Arrays.fill(counts, 0);
51+
52+
int numChunks = numberOfComponents >>> 6;
53+
int numRemaining = numberOfComponents & 0x3F;
54+
55+
for (int k = 0; k < numberOfElements; ++k) {
56+
57+
long elementHash = elementHashProvider.getElementHash(k);
58+
pseudoRandomGenerator.reset(elementHash);
59+
60+
for (int j = 0; j < numChunks; j++) {
61+
long randomValue = pseudoRandomGenerator.nextLong();
62+
int off = j << 6;
63+
for (int h = 0; h < 64; ++h) {
64+
counts[off + h] += (((int) (randomValue >>> h)) & 1);
65+
}
66+
}
67+
68+
if (numRemaining > 0) {
69+
long randomValue = pseudoRandomGenerator.nextLong();
70+
int off = numChunks << 6;
71+
for (int h = 0; h < numRemaining; ++h) {
72+
counts[off + h] += (((int) (randomValue >>> h)) & 1);
73+
}
74+
}
75+
}
76+
77+
final long limit = numberOfElements >>> 1;
78+
return packedArrayHandler.create(
79+
i -> (counts[i] + (i & (~numberOfElements & 1)) > limit) ? 1L : 0L, numberOfComponents);
80+
}
81+
}
82+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*
2+
* Copyright 2022-2023 Dynatrace LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.dynatrace.hash4j.similarity;
17+
18+
import com.dynatrace.hash4j.random.PseudoRandomGeneratorProvider;
19+
20+
/** Versions of FastSimHash implementations. */
21+
public enum SimHashVersion {
22+
/**
23+
* Default version.
24+
*
25+
* <p>Not stable! Use a concrete version if compatibility is important, for example if hash
26+
* signatures are persisted.
27+
*/
28+
DEFAULT {
29+
@Override
30+
SimilarityHashPolicy create(int numberOfComponents) {
31+
return new SimHashPolicy_v1(
32+
numberOfComponents, PseudoRandomGeneratorProvider.splitMix64_V1());
33+
}
34+
},
35+
/** Version 1. */
36+
V1 {
37+
@Override
38+
SimilarityHashPolicy create(int numberOfComponents) {
39+
return new SimHashPolicy_v1(
40+
numberOfComponents, PseudoRandomGeneratorProvider.splitMix64_V1());
41+
}
42+
};
43+
44+
abstract SimilarityHashPolicy create(int numberOfComponents);
45+
}

src/main/java/com/dynatrace/hash4j/similarity/SimilarityHashing.java

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,4 +125,39 @@ static SimilarityHashPolicy fastSimHash(
125125
int numberOfComponents, FastSimHashVersion fastSimHashVersion) {
126126
return fastSimHashVersion.create(numberOfComponents);
127127
}
128+
129+
/**
130+
* Returns a {@link SimilarityHashPolicy} for SimHash as introduced in <a
131+
* href="https://dl.acm.org/doi/abs/10.1145/509907.509965?casa_token=LO2phP3daHEAAAAA%3Ad2zE2ktXOGP8JqCsSo0jqsQcfOx8-Jclq7_katfP_FRpXWJMPU3OuDE8QZATbYdePl7VRbibDUqWdQ">Moses
132+
* S. Charikar, Similarity estimation techniques from rounding algorithms, 2002.</a>
133+
*
134+
* <p>As SimHash is significantly slower than FastSimHash, prefer using {@link #fastSimHash(int)}
135+
* instead!
136+
*
137+
* @param numberOfComponents the number of components of the similarity hash
138+
* @return a policy
139+
*/
140+
static SimilarityHashPolicy simHash(int numberOfComponents) {
141+
return simHash(numberOfComponents, SimHashVersion.DEFAULT);
142+
}
143+
144+
/**
145+
* Returns a {@link SimilarityHashPolicy} for SimHash as introduced in <a
146+
* href="https://dl.acm.org/doi/abs/10.1145/509907.509965?casa_token=LO2phP3daHEAAAAA%3Ad2zE2ktXOGP8JqCsSo0jqsQcfOx8-Jclq7_katfP_FRpXWJMPU3OuDE8QZATbYdePl7VRbibDUqWdQ">Moses
147+
* S. Charikar, Similarity estimation techniques from rounding algorithms, 2002.</a>
148+
*
149+
* <p>As SimHash is significantly slower than FastSimHash, prefer using {@link #fastSimHash(int,
150+
* FastSimHashVersion)} instead!
151+
*
152+
* <p>Specifying the version of the implementation ensures compatibility with later hash4j
153+
* versions that may change the default implementation. This is especially important if the
154+
* signatures are persisted.
155+
*
156+
* @param numberOfComponents the number of components of the similarity hash
157+
* @param simHashVersion the version of the implementation
158+
* @return a policy
159+
*/
160+
static SimilarityHashPolicy simHash(int numberOfComponents, SimHashVersion simHashVersion) {
161+
return simHashVersion.create(numberOfComponents);
162+
}
128163
}
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/*
2+
* Copyright 2022-2023 Dynatrace LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.dynatrace.hash4j.similarity;
17+
18+
abstract class AbstractSimHashPolicyTest extends AbstractSimilarityHasherPolicyTest {
19+
20+
protected static double calculateComponentCollisionProbability(double cosineSimilarity) {
21+
return Math.min(1., Math.max(0.5, Math.acos(-cosineSimilarity) / Math.PI));
22+
}
23+
24+
@Override
25+
protected double calculateExpectedMatchProbability(
26+
long intersectionSize, long difference1Size, long difference2Size) {
27+
28+
double expectedCosineSimilarity =
29+
intersectionSize
30+
/ Math.sqrt(
31+
(intersectionSize + difference1Size)
32+
* (double) (intersectionSize + difference2Size));
33+
34+
return calculateComponentCollisionProbability(expectedCosineSimilarity);
35+
}
36+
37+
@Override
38+
protected int getMaxSizeForCheckSumTest() {
39+
return 300;
40+
}
41+
}

0 commit comments

Comments
 (0)