Skip to content

Commit b3e8d1b

Browse files
authored
Merge pull request #181 from dynatrace-oss/improved-consistent-weighted-sampling
Improved consistent weighted sampling
2 parents 331804a + 6cdb6ea commit b3e8d1b

12 files changed

+551
-107
lines changed

README.md

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,12 @@ To add a dependency on hash4j using Maven, use the following:
2525
<dependency>
2626
<groupId>com.dynatrace.hash4j</groupId>
2727
<artifactId>hash4j</artifactId>
28-
<version>0.13.0</version>
28+
<version>0.14.0</version>
2929
</dependency>
3030
```
3131
To add a dependency using Gradle:
3232
```gradle
33-
implementation 'com.dynatrace.hash4j:hash4j:0.13.0'
33+
implementation 'com.dynatrace.hash4j:hash4j:0.14.0'
3434
```
3535

3636
## Hash algorithms
@@ -193,11 +193,20 @@ HashValue128 hash = FileHashing.imohash1_0_2().hashFileTo128Bits(file);
193193
See also [FileHashingDemo.java](src/test/java/com/dynatrace/hash4j/file/FileHashingDemo.java).
194194

195195
## Consistent hashing
196-
This library contains an implementation of [JumpHash](https://arxiv.org/abs/1406.2294)
197-
that can be used to achieve distributed agreement when assigning hash values to a given number of buckets.
198-
The hash values are distributed uniformly over the buckets.
199-
The algorithm also minimizes the number of reassignments needed for balancing when the number of buckets changes.
200-
196+
This library contains various algorithms for the distributed agreement on the assignment of hash values to a given number of buckets.
197+
In the naive approach, the hash values are assigned to the buckets with the modulo operation according to
198+
`bucketIdx = abs(hash) % numBuckets`.
199+
If the number of buckets is changed, the bucket index will change for most hash values.
200+
With a consistent hash algorithm, the above expression can be replaced by
201+
`bucketIdx = consistentBucketHasher.getBucket(hash, numBuckets)`
202+
to minimize the number of reassignments while still ensuring a fair distribution across all buckets.
203+
204+
The following consistent hashing algorithms are available:
205+
* [JumpHash](https://arxiv.org/abs/1406.2294): This algorithm has a calculation time that scales logarithmically with the number of buckets.
206+
* [Improved Consistent Weighted Sampling](https://doi.org/10.1109/ICDM.2010.80): This algorithm is based on improved
207+
consistent weighted sampling with a constant computation time independent of the number of buckets. This algorithm is faster than
208+
JumpHash for large numbers of buckets.
209+
201210
### Usage
202211
```java
203212
// create a consistent bucket hasher

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ java {
6868
}
6969

7070
group = 'com.dynatrace.hash4j'
71-
version = '0.13.0'
71+
version = '0.14.0'
7272

7373
spotless {
7474
ratchetFrom 'origin/main'
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
/*
2+
* Copyright 2023 Dynatrace LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.dynatrace.hash4j.consistent;
17+
18+
import com.dynatrace.hash4j.random.PseudoRandomGeneratorProvider;
19+
import java.util.SplittableRandom;
20+
import org.openjdk.jmh.annotations.*;
21+
import org.openjdk.jmh.infra.Blackhole;
22+
23+
public class ConsistentJumpBucketHasherPerformanceTest {
24+
25+
private static final ConsistentBucketHasher CONSISTENT_BUCKET_HASHER =
26+
ConsistentHashing.jumpHash(PseudoRandomGeneratorProvider.splitMix64_V1());
27+
28+
@State(Scope.Thread)
29+
public static class TestState {
30+
31+
@Param({"1", "10", "100", "1000", "10000", "100000", "1000000"})
32+
int numBuckets;
33+
34+
SplittableRandom random;
35+
36+
@Setup
37+
public void init() {
38+
random = new SplittableRandom(0x87c5950e6677341eL);
39+
}
40+
}
41+
42+
@Benchmark
43+
@BenchmarkMode(Mode.AverageTime)
44+
public void getBucket(TestState testState, Blackhole blackhole) {
45+
int bucket =
46+
CONSISTENT_BUCKET_HASHER.getBucket(testState.random.nextLong(), testState.numBuckets);
47+
blackhole.consume(bucket);
48+
}
49+
}
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
/*
2+
* Copyright 2023 Dynatrace LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.dynatrace.hash4j.consistent;
17+
18+
import com.dynatrace.hash4j.random.PseudoRandomGeneratorProvider;
19+
import java.util.SplittableRandom;
20+
import org.openjdk.jmh.annotations.*;
21+
import org.openjdk.jmh.infra.Blackhole;
22+
23+
public class ImprovedConsistentWeightedSamplingPerformanceTest {
24+
25+
private static final ConsistentBucketHasher CONSISTENT_BUCKET_HASHER =
26+
ConsistentHashing.improvedConsistentWeightedSampling(
27+
PseudoRandomGeneratorProvider.splitMix64_V1());
28+
29+
@State(Scope.Thread)
30+
public static class TestState {
31+
32+
@Param({"1", "10", "100", "1000", "10000", "100000", "1000000"})
33+
int numBuckets;
34+
35+
SplittableRandom random;
36+
37+
@Setup
38+
public void init() {
39+
random = new SplittableRandom(0x87c5950e6677341eL);
40+
}
41+
}
42+
43+
@Benchmark
44+
@BenchmarkMode(Mode.AverageTime)
45+
public void getBucket(TestState testState, Blackhole blackhole) {
46+
int bucket =
47+
CONSISTENT_BUCKET_HASHER.getBucket(testState.random.nextLong(), testState.numBuckets);
48+
blackhole.consume(bucket);
49+
}
50+
}
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
/*
2+
* Copyright 2023 Dynatrace LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.dynatrace.hash4j.consistent;
17+
18+
import java.util.SplittableRandom;
19+
import org.openjdk.jmh.annotations.*;
20+
import org.openjdk.jmh.infra.Blackhole;
21+
22+
public class ModuloPerformanceTest {
23+
24+
@State(Scope.Thread)
25+
public static class TestState {
26+
27+
@Param({"1", "10", "100", "1000", "10000", "100000", "1000000"})
28+
int numBuckets;
29+
30+
SplittableRandom random;
31+
32+
@Setup
33+
public void init() {
34+
random = new SplittableRandom(0x87c5950e6677341eL);
35+
}
36+
}
37+
38+
@Benchmark
39+
@BenchmarkMode(Mode.AverageTime)
40+
public void getBucket(TestState testState, Blackhole blackhole) {
41+
int bucket = (int) ((testState.random.nextLong() & 0x7FFFFFFFFFFFFFFFL) % testState.numBuckets);
42+
blackhole.consume(bucket);
43+
}
44+
}
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
/*
2+
* Copyright 2023 Dynatrace LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.dynatrace.hash4j.consistent;
17+
18+
import java.util.SplittableRandom;
19+
import org.openjdk.jmh.annotations.*;
20+
import org.openjdk.jmh.infra.Blackhole;
21+
22+
public class RandomNumberPerformanceTest {
23+
24+
@State(Scope.Thread)
25+
public static class TestState {
26+
27+
SplittableRandom random;
28+
29+
@Setup
30+
public void init() {
31+
random = new SplittableRandom(0x87c5950e6677341eL);
32+
}
33+
}
34+
35+
@Benchmark
36+
@BenchmarkMode(Mode.AverageTime)
37+
public void getBucket(TestState testState, Blackhole blackhole) {
38+
blackhole.consume(testState.random.nextLong());
39+
}
40+
}

src/main/java/com/dynatrace/hash4j/consistent/ConsistentHashing.java

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,32 @@ private ConsistentHashing() {}
2929
* consistent hash algorithm." arXiv preprint <a
3030
* href="https://arxiv.org/abs/1406.2294">arXiv:1406.2294</a> (2014).
3131
*
32+
* <p>The average computation time depends logarithmically on the number of buckets.
33+
*
3234
* @param pseudoRandomGeneratorProvider a {@link PseudoRandomGeneratorProvider}
3335
* @return a {@link ConsistentBucketHasher}
3436
*/
3537
public static ConsistentBucketHasher jumpHash(
3638
PseudoRandomGeneratorProvider pseudoRandomGeneratorProvider) {
3739
return new ConsistentJumpBucketHasher(pseudoRandomGeneratorProvider);
3840
}
41+
42+
/**
43+
* Returns a {@link ConsistentBucketHasher}.
44+
*
45+
* <p>This algorithm is based on the method described in Sergey Ioffe, "Improved Consistent
46+
* Sampling, Weighted Minhash and L1 Sketching," 2010, doi: <a
47+
* href="https://doi.org/10.1109/ICDM.2010.80">10.1109/ICDM.2010.80</a>, which is applied to a
48+
* one-dimensional input vector whose value is equal to the number of buckets.
49+
*
50+
* <p>The computation time is constant independent of the number of buckets. This method is faster
51+
* than {@link #jumpHash(PseudoRandomGeneratorProvider)} for a large number of buckets.
52+
*
53+
* @param pseudoRandomGeneratorProvider a {@link PseudoRandomGeneratorProvider}
54+
* @return a {@link ConsistentBucketHasher}
55+
*/
56+
public static ConsistentBucketHasher improvedConsistentWeightedSampling(
57+
PseudoRandomGeneratorProvider pseudoRandomGeneratorProvider) {
58+
return new ImprovedConsistentWeightedSampling(pseudoRandomGeneratorProvider);
59+
}
3960
}

src/main/java/com/dynatrace/hash4j/consistent/ConsistentJumpBucketHasher.java

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ class ConsistentJumpBucketHasher implements ConsistentBucketHasher {
5454
// see
5555
// https://github.com/google/guava/blob/0a17f4a429323589396c38d8ce75ca058faa6c64/guava/src/com/google/common/hash/Hashing.java#L559
5656
@Override
57-
public int getBucket(long hash, int numBuckets) {
57+
public strictfp int getBucket(long hash, int numBuckets) {
5858
checkArgument(numBuckets > 0, "buckets must be positive");
5959
pseudoRandomGenerator.reset(hash);
6060

@@ -64,11 +64,10 @@ public int getBucket(long hash, int numBuckets) {
6464
// Jump from bucket to bucket until we go out of range
6565
while (true) {
6666
next = (int) ((candidate + 1) / pseudoRandomGenerator.nextDouble());
67-
if (next > candidate && next < numBuckets) {
68-
candidate = next;
69-
} else {
70-
return candidate;
71-
}
67+
if (next >= numBuckets || next <= candidate)
68+
return candidate; // second condition protects against infinite loops caused by bad random
69+
// values such as NaN or values outside of [0,1)
70+
candidate = next;
7271
}
7372
}
7473
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
/*
2+
* Copyright 2023 Dynatrace LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.dynatrace.hash4j.consistent;
17+
18+
import static com.dynatrace.hash4j.util.Preconditions.checkArgument;
19+
import static java.util.Objects.requireNonNull;
20+
21+
import com.dynatrace.hash4j.random.PseudoRandomGenerator;
22+
import com.dynatrace.hash4j.random.PseudoRandomGeneratorProvider;
23+
24+
/**
25+
* Consistent hashing algorithm based on a simplified version of the algorithm described in Sergey
26+
* Ioffe, <a href="https://ieeexplore.ieee.org/abstract/document/5693978">"Improved Consistent
27+
* Sampling, Weighted Minhash and L1 Sketching,"</a> 2010 IEEE International Conference on Data
28+
* Mining, Sydney, NSW, Australia, 2010, pp. 246-255, doi: 10.1109/ICDM.2010.80.
29+
*/
30+
class ImprovedConsistentWeightedSampling implements ConsistentBucketHasher {
31+
32+
private final PseudoRandomGenerator pseudoRandomGenerator;
33+
34+
ImprovedConsistentWeightedSampling(PseudoRandomGeneratorProvider pseudoRandomGeneratorProvider) {
35+
requireNonNull(pseudoRandomGeneratorProvider);
36+
this.pseudoRandomGenerator = pseudoRandomGeneratorProvider.create();
37+
}
38+
39+
@Override
40+
public strictfp int getBucket(long hash, int numBuckets) {
41+
checkArgument(numBuckets > 0, "buckets must be positive");
42+
pseudoRandomGenerator.reset(hash);
43+
double r = pseudoRandomGenerator.nextExponential() + pseudoRandomGenerator.nextExponential();
44+
double b = pseudoRandomGenerator.nextDouble();
45+
double t = StrictMath.floor(StrictMath.log(numBuckets) / r + b);
46+
double y = StrictMath.exp(r * (t - b));
47+
// y should always be in the range [0, numBuckets),
48+
// but could be larger due to numerical inaccuracies,
49+
// therefore limit result after rounding down to numBuckets - 1
50+
return Math.min((int) y, numBuckets - 1);
51+
}
52+
}

0 commit comments

Comments
 (0)