From 940eda8b4b17429e7379dc7157cdae46ecb4e61a Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 23 May 2024 10:35:53 -0700 Subject: [PATCH 1/4] circular table, no extension slots --- .../filters/quotientfilter/Iterator.java | 7 +- .../quotientfilter/QuotientFilter.java | 201 +++++------------- .../quotientfilter/QuotientFilterTest.java | 82 ++++--- 3 files changed, 97 insertions(+), 193 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java index ea5039aa6..6b76a1789 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java @@ -49,7 +49,7 @@ void clear() { boolean next() { - if (index == qf.get_logical_num_slots_plus_extensions()) { + if (index == qf.get_num_slots()) { return false; } @@ -59,9 +59,9 @@ boolean next() { boolean shifted = (slot & 4) != 0; - while (!occupied && !continuation && !shifted && index < qf.get_logical_num_slots_plus_extensions()) { + while (!occupied && !continuation && !shifted && index < qf.get_num_slots()) { index++; - if (index == qf.get_logical_num_slots_plus_extensions()) { + if (index == qf.get_num_slots()) { return false; } slot = qf.get_slot(index); @@ -99,5 +99,4 @@ void print() { System.out.println("original slot: " + index + " " + bucket_index); } - } diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 9e0f08ce5..3af8affcd 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -17,30 +17,20 @@ * under the License. */ - package org.apache.datasketches.filters.quotientfilter; import java.util.ArrayList; import java.util.HashSet; import java.util.Set; -import org.apache.datasketches.filters.quotientfilter.Bitmap; -import org.apache.datasketches.memory.XxHash; - public class QuotientFilter extends Filter { int bitPerEntry; int fingerprintLength; int power_of_two_size; - int num_extension_slots; int num_existing_entries; Bitmap filter; - // These three fields are used to prevent throwing exceptions when the buffer space of the filter is exceeded - long last_empty_slot; - long last_cluster_start; - public long backward_steps; - double expansion_threshold; long max_entries_before_expansion; boolean expand_autonomously; @@ -60,11 +50,7 @@ public QuotientFilter(int power_of_two, int bits_per_entry) { power_of_two_size = power_of_two; bitPerEntry = bits_per_entry; fingerprintLength = bits_per_entry - 3; - long init_size = 1L << power_of_two; - //System.out.println("Init size: " + init_size); - num_extension_slots = power_of_two * 2; - // System.out.println("Extension slots: " + num_extension_slots); - + final long init_size = 1L << power_of_two; filter = make_filter(init_size, bits_per_entry); expansion_threshold = 0.8; @@ -75,19 +61,6 @@ public QuotientFilter(int power_of_two, int bits_per_entry) { original_fingerprint_size = fingerprintLength; num_expansions = 0; //hash_type = XxHash.hashLong ; //HashType.xxh; - - last_empty_slot = init_size + num_extension_slots - 1; - last_cluster_start = 0; - backward_steps = 0; - //measure_num_bits_per_entry(); - } - - //nuevo - void update(long init_size) - { - last_empty_slot = init_size + num_extension_slots - 1; - last_cluster_start = 0; - backward_steps = 0; } public boolean rejuvenate(long key) { @@ -114,7 +87,7 @@ Bitmap make_filter(long init_size, int bits_per_entry) { // System.out.println(init_size ) ; // System.out.println(num_extension_slots); // System.out.println("Making BitVector with: " + (init_size + num_extension_slots) + "SLOTS"); - return new QuickBitVectorWrapper(bits_per_entry, init_size + num_extension_slots); + return new QuickBitVectorWrapper(bits_per_entry, init_size); } public int get_fingerprint_length() { @@ -126,13 +99,6 @@ public int get_fingerprint_length() { bitPerEntry = bits_per_entry; fingerprintLength = bits_per_entry - 3; filter = bitmap; - num_extension_slots = power_of_two * 2; - - //nuevo - long init_size = 1L << power_of_two; - last_empty_slot = init_size + num_extension_slots - 1; - last_cluster_start = 0; - backward_steps = 0; } boolean expand() { @@ -155,18 +121,17 @@ protected static double measure_num_bits_per_entry(QuotientFilter current, Array for (QuotientFilter q : other_filters) { //q.print_filter_summary(); //System.out.println(); - long q_num_entries = q.get_num_entries(false); - num_entries += q_num_entries; + num_entries += q.get_num_entries(false); } long init_size = 1L << current.power_of_two_size; - long num_bits = current.bitPerEntry * init_size + current.num_extension_slots * current.bitPerEntry; + long num_bits = current.bitPerEntry * init_size; for (QuotientFilter q : other_filters) { init_size = 1L << q.power_of_two_size; - num_bits += q.bitPerEntry * init_size + q.num_extension_slots * q.bitPerEntry; + num_bits += q.bitPerEntry * init_size; } //System.out.println("total entries: \t\t" + num_entries); //System.out.println("total bits: \t\t" + num_bits); - double bits_per_entry = num_bits / num_entries; + final double bits_per_entry = num_bits / num_entries; //System.out.println("total bits/entry: \t" + bits_per_entry); //System.out.println(); return bits_per_entry; @@ -174,10 +139,8 @@ protected static double measure_num_bits_per_entry(QuotientFilter current, Array // scans the quotient filter and returns the number of non-empty slots public long get_num_entries(boolean include_all_internal_filters) { - //long bits = filter.size(); - long slots = get_physcial_num_slots(); long num_entries = 0; - for (long i = 0; i < slots; i++) { + for (long i = 0; i < get_num_slots(); i++) { if (is_occupied(i) || is_continuation(i) || is_shifted(i)) { num_entries++; } @@ -193,21 +156,15 @@ public double get_utilization() { return util; } - public long get_physcial_num_slots() { - long bits = filter.size(); - return bits / bitPerEntry; - } - - // returns the number of physical slots in the filter (including the extention/buffer slots at the end) - public long get_logical_num_slots_plus_extensions() { - return (1L << power_of_two_size) + num_extension_slots; - } - // returns the number of slots in the filter without the extension/buffer slots - public long get_logical_num_slots() { + public long get_num_slots() { return 1L << power_of_two_size; } + long getMask() { + return get_num_slots() - 1; + } + // sets the metadata flag bits for a given slot index void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifted, long index) { @@ -226,8 +183,7 @@ void set_fingerprint(long index, long fingerprint) { public String get_pretty_str(boolean vertical) { StringBuffer sbr = new StringBuffer(); - long logic_slots = get_logical_num_slots(); - long all_slots = get_logical_num_slots_plus_extensions(); + long num_slots = get_num_slots(); for (long i = 0; i < filter.size(); i++) { long remainder = i % bitPerEntry; @@ -235,10 +191,8 @@ public String get_pretty_str(boolean vertical) { long slot_num = i/bitPerEntry; sbr.append(" "); if (vertical) { - if (slot_num == logic_slots ){//|| slot_num == all_slots) { + if (slot_num == num_slots) { sbr.append("\n ---------"); - } else if (slot_num == all_slots) { - sbr.append("\n d***********b"); } //sbr.append("\n" + slot_num + " "); sbr.append("\n" + String.format("%-10d", slot_num) + "\t"); @@ -283,7 +237,7 @@ void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifte // summarize some statistical measures about the filter public void print_filter_summary() { long num_entries = get_num_entries(false); - long slots = (1L << power_of_two_size) + num_extension_slots; + long slots = get_num_slots(); long num_bits = slots * bitPerEntry; System.out.println("slots:\t" + slots); System.out.println("entries:\t" + num_entries); @@ -300,14 +254,13 @@ public void print_filter_summary() { //System.out.println("avg cluster length: \t" + avg_cluster_length); } + /* + Returns the number of bits used for the filter + */ @Override - public long get_space_use(){ - /* - Returns the number of bits used for the filter - */ - long slots = (1L << power_of_two_size); // + num_extension_slots; - long num_bits = slots * bitPerEntry; - return num_bits ; + public long get_space_use() { + long num_bits = get_num_slots() * bitPerEntry; + return num_bits; } public int get_bits_per_entry() { @@ -347,7 +300,7 @@ boolean is_slot_empty(long index) { long find_cluster_start(long index) { long current_index = index; while (is_shifted(current_index)) { - current_index--; + current_index = (current_index - 1) & getMask(); } return current_index; } @@ -361,9 +314,8 @@ long find_run_start(long index) { if (is_occupied(current_index)) { runs_to_skip_counter++; } - current_index--; + current_index = (current_index - 1) & getMask(); } - last_cluster_start = current_index - 1; while (true) { if (!is_continuation(current_index)) { runs_to_skip_counter--; @@ -371,7 +323,7 @@ long find_run_start(long index) { return current_index; } } - current_index++; + current_index = (current_index + 1) & getMask(); } } @@ -383,8 +335,8 @@ long find_first_fingerprint_in_run(long index, long fingerprint) { //System.out.println("found matching FP at index " + index); return index; } - index++; - } while (index < get_logical_num_slots_plus_extensions() && is_continuation(index)); + index = (index + 1) & getMask(); + } while (is_continuation(index)); return -1; } @@ -397,16 +349,16 @@ long decide_which_fingerprint_to_delete(long index, long fingerprint) { //System.out.println("found matching FP at index " + index); matching_fingerprint_index = index; } - index++; - } while (index < get_logical_num_slots_plus_extensions() && is_continuation(index)); + index = (index + 1) & getMask(); + } while (is_continuation(index)); return matching_fingerprint_index; } // given the start of a run, find the last slot index that still belongs to this run long find_run_end(long index) { - while(index < get_logical_num_slots_plus_extensions() - 1 && is_continuation(index+1)) { - index++; - } + do { + index = (index + 1) & getMask(); + } while(is_continuation(index)); return index; } @@ -432,7 +384,7 @@ Set get_all_fingerprints(long bucket_index) { long run_index = find_run_start(bucket_index); do { set.add(get_fingerprint(run_index)); - run_index++; + run_index = (run_index + 1) & getMask(); } while (is_continuation(run_index)); return set; } @@ -447,17 +399,7 @@ long swap_fingerprints(long index, long new_fingerprint) { // finds the first empty slot after the given slot index long find_first_empty_slot(long index) { while (!is_slot_empty(index)) { - index++; - } - return index; - } - - // moves backwards to find the first empty slot - // used as a part of the mechanism to prevent exceptions when exceeding the quotient filter's bounds - long find_backward_empty_slot(long index) { - while (index >= 0 && !is_slot_empty(index)) { - backward_steps++; - index--; + index = (index + 1) & getMask(); } return index; } @@ -465,10 +407,10 @@ long find_backward_empty_slot(long index) { // return the first slot to the right where the current run starting at the index parameter ends long find_new_run_location(long index) { if (!is_slot_empty(index)) { - index++; + index = (index + 1) & getMask(); } while (is_continuation(index)) { - index++; + index = (index + 1) & getMask(); } return index; } @@ -489,9 +431,6 @@ boolean insert_new_run(long canonical_slot, long long_fp) { // if the slot was initially empty, we can just terminate, as there is nothing to push to the right if (slot_initially_empty) { set_fingerprint(start_of_this_new_run, long_fp); - if (start_of_this_new_run == last_empty_slot) { - last_empty_slot = find_backward_empty_slot(last_cluster_start); - } num_existing_entries++; return true; } @@ -502,26 +441,16 @@ boolean insert_new_run(long canonical_slot, long long_fp) { boolean is_this_slot_empty; boolean temp_continuation = false; do { - if (current_index >= get_logical_num_slots_plus_extensions()) { - return false; - } - is_this_slot_empty = is_slot_empty(current_index); long_fp = swap_fingerprints(current_index, long_fp); - if (current_index > start_of_this_new_run) { + if (current_index != start_of_this_new_run) { set_shifted(current_index, true); - } - - if (current_index > start_of_this_new_run) { boolean current_continuation = is_continuation(current_index); set_continuation(current_index, temp_continuation); temp_continuation = current_continuation; } - current_index++; - if (current_index == last_empty_slot) { // TODO get this out of the while loop - last_empty_slot = find_backward_empty_slot(last_cluster_start); - } + current_index = (current_index + 1) & getMask(); } while (!is_this_slot_empty); num_existing_entries++; return true; @@ -535,7 +464,7 @@ boolean insert(long long_fp, long index, boolean insert_only_if_no_match) { //System.out.println("Num items: " + num_existing_entries); //System.out.println("Max items: " + max_entries_before_expansion); - if (index > last_empty_slot) { + if (index >= get_num_slots()) { return false; } boolean does_run_exist = is_occupied(index); @@ -554,7 +483,7 @@ boolean insert(long long_fp, long index, boolean insert_only_if_no_match) { return insert_fingerprint_and_push_all_else(long_fp, run_start_index); } - // insert an fingerprint as the first fingerprint of the new run and push all other entries in the cluster to the right. + // insert a fingerprint as the first fingerprint of the new run and push all other entries in the cluster to the right. boolean insert_fingerprint_and_push_all_else(long long_fp, long run_start_index) { long current_index = run_start_index; boolean is_this_slot_empty; @@ -562,14 +491,11 @@ boolean insert_fingerprint_and_push_all_else(long long_fp, long run_start_index) boolean temp_continuation = false; do { - if (current_index >= get_logical_num_slots_plus_extensions()) { - return false; - } is_this_slot_empty = is_slot_empty(current_index); - if (current_index > run_start_index) { + if (current_index != run_start_index) { set_shifted(current_index, true); } - if (current_index > run_start_index && !finished_first_run && !is_continuation(current_index)) { + if (current_index != run_start_index && !finished_first_run && !is_continuation(current_index)) { finished_first_run = true; set_continuation(current_index, true); long_fp = swap_fingerprints(current_index, long_fp); @@ -580,17 +506,14 @@ else if (finished_first_run) { temp_continuation = current_continuation; long_fp = swap_fingerprints(current_index, long_fp); } - if (current_index == last_empty_slot) { - last_empty_slot = find_backward_empty_slot(last_cluster_start); - } - current_index++; + current_index = (current_index + 1) & getMask(); } while (!is_this_slot_empty); num_existing_entries++; return true; } boolean delete(long fingerprint, long canonical_slot, long run_start_index, long matching_fingerprint_index) { - long run_end = find_run_end(matching_fingerprint_index); + final long run_end = find_run_end(matching_fingerprint_index); // the run has only one entry, we need to disable its is_occupied flag // we just remember we need to do this here, and we do it later to not interfere with counts @@ -630,43 +553,37 @@ boolean delete(long fingerprint, long canonical_slot, long run_start_index, long //boolean does_next_run_exist = !is_slot_empty(run_end + 1); //boolean is_next_run_shifted = is_shifted(run_end + 1); //if (!does_next_run_exist || !is_next_run_shifted) { - if (run_end >= get_logical_num_slots_plus_extensions()-1 || - is_slot_empty(run_end + 1) || !is_shifted(run_end + 1)) { + final long next_run_start = (run_end + 1) & getMask(); + if (is_slot_empty(next_run_start) || !is_shifted(next_run_start)) { if (turn_off_occupied) { // if we eliminated a run and now need to turn the is_occupied flag off, we do it at the end to not interfere in our counts set_occupied(canonical_slot, false); - - } - if (run_end > last_empty_slot) { - last_empty_slot = run_end; } return true; } - // we now find the start and end of the next run - long next_run_start = run_end + 1; - run_end = find_run_end(next_run_start); + final long next_run_end = find_run_end(next_run_start); // before we start processing the next run, we check whether the previous run we shifted is now back to its canonical slot // The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot, meaning it is now back in its proper place - if ( is_occupied(next_run_start - 1) && num_shifted_count - num_non_occupied == 1 ) { - set_shifted(next_run_start - 1, false); + if (is_occupied(run_end) && num_shifted_count - num_non_occupied == 1) { + set_shifted(run_end, false); } else { - set_shifted(next_run_start - 1, true); + set_shifted(run_end, true); } - for (long i = next_run_start; i <= run_end; i++) { + for (long i = next_run_start; i != ((next_run_end + 1) & getMask()); i++) { long f = get_fingerprint(i); - set_fingerprint(i - 1, f); + set_fingerprint((i - 1) & getMask(), f); if (is_continuation(i)) { - set_continuation(i-1, true); + set_continuation((i - 1) & getMask(), true); } if (!is_occupied(i)) { num_non_occupied++; } + num_shifted_count++; } - num_shifted_count += run_end - next_run_start; set_fingerprint(run_end, 0); set_shifted(run_end, false); set_continuation(run_end, false); @@ -674,9 +591,6 @@ boolean delete(long fingerprint, long canonical_slot, long run_start_index, long } boolean delete(long fingerprint, long canonical_slot) { - if (canonical_slot >= get_logical_num_slots()) { - return false; - } // if the run doesn't exist, the key can't have possibly been inserted boolean does_run_exist = is_occupied(canonical_slot); if (!does_run_exist) { @@ -692,11 +606,8 @@ boolean delete(long fingerprint, long canonical_slot) { } return delete(fingerprint, canonical_slot, run_start_index, matching_fingerprint_index); - } - - /* Performs the modular arithmetic of large_hash % bits_per_entry and uses this as the slot_index */ @@ -792,8 +703,6 @@ protected boolean _search(long large_hash) { return search(fingerprint, slot_index); } - - public boolean get_bit_at_offset(int offset) { return filter.get(offset); } @@ -807,7 +716,7 @@ public void compute_statistics() { int current_run_length = 0; int current_cluster_length = 0; - long num_slots = get_logical_num_slots_plus_extensions(); + long num_slots = get_num_slots(); for (long i = 0; i < num_slots; i++) { boolean occupied = is_occupied(i); @@ -941,5 +850,3 @@ else if (occupied && continuation && shifted ) { // continuation of run // } } - - diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 00b085fef..6fc24b10b 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -37,48 +37,47 @@ public class QuotientFilterTest { */ @Test static public void WikiInsertionTest() { - int bits_per_entry = 8; // 8 bits per entry => 5 bits fingerprints, resolved internally in the filter. + int bits_per_entry = 6; // 6 bits per entry => 3 bits fingerprint, resolved internally in the filter. int num_entries_power = 3; - int num_entries = (int)Math.pow(2, num_entries_power); QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry); - // this test does not need different fingerprints as it is testing the slot locations and metadata bits. - long fingerprint0 = 0; - long fingerprint1 = (1 << bits_per_entry) - 1; - - /* - The expected sketch is - 0 000 00000 - 1 100 00000 - 2 111 00000 - 3 011 00000 - 4 101 00000 - 5 001 11111 - 6 000 00000 - 7 100 00000 - */ - qf.insert(fingerprint0, 1, false); - qf.insert(fingerprint1, 4, false); // 11111 is inserted at slot 45 but pushed to slot 5 - qf.insert(fingerprint0, 7, false); - qf.insert(fingerprint0, 1, false); - qf.insert(fingerprint0, 2, false); - qf.insert(fingerprint0, 1, false); + final int A = 1; + final int B = 2; + final int C = 3; + final int D = 4; + final int E = 5; + final int F = 6; + + qf.insert(B, 1, false); + qf.insert(E, 4, false); + qf.insert(F, 7, false); + qf.insert(C, 1, false); + qf.insert(D, 2, false); + qf.insert(A, 1, false); assertEquals(qf.num_existing_entries, 6); + assertEquals(getState(qf, 0), 0); + assertEquals(qf.get_fingerprint(0), 0); + assertEquals(getState(qf, 1), 0b100); + assertEquals(qf.get_fingerprint(1), B); // this run is not ordered, which is different from Wikipedia example + assertEquals(getState(qf, 2), 0b111); + assertEquals(qf.get_fingerprint(2), C); + assertEquals(getState(qf, 3), 0b011); + assertEquals(qf.get_fingerprint(3), A); + assertEquals(getState(qf, 4), 0b101); + assertEquals(qf.get_fingerprint(4), D); + assertEquals(getState(qf, 5), 0b001); + assertEquals(qf.get_fingerprint(5), E); + assertEquals(getState(qf, 6), 0); + assertEquals(qf.get_fingerprint(6), 0); + assertEquals(getState(qf, 7), 0b100); + assertEquals(qf.get_fingerprint(7), F); + } - - // these are the expected resulting is_occupied, is_continuation, and is_shifted bits - // for all slots contiguously. We do not store the fingerprints here - BitSet result = new BitSet(num_entries * bits_per_entry); - result = set_slot_in_test(result, bits_per_entry, 0, false, false, false, fingerprint0); - result = set_slot_in_test(result, bits_per_entry, 1, true, false, false, fingerprint0); - result = set_slot_in_test(result, bits_per_entry, 2, true, true, true, fingerprint0); - result = set_slot_in_test(result, bits_per_entry, 3, false, true, true, fingerprint0); - result = set_slot_in_test(result, bits_per_entry, 4, true, false, true, fingerprint0); - result = set_slot_in_test(result, bits_per_entry, 5, false, false, true, fingerprint1); - result = set_slot_in_test(result, bits_per_entry, 6, false, false, false, fingerprint0); - result = set_slot_in_test(result, bits_per_entry, 7, true, false, false, fingerprint0); - assertTrue(check_equality(qf, result, true)); + static public int getState(QuotientFilter filter, int slot) { + return (filter.is_occupied(slot) ? 1 : 0) << 2 + | (filter.is_continuation(slot) ? 1 : 0) << 1 + | (filter.is_shifted(slot) ? 1 : 0); } /* @@ -165,13 +164,13 @@ static public void testQuotientFilterInsertionAndIteration() { qf.insert(0, 3, false); qf.insert(0, 3, false); qf.insert(0, 4, false); - qf.insert(0, 23, false); // last key in the filter - qf.insert(0, 24, false); // outside the bounds, logical slot 14 does not exist logically, even if it might exist physically + qf.insert(0, 15, false); // last slot in the filter + qf.insert(0, 16, false); // outside the bounds Iterator it = new Iterator(qf); - int[] arr = new int[] {2, 3, 3, 4, 23}; + int[] arr = new int[] {2, 3, 3, 4, 15}; int arr_index = 0; - while (it.next()) {assertEquals(arr[arr_index++], it.bucket_index);} + while (it.next()) {assertEquals(it.bucket_index, arr[arr_index++]);} } @Test @@ -247,7 +246,7 @@ static public boolean check_equality(QuotientFilter qf, BitSet bs, boolean check } /* - Helper functino to test that no false negatives are returned. + Helper function to test that no false negatives are returned. */ static public boolean test_no_false_negatives(QuotientFilter filter, int num_entries) { HashSet added = new HashSet(); @@ -273,5 +272,4 @@ static public boolean test_no_false_negatives(QuotientFilter filter, int num_ent } return true; } - } From cf3eb3c426b9b417d86334b519c7ae0ef5accc6b Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 23 May 2024 11:53:05 -0700 Subject: [PATCH 2/4] removed unnecessary scan to compute the number of entries --- .../quotientfilter/QuotientFilter.java | 37 ++++++------------- .../quotientfilter/QuotientFilterTest.java | 2 +- 2 files changed, 13 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 3af8affcd..9a7ebf549 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -28,7 +28,7 @@ public class QuotientFilter extends Filter { int bitPerEntry; int fingerprintLength; int power_of_two_size; - int num_existing_entries; + int num_entries; Bitmap filter; double expansion_threshold; @@ -67,8 +67,8 @@ public boolean rejuvenate(long key) { return false; } - public long get_num_existing_entries() { - return num_existing_entries; + public long get_num_entries() { + return num_entries; } public long get_max_entries_before_expansion() { @@ -117,11 +117,11 @@ protected static double measure_num_bits_per_entry(QuotientFilter current, Array //System.out.println("--------------------------"); //current.print_filter_summary(); //System.out.println(); - double num_entries = current.get_num_entries(false); + double num_entries = current.get_num_entries(); for (QuotientFilter q : other_filters) { //q.print_filter_summary(); //System.out.println(); - num_entries += q.get_num_entries(false); + num_entries += q.get_num_entries(); } long init_size = 1L << current.power_of_two_size; long num_bits = current.bitPerEntry * init_size; @@ -137,21 +137,9 @@ protected static double measure_num_bits_per_entry(QuotientFilter current, Array return bits_per_entry; } - // scans the quotient filter and returns the number of non-empty slots - public long get_num_entries(boolean include_all_internal_filters) { - long num_entries = 0; - for (long i = 0; i < get_num_slots(); i++) { - if (is_occupied(i) || is_continuation(i) || is_shifted(i)) { - num_entries++; - } - } - return num_entries; - } - // returns the fraction of occupied slots in the filter public double get_utilization() { long num_logical_slots = 1L << power_of_two_size; - long num_entries = get_num_entries(false); double util = num_entries / (double) num_logical_slots; return util; } @@ -236,7 +224,6 @@ void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifte // summarize some statistical measures about the filter public void print_filter_summary() { - long num_entries = get_num_entries(false); long slots = get_num_slots(); long num_bits = slots * bitPerEntry; System.out.println("slots:\t" + slots); @@ -431,7 +418,7 @@ boolean insert_new_run(long canonical_slot, long long_fp) { // if the slot was initially empty, we can just terminate, as there is nothing to push to the right if (slot_initially_empty) { set_fingerprint(start_of_this_new_run, long_fp); - num_existing_entries++; + num_entries++; return true; } @@ -452,7 +439,7 @@ boolean insert_new_run(long canonical_slot, long long_fp) { } current_index = (current_index + 1) & getMask(); } while (!is_this_slot_empty); - num_existing_entries++; + num_entries++; return true; } @@ -461,7 +448,7 @@ boolean insert(long long_fp, long index, boolean insert_only_if_no_match) { //System.out.println("Inserting @ index " + index); //System.out.println("BoolMatch? " + insert_only_if_no_match); //System.out.println("**********"); - //System.out.println("Num items: " + num_existing_entries); + //System.out.println("Num items: " + num_entries); //System.out.println("Max items: " + max_entries_before_expansion); if (index >= get_num_slots()) { @@ -508,7 +495,7 @@ else if (finished_first_run) { } current_index = (current_index + 1) & getMask(); } while (!is_this_slot_empty); - num_existing_entries++; + num_entries++; return true; } @@ -678,7 +665,7 @@ protected boolean _insert(long large_hash, boolean insert_only_if_no_match) { System.exit(1); }*/ -// if (expand_autonomously && num_existing_entries >= max_entries_before_expansion) { +// if (expand_autonomously && num_entries >= max_entries_before_expansion) { // boolean expanded = expand(); // if (expanded) { // num_expansions++; @@ -692,7 +679,7 @@ protected boolean _insert(long large_hash, boolean insert_only_if_no_match) { // long fp_long = gen_fingerprint(large_hash); // boolean success = delete(fp_long, slot_index); // if (success) { -// num_existing_entries--; +// num_entries--; // } // return success; // } @@ -846,7 +833,7 @@ else if (occupied && continuation && shifted ) { // continuation of run // contributed 2*j / (2^j) --> 0 entries. // */ // public double get_load() { -// return num_existing_entries / (double) get_logical_num_slots(); +// return num_entries / (double) get_logical_num_slots(); // } } diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 6fc24b10b..247ef7bda 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -54,7 +54,7 @@ static public void WikiInsertionTest() { qf.insert(C, 1, false); qf.insert(D, 2, false); qf.insert(A, 1, false); - assertEquals(qf.num_existing_entries, 6); + assertEquals(qf.get_num_entries(), 6); assertEquals(getState(qf, 0), 0); assertEquals(qf.get_fingerprint(0), 0); From 68be9a8273ece5110e34f640781b0714f590e92c Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Thu, 23 May 2024 22:47:19 -0700 Subject: [PATCH 3/4] fixed deletions --- .../quotientfilter/QuotientFilter.java | 46 +++++++++---------- .../filters/quotientfilter/DeletionTests.java | 9 ++-- 2 files changed, 26 insertions(+), 29 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 9a7ebf549..93a6761c1 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -171,18 +171,14 @@ void set_fingerprint(long index, long fingerprint) { public String get_pretty_str(boolean vertical) { StringBuffer sbr = new StringBuffer(); - long num_slots = get_num_slots(); + long numBits = get_num_slots() * bitPerEntry; - for (long i = 0; i < filter.size(); i++) { + for (long i = 0; i < numBits; i++) { long remainder = i % bitPerEntry; if (remainder == 0) { long slot_num = i/bitPerEntry; sbr.append(" "); if (vertical) { - if (slot_num == num_slots) { - sbr.append("\n ---------"); - } - //sbr.append("\n" + slot_num + " "); sbr.append("\n" + String.format("%-10d", slot_num) + "\t"); } } @@ -343,9 +339,9 @@ long decide_which_fingerprint_to_delete(long index, long fingerprint) { // given the start of a run, find the last slot index that still belongs to this run long find_run_end(long index) { - do { + while (is_continuation((index + 1) & getMask())) { index = (index + 1) & getMask(); - } while(is_continuation(index)); + } return index; } @@ -451,17 +447,16 @@ boolean insert(long long_fp, long index, boolean insert_only_if_no_match) { //System.out.println("Num items: " + num_entries); //System.out.println("Max items: " + max_entries_before_expansion); - if (index >= get_num_slots()) { + if (index >= get_num_slots() || num_entries == get_num_slots()) { return false; } boolean does_run_exist = is_occupied(index); if (!does_run_exist) { - boolean val = insert_new_run(index, long_fp); - return val; + return insert_new_run(index, long_fp); } long run_start_index = find_run_start(index); - if (does_run_exist && insert_only_if_no_match) { + if (insert_only_if_no_match) { long found_index = find_first_fingerprint_in_run(run_start_index, long_fp); if (found_index > -1) { return false; @@ -500,15 +495,15 @@ else if (finished_first_run) { } boolean delete(long fingerprint, long canonical_slot, long run_start_index, long matching_fingerprint_index) { - final long run_end = find_run_end(matching_fingerprint_index); + long run_end = find_run_end(matching_fingerprint_index); // the run has only one entry, we need to disable its is_occupied flag // we just remember we need to do this here, and we do it later to not interfere with counts boolean turn_off_occupied = run_start_index == run_end; // First thing to do is move everything else in the run back by one slot - for (long i = matching_fingerprint_index; i < run_end; i++) { - long f = get_fingerprint(i + 1); + for (long i = matching_fingerprint_index; i != run_end; i = (i + 1) & getMask()) { + long f = get_fingerprint((i + 1) & getMask()); set_fingerprint(i, f); } @@ -519,7 +514,7 @@ boolean delete(long fingerprint, long canonical_slot, long run_start_index, long long cluster_start = find_cluster_start(canonical_slot); long num_shifted_count = 0; long num_non_occupied = 0; - for (long i = cluster_start; i <= run_end; i++) { + for (long i = cluster_start; i != ((run_end + 1) & getMask()); i = (i + 1) & getMask()) { if (is_continuation(i)) { num_shifted_count++; } @@ -540,8 +535,7 @@ boolean delete(long fingerprint, long canonical_slot, long run_start_index, long //boolean does_next_run_exist = !is_slot_empty(run_end + 1); //boolean is_next_run_shifted = is_shifted(run_end + 1); //if (!does_next_run_exist || !is_next_run_shifted) { - final long next_run_start = (run_end + 1) & getMask(); - if (is_slot_empty(next_run_start) || !is_shifted(next_run_start)) { + if (is_slot_empty((run_end + 1) & getMask()) || !is_shifted((run_end + 1) & getMask())) { if (turn_off_occupied) { // if we eliminated a run and now need to turn the is_occupied flag off, we do it at the end to not interfere in our counts set_occupied(canonical_slot, false); @@ -549,18 +543,20 @@ boolean delete(long fingerprint, long canonical_slot, long run_start_index, long return true; } - final long next_run_end = find_run_end(next_run_start); + // we now find the start and end of the next run + final long next_run_start = (run_end + 1) & getMask(); + run_end = find_run_end(next_run_start); // before we start processing the next run, we check whether the previous run we shifted is now back to its canonical slot // The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot, meaning it is now back in its proper place - if (is_occupied(run_end) && num_shifted_count - num_non_occupied == 1) { - set_shifted(run_end, false); + if (is_occupied((next_run_start - 1) & getMask()) && num_shifted_count - num_non_occupied == 1) { + set_shifted((next_run_start - 1) & getMask(), false); } else { - set_shifted(run_end, true); + set_shifted((next_run_start - 1) & getMask(), true); } - for (long i = next_run_start; i != ((next_run_end + 1) & getMask()); i++) { + for (long i = next_run_start; i != ((run_end + 1) & getMask()); i = (i + 1) & getMask()) { long f = get_fingerprint(i); set_fingerprint((i - 1) & getMask(), f); if (is_continuation(i)) { @@ -569,7 +565,9 @@ boolean delete(long fingerprint, long canonical_slot, long run_start_index, long if (!is_occupied(i)) { num_non_occupied++; } - num_shifted_count++; + if (i != next_run_start) { + num_shifted_count++; + } } set_fingerprint(run_end, 0); set_shifted(run_end, false); diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java index 3f42d3657..1ed94b8a9 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java @@ -91,11 +91,10 @@ static public void DeletionsWithSameFingerprint() { qf.insert(0, 3, false); qf.insert(0, 3, false); qf.insert(0, 6, false); - qf.insert(0, 6, false); + qf.insert(0, 6, false); // these are ignored qf.insert(0, 6, false); qf.insert(0, 7, false); - qf.delete(0, 2); qf.delete(0, 3); @@ -106,9 +105,9 @@ static public void DeletionsWithSameFingerprint() { result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 4, false, false, true, 0); result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 5, false, true, true, 0); result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 6, true, false, false, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 7, true, true, true, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 8, false, true, true, 0); - result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 9, false, false, true, 0); + result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 7, false, false, false, 0); +// result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 8, false, true, true, 0); +// result = QuotientFilterTest.set_slot_in_test(result, bits_per_entry, 9, false, false, true, 0); assertTrue(QuotientFilterTest.check_equality(qf, result, true)); } From da8656fae76419f8c92dc9c08d5a52ef1ff5713b Mon Sep 17 00:00:00 2001 From: Charlie Dickens Date: Tue, 28 May 2024 12:06:14 +0100 Subject: [PATCH 4/4] Updated the overflow test to check the wrapping. --- .../filters/quotientfilter/QuotientFilterTest.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 247ef7bda..c30d7317e 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -139,8 +139,14 @@ static public void OverflowTest() { long fp2 = 1 << fingerprint_size - 1; qf.insert(fp2, num_entries - 1, false); + assertEquals(qf.get_fingerprint(7), fp2); + assertEquals(getState(qf, 7), 0b100); qf.insert(fp2, num_entries - 1, false); + assertEquals(qf.get_fingerprint(0), fp2); + assertEquals(getState(qf, 0), 0b011); qf.delete(fp2, num_entries - 1); + assertEquals(qf.get_fingerprint(0), 0); + assertEquals(getState(qf, 0), 0b000); boolean found = qf.search(fp2, num_entries - 1); assertTrue(found); } @@ -166,6 +172,7 @@ static public void testQuotientFilterInsertionAndIteration() { qf.insert(0, 4, false); qf.insert(0, 15, false); // last slot in the filter qf.insert(0, 16, false); // outside the bounds + qf.pretty_print() ; Iterator it = new Iterator(qf); int[] arr = new int[] {2, 3, 3, 4, 15};