From 5aa9191fe7dc6def62ed784c3f203e64852703d1 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Tue, 4 Jun 2024 10:11:48 -0700 Subject: [PATCH 1/4] ordered runs, simplified insertion code --- .../quotientfilter/QuotientFilter.java | 116 +++++++----------- .../quotientfilter/QuotientFilterTest.java | 6 +- 2 files changed, 50 insertions(+), 72 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index e0a442fc5..66b914111 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -312,16 +312,19 @@ long find_run_start(long index) { } // given the start of a run, scan the run and return the index of the first matching fingerprint + // if not found returns the insertion position as bitwise complement to make it negative long find_first_fingerprint_in_run(long index, long fingerprint) { - assert(!is_continuation(index)); - do { - if (compare(index, fingerprint)) { - //System.out.println("found matching FP at index " + index); - return index; - } - index = (index + 1) & getMask(); - } while (is_continuation(index)); - return -1; + assert(!is_continuation(index)); + do { + final long fingerprintAtIndex = get_fingerprint(index); + if (fingerprintAtIndex == fingerprint) { + return index; + } else if (fingerprintAtIndex > fingerprint) { + return ~index; + } + index = (index + 1) & getMask(); + } while (is_continuation(index)); + return ~index; } // delete the last matching fingerprint in the run @@ -354,7 +357,7 @@ boolean search(long fingerprint, long index) { } long run_start_index = find_run_start(index); long found_index = find_first_fingerprint_in_run(run_start_index, fingerprint); - return found_index > -1; + return found_index >= 0; } // Given a canonical slot index, find the corresponding run and return all fingerprints in the run. @@ -373,74 +376,49 @@ Set get_all_fingerprints(long bucket_index) { return set; } - // Swaps the fingerprint in a given slot with a new one. Return the pre-existing fingerprint - long swap_fingerprints(long index, long new_fingerprint) { - long existing = get_fingerprint(index); - set_fingerprint(index, new_fingerprint); - return existing; - } - - boolean insert_new_run(long canonical_slot, long long_fp) { - long start_of_this_new_run = find_run_start(canonical_slot); - boolean slot_initially_empty = is_slot_empty(start_of_this_new_run); - - // modify some metadata flags to mark the new run - set_occupied(canonical_slot, true); - if (start_of_this_new_run != canonical_slot) { - set_shifted(start_of_this_new_run, true); - } - set_continuation(start_of_this_new_run, false); - - // if the slot was initially empty, we can just terminate, as there is nothing to push to the right - if (slot_initially_empty) { - set_fingerprint(start_of_this_new_run, long_fp); - num_entries++; - return true; - } - return insert_fingerprint_and_push_all_else(long_fp, start_of_this_new_run, false); - } - - boolean insert(long long_fp, long index) { + boolean insert(long fingerprint, long index) { if (index >= get_num_slots() || num_entries == get_num_slots()) { return false; } + final long run_start = find_run_start(index); if (!is_occupied(index)) { - return insert_new_run(index, long_fp); + return insert_fingerprint_and_push_all_else(fingerprint, run_start, index, true, true); } - long run_start_index = find_run_start(index); - final long found_index = find_first_fingerprint_in_run(run_start_index, long_fp); - if (found_index > -1) { + final long found_index = find_first_fingerprint_in_run(run_start, fingerprint); + if (found_index >= 0) { return false; } - return insert_fingerprint_and_push_all_else(long_fp, run_start_index, true); + return insert_fingerprint_and_push_all_else(fingerprint, ~found_index, index, false, ~found_index == run_start); } - // insert a fingerprint as the last fingerprint of the run and push all other entries in the cluster to the right. - boolean insert_fingerprint_and_push_all_else(long long_fp, long run_start_index, boolean is_same_run) { - long current_index = run_start_index; - boolean is_this_slot_empty; - boolean temp_continuation = false; - - do { - is_this_slot_empty = is_slot_empty(current_index); - if (current_index != run_start_index) { - set_shifted(current_index, true); - } - if (current_index != run_start_index && is_same_run && !is_continuation(current_index)) { - is_same_run = false; - set_continuation(current_index, true); - long_fp = swap_fingerprints(current_index, long_fp); - } - else if (!is_same_run) { - boolean current_continuation = is_continuation(current_index); - set_continuation(current_index, temp_continuation); - temp_continuation = current_continuation; - long_fp = swap_fingerprints(current_index, long_fp); - } - current_index = (current_index + 1) & getMask(); - } while (!is_this_slot_empty); - num_entries++; - return true; + boolean insert_fingerprint_and_push_all_else(long fingerprint, long index, long canonical, boolean is_new_run, boolean is_run_start) { + boolean existing_is_continuation = is_continuation(index); + boolean is_continuation = !is_run_start; + boolean is_shifted = index != canonical; + boolean force_continuation = !is_new_run && is_run_start; + boolean existing_is_empty = is_slot_empty(index); + long existing_fingerprint = get_fingerprint(index); + while (!existing_is_empty) { + set_fingerprint(index, fingerprint); + set_continuation(index, is_continuation); + set_shifted(index, is_shifted); + fingerprint = existing_fingerprint; + is_continuation = existing_is_continuation | force_continuation; + is_shifted = true; + index = (index + 1) & getMask(); + existing_fingerprint = get_fingerprint(index); + existing_is_continuation = is_continuation(index); + existing_is_empty = is_slot_empty(index); + force_continuation = false; + } + set_fingerprint(index, fingerprint); + set_continuation(index, is_continuation); + set_shifted(index, is_shifted); + if (is_new_run) { + set_occupied(canonical, true); + } + num_entries++; + return true; } boolean delete(long fingerprint, long canonical_slot, long run_start_index, long matching_fingerprint_index) { diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 4f6486eae..44e9833d2 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -67,11 +67,11 @@ public void WikiInsertionTest() { assertEquals(getState(qf, 0), 0); assertEquals(qf.get_fingerprint(0), 0); assertEquals(getState(qf, 1), 0b100); - assertEquals(qf.get_fingerprint(1), B); // this run is not ordered, which is different from Wikipedia example + assertEquals(qf.get_fingerprint(1), A); // this run is not ordered, which is different from Wikipedia example assertEquals(getState(qf, 2), 0b111); - assertEquals(qf.get_fingerprint(2), C); + assertEquals(qf.get_fingerprint(2), B); assertEquals(getState(qf, 3), 0b011); - assertEquals(qf.get_fingerprint(3), A); + assertEquals(qf.get_fingerprint(3), C); assertEquals(getState(qf, 4), 0b101); assertEquals(qf.get_fingerprint(4), D); assertEquals(getState(qf, 5), 0b001); From f49c7a09c6a3ca4eea0fd3b6540688c4916db9ac Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Tue, 4 Jun 2024 11:57:03 -0700 Subject: [PATCH 2/4] removed comment that is no longer valid --- .../datasketches/filters/quotientfilter/QuotientFilterTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 44e9833d2..4f68cb1f4 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -67,7 +67,7 @@ public void WikiInsertionTest() { assertEquals(getState(qf, 0), 0); assertEquals(qf.get_fingerprint(0), 0); assertEquals(getState(qf, 1), 0b100); - assertEquals(qf.get_fingerprint(1), A); // this run is not ordered, which is different from Wikipedia example + assertEquals(qf.get_fingerprint(1), A); assertEquals(getState(qf, 2), 0b111); assertEquals(qf.get_fingerprint(2), B); assertEquals(getState(qf, 3), 0b011); From 3d60bced8fa565a2a6b43c59a563458785c5ee72 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Wed, 5 Jun 2024 11:31:23 -0700 Subject: [PATCH 3/4] commented shifting the rest of the cluster after the insertion point --- .../quotientfilter/QuotientFilter.java | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 66b914111..9489a36f2 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -392,28 +392,47 @@ boolean insert(long fingerprint, long index) { } boolean insert_fingerprint_and_push_all_else(long fingerprint, long index, long canonical, boolean is_new_run, boolean is_run_start) { - boolean existing_is_continuation = is_continuation(index); + // in the first shifted entry set is_continuation flag if inserting at the start of the existing run + // otherwise just shift the existing flag as it is + boolean force_continuation = !is_new_run && is_run_start; + + // prepare flags for the current slot boolean is_continuation = !is_run_start; boolean is_shifted = index != canonical; - boolean force_continuation = !is_new_run && is_run_start; - boolean existing_is_empty = is_slot_empty(index); + + // store the existing entry in the current slot to be shifted to the next slot + // is_occupied flag belongs to the slot, therefore it is never shifted + // is_shifted flag is always true for all shifted entries, no need to store it long existing_fingerprint = get_fingerprint(index); + boolean existing_is_continuation = is_continuation(index); + boolean existing_is_empty = is_slot_empty(index); + while (!existing_is_empty) { + // set the current slot set_fingerprint(index, fingerprint); set_continuation(index, is_continuation); set_shifted(index, is_shifted); + + // prepare values for the next slot fingerprint = existing_fingerprint; is_continuation = existing_is_continuation | force_continuation; is_shifted = true; + index = (index + 1) & getMask(); + + // save the existing entry to be shifted existing_fingerprint = get_fingerprint(index); existing_is_continuation = is_continuation(index); existing_is_empty = is_slot_empty(index); - force_continuation = false; + + force_continuation = false; // this is needed for the first shift only } + // at this point the current slot is empty, so just populate with prepared values + // either the incoming fingerprint or the last shifted one set_fingerprint(index, fingerprint); set_continuation(index, is_continuation); set_shifted(index, is_shifted); + if (is_new_run) { set_occupied(canonical, true); } From 1ea45c08a35b8a5dbb90de65711f73cb515ed0f9 Mon Sep 17 00:00:00 2001 From: AlexanderSaydakov Date: Wed, 5 Jun 2024 11:38:10 -0700 Subject: [PATCH 4/4] better wording --- .../datasketches/filters/quotientfilter/QuotientFilter.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index 9489a36f2..d90f243c6 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -400,9 +400,9 @@ boolean insert_fingerprint_and_push_all_else(long fingerprint, long index, long boolean is_continuation = !is_run_start; boolean is_shifted = index != canonical; - // store the existing entry in the current slot to be shifted to the next slot + // remember the existing entry from the current slot to be shifted to the next slot // is_occupied flag belongs to the slot, therefore it is never shifted - // is_shifted flag is always true for all shifted entries, no need to store it + // is_shifted flag is always true for all shifted entries, no need to remember it long existing_fingerprint = get_fingerprint(index); boolean existing_is_continuation = is_continuation(index); boolean existing_is_empty = is_slot_empty(index);