diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java index e0a442fc5..d90f243c6 100644 --- a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -312,16 +312,19 @@ long find_run_start(long index) { } // given the start of a run, scan the run and return the index of the first matching fingerprint + // if not found returns the insertion position as bitwise complement to make it negative long find_first_fingerprint_in_run(long index, long fingerprint) { - assert(!is_continuation(index)); - do { - if (compare(index, fingerprint)) { - //System.out.println("found matching FP at index " + index); - return index; - } - index = (index + 1) & getMask(); - } while (is_continuation(index)); - return -1; + assert(!is_continuation(index)); + do { + final long fingerprintAtIndex = get_fingerprint(index); + if (fingerprintAtIndex == fingerprint) { + return index; + } else if (fingerprintAtIndex > fingerprint) { + return ~index; + } + index = (index + 1) & getMask(); + } while (is_continuation(index)); + return ~index; } // delete the last matching fingerprint in the run @@ -354,7 +357,7 @@ boolean search(long fingerprint, long index) { } long run_start_index = find_run_start(index); long found_index = find_first_fingerprint_in_run(run_start_index, fingerprint); - return found_index > -1; + return found_index >= 0; } // Given a canonical slot index, find the corresponding run and return all fingerprints in the run. @@ -373,74 +376,68 @@ Set get_all_fingerprints(long bucket_index) { return set; } - // Swaps the fingerprint in a given slot with a new one. Return the pre-existing fingerprint - long swap_fingerprints(long index, long new_fingerprint) { - long existing = get_fingerprint(index); - set_fingerprint(index, new_fingerprint); - return existing; - } - - boolean insert_new_run(long canonical_slot, long long_fp) { - long start_of_this_new_run = find_run_start(canonical_slot); - boolean slot_initially_empty = is_slot_empty(start_of_this_new_run); - - // modify some metadata flags to mark the new run - set_occupied(canonical_slot, true); - if (start_of_this_new_run != canonical_slot) { - set_shifted(start_of_this_new_run, true); - } - set_continuation(start_of_this_new_run, false); - - // if the slot was initially empty, we can just terminate, as there is nothing to push to the right - if (slot_initially_empty) { - set_fingerprint(start_of_this_new_run, long_fp); - num_entries++; - return true; - } - return insert_fingerprint_and_push_all_else(long_fp, start_of_this_new_run, false); - } - - boolean insert(long long_fp, long index) { + boolean insert(long fingerprint, long index) { if (index >= get_num_slots() || num_entries == get_num_slots()) { return false; } + final long run_start = find_run_start(index); if (!is_occupied(index)) { - return insert_new_run(index, long_fp); + return insert_fingerprint_and_push_all_else(fingerprint, run_start, index, true, true); } - long run_start_index = find_run_start(index); - final long found_index = find_first_fingerprint_in_run(run_start_index, long_fp); - if (found_index > -1) { + final long found_index = find_first_fingerprint_in_run(run_start, fingerprint); + if (found_index >= 0) { return false; } - return insert_fingerprint_and_push_all_else(long_fp, run_start_index, true); + return insert_fingerprint_and_push_all_else(fingerprint, ~found_index, index, false, ~found_index == run_start); } - // insert a fingerprint as the last fingerprint of the run and push all other entries in the cluster to the right. - boolean insert_fingerprint_and_push_all_else(long long_fp, long run_start_index, boolean is_same_run) { - long current_index = run_start_index; - boolean is_this_slot_empty; - boolean temp_continuation = false; + boolean insert_fingerprint_and_push_all_else(long fingerprint, long index, long canonical, boolean is_new_run, boolean is_run_start) { + // in the first shifted entry set is_continuation flag if inserting at the start of the existing run + // otherwise just shift the existing flag as it is + boolean force_continuation = !is_new_run && is_run_start; - do { - is_this_slot_empty = is_slot_empty(current_index); - if (current_index != run_start_index) { - set_shifted(current_index, true); - } - if (current_index != run_start_index && is_same_run && !is_continuation(current_index)) { - is_same_run = false; - set_continuation(current_index, true); - long_fp = swap_fingerprints(current_index, long_fp); - } - else if (!is_same_run) { - boolean current_continuation = is_continuation(current_index); - set_continuation(current_index, temp_continuation); - temp_continuation = current_continuation; - long_fp = swap_fingerprints(current_index, long_fp); - } - current_index = (current_index + 1) & getMask(); - } while (!is_this_slot_empty); - num_entries++; - return true; + // prepare flags for the current slot + boolean is_continuation = !is_run_start; + boolean is_shifted = index != canonical; + + // remember the existing entry from the current slot to be shifted to the next slot + // is_occupied flag belongs to the slot, therefore it is never shifted + // is_shifted flag is always true for all shifted entries, no need to remember it + long existing_fingerprint = get_fingerprint(index); + boolean existing_is_continuation = is_continuation(index); + boolean existing_is_empty = is_slot_empty(index); + + while (!existing_is_empty) { + // set the current slot + set_fingerprint(index, fingerprint); + set_continuation(index, is_continuation); + set_shifted(index, is_shifted); + + // prepare values for the next slot + fingerprint = existing_fingerprint; + is_continuation = existing_is_continuation | force_continuation; + is_shifted = true; + + index = (index + 1) & getMask(); + + // save the existing entry to be shifted + existing_fingerprint = get_fingerprint(index); + existing_is_continuation = is_continuation(index); + existing_is_empty = is_slot_empty(index); + + force_continuation = false; // this is needed for the first shift only + } + // at this point the current slot is empty, so just populate with prepared values + // either the incoming fingerprint or the last shifted one + set_fingerprint(index, fingerprint); + set_continuation(index, is_continuation); + set_shifted(index, is_shifted); + + if (is_new_run) { + set_occupied(canonical, true); + } + num_entries++; + return true; } boolean delete(long fingerprint, long canonical_slot, long run_start_index, long matching_fingerprint_index) { diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java index 4f6486eae..4f68cb1f4 100644 --- a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -67,11 +67,11 @@ public void WikiInsertionTest() { assertEquals(getState(qf, 0), 0); assertEquals(qf.get_fingerprint(0), 0); assertEquals(getState(qf, 1), 0b100); - assertEquals(qf.get_fingerprint(1), B); // this run is not ordered, which is different from Wikipedia example + assertEquals(qf.get_fingerprint(1), A); assertEquals(getState(qf, 2), 0b111); - assertEquals(qf.get_fingerprint(2), C); + assertEquals(qf.get_fingerprint(2), B); assertEquals(getState(qf, 3), 0b011); - assertEquals(qf.get_fingerprint(3), A); + assertEquals(qf.get_fingerprint(3), C); assertEquals(getState(qf, 4), 0b101); assertEquals(qf.get_fingerprint(4), D); assertEquals(getState(qf, 5), 0b001);