Skip to content

ordered runs, simplified insertion code #571

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -312,16 +312,19 @@ long find_run_start(long index) {
}

// given the start of a run, scan the run and return the index of the first matching fingerprint
// if not found returns the insertion position as bitwise complement to make it negative
long find_first_fingerprint_in_run(long index, long fingerprint) {
assert(!is_continuation(index));
do {
if (compare(index, fingerprint)) {
//System.out.println("found matching FP at index " + index);
return index;
}
index = (index + 1) & getMask();
} while (is_continuation(index));
return -1;
assert(!is_continuation(index));
do {
final long fingerprintAtIndex = get_fingerprint(index);
if (fingerprintAtIndex == fingerprint) {
return index;
} else if (fingerprintAtIndex > fingerprint) {
return ~index;
}
index = (index + 1) & getMask();
} while (is_continuation(index));
return ~index;
}

// delete the last matching fingerprint in the run
Expand Down Expand Up @@ -354,7 +357,7 @@ boolean search(long fingerprint, long index) {
}
long run_start_index = find_run_start(index);
long found_index = find_first_fingerprint_in_run(run_start_index, fingerprint);
return found_index > -1;
return found_index >= 0;
}

// Given a canonical slot index, find the corresponding run and return all fingerprints in the run.
Expand All @@ -373,74 +376,68 @@ Set<Long> get_all_fingerprints(long bucket_index) {
return set;
}

// Swaps the fingerprint in a given slot with a new one. Return the pre-existing fingerprint
long swap_fingerprints(long index, long new_fingerprint) {
long existing = get_fingerprint(index);
set_fingerprint(index, new_fingerprint);
return existing;
}

boolean insert_new_run(long canonical_slot, long long_fp) {
long start_of_this_new_run = find_run_start(canonical_slot);
boolean slot_initially_empty = is_slot_empty(start_of_this_new_run);

// modify some metadata flags to mark the new run
set_occupied(canonical_slot, true);
if (start_of_this_new_run != canonical_slot) {
set_shifted(start_of_this_new_run, true);
}
set_continuation(start_of_this_new_run, false);

// if the slot was initially empty, we can just terminate, as there is nothing to push to the right
if (slot_initially_empty) {
set_fingerprint(start_of_this_new_run, long_fp);
num_entries++;
return true;
}
return insert_fingerprint_and_push_all_else(long_fp, start_of_this_new_run, false);
}

boolean insert(long long_fp, long index) {
boolean insert(long fingerprint, long index) {
if (index >= get_num_slots() || num_entries == get_num_slots()) {
return false;
}
final long run_start = find_run_start(index);
if (!is_occupied(index)) {
return insert_new_run(index, long_fp);
return insert_fingerprint_and_push_all_else(fingerprint, run_start, index, true, true);
}
long run_start_index = find_run_start(index);
final long found_index = find_first_fingerprint_in_run(run_start_index, long_fp);
if (found_index > -1) {
final long found_index = find_first_fingerprint_in_run(run_start, fingerprint);
if (found_index >= 0) {
return false;
}
return insert_fingerprint_and_push_all_else(long_fp, run_start_index, true);
return insert_fingerprint_and_push_all_else(fingerprint, ~found_index, index, false, ~found_index == run_start);
}

// insert a fingerprint as the last fingerprint of the run and push all other entries in the cluster to the right.
boolean insert_fingerprint_and_push_all_else(long long_fp, long run_start_index, boolean is_same_run) {
long current_index = run_start_index;
boolean is_this_slot_empty;
boolean temp_continuation = false;
boolean insert_fingerprint_and_push_all_else(long fingerprint, long index, long canonical, boolean is_new_run, boolean is_run_start) {
// in the first shifted entry set is_continuation flag if inserting at the start of the existing run
// otherwise just shift the existing flag as it is
boolean force_continuation = !is_new_run && is_run_start;

do {
is_this_slot_empty = is_slot_empty(current_index);
if (current_index != run_start_index) {
set_shifted(current_index, true);
}
if (current_index != run_start_index && is_same_run && !is_continuation(current_index)) {
is_same_run = false;
set_continuation(current_index, true);
long_fp = swap_fingerprints(current_index, long_fp);
}
else if (!is_same_run) {
boolean current_continuation = is_continuation(current_index);
set_continuation(current_index, temp_continuation);
temp_continuation = current_continuation;
long_fp = swap_fingerprints(current_index, long_fp);
}
current_index = (current_index + 1) & getMask();
} while (!is_this_slot_empty);
num_entries++;
return true;
// prepare flags for the current slot
boolean is_continuation = !is_run_start;
boolean is_shifted = index != canonical;

// remember the existing entry from the current slot to be shifted to the next slot
// is_occupied flag belongs to the slot, therefore it is never shifted
// is_shifted flag is always true for all shifted entries, no need to remember it
long existing_fingerprint = get_fingerprint(index);
boolean existing_is_continuation = is_continuation(index);
boolean existing_is_empty = is_slot_empty(index);

while (!existing_is_empty) {
// set the current slot
set_fingerprint(index, fingerprint);
set_continuation(index, is_continuation);
set_shifted(index, is_shifted);

// prepare values for the next slot
fingerprint = existing_fingerprint;
is_continuation = existing_is_continuation | force_continuation;
is_shifted = true;

index = (index + 1) & getMask();

// save the existing entry to be shifted
existing_fingerprint = get_fingerprint(index);
existing_is_continuation = is_continuation(index);
existing_is_empty = is_slot_empty(index);

force_continuation = false; // this is needed for the first shift only
}
// at this point the current slot is empty, so just populate with prepared values
// either the incoming fingerprint or the last shifted one
set_fingerprint(index, fingerprint);
set_continuation(index, is_continuation);
set_shifted(index, is_shifted);

if (is_new_run) {
set_occupied(canonical, true);
}
num_entries++;
return true;
}

boolean delete(long fingerprint, long canonical_slot, long run_start_index, long matching_fingerprint_index) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,11 +67,11 @@ public void WikiInsertionTest() {
assertEquals(getState(qf, 0), 0);
assertEquals(qf.get_fingerprint(0), 0);
assertEquals(getState(qf, 1), 0b100);
assertEquals(qf.get_fingerprint(1), B); // this run is not ordered, which is different from Wikipedia example
assertEquals(qf.get_fingerprint(1), A);
assertEquals(getState(qf, 2), 0b111);
assertEquals(qf.get_fingerprint(2), C);
assertEquals(qf.get_fingerprint(2), B);
assertEquals(getState(qf, 3), 0b011);
assertEquals(qf.get_fingerprint(3), A);
assertEquals(qf.get_fingerprint(3), C);
assertEquals(getState(qf, 4), 0b101);
assertEquals(qf.get_fingerprint(4), D);
assertEquals(getState(qf, 5), 0b001);
Expand Down
Loading