Skip to content

cleanup #586

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@

package org.apache.datasketches.filters.quotientfilter;

import java.util.ArrayList;
import static org.apache.datasketches.common.Util.LS;

import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
Expand All @@ -41,12 +42,6 @@ public class QuotientFilter extends Filter {
int numExpansions_;
BitArray bitArray_;

// Statistics computed by computeStatistics(); call that method before reading these fields.
long numRuns_;
long numClusters_;
public double avgRunLength_;
public double avgClusterLength_;

/**
 * Constructs a filter with {@code DEFAULT_LOAD_FACTOR} by delegating to the
 * three-argument constructor.
 *
 * @param lgQ forwarded unchanged; presumably the base-2 logarithm of the number of slots (confirm)
 * @param numFingerprintBits forwarded unchanged; presumably the fingerprint width in bits (confirm)
 */
public QuotientFilter(final int lgQ, final int numFingerprintBits) {
this(lgQ, numFingerprintBits, DEFAULT_LOAD_FACTOR);
}
Expand All @@ -57,7 +52,6 @@ public QuotientFilter(final int lgQ, final int numFingerprintBits, final float l
loadFactor_ = loadFactor;
bitArray_ = makeFilter(getNumSlots(), getNumBitsPerEntry());
numExpansions_ = 0;
//hash_type = XxHash.hashLong ; //HashType.xxh;
}

public boolean rejuvenate(final long key) {
Expand Down Expand Up @@ -85,7 +79,9 @@ public int getFingerprintLength() {
}

void expand() {
if (getFingerprintLength() < 2) throw new SketchesException("for expansion value must have at least 2 bits");
if (getFingerprintLength() < 2) {
throw new SketchesException("for expansion value must have at least 2 bits");
}
final QuotientFilter other = new QuotientFilter(lgQ_ + 1, numFingerprintBits_ - 1, loadFactor_);

long i = 0;
Expand All @@ -111,40 +107,6 @@ void expand() {
numExpansions_++;
}

/**
 * Measures the number of bits per entry for this filter alone.
 *
 * <p>Delegates to the static overload with this filter as the current filter
 * and an empty list of additional filters.
 *
 * @return the value computed by the static overload for this filter
 */
public double measureNumBitsPerEntry() {
  final ArrayList<QuotientFilter> noOtherFilters = new ArrayList<>();
  return measureNumBitsPerEntry(this, noOtherFilters);
}

/**
 * Measures the number of bits per entry across a group of filters.
 *
 * <p>The result is the summed bit count (bits per entry times slots, per filter)
 * divided by the summed entry count. A list is accepted because some filter
 * implementations consist of multiple filter objects. When the summed entry
 * count is zero the floating-point division yields a non-finite value.
 *
 * @param current the primary filter; always included in both sums
 * @param otherFilters additional filters to include; may be empty
 * @return total bits divided by total entries
 */
protected static double measureNumBitsPerEntry(final QuotientFilter current, final ArrayList<QuotientFilter> otherFilters) {
  double totalEntries = current.getNumEntries();
  long totalBits = current.getNumBitsPerEntry() * current.getNumSlots();
  for (final QuotientFilter filter : otherFilters) {
    totalEntries += filter.getNumEntries();
    totalBits += filter.getNumBitsPerEntry() * filter.getNumSlots();
  }
  return totalBits / totalEntries;
}

/**
 * Returns the fraction of slots in use, computed as the entry count
 * ({@code numEntries_}) divided by the number of slots.
 *
 * @return entries per slot as a floating-point ratio
 */
public double getUtilization() {
  final double entries = numEntries_;
  return entries / getNumSlots();
}

/**
 * Returns the {@code lgQ_} value this filter currently holds.
 *
 * @return the current {@code lgQ_}; presumably the base-2 logarithm of the slot count (confirm)
 */
public int getLgQ() {
  return this.lgQ_;
}
Expand Down Expand Up @@ -228,23 +190,21 @@ void modifySlot(final boolean isOccupied, final boolean isContinuation, final bo
setFingerprint(index, fingerprint);
}

// summarize some statistical measures about the filter
public void printFilterSummary() {
public String toString() {
final StringBuilder sb = new StringBuilder();
final long slots = getNumSlots();
final long numBits = slots * getNumBitsPerEntry();
System.out.println("lgQ: " + lgQ_);
System.out.println("FP length: " + getFingerprintLength());
System.out.println("load factor: " + getLoadFactor());
System.out.println("bits: " + numBits);
System.out.println("bits/entry: " + numBits / (double)numEntries_);
System.out.println("entries: " + numEntries_);
System.out.println("expansions: " + numExpansions_);
System.out.println("load: " + numEntries_ / (double)(slots));
computeStatistics();
//System.out.println("num runs: \t\t" + num_runs);
//System.out.println("avg run length: \t" + avg_run_length);
//System.out.println("num clusters: \t\t" + num_clusters);
//System.out.println("avg cluster length: \t" + avg_cluster_length);
sb.append("***Quotient Filter Summary***").append(LS);
sb.append("lgQ: " + lgQ_).append(LS);
sb.append("FP length: " + getFingerprintLength()).append(LS);
sb.append("load factor: " + getLoadFactor()).append(LS);
sb.append("bits: " + numBits).append(LS);
sb.append("bits/entry: " + numBits / (double)numEntries_).append(LS);
sb.append("entries: " + numEntries_).append(LS);
sb.append("expansions: " + numExpansions_).append(LS);
sb.append("load: " + numEntries_ / (double)(slots)).append(LS);
sb.append("*********End Summary*********").append(LS);
return sb.toString();
}

/*
Expand Down Expand Up @@ -442,24 +402,24 @@ void insertFingerprintAndPushAllElse(long fingerprint, long index, final long ca
numEntries_++;
}

boolean delete(final long canonicalSlot, long runStartIndex, long matchingFingerprintIndex) {
boolean delete(final long canonicalSlot, final long runStartIndex, final long matchingFingerprintIndex) {
long runEnd = findRunEnd(matchingFingerprintIndex);

// the run has only one entry, we need to disable its is_occupied flag
// we just remember we need to do this here, and we do it later to not interfere with counts
boolean turnOffOccupied = runStartIndex == runEnd;
final boolean turnOffOccupied = runStartIndex == runEnd;

// First thing to do is move everything else in the run back by one slot
for (long i = matchingFingerprintIndex; i != runEnd; i = (i + 1) & getSlotMask()) {
long f = getFingerprint((i + 1) & getSlotMask());
final long f = getFingerprint((i + 1) & getSlotMask());
setFingerprint(i, f);
}

// for each slot, we want to know by how much the entry there is shifted
// we can do this by counting the number of continuation flags set to true
// and the number of occupied flags set to false from the start of the cluster to the given cell
// and then subtracting: num_shifted_count - num_non_occupied = number of slots by which an entry is shifted
long clusterStart = findClusterStart(canonicalSlot);
final long clusterStart = findClusterStart(canonicalSlot);
long numShiftedCount = 0;
long numNonOccupied = 0;
for (long i = clusterStart; i != ((runEnd + 1) & getSlotMask()); i = (i + 1) & getSlotMask()) {
Expand Down Expand Up @@ -496,15 +456,16 @@ boolean delete(final long canonicalSlot, long runStartIndex, long matchingFinger
runEnd = findRunEnd(nextRunStart);

// before we start processing the next run, we check whether the previous run we shifted is now back to its canonical slot
// The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot, meaning it is now back in its proper place
// The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot,
// meaning it is now back in its proper place
if (isOccupied((nextRunStart - 1) & getSlotMask()) && numShiftedCount - numNonOccupied == 1) {
setShifted((nextRunStart - 1) & getSlotMask(), false);
} else {
setShifted((nextRunStart - 1) & getSlotMask(), true);
}

for (long i = nextRunStart; i != ((runEnd + 1) & getSlotMask()); i = (i + 1) & getSlotMask()) {
long f = getFingerprint(i);
final long f = getFingerprint(i);
setFingerprint((i - 1) & getSlotMask(), f);
if (isContinuation(i)) {
setContinuation((i - 1) & getSlotMask(), true);
Expand All @@ -524,12 +485,12 @@ boolean delete(final long canonicalSlot, long runStartIndex, long matchingFinger

boolean delete(final long fingerprint, final long canonicalSlot) {
// if the run doesn't exist, the key can't have possibly been inserted
boolean doesRunExist = isOccupied(canonicalSlot);
final boolean doesRunExist = isOccupied(canonicalSlot);
if (!doesRunExist) {
return false;
}
long runStartIndex = findRunStart(canonicalSlot);
long matchingFingerprintIndex = decideWhichFingerprintToDelete(runStartIndex, fingerprint);
final long runStartIndex = findRunStart(canonicalSlot);
final long matchingFingerprintIndex = decideWhichFingerprintToDelete(runStartIndex, fingerprint);
if (matchingFingerprintIndex == -1) {
// we didn't find a matching fingerprint
return false;
Expand Down Expand Up @@ -565,8 +526,8 @@ protected boolean _insert(final long largeHash) {

protected boolean _delete(final long largeHash) {
final long slotIndex = getSlotFromHash(largeHash);
long fingerprint = getFingerprintFromHash(largeHash);
boolean success = delete(fingerprint, slotIndex);
final long fingerprint = getFingerprintFromHash(largeHash);
final boolean success = delete(fingerprint, slotIndex);
if (success) {
numEntries_--;
}
Expand All @@ -583,59 +544,6 @@ public boolean getBitAtOffset(final int offset) {
return bitArray_.getBit(offset);
}

/**
 * Scans every slot once and recomputes the run/cluster statistics:
 * {@code numRuns_}, {@code numClusters_}, {@code avgRunLength_} and
 * {@code avgClusterLength_}.
 *
 * <p>Each slot is classified by its three metadata flags:
 * <ul>
 *   <li>continuation and shifted: the slot extends the current run;</li>
 *   <li>continuation without shifted: not a used combination, ignored;</li>
 *   <li>shifted without continuation: the slot starts a new run;</li>
 *   <li>occupied only: the slot starts a new cluster and a new run;</li>
 *   <li>no flags: the slot is empty and closes the current run and cluster.</li>
 * </ul>
 *
 * <p>The run and cluster still open when the scan reaches the last slot are
 * added to the totals after the loop. Without this, entries in a trailing
 * non-empty stretch are dropped from both averages.
 *
 * <p>If no run (or no cluster) start is seen, the corresponding average is the
 * result of a floating-point division by zero (NaN for an empty filter).
 */
public void computeStatistics() {
  numRuns_ = 0;
  numClusters_ = 0;
  double sumRunLengths = 0;
  double sumClusterLengths = 0;

  int currentRunLength = 0;
  int currentClusterLength = 0;

  final long numSlots = getNumSlots();
  for (long i = 0; i < numSlots; i++) {
    final boolean occupied = isOccupied(i);
    final boolean continuation = isContinuation(i);
    final boolean shifted = isShifted(i);

    if (continuation) {
      if (shifted) { // continuation of the current run
        currentClusterLength++;
        currentRunLength++;
      }
      // continuation without shifted is not a used state; nothing to count
    } else if (shifted) { // start of a new run
      numRuns_++;
      sumRunLengths += currentRunLength;
      currentRunLength = 1;
      currentClusterLength++;
    } else if (occupied) { // start of a new cluster and a new run
      numRuns_++;
      numClusters_++;
      sumClusterLengths += currentClusterLength;
      sumRunLengths += currentRunLength;
      currentClusterLength = 1;
      currentRunLength = 1;
    } else { // empty slot: close whatever run and cluster were open
      sumClusterLengths += currentClusterLength;
      currentClusterLength = 0;
      sumRunLengths += currentRunLength;
      currentRunLength = 0;
    }
  }

  // Flush the run and cluster that were still open at the last slot.
  sumRunLengths += currentRunLength;
  sumClusterLengths += currentClusterLength;

  avgRunLength_ = sumRunLengths / numRuns_;
  avgClusterLength_ = sumClusterLengths / numClusters_;
}

public void merge(final QuotientFilter other) {
if (lgQ_ + numFingerprintBits_ != other.lgQ_ + other.numFingerprintBits_) {
throw new SketchesArgumentException("incompatible sketches in merge");
Expand All @@ -658,4 +566,4 @@ public void merge(final QuotientFilter other) {
if (!fifo.isEmpty() && ! other.isContinuation(i)) { fifo.remove(); }
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,6 @@
package org.apache.datasketches.filters.quotientfilter;

import org.apache.datasketches.common.SketchesArgumentException;
import org.apache.datasketches.filters.quotientfilter.QuotientFilterBuilder;
import org.apache.datasketches.memory.WritableMemory;
import org.testng.annotations.Test;

import static org.testng.Assert.*;
Expand All @@ -43,8 +41,6 @@ public void testSuggestFingerprintLengthFromFPP(){

@Test
public static void testSuggestLgNumSlots(){
QuotientFilterBuilder qfb = new QuotientFilterBuilder();

// invalid number of items
assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(0,0.9f));
assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1, 0.9f));
Expand All @@ -58,17 +54,15 @@ public static void testSuggestLgNumSlots(){

for (int i = 0; i < numItems.length; i++) {
long num = numItems[i];
byte result = qfb.suggestLgNumSlots(num, 0.9f);
byte result = QuotientFilterBuilder.suggestLgNumSlots(num, 0.9f);
assertEquals(result, results[i]);
result = qfb.suggestLgNumSlots(num);
result = QuotientFilterBuilder.suggestLgNumSlots(num);
assertEquals(result, results[i]);
}
}

@Test
public static void testSuggestMaxNumItems(){
QuotientFilterBuilder qfb = new QuotientFilterBuilder();

// invalid number of slots
assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)-127));
assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)0));
Expand All @@ -83,24 +77,21 @@ public static void testSuggestMaxNumItems(){
float eighty_pc_appx = 820f / 1024f; // ≈ 0.8

for (int i = 0; i < lgNumSlots.length; i++) {
long result_ninety = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i], ninety_pc_appx);
long result_eighty = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i], eighty_pc_appx);
long result_ninety = QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots(lgNumSlots[i], ninety_pc_appx);
long result_eighty = QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots(lgNumSlots[i], eighty_pc_appx);
assertEquals(result_ninety, results_ninety_pc[i]);
assertEquals(result_eighty, results_eighty_pc[i]);
}
}

@Test
public static void testSuggestParamsFromMaxDistinctsFPP(){

// invalid number of slots
assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(5000000000L, 0.0001));
assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(100000000, 0.));
assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(100000000, 1.5));
assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(5000000000L, -1.));


QuotientFilterBuilder qfb = new QuotientFilterBuilder();
byte lgNumSlots ;
byte fingerprintLength ;
long[] numItems = {1L, 900L, 500_000_000L} ;
Expand All @@ -112,21 +103,18 @@ public static void testSuggestParamsFromMaxDistinctsFPP(){
byte[] expected_fingerprintLength = {34, 7, 24} ;

for (int i = 0; i < numItems.length; i++) {
QuotientFilterBuilder.QFPair pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], 0.9f, fpp[i]);
QuotientFilterBuilder.QFPair pair = QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(numItems[i], 0.9f, fpp[i]);
lgNumSlots = pair.lgNumSlots;
fingerprintLength = pair.fingerprintLength;
assertEquals(expected_lgNumSlotsNinety[i], lgNumSlots);
assertEquals(expected_fingerprintLength[i], fingerprintLength);

// 80% load
pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], fpp[i]);
pair = QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(numItems[i], fpp[i]);
lgNumSlots = pair.lgNumSlots;
fingerprintLength = pair.fingerprintLength;
assertEquals(expected_lgNumSlotsEighty[i], lgNumSlots);
assertEquals(expected_fingerprintLength[i], fingerprintLength);
}
}



}
}
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ public void smallExpansion() {
final QuotientFilter qf = new QuotientFilter(5, 9);
final int n = 30;
for (int i = 0; i < n; i++) { qf.insert(i); }
qf.printFilterSummary();
System.out.println(qf.toString());
assertEquals(qf.getNumExpansions(), 1);
assertEquals(qf.getNumEntries(), n);

Expand Down
Loading