/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/**
 * Abstract bit-addressable storage used by the quotient filter.
 *
 * <p>Bit positions are addressed with {@code long} indexes. The range accessors move up to one
 * {@code long} worth of bits at a time; {@code QuickBitVectorWrapper} treats their {@code to}
 * argument as exclusive.
 */
public abstract class Bitmap {

  /**
   * Returns the size of this bitmap, in bits.
   *
   * @return the number of addressable bits
   */
  public abstract long size();

  /**
   * Sets a single bit.
   *
   * @param bit_index index of the bit to change
   * @param value {@code true} to set the bit, {@code false} to clear it
   */
  public abstract void set(long bit_index, boolean value);

  /**
   * Writes the low-order bits of {@code value} into a range of this bitmap.
   *
   * @param from index of the first bit to write
   * @param to end of the range
   * @param value the bits to store
   */
  public abstract void setFromTo(long from, long to, long value);

  /**
   * Reads a single bit.
   *
   * @param bit_index index of the bit to read
   * @return {@code true} if the bit is set
   */
  public abstract boolean get(long bit_index);

  /**
   * Reads a range of this bitmap as a {@code long}.
   *
   * @param from index of the first bit to read
   * @param to end of the range
   * @return the bits of the range
   */
  public abstract long getFromTo(long from, long to);

  /**
   * Tests one bit of a fingerprint.
   *
   * @param index position of the bit to test, expected in {@code [0, 63]} (bit 0 is the least
   *     significant bit)
   * @param fingerprint the value to inspect
   * @return {@code true} if bit {@code index} of {@code fingerprint} is set
   */
  public static boolean get_fingerprint_bit(long index, long fingerprint) {
    // The literal must be a long: "1 << index" is an int shift, which reduces the distance
    // modulo 32 and sign-extends bit 31, giving wrong answers for every index >= 31.
    final long mask = 1L << index;
    return (fingerprint & mask) != 0;
  }
}
+ */ + + +package org.apache.datasketches.filters.quotientfilter; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; + + +import org.apache.datasketches.memory.XxHash; + + +public abstract class Filter { + + //HashType hash_type; + + //abstract boolean rejuvenate(long key); + //abstract boolean expand(); + //protected abstract boolean _delete(long large_hash); + abstract protected boolean _insert(long large_hash, boolean insert_only_if_no_match); + abstract protected boolean _search(long large_hash); + + + //public boolean delete(long input) { +// return _delete(get_hash(input)); +// } + +// public boolean delete(String input) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input.getBytes(StandardCharsets.UTF_8)); +// //return _delete(HashFunctions.xxhash(input_buffer)); +// return _delete(XxHash.hashLong(input_buffer)); +// } + +// public boolean delete(byte[] input) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input); +// return _delete(HashFunctions.xxhash(input_buffer)); +// } +// + public boolean insert(long input, boolean insert_only_if_no_match) { + //System.out.println("The ABC input is " + input); + long hash = get_hash(input); + //System.out.println("The ABC hash is " + hash); + return _insert(hash, insert_only_if_no_match); + } +// +// public boolean insert(String input, boolean insert_only_if_no_match) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input.getBytes(StandardCharsets.UTF_8)); +// return _insert(HashFunctions.xxhash(input_buffer), insert_only_if_no_match); +// } +// +// public boolean insert(byte[] input, boolean insert_only_if_no_match) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input); +// return _insert(HashFunctions.xxhash(input_buffer), insert_only_if_no_match); +// } +// + public boolean search(long input) { + return _search(get_hash(input)); + } +// +// public boolean search(String input) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input.getBytes(StandardCharsets.UTF_8)); +// return 
_search(HashFunctions.xxhash(input_buffer)); +// } +// +// public boolean search(byte[] input) { +// ByteBuffer input_buffer = ByteBuffer.wrap(input); +// return _search(HashFunctions.xxhash(input_buffer)); +// } +// + long get_hash(long input) { +// long hash = 0; +// if (hash_type == HashType.arbitrary) { +// hash = HashFunctions.normal_hash((int)input); +// } +// else if (hash_type == HashType.xxh) { +// hash = HashFunctions.xxhash(input); +// } +// else { +// System.exit(1); +// } +// return hash; + return XxHash.hashLong(input, 0L) ; // CD edit for datasketches hash function using same seed. + } + + public long get_space_use() { return 0 ; } +// public int get_bits_per_entry() { return 0 ; } +// +// public abstract long get_num_entries(boolean include_all_internal_filters); +// +// public double get_utilization() { +// return 0; +// } +// +// public double measure_num_bits_per_entry() { +// return 0; +// } +// +// static void print_int_in_binary(int num, int length) { +// String str = ""; +// for (int i = 0; i < length; i++) { +// int mask = (int)Math.pow(2, i); +// int masked = num & mask; +// str += masked > 0 ? "1" : "0"; +// } +// System.out.println(str); +// } +// +// static void print_long_in_binary(long num, int length) { +// String str = ""; +// for (int i = 0; i < length; i++) { +// long mask = (long)Math.pow(2, i); +// long masked = num & mask; +// str += masked > 0 ? "1" : "0"; +// } +// System.out.println(str); +// } +// +// String get_fingerprint_str(long fp, int length) { +// String str = ""; +// for (int i = 0; i < length; i++) { +// str += Bitmap.get_fingerprint_bit(i, fp) ? 
"1" : "0"; +// } +// return str; +// } +// +// public void pretty_print() { +// +// } + + +} + diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java new file mode 100644 index 000000000..ea5039aa6 --- /dev/null +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/Iterator.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + + +package org.apache.datasketches.filters.quotientfilter; + +import java.util.ArrayDeque; +import java.util.Queue; + +public class Iterator { + + QuotientFilter qf; + long index; + long bucket_index; + long fingerprint; + Queue s; + + Iterator(QuotientFilter new_qf) { + qf = new_qf; + s = new ArrayDeque(); + //s = new ArrayDeque(); + index = 0; + bucket_index = -1; + fingerprint = -1; + } + + void clear() { + s.clear(); + index = 0; + bucket_index = -1; + fingerprint = -1; + } + + boolean next() { + + if (index == qf.get_logical_num_slots_plus_extensions()) { + return false; + } + + long slot = qf.get_slot(index); + boolean occupied = (slot & 1) != 0; + boolean continuation = (slot & 2) != 0; + boolean shifted = (slot & 4) != 0; + + + while (!occupied && !continuation && !shifted && index < qf.get_logical_num_slots_plus_extensions()) { + index++; + if (index == qf.get_logical_num_slots_plus_extensions()) { + return false; + } + slot = qf.get_slot(index); + occupied = (slot & 1) != 0; + continuation = (slot & 2) != 0; + shifted = (slot & 4) != 0; + } + + if (occupied && !continuation && !shifted) { + s.clear(); + s.add(index); + bucket_index = index; + } + else if (occupied && continuation && shifted) { + s.add(index); + } + else if (!occupied && !continuation && shifted) { + s.remove(); + bucket_index = s.peek(); + } + else if (!occupied && continuation && shifted) { + // do nothing + } + else if (occupied && !continuation && shifted) { + s.add(index); + s.remove(); + bucket_index = s.peek(); + } + fingerprint = slot >> 3; + index++; + return true; + } + + void print() { + System.out.println("original slot: " + index + " " + bucket_index); + } + + +} diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuickBitVector.java new file mode 100644 index 000000000..ca387ebc9 --- /dev/null +++ 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
Copyright (c) 1999 CERN - European Organization for Nuclear Research.
Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose
is hereby granted without fee, provided that the above copyright notice appear in all copies and
that both that copyright notice and this permission notice appear in supporting documentation.
CERN makes no representations about the suitability of this software for any purpose.
It is provided "as is" without expressed or implied warranty.
*/

/**
 * Implements quick non polymorphic non bounds checking low level bitvector operations.
 * Includes some operations that interpret sub-bitstrings as long integers.
 *
 * <p><b>WARNING: Methods of this class do not check preconditions.</b>
 * Provided with invalid parameters these methods may return (or set) invalid values without
 * throwing any exception. You should only use this class when performance is critical and you
 * are absolutely sure that indexes are within bounds.
 *
 * <p>A bitvector is modelled as a long array, i.e. {@code long[] bits} holds the bits of a
 * bitvector. Each long value holds 64 bits. The i-th bit is stored in {@code bits[i/64]} at
 * bit position {@code i % 64} (where bit position 0 refers to the least significant bit and 63
 * refers to the most significant bit).
 *
 * @author wolfgang.hoschek@cern.ch
 * @version 1.0, 09/24/99
 * @see java.util.BitSet
 */
public class QuickBitVector {
  protected static final int ADDRESS_BITS_PER_UNIT = 6; // 64 = 2^6
  protected static final int BITS_PER_UNIT = 64; // = 1 << ADDRESS_BITS_PER_UNIT
  protected static final int BIT_INDEX_MASK = 63; // = BITS_PER_UNIT - 1

  /** Precomputed bit masks: {@code pows[i]} has its {@code i} low-order bits set. */
  private static final long[] pows = precomputePows();

  /**
   * Makes this class non instantiable, but still inheritable.
   */
  protected QuickBitVector() {
  }

  /**
   * Returns a bit mask with bits in the specified range set to 1, all the rest set to 0.
   * In other words, returns a bit mask having 0,1,2,3,...,64 bits set.
   * If {@code to-from+1 == 0} then returns zero ({@code 0L}).
   * Precondition (not checked): {@code to-from+1 >= 0 && to-from+1 <= 64}.
   *
   * @param from index of start bit (inclusive)
   * @param to index of end bit (inclusive)
   * @return the bit mask having all bits between {@code from} and {@code to} set to 1
   */
  public static final long bitMaskWithBitsSetFromTo(long from, long to) {
    final int width = (int) (to - from + 1);
    return pows[width] << from;
  }

  /**
   * Changes the bit with index {@code bitIndex} in the bitvector {@code bits} to the "clear"
   * ({@code false}) state.
   *
   * @param bits the bitvector
   * @param bitIndex the index of the bit to be cleared
   */
  public static void clear(long[] bits, long bitIndex) {
    bits[(int) (bitIndex >> ADDRESS_BITS_PER_UNIT)] &= ~(1L << (bitIndex & BIT_INDEX_MASK));
  }

  /**
   * Returns from the bitvector the value of the bit with the specified index.
   * The value is {@code true} if the bit with the index {@code bitIndex} is currently set;
   * otherwise, returns {@code false}.
   *
   * @param bits the bitvector
   * @param bitIndex the bit index
   * @return the value of the bit with the specified index
   */
  public static boolean get(long[] bits, long bitIndex) {
    final long unit = bits[(int) (bitIndex >> ADDRESS_BITS_PER_UNIT)];
    return (unit & (1L << (bitIndex & BIT_INDEX_MASK))) != 0;
  }

  /**
   * Returns a long value representing bits of a bitvector from index {@code from} to index
   * {@code to}. Bits are returned as a long value with the return value having bit 0 set to bit
   * {@code from}, ..., bit {@code to-from} set to bit {@code to}.
   * All other bits of the return value are set to 0.
   * If {@code from > to} then returns zero ({@code 0L}).
   * Precondition (not checked): {@code to-from+1 <= 64}.
   *
   * @param bits the bitvector
   * @param from index of start bit (inclusive)
   * @param to index of end bit (inclusive)
   * @return the specified bits as long value
   */
  public static long getLongFromTo(long[] bits, long from, long to) {
    if (from > to) {
      return 0L;
    }

    final int fromIndex = (int) (from >> ADDRESS_BITS_PER_UNIT); // equivalent to from / 64
    final int toIndex = (int) (to >> ADDRESS_BITS_PER_UNIT);
    final int fromOffset = (int) (from & BIT_INDEX_MASK); // equivalent to from % 64
    final int toOffset = (int) (to & BIT_INDEX_MASK);

    if (fromIndex == toIndex) {
      // Range does not cross a unit boundary: the value lives in one single long.
      final long mask = bitMaskWithBitsSetFromTo(fromOffset, toOffset);
      return (bits[fromIndex] & mask) >>> fromOffset;
    }

    // Range crosses a unit boundary: the value is spread over two longs.
    final long lowMask = bitMaskWithBitsSetFromTo(fromOffset, BIT_INDEX_MASK);
    final long lowPart = (bits[fromIndex] & lowMask) >>> fromOffset;

    final long highMask = bitMaskWithBitsSetFromTo(0, toOffset);
    final long highPart = (bits[toIndex] & highMask) << (BITS_PER_UNIT - fromOffset);

    return lowPart | highPart;
  }

  /**
   * Returns the index of the least significant bit in state "true".
   * Returns 32 if no bit is in state "true".
   *
   * <p>Examples:
   * <pre>
   * 0x80000000 : 31
   * 0x7fffffff : 0
   * 0x00000001 : 0
   * 0x00000000 : 32
   * </pre>
   *
   * @param value the integer value for which the least significant bit index is to be found
   * @return the index of the least significant bit in state "true"; 32 if no bit is in state
   *     "true"
   */
  public static int leastSignificantBit(int value) {
    return Integer.numberOfTrailingZeros(value);
  }

  /**
   * Constructs a low level bitvector that holds {@code size} elements, with each element taking
   * {@code bitsPerElement} bits. The returned array has {@code ceil(size * bitsPerElement / 64)}
   * units, all zero.
   *
   * @param size the number of elements to be stored in the bitvector
   * @param bitsPerElement the number of bits one single element takes
   * @return a low level bitvector
   * @throws IllegalArgumentException if an argument is negative, or if the requested number of
   *     bits overflows a {@code long} or needs more units than an array can index
   */
  // NOTE(review): the (long, int) parameter types follow the call made by
  // QuickBitVectorWrapper; confirm the parameter names against the upstream file.
  public static long[] makeBitVector(long size, int bitsPerElement) {
    if (size < 0 || bitsPerElement < 0) {
      throw new IllegalArgumentException(
          "size and bitsPerElement must be non-negative: " + size + ", " + bitsPerElement);
    }
    long nBits;
    try {
      nBits = Math.multiplyExact(size, (long) bitsPerElement);
    } catch (ArithmeticException e) {
      throw new IllegalArgumentException(
          "bit count overflows: " + size + " * " + bitsPerElement, e);
    }
    // (nBits - 1) / 64 + 1 == number of 64-bit units needed; yields 0 for nBits == 0.
    final long numUnits = ((nBits - 1) >> ADDRESS_BITS_PER_UNIT) + 1;
    if (numUnits > Integer.MAX_VALUE) {
      // A plain (int) cast here would silently allocate a vector of the wrong size.
      throw new IllegalArgumentException("bitvector too large: " + nBits + " bits");
    }
    return new long[(int) numUnits];
  }

  /**
   * Returns the index of the most significant bit in state "true".
   * Returns -1 if no bit is in state "true".
   *
   * <p>Examples:
   * <pre>
   * 0x80000000 : 31
   * 0x7fffffff : 30
   * 0x00000001 : 0
   * 0x00000000 : -1
   * </pre>
   *
   * @param value the integer value for which the most significant bit index is to be found
   * @return the index of the most significant bit in state "true"; -1 if no bit is in state
   *     "true"
   */
  public static int mostSignificantBit(int value) {
    return (Integer.SIZE - 1) - Integer.numberOfLeadingZeros(value);
  }

  /**
   * Initializes a table with numbers having 0,1,2,...,64 low-order bits set, to speed up the
   * mask computations in the other methods. {@code pows[i]} has bits {@code [0..i-1]} set, so
   * {@code pows[0] == 0L} and {@code pows[64] == -1L}.
   *
   * @return the table of bit masks
   */
  private static long[] precomputePows() {
    final long[] table = new long[BITS_PER_UNIT + 1];
    final long allOnes = ~0L;
    for (int i = BITS_PER_UNIT; i >= 1; i--) {
      table[i] = allOnes >>> (BITS_PER_UNIT - i);
    }
    // Set explicitly: a shift by 64 is a shift by 0 in Java, so the loop cannot produce it.
    table[0] = 0L;
    return table;
  }

  /**
   * Sets the bit with index {@code bitIndex} in the bitvector {@code bits} to the state
   * specified by {@code value}.
   *
   * @param bits the bitvector
   * @param bitIndex the index of the bit to be changed
   * @param value the value to be stored in the bit
   */
  public static void put(long[] bits, long bitIndex, boolean value) {
    if (value) {
      set(bits, bitIndex);
    } else {
      clear(bits, bitIndex);
    }
  }

  /**
   * Sets bits of a bitvector from index {@code from} to index {@code to} to the bits of
   * {@code value}. Bit {@code from} is set to bit 0 of {@code value}, ..., bit {@code to} is set
   * to bit {@code to-from} of {@code value}. All other bits stay unaffected.
   * If {@code from > to} then does nothing.
   * Precondition (not checked): {@code to-from+1 <= 64}.
   *
   * @param bits the bitvector
   * @param value the value to be copied into the bitvector
   * @param from index of start bit (inclusive)
   * @param to index of end bit (inclusive)
   */
  public static void putLongFromTo(long[] bits, long value, long from, long to) {
    if (from > to) {
      return;
    }

    final int fromIndex = (int) (from >> ADDRESS_BITS_PER_UNIT); // equivalent to from / 64
    final int toIndex = (int) (to >> ADDRESS_BITS_PER_UNIT);
    final int fromOffset = (int) (from & BIT_INDEX_MASK); // equivalent to from % 64
    final int toOffset = (int) (to & BIT_INDEX_MASK);

    // Drop every bit of value above the range width so it cannot leak into neighbours.
    final long unusedMask = bitMaskWithBitsSetFromTo(to - from + 1, BIT_INDEX_MASK);
    final long cleanValue = value & ~unusedMask;

    if (fromIndex == toIndex) {
      // Range does not cross a unit boundary: the value goes into one single long.
      final long mask = bitMaskWithBitsSetFromTo(fromOffset, toOffset);
      bits[fromIndex] = (bits[fromIndex] & ~mask) | (cleanValue << fromOffset);
      return;
    }

    // Range crosses a unit boundary: the value goes into two longs.
    final long lowMask = bitMaskWithBitsSetFromTo(fromOffset, BIT_INDEX_MASK);
    bits[fromIndex] = (bits[fromIndex] & ~lowMask) | (cleanValue << fromOffset);

    final long highMask = bitMaskWithBitsSetFromTo(0, toOffset);
    bits[toIndex] = (bits[toIndex] & ~highMask) | (cleanValue >>> (BITS_PER_UNIT - fromOffset));
  }

  /**
   * Changes the bit with index {@code bitIndex} in the bitvector {@code bits} to the "set"
   * ({@code true}) state.
   *
   * @param bits the bitvector
   * @param bitIndex the index of the bit to be set
   */
  public static void set(long[] bits, long bitIndex) {
    bits[(int) (bitIndex >> ADDRESS_BITS_PER_UNIT)] |= 1L << (bitIndex & BIT_INDEX_MASK);
  }

  /**
   * Returns the index of the unit that contains the given bitIndex.
   *
   * @param bitIndex the index of the bit to be checked
   * @return the index of the unit that contains the given bitIndex
   */
  protected static long unit(long bitIndex) {
    return bitIndex >> ADDRESS_BITS_PER_UNIT; // equivalent to bitIndex / 64
  }
}
+ */ + +package org.apache.datasketches.filters.quotientfilter; + +public class QuickBitVectorWrapper extends Bitmap { + + long[] bs; + + public QuickBitVectorWrapper(int bits_per_entry, long num_entries) { + bs = QuickBitVector.makeBitVector(num_entries, bits_per_entry); + } + + @Override + public long size() { + return (long)bs.length * Long.BYTES * 8L; + } + + @Override + public void set(long bit_index, boolean value) { + if (value) { + QuickBitVector.set(bs, bit_index); + } + else { + QuickBitVector.clear(bs, bit_index); + } + } + + @Override + public void setFromTo(long from, long to, long value) { + QuickBitVector.putLongFromTo(bs, value, from, to - 1); + } + + @Override + public boolean get(long bit_index) { + return QuickBitVector.get(bs, bit_index); + } + + @Override + public long getFromTo(long from, long to) { + return QuickBitVector.getLongFromTo(bs, from, to - 1); + } + + +} + diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java new file mode 100644 index 000000000..9e0f08ce5 --- /dev/null +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilter.java @@ -0,0 +1,945 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + + +package org.apache.datasketches.filters.quotientfilter; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Set; + +import org.apache.datasketches.filters.quotientfilter.Bitmap; +import org.apache.datasketches.memory.XxHash; + +public class QuotientFilter extends Filter { + + int bitPerEntry; + int fingerprintLength; + int power_of_two_size; + int num_extension_slots; + int num_existing_entries; + Bitmap filter; + + // These three fields are used to prevent throwing exceptions when the buffer space of the filter is exceeded + long last_empty_slot; + long last_cluster_start; + public long backward_steps; + + double expansion_threshold; + long max_entries_before_expansion; + boolean expand_autonomously; + boolean is_full; + + // statistics, computed in the compute_statistics method. method should be called before these are used + long num_runs; + long num_clusters; + public double avg_run_length; + public double avg_cluster_length; + + int original_fingerprint_size; + int num_expansions; + + + public QuotientFilter(int power_of_two, int bits_per_entry) { + power_of_two_size = power_of_two; + bitPerEntry = bits_per_entry; + fingerprintLength = bits_per_entry - 3; + long init_size = 1L << power_of_two; + //System.out.println("Init size: " + init_size); + num_extension_slots = power_of_two * 2; + // System.out.println("Extension slots: " + num_extension_slots); + + filter = make_filter(init_size, bits_per_entry); + + expansion_threshold = 0.8; + max_entries_before_expansion = (int) (init_size * expansion_threshold); + expand_autonomously = false; + is_full = false; + + original_fingerprint_size = fingerprintLength; + num_expansions = 0; + //hash_type = XxHash.hashLong ; //HashType.xxh; + + last_empty_slot = init_size + num_extension_slots - 1; + last_cluster_start = 0; + backward_steps = 0; + //measure_num_bits_per_entry(); + } + + 
//nuevo + void update(long init_size) + { + last_empty_slot = init_size + num_extension_slots - 1; + last_cluster_start = 0; + backward_steps = 0; + } + + public boolean rejuvenate(long key) { + return false; + } + + public long get_num_existing_entries() { + return num_existing_entries; + } + + public long get_max_entries_before_expansion() { + return max_entries_before_expansion; + } + + public boolean expand_autonomously() { + return expand_autonomously; + } + + public void set_expand_autonomously(boolean val) { + expand_autonomously = val; + } + + Bitmap make_filter(long init_size, int bits_per_entry) { +// System.out.println(init_size ) ; +// System.out.println(num_extension_slots); +// System.out.println("Making BitVector with: " + (init_size + num_extension_slots) + "SLOTS"); + return new QuickBitVectorWrapper(bits_per_entry, init_size + num_extension_slots); + } + + public int get_fingerprint_length() { + return fingerprintLength; + } + + QuotientFilter(int power_of_two, int bits_per_entry, Bitmap bitmap) { + power_of_two_size = power_of_two; + bitPerEntry = bits_per_entry; + fingerprintLength = bits_per_entry - 3; + filter = bitmap; + num_extension_slots = power_of_two * 2; + + //nuevo + long init_size = 1L << power_of_two; + last_empty_slot = init_size + num_extension_slots - 1; + last_cluster_start = 0; + backward_steps = 0; + } + + boolean expand() { + is_full = true; + return false; + } + + // measures the number of bits per entry for the filter + public double measure_num_bits_per_entry() { + return measure_num_bits_per_entry(this, new ArrayList()); + } + + // measures the number of bits per entry for the filter + // it takes an array of filters as a parameter since some filter implementations here consist of multiple filter objects + protected static double measure_num_bits_per_entry(QuotientFilter current, ArrayList other_filters) { + //System.out.println("--------------------------"); + //current.print_filter_summary(); + //System.out.println(); + 
double num_entries = current.get_num_entries(false); + for (QuotientFilter q : other_filters) { + //q.print_filter_summary(); + //System.out.println(); + long q_num_entries = q.get_num_entries(false); + num_entries += q_num_entries; + } + long init_size = 1L << current.power_of_two_size; + long num_bits = current.bitPerEntry * init_size + current.num_extension_slots * current.bitPerEntry; + for (QuotientFilter q : other_filters) { + init_size = 1L << q.power_of_two_size; + num_bits += q.bitPerEntry * init_size + q.num_extension_slots * q.bitPerEntry; + } + //System.out.println("total entries: \t\t" + num_entries); + //System.out.println("total bits: \t\t" + num_bits); + double bits_per_entry = num_bits / num_entries; + //System.out.println("total bits/entry: \t" + bits_per_entry); + //System.out.println(); + return bits_per_entry; + } + + // scans the quotient filter and returns the number of non-empty slots + public long get_num_entries(boolean include_all_internal_filters) { + //long bits = filter.size(); + long slots = get_physcial_num_slots(); + long num_entries = 0; + for (long i = 0; i < slots; i++) { + if (is_occupied(i) || is_continuation(i) || is_shifted(i)) { + num_entries++; + } + } + return num_entries; + } + + // returns the fraction of occupied slots in the filter + public double get_utilization() { + long num_logical_slots = 1L << power_of_two_size; + long num_entries = get_num_entries(false); + double util = num_entries / (double) num_logical_slots; + return util; + } + + public long get_physcial_num_slots() { + long bits = filter.size(); + return bits / bitPerEntry; + } + + // returns the number of physical slots in the filter (including the extention/buffer slots at the end) + public long get_logical_num_slots_plus_extensions() { + return (1L << power_of_two_size) + num_extension_slots; + } + + // returns the number of slots in the filter without the extension/buffer slots + public long get_logical_num_slots() { + return 1L << power_of_two_size; + 
} + + // sets the metadata flag bits for a given slot index + void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifted, + long index) { + set_occupied(index, is_occupied); + set_continuation(index, is_continuation); + set_shifted(index, is_shifted); + } + + // sets the fingerprint for a given slot index + void set_fingerprint(long index, long fingerprint) { + filter.setFromTo(index * bitPerEntry + 3, (long)index * bitPerEntry + 3 + fingerprintLength, fingerprint); + } + + // print a nice representation of the filter that can be understood. + // if vertical is on, each line will represent a slot + public String get_pretty_str(boolean vertical) { + StringBuffer sbr = new StringBuffer(); + + long logic_slots = get_logical_num_slots(); + long all_slots = get_logical_num_slots_plus_extensions(); + + for (long i = 0; i < filter.size(); i++) { + long remainder = i % bitPerEntry; + if (remainder == 0) { + long slot_num = i/bitPerEntry; + sbr.append(" "); + if (vertical) { + if (slot_num == logic_slots ){//|| slot_num == all_slots) { + sbr.append("\n ---------"); + } else if (slot_num == all_slots) { + sbr.append("\n d***********b"); + } + //sbr.append("\n" + slot_num + " "); + sbr.append("\n" + String.format("%-10d", slot_num) + "\t"); + } + } + if (remainder == 3) { + sbr.append(" "); + } + sbr.append(filter.get(i) ? "1" : "0"); + } + sbr.append("\n"); + return sbr.toString(); + } + + // print a nice representation of the filter that can be humanly read. 
+ public void pretty_print() { + System.out.print(get_pretty_str(true)); + } + + // return a fingerprint in a given slot index + long get_fingerprint(long index) { + return filter.getFromTo(index * bitPerEntry + 3, index * bitPerEntry + 3 + fingerprintLength); + } + + // return an entire slot representation, including metadata flags and fingerprint + long get_slot(long index) { + return filter.getFromTo(index * bitPerEntry, (index + 1) * bitPerEntry); + } + + // compare a fingerprint input to the fingerprint in some slot index + protected boolean compare(long index, long fingerprint) { + return get_fingerprint(index) == fingerprint; + } + + // modify the flags and fingerprint of a given slot + void modify_slot(boolean is_occupied, boolean is_continuation, boolean is_shifted, + long index, long fingerprint) { + modify_slot(is_occupied, is_continuation, is_shifted, index); + set_fingerprint(index, fingerprint); + } + + // summarize some statistical measures about the filter + public void print_filter_summary() { + long num_entries = get_num_entries(false); + long slots = (1L << power_of_two_size) + num_extension_slots; + long num_bits = slots * bitPerEntry; + System.out.println("slots:\t" + slots); + System.out.println("entries:\t" + num_entries); + System.out.println("bits\t:" + num_bits); + System.out.println("bits/entry\t:" + num_bits / (double)num_entries); + System.out.println("FP length:\t" + fingerprintLength); + System.out.println("Is full?\t" + is_full); + double capacity = num_entries / (double)(slots) ; + System.out.println("Capacity\t" + capacity); + compute_statistics(); + //System.out.println("num runs: \t\t" + num_runs); + //System.out.println("avg run length: \t" + avg_run_length); + //System.out.println("num clusters: \t\t" + num_clusters); + //System.out.println("avg cluster length: \t" + avg_cluster_length); + } + + @Override + public long get_space_use(){ + /* + Returns the number of bits used for the filter + */ + long slots = (1L << 
power_of_two_size); // + num_extension_slots; + long num_bits = slots * bitPerEntry; + return num_bits ; + } + + public int get_bits_per_entry() { + return bitPerEntry; + } + + boolean is_occupied(long index) { + return filter.get(index * bitPerEntry); + } + + boolean is_continuation(long index) { + return filter.get(index * bitPerEntry + 1); + } + + boolean is_shifted(long index) { + return filter.get(index * bitPerEntry + 2); + } + + void set_occupied(long index, boolean val) { + filter.set(index * bitPerEntry, val); + } + + void set_continuation(long index, boolean val) { + filter.set(index * bitPerEntry + 1, val); + } + + void set_shifted(long index, boolean val) { + filter.set(index * bitPerEntry + 2, val); + } + + boolean is_slot_empty(long index) { + return !is_occupied(index) && !is_continuation(index) && !is_shifted(index); + } + + // scan the cluster leftwards until finding the start of the cluster and returning its slot index + // used by deletes + long find_cluster_start(long index) { + long current_index = index; + while (is_shifted(current_index)) { + current_index--; + } + return current_index; + } + + // given a canonical slot A, finds the actual index B of where the run belonging to slot A now resides + // since the run might have been shifted to the right due to collisions + long find_run_start(long index) { + long current_index = index; + int runs_to_skip_counter = 1; + while (is_shifted(current_index)) { + if (is_occupied(current_index)) { + runs_to_skip_counter++; + } + current_index--; + } + last_cluster_start = current_index - 1; + while (true) { + if (!is_continuation(current_index)) { + runs_to_skip_counter--; + if (runs_to_skip_counter == 0) { + return current_index; + } + } + current_index++; + } + } + + // given the start of a run, scan the run and return the index of the first matching fingerprint + long find_first_fingerprint_in_run(long index, long fingerprint) { + assert(!is_continuation(index)); + do { + if (compare(index, 
fingerprint)) { + //System.out.println("found matching FP at index " + index); + return index; + } + index++; + } while (index < get_logical_num_slots_plus_extensions() && is_continuation(index)); + return -1; + } + + // delete the last matching fingerprint in the run + long decide_which_fingerprint_to_delete(long index, long fingerprint) { + assert(!is_continuation(index)); + long matching_fingerprint_index = -1; + do { + if (compare(index, fingerprint)) { + //System.out.println("found matching FP at index " + index); + matching_fingerprint_index = index; + } + index++; + } while (index < get_logical_num_slots_plus_extensions() && is_continuation(index)); + return matching_fingerprint_index; + } + + // given the start of a run, find the last slot index that still belongs to this run + long find_run_end(long index) { + while(index < get_logical_num_slots_plus_extensions() - 1 && is_continuation(index+1)) { + index++; + } + return index; + } + + // given a canonical index slot and a fingerprint, find the relevant run and check if there is a matching fingerprint within it + boolean search(long fingerprint, long index) { + boolean does_run_exist = is_occupied(index); + if (!does_run_exist) { + return false; + } + long run_start_index = find_run_start(index); + long found_index = find_first_fingerprint_in_run(run_start_index, fingerprint); + return found_index > -1; + } + + // Given a canonical slot index, find the corresponding run and return all fingerprints in the run. + // This method is only used for testing purposes. + Set get_all_fingerprints(long bucket_index) { + boolean does_run_exist = is_occupied(bucket_index); + HashSet set = new HashSet(); + if (!does_run_exist) { + return set; + } + long run_index = find_run_start(bucket_index); + do { + set.add(get_fingerprint(run_index)); + run_index++; + } while (is_continuation(run_index)); + return set; + } + + // Swaps the fingerprint in a given slot with a new one. 
Return the pre-existing fingerprint + long swap_fingerprints(long index, long new_fingerprint) { + long existing = get_fingerprint(index); + set_fingerprint(index, new_fingerprint); + return existing; + } + + // finds the first empty slot after the given slot index + long find_first_empty_slot(long index) { + while (!is_slot_empty(index)) { + index++; + } + return index; + } + + // moves backwards to find the first empty slot + // used as a part of the mechanism to prevent exceptions when exceeding the quotient filter's bounds + long find_backward_empty_slot(long index) { + while (index >= 0 && !is_slot_empty(index)) { + backward_steps++; + index--; + } + return index; + } + + // return the first slot to the right where the current run starting at the index parameter ends + long find_new_run_location(long index) { + if (!is_slot_empty(index)) { + index++; + } + while (is_continuation(index)) { + index++; + } + return index; + } + + boolean insert_new_run(long canonical_slot, long long_fp) { + long first_empty_slot = find_first_empty_slot(canonical_slot); // finds the first empty slot to the right of the canonical slot that is empty + long preexisting_run_start_index = find_run_start(canonical_slot); // scans the cluster leftwards and then to the right until reaching our run's would be location + long start_of_this_new_run = find_new_run_location(preexisting_run_start_index); // If there is already a run at the would-be location, find its end and insert the new run after it + boolean slot_initially_empty = is_slot_empty(start_of_this_new_run); + + // modify some metadata flags to mark the new run + set_occupied(canonical_slot, true); + if (first_empty_slot != canonical_slot) { + set_shifted(start_of_this_new_run, true); + } + set_continuation(start_of_this_new_run, false); + + // if the slot was initially empty, we can just terminate, as there is nothing to push to the right + if (slot_initially_empty) { + set_fingerprint(start_of_this_new_run, long_fp); + if 
(start_of_this_new_run == last_empty_slot) { + last_empty_slot = find_backward_empty_slot(last_cluster_start); + } + num_existing_entries++; + return true; + } + + // push all entries one slot to the right + // if we inserted this run in the middle of a cluster + long current_index = start_of_this_new_run; + boolean is_this_slot_empty; + boolean temp_continuation = false; + do { + if (current_index >= get_logical_num_slots_plus_extensions()) { + return false; + } + + is_this_slot_empty = is_slot_empty(current_index); + long_fp = swap_fingerprints(current_index, long_fp); + + if (current_index > start_of_this_new_run) { + set_shifted(current_index, true); + } + + if (current_index > start_of_this_new_run) { + boolean current_continuation = is_continuation(current_index); + set_continuation(current_index, temp_continuation); + temp_continuation = current_continuation; + } + current_index++; + if (current_index == last_empty_slot) { // TODO get this out of the while loop + last_empty_slot = find_backward_empty_slot(last_cluster_start); + } + } while (!is_this_slot_empty); + num_existing_entries++; + return true; + } + + boolean insert(long long_fp, long index, boolean insert_only_if_no_match) { + //System.out.println("Inserting Fingerprint " + long_fp); + //System.out.println("Inserting @ index " + index); + //System.out.println("BoolMatch? 
" + insert_only_if_no_match); + //System.out.println("**********"); + //System.out.println("Num items: " + num_existing_entries); + //System.out.println("Max items: " + max_entries_before_expansion); + + if (index > last_empty_slot) { + return false; + } + boolean does_run_exist = is_occupied(index); + if (!does_run_exist) { + boolean val = insert_new_run(index, long_fp); + return val; + } + + long run_start_index = find_run_start(index); + if (does_run_exist && insert_only_if_no_match) { + long found_index = find_first_fingerprint_in_run(run_start_index, long_fp); + if (found_index > -1) { + return false; + } + } + return insert_fingerprint_and_push_all_else(long_fp, run_start_index); + } + + // insert an fingerprint as the first fingerprint of the new run and push all other entries in the cluster to the right. + boolean insert_fingerprint_and_push_all_else(long long_fp, long run_start_index) { + long current_index = run_start_index; + boolean is_this_slot_empty; + boolean finished_first_run = false; + boolean temp_continuation = false; + + do { + if (current_index >= get_logical_num_slots_plus_extensions()) { + return false; + } + is_this_slot_empty = is_slot_empty(current_index); + if (current_index > run_start_index) { + set_shifted(current_index, true); + } + if (current_index > run_start_index && !finished_first_run && !is_continuation(current_index)) { + finished_first_run = true; + set_continuation(current_index, true); + long_fp = swap_fingerprints(current_index, long_fp); + } + else if (finished_first_run) { + boolean current_continuation = is_continuation(current_index); + set_continuation(current_index, temp_continuation); + temp_continuation = current_continuation; + long_fp = swap_fingerprints(current_index, long_fp); + } + if (current_index == last_empty_slot) { + last_empty_slot = find_backward_empty_slot(last_cluster_start); + } + current_index++; + } while (!is_this_slot_empty); + num_existing_entries++; + return true; + } + + boolean delete(long 
fingerprint, long canonical_slot, long run_start_index, long matching_fingerprint_index) { + long run_end = find_run_end(matching_fingerprint_index); + + // the run has only one entry, we need to disable its is_occupied flag + // we just remember we need to do this here, and we do it later to not interfere with counts + boolean turn_off_occupied = run_start_index == run_end; + + // First thing to do is move everything else in the run back by one slot + for (long i = matching_fingerprint_index; i < run_end; i++) { + long f = get_fingerprint(i + 1); + set_fingerprint(i, f); + } + + // for each slot, we want to know by how much the entry there is shifted + // we can do this by counting the number of continuation flags set to true + // and the number of occupied flags set to false from the start of the cluster to the given cell + // and then subtracting: num_shifted_count - num_non_occupied = number of slots by which an entry is shifted + long cluster_start = find_cluster_start(canonical_slot); + long num_shifted_count = 0; + long num_non_occupied = 0; + for (long i = cluster_start; i <= run_end; i++) { + if (is_continuation(i)) { + num_shifted_count++; + } + if (!is_occupied(i)) { + num_non_occupied++; + } + } + + set_fingerprint(run_end, 0); + set_shifted(run_end, false); + set_continuation(run_end, false); + + // we now have a nested loop. The outer do-while iterates over the remaining runs in the cluster. + // the inner for loop iterates over cells of particular runs, pushing entries one slot back. + do { + // we first check if the next run actually exists and if it is shifted. + // only if both conditions hold, we need to shift it back one slot. 
+ //boolean does_next_run_exist = !is_slot_empty(run_end + 1); + //boolean is_next_run_shifted = is_shifted(run_end + 1); + //if (!does_next_run_exist || !is_next_run_shifted) { + if (run_end >= get_logical_num_slots_plus_extensions()-1 || + is_slot_empty(run_end + 1) || !is_shifted(run_end + 1)) { + if (turn_off_occupied) { + // if we eliminated a run and now need to turn the is_occupied flag off, we do it at the end to not interfere in our counts + set_occupied(canonical_slot, false); + + } + if (run_end > last_empty_slot) { + last_empty_slot = run_end; + } + return true; + } + + // we now find the start and end of the next run + long next_run_start = run_end + 1; + run_end = find_run_end(next_run_start); + + // before we start processing the next run, we check whether the previous run we shifted is now back to its canonical slot + // The condition num_shifted_count - num_non_occupied == 1 ensures that the run was shifted by only 1 slot, meaning it is now back in its proper place + if ( is_occupied(next_run_start - 1) && num_shifted_count - num_non_occupied == 1 ) { + set_shifted(next_run_start - 1, false); + } + else { + set_shifted(next_run_start - 1, true); + } + + for (long i = next_run_start; i <= run_end; i++) { + long f = get_fingerprint(i); + set_fingerprint(i - 1, f); + if (is_continuation(i)) { + set_continuation(i-1, true); + } + if (!is_occupied(i)) { + num_non_occupied++; + } + } + num_shifted_count += run_end - next_run_start; + set_fingerprint(run_end, 0); + set_shifted(run_end, false); + set_continuation(run_end, false); + } while (true); + } + + boolean delete(long fingerprint, long canonical_slot) { + if (canonical_slot >= get_logical_num_slots()) { + return false; + } + // if the run doesn't exist, the key can't have possibly been inserted + boolean does_run_exist = is_occupied(canonical_slot); + if (!does_run_exist) { + return false; + } + long run_start_index = find_run_start(canonical_slot); + + long matching_fingerprint_index = 
decide_which_fingerprint_to_delete(run_start_index, fingerprint); + + if (matching_fingerprint_index == -1) { + // we didn't find a matching fingerprint + return false; + } + + return delete(fingerprint, canonical_slot, run_start_index, matching_fingerprint_index); + + } + + + + /* + Performs the modular arithmetic of large_hash % bits_per_entry and uses this as the slot_index + */ + long get_slot_index(long large_hash) { + long slot_index_mask = (1L << power_of_two_size) - 1; + long slot_index = large_hash & slot_index_mask; + //System.out.format("\n**get_slot_index(): [total_hash:index_hash:int_index] --> [%016x:%016x:%016x]\n", large_hash, (int)large_hash, slot_index); + return slot_index; + } + + long gen_fingerprint(long large_hash) { + long fingerprint_mask = (1L << fingerprintLength) - 1L; + fingerprint_mask = fingerprint_mask << power_of_two_size; + long fingerprint = (large_hash & fingerprint_mask) >> power_of_two_size; + //System.out.format("\n**gen_fingerprint(): [total_hash:fingerprint_hash:int_fingerprint] --> [%016x:%016x:%016x]\n", large_hash, ((int)(large_hash>>32)), fingerprint); + return fingerprint; + } + +// void print_key(int input) { +// long large_hash = HashFunctions.normal_hash(input); +// long slot_index = get_slot_index(large_hash); +// long fingerprint = gen_fingerprint(large_hash); +// +// System.out.println("num : " + input); +// System.out.print("hash : "); +// print_long_in_binary(large_hash, fingerprintLength + power_of_two_size); +// //print_int_in_binary(slot_index_mask, 31); +// System.out.print("bucket: "); +// print_long_in_binary(slot_index, power_of_two_size); +// System.out.print("FP : "); +// //print_int_in_binary(fingerprint_mask, 31); +// print_long_in_binary(fingerprint, fingerprintLength); +// System.out.println(); +// +// } +// +// void set_expansion_threshold(double thresh) { +// expansion_threshold = thresh; +// max_entries_before_expansion = (long)(Math.pow(2, power_of_two_size) * expansion_threshold); +// } +// + 
/* + This is the main insertion function accessed externally. + It calls the underlying filter _insert function which hashes the input + item internally. + Hence, the `large_hash` argument is already a hash key that has been generated + by the hashing library (eg xxhash). + */ + protected boolean _insert(long large_hash, boolean insert_only_if_no_match) { + //System.out.println("Inserting long hash " + large_hash); + if (is_full) { + return false; + } + long slot_index = get_slot_index(large_hash); + long fingerprint = gen_fingerprint(large_hash); + + /*print_long_in_binary(large_hash, 64); + print_long_in_binary(slot_index, 32); + print_long_in_binary((int)fingerprint, 64); + System.out.println(slot_index + " " + fingerprint ); + System.out.println(); */ + + boolean success = insert(fingerprint, slot_index, false); + /*if (!success) { + System.out.println("insertion failure"); + System.out.println(input + "\t" + slot_index + "\t" + get_fingerprint_str(fingerprint, fingerprintLength)); + pretty_print(); + System.exit(1); + }*/ + +// if (expand_autonomously && num_existing_entries >= max_entries_before_expansion) { +// boolean expanded = expand(); +// if (expanded) { +// num_expansions++; +// } +// } + return success; + } +// +// protected boolean _delete(long large_hash) { +// long slot_index = get_slot_index(large_hash); +// long fp_long = gen_fingerprint(large_hash); +// boolean success = delete(fp_long, slot_index); +// if (success) { +// num_existing_entries--; +// } +// return success; +// } +// + protected boolean _search(long large_hash) { + long slot_index = get_slot_index(large_hash); + long fingerprint = gen_fingerprint(large_hash); + return search(fingerprint, slot_index); + } + + + + public boolean get_bit_at_offset(int offset) { + return filter.get(offset); + } + + public void compute_statistics() { + num_runs = 0; + num_clusters = 0; + double sum_run_lengths = 0; + double sum_cluster_lengths = 0; + + int current_run_length = 0; + int 
current_cluster_length = 0; + + long num_slots = get_logical_num_slots_plus_extensions(); + for (long i = 0; i < num_slots; i++) { + + boolean occupied = is_occupied(i); + boolean continuation = is_continuation(i); + boolean shifted = is_shifted(i); + + if ( !occupied && !continuation && !shifted ) { // empty slot + sum_cluster_lengths += current_cluster_length; + current_cluster_length = 0; + sum_run_lengths += current_run_length; + current_run_length = 0; + } + else if ( !occupied && !continuation && shifted ) { // start of new run + num_runs++; + sum_run_lengths += current_run_length; + current_run_length = 1; + current_cluster_length++; + } + else if ( !occupied && continuation && !shifted ) { + // not used + } + else if ( !occupied && continuation && shifted ) { // continuation of run + current_cluster_length++; + current_run_length++; + } + else if ( occupied && !continuation && !shifted ) { // start of new cluster & run + num_runs++; + num_clusters++; + sum_cluster_lengths += current_cluster_length; + sum_run_lengths += current_run_length; + current_cluster_length = 1; + current_run_length = 1; + } + else if (occupied && !continuation && shifted ) { // start of new run + num_runs++; + sum_run_lengths += current_run_length; + current_run_length = 1; + current_cluster_length++; + } + else if (occupied && continuation && !shifted ) { + // not used + } + else if (occupied && continuation && shifted ) { // continuation of run + current_cluster_length++; + current_run_length++; + } + } + avg_run_length = sum_run_lengths / num_runs; + avg_cluster_length = sum_cluster_lengths / num_clusters; + } + +// +// void ar_sum1(ArrayList ar, int index) +// { +// int s = ar.size(); +// if (s <= index) +// { +// for (int i = s; i measure_cluster_length() +// { +// ArrayList ar = new ArrayList(); +// +// num_runs = 0; +// num_clusters = 0; +// +// int current_run_length = 0; +// int current_cluster_length = 0; +// +// int cnt = 0; +// +// for (int i = 0; i < 
get_logical_num_slots_plus_extensions(); i++) { +// +// boolean occupied = is_occupied(i); +// boolean continuation = is_continuation(i); +// boolean shifted = is_shifted(i); +// +// if (!occupied && !continuation && !shifted ) { // empty slot +// if(current_cluster_length != 0) ar_sum1(ar, current_cluster_length-1); +// current_cluster_length = 0; +// current_run_length = 0; +// } +// else if (!occupied && !continuation && shifted ) { // start of new run +// num_runs++; +// current_run_length = 1; +// current_cluster_length++; +// } +// else if (!occupied && continuation && shifted ) { // continuation of run +// current_cluster_length++; +// current_run_length++; +// } +// else if (occupied && !continuation && !shifted ) { // start of new cluster & run +// if(current_cluster_length != 0) ar_sum1(ar, current_cluster_length-1); +// num_runs++; +// num_clusters++; +// //if(current_cluster_length == 0) cnt++; +// current_cluster_length = 1; +// current_run_length = 1; +// } +// else if (occupied && !continuation && shifted ) { // start of new run +// num_runs++; +// current_run_length = 1; +// current_cluster_length++; +// } +// else if (occupied && continuation && shifted ) { // continuation of run +// current_cluster_length++; +// current_run_length++; +// } +// } +// if(current_cluster_length != 0) ar_sum1(ar, current_cluster_length-1); +// //System.out.println("CNT = " + cnt); +// return ar; +// } +// +// /* +// @charlied +// Returns the fraction of the filter that is occupied by inserted items. +// Extension slots are omitted from the calculation of the load factor because they are used to accomodate +// items in the filter at the top end of the filter. +// Asymptotically, these will make little-to-no difference to the load in these calculations as the slots +// contributed 2*j / (2^j) --> 0 entries. 
+// */ +// public double get_load() { +// return num_existing_entries / (double) get_logical_num_slots(); +// } + +} + + diff --git a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java new file mode 100644 index 000000000..1f98c82f2 --- /dev/null +++ b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.filters.quotientfilter; +import java.util.concurrent.ThreadLocalRandom; + +import org.apache.datasketches.common.SketchesArgumentException; + +/** + * This class provides methods to help estimate the correct parameters when + * creating a Quotient filter, and methods to create the filter using those values. + * + * The underlying math is described in the + * + * Wikipedia article on Quotient filters. + */ +public final class QuotientFilterBuilder { + + /* + This function is used to suggest the number of bits per entry for a given number of entries. + The fingerprint length is related to the targetFalsePositiveProb roughly by 2^(-fingerprint_length). 
+ Hence, the length of the fingerprint can be stored in at most 8 bits. + This, after rounding up, is the same as the more sophisticated expression which involves the capacity + from https://en.wikipedia.org/wiki/Quotient_filter#Probability_of_false_positives. + * @param targetFalsePositiveProb A desired false positive probability per item + * @return The suggested fingerprint length in bits + */ + public static byte suggestFingerprintLength(double targetFalsePositiveProb) { + if (targetFalsePositiveProb <= 0. || targetFalsePositiveProb >= 1.) { + + throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0"); + } + return (byte) Math.ceil(-Math.log(targetFalsePositiveProb) / Math.log(2)); + } + + /** + * This method suggests the number of slots in the filter for a given input size, assuming 90% capacity. + * There is no load factor checking internally within the filter, so this method is used to map between the + * number of items we insert into a sketch and the number of slots we need to allocate. + * A design feature of Niv's implementation is that 2^j +2*j slots are allocated. This asymptotically approaches + * 2^j slots as j grows, and the canonical number of slots is 2^j. Therefore, we will only check against + * 0.9*2^j slots. + * The load factor is 0.9 to get some space-utility advantages over the bloom filter. + * @param maxDistinctItems The maximum number of distinct items that can be inserted into the filter. + * @return The log-base-2 of the number of slots in the filter. 
+ */ + public static byte suggestLgNumSlots(long maxDistinctItems) { + if (maxDistinctItems <= 0) { + throw new SketchesArgumentException("maxDistinctItems must be strictly positive"); + } + byte result = (byte) Math.ceil(Math.log(maxDistinctItems / 0.9) / Math.log(2)); + if (result < 31) { + return result; + } else { + // Largest address space for a Java array is 2^31 - 1 + throw new SketchesArgumentException("Largest address space for a Java array is 2^31 - 1"); + } + } + + /* + Returns the largest number of unique items that can be inserted into the filter. + We use a predefined load factor of 0.9 compared to the number of slots as 2^j. + @param lgNumSlots The log-base-2 of the number of slots in the filter + @return The maximum number of items that can be inserted into the filter + */ + public static long suggestMaxNumItemsFromNumSlots(byte lgNumSlots) { + if (lgNumSlots <= 0) { + throw new SketchesArgumentException("lgNumSlots must be at least 1."); + } else if (lgNumSlots >= 31) { + throw new SketchesArgumentException("lgNumSlots cannot exceed 2^31 - 1."); + } + return (long) Math.floor(0.9 * Math.pow(2, lgNumSlots)); + } + + + /** + * This method suggests the parameters for a Quotient filter based on the maximum number of distinct items and the target false positive probability. + * It first validates the inputs, then calculates the log-base-2 of the number of slots and the fingerprint length. + * The results are returned as a QFPair object. + * + * @param maxDistinctItems The maximum number of distinct items that can be inserted into the filter. + * @param targetFalsePositiveProb The desired false positive probability per item. + * @return A QFPair object containing the suggested number of slots (lgNumSlots) and the suggested fingerprint length. + * @throws SketchesArgumentException if the input parameters are not valid. 
+ */ + public static QFPair suggestParamsFromMaxDistinctsFPP(long maxDistinctItems, double targetFalsePositiveProb) { + validateAccuracyInputs(maxDistinctItems, targetFalsePositiveProb); + byte lgNumSlots = suggestLgNumSlots(maxDistinctItems); + byte fingerprintLength = suggestFingerprintLength(targetFalsePositiveProb); + return new QFPair(lgNumSlots, fingerprintLength); + } + + private static void validateAccuracyInputs(final long maxDistinctItems, final double targetFalsePositiveProb) { + if (maxDistinctItems <= 0) { + throw new SketchesArgumentException("maxDistinctItems must be strictly positive"); + } + if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) { + throw new SketchesArgumentException("targetFalsePositiveProb must be a valid probability and strictly greater than 0"); + } + } + + /** + * Helper class to return a pair of parameters for a Quotient filter: + * the log-base-2 of the number of slots (lgNumSlots) and the fingerprint length. + * These parameters are used to configure the Quotient filter. + */ + public static class QFPair { + public final byte lgNumSlots; + public final byte fingerprintLength; + + public QFPair(byte lgNumSlots, byte fingerprintLength) { + this.lgNumSlots = lgNumSlots; + this.fingerprintLength = fingerprintLength; + } + } + +} diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java new file mode 100644 index 000000000..487e36576 --- /dev/null +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/BitVectorTests.java @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.filters.quotientfilter; + +import org.testng.annotations.Test; +import static org.testng.Assert.assertEquals; +import static org.testng.Assert.assertTrue; +import static org.testng.Assert.assertFalse; + +public class BitVectorTests { + + /** + * This test method initializes a QuickBitVectorWrapper with various combinations of bits per entry and number of entries. + * It then calculates the expected length of the bit vector and asserts that the actual size of the bit vector matches the expected length. + * + * Example Input-Output Pairs: + * 1. Input: bitsPerEntry = 2, numEntries = 8 (1L << 3) + * Output: expectedLengthBits = 64 + * + * 2. Input: bitsPerEntry = 3, numEntries = 16 (1L << 4) + * Output: 64 + * + * 3. Input: bitsPerEntry = 33, numEntries = 8 (1L << 3) + * Output: expectedLengthBits = 320 + */ + @Test + static public void testSize(){ + int[] bitsPerEntry = {2, 3, 4, 5, 6, 7, 8, 9, 10, 23, 24, 25, 31, 32, 33}; + long[] numEntries = {1L << 3, 1L<<4, 1L<<8, 1L << 16}; + long nBits ; + long expectedLengthBits ; + + for (int i = 0; i < bitsPerEntry.length; i++){ + for (int j = 0; j < numEntries.length; j++) { + QuickBitVectorWrapper bv = new QuickBitVectorWrapper(bitsPerEntry[i], numEntries[j]); + nBits = bitsPerEntry[i] * numEntries[j]; + expectedLengthBits = 64 * ((nBits % 64 == 0) ? 
(nBits / 64) : (1 + nBits / 64)); + assertEquals(bv.size(), expectedLengthBits); + } + } + } + + /* + This test amends a few entries in the BitVector and checks that they are appropriately set. + */ + @Test + static public void testSettersAndGetters(){ + QuickBitVectorWrapper bv = new QuickBitVectorWrapper(6, 16); + + // All entries should be False before any updates + for (int i = 0; i < bv.size(); i++){ + assertFalse(bv.get(i), "All entries should be False"); + } + + // Set some values + bv.set(0, true); + assertTrue(bv.get(0), "Value at index 0 should be True"); + + bv.set(32, true) ; + assertTrue(bv.get(32), "Value at index 32 should be True"); + + bv.setFromTo(64, 128, ~0L); + assertTrue(bv.getFromTo(64, 128) == -1L, "Values from 64 to 128 should be set to 1") ; + } +} diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java new file mode 100644 index 000000000..3f42d3657 --- /dev/null +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/DeletionTests.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.filters.quotientfilter;
import org.testng.annotations.Test;
import static org.testng.Assert.assertTrue;

import java.util.BitSet;

/**
 * Tests for QuotientFilter.delete. Each test inserts (fingerprint, slot) pairs, deletes some of
 * them and compares the filter bit-for-bit with an expected layout built through
 * QuotientFilterTest.set_slot_in_test, whose boolean arguments are, in order,
 * is_occupied, is_continuation and is_shifted.
 */
public class DeletionTests {

  /**
   * Inserts five keys into slot 1 (creating an overflow into the following slots) plus one key
   * each into slots 2 and 4, then deletes all five keys of slot 1.
   * The two surviving keys are expected to sit unshifted in slots 2 and 4.
   */
  @Test
  public static void BasicDeletions() {
    final int bits_per_entry = 8;
    final int num_entries_power = 3;
    final int num_entries = 1 << num_entries_power;
    final QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry);

    final long fp1 = 1L << 4;
    final long fp2 = 1L << 3;
    final long fp3 = 1L << 2;
    final long fp4 = 31;

    qf.insert(fp4, 1, false);
    qf.insert(fp1, 1, false);
    qf.insert(fp1, 1, false);
    qf.insert(fp2, 2, false);
    qf.insert(fp1, 1, false);
    qf.insert(fp1, 1, false);
    qf.insert(fp3, 4, false);

    // 31 is the value of fp4; the remaining four deletions remove every copy of fp1.
    qf.delete(31, 1);
    for (int copy = 0; copy < 4; copy++) {
      qf.delete(fp1, 1);
    }

    final BitSet expected = new BitSet(num_entries * bits_per_entry);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 2, true, false, false, fp2);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 4, true, false, false, fp3);
    assertTrue(QuotientFilterTest.check_equality(qf, expected, true));
  }

  /**
   * Inserts eleven keys that all carry fingerprint 0 into slots 1, 1, 2, 2, 3, 3, 3, 6, 6, 6, 7,
   * then deletes one key from slot 2 and one from slot 3.
   * The nine remaining entries are expected to occupy physical slots 1 through 9 with the
   * metadata bits listed below.
   */
  @Test
  public static void DeletionsWithSameFingerprint() {
    final int bits_per_entry = 8;
    final int num_entries_power = 3;
    final int num_entries = 1 << num_entries_power;
    final QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry);

    // All keys have the same fingerprint but are mapped into (mostly) different slots.
    final int[] insertionSlots = {1, 1, 2, 2, 3, 3, 3, 6, 6, 6, 7};
    for (final int slot : insertionSlots) {
      qf.insert(0, slot, false);
    }

    qf.delete(0, 2);
    qf.delete(0, 3);

    final BitSet expected = new BitSet(num_entries * bits_per_entry);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 1, true, false, false, 0);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 2, true, true, true, 0);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 3, true, false, true, 0);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 4, false, false, true, 0);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 5, false, true, true, 0);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 6, true, false, false, 0);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 7, true, true, true, 0);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 8, false, true, true, 0);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 9, false, false, true, 0);

    assertTrue(QuotientFilterTest.check_equality(qf, expected, true));
  }

  /**
   * Inserts eight keys with fingerprint 0 into slots 1, 1, 2, 2, 3, 4, 4, 5, so that later
   * entries are pushed past their own slot, then deletes the single key of slot 3.
   * The seven remaining entries are expected to occupy physical slots 1 through 7 with the
   * metadata bits listed below.
   */
  @Test
  public static void DeletionsWithOverflow() {
    final int bits_per_entry = 8;
    final int num_entries_power = 3;
    final int num_entries = 1 << num_entries_power;
    final QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry);

    final int[] insertionSlots = {1, 1, 2, 2, 3, 4, 4, 5};
    for (final int slot : insertionSlots) {
      qf.insert(0, slot, false);
    }

    qf.delete(0, 3);

    final BitSet expected = new BitSet(num_entries * bits_per_entry);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 1, true, false, false, 0);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 2, true, true, true, 0);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 3, false, false, true, 0);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 4, true, true, true, 0);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 5, true, false, true, 0);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 6, false, true, true, 0);
    QuotientFilterTest.set_slot_in_test(expected, bits_per_entry, 7, false, false, true, 0);
    assertTrue(QuotientFilterTest.check_equality(qf, expected, true));
  }
}
agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.filters.quotientfilter; + +import org.apache.datasketches.common.SketchesArgumentException; +import org.apache.datasketches.filters.quotientfilter.QuotientFilterBuilder; +import org.apache.datasketches.memory.WritableMemory; +import org.testng.annotations.Test; + +import static org.testng.Assert.*; +public class QuotientFilterBuilderTest { + + @Test + public void testSuggestFingerprintLengthFromFPP(){ + // invalid false positive rate + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestFingerprintLength(0.)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestFingerprintLength(1.)); + + // manually computed values based on formula using ceil(log2(1/targetFalsePositiveProb)) + double[] fpps = {0.1, 0.01, 0.001, 0.0001, 1E-5, 1E-6, 1E-7, 1E-8}; + byte[] results = {4, 7, 10, 14, 17, 20, 24, 27, 30}; + for (int i = 0; i < fpps.length; i++) { + assertEquals(QuotientFilterBuilder.suggestFingerprintLength(fpps[i]), results[i]); + } + } + + @Test + public static void testSuggestLgNumSlots(){ + QuotientFilterBuilder qfb = new QuotientFilterBuilder(); + + // invalid number of items + assertThrows(SketchesArgumentException.class, () -> 
QuotientFilterBuilder.suggestLgNumSlots(0)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(-1)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestLgNumSlots(5000000000L)); + + long[] numItems = {1, 100, 1000, 1000000L}; + int[] results = {1, 7, 11, 21} ; + + for (int i = 0; i < numItems.length; i++) { + long num = numItems[i]; + byte result = qfb.suggestLgNumSlots(num); + assertEquals(result, results[i]); + } + } + + @Test + public static void testSuggestMaxNumItems(){ + QuotientFilterBuilder qfb = new QuotientFilterBuilder(); + + // invalid number of slots + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)-127)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)0)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)32)); + + + byte[] lgNumSlots = {1, 2, 3, 6, 10, 15, 25, 30,}; + long[] results = {1, 3, 7, 57, 921, 29491, 30198988, 966367641} ; + + for (int i = 0; i < lgNumSlots.length; i++) { + long result = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i]); + assertEquals(result, results[i]); + } + } + + @Test + public static void testSuggestParamsFromMaxDistinctsFPP(){ + + // invalid number of slots + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(5000000000L, 0.0001)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(100000000, 0.)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(100000000, 1.5)); + assertThrows(SketchesArgumentException.class, () -> QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(5000000000L, -1.)); + + + QuotientFilterBuilder qfb = new QuotientFilterBuilder(); + byte lgNumSlots ; + byte 
fingerprintLength ; + long[] numItems = {1L, 900L, 500_000_000L} ; + double[] fpp = {1E-10, 1E-2, 1e-7} ; + + // expected outcomes + byte[] expected_lgNumSlots = {1, 10, 30} ; + byte[] expected_fingerprintLength = {34, 7, 24} ; + + for (int i = 0; i < numItems.length; i++) { + QuotientFilterBuilder.QFPair pair = qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], fpp[i]); + lgNumSlots = pair.lgNumSlots; + fingerprintLength = pair.fingerprintLength; + assertEquals(expected_lgNumSlots[i], lgNumSlots); + assertEquals(expected_fingerprintLength[i], fingerprintLength); + } + } + + + +} diff --git a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java new file mode 100644 index 000000000..00b085fef --- /dev/null +++ b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterTest.java @@ -0,0 +1,277 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

package org.apache.datasketches.filters.quotientfilter;
import org.testng.annotations.Test;
import static org.testng.Assert.assertTrue;
import static org.testng.Assert.assertEquals;

import java.util.BitSet;
import java.util.HashSet;
import java.util.Random;


/**
 * Tests for QuotientFilter insertion, lookup, overflow handling and iteration, together with
 * the BitSet-based helpers used to describe an expected filter layout.
 */
public class QuotientFilterTest {

  /**
   * This test is based on the example from https://en.wikipedia.org/wiki/Quotient_filter
   * in the "Algorithm Description" section.
   * It performs the same insertions as the example and verifies that it gets the same layout.
   * The insertion keys are: b, e, f, c, d, a which are hashed into slots as:
   * (b,1), (e,4), (f,7), (c,1), (d,2), (a,1)
   */
  @Test
  public static void WikiInsertionTest() {
    final int bits_per_entry = 8; // 8 bits per entry => 3 metadata bits + 5 fingerprint bits.
    final int num_entries_power = 3;
    final int num_entries = 1 << num_entries_power;
    final QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry);

    // This test does not need different fingerprints as it is testing the slot locations and
    // metadata bits; a single all-ones fingerprint marks the entry that gets pushed.
    final long fingerprint0 = 0;
    final long fingerprint1 = (1L << bits_per_entry) - 1;

    /*
     The expected sketch is
     0   000 00000
     1   100 00000
     2   111 00000
     3   011 00000
     4   101 00000
     5   001 11111
     6   000 00000
     7   100 00000
    */
    qf.insert(fingerprint0, 1, false);
    qf.insert(fingerprint1, 4, false); // 11111 is inserted at slot 4 but pushed to slot 5
    qf.insert(fingerprint0, 7, false);
    qf.insert(fingerprint0, 1, false);
    qf.insert(fingerprint0, 2, false);
    qf.insert(fingerprint0, 1, false);
    assertEquals(qf.num_existing_entries, 6);

    // Expected is_occupied, is_continuation and is_shifted bits, plus fingerprint, per slot.
    final BitSet expected = new BitSet(num_entries * bits_per_entry);
    set_slot_in_test(expected, bits_per_entry, 0, false, false, false, fingerprint0);
    set_slot_in_test(expected, bits_per_entry, 1, true, false, false, fingerprint0);
    set_slot_in_test(expected, bits_per_entry, 2, true, true, true, fingerprint0);
    set_slot_in_test(expected, bits_per_entry, 3, false, true, true, fingerprint0);
    set_slot_in_test(expected, bits_per_entry, 4, true, false, true, fingerprint0);
    set_slot_in_test(expected, bits_per_entry, 5, false, false, true, fingerprint1);
    set_slot_in_test(expected, bits_per_entry, 6, false, false, false, fingerprint0);
    set_slot_in_test(expected, bits_per_entry, 7, true, false, false, fingerprint0);
    assertTrue(check_equality(qf, expected, true));
  }

  /**
   * This test is based on Figure 2 from https://vldb.org/pvldb/vol5/p1627_michaelabender_vldb2012.pdf.
   * It performs the same insertions as in Figure 2 and checks for the same metadata bits
   * (fingerprints are not compared).
   */
  @Test
  public static void PaperInsertionTest() {
    final int bits_per_entry = 8;
    final int num_entries_power = 4;
    final int num_entries = 1 << num_entries_power;
    final QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry);

    // (key, slot): {(a, 1), (b, 1), (c, 3), (d, 3), (e, 3), (f, 4), (g, 6), (h, 6)}
    final int[] insertionSlots = {1, 1, 3, 3, 3, 4, 6, 6};
    for (final int slot : insertionSlots) {
      qf.insert(0, slot, false);
    }

    final BitSet expected = new BitSet(num_entries * bits_per_entry);
    set_slot_in_test(expected, bits_per_entry, 0, false, false, false, 0);
    set_slot_in_test(expected, bits_per_entry, 1, true, false, false, 0);
    set_slot_in_test(expected, bits_per_entry, 2, false, true, true, 0);
    set_slot_in_test(expected, bits_per_entry, 3, true, false, false, 0);
    set_slot_in_test(expected, bits_per_entry, 4, true, true, true, 0);
    set_slot_in_test(expected, bits_per_entry, 5, false, true, true, 0);
    set_slot_in_test(expected, bits_per_entry, 6, true, false, true, 0);
    set_slot_in_test(expected, bits_per_entry, 7, false, false, true, 0);
    set_slot_in_test(expected, bits_per_entry, 8, false, true, true, 0);
    assertTrue(check_equality(qf, expected, false));
  }

  /**
   * Fills a filter to 90% of its slot count with random keys and checks that every key that
   * was inserted successfully is found again (no false negatives).
   */
  @Test
  public static void FalseNegativeTest() {
    final int bits_per_entry = 10;
    final int num_entries_power = 10;
    final QuotientFilter filter = new QuotientFilter(num_entries_power, bits_per_entry);
    final int num_entries = (int) (Math.pow(2, num_entries_power) * 0.9);
    assertTrue(test_no_false_negatives(filter, num_entries));
  }

  /**
   * Adds the same fingerprint twice to the last slot of the filter, causing an overflow into
   * the extension slots, then deletes one copy and checks the other copy is still found.
   */
  @Test
  public static void OverflowTest() {
    final int bits_per_entry = 8;
    final int num_entries_power = 3;
    final int num_entries = 1 << num_entries_power;
    final int fingerprint_size = bits_per_entry - 3;
    final QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry);

    // Highest fingerprint bit only; parentheses make the operator precedence explicit.
    final long fp2 = 1L << (fingerprint_size - 1);
    final int lastSlot = num_entries - 1;
    qf.insert(fp2, lastSlot, false);
    qf.insert(fp2, lastSlot, false);
    qf.delete(fp2, lastSlot);
    assertTrue(qf.search(fp2, lastSlot));
  }

  /**
   * Inserts six entries into a QuotientFilter and walks it with an Iterator, asserting that the
   * bucket index of each visited entry matches the expected sequence. A mismatch fails the test
   * through assertEquals.
   */
  @Test
  public static void testQuotientFilterInsertionAndIteration() {
    final int bits_per_entry = 8;
    final int num_entries_power = 4;
    final QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry);

    qf.insert(0, 2, false);
    qf.insert(0, 3, false);
    qf.insert(0, 3, false);
    qf.insert(0, 4, false);
    // NOTE(review): per the original author's notes, 23 is the last key the filter holds and
    // 24 falls outside its logical bounds, so only the first five insertions are iterated.
    qf.insert(0, 23, false);
    qf.insert(0, 24, false);

    final Iterator it = new Iterator(qf);
    final int[] expectedBuckets = {2, 3, 3, 4, 23};
    int visited = 0;
    while (it.next()) {
      assertEquals(it.bucket_index, expectedBuckets[visited++]);
    }
  }

  /**
   * Inserts seven entries in non-sorted slot order and asserts that the Iterator visits them
   * in ascending bucket order.
   */
  @Test
  public static void testQuotientFilterIterator() {
    final int bits_per_entry = 8;
    final int num_entries_power = 4;
    final QuotientFilter qf = new QuotientFilter(num_entries_power, bits_per_entry);

    final int[] insertionSlots = {1, 4, 7, 1, 2, 1, 15};
    for (final int slot : insertionSlots) {
      qf.insert(0, slot, false);
    }

    final Iterator it = new Iterator(qf);
    final int[] expectedBuckets = {1, 1, 1, 2, 4, 7, 15};
    int visited = 0;
    while (it.next()) {
      assertEquals(it.bucket_index, expectedBuckets[visited++]);
    }
  }


  // Helper functions

  /**
   * Writes one slot into a BitSet that mirrors the filter layout: three metadata bits followed
   * by (bits_per_entry - 3) fingerprint bits, starting at bit offset bits_per_entry * slot.
   *
   * @param result The BitSet where the slot values will be set.
   * @param bits_per_entry The number of bits per entry in the BitSet.
   * @param slot The index of the slot to be set.
   * @param is_occupied Whether the slot is occupied.
   * @param is_continuation Whether the slot is a continuation of a previous entry.
   * @param is_shifted Whether the slot is shifted.
   * @param fingerprint The fingerprint to be set in the slot; bit i is written at position i
   *     after the metadata bits.
   * @return The same BitSet instance that was passed in, after setting the slot values.
   */
  public static BitSet set_slot_in_test(BitSet result, int bits_per_entry, int slot, boolean is_occupied, boolean is_continuation, boolean is_shifted, long fingerprint) {
    final int base = bits_per_entry * slot;
    result.set(base, is_occupied);
    result.set(base + 1, is_continuation);
    result.set(base + 2, is_shifted);
    final int fingerprintBits = bits_per_entry - 3;
    for (int i = 0; i < fingerprintBits; i++) {
      result.set(base + 3 + i, Bitmap.get_fingerprint_bit(i, fingerprint));
    }
    return result;
  }

  /**
   * Same as the long-fingerprint overload, with the fingerprint given as a string of '0'/'1'
   * characters. The character at index i controls bit i, so the leftmost character is the
   * least-significant bit.
   *
   * @param result The BitSet where the slot values will be set.
   * @param bits_per_entry The number of bits per entry in the BitSet.
   * @param slot The index of the slot to be set.
   * @param is_occupied Whether the slot is occupied.
   * @param is_continuation Whether the slot is a continuation of a previous entry.
   * @param is_shifted Whether the slot is shifted.
   * @param fingerprint The fingerprint as a bit string.
   * @return The same BitSet instance that was passed in, after setting the slot values.
   */
  public static BitSet set_slot_in_test(BitSet result, int bits_per_entry, int slot, boolean is_occupied, boolean is_continuation, boolean is_shifted, String fingerprint) {
    long l_fingerprint = 0;
    for (int i = 0; i < fingerprint.length(); i++) {
      if (fingerprint.charAt(i) == '1') {
        // 1L, not 1: an int shift only uses the low five bits of the distance and would wrap
        // for positions 32 and above.
        l_fingerprint |= (1L << i);
      }
    }
    return set_slot_in_test(result, bits_per_entry, slot, is_occupied, is_continuation, is_shifted, l_fingerprint);
  }

  /**
   * Compares the filter's raw bits with an expected BitSet, over the first bs.size() offsets.
   *
   * @param qf The filter under test.
   * @param bs The expected bits.
   * @param check_also_fingerprints When false, only the three metadata bits at the start of
   *     each entry are compared; when true, every bit is compared.
   * @return true if all compared bits agree, false at the first mismatch.
   */
  public static boolean check_equality(QuotientFilter qf, BitSet bs, boolean check_also_fingerprints) {
    for (int i = 0; i < bs.size(); i++) {
      // Offsets 0, 1 and 2 within an entry are the metadata bits.
      if (!check_also_fingerprints && i % qf.bitPerEntry >= 3) {
        continue;
      }
      if (qf.get_bit_at_offset(i) != bs.get(i)) {
        return false;
      }
    }
    return true;
  }

  /**
   * Helper function to test that no false negatives are returned.
   * Inserts num_entries pseudo-random ints (fixed seed) and then searches for every key whose
   * insertion succeeded.
   *
   * @param filter The filter to populate and query.
   * @param num_entries The number of random keys to attempt to insert.
   * @return true if every successfully inserted key is found, false otherwise.
   */
  public static boolean test_no_false_negatives(QuotientFilter filter, int num_entries) {
    // Typed set: iterating a raw HashSet as Integer does not compile.
    final HashSet<Integer> added = new HashSet<>();
    final int seed = 5;
    final Random rand = new Random(seed);

    for (int i = 0; i < num_entries; i++) {
      final int rand_num = rand.nextInt();
      if (filter.insert(rand_num, false)) {
        added.add(rand_num);
      } else {
        System.out.println("insertion failed");
      }
    }

    for (final int key : added) {
      if (!filter.search((long) key)) {
        return false;
      }
    }
    return true;
  }

}