From 271ad4029f13f079c7243a20592403557df33c3b Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Fri, 10 May 2024 10:36:46 -0700 Subject: [PATCH 1/5] Added new method to compute absolute maximum number of storage bytes required for a CompactSketch given the configured number of nominal entries (power of 2). --- .../java/org/apache/datasketches/theta/Sketch.java | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index 888116512..960878fcb 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -304,6 +304,18 @@ public static int getMaxCompactSketchBytes(final int numberOfEntries) { return (numberOfEntries << 3) + 24; } + /** + * Returns the maximum number of storage bytes required for a CompactSketch given the configured + * number of nominal entries (power of 2). + * @param nomEntries Nominal Entries + * @return the maximum number of storage bytes required for a CompactSketch with the given + * nomEntries. + */ + public static int getCompactSketchMaxBytes(final int nomEntries) { + final int nomEnt = ceilingPowerOf2(nomEntries); + return ((nomEnt << 4) * 15) / 16 + (Family.QUICKSELECT.getMaxPreLongs() << 3); + } + /** * Returns the maximum number of storage bytes required for an UpdateSketch with the given * number of nominal entries (power of 2). 
From 980c132d3aa0caf36cdda0c6e750ca466b2b2d8b Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Fri, 10 May 2024 12:07:01 -0700 Subject: [PATCH 2/5] Add test for new method --- .../org/apache/datasketches/theta/Sketch.java | 2 ++ .../apache/datasketches/theta/Sketches.java | 27 +++++++++++++++---- .../datasketches/theta/SketchesTest.java | 6 +++++ 3 files changed, 30 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index 960878fcb..b949169c0 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -297,7 +297,9 @@ public double getLowerBound(final int numStdDev) { * @param numberOfEntries the actual number of entries stored with the CompactSketch. * @return the maximum number of storage bytes required for a CompactSketch with the given number * of entries. + * @deprecated as a public method. Use {@link #getCompactSketchMaxBytes(int)} instead */ + @Deprecated public static int getMaxCompactSketchBytes(final int numberOfEntries) { if (numberOfEntries == 0) { return 8; } if (numberOfEntries == 1) { return 16; } diff --git a/src/main/java/org/apache/datasketches/theta/Sketches.java b/src/main/java/org/apache/datasketches/theta/Sketches.java index a5862e4a4..4b1461876 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketches.java +++ b/src/main/java/org/apache/datasketches/theta/Sketches.java @@ -79,15 +79,32 @@ public static int getMaxAnotBResultBytes(final int maxNomEntries) { } /** - * Ref: {@link Sketch#getMaxCompactSketchBytes(int)} - * @param numberOfEntries Ref: {@link Sketch#getMaxCompactSketchBytes(int)}, - * {@code numberOfEntries} - * @return Ref: {@link Sketch#getMaxCompactSketchBytes(int)} - */ + * Returns the maximum number of storage bytes required for a CompactSketch with the given + * number of actual entries. 
Note that this assumes the worst case of the sketch in + * estimation mode, which requires storing theta and count. + * @param numberOfEntries the actual number of entries stored with the CompactSketch. + * @return the maximum number of storage bytes required for a CompactSketch with the given number + * of entries. + * @see Sketch#getMaxCompactSketchBytes(int) + * @deprecated as a public method. Use {@link #getCompactSketchMaxBytes(int)} instead + */ + @Deprecated public static int getMaxCompactSketchBytes(final int numberOfEntries) { return Sketch.getMaxCompactSketchBytes(numberOfEntries); } + /** + * Returns the maximum number of storage bytes required for a CompactSketch given the configured + * number of nominal entries (power of 2). + * @param nomEntries Nominal Entries + * @return the maximum number of storage bytes required for a CompactSketch with the given + * nomEntries. + * @see Sketch#getCompactSketchMaxBytes(int) + */ + public static int getCompactSketchMaxBytes(final int nomEntries) { + return Sketch.getCompactSketchMaxBytes(nomEntries); + } + /** * Ref: {@link SetOperation#getMaxIntersectionBytes(int)} * @param nomEntries Ref: {@link SetOperation#getMaxIntersectionBytes(int)}, {@code nomEntries} diff --git a/src/test/java/org/apache/datasketches/theta/SketchesTest.java b/src/test/java/org/apache/datasketches/theta/SketchesTest.java index 6b887448e..cd51b50ed 100644 --- a/src/test/java/org/apache/datasketches/theta/SketchesTest.java +++ b/src/test/java/org/apache/datasketches/theta/SketchesTest.java @@ -20,6 +20,7 @@ package org.apache.datasketches.theta; import static org.apache.datasketches.theta.BackwardConversions.convertSerVer3toSerVer1; +import static org.apache.datasketches.theta.Sketches.getCompactSketchMaxBytes; import static org.apache.datasketches.theta.Sketches.getMaxCompactSketchBytes; import static org.apache.datasketches.theta.Sketches.getMaxIntersectionBytes; import static org.apache.datasketches.theta.Sketches.getMaxUnionBytes; @@ 
-35,6 +36,7 @@ import static org.testng.Assert.assertFalse; import static org.testng.Assert.assertTrue; +import org.apache.datasketches.common.Family; import org.apache.datasketches.common.SketchesArgumentException; import org.apache.datasketches.memory.Memory; import org.apache.datasketches.memory.WritableMemory; @@ -141,6 +143,10 @@ public void checkUtilMethods() { final int maxCompSkBytes = getMaxCompactSketchBytes(k+1); assertEquals(24+(k+1)*8, maxCompSkBytes); + final int compSkMaxBytes = getCompactSketchMaxBytes(k); { + assertEquals(compSkMaxBytes, ((k << 4) * 15) / 16 + (Family.QUICKSELECT.getMaxPreLongs() << 3)); + } + final int maxSkBytes = getMaxUpdateSketchBytes(k); assertEquals(24+2*k*8, maxSkBytes); } From 5a3144e3df677aa0a445770ca46f9cf57635c735 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 14 May 2024 17:12:24 -0700 Subject: [PATCH 3/5] Update SketchesCheckstyle Improvements to getCompactSketchMaxBytes(...) Renaming of two special functions one in HeapQuickSelectSketch and the other in DirectQuickSelectSketchR. Also changed their access privilege. 
--- .../theta/DirectQuickSelectSketch.java | 10 +++---- .../theta/DirectQuickSelectSketchR.java | 6 ++-- .../theta/HeapQuickSelectSketch.java | 10 +++---- .../org/apache/datasketches/theta/Sketch.java | 10 +++---- tools/SketchesCheckstyle.xml | 30 ++++++++++++------- 5 files changed, 38 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java index 8397c130b..a1ac53c6d 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketch.java @@ -169,7 +169,7 @@ private DirectQuickSelectSketch( //clear hash table area dstMem.clear(preambleLongs << 3, 8 << lgArrLongs); - hashTableThreshold_ = setHashTableThreshold(lgNomLongs, lgArrLongs); + hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); memReqSvr_ = memReqSvr; } @@ -210,7 +210,7 @@ static DirectQuickSelectSketch writableWrap(final WritableMemory srcMem, final l final DirectQuickSelectSketch dqss = new DirectQuickSelectSketch(seed, srcMem); - dqss.hashTableThreshold_ = setHashTableThreshold(lgNomLongs, lgArrLongs); + dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); return dqss; } @@ -228,7 +228,7 @@ static DirectQuickSelectSketch fastWritableWrap(final WritableMemory srcMem, fin final DirectQuickSelectSketch dqss = new DirectQuickSelectSketch(seed, srcMem); - dqss.hashTableThreshold_ = setHashTableThreshold(lgNomLongs, lgArrLongs); + dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); return dqss; } @@ -310,7 +310,7 @@ UpdateReturnState hashUpdate(final long hash) { if (actLgRF > 0) { //Expand in current Memory //lgArrLongs will change; thetaLong, curCount will not resize(wmem_, preambleLongs, lgArrLongs, tgtLgArrLongs); - hashTableThreshold_ = setHashTableThreshold(lgNomLongs, tgtLgArrLongs); + 
hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, tgtLgArrLongs); return InsertedCountIncrementedResized; } //end of Expand in current memory, exit. @@ -330,7 +330,7 @@ UpdateReturnState hashUpdate(final long hash) { memReqSvr_.requestClose(wmem_, newDstMem); wmem_ = newDstMem; - hashTableThreshold_ = setHashTableThreshold(lgNomLongs, tgtLgArrLongs); + hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, tgtLgArrLongs); return InsertedCountIncrementedResized; } //end of Request more memory to resize } //end of resize diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java index c593f52e3..566d1b2e2 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java @@ -86,7 +86,7 @@ static DirectQuickSelectSketchR readOnlyWrap(final Memory srcMem, final long see final DirectQuickSelectSketchR dqssr = new DirectQuickSelectSketchR(seed, (WritableMemory) srcMem); - dqssr.hashTableThreshold_ = setHashTableThreshold(lgNomLongs, lgArrLongs); + dqssr.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); return dqssr; } @@ -104,7 +104,7 @@ static DirectQuickSelectSketchR fastReadOnlyWrap(final Memory srcMem, final long final DirectQuickSelectSketchR dqss = new DirectQuickSelectSketchR(seed, (WritableMemory) srcMem); - dqss.hashTableThreshold_ = setHashTableThreshold(lgNomLongs, lgArrLongs); + dqss.hashTableThreshold_ = getOffHeapHashTableThreshold(lgNomLongs, lgArrLongs); return dqss; } @@ -276,7 +276,7 @@ UpdateReturnState hashUpdate(final long hash) { * @return the hash table threshold */ @SuppressFBWarnings(value = "DB_DUPLICATE_BRANCHES", justification = "False Positive, see the code comments") - static final int setHashTableThreshold(final int lgNomLongs, final int lgArrLongs) { + protected static final int 
getOffHeapHashTableThreshold(final int lgNomLongs, final int lgArrLongs) { //SpotBugs may complain (DB_DUPLICATE_BRANCHES) if DQS_RESIZE_THRESHOLD == REBUILD_THRESHOLD, //but this allows us to tune these constants for different sketches. final double fraction = (lgArrLongs <= lgNomLongs) ? DQS_RESIZE_THRESHOLD : ThetaUtil.REBUILD_THRESHOLD; diff --git a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java index 37b615456..627013f4f 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java @@ -92,7 +92,7 @@ private HeapQuickSelectSketch(final int lgNomLongs, final long seed, final float } lgArrLongs_ = ThetaUtil.startingSubMultiple(lgNomLongs + 1, rf.lg(), ThetaUtil.MIN_LG_ARR_LONGS); - hashTableThreshold_ = setHashTableThreshold(lgNomLongs, lgArrLongs_); + hashTableThreshold_ = getHashTableThreshold(lgNomLongs, lgArrLongs_); curCount_ = 0; thetaLong_ = (long)(p * LONG_MAX_VALUE_AS_DOUBLE); empty_ = true; //other flags: bigEndian = readOnly = compact = ordered = false; @@ -128,7 +128,7 @@ static HeapQuickSelectSketch heapifyInstance(final Memory srcMem, final long see final HeapQuickSelectSketch hqss = new HeapQuickSelectSketch(lgNomLongs, seed, p, memRF, preambleLongs, family); hqss.lgArrLongs_ = lgArrLongs; - hqss.hashTableThreshold_ = setHashTableThreshold(lgNomLongs, lgArrLongs); + hqss.hashTableThreshold_ = getHashTableThreshold(lgNomLongs, lgArrLongs); hqss.curCount_ = extractCurCount(srcMem); hqss.thetaLong_ = extractThetaLong(srcMem); hqss.empty_ = PreambleUtil.isEmptyFlag(srcMem); @@ -197,7 +197,7 @@ public void reset() { cache_ = new long[1 << lgArrLongsSM]; lgArrLongs_ = lgArrLongsSM; } - hashTableThreshold_ = setHashTableThreshold(lgNomLongs_, lgArrLongs_); + hashTableThreshold_ = getHashTableThreshold(lgNomLongs_, lgArrLongs_); empty_ = true; curCount_ = 0; 
thetaLong_ = (long)(getP() * LONG_MAX_VALUE_AS_DOUBLE); @@ -293,7 +293,7 @@ private final void resizeCache() { curCount_ = newCount; cache_ = tgtArr; - hashTableThreshold_ = setHashTableThreshold(lgNomLongs_, lgArrLongs_); + hashTableThreshold_ = getHashTableThreshold(lgNomLongs_, lgArrLongs_); } //array stays the same size. Changes theta and thus count @@ -318,7 +318,7 @@ private final void quickSelectAndRebuild() { * @param lgArrLongs See lgArrLongs. * @return the hash table threshold */ - static final int setHashTableThreshold(final int lgNomLongs, final int lgArrLongs) { + private static final int getHashTableThreshold(final int lgNomLongs, final int lgArrLongs) { final double fraction = (lgArrLongs <= lgNomLongs) ? ThetaUtil.RESIZE_THRESHOLD : ThetaUtil.REBUILD_THRESHOLD; return (int) Math.floor(fraction * (1 << lgArrLongs)); } diff --git a/src/main/java/org/apache/datasketches/theta/Sketch.java b/src/main/java/org/apache/datasketches/theta/Sketch.java index b949169c0..cc1fd4d23 100644 --- a/src/main/java/org/apache/datasketches/theta/Sketch.java +++ b/src/main/java/org/apache/datasketches/theta/Sketch.java @@ -308,14 +308,14 @@ public static int getMaxCompactSketchBytes(final int numberOfEntries) { /** * Returns the maximum number of storage bytes required for a CompactSketch given the configured - * number of nominal entries (power of 2). - * @param nomEntries Nominal Entries + * log_base2 of the number of nominal entries, which is a power of 2. + * @param lgNomEntries log_base2 of the number of nominal entries * @return the maximum number of storage bytes required for a CompactSketch with the given * nomEntries. 
*/ - public static int getCompactSketchMaxBytes(final int nomEntries) { - final int nomEnt = ceilingPowerOf2(nomEntries); - return ((nomEnt << 4) * 15) / 16 + (Family.QUICKSELECT.getMaxPreLongs() << 3); + public static int getCompactSketchMaxBytes(final int lgNomEntries) { + return (int)((2 << lgNomEntries) * ThetaUtil.REBUILD_THRESHOLD + + Family.QUICKSELECT.getMaxPreLongs()) * Long.BYTES; } /** diff --git a/tools/SketchesCheckstyle.xml b/tools/SketchesCheckstyle.xml index 8a45ea574..873a878a0 100644 --- a/tools/SketchesCheckstyle.xml +++ b/tools/SketchesCheckstyle.xml @@ -36,18 +36,17 @@ under the License. + + - + - - - - - - - - + + + + @@ -77,7 +76,18 @@ under the License. - + + + + + + + From ca32da73300df647446123da195305463fe29cb8 Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Tue, 14 May 2024 17:21:28 -0700 Subject: [PATCH 4/5] Oops, forgot to update the test. --- .../java/org/apache/datasketches/theta/SketchesTest.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/test/java/org/apache/datasketches/theta/SketchesTest.java b/src/test/java/org/apache/datasketches/theta/SketchesTest.java index cd51b50ed..84942fe21 100644 --- a/src/test/java/org/apache/datasketches/theta/SketchesTest.java +++ b/src/test/java/org/apache/datasketches/theta/SketchesTest.java @@ -132,7 +132,8 @@ public void checkSetOpMethods() { @Test public void checkUtilMethods() { - final int k = 1024; + final int lgK = 10; + final int k = 1 << lgK; final int maxUnionBytes = getMaxUnionBytes(k); assertEquals(2*k*8+32, maxUnionBytes); @@ -143,8 +144,8 @@ public void checkUtilMethods() { final int maxCompSkBytes = getMaxCompactSketchBytes(k+1); assertEquals(24+(k+1)*8, maxCompSkBytes); - final int compSkMaxBytes = getCompactSketchMaxBytes(k); { - assertEquals(compSkMaxBytes, ((k << 4) * 15) / 16 + (Family.QUICKSELECT.getMaxPreLongs() << 3)); + final int compSkMaxBytes = getCompactSketchMaxBytes(lgK); { + assertEquals(compSkMaxBytes, (((2 << lgK) * 15) / 16 + 
Family.QUICKSELECT.getMaxPreLongs()) << 3); } final int maxSkBytes = getMaxUpdateSketchBytes(k); From f979b373b9cdbaae994390f5f43caf976ec180de Mon Sep 17 00:00:00 2001 From: Lee Rhodes Date: Wed, 15 May 2024 15:27:43 -0700 Subject: [PATCH 5/5] modified HeapQuickSelectSketch::getHashTableThreshold(..) And DirectQuickSelectSketchR::getOffHeapHashTableThreshold(..) to eliminate the unnecessary Math.floor(..) function. --- .../org/apache/datasketches/theta/DirectQuickSelectSketchR.java | 2 +- .../org/apache/datasketches/theta/HeapQuickSelectSketch.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java index 566d1b2e2..a3ffebc14 100644 --- a/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java +++ b/src/main/java/org/apache/datasketches/theta/DirectQuickSelectSketchR.java @@ -280,7 +280,7 @@ protected static final int getOffHeapHashTableThreshold(final int lgNomLongs, fi //SpotBugs may complain (DB_DUPLICATE_BRANCHES) if DQS_RESIZE_THRESHOLD == REBUILD_THRESHOLD, //but this allows us to tune these constants for different sketches. final double fraction = (lgArrLongs <= lgNomLongs) ? 
DQS_RESIZE_THRESHOLD : ThetaUtil.REBUILD_THRESHOLD; - return (int) Math.floor(fraction * (1 << lgArrLongs)); + return (int) (fraction * (1 << lgArrLongs)); } } diff --git a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java index 627013f4f..b9d4dc9e1 100644 --- a/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java +++ b/src/main/java/org/apache/datasketches/theta/HeapQuickSelectSketch.java @@ -320,7 +320,7 @@ private final void quickSelectAndRebuild() { */ private static final int getHashTableThreshold(final int lgNomLongs, final int lgArrLongs) { final double fraction = (lgArrLongs <= lgNomLongs) ? ThetaUtil.RESIZE_THRESHOLD : ThetaUtil.REBUILD_THRESHOLD; - return (int) Math.floor(fraction * (1 << lgArrLongs)); + return (int) (fraction * (1 << lgArrLongs)); } }