Merge pull request #369 from marklogic/feature/batch-size-tweak

rjrudin · web-flow · commit 5bdb7851e113 · 2024-11-26T11:34:46.000-05:00
Tweaked calculation of buckets per partition to round up instead of truncating
diff --git a/marklogic-spark-connector/src/main/java/com/marklogic/spark/reader/optic/PlanAnalyzer.java b/marklogic-spark-connector/src/main/java/com/marklogic/spark/reader/optic/PlanAnalyzer.java
@@ -34,21 +34,32 @@ PlanAnalysis analyzePlan(AbstractWriteHandle userPlan, long userPartitionCount,
         return new PlanAnalysis((ObjectNode) viewInfo.get("modifiedPlan"), partitions);
     }
 
-    private List<PlanAnalysis.Partition> calculatePartitions(long rowCount, long userPartitionCount, long userBatchSize) {
+    static List<PlanAnalysis.Partition> calculatePartitions(long rowCount, long userPartitionCount, long userBatchSize) {
         final long batchSize = userBatchSize > 0 ? userBatchSize : Long.parseLong("-1");
-        long bucketCount = (rowCount / userPartitionCount) / batchSize;
-        if (bucketCount < 1) {
-            bucketCount = 1;
-        }
+
+        long bucketsPerPartition = calculateBucketsPerPartition(rowCount, userPartitionCount, batchSize);
         long partitionSize = Long.divideUnsigned(-1, userPartitionCount);
         long nextLowerBound = 0;
 
         List<PlanAnalysis.Partition> partitions = new ArrayList<>();
         for (int i = 1; i <= userPartitionCount; i++) {
             long upperBound = (i == userPartitionCount) ? -1 : nextLowerBound + partitionSize;
-            partitions.add(new PlanAnalysis.Partition(i, nextLowerBound, upperBound, bucketCount, partitionSize));
+            partitions.add(new PlanAnalysis.Partition(i, nextLowerBound, upperBound, bucketsPerPartition, partitionSize));
             nextLowerBound = nextLowerBound + partitionSize + 1;
         }
         return partitions;
     }
+
+    /**
+     * Every partition is given the same number of buckets, because row IDs are randomly distributed and so we cannot
+     * know in advance how rows will be spread across the buckets.
+     */
+    private static long calculateBucketsPerPartition(long rowCount, long userPartitionCount, long batchSize) {
+        double rawBucketsPerPartition = ((double) rowCount / userPartitionCount) / batchSize;
+        // Rounding up with ceil means that, given the batch size, a bucket will typically not have more rows in it
+        // than the batch size. That's not guaranteed, as row IDs could have a distribution such that many rows are in
+        // one particular bucket.
+        long bucketsPerPartition = (long) Math.ceil(rawBucketsPerPartition);
+        return bucketsPerPartition < 1 ? 1 : bucketsPerPartition;
+    }
 }
diff --git a/marklogic-spark-connector/src/test/java/com/marklogic/spark/reader/optic/CalculatePartitionsTest.java b/marklogic-spark-connector/src/test/java/com/marklogic/spark/reader/optic/CalculatePartitionsTest.java
@@ -0,0 +1,39 @@
+/*
+ * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
+ */
+package com.marklogic.spark.reader.optic;
+
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.CsvSource;
+
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+class CalculatePartitionsTest {
+
+    @ParameterizedTest
+    @CsvSource({
+        "1,0,1,1",
+        "2,0,2,2",
+        "1,5000,1,2",
+        "1,5001,1,2",
+        "1,6666,1,2",
+        "1,6667,1,2",
+        "1,9999,1,2",
+        "1,10000,1,1",
+        "1,10001,1,1",
+        "3,3000,3,6"
+    })
+    void test(long userPartitionCount, long batchSize, int expectedPartitionCount, int expectedBucketCount) {
+        long rowCount = 10000;
+        List<PlanAnalysis.Partition> partitions = PlanAnalyzer.calculatePartitions(rowCount, userPartitionCount, batchSize);
+        int bucketCount = 0;
+        for (PlanAnalysis.Partition partition : partitions) {
+            bucketCount += partition.getBuckets().size();
+        }
+
+        assertEquals(expectedPartitionCount, partitions.size(), "Unexpected number of partitions");
+        assertEquals(expectedBucketCount, bucketCount, "Unexpected number of buckets");
+    }
+}