@@ -34,21 +34,32 @@ PlanAnalysis analyzePlan(AbstractWriteHandle userPlan, long userPartitionCount,
34
34
return new PlanAnalysis ((ObjectNode ) viewInfo .get ("modifiedPlan" ), partitions );
35
35
}
36
36
37
// Builds and returns the list of partitions for a plan: the unsigned 64-bit range [0, 2^64 - 1] is split into
// userPartitionCount contiguous ranges, and every partition is given the same bucket count.
// In this hunk the method changes from private to static (package-private), and the inline integer-division
// bucket count is replaced by a call to calculateBucketsPerPartition.
// NOTE(review): a userPartitionCount of 0 makes Long.divideUnsigned throw ArithmeticException; presumably the
// caller validates the value before calling this method - confirm.
- private List <PlanAnalysis .Partition > calculatePartitions (long rowCount , long userPartitionCount , long userBatchSize ) {
37
+ static List <PlanAnalysis .Partition > calculatePartitions (long rowCount , long userPartitionCount , long userBatchSize ) {
38
38
// A userBatchSize that is not greater than zero falls back to -1 (parsed from the string "-1").
final long batchSize = userBatchSize > 0 ? userBatchSize : Long .parseLong ("-1" );
39
// Removed version: integer division, clamped to a minimum of 1.
- long bucketCount = (rowCount / userPartitionCount ) / batchSize ;
40
- if (bucketCount < 1 ) {
41
- bucketCount = 1 ;
42
- }
39
+
40
+ long bucketsPerPartition = calculateBucketsPerPartition (rowCount , userPartitionCount , batchSize );
43
41
// -1 read as an unsigned long is 2^64 - 1, so this is the unsigned maximum divided by the partition count.
long partitionSize = Long .divideUnsigned (-1 , userPartitionCount );
44
42
long nextLowerBound = 0 ;
45
43
46
44
List <PlanAnalysis .Partition > partitions = new ArrayList <>();
47
45
// NOTE(review): i is an int compared against a long bound; it would overflow if userPartitionCount ever
// exceeded Integer.MAX_VALUE.
for (int i = 1 ; i <= userPartitionCount ; i ++) {
48
46
// The last partition's upper bound is -1 (the unsigned maximum), so it also covers whatever remainder the
// division above left over. Bounds beyond Long.MAX_VALUE show up as negative signed longs.
long upperBound = (i == userPartitionCount ) ? -1 : nextLowerBound + partitionSize ;
49
- partitions .add (new PlanAnalysis .Partition (i , nextLowerBound , upperBound , bucketCount , partitionSize ));
47
+ partitions .add (new PlanAnalysis .Partition (i , nextLowerBound , upperBound , bucketsPerPartition , partitionSize ));
50
48
// The next partition starts one past this partition's computed upper bound.
nextLowerBound = nextLowerBound + partitionSize + 1 ;
51
49
}
52
50
return partitions ;
53
51
}
52
+
53
// Returns ceil((rowCount / userPartitionCount) / batchSize), computed in floating point, with a minimum of 1.
// calculatePartitions passes batchSize = -1 when no positive batch size was given; for a non-negative rowCount
// and a positive userPartitionCount the raw value is then <= 0, so the result is clamped to 1.
// NOTE(review): a zero userPartitionCount or batchSize does not throw here. The double division yields
// Infinity or NaN; a positive Infinity casts to Long.MAX_VALUE, while NaN casts to 0 and is clamped to 1.
+ /**
54
+ * The number of buckets per partition is always the same, as the random distribution of row IDs means we don't know
55
+ * how rows will be distributed across buckets.
56
+ */
57
+ private static long calculateBucketsPerPartition (long rowCount , long userPartitionCount , long batchSize ) {
58
// The (double) cast binds to rowCount, so both divisions are floating-point rather than integer divisions.
+ double rawBucketsPerPartition = ((double ) rowCount / userPartitionCount ) / batchSize ;
59
+ // ceil is used here to ensure that given the batch size, a bucket typically will not have more rows in it
60
+ // than the batch size. That's not guaranteed, as row IDs could have a distribution such that many rows are in
61
+ // one particular bucket.
62
+ long bucketsPerPartition = (long ) Math .ceil (rawBucketsPerPartition );
63
// Never report fewer than one bucket per partition.
+ return bucketsPerPartition < 1 ? 1 : bucketsPerPartition ;
64
+ }
54
65
}
0 commit comments