threadCount now refers to total thread count

rjrudin · rjrudin · commit 2ec25bb9d78d · 2024-07-22T17:13:52.000-04:00
Will communicate this change in the release notes, as it's changing how a public option works, but doing so in a way that better matches user expectations. `threadCountPerPartition` is then offered for what is expected to be a rare case where a user wants to configure the number of threads per partition.
diff --git a/docs/configuration.md b/docs/configuration.md
@@ -193,7 +193,8 @@ The following options control how the connector writes rows as documents to Mark
 | spark.marklogic.write.fileRows.documentType | Forces a document type when MarkLogic does not recognize a URI extension; must be one of `JSON`, `XML`, or `TEXT`. |
 | spark.marklogic.write.jsonRootName | As of 2.3.0, specifies a root field name when writing JSON documents based on arbitrary rows. |
 | spark.marklogic.write.temporalCollection | Name of a temporal collection to assign each document to. |
-| spark.marklogic.write.threadCount | The number of threads used within each partition to send documents to MarkLogic; defaults to 4. |
+| spark.marklogic.write.threadCount | The number of threads used across all partitions to send documents to MarkLogic; defaults to 4. |
+| spark.marklogic.write.threadCountPerPartition | New in 2.3.0; the number of threads used per partition to send documents to MarkLogic. |
 | spark.marklogic.write.transform | Name of a REST transform to apply to each document. |
 | spark.marklogic.write.transformParams | Comma-delimited string of transform parameter names and values - e.g. param1,value1,param2,value2 . |
 | spark.marklogic.write.transformParamsDelimiter | Delimiter to use instead of a comma for the `transformParams` option. |
diff --git a/docs/writing.md b/docs/writing.md
@@ -244,8 +244,15 @@ The connector uses MarkLogic's
 following options can be set to adjust how the connector performs when writing data:
 
 - `spark.marklogic.write.batchSize` = the number of documents written in one call to MarkLogic; defaults to 100.
-- `spark.marklogic.write.threadCount` = the number of threads used by each partition to write documents to MarkLogic;
+- `spark.marklogic.write.threadCount` = the number of threads used across all partitions to write documents to MarkLogic;
   defaults to 4.
+- `spark.marklogic.write.threadCountPerPartition` = the number of threads to use per partition to write documents to
+MarkLogic. If set, it is used instead of `spark.marklogic.write.threadCount`.
+
+Prior to the 2.3.0 release, `spark.marklogic.write.threadCount` configured a number of threads per partition. Based on 
+feedback, this was changed to the number of total threads used across all partitions to match what users typically 
+expect "thread count" to mean in the context of writing to MarkLogic. `spark.marklogic.write.threadCountPerPartition`
+was then added for users who do wish to configure a number of threads per Spark partition.
 
 These options are in addition to the number of partitions within the Spark DataFrame that is being written to
 MarkLogic. For each partition in the DataFrame, a separate instance of a MarkLogic batch writer is created, each
@@ -264,7 +271,7 @@ the connector can directly connect to each host in the cluster.
 
 The rule of thumb above can thus be expressed as:
 
-    Number of partitions * Value of spark.marklogic.write.threadCount <= Number of hosts * number of app server threads
+    Value of spark.marklogic.write.threadCount <= Number of hosts * number of app server threads
 
 ### Using a load balancer
 
diff --git a/src/main/java/com/marklogic/spark/Options.java b/src/main/java/com/marklogic/spark/Options.java
@@ -87,7 +87,7 @@ public abstract class Options {
 
     public static final String WRITE_BATCH_SIZE = "spark.marklogic.write.batchSize";
     public static final String WRITE_THREAD_COUNT = "spark.marklogic.write.threadCount";
-    public static final String WRITE_TOTAL_THREAD_COUNT = "spark.marklogic.write.totalThreadCount";
+    public static final String WRITE_THREAD_COUNT_PER_PARTITION = "spark.marklogic.write.threadCountPerPartition";
     public static final String WRITE_ABORT_ON_FAILURE = "spark.marklogic.write.abortOnFailure";
 
     // For logging progress when writing documents or processing with custom code.
diff --git a/src/main/java/com/marklogic/spark/writer/MarkLogicWrite.java b/src/main/java/com/marklogic/spark/writer/MarkLogicWrite.java
@@ -127,15 +127,13 @@ private DataWriterFactory determineWriterFactory() {
     }
 
     private void logPartitionAndThreadCounts(int numPartitions) {
-        int totalThreadCount = writeContext.getTotalThreadCount();
-        if (totalThreadCount > 0) {
-            int threadCountPerPartition = writeContext.getThreadCountPerPartition();
+        int userDefinedPartitionThreadCount = writeContext.getUserDefinedThreadCountPerPartition();
+        if (userDefinedPartitionThreadCount > 0) {
             Util.MAIN_LOGGER.info("Number of partitions: {}; total thread count: {}; thread count per partition: {}",
-                numPartitions, totalThreadCount, threadCountPerPartition);
+                numPartitions, numPartitions * userDefinedPartitionThreadCount, userDefinedPartitionThreadCount);
         } else {
-            int threadCount = writeContext.getThreadCount();
-            Util.MAIN_LOGGER.info("Number of partitions: {}; thread count per partition: {}; total threads used for writing: {}",
-                numPartitions, threadCount, numPartitions * threadCount);
+            Util.MAIN_LOGGER.info("Number of partitions: {}; total threads used for writing: {}",
+                numPartitions, writeContext.getTotalThreadCount());
         }
     }
 
diff --git a/src/main/java/com/marklogic/spark/writer/WriteContext.java b/src/main/java/com/marklogic/spark/writer/WriteContext.java
@@ -77,25 +77,41 @@ public StructType getSchema() {
         return schema;
     }
 
-    int getThreadCount() {
+    /**
+     * @return the total number of threads to use across all partitions. This is typically the number a user thinks
+     * about, as they are not likely to know how many partitions will be created. But they will typically know how many
+     * hosts are in their MarkLogic cluster and how many threads are available to an app server on each host.
+     */
+    int getTotalThreadCount() {
         return (int) getNumericOption(Options.WRITE_THREAD_COUNT, 4, 1);
     }
 
-    int getTotalThreadCount() {
-        return (int) getNumericOption(Options.WRITE_TOTAL_THREAD_COUNT, 0, 1);
+    /**
+     * @return the thread count to use per partition where a user has specified the total thread count across all
+     * partitions.
+     */
+    int getCalculatedThreadCountPerPartition() {
+        int threadCount = getTotalThreadCount();
+        if (this.numPartitions > 0) {
+            return (int) Math.ceil((double) threadCount / (double) numPartitions);
+        }
+        return threadCount;
     }
 
-    int getThreadCountPerPartition() {
-        int totalThreadCount = getTotalThreadCount();
-        if (totalThreadCount > 0 && this.numPartitions > 0) {
-            return (int) Math.ceil((double) totalThreadCount / (double) numPartitions);
-        }
-        return 0;
+    /**
+     * @return the thread count to use per partition where a user has used an option to explicitly define how many
+     * threads should be used by a partition.
+     */
+    int getUserDefinedThreadCountPerPartition() {
+        return (int) getNumericOption(Options.WRITE_THREAD_COUNT_PER_PARTITION, 0, 1);
     }
 
     WriteBatcher newWriteBatcher(DataMovementManager dataMovementManager) {
-        final int threadCount = getTotalThreadCount() > 0 ?
-            getThreadCountPerPartition() : getThreadCount();
+        // If the user told us how many threads they want per partition (we expect this to be rare), then use that.
+        // Otherwise, use the number of threads per partition calculated from the total thread count, which is either
+        // the value the user configured or the default value for that option.
+        final int threadCount = getUserDefinedThreadCountPerPartition() > 0 ?
+            getUserDefinedThreadCountPerPartition() : getCalculatedThreadCountPerPartition();
 
         Util.MAIN_LOGGER.info("Creating new batcher with thread count of {} and batch size of {}.", threadCount, batchSize);
         WriteBatcher writeBatcher = dataMovementManager
diff --git a/src/main/resources/marklogic-spark-messages.properties b/src/main/resources/marklogic-spark-messages.properties
@@ -12,7 +12,7 @@ spark.marklogic.write.graph=
 spark.marklogic.write.graphOverride=
 spark.marklogic.write.jsonRootName=
 spark.marklogic.write.threadCount=
-spark.marklogic.write.totalThreadCount=
+spark.marklogic.write.threadCountPerPartition=
 spark.marklogic.write.transformParams=
 spark.marklogic.write.uriTemplate=
 spark.marklogic.write.xmlRootName=
diff --git a/src/test/java/com/marklogic/spark/writer/WriteRowsTest.java b/src/test/java/com/marklogic/spark/writer/WriteRowsTest.java
@@ -68,7 +68,7 @@ void batchSizeGreaterThanNumberOfRowsToWrite() {
     @Test
     void twoPartitions() {
         newWriter(2)
-            .option(Options.WRITE_TOTAL_THREAD_COUNT, 16)
+            .option(Options.WRITE_THREAD_COUNT_PER_PARTITION, 8)
             .option(Options.WRITE_BATCH_SIZE, 10)
             .save();
 
@@ -80,7 +80,7 @@ void twoPartitions() {
     @Test
     void insufficientPrivilegeForOtherDatabase() {
         DataFrameWriter writer = newWriter(2)
-            .option(Options.WRITE_TOTAL_THREAD_COUNT, 16)
+            .option(Options.WRITE_THREAD_COUNT_PER_PARTITION, 8)
             .option(Options.WRITE_BATCH_SIZE, 10)
             .option(Options.CLIENT_URI, "spark-test-user:spark@localhost:8016/Documents");
 

Original file line number	Diff line number	Diff line change
`@@ -127,15 +127,13 @@ private DataWriterFactory determineWriterFactory() {`
`127`	`127`	`}`
`128`	`128`
`129`	`129`	`private void logPartitionAndThreadCounts(int numPartitions) {`
`130`		`- int totalThreadCount = writeContext.getTotalThreadCount();`
`131`		`- if (totalThreadCount > 0) {`
`132`		`- int threadCountPerPartition = writeContext.getThreadCountPerPartition();`
	`130`	`+ int userDefinedPartitionThreadCount = writeContext.getUserDefinedThreadCountPerPartition();`
	`131`	`+ if (userDefinedPartitionThreadCount > 0) {`
`133`	`132`	`Util.MAIN_LOGGER.info("Number of partitions: {}; total thread count: {}; thread count per partition: {}",`
`134`		`- numPartitions, totalThreadCount, threadCountPerPartition);`
	`133`	`+ numPartitions, numPartitions * userDefinedPartitionThreadCount, userDefinedPartitionThreadCount);`
`135`	`134`	`} else {`
`136`		`- int threadCount = writeContext.getThreadCount();`
`137`		`- Util.MAIN_LOGGER.info("Number of partitions: {}; thread count per partition: {}; total threads used for writing: {}",`
`138`		`- numPartitions, threadCount, numPartitions * threadCount);`
	`135`	`+ Util.MAIN_LOGGER.info("Number of partitions: {}; total threads used for writing: {}",`
	`136`	`+ numPartitions, writeContext.getTotalThreadCount());`
`139`	`137`	`}`
`140`	`138`	`}`
`141`	`139`