Skip to content

Commit 7227441

Browse files
committed
Added logging of progress
Works for writing documents (the main use case) and reprocessing data (also a good use case). Not doing anything for exporting yet because in most cases in Flux, we don't have control over the data source used for writing.
1 parent aa15c15 commit 7227441

File tree

6 files changed

+74
-8
lines changed

6 files changed

+74
-8
lines changed

src/main/java/com/marklogic/spark/ContextSupport.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ private void parseConnectionString(String value, Map<String, String> connectionP
121121
}
122122
}
123123

124-
protected final long getNumericOption(String optionName, long defaultValue, long minimumValue) {
124+
public final long getNumericOption(String optionName, long defaultValue, long minimumValue) {
125125
try {
126126
long value = this.getProperties().containsKey(optionName) ?
127127
Long.parseLong(this.getProperties().get(optionName)) :

src/main/java/com/marklogic/spark/Options.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,9 @@ public abstract class Options {
9090
public static final String WRITE_TOTAL_THREAD_COUNT = "spark.marklogic.write.totalThreadCount";
9191
public static final String WRITE_ABORT_ON_FAILURE = "spark.marklogic.write.abortOnFailure";
9292

93+
// For logging progress when writing documents or processing with custom code.
94+
public static final String WRITE_LOG_PROGRESS = "spark.marklogic.write.logProgress";
95+
9396
// For writing via custom code.
9497
public static final String WRITE_INVOKE = "spark.marklogic.write.invoke";
9598
public static final String WRITE_JAVASCRIPT = "spark.marklogic.write.javascript";

src/main/java/com/marklogic/spark/writer/WriteContext.java

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,18 @@
3636
import java.util.List;
3737
import java.util.Map;
3838
import java.util.Optional;
39+
import java.util.concurrent.atomic.AtomicInteger;
3940
import java.util.stream.Stream;
4041

4142
public class WriteContext extends ContextSupport {
4243

4344
static final long serialVersionUID = 1;
45+
private static final AtomicInteger progressTracker = new AtomicInteger();
4446

4547
private final StructType schema;
4648
private final boolean usingFileSchema;
49+
private final int batchSize;
50+
private final int logProgress;
4751

4852
private int fileSchemaContentPosition;
4953
private int fileSchemaPathPosition;
@@ -54,6 +58,8 @@ public class WriteContext extends ContextSupport {
5458
public WriteContext(StructType schema, Map<String, String> properties) {
5559
super(properties);
5660
this.schema = schema;
61+
this.batchSize = (int) getNumericOption(Options.WRITE_BATCH_SIZE, 100, 1);
62+
this.logProgress = (int) getNumericOption(Options.WRITE_LOG_PROGRESS, 10000, 0);
5763

5864
// We support the Spark binaryFile schema - https://spark.apache.org/docs/latest/sql-data-sources-binaryFile.html -
5965
// so that reader can be reused for loading files as-is.
@@ -90,7 +96,7 @@ int getThreadCountPerPartition() {
9096
WriteBatcher newWriteBatcher(DataMovementManager dataMovementManager) {
9197
final int threadCount = getTotalThreadCount() > 0 ?
9298
getThreadCountPerPartition() : getThreadCount();
93-
final int batchSize = (int) getNumericOption(Options.WRITE_BATCH_SIZE, 100, 1);
99+
94100
Util.MAIN_LOGGER.info("Creating new batcher with thread count of {} and batch size of {}.", threadCount, batchSize);
95101
WriteBatcher writeBatcher = dataMovementManager
96102
.newWriteBatcher()
@@ -253,9 +259,21 @@ private void logBatchOnSuccess(WriteBatch batch) {
253259
docCount--;
254260
}
255261
}
262+
if (this.logProgress > 0) {
263+
logProgressIfNecessary(docCount);
264+
}
256265
logger.debug("Wrote batch; length: {}; job batch number: {}", docCount, batch.getJobBatchNumber());
257266
}
258267

268+
private void logProgressIfNecessary(int docCount) {
269+
int sum = progressTracker.addAndGet(docCount);
270+
int lowerBound = sum / (this.logProgress);
271+
int upperBound = (lowerBound * this.logProgress) + this.batchSize;
272+
if (sum >= lowerBound && sum < upperBound) {
273+
Util.MAIN_LOGGER.info("Documents written: {}", sum);
274+
}
275+
}
276+
259277
boolean isUsingFileSchema() {
260278
return this.usingFileSchema;
261279
}

src/main/java/com/marklogic/spark/writer/customcode/CustomCodeWriter.java

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -24,16 +24,19 @@
2424
import java.io.StringWriter;
2525
import java.util.ArrayList;
2626
import java.util.List;
27+
import java.util.concurrent.atomic.AtomicInteger;
2728
import java.util.stream.Collectors;
2829

2930
class CustomCodeWriter implements DataWriter<InternalRow> {
3031

3132
private static final Logger logger = LoggerFactory.getLogger(CustomCodeWriter.class);
33+
private static final AtomicInteger progressTracker = new AtomicInteger();
3234

3335
private final DatabaseClient databaseClient;
3436
private final CustomCodeContext customCodeContext;
3537

3638
private final int batchSize;
39+
private final int logProgress;
3740
private final List<String> currentBatch = new ArrayList<>();
3841
private final String externalVariableDelimiter;
3942
private ObjectMapper objectMapper;
@@ -46,8 +49,8 @@ class CustomCodeWriter implements DataWriter<InternalRow> {
4649
this.customCodeContext = customCodeContext;
4750
this.databaseClient = customCodeContext.connectToMarkLogic();
4851

49-
this.batchSize = customCodeContext.optionExists(Options.WRITE_BATCH_SIZE) ?
50-
Integer.parseInt(customCodeContext.getProperties().get(Options.WRITE_BATCH_SIZE)) : 1;
52+
this.batchSize = (int) customCodeContext.getNumericOption(Options.WRITE_BATCH_SIZE, 1, 1);
53+
this.logProgress = (int) customCodeContext.getNumericOption(Options.WRITE_LOG_PROGRESS, 10000, 0);
5154

5255
this.externalVariableDelimiter = customCodeContext.optionExists(Options.WRITE_EXTERNAL_VARIABLE_DELIMITER) ?
5356
customCodeContext.getProperties().get(Options.WRITE_EXTERNAL_VARIABLE_DELIMITER) : ",";
@@ -64,7 +67,6 @@ public void write(InternalRow row) {
6467
row.getString(0);
6568

6669
this.currentBatch.add(rowValue);
67-
6870
if (this.currentBatch.size() >= this.batchSize) {
6971
flush();
7072
}
@@ -104,8 +106,8 @@ private void flush() {
104106
return;
105107
}
106108

107-
if (logger.isDebugEnabled()) {
108-
logger.debug("Calling custom code in MarkLogic");
109+
if (logger.isTraceEnabled()) {
110+
logger.trace("Calling custom code in MarkLogic");
109111
}
110112
final int itemCount = currentBatch.size();
111113
ServerEvaluationCall call = customCodeContext.buildCall(
@@ -164,6 +166,9 @@ private void executeCall(ServerEvaluationCall call, int itemCount) {
164166
try {
165167
call.evalAs(String.class);
166168
this.successItemCount += itemCount;
169+
if (this.logProgress > 0) {
170+
logProgressIfNecessary(itemCount);
171+
}
167172
} catch (RuntimeException ex) {
168173
if (customCodeContext.isAbortOnFailure()) {
169174
throw ex;
@@ -172,4 +177,13 @@ private void executeCall(ServerEvaluationCall call, int itemCount) {
172177
Util.MAIN_LOGGER.error(String.format("Unable to process row; cause: %s", ex.getMessage()));
173178
}
174179
}
180+
181+
private void logProgressIfNecessary(int itemCount) {
182+
int sum = progressTracker.addAndGet(itemCount);
183+
int lowerBound = sum / (this.logProgress);
184+
int upperBound = (lowerBound * this.logProgress) + this.batchSize;
185+
if (sum >= lowerBound && sum < upperBound) {
186+
Util.MAIN_LOGGER.info("Items processed: {}", sum);
187+
}
188+
}
175189
}

src/test/java/com/marklogic/spark/writer/WriteRowsTest.java

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,18 @@ void defaultBatchSizeAndThreadCount() {
3838
verifyTwoHundredDocsWereWritten();
3939
}
4040

41+
@Test
42+
void logProgressTest() {
43+
newWriter(2)
44+
// Including these options here to ensure they don't cause any issues, though we're not yet able to
45+
// assert on the info-level log entries that they add.
46+
.option(Options.WRITE_BATCH_SIZE, 8)
47+
.option(Options.WRITE_LOG_PROGRESS, 50)
48+
.save();
49+
50+
verifyTwoHundredDocsWereWritten();
51+
}
52+
4153
@Test
4254
void batchSizeGreaterThanNumberOfRowsToWrite() {
4355
newWriter()
@@ -128,7 +140,7 @@ void invalidThreadCount() {
128140
@Test
129141
void invalidBatchSize() {
130142
DataFrameWriter writer = newWriter().option(Options.WRITE_BATCH_SIZE, 0);
131-
ConnectorException ex = assertThrowsConnectorException(() -> writer.save());
143+
ConnectorException ex = assertThrows(ConnectorException.class, () -> writer.save());
132144
assertEquals("The value of 'spark.marklogic.write.batchSize' must be 1 or greater.", ex.getMessage(),
133145
"Note that batchSize is very different for writing than it is for reading. For writing, it specifies the " +
134146
"exact number of documents to send to MarkLogic in each call. For reading, it used to determine how " +

src/test/java/com/marklogic/spark/writer/customcode/ProcessWithCustomCodeTest.java

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,25 @@
1616

1717
class ProcessWithCustomCodeTest extends AbstractWriteTest {
1818

19+
@Test
20+
void logProgressTest() {
21+
newSparkSession().read().format(CONNECTOR_IDENTIFIER)
22+
.option(Options.CLIENT_URI, makeClientUri())
23+
.option(Options.READ_XQUERY, "for $i in 1 to 100 return $i")
24+
.load()
25+
.write().format(CONNECTOR_IDENTIFIER)
26+
.option(Options.CLIENT_URI, makeClientUri())
27+
// With "uneven" numbers like this, the user will still see 5 progress entries, but the counts won't be even -
28+
// they'll be 24, 40, 64, 80, and 100.
29+
.option(Options.WRITE_BATCH_SIZE, 8)
30+
.option(Options.WRITE_LOG_PROGRESS, 20)
31+
.option(Options.WRITE_JAVASCRIPT, "var URI; console.log('Nothing to do here.')")
32+
.mode(SaveMode.Append)
33+
.save();
34+
35+
assertTrue(true, "No assertion needed, this test is only for manual inspection of the progress log entries.");
36+
}
37+
1938
@Test
2039
void invokeJavaScript() {
2140
newWriterWithDefaultConfig("three-uris.csv", 2)

0 commit comments

Comments
 (0)