Skip to content

Commit d22ebab

Browse files
committed
Fixed progress logging bug
Realized that in a Spark environment, the static counter used for counting written docs wasn't getting reset. Did some refactoring to avoid duplicated code, and also to set up for potentially logging progress on reads. Also defaulting the progress interval to 0 instead of 10000, as the old default doesn't seem as useful in a Spark environment; Flux will default it to 10000 instead.
1 parent ea631ad commit d22ebab

File tree

8 files changed

+94
-45
lines changed

8 files changed

+94
-45
lines changed

CONTRIBUTING.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,20 @@ df2.head()
156156
json.loads(df2.head()['content'])
157157
```
158158

159+
For a quick test of writing documents, use the following:
160+
161+
```
162+
163+
spark.read.option("header", True).csv("src/test/resources/data.csv")\
164+
.repartition(2)\
165+
.write.format("marklogic")\
166+
.option("spark.marklogic.client.uri", "spark-test-user:spark@localhost:8000")\
167+
.option("spark.marklogic.write.permissions", "spark-user-role,read,spark-user-role,update")\
168+
.option("spark.marklogic.write.logProgress", 50)\
169+
.option("spark.marklogic.write.batchSize", 10)\
170+
.mode("append")\
171+
.save()
172+
```
159173

160174
# Testing against a local Spark cluster
161175

src/main/java/com/marklogic/spark/DefaultSource.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ public Table getTable(StructType schema, Transform[] partitioning, Map<String, S
9393
return new MarkLogicTable(schema, properties);
9494
}
9595

96+
WriteProgressLogger.progressCounter.set(0);
9697
return new MarkLogicTable(new WriteContext(schema, properties));
9798
}
9899

src/main/java/com/marklogic/spark/Options.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,9 @@ public abstract class Options {
9090
public static final String WRITE_THREAD_COUNT_PER_PARTITION = "spark.marklogic.write.threadCountPerPartition";
9191
public static final String WRITE_ABORT_ON_FAILURE = "spark.marklogic.write.abortOnFailure";
9292

93-
// For logging progress when writing documents or processing with custom code.
93+
// For logging progress when writing documents or processing with custom code. Defines the interval at which
94+
// progress should be logged - e.g. a value of 10,000 will result in a message being logged on every 10,000 items
95+
// being written/processed.
9496
public static final String WRITE_LOG_PROGRESS = "spark.marklogic.write.logProgress";
9597

9698
// For writing via custom code.
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
* Copyright © 2024 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
3+
*/
4+
package com.marklogic.spark;
5+
6+
import java.io.Serializable;
7+
8+
public abstract class ProgressLogger implements Serializable {
9+
10+
static final long serialVersionUID = 1;
11+
12+
private final long progressInterval;
13+
private final int batchSize;
14+
private final String message;
15+
16+
protected ProgressLogger(long progressInterval, int batchSize, String message) {
17+
this.progressInterval = progressInterval;
18+
this.batchSize = batchSize;
19+
this.message = message;
20+
}
21+
22+
protected abstract long getNewSum(long itemCount);
23+
24+
public void logProgressIfNecessary(long itemCount) {
25+
if (this.progressInterval > 0) {
26+
long sum = getNewSum(itemCount);
27+
if (sum >= progressInterval) {
28+
long lowerBound = sum / (this.progressInterval);
29+
long upperBound = (lowerBound * this.progressInterval) + this.batchSize;
30+
if (sum >= lowerBound && sum < upperBound) {
31+
Util.MAIN_LOGGER.info(message, sum);
32+
}
33+
}
34+
}
35+
}
36+
}
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
/*
 * Copyright © 2024 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
 */
package com.marklogic.spark;

import java.util.concurrent.atomic.AtomicLong;

/**
 * Logs progress for document writes. The counter is static so the running total spans all
 * partition writers in the same JVM; callers (see DefaultSource) reset it at the start of a write
 * job so totals do not accumulate across jobs in a long-lived Spark environment.
 */
public class WriteProgressLogger extends ProgressLogger {

    // Shared across all instances in the JVM; reset externally before each write job.
    public static final AtomicLong progressCounter = new AtomicLong(0);

    public WriteProgressLogger(long progressInterval, int batchSize, String message) {
        super(progressInterval, batchSize, message);
    }

    @Override
    protected long getNewSum(long itemCount) {
        return progressCounter.addAndGet(itemCount);
    }
}

src/main/java/com/marklogic/spark/writer/WriteContext.java

Lines changed: 7 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,7 @@
2424
import com.marklogic.client.document.ServerTransform;
2525
import com.marklogic.client.impl.GenericDocumentImpl;
2626
import com.marklogic.client.io.Format;
27-
import com.marklogic.spark.ConnectorException;
28-
import com.marklogic.spark.ContextSupport;
29-
import com.marklogic.spark.Options;
30-
import com.marklogic.spark.Util;
27+
import com.marklogic.spark.*;
3128
import com.marklogic.spark.reader.document.DocumentRowSchema;
3229
import com.marklogic.spark.reader.file.TripleRowSchema;
3330
import org.apache.spark.sql.types.StructType;
@@ -36,18 +33,16 @@
3633
import java.util.List;
3734
import java.util.Map;
3835
import java.util.Optional;
39-
import java.util.concurrent.atomic.AtomicInteger;
4036
import java.util.stream.Stream;
4137

4238
public class WriteContext extends ContextSupport {
4339

4440
static final long serialVersionUID = 1;
45-
private static final AtomicInteger progressTracker = new AtomicInteger();
4641

4742
private final StructType schema;
4843
private final boolean usingFileSchema;
4944
private final int batchSize;
50-
private final int logProgress;
45+
private final ProgressLogger progressLogger;
5146

5247
private int fileSchemaContentPosition;
5348
private int fileSchemaPathPosition;
@@ -59,7 +54,8 @@ public WriteContext(StructType schema, Map<String, String> properties) {
5954
super(properties);
6055
this.schema = schema;
6156
this.batchSize = (int) getNumericOption(Options.WRITE_BATCH_SIZE, 100, 1);
62-
this.logProgress = (int) getNumericOption(Options.WRITE_LOG_PROGRESS, 10000, 0);
57+
this.progressLogger = new WriteProgressLogger(getNumericOption(Options.WRITE_LOG_PROGRESS, 0, 0),
58+
batchSize, "Documents written: {}");
6359

6460
// We support the Spark binaryFile schema - https://spark.apache.org/docs/latest/sql-data-sources-binaryFile.html -
6561
// so that reader can be reused for loading files as-is.
@@ -274,22 +270,9 @@ private void logBatchOnSuccess(WriteBatch batch) {
274270
docCount--;
275271
}
276272
}
277-
if (this.logProgress > 0) {
278-
logProgressIfNecessary(docCount);
279-
}
280-
if (logger.isDebugEnabled()) {
281-
logger.debug("Wrote batch; length: {}; job batch number: {}", docCount, batch.getJobBatchNumber());
282-
}
283-
}
284-
285-
private void logProgressIfNecessary(int docCount) {
286-
int sum = progressTracker.addAndGet(docCount);
287-
if (sum >= logProgress) {
288-
int lowerBound = sum / (this.logProgress);
289-
int upperBound = (lowerBound * this.logProgress) + this.batchSize;
290-
if (sum >= lowerBound && sum < upperBound) {
291-
Util.MAIN_LOGGER.info("Documents written: {}", sum);
292-
}
273+
progressLogger.logProgressIfNecessary(docCount);
274+
if (logger.isTraceEnabled()) {
275+
logger.trace("Wrote batch; length: {}; job batch number: {}", docCount, batch.getJobBatchNumber());
293276
}
294277
}
295278

src/main/java/com/marklogic/spark/writer/customcode/CustomCodeWriter.java

Lines changed: 6 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,7 @@
99
import com.marklogic.client.io.JacksonHandle;
1010
import com.marklogic.client.io.StringHandle;
1111
import com.marklogic.client.io.marker.AbstractWriteHandle;
12-
import com.marklogic.spark.ConnectorException;
13-
import com.marklogic.spark.JsonRowSerializer;
14-
import com.marklogic.spark.Options;
15-
import com.marklogic.spark.Util;
12+
import com.marklogic.spark.*;
1613
import com.marklogic.spark.reader.customcode.CustomCodeContext;
1714
import com.marklogic.spark.writer.CommitMessage;
1815
import org.apache.spark.sql.catalyst.InternalRow;
@@ -23,19 +20,18 @@
2320

2421
import java.util.ArrayList;
2522
import java.util.List;
26-
import java.util.concurrent.atomic.AtomicInteger;
2723
import java.util.stream.Collectors;
2824

2925
class CustomCodeWriter implements DataWriter<InternalRow> {
3026

3127
private static final Logger logger = LoggerFactory.getLogger(CustomCodeWriter.class);
32-
private static final AtomicInteger progressTracker = new AtomicInteger();
3328

3429
private final DatabaseClient databaseClient;
3530
private final CustomCodeContext customCodeContext;
3631
private final JsonRowSerializer jsonRowSerializer;
3732
private final int batchSize;
38-
private final int logProgress;
33+
private final ProgressLogger progressLogger;
34+
3935
private final List<String> currentBatch = new ArrayList<>();
4036
private final String externalVariableDelimiter;
4137
private ObjectMapper objectMapper;
@@ -50,7 +46,8 @@ class CustomCodeWriter implements DataWriter<InternalRow> {
5046
this.jsonRowSerializer = new JsonRowSerializer(customCodeContext.getSchema(), customCodeContext.getProperties());
5147

5248
this.batchSize = (int) customCodeContext.getNumericOption(Options.WRITE_BATCH_SIZE, 1, 1);
53-
this.logProgress = (int) customCodeContext.getNumericOption(Options.WRITE_LOG_PROGRESS, 10000, 0);
49+
this.progressLogger = new WriteProgressLogger(customCodeContext.getNumericOption(Options.WRITE_LOG_PROGRESS, 0, 0),
50+
this.batchSize, "Items processed: {}");
5451

5552
this.externalVariableDelimiter = customCodeContext.optionExists(Options.WRITE_EXTERNAL_VARIABLE_DELIMITER) ?
5653
customCodeContext.getProperties().get(Options.WRITE_EXTERNAL_VARIABLE_DELIMITER) : ",";
@@ -154,9 +151,7 @@ private void executeCall(ServerEvaluationCall call, int itemCount) {
154151
try {
155152
call.evalAs(String.class);
156153
this.successItemCount += itemCount;
157-
if (this.logProgress > 0) {
158-
logProgressIfNecessary(itemCount);
159-
}
154+
this.progressLogger.logProgressIfNecessary(itemCount);
160155
} catch (RuntimeException ex) {
161156
if (customCodeContext.isAbortOnFailure()) {
162157
throw ex;
@@ -165,13 +160,4 @@ private void executeCall(ServerEvaluationCall call, int itemCount) {
165160
Util.MAIN_LOGGER.error(String.format("Unable to process row; cause: %s", ex.getMessage()));
166161
}
167162
}
168-
169-
private void logProgressIfNecessary(int itemCount) {
170-
int sum = progressTracker.addAndGet(itemCount);
171-
int lowerBound = sum / (this.logProgress);
172-
int upperBound = (lowerBound * this.logProgress) + this.batchSize;
173-
if (sum >= lowerBound && sum < upperBound) {
174-
Util.MAIN_LOGGER.info("Items processed: {}", sum);
175-
}
176-
}
177163
}

src/test/java/com/marklogic/spark/writer/WriteRowsTest.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,12 @@ void logProgressTest() {
4848
.save();
4949

5050
verifyTwoHundredDocsWereWritten();
51+
52+
// For manual inspection, run it again to ensure that the progress counter was reset.
53+
newWriter(2)
54+
.option(Options.WRITE_BATCH_SIZE, 10)
55+
.option(Options.WRITE_LOG_PROGRESS, 40)
56+
.save();
5157
}
5258

5359
@Test

0 commit comments

Comments
 (0)