
Commit b28c471

Merge pull request #268 from marklogic/feature/fixing-progress
Fixed progress logging
2 parents: 134d274 + 5a03397

File tree: 10 files changed (+62 lines, -89 lines)


src/main/java/com/marklogic/spark/DefaultSource.java

Lines changed: 14 additions & 5 deletions
@@ -86,20 +86,29 @@ public Table getTable(StructType schema, Transform[] partitioning, Map<String, S
             );
         }
 
+        final ContextSupport tempContext = new ContextSupport(properties);
+
         // The appropriate progress logger is reset here so that when the connector is used repeatedly in an
         // environment like PySpark, the counts start with zero on each new Spark job.
+        final long readProgressInterval = tempContext.getNumericOption(Options.READ_LOG_PROGRESS, 0, 0);
         if (isReadDocumentsOperation(properties)) {
-            ReadProgressLogger.progressCounter.set(0);
+            ReadProgressLogger.initialize(readProgressInterval, "Documents read: {}");
             return new DocumentTable(DocumentRowSchema.SCHEMA);
         } else if (isReadTriplesOperation(properties)) {
-            ReadProgressLogger.progressCounter.set(0);
+            ReadProgressLogger.initialize(readProgressInterval, "Triples read: {}");
             return new DocumentTable(TripleRowSchema.SCHEMA);
-        } else if (properties.get(Options.READ_OPTIC_QUERY) != null || Util.isReadWithCustomCodeOperation(properties)) {
-            ReadProgressLogger.progressCounter.set(0);
+        } else if (properties.get(Options.READ_OPTIC_QUERY) != null) {
+            ReadProgressLogger.initialize(readProgressInterval, "Rows read: {}");
+            return new MarkLogicTable(schema, properties);
+        } else if (Util.isReadWithCustomCodeOperation(properties)) {
+            // Not yet logging progress for reading with custom code, as it's assumed the user will then write with
+            // custom code.
             return new MarkLogicTable(schema, properties);
         }
 
-        WriteProgressLogger.progressCounter.set(0);
+        final long writeProgressInterval = tempContext.getNumericOption(Options.WRITE_LOG_PROGRESS, 0, 0);
+        String message = Util.isReadWithCustomCodeOperation(properties) ? "Items processed: {}" : "Documents written: {}";
+        WriteProgressLogger.initialize(writeProgressInterval, message);
         return new MarkLogicTable(new WriteContext(schema, properties));
     }

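The two initialize calls above are driven entirely by connector options, so the new progress logging is opt-in configuration. A minimal sketch of a read job that would exercise ReadProgressLogger (the option key strings and connection details are assumptions for illustration; the constants in com.marklogic.spark.Options are authoritative):

    import org.apache.spark.sql.SparkSession;

    public class ReadProgressExample {
        public static void main(String[] args) {
            SparkSession session = SparkSession.builder().master("local[*]").getOrCreate();
            session.read()
                .format("marklogic")
                // Hypothetical connection details.
                .option("spark.marklogic.client.uri", "user:password@localhost:8003")
                // Assumed key behind Options.READ_LOG_PROGRESS: with a value of 10000,
                // "Documents read: {}" is logged after every 10,000 documents.
                .option("spark.marklogic.read.logProgress", 10000)
                // Assumed key for selecting documents by collection.
                .option("spark.marklogic.read.documents.collections", "example")
                .load()
                .count(); // forces the lazy read to execute, producing progress entries
        }
    }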
src/main/java/com/marklogic/spark/ProgressLogger.java

Lines changed: 0 additions & 36 deletions
This file was deleted.

src/main/java/com/marklogic/spark/ReadProgressLogger.java

Lines changed: 20 additions & 7 deletions
@@ -3,23 +3,36 @@
  */
 package com.marklogic.spark;
 
+import java.io.Serializable;
 import java.util.concurrent.atomic.AtomicLong;
 
 /**
  * Handles the progress counter for any operation involving reading from MarkLogic. A Spark job/application can only have
  * one reader, and thus DefaultSource handles resetting this counter before a new read job starts up. A static counter
  * is used so that all reader partitions in the same JVM can have their progress aggregated and logged.
  */
-public class ReadProgressLogger extends ProgressLogger {
+public class ReadProgressLogger implements Serializable {
 
-    public static final AtomicLong progressCounter = new AtomicLong(0);
+    static final long serialVersionUID = 1L;
 
-    public ReadProgressLogger(long progressInterval, int batchSize, String message) {
-        super(progressInterval, batchSize, message);
+    private static final AtomicLong progressCounter = new AtomicLong(0);
+    private static long progressInterval;
+    private static long nextProgressInterval;
+    private static String message;
+
+    public static void initialize(long progressInterval, String message) {
+        progressCounter.set(0);
+        ReadProgressLogger.progressInterval = progressInterval;
+        nextProgressInterval = progressInterval;
+        ReadProgressLogger.message = message;
     }
 
-    @Override
-    protected long getNewSum(long itemCount) {
-        return progressCounter.addAndGet(itemCount);
+    public static void logProgressIfNecessary(long itemCount) {
+        if (progressInterval > 0 && progressCounter.addAndGet(itemCount) >= nextProgressInterval) {
+            synchronized (progressCounter) {
+                Util.MAIN_LOGGER.info(message, nextProgressInterval);
+                nextProgressInterval += progressInterval;
+            }
+        }
     }
 }

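Two details of logProgressIfNecessary are worth calling out: the hot path is a single lock-free addAndGet, and the synchronized block is only entered when a threshold is crossed, so nextProgressInterval advances one interval at a time even when many partitions report at once. A self-contained sketch of the same pattern, with hypothetical names and stdout in place of the connector's logger:

    import java.util.concurrent.atomic.AtomicLong;

    public class IntervalProgress {
        private static final AtomicLong counter = new AtomicLong(0);
        private static long interval;
        private static long nextInterval;

        public static void initialize(long progressInterval) {
            counter.set(0);
            interval = progressInterval;
            nextInterval = progressInterval;
        }

        public static void logIfNecessary(long itemCount) {
            // Lock-free add on every call; synchronize only on the rare threshold crossing.
            if (interval > 0 && counter.addAndGet(itemCount) >= nextInterval) {
                synchronized (counter) {
                    System.out.println("Items processed: " + nextInterval);
                    nextInterval += interval;
                }
            }
        }

        public static void main(String[] args) {
            initialize(100);
            for (int i = 0; i < 20; i++) {
                logIfNecessary(50); // 20 batches of 50 items, 1,000 items in total
            }
            // Prints "Items processed: 100" through "Items processed: 1000".
        }
    }

Note the trade-off: what gets logged is the threshold itself rather than the exact running total, which keeps the log lines at clean multiples of the interval at the cost of slight imprecision under heavy concurrency.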
src/main/java/com/marklogic/spark/WriteProgressLogger.java

Lines changed: 20 additions & 7 deletions
@@ -3,23 +3,36 @@
  */
 package com.marklogic.spark;
 
+import java.io.Serializable;
 import java.util.concurrent.atomic.AtomicLong;
 
 /**
  * Handles the progress counter for any operation involving writing to MarkLogic. A Spark job/application can only have
  * one writer, and thus DefaultSource handles resetting this counter before a new write job starts up. A static counter
  * is used so that all writer partitions in the same JVM can have their progress aggregated and logged.
  */
-public class WriteProgressLogger extends ProgressLogger {
+public class WriteProgressLogger implements Serializable {
 
-    public static final AtomicLong progressCounter = new AtomicLong(0);
+    static final long serialVersionUID = 1L;
 
-    public WriteProgressLogger(long progressInterval, int batchSize, String message) {
-        super(progressInterval, batchSize, message);
+    private static final AtomicLong progressCounter = new AtomicLong(0);
+    private static long progressInterval;
+    private static long nextProgressInterval;
+    private static String message;
+
+    public static void initialize(long progressInterval, String message) {
+        progressCounter.set(0);
+        WriteProgressLogger.progressInterval = progressInterval;
+        nextProgressInterval = progressInterval;
+        WriteProgressLogger.message = message;
    }
 
-    @Override
-    protected long getNewSum(long itemCount) {
-        return progressCounter.addAndGet(itemCount);
+    public static void logProgressIfNecessary(long itemCount) {
+        if (progressInterval > 0 && progressCounter.addAndGet(itemCount) >= nextProgressInterval) {
+            synchronized (progressCounter) {
+                Util.MAIN_LOGGER.info(message, nextProgressInterval);
+                nextProgressInterval += progressInterval;
+            }
+        }
     }
 }

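WriteProgressLogger mirrors ReadProgressLogger line for line; only the log messages and the class holding the static state differ. The write-side option drives it the same way. A sketch, again with assumed option keys and hypothetical connection details:

    import org.apache.spark.sql.SaveMode;
    import org.apache.spark.sql.SparkSession;

    public class WriteProgressExample {
        public static void main(String[] args) {
            SparkSession session = SparkSession.builder().master("local[*]").getOrCreate();
            session.read().json("data/*.json")
                .write()
                .format("marklogic")
                // Hypothetical connection details.
                .option("spark.marklogic.client.uri", "user:password@localhost:8003")
                // Assumed key behind Options.WRITE_LOG_PROGRESS: log "Documents written: {}"
                // after every 5,000 documents.
                .option("spark.marklogic.write.logProgress", 5000)
                .mode(SaveMode.Append)
                .save();
        }
    }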
src/main/java/com/marklogic/spark/reader/document/ForestReader.java

Lines changed: 1 addition & 8 deletions
@@ -12,7 +12,6 @@
 import com.marklogic.client.query.SearchQueryDefinition;
 import com.marklogic.client.query.StructuredQueryBuilder;
 import com.marklogic.spark.Options;
-import com.marklogic.spark.ProgressLogger;
 import com.marklogic.spark.ReadProgressLogger;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.connector.read.PartitionReader;
@@ -40,7 +39,6 @@ class ForestReader implements PartitionReader<InternalRow> {
     private final Integer limit;
 
     // Only used for logging.
-    private final ProgressLogger progressLogger;
     private final ForestPartition forestPartition;
     private long startTime;
 
@@ -74,11 +72,6 @@ class ForestReader implements PartitionReader<InternalRow> {
         this.requestedMetadata = context.getRequestedMetadata();
         this.documentManager.setMetadataCategories(this.requestedMetadata);
         this.queryBuilder = client.newQueryManager().newStructuredQueryBuilder();
-
-        this.progressLogger = new ReadProgressLogger(
-            context.getNumericOption(Options.READ_LOG_PROGRESS, 0, 0),
-            context.getBatchSize(), "Read documents: {}"
-        );
     }
 
     @Override
@@ -153,7 +146,7 @@ private DocumentPage readPage(List<String> uris) {
         if (logger.isTraceEnabled()) {
            logger.trace("Retrieved page of documents in {}ms from partition {}", (System.currentTimeMillis() - start), this.forestPartition);
         }
-        this.progressLogger.logProgressIfNecessary(page.getPageSize());
+        ReadProgressLogger.logProgressIfNecessary(page.getPageSize());
         return page;
     }

src/main/java/com/marklogic/spark/reader/document/OpticTriplesReader.java

Lines changed: 1 addition & 7 deletions
@@ -7,7 +7,6 @@
 import com.marklogic.client.row.RowRecord;
 import com.marklogic.client.type.PlanColumn;
 import com.marklogic.spark.Options;
-import com.marklogic.spark.ProgressLogger;
 import com.marklogic.spark.ReadProgressLogger;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
@@ -39,7 +38,6 @@ class OpticTriplesReader implements PartitionReader<InternalRow> {
     // Only for logging
     private final long batchSize;
     private long progressCounter;
-    private final ProgressLogger progressLogger;
 
     private Iterator<RowRecord> currentRowIterator;
 
@@ -60,10 +58,6 @@ public OpticTriplesReader(ForestPartition forestPartition, DocumentContext conte
         this.uriBatcher = new UriBatcher(this.databaseClient, query, forestPartition, context.getBatchSize(), filtered);
 
         this.batchSize = context.getBatchSize();
-        this.progressLogger = new ReadProgressLogger(
-            context.getNumericOption(Options.READ_LOG_PROGRESS, 0, 0),
-            (int) this.batchSize, "Read triples: {}"
-        );
     }
 
     @Override
@@ -86,7 +80,7 @@ public InternalRow get() {
         Object[] row = convertNextTripleIntoRow();
         progressCounter++;
         if (progressCounter >= batchSize) {
-            progressLogger.logProgressIfNecessary(progressCounter);
+            ReadProgressLogger.logProgressIfNecessary(this.progressCounter);
             progressCounter = 0;
         }
         return new GenericInternalRow(row);

src/main/java/com/marklogic/spark/reader/optic/OpticPartitionReader.java

Lines changed: 1 addition & 9 deletions
@@ -20,8 +20,6 @@
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.node.ObjectNode;
 import com.marklogic.client.row.RowManager;
-import com.marklogic.spark.Options;
-import com.marklogic.spark.ProgressLogger;
 import com.marklogic.spark.ReadProgressLogger;
 import com.marklogic.spark.reader.JsonRowDeserializer;
 import org.apache.spark.sql.catalyst.InternalRow;
@@ -51,7 +49,6 @@ class OpticPartitionReader implements PartitionReader<InternalRow> {
     private long totalDuration;
     private long progressCounter;
     private final long batchSize;
-    private final ProgressLogger progressLogger;
 
     // Used solely for testing purposes; is never expected to be used in production. Intended to provide a way for
     // a test to get the count of rows returned from MarkLogic, which is important for ensuring that pushdown operations
@@ -67,11 +64,6 @@ class OpticPartitionReader implements PartitionReader<InternalRow> {
         // be in the rows.
         this.rowManager.setDatatypeStyle(RowManager.RowSetPart.HEADER);
         this.jsonRowDeserializer = new JsonRowDeserializer(opticReadContext.getSchema());
-
-        this.progressLogger = new ReadProgressLogger(
-            opticReadContext.getNumericOption(Options.READ_LOG_PROGRESS, 0, 0),
-            (int) opticReadContext.getBatchSize(), "Read rows: {}"
-        );
     }
 
     @Override
@@ -115,7 +107,7 @@ public InternalRow get() {
         this.totalRowCount++;
         this.progressCounter++;
         if (this.progressCounter >= this.batchSize) {
-            progressLogger.logProgressIfNecessary(this.progressCounter);
+            ReadProgressLogger.logProgressIfNecessary(this.progressCounter);
            this.progressCounter = 0;
         }
         JsonNode row = rowIterator.next();

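Both Optic readers above share a second pattern: each partition accumulates a cheap local progressCounter and only calls the shared static logger once per batchSize items, so contention on the AtomicLong scales with batches rather than with individual rows or triples. A condensed, compilable sketch of that flush pattern (names hypothetical):

    import java.util.concurrent.atomic.AtomicLong;

    class PartitionProgress {
        private static final AtomicLong sharedCounter = new AtomicLong(0);

        private final long batchSize;
        private long localCount;

        PartitionProgress(long batchSize) {
            this.batchSize = batchSize;
        }

        void onItemRead() {
            localCount++;
            if (localCount >= batchSize) {
                // One atomic add per batch instead of one per item.
                sharedCounter.addAndGet(localCount);
                localCount = 0;
            }
        }
    }

ForestReader doesn't need the local counter because it already reports once per retrieved page via page.getPageSize().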
src/main/java/com/marklogic/spark/writer/WriteContext.java

Lines changed: 1 addition & 4 deletions
@@ -42,7 +42,6 @@ public class WriteContext extends ContextSupport {
     private final StructType schema;
     private final boolean usingFileSchema;
     private final int batchSize;
-    private final ProgressLogger progressLogger;
 
     private int fileSchemaContentPosition;
     private int fileSchemaPathPosition;
@@ -54,8 +53,6 @@ public WriteContext(StructType schema, Map<String, String> properties) {
         super(properties);
         this.schema = schema;
         this.batchSize = (int) getNumericOption(Options.WRITE_BATCH_SIZE, 100, 1);
-        this.progressLogger = new WriteProgressLogger(getNumericOption(Options.WRITE_LOG_PROGRESS, 0, 0),
-            batchSize, "Documents written: {}");
 
         // We support the Spark binaryFile schema - https://spark.apache.org/docs/latest/sql-data-sources-binaryFile.html -
         // so that reader can be reused for loading files as-is.
@@ -270,7 +267,7 @@ private void logBatchOnSuccess(WriteBatch batch) {
                docCount--;
             }
         }
-        progressLogger.logProgressIfNecessary(docCount);
+        WriteProgressLogger.logProgressIfNecessary(docCount);
         if (logger.isTraceEnabled()) {
             logger.trace("Wrote batch; length: {}; job batch number: {}", docCount, batch.getJobBatchNumber());
         }

src/main/java/com/marklogic/spark/writer/customcode/CustomCodeWriter.java

Lines changed: 1 addition & 4 deletions
@@ -30,7 +30,6 @@ class CustomCodeWriter implements DataWriter<InternalRow> {
     private final CustomCodeContext customCodeContext;
     private final JsonRowSerializer jsonRowSerializer;
     private final int batchSize;
-    private final ProgressLogger progressLogger;
 
     private final List<String> currentBatch = new ArrayList<>();
     private final String externalVariableDelimiter;
@@ -46,8 +45,6 @@ class CustomCodeWriter implements DataWriter<InternalRow> {
         this.jsonRowSerializer = new JsonRowSerializer(customCodeContext.getSchema(), customCodeContext.getProperties());
 
         this.batchSize = (int) customCodeContext.getNumericOption(Options.WRITE_BATCH_SIZE, 1, 1);
-        this.progressLogger = new WriteProgressLogger(customCodeContext.getNumericOption(Options.WRITE_LOG_PROGRESS, 0, 0),
-            this.batchSize, "Items processed: {}");
 
         this.externalVariableDelimiter = customCodeContext.optionExists(Options.WRITE_EXTERNAL_VARIABLE_DELIMITER) ?
             customCodeContext.getProperties().get(Options.WRITE_EXTERNAL_VARIABLE_DELIMITER) : ",";
@@ -151,7 +148,7 @@ private void executeCall(ServerEvaluationCall call, int itemCount) {
         try {
             call.evalAs(String.class);
             this.successItemCount += itemCount;
-            this.progressLogger.logProgressIfNecessary(itemCount);
+            WriteProgressLogger.logProgressIfNecessary(itemCount);
         } catch (RuntimeException ex) {
             if (customCodeContext.isAbortOnFailure()) {
                 throw ex;

src/test/java/com/marklogic/spark/writer/WriteRowsTest.java

Lines changed: 3 additions & 2 deletions
@@ -40,11 +40,12 @@ void defaultBatchSizeAndThreadCount() {
 
     @Test
     void logProgressTest() {
-        newWriter(2)
+        newWriter(4)
             // Including these options here to ensure they don't cause any issues, though we're not yet able to
             // assert on the info-level log entries that they add.
             .option(Options.WRITE_BATCH_SIZE, 8)
-            .option(Options.WRITE_LOG_PROGRESS, 50)
+            .option(Options.WRITE_THREAD_COUNT, 8)
+            .option(Options.WRITE_LOG_PROGRESS, 20)
             .save();
 
         verifyTwoHundredDocsWereWritten();

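For reference on the updated values: the test writes 200 documents in total (per verifyTwoHundredDocsWereWritten), presumably across 4 partitions given newWriter(4), so a progress interval of 20 should yield info entries from "Documents written: 20" up through "Documents written: 200". As the inline comment notes, the test can only verify that the options cause no failures; it cannot yet assert on those log entries.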