Commit 050ec7a

Now logging progress for read operations
Had to make two separate loggers so that we can have a static counter for reading and a static counter for writing. Both get reset when DefaultSource creates a table for a new job. Did some quick manual testing of this in Flux, and it "just works". Note that progress for reading/writing files is not being addressed yet; this commit focuses only on progress for reading from MarkLogic and writing to MarkLogic.
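
For reference, a minimal sketch of how a read would enable the new option. The spark.marklogic.read.logProgress key is from this commit; the format name, connection options, and collection below are illustrative assumptions, not part of the diff:

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadProgressExample {
    public static void main(String[] args) {
        SparkSession session = SparkSession.builder().master("local[*]").getOrCreate();
        Dataset<Row> rows = session.read()
            .format("marklogic") // assumed short name; the full class name com.marklogic.spark.DefaultSource should also work
            .option("spark.marklogic.client.host", "localhost") // hypothetical connection details
            .option("spark.marklogic.client.port", 8000)
            .option("spark.marklogic.read.documents.collections", "example") // assumed option string
            .option("spark.marklogic.read.logProgress", 10_000) // from this commit: log a progress message every 10,000 items
            .load();
        System.out.println(rows.count());
    }
}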
1 parent f410d14 commit 050ec7a

File tree

10 files changed (+110 lines, -8 lines)


src/main/java/com/marklogic/spark/DefaultSource.java

Lines changed: 7 additions & 5 deletions
@@ -79,17 +79,23 @@ public StructType inferSchema(CaseInsensitiveStringMap options) {
     @Override
     public Table getTable(StructType schema, Transform[] partitioning, Map<String, String> properties) {
         if (isFileOperation(properties)) {
+            // Not yet supporting progress logging for file operations.
             return new MarkLogicFileTable(SparkSession.active(),
                 new CaseInsensitiveStringMap(properties),
                 JavaConverters.asScalaBuffer(getPaths(properties)), schema
             );
         }
 
+        // The appropriate progress logger is reset here so that when the connector is used repeatedly in an
+        // environment like PySpark, the counts start with zero on each new Spark job.
         if (isReadDocumentsOperation(properties)) {
+            ReadProgressLogger.progressCounter.set(0);
             return new DocumentTable(DocumentRowSchema.SCHEMA);
         } else if (isReadTriplesOperation(properties)) {
+            ReadProgressLogger.progressCounter.set(0);
             return new DocumentTable(TripleRowSchema.SCHEMA);
-        } else if (isReadOperation(properties)) {
+        } else if (properties.get(Options.READ_OPTIC_QUERY) != null || Util.isReadWithCustomCodeOperation(properties)) {
+            ReadProgressLogger.progressCounter.set(0);
             return new MarkLogicTable(schema, properties);
         }

@@ -112,10 +118,6 @@ private boolean isFileOperation(Map<String, String> properties) {
         return properties.containsKey("path") || properties.containsKey("paths");
     }
 
-    private boolean isReadOperation(Map<String, String> properties) {
-        return properties.get(Options.READ_OPTIC_QUERY) != null || Util.isReadWithCustomCodeOperation(properties);
-    }
-
     private boolean isReadDocumentsOperation(Map<String, String> properties) {
         return properties.containsKey(Options.READ_DOCUMENTS_QUERY) ||
             properties.containsKey(Options.READ_DOCUMENTS_STRING_QUERY) ||
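
The reset above works because progressCounter is a static AtomicLong shared by every reader partition in the same JVM, while getTable runs once per job on the driver, so zeroing the counter there gives each job a fresh count. A standalone sketch of the aggregation-and-reset behavior, with a plain AtomicLong standing in for ReadProgressLogger.progressCounter:

import java.util.concurrent.atomic.AtomicLong;

public class SharedCounterSketch {

    // Stands in for ReadProgressLogger.progressCounter.
    static final AtomicLong progressCounter = new AtomicLong(0);

    public static void main(String[] args) throws InterruptedException {
        // Two "partitions" each report five batches of 100 items.
        Runnable partition = () -> {
            for (int i = 0; i < 5; i++) {
                progressCounter.addAndGet(100);
            }
        };
        Thread p1 = new Thread(partition);
        Thread p2 = new Thread(partition);
        p1.start();
        p2.start();
        p1.join();
        p2.join();
        System.out.println(progressCounter.get()); // 1000, aggregated across partitions

        // What DefaultSource.getTable does before each new read job, so that repeated
        // jobs in the same JVM (e.g. a PySpark session) start counting from zero.
        progressCounter.set(0);
    }
}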

src/main/java/com/marklogic/spark/Options.java

Lines changed: 5 additions & 0 deletions
@@ -71,6 +71,11 @@ public abstract class Options {
     public static final String READ_TRIPLES_FILTERED = "spark.marklogic.read.triples.filtered";
     public static final String READ_TRIPLES_BASE_IRI = "spark.marklogic.read.triples.baseIri";
 
+    // For logging progress when reading documents, rows, or items via custom code. Defines the interval at which
+    // progress should be logged - e.g. a value of 10,000 will result in a message being logged for every 10,000 items
+    // that are read/processed.
+    public static final String READ_LOG_PROGRESS = "spark.marklogic.read.logProgress";
+
     public static final String READ_FILES_TYPE = "spark.marklogic.read.files.type";
     public static final String READ_FILES_COMPRESSION = "spark.marklogic.read.files.compression";
     public static final String READ_FILES_ENCODING = "spark.marklogic.read.files.encoding";

src/main/java/com/marklogic/spark/ReadProgressLogger.java

Lines changed: 25 additions & 0 deletions

@@ -0,0 +1,25 @@
+/*
+ * Copyright © 2024 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
+ */
+package com.marklogic.spark;
+
+import java.util.concurrent.atomic.AtomicLong;
+
+/**
+ * Handles the progress counter for any operation involving reading from MarkLogic. A Spark job/application can only have
+ * one reader, and thus DefaultSource handles resetting this counter before a new read job starts up. A static counter
+ * is used so that all reader partitions in the same JVM can have their progress aggregated and logged.
+ */
+public class ReadProgressLogger extends ProgressLogger {
+
+    public static final AtomicLong progressCounter = new AtomicLong(0);
+
+    public ReadProgressLogger(long progressInterval, int batchSize, String message) {
+        super(progressInterval, batchSize, message);
+    }
+
+    @Override
+    protected long getNewSum(long itemCount) {
+        return progressCounter.addAndGet(itemCount);
+    }
+}
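
The ProgressLogger base class is not part of this diff. Judging from the constructor signature and the logProgressIfNecessary/getNewSum calls the subclasses use, it plausibly looks like the following sketch; the field names, interval check, and logger are guesses, not the actual source:

package com.marklogic.spark;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hypothetical reconstruction of the base class referenced by Read/WriteProgressLogger.
public abstract class ProgressLogger {

    private static final Logger logger = LoggerFactory.getLogger(ProgressLogger.class);

    private final long progressInterval;
    private final String message;

    protected ProgressLogger(long progressInterval, int batchSize, String message) {
        this.progressInterval = progressInterval;
        this.message = message;
        // batchSize presumably helps align the interval check with batch boundaries; omitted here.
    }

    // Subclasses add itemCount to their shared static counter and return the new total.
    protected abstract long getNewSum(long itemCount);

    public void logProgressIfNecessary(long itemCount) {
        // A progressInterval of zero (the default) disables progress logging.
        if (progressInterval > 0 && itemCount > 0) {
            long sum = getNewSum(itemCount);
            // Log whenever this update crosses a multiple of the interval; this
            // approximation assumes itemCount never exceeds progressInterval.
            if (sum % progressInterval < itemCount) {
                logger.info(message, sum);
            }
        }
    }
}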

src/main/java/com/marklogic/spark/WriteProgressLogger.java

Lines changed: 5 additions & 1 deletion
@@ -3,9 +3,13 @@
  */
 package com.marklogic.spark;
 
-import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicLong;
 
+/**
+ * Handles the progress counter for any operation involving writing to MarkLogic. A Spark job/application can only have
+ * one writer, and thus DefaultSource handles resetting this counter before a new write job starts up. A static counter
+ * is used so that all writer partitions in the same JVM can have their progress aggregated and logged.
+ */
 public class WriteProgressLogger extends ProgressLogger {
 
     public static final AtomicLong progressCounter = new AtomicLong(0);

src/main/java/com/marklogic/spark/reader/document/ForestReader.java

Lines changed: 9 additions & 0 deletions
@@ -12,6 +12,8 @@
 import com.marklogic.client.query.SearchQueryDefinition;
 import com.marklogic.client.query.StructuredQueryBuilder;
 import com.marklogic.spark.Options;
+import com.marklogic.spark.ProgressLogger;
+import com.marklogic.spark.ReadProgressLogger;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.connector.read.PartitionReader;
 import org.slf4j.Logger;

@@ -38,6 +40,7 @@ class ForestReader implements PartitionReader<InternalRow> {
     private final Integer limit;
 
     // Only used for logging.
+    private final ProgressLogger progressLogger;
     private final ForestPartition forestPartition;
     private long startTime;

@@ -71,6 +74,11 @@ class ForestReader implements PartitionReader<InternalRow> {
         this.requestedMetadata = context.getRequestedMetadata();
         this.documentManager.setMetadataCategories(this.requestedMetadata);
         this.queryBuilder = client.newQueryManager().newStructuredQueryBuilder();
+
+        this.progressLogger = new ReadProgressLogger(
+            context.getNumericOption(Options.READ_LOG_PROGRESS, 0, 0),
+            context.getBatchSize(), "Read documents: {}"
+        );
     }

@@ -145,6 +153,7 @@ private DocumentPage readPage(List<String> uris) {
         if (logger.isTraceEnabled()) {
             logger.trace("Retrieved page of documents in {}ms from partition {}", (System.currentTimeMillis() - start), this.forestPartition);
         }
+        this.progressLogger.logProgressIfNecessary(page.getPageSize());
         return page;
     }

src/main/java/com/marklogic/spark/reader/document/OpticTriplesReader.java

Lines changed: 18 additions & 0 deletions
@@ -7,6 +7,8 @@
 import com.marklogic.client.row.RowRecord;
 import com.marklogic.client.type.PlanColumn;
 import com.marklogic.spark.Options;
+import com.marklogic.spark.ProgressLogger;
+import com.marklogic.spark.ReadProgressLogger;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
 import org.apache.spark.sql.connector.read.PartitionReader;

@@ -34,6 +36,11 @@ class OpticTriplesReader implements PartitionReader<InternalRow> {
     private final PlanBuilder op;
     private final String graphBaseIri;
 
+    // Only for logging
+    private final long batchSize;
+    private long progressCounter;
+    private final ProgressLogger progressLogger;
+
     private Iterator<RowRecord> currentRowIterator;
 
     public OpticTriplesReader(ForestPartition forestPartition, DocumentContext context) {

@@ -51,6 +58,12 @@ public OpticTriplesReader(ForestPartition forestPartition, DocumentContext context) {
             filtered = Boolean.parseBoolean(context.getProperties().get(Options.READ_TRIPLES_FILTERED));
         }
         this.uriBatcher = new UriBatcher(this.databaseClient, query, forestPartition, context.getBatchSize(), filtered);
+
+        this.batchSize = context.getBatchSize();
+        this.progressLogger = new ReadProgressLogger(
+            context.getNumericOption(Options.READ_LOG_PROGRESS, 0, 0),
+            (int) this.batchSize, "Read triples: {}"
+        );
     }

@@ -71,6 +84,11 @@ public boolean next() throws IOException {
     @Override
     public InternalRow get() {
         Object[] row = convertNextTripleIntoRow();
+        progressCounter++;
+        if (progressCounter >= batchSize) {
+            progressLogger.logProgressIfNecessary(progressCounter);
+            progressCounter = 0;
+        }
         return new GenericInternalRow(row);
     }
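
ForestReader above reports a whole page at a time, so it can call the progress logger directly; the per-row readers (this one and OpticPartitionReader below) instead buffer counts in a plain local field and only touch the shared atomic counter once per batch, avoiding an atomic operation on every row. A condensed, self-contained sketch of that buffering pattern:

import java.util.concurrent.atomic.AtomicLong;

public class BufferedProgressSketch {

    // Shared across all partitions, as in ReadProgressLogger.progressCounter.
    static final AtomicLong sharedCounter = new AtomicLong(0);

    public static void main(String[] args) {
        final long batchSize = 100;
        long progressCounter = 0; // per-partition buffer; no synchronization needed

        for (int row = 0; row < 1_000; row++) {
            progressCounter++;
            // Flush to the shared counter once per batch rather than per row.
            if (progressCounter >= batchSize) {
                sharedCounter.addAndGet(progressCounter);
                progressCounter = 0;
            }
        }
        System.out.println(sharedCounter.get()); // 1000
    }
}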

src/main/java/com/marklogic/spark/reader/optic/OpticPartitionReader.java

Lines changed: 17 additions & 0 deletions
@@ -20,6 +20,9 @@
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.node.ObjectNode;
 import com.marklogic.client.row.RowManager;
+import com.marklogic.spark.Options;
+import com.marklogic.spark.ProgressLogger;
+import com.marklogic.spark.ReadProgressLogger;
 import com.marklogic.spark.reader.JsonRowDeserializer;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.connector.read.PartitionReader;

@@ -46,6 +49,9 @@ class OpticPartitionReader implements PartitionReader<InternalRow> {
     // Used solely for logging metrics
     private long totalRowCount;
     private long totalDuration;
+    private long progressCounter;
+    private final long batchSize;
+    private final ProgressLogger progressLogger;
 
     // Used solely for testing purposes; is never expected to be used in production. Intended to provide a way for
     // a test to get the count of rows returned from MarkLogic, which is important for ensuring that pushdown operations

@@ -54,12 +60,18 @@ class OpticPartitionReader implements PartitionReader<InternalRow> {
 
     OpticPartitionReader(OpticReadContext opticReadContext, PlanAnalysis.Partition partition) {
         this.opticReadContext = opticReadContext;
+        this.batchSize = opticReadContext.getBatchSize();
         this.partition = partition;
         this.rowManager = opticReadContext.connectToMarkLogic().newRowManager();
         // Nested values won't work with the JacksonParser used by JsonRowDeserializer, so we ask for type info to not
         // be in the rows.
         this.rowManager.setDatatypeStyle(RowManager.RowSetPart.HEADER);
         this.jsonRowDeserializer = new JsonRowDeserializer(opticReadContext.getSchema());
+
+        this.progressLogger = new ReadProgressLogger(
+            opticReadContext.getNumericOption(Options.READ_LOG_PROGRESS, 0, 0),
+            (int) opticReadContext.getBatchSize(), "Read rows: {}"
+        );
     }

@@ -101,6 +113,11 @@ public boolean next() {
     public InternalRow get() {
         this.currentBucketRowCount++;
         this.totalRowCount++;
+        this.progressCounter++;
+        if (this.progressCounter >= this.batchSize) {
+            progressLogger.logProgressIfNecessary(this.progressCounter);
+            this.progressCounter = 0;
+        }
         JsonNode row = rowIterator.next();
         return this.jsonRowDeserializer.deserializeJson(row.toString());
     }

src/main/java/com/marklogic/spark/reader/optic/OpticReadContext.java

Lines changed: 6 additions & 1 deletion
@@ -65,13 +65,14 @@ public class OpticReadContext extends ContextSupport {
     private StructType schema;
     private long serverTimestamp;
    private List<OpticFilter> opticFilters;
+    private final long batchSize;
 
     public OpticReadContext(Map<String, String> properties, StructType schema, int defaultMinPartitions) {
         super(properties);
         this.schema = schema;
 
         final long partitionCount = getNumericOption(Options.READ_NUM_PARTITIONS, defaultMinPartitions, 1);
-        final long batchSize = getNumericOption(Options.READ_BATCH_SIZE, DEFAULT_BATCH_SIZE, 0);
+        this.batchSize = getNumericOption(Options.READ_BATCH_SIZE, DEFAULT_BATCH_SIZE, 0);
 
         final String dslQuery = properties.get(Options.READ_OPTIC_QUERY);
         if (dslQuery == null || dslQuery.trim().length() < 1) {

@@ -283,4 +284,8 @@ PlanAnalysis getPlanAnalysis() {
     long getBucketCount() {
         return planAnalysis != null ? planAnalysis.getAllBuckets().size() : 0;
     }
+
+    long getBatchSize() {
+        return batchSize;
+    }
 }

src/test/java/com/marklogic/spark/reader/document/ReadDocumentRowsTest.java

Lines changed: 16 additions & 1 deletion
@@ -6,6 +6,7 @@
 import com.marklogic.spark.AbstractIntegrationTest;
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.Options;
+import com.marklogic.spark.writer.AbstractWriteTest;
 import org.apache.spark.SparkException;
 import org.apache.spark.sql.DataFrameReader;
 import org.apache.spark.sql.Dataset;

@@ -18,7 +19,7 @@
 
 import static org.junit.jupiter.api.Assertions.*;
 
-class ReadDocumentRowsTest extends AbstractIntegrationTest {
+class ReadDocumentRowsTest extends AbstractWriteTest {
 
     @Test
     void readByCollection() {

@@ -37,6 +38,20 @@ void readByCollection() {
         assertEquals("Vivianne", doc.get("ForeName").asText());
     }
 
+    @Test
+    void logProgress() {
+        newWriter().save();
+
+        Dataset<Row> rows = startRead()
+            .option(Options.READ_DOCUMENTS_PARTITIONS_PER_FOREST, 1)
+            .option(Options.READ_DOCUMENTS_COLLECTIONS, "write-test")
+            .option(Options.READ_BATCH_SIZE, 10)
+            .option(Options.READ_LOG_PROGRESS, 50)
+            .load();
+
+        assertEquals(200, rows.count());
+    }
+
     @Test
     void readViaDirectConnect() {
         Dataset<Row> rows = startRead()
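
Given the assertion, the read above returns 200 documents; with a batch size of 10 and a progress interval of 50, one would expect four progress messages of the form:

Read documents: 50
Read documents: 100
Read documents: 150
Read documents: 200

Only the row count is asserted; the progress log output itself is not verified by the test, consistent with the manual verification mentioned in the commit message.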

src/test/java/com/marklogic/spark/reader/triples/ReadTriplesTest.java

Lines changed: 2 additions & 0 deletions
@@ -90,6 +90,8 @@ void collections() {
     void twoCollections() {
         long count = startRead()
             .option(Options.READ_TRIPLES_COLLECTIONS, "http://example.org/graph,other-graph")
+            .option(Options.READ_BATCH_SIZE, 5)
+            .option(Options.READ_LOG_PROGRESS, 10)
             .load().count();
 
         assertEquals(32, count, "Since both test triples files belong to 'test-config', and each also belongs to " +
