Commit 5dae33f
MLE-17429 Added option for an inconsistent snapshot
Also fixed a couple of Sonar warnings introduced by the Sonar upgrade. This change will require manual verification via Flux during a long-running job in which we can force an update and a merge.
1 parent 7c26c76 commit 5dae33f
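
For context, a minimal sketch of how a connector user might opt into the new dirty-read behavior. This mirrors the dirtyRead test added below; sparkSession, CONNECTOR_IDENTIFIER, and makeClientUri() are assumed to come from the project's test utilities rather than this commit.

    // Hedged sketch: read documents without a consistent snapshot.
    // sparkSession, CONNECTOR_IDENTIFIER, and makeClientUri() are assumed helpers.
    Dataset<Row> rows = sparkSession.read().format(CONNECTOR_IDENTIFIER)
        .option(Options.CLIENT_URI, makeClientUri())
        .option(Options.READ_DOCUMENTS_COLLECTIONS, "author")
        .option(Options.READ_SNAPSHOT, false)  // default is true (consistent snapshot)
        .load();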

10 files changed: +60 −20 lines

src/main/java/com/marklogic/spark/ContextSupport.java (4 additions, 0 deletions)

@@ -145,6 +145,10 @@ public final String getStringOption(String option) {
         return hasOption(option) ? properties.get(option).trim() : null;
     }
 
+    public final boolean getBooleanOption(String option, boolean defaultValue) {
+        return hasOption(option) ? Boolean.parseBoolean(getStringOption(option)) : defaultValue;
+    }
+
     public final boolean isStreamingFiles() {
         return "true".equalsIgnoreCase(getStringOption(Options.STREAM_FILES));
     }
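
Worth noting about this helper: Boolean.parseBoolean never throws, so any unrecognized value silently becomes false. A quick illustration (not part of this commit):

    Boolean.parseBoolean("true");   // true
    Boolean.parseBoolean("TRUE");   // true; the comparison is case-insensitive
    Boolean.parseBoolean("1");      // false; unrecognized values are not errors
    Boolean.parseBoolean(null);     // false; null is tolerated as well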

src/main/java/com/marklogic/spark/Options.java (8 additions, 0 deletions)

@@ -59,6 +59,14 @@ public abstract class Options {
     public static final String READ_TRIPLES_FILTERED = "spark.marklogic.read.triples.filtered";
     public static final String READ_TRIPLES_BASE_IRI = "spark.marklogic.read.triples.baseIri";
 
+    /**
+     * The connector uses a consistent snapshot by default. Setting this to false results in queries being executed
+     * at multiple points in time, potentially yielding inconsistent results.
+     *
+     * @since 2.4.2
+     */
+    public static final String READ_SNAPSHOT = "spark.marklogic.read.snapshot";
+
     // For logging progress when reading documents, rows, or items via custom code. Defines the interval at which
     // progress should be logged - e.g. a value of 10,000 will result in a message being logged on every 10,000 items
     // being written/processed.

src/main/java/com/marklogic/spark/reader/document/DocumentContext.java (5 additions, 0 deletions)

@@ -99,6 +99,11 @@ int getPartitionsPerForest() {
         return (int) getNumericOption(Options.READ_DOCUMENTS_PARTITIONS_PER_FOREST, defaultPartitionsPerForest, 1);
     }
 
+    boolean isConsistentSnapshot() {
+        // Starting in 2.2.0 and through 2.4.2, the default is a consistent snapshot. We may change this later.
+        return getBooleanOption(Options.READ_SNAPSHOT, true);
+    }
+
     void setLimit(Integer limit) {
         this.limit = limit;
     }

src/main/java/com/marklogic/spark/reader/document/ForestReader.java (6 additions, 6 deletions)

@@ -61,16 +61,16 @@ class ForestReader implements PartitionReader<InternalRow> {
             context.connectToMarkLogic(forestPartition.getHost()) :
             context.connectToMarkLogic();
 
+        final boolean filtered = context.getBooleanOption(Options.READ_DOCUMENTS_FILTERED, false);
+        final boolean consistentSnapshot = context.isConsistentSnapshot();
+
         if (logger.isDebugEnabled()) {
-            logger.debug("Will read from host {} for partition: {}", client.getHost(), forestPartition);
+            logger.debug("Will read from host {} for partition: {}; filtered: {}; consistent snapshot: {}",
+                client.getHost(), forestPartition, filtered, consistentSnapshot);
         }
 
         SearchQueryDefinition query = context.buildSearchQuery(client);
-        boolean filtered = false;
-        if (context.hasOption(Options.READ_DOCUMENTS_FILTERED)) {
-            filtered = Boolean.parseBoolean(context.getProperties().get(Options.READ_DOCUMENTS_FILTERED));
-        }
-        this.uriBatcher = new UriBatcher(client, query, forestPartition, context.getBatchSize(), filtered);
+        this.uriBatcher = new UriBatcher(client, query, forestPartition, context.getBatchSize(), filtered, consistentSnapshot);
 
         this.documentManager = client.newDocumentManager();
         this.documentManager.setReadTransform(query.getResponseTransform());

src/main/java/com/marklogic/spark/reader/document/OpticTriplesReader.java (11 additions, 4 deletions)

@@ -15,6 +15,8 @@
 import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
 import org.apache.spark.sql.connector.read.PartitionReader;
 import org.apache.spark.unsafe.types.UTF8String;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.net.URI;
@@ -27,6 +29,8 @@
  */
 class OpticTriplesReader implements PartitionReader<InternalRow> {
 
+    private static final Logger logger = LoggerFactory.getLogger(OpticTriplesReader.class);
+
     private static final String DATATYPE_COLUMN = "datatype";
     private static final String GRAPH_COLUMN = "graph";
     private static final String OBJECT_COLUMN = "object";
@@ -54,12 +58,15 @@ public OpticTriplesReader(ForestPartition forestPartition, DocumentContext conte
         this.op = this.rowManager.newPlanBuilder();
 
         final SearchQueryDefinition query = context.buildTriplesSearchQuery(this.databaseClient);
-        boolean filtered = false;
-        if (context.hasOption(Options.READ_TRIPLES_FILTERED)) {
-            filtered = Boolean.parseBoolean(context.getProperties().get(Options.READ_TRIPLES_FILTERED));
+        final boolean filtered = context.getBooleanOption(Options.READ_TRIPLES_FILTERED, false);
+        final boolean consistentSnapshot = context.isConsistentSnapshot();
+
+        if (logger.isDebugEnabled()) {
+            logger.debug("Will read from host {} for partition: {}; filtered: {}; consistent snapshot: {}",
+                databaseClient.getHost(), forestPartition, filtered, consistentSnapshot);
         }
-        this.uriBatcher = new UriBatcher(this.databaseClient, query, forestPartition, context.getBatchSize(), filtered);
 
+        this.uriBatcher = new UriBatcher(this.databaseClient, query, forestPartition, context.getBatchSize(), filtered, consistentSnapshot);
         this.batchSize = context.getBatchSize();
     }

src/main/java/com/marklogic/spark/reader/document/UriBatcher.java (7 additions, 2 deletions)

@@ -25,13 +25,15 @@ class UriBatcher {
     private final ForestPartition partition;
     private final int pageLength;
     private final boolean filtered;
+    private final boolean useConsistentSnapshot;
 
     // These change as batches of URIs are retrieved.
     private String lastUri;
     private long offsetStart = 1;
 
 
-    UriBatcher(DatabaseClient client, SearchQueryDefinition query, ForestPartition partition, int pageLength, boolean filtered) {
+    UriBatcher(DatabaseClient client, SearchQueryDefinition query, ForestPartition partition, int pageLength,
+               boolean filtered, boolean useConsistentSnapshot) {
         this.client = client;
         this.queryManager = (QueryManagerImpl) this.client.newQueryManager();
         this.queryManager.setPageLength(pageLength);
@@ -40,6 +42,7 @@ class UriBatcher {
         this.offsetStart = this.partition.getOffsetStart();
         this.pageLength = pageLength;
         this.filtered = filtered;
+        this.useConsistentSnapshot = useConsistentSnapshot;
     }
 
     /**
@@ -53,7 +56,9 @@ List<String> nextBatchOfUris() {
         }
 
         UrisHandle urisHandle = new UrisHandle();
-        urisHandle.setPointInTimeQueryTimestamp(partition.getServerTimestamp());
+        if (useConsistentSnapshot) {
+            urisHandle.setPointInTimeQueryTimestamp(partition.getServerTimestamp());
+        }
 
         // If we have an offsetEnd, the page length is adjusted to ensure we do not go past offsetEnd.
         if (partition.getOffsetEnd() != null && (this.offsetStart + this.pageLength > partition.getOffsetEnd())) {
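
The hunk above shows only the guard for that comment; the adjustment itself falls outside the diff. A hedged sketch of the arithmetic the comment describes, assuming the final page is simply truncated at offsetEnd:

    // Assumption, not the actual body from this commit: shrink the last page
    // so it ends exactly at offsetEnd. With offsetStart = 901, pageLength = 100,
    // and offsetEnd = 950, the adjusted length is 950 - 901 + 1 = 50.
    long adjustedPageLength = partition.getOffsetEnd() - this.offsetStart + 1;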

src/main/java/com/marklogic/spark/reader/file/FileContext.java (1 addition, 4 deletions)

@@ -61,10 +61,7 @@ public InputStream openFile(String filePath) {
     }
 
     public boolean isReadAbortOnFailure() {
-        if (hasOption(Options.READ_FILES_ABORT_ON_FAILURE)) {
-            return Boolean.parseBoolean(getStringOption(Options.READ_FILES_ABORT_ON_FAILURE));
-        }
-        return true;
+        return getBooleanOption(Options.READ_FILES_ABORT_ON_FAILURE, true);
     }
 
     byte[] readBytes(InputStream inputStream) throws IOException {

src/test/java/com/marklogic/spark/reader/document/ReadDocumentRowsTest.java (14 additions, 0 deletions)

@@ -40,6 +40,20 @@ void readByCollection() {
         assertEquals("Vivianne", doc.get("ForeName").asText());
     }
 
+    @Test
+    void dirtyRead() {
+        Dataset<Row> rows = startRead()
+            .option(Options.READ_DOCUMENTS_COLLECTIONS, "author")
+            .option(Options.READ_SNAPSHOT, false)
+            .load();
+
+        assertEquals(15, rows.count(), "This test only verifies that the snapshot option can be set to false. " +
+            "We don't yet have a way to verify that the query doesn't use a consistent snapshot, which would entail " +
+            "forcing the read to pause while an update and merge are performed in the database. Verifying the " +
+            "difference between a consistent snapshot and a dirty read will need to be done manually, including " +
+            "by inspecting the debug logs generated by this test.");
+    }
+
     @Test
     void logProgress() {
         newWriter().save();

src/test/java/com/marklogic/spark/writer/file/WriteFilesWithEncodingTest.java (2 additions, 2 deletions)

@@ -142,7 +142,7 @@ void prettyPrintJsonFile(@TempDir Path tempDir) {
     }
 
     @Test
-    void invalidEncoding(@TempDir Path tempDir) {
+    void invalidEncoding() {
         DataFrameWriter writer = newSparkSession().read().format(CONNECTOR_IDENTIFIER)
             .option(Options.CLIENT_URI, makeClientUri())
             .option(Options.READ_DOCUMENTS_URIS, SAMPLE_JSON_DOC_URI)
@@ -151,7 +151,7 @@ void invalidEncoding(@TempDir Path tempDir) {
             .option(Options.WRITE_FILES_ENCODING, "not-valid-encoding")
             .mode(SaveMode.Append);
 
-        ConnectorException ex = assertThrowsConnectorException(() -> writer.save(tempDir.toAbsolutePath().toString()));
+        ConnectorException ex = assertThrowsConnectorException(() -> writer.save("."));
         assertEquals("Unsupported encoding value: not-valid-encoding", ex.getMessage());
     }

src/test/java/com/marklogic/spark/writer/file/WriteRdfGzipFilesTest.java (2 additions, 2 deletions)

@@ -54,7 +54,7 @@ void gzip(@TempDir Path tempDir) {
     }
 
     @Test
-    void zipIsntValidChoice(@TempDir Path tempDir) {
+    void zipIsntValidChoice() {
         DataFrameWriter writer = newSparkSession()
             .read().format(CONNECTOR_IDENTIFIER)
             .option(Options.CLIENT_URI, makeClientUri())
@@ -65,7 +65,7 @@ void zipIsntValidChoice(@TempDir Path tempDir) {
             .option(Options.WRITE_FILES_COMPRESSION, "zip")
             .mode(SaveMode.Append);
 
-        ConnectorException ex = assertThrowsConnectorException(() -> writer.save(tempDir.toFile().getAbsolutePath()));
+        ConnectorException ex = assertThrowsConnectorException(() -> writer.save("."));
         assertEquals("Unsupported compression value; only 'gzip' is supported: zip", ex.getMessage());
     }
 }
