Commit 8e82315

Merge pull request #289 from marklogic/feature/streaming-put
MLE-17041 Making PUT call when streaming
2 parents: 673de1a + 20379a9 · commit 8e82315
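
For context, here is a minimal sketch of how a caller opts into this streaming path with the connector, pieced together from the test added in this commit. The format name "marklogic", the host/port, the credentials, and the file paths are placeholders and assumptions, not values taken from this commit.

import com.marklogic.spark.Options;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class StreamFilesSketch {

    public static void main(String[] args) {
        SparkSession session = SparkSession.builder()
            .master("local[*]")
            .getOrCreate();

        session.read().format("marklogic") // assumption: the connector's short format name
            // Defer reading each file's contents until the writer phase.
            .option(Options.STREAM_FILES, "true")
            .load("path/to/large-files")
            .write().format("marklogic")
            .option(Options.STREAM_FILES, "true")
            // assumption: CLIENT_URI accepts the user:password@host:port form used by the tests
            .option(Options.CLIENT_URI, "spark-user:password@localhost:8003")
            .option(Options.WRITE_PERMISSIONS, "rest-reader,read,rest-writer,update")
            .mode(SaveMode.Append)
            .save();

        session.stop();
    }
}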

File tree

2 files changed: +66 −10 lines changed


src/main/java/com/marklogic/spark/writer/WriteBatcherDataWriter.java

Lines changed: 39 additions & 3 deletions
@@ -7,9 +7,11 @@
 import com.marklogic.client.datamovement.DataMovementManager;
 import com.marklogic.client.datamovement.WriteBatcher;
 import com.marklogic.client.document.DocumentWriteOperation;
+import com.marklogic.client.document.GenericDocumentManager;
 import com.marklogic.client.impl.HandleAccessor;
 import com.marklogic.client.io.DocumentMetadataHandle;
 import com.marklogic.client.io.marker.AbstractWriteHandle;
+import com.marklogic.client.io.marker.GenericWriteHandle;
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.Options;
 import com.marklogic.spark.Util;
@@ -55,6 +57,10 @@ class WriteBatcherDataWriter implements DataWriter<InternalRow> {

     private final RowConverter rowConverter;

+    private final boolean isStreamingFiles;
+    // Only initialized if streaming files.
+    private final GenericDocumentManager documentManager;
+
     // Updated as batches are processed.
     private final AtomicInteger successItemCount = new AtomicInteger(0);
     private final AtomicInteger failedItemCount = new AtomicInteger(0);
@@ -65,6 +71,8 @@ class WriteBatcherDataWriter implements DataWriter<InternalRow> {
         this.docBuilder = this.writeContext.newDocBuilder();
         this.databaseClient = writeContext.connectToMarkLogic();
         this.rowConverter = determineRowConverter();
+        this.isStreamingFiles = "true".equals(writeContext.getStringOption(Options.STREAM_FILES));
+        this.documentManager = this.isStreamingFiles ? databaseClient.newDocumentManager() : null;

         if (writeContext.isAbortOnFailure()) {
             this.batchRetrier = null;
@@ -86,7 +94,12 @@ public void write(InternalRow row) {
         throwWriteFailureIfExists();
         Optional<DocBuilder.DocumentInputs> document = rowConverter.convertRow(row);
         if (document.isPresent()) {
-            this.writeBatcher.add(this.docBuilder.build(document.get()));
+            DocumentWriteOperation writeOp = this.docBuilder.build(document.get());
+            if (this.isStreamingFiles) {
+                writeDocumentViaPutOperation(writeOp);
+            } else {
+                this.writeBatcher.add(writeOp);
+            }
         }
     }

@@ -183,8 +196,7 @@ private BatchRetrier makeBatchRetrier() {
            writeContext.getStringOption(Options.WRITE_TEMPORAL_COLLECTION),
            successfulBatch -> successItemCount.getAndAdd(successfulBatch.size()),
            (failedDoc, failure) -> {
-                Util.MAIN_LOGGER.error("Unable to write document with URI: {}; cause: {}", failedDoc.getUri(), failure.getMessage());
-                failedItemCount.incrementAndGet();
+                captureFailure(failure.getMessage(), failedDoc.getUri());
                if (this.archiveWriter != null) {
                    writeFailedDocumentToArchive(failedDoc);
                }
@@ -234,4 +246,28 @@ private void closeArchiveWriter() {
            archiveWriter.close();
        }
    }
+
+    /**
+     * A user typically chooses to stream a document due to its size. A PUT call to v1/documents can handle a document
+     * of any size. But a POST call seems to have a limitation due to the multipart nature of the request - the body
+     * part appears to be read into memory, which can cause the server to run out of memory. So for streaming, a PUT
+     * call is made, which means we don't use the WriteBatcher.
+     *
+     * @param writeOp
+     */
+    private void writeDocumentViaPutOperation(DocumentWriteOperation writeOp) {
+        final String uri = writeOp.getUri();
+        try {
+            this.documentManager.write(uri, writeOp.getMetadata(), (GenericWriteHandle) writeOp.getContent());
+            this.successItemCount.incrementAndGet();
+        } catch (RuntimeException ex) {
+            captureFailure(ex.getMessage(), uri);
+            this.writeFailure.compareAndSet(null, ex);
+        }
+    }
+
+    private void captureFailure(String message, String documentUri) {
+        Util.MAIN_LOGGER.error("Unable to write document with URI: {}; cause: {}", documentUri, message);
+        failedItemCount.incrementAndGet();
+    }
 }
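
To illustrate the comment in writeDocumentViaPutOperation in isolation, here is a minimal sketch of the same Java Client call pattern: a GenericDocumentManager writing a single document whose content is streamed from disk via an InputStreamHandle. The host, port, credentials, URI, collection, and file path are placeholders, not values from this commit.

import com.marklogic.client.DatabaseClient;
import com.marklogic.client.DatabaseClientFactory;
import com.marklogic.client.document.GenericDocumentManager;
import com.marklogic.client.io.DocumentMetadataHandle;
import com.marklogic.client.io.InputStreamHandle;

import java.io.FileInputStream;
import java.io.FileNotFoundException;

public class PutSingleDocumentSketch {

    public static void main(String[] args) throws FileNotFoundException {
        DatabaseClient client = DatabaseClientFactory.newClient("localhost", 8003,
            new DatabaseClientFactory.DigestAuthContext("spark-user", "password"));

        GenericDocumentManager documentManager = client.newDocumentManager();

        DocumentMetadataHandle metadata = new DocumentMetadataHandle().withCollections("streamed-files");
        metadata.getPermissions().add("rest-writer",
            DocumentMetadataHandle.Capability.READ, DocumentMetadataHandle.Capability.UPDATE);

        // InputStreamHandle implements GenericWriteHandle, so the content is read from the
        // stream as it is sent; per the commit, this path results in a PUT to v1/documents
        // rather than a multipart POST that buffers the body in memory.
        InputStreamHandle content = new InputStreamHandle(new FileInputStream("path/to/large-file.bin"));

        documentManager.write("/streamed/large-file.bin", metadata, content);
        client.release();
    }
}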

src/test/java/com/marklogic/spark/reader/file/ReadGenericFilesStreamingTest.java

Lines changed: 27 additions & 7 deletions
@@ -4,24 +4,26 @@
 package com.marklogic.spark.reader.file;

 import com.marklogic.spark.AbstractIntegrationTest;
+import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.Options;
+import org.apache.spark.sql.DataFrameWriter;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SaveMode;
 import org.junit.jupiter.api.Test;

 import java.io.ByteArrayInputStream;
 import java.io.ObjectInputStream;

-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.*;

+/**
+ * In this context, "streaming" != Spark Structured Streaming, but rather avoiding reading the contents of a file
+ * into memory by postponing reading of the file until the writer phase, where it can then be streamed from disk into
+ * MarkLogic.
+ */
 class ReadGenericFilesStreamingTest extends AbstractIntegrationTest {

-    /**
-     * In this context, "streaming" != Spark Structured Streaming, but rather avoiding reading the contents of a file
-     * into memory by postponing reading of the file until the writer phase, where it can then be streamed from disk into
-     * MarkLogic.
-     */
     @Test
     void stream() throws Exception {
         Dataset<Row> dataset = newSparkSession().read().format(CONNECTOR_IDENTIFIER)
@@ -41,6 +43,24 @@ void stream() throws Exception {
            "of tests.", "streamed-files", 4);
     }

+    @Test
+    void handleFailureWhileStreaming() {
+        DataFrameWriter writer = newSparkSession()
+            .read().format(CONNECTOR_IDENTIFIER)
+            .option(Options.STREAM_FILES, true)
+            .load("src/test/resources/mixed-files/hello.json")
+            .write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.STREAM_FILES, true)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.WRITE_PERMISSIONS, "not-an-actual-role,read")
+            .mode(SaveMode.Append);
+
+        ConnectorException ex = assertThrowsConnectorException(() -> writer.save());
+        assertTrue(ex.getMessage().contains("SEC-ROLEDNE: xdmp:role(\"not-an-actual-role\")"),
+            "This verifies that when the connector uses GenericDocumentManager to PUT a single document, any error " +
+                "is still wrapped in a ConnectorException. Actual error message: " + ex.getMessage());
+    }
+
     private void verifyEachRowHasFileContextAsItsContent(Dataset<Row> dataset) throws Exception {
         for (Row row : dataset.collectAsList()) {
             byte[] content = (byte[]) row.get(1);
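
The trailing context above only shows the start of verifyEachRowHasFileContextAsItsContent. As a rough, hedged sketch of the kind of check its first lines suggest (the concrete class serialized into the content column is internal to the connector and not shown in this diff):

import org.apache.spark.sql.Row;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;

class StreamedRowContentSketch {

    // When streaming, the content column appears to hold a Java-serialized object (a "file context")
    // rather than the file's bytes; deserializing it is how the test helper begins its verification.
    static Object deserializeContentColumn(Row row) throws IOException, ClassNotFoundException {
        byte[] content = (byte[]) row.get(1);
        try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(content))) {
            return in.readObject();
        }
    }
}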
