
Commit 673de1a

Merge pull request #288 from marklogic/feature/streaming-file-write
MLE-17041 Can now stream when writing generic files
2 parents: f0d58bb + 3370ca7

7 files changed: +101 −15 lines changed

src/main/java/com/marklogic/spark/ContextSupport.java

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ public class ContextSupport implements Serializable {
     // client. Those two actions are rarely done, so the cost of synchronization will be negligible.
     private static final Object CLIENT_LOCK = new Object();
 
-    protected ContextSupport(Map<String, String> properties) {
+    public ContextSupport(Map<String, String> properties) {
         this.properties = properties;
         this.configuratorWasAdded = addOkHttpConfiguratorIfNecessary();
     }
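
The constructor is widened from protected to public so that code outside this package can build a MarkLogic connection from the same Spark options map. A minimal sketch of the call site this enables, mirroring the ContentWriter change later in this commit (the properties map is assumed to hold the connector's connection options):

    // Sketch: a class in another package (e.g. the file writer) can now do this
    // when streaming is enabled, to fetch documents at write time.
    GenericDocumentManager documentManager =
        new ContextSupport(properties).connectToMarkLogic().newDocumentManager();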

src/main/java/com/marklogic/spark/Options.java

Lines changed: 4 additions & 0 deletions
@@ -155,6 +155,10 @@ public abstract class Options {
      * into the content column instead of the contents of the file. When used during the writer phase when writing rows
      * conforming to {@code DocumentRowSchema}, the connector will stream the file using the {@code FileContext} to
      * avoid reading its contents into memory.
+     * <p>
+     * Similarly, when used in the reader phase when reading documents from MarkLogic, the value of the 'content' column
+     * in each row will be null. During the writer phase, the connector will retrieve the document corresponding to the
+     * value in the 'uri' column and stream it to file.
      *
      * @since 2.4.0
      */
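
The new javadoc describes a two-phase pipeline: the reader emits rows whose 'content' column is null, and the writer fetches each document by its 'uri' value and streams it to a file. A hedged usage sketch using the option constants documented above (the format identifier and connection string below are placeholders, not values confirmed by this commit):

    // Read documents lazily: content stays in MarkLogic; rows carry only URIs and metadata.
    Dataset<Row> rows = sparkSession.read()
        .format("marklogic") // assumed connector identifier
        .option(Options.CLIENT_URI, "user:password@localhost:8000") // placeholder
        .option(Options.STREAM_FILES, true)
        .option(Options.READ_DOCUMENTS_COLLECTIONS, "author")
        .load();

    // Write files: each document is retrieved by URI and streamed to disk.
    rows.write()
        .format("marklogic")
        .option(Options.CLIENT_URI, "user:password@localhost:8000") // placeholder
        .option(Options.STREAM_FILES, true)
        .mode(SaveMode.Append)
        .save("/tmp/output"); // placeholder output directory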

src/main/java/com/marklogic/spark/reader/document/DocumentContext.java

Lines changed: 3 additions & 0 deletions
@@ -43,6 +43,9 @@ Set<DocumentManager.Metadata> getRequestedMetadata() {
     }
 
     boolean contentWasRequested() {
+        if ("true".equals(getStringOption(Options.STREAM_FILES))) {
+            return false;
+        }
         if (!hasOption(Options.READ_DOCUMENTS_CATEGORIES)) {
             return true;
         }

src/main/java/com/marklogic/spark/reader/document/DocumentScanBuilder.java

Lines changed: 5 additions & 0 deletions
@@ -3,6 +3,8 @@
  */
 package com.marklogic.spark.reader.document;
 
+import com.marklogic.spark.Options;
+import com.marklogic.spark.Util;
 import org.apache.spark.sql.connector.read.Scan;
 import org.apache.spark.sql.connector.read.ScanBuilder;
 import org.apache.spark.sql.connector.read.SupportsPushDownLimit;
@@ -15,6 +17,9 @@ class DocumentScanBuilder implements ScanBuilder, SupportsPushDownLimit {
 
     DocumentScanBuilder(CaseInsensitiveStringMap options, StructType schema) {
         this.context = new DocumentContext(options, schema);
+        if ("true".equalsIgnoreCase(this.context.getStringOption(Options.STREAM_FILES)) && Util.MAIN_LOGGER.isInfoEnabled()) {
+            Util.MAIN_LOGGER.info("Will defer reading documents from MarkLogic so they can be streamed to files during the writer phase.");
+        }
     }
 
     @Override

src/main/java/com/marklogic/spark/writer/file/ContentWriter.java

Lines changed: 20 additions & 3 deletions
@@ -5,7 +5,10 @@
 
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import com.marklogic.client.document.GenericDocumentManager;
+import com.marklogic.client.io.BytesHandle;
 import com.marklogic.spark.ConnectorException;
+import com.marklogic.spark.ContextSupport;
 import com.marklogic.spark.Options;
 import com.marklogic.spark.reader.document.DocumentRowSchema;
 import org.apache.spark.sql.catalyst.InternalRow;
@@ -32,6 +35,9 @@ class ContentWriter {
     private final boolean prettyPrint;
     private final Charset encoding;
 
+    // Only set when streaming.
+    private final GenericDocumentManager documentManager;
+
     ContentWriter(Map<String, String> properties) {
         this.encoding = determineEncoding(properties);
         this.prettyPrint = "true".equalsIgnoreCase(properties.get(Options.WRITE_FILES_PRETTY_PRINT));
@@ -42,19 +48,22 @@ class ContentWriter {
             this.transformer = null;
            this.objectMapper = null;
         }
+
+        this.documentManager = "true".equalsIgnoreCase(properties.get(Options.STREAM_FILES)) ?
+            new ContextSupport(properties).connectToMarkLogic().newDocumentManager() : null;
     }
 
     void writeContent(InternalRow row, OutputStream outputStream) throws IOException {
         if (this.prettyPrint) {
             prettyPrintContent(row, outputStream);
         } else {
-            byte[] bytes = row.getBinary(1);
+            byte[] bytes = getContentBytes(row);
             if (this.encoding != null) {
                 // We know the string from MarkLogic is UTF-8, so we use getBytes to convert it to the user's
                 // specified encoding (as opposed to new String(bytes, encoding)).
                 outputStream.write(new String(bytes).getBytes(this.encoding));
             } else {
-                outputStream.write(row.getBinary(1));
+                outputStream.write(bytes);
             }
         }
     }
@@ -107,7 +116,7 @@ private Transformer newTransformer() {
     }
 
     private void prettyPrintContent(InternalRow row, OutputStream outputStream) throws IOException {
-        final byte[] content = row.getBinary(1);
+        final byte[] content = getContentBytes(row);
         final String format = row.isNullAt(2) ? null : row.getString(2);
         if ("JSON".equalsIgnoreCase(format)) {
             prettyPrintJson(content, outputStream);
@@ -141,4 +150,12 @@ private void prettyPrintXml(byte[] content, OutputStream outputStream) {
             throw new ConnectorException(String.format("Unable to pretty print XML; cause: %s", e.getMessage()), e);
         }
     }
+
+    private byte[] getContentBytes(InternalRow row) {
+        if (this.documentManager != null) {
+            String uri = row.getString(0);
+            return documentManager.read(uri, new BytesHandle()).get();
+        }
+        return row.getBinary(1);
+    }
 }
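
When documentManager is non-null (i.e. streaming is enabled), getContentBytes ignores the row's binary column and performs a per-row read against MarkLogic. A standalone sketch of that deferred read using the MarkLogic Java Client (client construction elided; the URI is a placeholder):

    // Per-row deferred read, as getContentBytes does when streaming.
    GenericDocumentManager mgr = databaseClient.newDocumentManager();
    byte[] content = mgr.read("/author/author1.json", new BytesHandle()).get();
    // The bytes are then written to the file's OutputStream.

Note that BytesHandle still buffers each document as a byte array inside the writer; the saving is that document bodies never travel through Spark rows between the reader and writer phases.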

src/test/java/com/marklogic/spark/writer/file/PrettyPrintFilesTest.java

Lines changed: 25 additions & 0 deletions
@@ -19,6 +19,7 @@
 import java.util.List;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
 class PrettyPrintFilesTest extends AbstractIntegrationTest {
 
@@ -108,4 +109,28 @@ void notPrettyPrinted(@TempDir Path tempDir) throws IOException {
         String doc2 = FileUtils.readFileToString(new File(dir, "doc2.json"), "UTF-8");
         assertEquals("{\"hello\":\"world\"}", doc2);
     }
+
+    @Test
+    void notSupportedWhenStreaming(@TempDir Path tempDir) throws Exception {
+        newSparkSession().read()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.STREAM_FILES, true)
+            .option(Options.READ_DOCUMENTS_COLLECTIONS, "pretty-print")
+            .load()
+            .write()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.STREAM_FILES, true)
+            .option(Options.WRITE_FILES_PRETTY_PRINT, "true")
+            .mode(SaveMode.Append)
+            .save(tempDir.toFile().getAbsolutePath());
+
+        File dir = new File(tempDir.toFile(), "pretty-print");
+        String doc1 = FileUtils.readFileToString(new File(dir, "doc1.xml"), "UTF-8");
+        assertTrue(doc1.contains("<root><hello>world</hello></root>"),
+            "pretty-printed is not supported when streaming documents, as pretty-printing requires reading the " +
+                "document into memory, which conflicts with streaming. So the XML doc should be on a single line. " +
+                "Actual doc: " + doc1);
+    }
 }

src/test/java/com/marklogic/spark/writer/file/WriteDocumentFilesTest.java

Lines changed: 43 additions & 11 deletions
@@ -11,6 +11,8 @@
 import com.marklogic.spark.AbstractIntegrationTest;
 import com.marklogic.spark.Options;
 import com.marklogic.spark.TestUtil;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SaveMode;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.io.TempDir;
@@ -40,18 +42,33 @@ void writeFifteenAuthorFiles(@TempDir Path tempDir) throws Exception {
             .mode(SaveMode.Append)
             .save(tempDir.toFile().getAbsolutePath());
 
-        for (int i = 1; i <= 15; i++) {
-            File expectedFile = Paths.get(
-                tempDir.toFile().getAbsolutePath(),
-                "author", "author" + i + ".json"
-            ).toFile();
-            assertTrue(expectedFile.exists(), "Expected file at: " + expectedFile);
+        verifyAuthorFilesWereCorrectlyWritten(tempDir);
+    }
 
-            // Verify the JSON is valid.
-            JsonNode doc = objectMapper.readTree(expectedFile);
-            assertTrue(doc.has("CitationID"));
-            assertTrue(doc.has("LastName"));
-        }
+    @Test
+    void streamAuthorDocuments(@TempDir Path tempDir) throws Exception {
+        Dataset<Row> dataset = newSparkSession().read()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.STREAM_FILES, true)
+            .option(Options.READ_DOCUMENTS_COLLECTIONS, "author")
+            .load();
+
+        assertEquals(15, dataset.count());
+
+        dataset.collectAsList().forEach(row -> assertTrue(row.isNullAt(1),
+            "When the 'stream files' option is used when reading documents, the 'content' column should be null " +
+                "for each row. When each row is written to a file, the document corresponding to the URI in the " +
+                "'uri' column should be retrieved and streamed to file, thus avoiding ever reading the entire " +
+                "document into memory."));
+
+        dataset.write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.STREAM_FILES, true)
+            .mode(SaveMode.Append)
+            .save(tempDir.toFile().getAbsolutePath());
+
+        verifyAuthorFilesWereCorrectlyWritten(tempDir);
     }
 
     @Test
@@ -119,4 +136,19 @@ void uriHasSpace(@TempDir Path tempDir) {
             "due to a space), the error should be logged and the file should be written with its unaltered " +
             "document URI used for the file path.");
     }
+
+    private void verifyAuthorFilesWereCorrectlyWritten(Path tempDir) throws Exception {
+        for (int i = 1; i <= 15; i++) {
+            File expectedFile = Paths.get(
+                tempDir.toFile().getAbsolutePath(),
+                "author", "author" + i + ".json"
+            ).toFile();
+            assertTrue(expectedFile.exists(), "Expected file at: " + expectedFile);
+
+            // Verify the JSON is valid.
+            JsonNode doc = objectMapper.readTree(expectedFile);
+            assertTrue(doc.has("CitationID"));
+            assertTrue(doc.has("LastName"));
+        }
+    }
 }
