Commit f0d58bb

Merge pull request #286 from marklogic/feature/streaming-file-read
MLE-17041 Can now stream when reading generic files
2 parents (945fac2 + 1cb97b0), commit f0d58bb

File tree: 5 files changed, +159 −18 lines
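Before the per-file diffs, a minimal usage sketch of the new option. Only the spark.marklogic.files.stream key is confirmed by this commit; the "marklogic" format name, the spark.marklogic.client.uri connection option, and the spark.marklogic.write.collections key (the assumed value of Options.WRITE_COLLECTIONS) are assumptions based on the connector's usual conventions, not shown in this diff.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class StreamGenericFilesExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

        // Reader phase: with streaming enabled, each row's content column holds a
        // serialized FileContext rather than the bytes of the file.
        Dataset<Row> files = spark.read()
            .format("marklogic") // assumed format name for the MarkLogic Spark connector
            .option("spark.marklogic.files.stream", "true")
            .load("path/to/large-files");

        // Writer phase: the same option tells the connector to open and stream each
        // file while writing it, so its contents are never fully held in memory.
        files.write()
            .format("marklogic")
            .option("spark.marklogic.client.uri", "user:password@localhost:8003") // assumed connection option
            .option("spark.marklogic.files.stream", "true")
            .option("spark.marklogic.write.collections", "streamed-files") // assumed key for Options.WRITE_COLLECTIONS
            .mode(SaveMode.Append)
            .save();

        spark.stop();
    }
}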

src/main/java/com/marklogic/spark/MarkLogicFileTable.java
3 additions & 0 deletions

@@ -42,6 +42,9 @@ class MarkLogicFileTable extends FileTable {
 
     @Override
     public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) {
+        if ("true".equalsIgnoreCase(options.get(Options.STREAM_FILES)) && Util.MAIN_LOGGER.isInfoEnabled()) {
+            Util.MAIN_LOGGER.info("Will defer reading of file contents so they can be streamed during the writer phase.");
+        }
         return new FileScanBuilder(options.asCaseSensitiveMap(), super.fileIndex());
     }

src/main/java/com/marklogic/spark/Options.java
10 additions & 0 deletions

@@ -150,6 +150,16 @@ public abstract class Options {
     public static final String WRITE_RDF_FILES_FORMAT = "spark.marklogic.write.files.rdf.format";
     public static final String WRITE_RDF_FILES_GRAPH = "spark.marklogic.write.files.rdf.graph";
 
+    /**
+     * When used in the reader phase while reading generic files, the connector will put a serialized {@code FileContext}
+     * into the content column instead of the contents of the file. When used during the writer phase when writing rows
+     * conforming to {@code DocumentRowSchema}, the connector will stream the file using the {@code FileContext} to
+     * avoid reading its contents into memory.
+     *
+     * @since 2.4.0
+     */
+    public static final String STREAM_FILES = "spark.marklogic.files.stream";
+
     private Options() {
     }
 }
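The Javadoc above describes a round trip built on plain Java serialization: the reader phase serializes the FileContext into the content column, and the writer phase deserializes it back before streaming. A small self-contained sketch of that mechanism, using a generic Serializable value as a stand-in for FileContext (which is not part of this diff):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;

public class SerializationRoundTripSketch {

    // Mirrors GenericFileReader.serializeFileContext: object -> byte[] for the content column.
    static byte[] toBytes(Serializable value) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
            oos.writeObject(value);
        }
        return baos.toByteArray();
    }

    // Mirrors DocumentRowConverter.readContentFromFile: byte[] from the content column -> object.
    static Object fromBytes(byte[] bytes) throws IOException, ClassNotFoundException {
        try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes))) {
            return ois.readObject();
        }
    }

    public static void main(String[] args) throws Exception {
        byte[] columnValue = toBytes("stand-in for a FileContext");
        System.out.println(fromBytes(columnValue)); // prints the round-tripped value
    }
}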

src/main/java/com/marklogic/spark/reader/file/GenericFileReader.java
26 additions & 8 deletions

@@ -4,15 +4,18 @@
 package com.marklogic.spark.reader.file;
 
 import com.marklogic.spark.ConnectorException;
+import com.marklogic.spark.Options;
 import com.marklogic.spark.Util;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
 import org.apache.spark.sql.connector.read.PartitionReader;
 import org.apache.spark.unsafe.types.ByteArray;
 import org.apache.spark.unsafe.types.UTF8String;
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.ObjectOutputStream;
 
 /**
  * "Generic" = read each file as-is with no special processing.
@@ -21,13 +24,15 @@ class GenericFileReader implements PartitionReader<InternalRow> {
 
     private final FilePartition filePartition;
     private final FileContext fileContext;
+    private final boolean isStreaming;
 
     private InternalRow nextRowToReturn;
     private int filePathIndex;
 
     GenericFileReader(FilePartition filePartition, FileContext fileContext) {
         this.filePartition = filePartition;
         this.fileContext = fileContext;
+        this.isStreaming = "true".equalsIgnoreCase(fileContext.getStringOption(Options.STREAM_FILES));
     }
 
     @Override
@@ -39,14 +44,13 @@ public boolean next() {
         final String path = filePartition.getPaths().get(filePathIndex);
         filePathIndex++;
         try {
-            try (InputStream inputStream = fileContext.openFile(path)) {
-                byte[] content = fileContext.readBytes(inputStream);
-                nextRowToReturn = new GenericInternalRow(new Object[]{
-                    UTF8String.fromString(path),
-                    ByteArray.concat(content),
-                    null, null, null, null, null, null
-                });
-            }
+            byte[] content = this.isStreaming ? serializeFileContext() : readFileIntoByteArray(path);
+
+            nextRowToReturn = new GenericInternalRow(new Object[]{
+                UTF8String.fromString(path),
+                ByteArray.concat(content),
+                null, null, null, null, null, null
+            });
         } catch (Exception ex) {
             String message = String.format("Unable to read file at %s; cause: %s", path, ex.getMessage());
             if (fileContext.isReadAbortOnFailure()) {
@@ -67,4 +71,18 @@ public InternalRow get() {
     public void close() throws IOException {
         // Nothing to close.
     }
+
+    private byte[] readFileIntoByteArray(String path) throws IOException {
+        try (InputStream inputStream = fileContext.openFile(path)) {
+            return fileContext.readBytes(inputStream);
+        }
+    }
+
+    private byte[] serializeFileContext() throws IOException {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
+            oos.writeObject(fileContext);
+        }
+        return baos.toByteArray();
+    }
 }
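serializeFileContext relies on FileContext being serializable with standard Java serialization (the test at the end of this commit notes that it carries Hadoop's SerializableConfiguration). The class itself is not part of this diff; below is a rough, hypothetical skeleton listing only the members that the connector code shown here calls on it, with placeholder bodies.

import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;

// Hypothetical skeleton for illustration only; the real FileContext is not included in this commit.
public class FileContextSketch implements Serializable {

    // Used by GenericFileReader to check Options.STREAM_FILES.
    public String getStringOption(String optionName) {
        throw new UnsupportedOperationException("placeholder");
    }

    // Used in both phases to open the file at the given path.
    public InputStream openFile(String path) {
        throw new UnsupportedOperationException("placeholder");
    }

    // Used only when not streaming, to read the entire file into memory.
    public byte[] readBytes(InputStream inputStream) throws IOException {
        throw new UnsupportedOperationException("placeholder");
    }

    // Governs whether a failed read aborts the job or is merely logged.
    public boolean isReadAbortOnFailure() {
        throw new UnsupportedOperationException("placeholder");
    }
}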

src/main/java/com/marklogic/spark/writer/DocumentRowConverter.java
64 additions & 10 deletions

@@ -9,11 +9,17 @@
 import com.marklogic.client.io.BytesHandle;
 import com.marklogic.client.io.DocumentMetadataHandle;
 import com.marklogic.client.io.Format;
+import com.marklogic.client.io.InputStreamHandle;
+import com.marklogic.client.io.marker.AbstractWriteHandle;
+import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.Options;
 import com.marklogic.spark.reader.document.DocumentRowSchema;
+import com.marklogic.spark.reader.file.FileContext;
 import org.apache.spark.sql.catalyst.InternalRow;
 
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
+import java.io.ObjectInputStream;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Optional;
@@ -26,11 +32,13 @@ class DocumentRowConverter implements RowConverter {
     private final ObjectMapper objectMapper;
     private final String uriTemplate;
     private final Format documentFormat;
+    private final boolean isStreamingFromFiles;
 
     DocumentRowConverter(WriteContext writeContext) {
         this.uriTemplate = writeContext.getStringOption(Options.WRITE_URI_TEMPLATE);
         this.documentFormat = writeContext.getDocumentFormat();
         this.objectMapper = new ObjectMapper();
+        this.isStreamingFromFiles = writeContext.hasOption(Options.STREAM_FILES);
     }
 
     @Override
@@ -43,25 +51,34 @@ public Optional<DocBuilder.DocumentInputs> convertRow(InternalRow row) {
             return Optional.of(new DocBuilder.DocumentInputs(uri, null, null, metadata));
         }
 
-        final BytesHandle content = new BytesHandle(row.getBinary(1));
-        if (this.documentFormat != null) {
-            content.withFormat(this.documentFormat);
-        }
+        Content content = this.isStreamingFromFiles ?
+            readContentFromFile(uri, row) :
+            readContentFromRow(uri, row);
 
-        JsonNode uriTemplateValues = null;
-        if (this.uriTemplate != null && this.uriTemplate.trim().length() > 0) {
-            String format = row.isNullAt(2) ? null : row.getString(2);
-            uriTemplateValues = deserializeContentToJson(uri, content, format);
-        }
         DocumentMetadataHandle metadata = DocumentRowSchema.makeDocumentMetadata(row);
-        return Optional.of(new DocBuilder.DocumentInputs(uri, content, uriTemplateValues, metadata));
+        return Optional.of(new DocBuilder.DocumentInputs(
+            uri, content.contentHandle, content.uriTemplateValues, metadata)
+        );
     }
 
     @Override
     public List<DocBuilder.DocumentInputs> getRemainingDocumentInputs() {
         return new ArrayList<>();
    }
 
+    private Content readContentFromRow(String uri, InternalRow row) {
+        BytesHandle bytesHandle = new BytesHandle(row.getBinary(1));
+        if (this.documentFormat != null) {
+            bytesHandle.withFormat(this.documentFormat);
+        }
+        JsonNode uriTemplateValues = null;
+        if (this.uriTemplate != null && this.uriTemplate.trim().length() > 0) {
+            String format = row.isNullAt(2) ? null : row.getString(2);
+            uriTemplateValues = deserializeContentToJson(uri, bytesHandle, format);
+        }
+        return new Content(bytesHandle, uriTemplateValues);
+    }
+
     private JsonNode deserializeContentToJson(String initialUri, BytesHandle contentHandle, String format) {
         try {
             return objectMapper.readTree(contentHandle.get());
@@ -75,4 +92,41 @@ private JsonNode deserializeContentToJson(String initialUri, BytesHandle contentHandle, String format) {
             return values;
         }
     }
+
+    /**
+     * In a scenario where the user wants to stream a file into MarkLogic, the content column will contain a serialized
+     * instance of {@code FileContext}, which is used to stream the file into a {@code InputStreamHandle}.
+     */
+    private Content readContentFromFile(String uri, InternalRow row) {
+        byte[] bytes = row.getBinary(1);
+        try {
+            ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes));
+            FileContext fileContext = (FileContext) ois.readObject();
+            InputStreamHandle streamHandle = new InputStreamHandle(fileContext.openFile(uri));
+            if (this.documentFormat != null) {
+                streamHandle.withFormat(this.documentFormat);
+            }
+            return new Content(streamHandle, null);
+        } catch (Exception e) {
+            throw new ConnectorException(String.format("Unable to read from file %s; cause: %s", uri, e.getMessage()));
+        }
+    }
+
+    private static class Content {
+        private final AbstractWriteHandle contentHandle;
+        private final JsonNode uriTemplateValues;
+
+        public Content(AbstractWriteHandle contentHandle, JsonNode uriTemplateValues) {
+            this.contentHandle = contentHandle;
+            this.uriTemplateValues = uriTemplateValues;
+        }
+
+        AbstractWriteHandle getContentHandle() {
+            return contentHandle;
+        }
+
+        JsonNode getUriTemplateValues() {
+            return uriTemplateValues;
+        }
+    }
 }
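readContentFromFile swaps the BytesHandle for an InputStreamHandle so the MarkLogic Java Client can stream the document body rather than hold it in memory. A simplified, standalone illustration of that capability using the Java Client directly; the host, port, credentials, and file path are placeholders, and the connector's actual write path goes through its own batching code rather than a GenericDocumentManager.

import com.marklogic.client.DatabaseClient;
import com.marklogic.client.DatabaseClientFactory;
import com.marklogic.client.document.GenericDocumentManager;
import com.marklogic.client.io.Format;
import com.marklogic.client.io.InputStreamHandle;

import java.io.FileInputStream;
import java.io.InputStream;

public class StreamSingleFileExample {

    public static void main(String[] args) throws Exception {
        // Placeholder connection details for illustration only.
        DatabaseClient client = DatabaseClientFactory.newClient("localhost", 8003,
            new DatabaseClientFactory.DigestAuthContext("example-user", "example-password"));
        GenericDocumentManager documentManager = client.newDocumentManager();

        try (InputStream input = new FileInputStream("path/to/large-file.bin")) {
            // InputStreamHandle lets the client stream the content instead of requiring
            // the whole file as a byte array, which is the property DocumentRowConverter
            // relies on when STREAM_FILES is enabled.
            InputStreamHandle handle = new InputStreamHandle(input).withFormat(Format.BINARY);
            documentManager.write("/streamed/large-file.bin", handle);
        } finally {
            client.release();
        }
    }
}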
ReadGenericFilesStreamingTest.java (new file)
56 additions & 0 deletions

@@ -0,0 +1,56 @@
+/*
+ * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
+ */
+package com.marklogic.spark.reader.file;
+
+import com.marklogic.spark.AbstractIntegrationTest;
+import com.marklogic.spark.Options;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.junit.jupiter.api.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.ObjectInputStream;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+class ReadGenericFilesStreamingTest extends AbstractIntegrationTest {
+
+    /**
+     * In this context, "streaming" != Spark Structured Streaming, but rather avoiding reading the contents of a file
+     * into memory by postponing reading of the file until the writer phase, where it can then be streamed from disk into
+     * MarkLogic.
+     */
+    @Test
+    void stream() throws Exception {
+        Dataset<Row> dataset = newSparkSession().read().format(CONNECTOR_IDENTIFIER)
+            .option(Options.STREAM_FILES, true)
+            .load("src/test/resources/mixed-files");
+
+        assertEquals(4, dataset.count());
+        verifyEachRowHasFileContextAsItsContent(dataset);
+
+        defaultWrite(dataset.write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.STREAM_FILES, true)
+            .option(Options.WRITE_COLLECTIONS, "streamed-files")
+            .option(Options.WRITE_URI_REPLACE, ".*/mixed-files,''"));
+
+        assertCollectionSize("This verifies that enabling streaming does not break any functionality. We don't " +
+            "have a test for a file large enough to warrant streaming as that would drastically slow down the suite " +
+            "of tests.", "streamed-files", 4);
+    }
+
+    private void verifyEachRowHasFileContextAsItsContent(Dataset<Row> dataset) throws Exception {
+        for (Row row : dataset.collectAsList()) {
+            byte[] content = (byte[]) row.get(1);
+            try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(content))) {
+                FileContext fileContext = (FileContext) ois.readObject();
+                assertNotNull(fileContext, "To enable streaming of files, the content column should not " +
+                    "contain the contents of the file, which forces reading the entire file into memory. " +
+                    "Instead, the associated FileContext - containing the Hadoop SerializableConfiguration class - " +
+                    "should be serialized so that it can be used to read the file during the writer phase.");
+            }
+        }
+    }
+}
