
Commit 848b99f

Merge pull request #304 from marklogic/feature/17142-stream-gzip-files
MLE-17142 Can now stream gzip files on import
2 parents 59dc9e8 + e243894

File tree

8 files changed: +124 -32 lines changed

docs/reading-data/reading-files/generic-file-support.md

Lines changed: 3 additions & 1 deletion

@@ -46,7 +46,7 @@ The connector also supports the following
 
 ## Reading and writing large binary files
 
-The 2.3.2 connector introduces a fix for reading and writing large binary files to MarkLogic, allowing for the contents
+The 2.4.0 connector introduces support for reading and writing large binary files to MarkLogic, allowing for the contents
 of each file to be streamed from its source to MarkLogic. This avoids an issue where the Spark environment runs out
 of memory while trying to fit the contents of a file into an in-memory row.
 
@@ -62,6 +62,8 @@ Files read from the MarkLogic Spark connector with the above option can then be
 with the same option above being passed to the writer. The connector will then stream the contents of each file to
 MarkLogic, submitting one request to MarkLogic per document.
 
+The `spark.marklogic.streamFiles` option can also be used when reading GZIP, ZIP, and archive files.
+
 ## Reading any file
 
 If you wish to read files without any special handling provided by the connector, you can use the
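The `spark.marklogic.streamFiles` option documented above is exercised end to end by the new `streamThreeGZIPFiles` test later in this commit. As a quick orientation, here is a minimal sketch of the user-facing flow, assuming the connector's `marklogic` format name and placeholder connection details; the compression and client-URI option keys are my reading of the `Options` constants used in the test, not part of this diff:

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class StreamGzipSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

        // Read gzip files as streaming references rather than in-memory content.
        Dataset<Row> files = spark.read()
            .format("marklogic") // assumed format name
            .option("spark.marklogic.read.files.compression", "gzip") // assumed key behind Options.READ_FILES_COMPRESSION
            .option("spark.marklogic.streamFiles", true) // the option documented above
            .load("path/to/gzip-files");

        // Write each file to MarkLogic; the connector streams one request per document.
        files.write()
            .format("marklogic")
            .option("spark.marklogic.streamFiles", true)
            .option("spark.marklogic.client.uri", "user:password@localhost:8000") // placeholder connection
            .mode(SaveMode.Append)
            .save();
    }
}
```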

src/main/java/com/marklogic/spark/reader/file/FileContext.java

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ public boolean isZip() {
         return "zip".equalsIgnoreCase(getStringOption(Options.READ_FILES_COMPRESSION));
     }
 
-    boolean isGzip() {
+    public boolean isGzip() {
         return "gzip".equalsIgnoreCase(getStringOption(Options.READ_FILES_COMPRESSION));
     }
 

src/main/java/com/marklogic/spark/reader/file/FilePartition.java

Lines changed: 5 additions & 0 deletions

@@ -5,6 +5,7 @@
 
 import org.apache.spark.sql.connector.read.InputPartition;
 
+import java.util.Arrays;
 import java.util.List;
 
 public class FilePartition implements InputPartition {
@@ -13,6 +14,10 @@ public class FilePartition implements InputPartition {
 
     private final List<String> paths;
 
+    public FilePartition(String path) {
+        this.paths = Arrays.asList(path);
+    }
+
     public FilePartition(List<String> paths) {
         this.paths = paths;
     }

src/main/java/com/marklogic/spark/reader/file/GzipFileReader.java

Lines changed: 44 additions & 19 deletions

@@ -3,6 +3,7 @@
  */
 package com.marklogic.spark.reader.file;
 
+import com.marklogic.client.io.InputStreamHandle;
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.Util;
 import org.apache.commons.io.IOUtils;
@@ -19,17 +20,26 @@
  * Expects to read a single gzipped file and return a single row. May expand the scope of this later to expect multiple
  * files and to thus return multiple rows.
  */
-class GzipFileReader implements PartitionReader<InternalRow> {
+public class GzipFileReader implements PartitionReader<InternalRow> {
 
     private final FilePartition filePartition;
     private final FileContext fileContext;
+    private final StreamingMode streamingMode;
 
     private int nextFilePathIndex;
     private InternalRow rowToReturn;
 
-    GzipFileReader(FilePartition filePartition, FileContext fileContext) {
+    // Only set if streaming during the writer phase.
+    private InputStreamHandle streamingContentHandle;
+
+    public GzipFileReader(FilePartition filePartition, FileContext fileContext) {
+        this(filePartition, fileContext, fileContext.isStreamingFiles() ? StreamingMode.STREAM_DURING_READER_PHASE : null);
+    }
+
+    public GzipFileReader(FilePartition filePartition, FileContext fileContext, StreamingMode streamingMode) {
         this.filePartition = filePartition;
         this.fileContext = fileContext;
+        this.streamingMode = streamingMode;
     }
 
     @Override
@@ -40,32 +50,47 @@ public boolean next() {
 
         String currentFilePath = fileContext.decodeFilePath(filePartition.getPaths().get(nextFilePathIndex));
         nextFilePathIndex++;
-        InputStream gzipInputStream = null;
-        try {
-            gzipInputStream = fileContext.openFile(currentFilePath);
-            byte[] content = extractGZIPContents(currentFilePath, gzipInputStream);
-            String uri = makeURI(currentFilePath);
-            this.rowToReturn = new GenericInternalRow(new Object[]{
-                UTF8String.fromString(uri), ByteArray.concat(content),
-                null, null, null, null, null, null
-            });
-            return true;
-        } catch (RuntimeException ex) {
-            if (fileContext.isReadAbortOnFailure()) {
-                throw ex;
+        String uri = makeURI(currentFilePath);
+
+        Object contentValue;
+        if (StreamingMode.STREAM_DURING_READER_PHASE.equals(streamingMode)) {
+            contentValue = FileUtil.serializeFileContext(fileContext, currentFilePath);
+            uri = currentFilePath;
+        } else if (StreamingMode.STREAM_DURING_WRITER_PHASE.equals(streamingMode)) {
+            streamingContentHandle = new InputStreamHandle(fileContext.openFile(currentFilePath));
+            contentValue = null;
+        } else {
+            InputStream gzipInputStream = null;
+            try {
+                gzipInputStream = fileContext.openFile(currentFilePath);
+                byte[] content = extractGZIPContents(currentFilePath, gzipInputStream);
+                contentValue = ByteArray.concat(content);
+            } catch (RuntimeException ex) {
+                if (fileContext.isReadAbortOnFailure()) {
+                    throw ex;
+                }
+                Util.MAIN_LOGGER.warn("Unable to read file at {}; cause: {}", currentFilePath, ex.getMessage());
+                return next();
+            } finally {
+                IOUtils.closeQuietly(gzipInputStream);
             }
-            Util.MAIN_LOGGER.warn("Unable to read file at {}; cause: {}", currentFilePath, ex.getMessage());
-            return next();
-        } finally {
-            IOUtils.closeQuietly(gzipInputStream);
         }
+
+        this.rowToReturn = new GenericInternalRow(new Object[]{
+            UTF8String.fromString(uri), contentValue, null, null, null, null, null, null
+        });
+        return true;
     }
 
     @Override
     public InternalRow get() {
         return rowToReturn;
     }
 
+    public InputStreamHandle getStreamingContentHandle() {
+        return streamingContentHandle;
+    }
+
     @Override
     public void close() {
         // Nothing to close.
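Both constructors above reference a `StreamingMode` type that is not itself touched by this commit. From the two constants used in the diff, it is presumably a plain enum along these lines (a sketch inferred from usage, not the connector's actual source):

```java
// Inferred sketch of StreamingMode; only the two constants below appear in
// this commit, so the real definition may carry more than this.
public enum StreamingMode {
    STREAM_DURING_READER_PHASE,  // reader phase: put a serialized FileContext in the content column
    STREAM_DURING_WRITER_PHASE   // writer phase: open the file into an InputStreamHandle on demand
}
```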

src/main/java/com/marklogic/spark/writer/DocumentRowConverter.java

Lines changed: 20 additions & 6 deletions

@@ -22,7 +22,6 @@
 import java.io.IOException;
 import java.io.ObjectInputStream;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.stream.Stream;
@@ -54,7 +53,7 @@ public Iterator<DocBuilder.DocumentInputs> convertRow(InternalRow row) {
             return Stream.of(new DocBuilder.DocumentInputs(uri, null, null, metadata)).iterator();
         }
 
-        return this.isStreamingFromFiles ? readContentFromFile(uri, row) : readContentFromRow(uri, row);
+        return this.isStreamingFromFiles ? streamContentFromFile(uri, row) : readContentFromRow(uri, row);
     }
 
     @Override
@@ -94,7 +93,7 @@ private JsonNode deserializeContentToJson(String initialUri, BytesHandle content
      * In a scenario where the user wants to stream a file into MarkLogic, the content column will contain a serialized
      * instance of {@code FileContext}, which is used to stream the file into a {@code InputStreamHandle}.
      */
-    private Iterator<DocBuilder.DocumentInputs> readContentFromFile(String filePath, InternalRow row) {
+    private Iterator<DocBuilder.DocumentInputs> streamContentFromFile(String filePath, InternalRow row) {
         byte[] bytes = row.getBinary(1);
         FileContext fileContext;
         try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes))) {
@@ -107,9 +106,13 @@ private Iterator<DocBuilder.DocumentInputs> readContentFromFile(String filePath,
             return buildIteratorForArchiveFile(filePath, fileContext);
         } else if (fileContext.isZip()) {
             return buildIteratorForZipFile(filePath, fileContext);
+        } else if (fileContext.isGzip()) {
+            return buildIteratorForGzipFile(filePath, fileContext);
         }
+        return buildIteratorForGenericFile(row, filePath, fileContext);
+    }
 
-    // If it's not an archive or normal zip file, we just have generic files that the user wants to stream.
+    private Iterator<DocBuilder.DocumentInputs> buildIteratorForGenericFile(InternalRow row, String filePath, FileContext fileContext) {
         final String decodedPath = fileContext.decodeFilePath(filePath);
         InputStreamHandle streamHandle = new InputStreamHandle(fileContext.openFile(decodedPath));
         if (this.documentFormat != null) {
@@ -120,16 +123,27 @@ private Iterator<DocBuilder.DocumentInputs> readContentFromFile(String filePath,
     }
 
     private Iterator<DocBuilder.DocumentInputs> buildIteratorForArchiveFile(String filePath, FileContext fileContext) {
-        FilePartition filePartition = new FilePartition(Arrays.asList(filePath));
+        FilePartition filePartition = new FilePartition(filePath);
         ArchiveFileReader archiveFileReader = new ArchiveFileReader(
             filePartition, fileContext, StreamingMode.STREAM_DURING_WRITER_PHASE
         );
         return new ArchiveFileIterator(archiveFileReader, this.documentFormat);
     }
 
     private Iterator<DocBuilder.DocumentInputs> buildIteratorForZipFile(String filePath, FileContext fileContext) {
-        FilePartition filePartition = new FilePartition(Arrays.asList(filePath));
+        FilePartition filePartition = new FilePartition(filePath);
         ZipFileReader reader = new ZipFileReader(filePartition, fileContext, StreamingMode.STREAM_DURING_WRITER_PHASE);
         return new ZipFileIterator(reader, this.documentFormat);
     }
+
+    private Iterator<DocBuilder.DocumentInputs> buildIteratorForGzipFile(String filePath, FileContext fileContext) {
+        GzipFileReader reader = new GzipFileReader(new FilePartition(filePath), fileContext, StreamingMode.STREAM_DURING_WRITER_PHASE);
+        reader.next();
+        String uri = reader.get().getString(0);
+        InputStreamHandle contentHandle = reader.getStreamingContentHandle();
+        if (this.documentFormat != null) {
+            contentHandle.withFormat(this.documentFormat);
+        }
+        return Stream.of(new DocBuilder.DocumentInputs(uri, contentHandle, null, null)).iterator();
+    }
 }
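Together with `GzipFileReader`, this shows the handshake for streamed files: the reader phase stores a Java-serialized `FileContext` in the row's content column, and `streamContentFromFile` deserializes it in the writer phase. A minimal round-trip sketch, assuming `FileUtil.serializeFileContext` (not shown in this commit) uses standard `ObjectOutputStream` serialization; the real method also receives the file path, which this sketch omits:

```java
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;

final class FileContextRoundTripSketch {

    // Reader phase: what presumably lands in column 1 of the streamed row.
    static byte[] serialize(Serializable fileContext) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
            oos.writeObject(fileContext);
        }
        return baos.toByteArray();
    }

    // Writer phase: mirrors the ObjectInputStream usage in streamContentFromFile above.
    static Object deserialize(byte[] bytes) throws IOException, ClassNotFoundException {
        try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes))) {
            return ois.readObject();
        }
    }
}
```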

src/main/java/com/marklogic/spark/writer/file/DocumentFileWriterFactory.java

Lines changed: 5 additions & 4 deletions

@@ -32,13 +32,14 @@ public DataWriter<InternalRow> createWriter(int partitionId, long taskId) {
         if (this.schema.equals(TripleRowSchema.SCHEMA)) {
             return new RdfFileWriter(properties, hadoopConfiguration, partitionId);
         }
+
         String compression = this.properties.get(Options.WRITE_FILES_COMPRESSION);
         if (compression != null && compression.length() > 0) {
-            if ("zip".equalsIgnoreCase(compression)) {
-                return new ZipFileWriter(properties, hadoopConfiguration, partitionId);
-            }
-            return new GzipFileWriter(properties, hadoopConfiguration);
+            return "zip".equalsIgnoreCase(compression) ?
+                new ZipFileWriter(properties, hadoopConfiguration, partitionId) :
+                new GzipFileWriter(properties, hadoopConfiguration);
         }
+
         return new DocumentFileWriter(properties, hadoopConfiguration);
     }
 }

src/test/java/com/marklogic/spark/reader/file/ReadGzipFilesTest.java

Lines changed: 46 additions & 1 deletion

@@ -3,14 +3,19 @@
  */
 package com.marklogic.spark.reader.file;
 
+import com.fasterxml.jackson.databind.JsonNode;
+import com.marklogic.junit5.XmlNode;
 import com.marklogic.spark.AbstractIntegrationTest;
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.Options;
 import org.apache.spark.SparkException;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SaveMode;
 import org.junit.jupiter.api.Test;
 
+import java.io.ByteArrayInputStream;
+import java.io.ObjectInputStream;
 import java.util.List;
 
 import static org.junit.jupiter.api.Assertions.*;
@@ -30,7 +35,7 @@ void readThreeGZIPFiles() {
 
         verifyRow(rows.get(0), "/src/test/resources/gzip-files/hello.xml", "<hello>world</hello>\n");
         verifyRow(rows.get(1), "/src/test/resources/gzip-files/level1/hello.txt", "hello world\n");
-        verifyRow(rows.get(2), "/src/test/resources/gzip-files/level1/level2/hello.json", "{\"hello\":\"world\"}\n");
+        verifyRow(rows.get(2), "/src/test/resources/gzip-files/level1/level2/hello world.json", "{\"hello\":\"world\"}\n");
     }
 
     @Test
@@ -73,6 +78,46 @@ void dontAbortOnFailure() {
             "error for the non-gzipped mixed-files.zip file being logged as a warning but not causing a failure.");
     }
 
+    @Test
+    void streamThreeGZIPFiles() throws Exception {
+        Dataset<Row> dataset = newSparkSession().read()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.READ_FILES_COMPRESSION, "gzip")
+            .option("recursiveFileLookup", "true")
+            .option(Options.STREAM_FILES, true)
+            .load("src/test/resources/gzip-files");
+
+        List<Row> rows = dataset.collectAsList();
+        assertEquals(3, rows.size());
+        for (Row row : rows) {
+            assertFalse(row.isNullAt(0), "The URI column should be populated.");
+            byte[] content = (byte[]) row.get(1);
+            try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(content))) {
+                FileContext fileContext = (FileContext) ois.readObject();
+                assertNotNull(fileContext);
+            }
+        }
+
+        // Write the streaming files to MarkLogic.
+        dataset.write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.STREAM_FILES, true)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
+            .option(Options.WRITE_COLLECTIONS, "streamed-files")
+            .option(Options.WRITE_URI_REPLACE, ".*gzip-files,'/gzip-files'")
+            .mode(SaveMode.Append)
+            .save();
+
+        assertCollectionSize("streamed-files", 3);
+        XmlNode doc = readXmlDocument("/gzip-files/hello.xml");
+        doc.assertElementValue("/hello", "world");
+
+        // Because each streamed file has to be sent via a PUT request, and the PUT endpoint does not allow spaces -
+        // see MLE-17088 - the URI will be encoded.
+        JsonNode node = readJsonDocument("/gzip-files/level1/level2/hello%20world.json");
+        assertEquals("world", node.get("hello").asText());
+    }
+
     private void verifyRow(Row row, String expectedUriSuffix, String expectedContent) {
         String uri = row.getString(0);
         assertTrue(uri.endsWith(expectedUriSuffix), "Unexpected URI: " + uri);
