Skip to content

Commit efcbee1

Browse files
authored
Merge pull request #302 from marklogic/feature/stream-normal-zip-file
MLE-17141 Can now stream normal zip files into MarkLogic
2 parents 759ca71 + a71b460 commit efcbee1

File tree

10 files changed

+205
-53
lines changed

10 files changed

+205
-53
lines changed

src/main/java/com/marklogic/spark/reader/file/ArchiveFileReader.java

Lines changed: 3 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,7 @@
1313
import org.apache.spark.sql.catalyst.InternalRow;
1414
import org.apache.spark.sql.connector.read.PartitionReader;
1515

16-
import java.io.ByteArrayOutputStream;
1716
import java.io.IOException;
18-
import java.io.ObjectOutputStream;
1917
import java.util.ArrayList;
2018
import java.util.List;
2119
import java.util.zip.ZipEntry;
@@ -36,11 +34,6 @@ public class ArchiveFileReader implements PartitionReader<InternalRow> {
3634
// Legacy = content first, then metadata.
3735
private Boolean isLegacyFormat;
3836

39-
public enum StreamingMode {
40-
STREAM_DURING_READER_PHASE,
41-
STREAM_DURING_WRITER_PHASE
42-
}
43-
4437
ArchiveFileReader(FilePartition filePartition, FileContext fileContext) {
4538
this(
4639
filePartition, fileContext,
@@ -229,22 +222,13 @@ private byte[] readMetadataEntry(String zipEntryName) throws IOException {
229222
}
230223

231224
/**
232-
* Builds a row containing the file path, the serialized FileContext, and the metadata.
225+
* Builds a row to represent the archive file so that it can be opened during the writer phase.
233226
*/
234227
private InternalRow buildSingleRowForArchiveFile() {
235-
ByteArrayOutputStream baos = new ByteArrayOutputStream();
236-
try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
237-
oos.writeObject(this.fileContext);
238-
oos.flush();
239-
} catch (Exception ex) {
240-
String message = String.format("Unable to build row for archive file at %s; cause: %s",
241-
this.currentFilePath, ex.getMessage());
242-
throw new ConnectorException(message, ex);
243-
}
244-
228+
byte[] serializedFileContext = FileUtil.serializeFileContext(fileContext, currentFilePath);
245229
InternalRow row = new DocumentRowBuilder(this.metadataCategories)
246230
.withUri(this.currentFilePath)
247-
.withContent(baos.toByteArray())
231+
.withContent(serializedFileContext)
248232
.buildRow();
249233
this.currentFilePath = null;
250234
return row;

src/main/java/com/marklogic/spark/reader/file/FileContext.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ public FileContext(Map<String, String> properties, SerializableConfiguration had
4040
}
4141
}
4242

43-
boolean isZip() {
43+
public boolean isZip() {
4444
return "zip".equalsIgnoreCase(getStringOption(Options.READ_FILES_COMPRESSION));
4545
}
4646

src/main/java/com/marklogic/spark/reader/file/FileUtil.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,12 @@
33
*/
44
package com.marklogic.spark.reader.file;
55

6+
import com.marklogic.spark.ConnectorException;
7+
68
import java.io.ByteArrayOutputStream;
79
import java.io.IOException;
810
import java.io.InputStream;
11+
import java.io.ObjectOutputStream;
912
import java.util.ArrayList;
1013
import java.util.List;
1114
import java.util.zip.ZipEntry;
@@ -64,4 +67,17 @@ static FilePartition[] makeFilePartitions(String[] files, int numPartitions) {
6467
}
6568
return partitions;
6669
}
70+
71+
static byte[] serializeFileContext(FileContext fileContext, String currentFilePath) {
72+
ByteArrayOutputStream baos = new ByteArrayOutputStream();
73+
try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
74+
oos.writeObject(fileContext);
75+
oos.flush();
76+
return baos.toByteArray();
77+
} catch (Exception ex) {
78+
String message = String.format("Unable to build row for file at %s; cause: %s",
79+
currentFilePath, ex.getMessage());
80+
throw new ConnectorException(message, ex);
81+
}
82+
}
6783
}

src/main/java/com/marklogic/spark/reader/file/GenericFileReader.java

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,8 @@
1111
import org.apache.spark.unsafe.types.ByteArray;
1212
import org.apache.spark.unsafe.types.UTF8String;
1313

14-
import java.io.ByteArrayOutputStream;
1514
import java.io.IOException;
1615
import java.io.InputStream;
17-
import java.io.ObjectOutputStream;
1816

1917
/**
2018
* "Generic" = read each file as-is with no special processing.
@@ -47,7 +45,9 @@ public boolean next() {
4745

4846
filePathIndex++;
4947
try {
50-
byte[] content = this.isStreaming ? serializeFileContext() : readFileIntoByteArray(path);
48+
byte[] content = this.isStreaming ?
49+
FileUtil.serializeFileContext(fileContext, path) :
50+
readFileIntoByteArray(path);
5151

5252
nextRowToReturn = new GenericInternalRow(new Object[]{
5353
UTF8String.fromString(path),
@@ -80,12 +80,4 @@ private byte[] readFileIntoByteArray(String path) throws IOException {
8080
return fileContext.readBytes(inputStream);
8181
}
8282
}
83-
84-
private byte[] serializeFileContext() throws IOException {
85-
ByteArrayOutputStream baos = new ByteArrayOutputStream();
86-
try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
87-
oos.writeObject(fileContext);
88-
}
89-
return baos.toByteArray();
90-
}
9183
}
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
/*
2+
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
3+
*/
4+
package com.marklogic.spark.reader.file;
5+
6+
/**
7+
* Used when streaming from a zip file or archive file.
8+
*/
9+
public enum StreamingMode {
10+
11+
STREAM_DURING_READER_PHASE,
12+
STREAM_DURING_WRITER_PHASE
13+
14+
}

src/main/java/com/marklogic/spark/reader/file/ZipFileReader.java

Lines changed: 66 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33
*/
44
package com.marklogic.spark.reader.file;
55

6+
import com.marklogic.client.io.InputStreamHandle;
67
import com.marklogic.spark.ConnectorException;
8+
import com.marklogic.spark.reader.document.DocumentRowBuilder;
79
import org.apache.commons.crypto.utils.IoUtils;
810
import org.apache.spark.sql.catalyst.InternalRow;
911
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
@@ -14,30 +16,47 @@
1416
import org.slf4j.LoggerFactory;
1517

1618
import java.io.IOException;
19+
import java.util.ArrayList;
1720
import java.util.zip.ZipEntry;
1821
import java.util.zip.ZipInputStream;
1922

2023

21-
class ZipFileReader implements PartitionReader<InternalRow> {
24+
public class ZipFileReader implements PartitionReader<InternalRow> {
2225

2326
private static final Logger logger = LoggerFactory.getLogger(ZipFileReader.class);
2427

2528
private final FilePartition filePartition;
2629
private final FileContext fileContext;
30+
private final StreamingMode streamingMode;
31+
2732
private int nextFilePathIndex;
2833
private String currentFilePath;
2934
private ZipInputStream currentZipInputStream;
3035
private ZipEntry currentZipEntry;
3136

3237
ZipFileReader(FilePartition filePartition, FileContext fileContext) {
38+
this(filePartition, fileContext, fileContext.isStreamingFiles() ? StreamingMode.STREAM_DURING_READER_PHASE : null);
39+
}
40+
41+
public ZipFileReader(FilePartition filePartition, FileContext fileContext, StreamingMode streamingMode) {
3342
this.filePartition = filePartition;
3443
this.fileContext = fileContext;
44+
this.streamingMode = streamingMode;
3545
openNextFile();
3646
}
3747

3848
@Override
39-
public boolean next() throws IOException {
40-
currentZipEntry = FileUtil.findNextFileEntry(currentZipInputStream);
49+
public boolean next() {
50+
if (StreamingMode.STREAM_DURING_READER_PHASE.equals(this.streamingMode)) {
51+
return nextWhileStreamingDuringReaderPhase();
52+
}
53+
54+
try {
55+
currentZipEntry = FileUtil.findNextFileEntry(currentZipInputStream);
56+
} catch (IOException e) {
57+
throw new ConnectorException(String.format(
58+
"Unable to read from zip file %s; cause: %s", currentFilePath, e.getMessage()), e);
59+
}
4160
if (currentZipEntry != null) {
4261
return true;
4362
}
@@ -51,25 +70,56 @@ public boolean next() throws IOException {
5170

5271
@Override
5372
public InternalRow get() {
73+
if (StreamingMode.STREAM_DURING_READER_PHASE.equals(this.streamingMode)) {
74+
return buildRowForZipFile();
75+
}
76+
5477
String zipEntryName = currentZipEntry.getName();
5578
if (logger.isTraceEnabled()) {
5679
logger.trace("Reading zip entry {} from zip file {}.", zipEntryName, this.currentFilePath);
5780
}
5881
String uri = zipEntryName.startsWith("/") ?
5982
this.currentFilePath + zipEntryName :
6083
this.currentFilePath + "/" + zipEntryName;
61-
byte[] content = readZipEntry();
84+
85+
Object content = StreamingMode.STREAM_DURING_WRITER_PHASE.equals(this.streamingMode) ? null
86+
: ByteArray.concat(readZipEntry());
87+
6288
return new GenericInternalRow(new Object[]{
63-
UTF8String.fromString(uri), ByteArray.concat(content),
64-
null, null, null, null, null, null
89+
UTF8String.fromString(uri), content, null, null, null, null, null, null
6590
});
6691
}
6792

93+
/**
94+
* Exposed for {@code ZipFileIterator} to be able to read from the zip stream when it produces a set of
95+
* document inputs.
96+
*
97+
* @return a {@code InputStreamHandle} to avoid reading a content zip entry into memory.
98+
*/
99+
public InputStreamHandle getContentHandleForCurrentZipEntry() {
100+
return new InputStreamHandle(currentZipInputStream);
101+
}
102+
68103
@Override
69104
public void close() {
70105
IoUtils.closeQuietly(this.currentZipInputStream);
71106
}
72107

108+
/**
109+
* Implementation of {@code next()} while streaming during the reader phase, where we don't want to actually read
110+
* anything from a zip file. We just want to build a row per zip file.
111+
*/
112+
private boolean nextWhileStreamingDuringReaderPhase() {
113+
if (currentFilePath != null) {
114+
return true;
115+
}
116+
if (nextFilePathIndex >= filePartition.getPaths().size()) {
117+
return false;
118+
}
119+
openNextFile();
120+
return true;
121+
}
122+
73123
private void openNextFile() {
74124
this.currentFilePath = fileContext.decodeFilePath(filePartition.getPaths().get(nextFilePathIndex));
75125
nextFilePathIndex++;
@@ -84,4 +134,14 @@ private byte[] readZipEntry() {
84134
this.currentFilePath, e.getMessage()), e);
85135
}
86136
}
137+
138+
private InternalRow buildRowForZipFile() {
139+
byte[] serializedFileContext = FileUtil.serializeFileContext(fileContext, currentFilePath);
140+
InternalRow row = new DocumentRowBuilder(new ArrayList<>())
141+
.withUri(this.currentFilePath)
142+
.withContent(serializedFileContext)
143+
.buildRow();
144+
this.currentFilePath = null;
145+
return row;
146+
}
87147
}

src/main/java/com/marklogic/spark/writer/DocumentRowConverter.java

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,9 @@
1313
import com.marklogic.spark.ConnectorException;
1414
import com.marklogic.spark.Options;
1515
import com.marklogic.spark.reader.document.DocumentRowSchema;
16-
import com.marklogic.spark.reader.file.ArchiveFileReader;
17-
import com.marklogic.spark.reader.file.FileContext;
18-
import com.marklogic.spark.reader.file.FilePartition;
16+
import com.marklogic.spark.reader.file.*;
1917
import com.marklogic.spark.writer.file.ArchiveFileIterator;
18+
import com.marklogic.spark.writer.file.ZipFileIterator;
2019
import org.apache.spark.sql.catalyst.InternalRow;
2120

2221
import java.io.ByteArrayInputStream;
@@ -106,8 +105,11 @@ private Iterator<DocBuilder.DocumentInputs> readContentFromFile(String filePath,
106105

107106
if ("archive".equalsIgnoreCase(fileContext.getStringOption(Options.READ_FILES_TYPE))) {
108107
return buildIteratorForArchiveFile(filePath, fileContext);
108+
} else if (fileContext.isZip()) {
109+
return buildIteratorForZipFile(filePath, fileContext);
109110
}
110111

112+
// If it's not an archive or normal zip file, we just have generic files that the user wants to stream.
111113
final String decodedPath = fileContext.decodeFilePath(filePath);
112114
InputStreamHandle streamHandle = new InputStreamHandle(fileContext.openFile(decodedPath));
113115
if (this.documentFormat != null) {
@@ -120,8 +122,14 @@ private Iterator<DocBuilder.DocumentInputs> readContentFromFile(String filePath,
120122
private Iterator<DocBuilder.DocumentInputs> buildIteratorForArchiveFile(String filePath, FileContext fileContext) {
121123
FilePartition filePartition = new FilePartition(Arrays.asList(filePath));
122124
ArchiveFileReader archiveFileReader = new ArchiveFileReader(
123-
filePartition, fileContext, ArchiveFileReader.StreamingMode.STREAM_DURING_WRITER_PHASE
125+
filePartition, fileContext, StreamingMode.STREAM_DURING_WRITER_PHASE
124126
);
125127
return new ArchiveFileIterator(archiveFileReader, this.documentFormat);
126128
}
129+
130+
private Iterator<DocBuilder.DocumentInputs> buildIteratorForZipFile(String filePath, FileContext fileContext) {
131+
FilePartition filePartition = new FilePartition(Arrays.asList(filePath));
132+
ZipFileReader reader = new ZipFileReader(filePartition, fileContext, StreamingMode.STREAM_DURING_WRITER_PHASE);
133+
return new ZipFileIterator(reader, this.documentFormat);
134+
}
127135
}

src/main/java/com/marklogic/spark/writer/file/ArchiveFileIterator.java

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import com.marklogic.client.io.DocumentMetadataHandle;
77
import com.marklogic.client.io.Format;
88
import com.marklogic.client.io.InputStreamHandle;
9+
import com.marklogic.spark.Util;
910
import com.marklogic.spark.reader.document.DocumentRowSchema;
1011
import com.marklogic.spark.reader.file.ArchiveFileReader;
1112
import com.marklogic.spark.writer.DocBuilder;
@@ -20,8 +21,8 @@
2021
*/
2122
public class ArchiveFileIterator implements Iterator<DocBuilder.DocumentInputs> {
2223

23-
private ArchiveFileReader archiveFileReader;
24-
private Format documentFormat;
24+
private final ArchiveFileReader archiveFileReader;
25+
private final Format documentFormat;
2526

2627
public ArchiveFileIterator(ArchiveFileReader archiveFileReader, Format documentFormat) {
2728
this.archiveFileReader = archiveFileReader;
@@ -39,9 +40,12 @@ public boolean hasNext() {
3940
@SuppressWarnings("java:S2272")
4041
public DocBuilder.DocumentInputs next() {
4142
InternalRow row = archiveFileReader.get();
43+
String uri = row.getString(0);
44+
if (Util.MAIN_LOGGER.isDebugEnabled()) {
45+
Util.MAIN_LOGGER.debug("Creating input stream for entry {}", uri);
46+
}
4247
InputStreamHandle contentHandle = archiveFileReader.getContentHandleForCurrentZipEntry();
4348
DocumentMetadataHandle metadata = DocumentRowSchema.makeDocumentMetadata(row);
44-
String uri = row.getString(0);
4549
if (this.documentFormat != null) {
4650
contentHandle.withFormat(this.documentFormat);
4751
}
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
/*
2+
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
3+
*/
4+
package com.marklogic.spark.writer.file;
5+
6+
import com.marklogic.client.io.Format;
7+
import com.marklogic.client.io.InputStreamHandle;
8+
import com.marklogic.spark.Util;
9+
import com.marklogic.spark.reader.file.ZipFileReader;
10+
import com.marklogic.spark.writer.DocBuilder;
11+
import org.apache.spark.sql.catalyst.InternalRow;
12+
13+
import java.util.Iterator;
14+
15+
public class ZipFileIterator implements Iterator<DocBuilder.DocumentInputs> {
16+
17+
private final ZipFileReader zipFileReader;
18+
private final Format documentFormat;
19+
20+
public ZipFileIterator(ZipFileReader zipFileReader, Format documentFormat) {
21+
this.zipFileReader = zipFileReader;
22+
this.documentFormat = documentFormat;
23+
}
24+
25+
@Override
26+
public boolean hasNext() {
27+
return zipFileReader.next();
28+
}
29+
30+
@Override
31+
// Suppressing sonar warning about throwing a NoSuchElementException. We know this is only used by
32+
// DocumentRowConverter, which properly calls hasNext() before calling next().
33+
@SuppressWarnings("java:S2272")
34+
public DocBuilder.DocumentInputs next() {
35+
InternalRow row = zipFileReader.get();
36+
String uri = row.getString(0);
37+
if (Util.MAIN_LOGGER.isDebugEnabled()) {
38+
Util.MAIN_LOGGER.debug("Creating input stream for entry {}", uri);
39+
}
40+
InputStreamHandle contentHandle = zipFileReader.getContentHandleForCurrentZipEntry();
41+
if (this.documentFormat != null) {
42+
contentHandle.withFormat(this.documentFormat);
43+
}
44+
return new DocBuilder.DocumentInputs(uri, contentHandle, null, null);
45+
}
46+
}

0 commit comments

Comments
 (0)