Commit 9ea7ce0

Merge pull request #298 from marklogic/feature/17095-add-iterator-after-swap

MLE-17095 Can now stream an archive into MarkLogic

2 parents: 5aaf883 + 0c2d09b

7 files changed: +340 additions, −26 deletions
src/main/java/com/marklogic/spark/reader/file/ArchiveFileReader.java

Lines changed: 112 additions & 12 deletions
@@ -4,6 +4,7 @@
 package com.marklogic.spark.reader.file;
 
 import com.marklogic.client.io.DocumentMetadataHandle;
+import com.marklogic.client.io.InputStreamHandle;
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.Options;
 import com.marklogic.spark.Util;
@@ -12,17 +13,20 @@
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.connector.read.PartitionReader;
 
+import java.io.ByteArrayOutputStream;
 import java.io.IOException;
+import java.io.ObjectOutputStream;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.zip.ZipEntry;
 import java.util.zip.ZipInputStream;
 
-class ArchiveFileReader implements PartitionReader<InternalRow> {
+public class ArchiveFileReader implements PartitionReader<InternalRow> {
 
     private final FilePartition filePartition;
     private final FileContext fileContext;
     private final List<String> metadataCategories;
+    private final StreamingMode streamingMode;
 
     private String currentFilePath;
     private ZipInputStream currentZipInputStream;
@@ -32,9 +36,25 @@ class ArchiveFileReader implements PartitionReader<InternalRow> {
     // Legacy = content first, then metadata.
     private Boolean isLegacyFormat;
 
+    public enum StreamingMode {
+        STREAM_DURING_READER_PHASE,
+        STREAM_DURING_WRITER_PHASE
+    }
+
     ArchiveFileReader(FilePartition filePartition, FileContext fileContext) {
+        this(
+            filePartition, fileContext,
+            // Will refactor this later to avoid duplication of this comparison.
+            // Should be a nice little method in FileContext.
+            "true".equalsIgnoreCase(fileContext.getStringOption(Options.STREAM_FILES)) ? StreamingMode.STREAM_DURING_READER_PHASE : null
+        );
+    }
+
+    public ArchiveFileReader(FilePartition filePartition, FileContext fileContext, StreamingMode streamingMode) {
         this.filePartition = filePartition;
         this.fileContext = fileContext;
+        this.streamingMode = streamingMode;
+
         this.metadataCategories = new ArrayList<>();
         if (fileContext.hasOption(Options.READ_ARCHIVES_CATEGORIES)) {
             for (String category : fileContext.getStringOption(Options.READ_ARCHIVES_CATEGORIES).split(",")) {
@@ -47,6 +67,10 @@ class ArchiveFileReader implements PartitionReader<InternalRow> {
 
     @Override
     public boolean next() {
+        if (StreamingMode.STREAM_DURING_READER_PHASE.equals(this.streamingMode)) {
+            return nextWhileStreamingDuringReaderPhase();
+        }
+
         try {
             ZipEntry nextZipEntry = FileUtil.findNextFileEntry(currentZipInputStream);
             if (nextZipEntry == null) {
@@ -55,6 +79,7 @@ public boolean next() {
 
             if (isLegacyFormat == null) {
                 isLegacyFormat = !nextZipEntry.getName().endsWith(".metadata");
+                logArchiveFormat();
             }
 
             return isLegacyFormat ? readContentFollowedByMetadata(nextZipEntry) : readMetadataFollowedByContent();
@@ -70,14 +95,52 @@ public boolean next() {
 
     @Override
     public InternalRow get() {
-        return nextRowToReturn;
+        return StreamingMode.STREAM_DURING_READER_PHASE.equals(this.streamingMode) ?
+            buildSingleRowForArchiveFile() :
+            nextRowToReturn;
     }
 
     @Override
     public void close() {
         IOUtils.closeQuietly(this.currentZipInputStream);
     }
 
+    /**
+     * Exposed for {@code ArchiveFileIterator} to be able to read from the zip stream when it produces a set of
+     * document inputs.
+     *
+     * @return a {@code InputStreamHandle} to avoid reading a content zip entry into memory.
+     */
+    public InputStreamHandle getContentHandleForCurrentZipEntry() {
+        return new InputStreamHandle(currentZipInputStream);
+    }
+
+    private void logArchiveFormat() {
+        if (Util.MAIN_LOGGER.isInfoEnabled() && isLegacyFormat) {
+            Util.MAIN_LOGGER.info("Archive {} uses Flux 1.0 format, will read content and then metadata.", this.currentFilePath);
+        }
+        if (Util.MAIN_LOGGER.isDebugEnabled() && !isLegacyFormat.booleanValue()) {
+            Util.MAIN_LOGGER.debug("Archive {} uses Flux 1.1+ format, will read metadata and then content.", this.currentFilePath);
+        }
+    }
+
+    /**
+     * Implementation of {@code next()} while streaming during the reader phase, where we don't want to actually read
+     * anything from a zip file. We just want to build a row per zip file.
+     *
+     * @return
+     */
+    private boolean nextWhileStreamingDuringReaderPhase() {
+        if (currentFilePath != null) {
+            return true;
+        }
+        if (nextFilePathIndex >= filePartition.getPaths().size()) {
+            return false;
+        }
+        openNextFile();
+        return true;
+    }
+
     /**
      * This is the Flux 1.0 "legacy" approach, where content was written first, followed by metadata. This does not
      * support streaming.
@@ -87,16 +150,15 @@ private boolean readContentFollowedByMetadata(ZipEntry contentZipEntry) throws I
         if (content == null || content.length == 0) {
             return openNextFileAndReadNextEntry();
         }
-        final String zipEntryName = contentZipEntry.getName();
 
+        final String zipEntryName = contentZipEntry.getName();
         byte[] metadataBytes = readMetadataEntry(zipEntryName);
         if (metadataBytes == null || metadataBytes.length == 0) {
             return openNextFileAndReadNextEntry();
         }
 
         DocumentMetadataHandle metadata = new DocumentMetadataHandle();
         metadata.fromBuffer(metadataBytes);
-
         this.nextRowToReturn = new DocumentRowBuilder(this.metadataCategories)
             .withUri(zipEntryName).withContent(content).withMetadata(metadata)
             .buildRow();
@@ -105,29 +167,45 @@ private boolean readContentFollowedByMetadata(ZipEntry contentZipEntry) throws I
 
     /**
      * This is the Flux 1.1+ approach, where the metadata entry is written first. This supports streaming.
+     * <p>
+     * This is where we implement streaming-during-write-to-MarkLogic. We read the metadata entry as normal - good.
+     * Then we build everything in our row except the content.
      */
     private boolean readMetadataFollowedByContent() throws IOException {
         byte[] metadataBytes = fileContext.readBytes(currentZipInputStream);
         if (metadataBytes == null || metadataBytes.length == 0) {
             return openNextFileAndReadNextEntry();
         }
 
-        ZipEntry contentZipEntry = FileUtil.findNextFileEntry(currentZipInputStream);
-        byte[] content = fileContext.readBytes(currentZipInputStream);
-
         DocumentMetadataHandle metadata = new DocumentMetadataHandle();
         metadata.fromBuffer(metadataBytes);
-        this.nextRowToReturn = new DocumentRowBuilder(this.metadataCategories)
+
+        // We still do this to get the stream ready to read the next entry.
+        ZipEntry contentZipEntry = FileUtil.findNextFileEntry(currentZipInputStream);
+
+        DocumentRowBuilder rowBuilder = new DocumentRowBuilder(this.metadataCategories)
             .withUri(contentZipEntry.getName())
-            .withContent(content).withMetadata(metadata)
-            .buildRow();
+            .withMetadata(metadata);
+
+        if (!StreamingMode.STREAM_DURING_WRITER_PHASE.equals(this.streamingMode)) {
+            byte[] content = fileContext.readBytes(currentZipInputStream);
+            rowBuilder = rowBuilder.withContent(content);
+        }
+
+        this.nextRowToReturn = rowBuilder.buildRow();
        return true;
     }
 
     private void openNextFile() {
-        this.currentFilePath = fileContext.decodeFilePath(filePartition.getPaths().get(nextFilePathIndex));
+        final boolean isStreamingDuringRead = StreamingMode.STREAM_DURING_READER_PHASE.equals(this.streamingMode);
+        final String nextFilePath = filePartition.getPaths().get(nextFilePathIndex);
+
+        this.currentFilePath = isStreamingDuringRead ? nextFilePath : fileContext.decodeFilePath(nextFilePath);
         nextFilePathIndex++;
-        this.currentZipInputStream = new ZipInputStream(fileContext.openFile(this.currentFilePath));
+
+        if (!isStreamingDuringRead) {
+            this.currentZipInputStream = new ZipInputStream(fileContext.openFile(this.currentFilePath));
+        }
     }
 
     private boolean openNextFileAndReadNextEntry() {
@@ -151,4 +229,26 @@ private byte[] readMetadataEntry(String zipEntryName) throws IOException {
         }
         return fileContext.readBytes(currentZipInputStream);
     }
+
+    /**
+     * Builds a row containing the file path, the serialized FileContext, and the metadata.
+     */
+    private InternalRow buildSingleRowForArchiveFile() {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
+            oos.writeObject(this.fileContext);
+            oos.flush();
+        } catch (Exception ex) {
+            String message = String.format("Unable to build row for archive file at %s; cause: %s",
+                this.currentFilePath, ex.getMessage());
+            throw new ConnectorException(message, ex);
+        }
+
+        InternalRow row = new DocumentRowBuilder(this.metadataCategories)
+            .withUri(this.currentFilePath)
+            .withContent(baos.toByteArray())
+            .buildRow();
+        this.currentFilePath = null;
+        return row;
+    }
 }

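The hand-off that makes the reader phase cheap is that buildSingleRowForArchiveFile() never touches the zip: the row's URI column holds the file path and its content column holds a serialized FileContext. A minimal sketch of that serialization step, assuming only that FileContext is java.io.Serializable (implied by the ObjectOutputStream usage above); the class and method names here are illustrative:

import com.marklogic.spark.reader.file.FileContext;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;

public class ReaderPhaseRowSketch {

    // Mirrors what buildSingleRowForArchiveFile() puts in the row's content column: the
    // serialized FileContext, not any bytes from the archive itself. The URI column holds
    // the (still encoded) file path, and no zip entry is read during the reader phase.
    static byte[] serializeFileContext(FileContext fileContext) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try (ObjectOutputStream oos = new ObjectOutputStream(baos)) {
            oos.writeObject(fileContext);
        }
        return baos.toByteArray();
    }
}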
src/main/java/com/marklogic/spark/writer/DocumentRowConverter.java

Lines changed: 28 additions & 13 deletions
@@ -13,13 +13,17 @@
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.Options;
 import com.marklogic.spark.reader.document.DocumentRowSchema;
+import com.marklogic.spark.reader.file.ArchiveFileReader;
 import com.marklogic.spark.reader.file.FileContext;
+import com.marklogic.spark.reader.file.FilePartition;
+import com.marklogic.spark.writer.file.ArchiveFileIterator;
 import org.apache.spark.sql.catalyst.InternalRow;
 
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.ObjectInputStream;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.stream.Stream;
@@ -93,20 +97,31 @@ private JsonNode deserializeContentToJson(String initialUri, BytesHandle content
      */
     private Iterator<DocBuilder.DocumentInputs> readContentFromFile(String filePath, InternalRow row) {
         byte[] bytes = row.getBinary(1);
-        String filePathInErrorMessage = filePath;
-        try {
-            ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes));
-            FileContext fileContext = (FileContext) ois.readObject();
-            final String decodedPath = fileContext.decodeFilePath(filePath);
-            filePathInErrorMessage = decodedPath;
-            InputStreamHandle streamHandle = new InputStreamHandle(fileContext.openFile(decodedPath));
-            if (this.documentFormat != null) {
-                streamHandle.withFormat(this.documentFormat);
-            }
-            DocumentMetadataHandle metadata = DocumentRowSchema.makeDocumentMetadata(row);
-            return Stream.of(new DocBuilder.DocumentInputs(filePath, streamHandle, null, metadata)).iterator();
+        FileContext fileContext;
+        try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes))) {
+            fileContext = (FileContext) ois.readObject();
         } catch (Exception e) {
-            throw new ConnectorException(String.format("Unable to read from file %s; cause: %s", filePathInErrorMessage, e.getMessage()));
+            throw new ConnectorException(String.format("Unable to read from file %s; cause: %s", filePath, e.getMessage()));
         }
+
+        if ("archive".equalsIgnoreCase(fileContext.getStringOption(Options.READ_FILES_TYPE))) {
+            return buildIteratorForArchiveFile(filePath, fileContext);
+        }
+
+        final String decodedPath = fileContext.decodeFilePath(filePath);
+        InputStreamHandle streamHandle = new InputStreamHandle(fileContext.openFile(decodedPath));
+        if (this.documentFormat != null) {
+            streamHandle.withFormat(this.documentFormat);
+        }
+        DocumentMetadataHandle metadata = DocumentRowSchema.makeDocumentMetadata(row);
+        return Stream.of(new DocBuilder.DocumentInputs(filePath, streamHandle, null, metadata)).iterator();
+    }
+
+    private Iterator<DocBuilder.DocumentInputs> buildIteratorForArchiveFile(String filePath, FileContext fileContext) {
+        FilePartition filePartition = new FilePartition(Arrays.asList(filePath));
+        ArchiveFileReader archiveFileReader = new ArchiveFileReader(
+            filePartition, fileContext, ArchiveFileReader.StreamingMode.STREAM_DURING_WRITER_PHASE
+        );
+        return new ArchiveFileIterator(archiveFileReader, this.documentFormat);
     }
 }

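On the writer side, readContentFromFile() now restores the FileContext from the row's binary content column and branches: rows that originated from an archive are expanded into many documents via ArchiveFileIterator, while other rows keep the single-document path. A small sketch of that restore-and-dispatch step; the helper names are illustrative, and the "archive" value and Options.READ_FILES_TYPE key come from the diff above:

import com.marklogic.spark.Options;
import com.marklogic.spark.reader.file.FileContext;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;

public class WriterPhaseDispatchSketch {

    // Restores the FileContext that the reader phase serialized into the row's content column.
    static FileContext restoreFileContext(byte[] rowContent) throws IOException, ClassNotFoundException {
        try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(rowContent))) {
            return (FileContext) ois.readObject();
        }
    }

    // Mirrors the new branch in readContentFromFile: archive rows are expanded into many
    // documents via ArchiveFileIterator, other rows keep the one-document-per-row path.
    static boolean isArchiveRow(FileContext fileContext) {
        return "archive".equalsIgnoreCase(fileContext.getStringOption(Options.READ_FILES_TYPE));
    }
}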
src/main/java/com/marklogic/spark/writer/WriteBatcherDataWriter.java

Lines changed: 10 additions & 1 deletion
@@ -257,7 +257,7 @@ private void closeArchiveWriter() {
      * @param writeOp
      */
     private void writeDocumentViaPutOperation(DocumentWriteOperation writeOp) {
-        final String uri = writeOp.getUri();
+        final String uri = replaceSpacesInUriForPutEndpoint(writeOp.getUri());
         try {
             this.documentManager.write(uri, writeOp.getMetadata(), (GenericWriteHandle) writeOp.getContent());
             this.successItemCount.incrementAndGet();
@@ -267,6 +267,15 @@ private void writeDocumentViaPutOperation(DocumentWriteOperation writeOp) {
         }
     }
 
+    /**
+     * Sigh. Using URLEncoder.encode will convert forward slashes into "%2F", which a user almost certainly does not
+     * want, since those are meaningful in MarkLogic URIs. The main problem to address with the PUT endpoint is that it
+     * erroneously does not accept spaces (see MLE-17088). So this simply replaces spaces.
+     */
+    private String replaceSpacesInUriForPutEndpoint(String uri) {
+        return uri.replace(" ", "%20");
+    }
+
     private void captureFailure(String message, String documentUri) {
         Util.MAIN_LOGGER.error("Unable to write document with URI: {}; cause: {}", documentUri, message);
         failedItemCount.incrementAndGet();
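The WriteBatcherDataWriter change is narrower: before calling the PUT endpoint, spaces in the URI are replaced with "%20". A quick illustration, not part of the commit, of why the helper avoids URLEncoder.encode: full encoding would also turn the slashes that are meaningful in MarkLogic URIs into "%2F" and would encode spaces as "+":

import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;

public class UriEncodingSketch {
    public static void main(String[] args) {
        String uri = "/my dir/doc 1.json";

        // What writeDocumentViaPutOperation now does before calling the PUT endpoint:
        String spacesOnly = uri.replace(" ", "%20");
        System.out.println(spacesOnly);   // /my%20dir/doc%201.json

        // What full URL-encoding would do: slashes become %2F and spaces become "+".
        String fullyEncoded = URLEncoder.encode(uri, StandardCharsets.UTF_8);
        System.out.println(fullyEncoded); // %2Fmy+dir%2Fdoc+1.json
    }
}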
src/main/java/com/marklogic/spark/writer/file/ArchiveFileIterator.java

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
+ */
+package com.marklogic.spark.writer.file;
+
+import com.marklogic.client.io.DocumentMetadataHandle;
+import com.marklogic.client.io.Format;
+import com.marklogic.client.io.InputStreamHandle;
+import com.marklogic.spark.reader.document.DocumentRowSchema;
+import com.marklogic.spark.reader.file.ArchiveFileReader;
+import com.marklogic.spark.writer.DocBuilder;
+import org.apache.spark.sql.catalyst.InternalRow;
+
+import java.util.Iterator;
+
+/**
+ * Provides an {@code Iterator} interface on top of an {@code ArchiveFileReader}, thereby allowing a
+ * {@code DocumentRowConverter} to build sets of document inputs from an archive file without reading any content entry
+ * into memory - thus supporting streaming of an archive.
+ */
+public class ArchiveFileIterator implements Iterator<DocBuilder.DocumentInputs> {
+
+    private ArchiveFileReader archiveFileReader;
+    private Format documentFormat;
+
+    public ArchiveFileIterator(ArchiveFileReader archiveFileReader, Format documentFormat) {
+        this.archiveFileReader = archiveFileReader;
+        this.documentFormat = documentFormat;
+    }
+
+    @Override
+    public boolean hasNext() {
+        return archiveFileReader.next();
+    }
+
+    @Override
+    // Suppressing sonar warning about throwing a NoSuchElementException. We know this is only used by
+    // DocumentRowConverter, which properly calls hasNext() before calling next().
+    @SuppressWarnings("java:S2272")
+    public DocBuilder.DocumentInputs next() {
+        InternalRow row = archiveFileReader.get();
+        InputStreamHandle contentHandle = archiveFileReader.getContentHandleForCurrentZipEntry();
+        DocumentMetadataHandle metadata = DocumentRowSchema.makeDocumentMetadata(row);
+        String uri = row.getString(0);
+        if (this.documentFormat != null) {
+            contentHandle.withFormat(this.documentFormat);
+        }
+        return new DocBuilder.DocumentInputs(uri, contentHandle, null, metadata);
+    }
+}

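Tying the pieces together, buildIteratorForArchiveFile() in DocumentRowConverter constructs exactly this iterator over an ArchiveFileReader in STREAM_DURING_WRITER_PHASE mode. A hedged sketch of how the iterator is wired up and drained — in the connector the draining is done by the write pipeline, so the loop and the method name here are illustrative only:

import com.marklogic.client.io.Format;
import com.marklogic.spark.reader.file.ArchiveFileReader;
import com.marklogic.spark.reader.file.FileContext;
import com.marklogic.spark.reader.file.FilePartition;
import com.marklogic.spark.writer.DocBuilder;
import com.marklogic.spark.writer.file.ArchiveFileIterator;

import java.util.Arrays;
import java.util.Iterator;

public class ArchiveIteratorSketch {

    // One archive file path becomes a single-path FilePartition; the reader is put in
    // writer-phase streaming mode so content entries stay on the zip stream.
    static void streamEachEntry(String archiveFilePath, FileContext fileContext, Format format) {
        FilePartition partition = new FilePartition(Arrays.asList(archiveFilePath));
        ArchiveFileReader reader = new ArchiveFileReader(
            partition, fileContext, ArchiveFileReader.StreamingMode.STREAM_DURING_WRITER_PHASE
        );
        Iterator<DocBuilder.DocumentInputs> documents = new ArchiveFileIterator(reader, format);
        while (documents.hasNext()) {
            DocBuilder.DocumentInputs doc = documents.next();
            // Each DocumentInputs holds an InputStreamHandle over the current zip entry,
            // so the entry's content is never buffered in memory before being written.
        }
    }
}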