
Commit 59dc9e8

Merge pull request #303 from marklogic/feature/17141-write-stream-zip-file
MLE-17141 Verifying normal zip files can be streamed
2 parents: efcbee1 + 6a24785

File tree: 3 files changed (+37 −2 lines changed)


src/main/java/com/marklogic/spark/MarkLogicFileTable.java

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ class MarkLogicFileTable extends FileTable {
     @Override
     public ScanBuilder newScanBuilder(CaseInsensitiveStringMap options) {
         if ("true".equalsIgnoreCase(options.get(Options.STREAM_FILES)) && Util.MAIN_LOGGER.isInfoEnabled()) {
-            Util.MAIN_LOGGER.info("Will defer reading of file contents so they can be streamed during the writer phase.");
+            Util.MAIN_LOGGER.info("File streaming is enabled; will read files during writer phase.");
         }
         return new FileScanBuilder(options.asCaseSensitiveMap(), super.fileIndex());
     }
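
For context on this change: with STREAM_FILES set, the scan collects file paths up front and defers reading file contents to the writer phase, which is what the reworded log line now says directly. A minimal usage sketch of a read that would trigger this log; the "marklogic" format name and the path are illustrative assumptions (the project's tests use a CONNECTOR_IDENTIFIER constant):

    // Sketch only: enable file streaming on a read so contents are deferred.
    Dataset<Row> files = sparkSession.read()
        .format("marklogic")                  // assumed short name for the connector
        .option(Options.STREAM_FILES, "true") // emits the INFO message above
        .load("path/to/files");               // illustrative path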

src/main/java/com/marklogic/spark/reader/document/DocumentScanBuilder.java

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ class DocumentScanBuilder implements ScanBuilder, SupportsPushDownLimit {
     DocumentScanBuilder(CaseInsensitiveStringMap options, StructType schema) {
         this.context = new DocumentContext(options, schema);
         if (this.context.isStreamingFiles() && Util.MAIN_LOGGER.isInfoEnabled()) {
-            Util.MAIN_LOGGER.info("Will defer reading documents from MarkLogic so they can be streamed to files during the writer phase.");
+            Util.MAIN_LOGGER.info("File streaming is enabled; will read documents from MarkLogic during writer phase.");
         }
     }
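
This is the mirror of the change above: here, streaming defers reading document contents from MarkLogic until the writer phase. The new test below asserts the observable effect, namely that each row carries a URI but a null content column. As an illustrative sketch only, not the connector's actual reader code, a streaming reader can emit rows of this shape:

    // Assumption-labeled sketch of a deferred-content row: URI populated,
    // content column left null for the writer phase to fetch.
    InternalRow row = new GenericInternalRow(new Object[]{
        UTF8String.fromString(uri), // column 0: document URI
        null                        // column 1: content, deferred
    });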

src/test/java/com/marklogic/spark/writer/file/WriteDocumentZipFilesTest.java

Lines changed: 35 additions & 0 deletions
@@ -91,6 +91,41 @@ void opaqueURI(@TempDir @NotNull Path tempDir) throws IOException {
             "'schema-specific part', which is just example/123.xml.");
     }

+    /**
+     * Verifies that streaming documents to a zip file "just works" on account of supporting streaming of archive
+     * files first. The same ZipFileWriter is used for both. The only difference with archive files is that it will
+     * also check for metadata in each Spark row and include a metadata entry in the archive file.
+     *
+     * @param tempDir
+     * @throws Exception
+     */
+    @Test
+    void streamZipFile(@TempDir Path tempDir) throws Exception {
+        Dataset<Row> dataset = newSparkSession().read()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.READ_DOCUMENTS_PARTITIONS_PER_FOREST, 1)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.READ_DOCUMENTS_COLLECTIONS, "author")
+            .option(Options.STREAM_FILES, true)
+            .load();
+
+        assertEquals(15, dataset.count(), "Should have 1 row per author document.");
+        dataset.collectAsList().forEach(row -> {
+            assertFalse(row.isNullAt(0), "The URI column should be non-null.");
+            assertTrue(row.isNullAt(1), "The content column should be empty. The document will be read during the " +
+                "writer phase instead.");
+        });
+
+        dataset.write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.STREAM_FILES, true)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.WRITE_FILES_COMPRESSION, "zip")
+            .mode(SaveMode.Append)
+            .save(tempDir.toFile().getAbsolutePath());
+
+        verifyZipFilesHaveExpectedFilenames(tempDir);
+        verifyZipFilesContainFifteenAuthors(tempDir);
+    }
+
     private Dataset<Row> readAuthorCollection() {
         return newSparkSession().read()
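
The two verify helpers at the end of the new test are defined elsewhere in the test class and are not part of this diff. To make the intent concrete, here is a hedged sketch of the kind of check such a helper might perform using java.util.zip; the method name and logic are assumptions, not the repo's actual helpers:

    import java.io.File;
    import java.util.zip.ZipFile;

    // Hypothetical verification: count entries across all zip files in the
    // output directory and assert one entry per streamed author document.
    private void assertZipEntryCount(File dir, int expectedCount) throws Exception {
        int total = 0;
        for (File file : dir.listFiles((d, name) -> name.endsWith(".zip"))) {
            try (ZipFile zip = new ZipFile(file)) {
                total += zip.size(); // ZipFile.size() returns the number of entries
            }
        }
        assertEquals(expectedCount, total, "Expecting one zip entry per author document.");
    }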
