Commit cba520b

MLE-17095 Can now stream when exporting an archive
Was fairly simple, since `ContentWriter` already supported streaming normal documents.
1 parent e5aa9f6 commit cba520b
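
For context, this is roughly what a streaming archive export looks like from the Spark side with this change (a minimal sketch mirroring the new test in WriteArchiveTest below; the "marklogic" format identifier, connection string, and output path are placeholder assumptions, while the `Options` constants are the ones this repo uses):

import com.marklogic.spark.Options;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class StreamingArchiveExportSketch {
    public static void main(String[] args) {
        SparkSession session = SparkSession.builder().master("local[*]").getOrCreate();

        // Read only document URIs; with STREAM_FILES enabled, content (and metadata) is
        // deferred to the writer phase so large documents are never held in memory.
        Dataset<Row> dataset = session.read()
            .format("marklogic") // placeholder format identifier
            .option(Options.CLIENT_URI, "user:password@localhost:8000") // placeholder connection
            .option(Options.READ_DOCUMENTS_COLLECTIONS, "collection1")
            .option(Options.STREAM_FILES, true)
            .load();

        // Write a zip archive; each document is streamed into its zip entry, and the
        // requested metadata categories produce a "<uri>.metadata" entry per document.
        dataset.write()
            .format("marklogic")
            .option(Options.WRITE_FILES_COMPRESSION, "zip")
            .option(Options.READ_DOCUMENTS_CATEGORIES, "content,collections")
            .option(Options.CLIENT_URI, "user:password@localhost:8000")
            .option(Options.STREAM_FILES, true)
            .mode(SaveMode.Append)
            .save("/tmp/archive"); // placeholder output directory
    }
}

With `STREAM_FILES` set on both the read and the write, the reader emits only document URIs and the writer streams each document's content (and, per this commit, its metadata) directly into the zip entries.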

8 files changed: +110 −36 lines changed

src/main/java/com/marklogic/spark/Util.java

Lines changed: 18 additions & 0 deletions
@@ -3,6 +3,7 @@
  */
 package com.marklogic.spark;
 
+import com.marklogic.client.document.DocumentManager;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -72,4 +73,21 @@ static String getOptionNameForErrorMessage(String option) {
         String optionName = bundle.getString(option);
         return optionName != null && optionName.trim().length() > 0 ? optionName.trim() : option;
     }
+
+    static Set<DocumentManager.Metadata> getRequestedMetadata(ContextSupport context) {
+        Set<DocumentManager.Metadata> set = new HashSet<>();
+        if (context.hasOption(Options.READ_DOCUMENTS_CATEGORIES)) {
+            for (String category : context.getStringOption(Options.READ_DOCUMENTS_CATEGORIES).split(",")) {
+                if ("content".equalsIgnoreCase(category)) {
+                    continue;
+                }
+                if ("metadata".equalsIgnoreCase(category)) {
+                    set.add(DocumentManager.Metadata.ALL);
+                } else {
+                    set.add(DocumentManager.Metadata.valueOf(category.toUpperCase()));
+                }
+            }
+        }
+        return set;
+    }
 }
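
As a quick illustration of the category mapping that `getRequestedMetadata` performs, here is the same logic as a standalone sketch (the `parseCategories` helper and sample values are hypothetical; the real method reads the option from `ContextSupport`):

import com.marklogic.client.document.DocumentManager;
import java.util.HashSet;
import java.util.Set;

public class CategoryMappingSketch {

    // Hypothetical standalone equivalent of the mapping done by Util.getRequestedMetadata.
    static Set<DocumentManager.Metadata> parseCategories(String categories) {
        Set<DocumentManager.Metadata> set = new HashSet<>();
        for (String category : categories.split(",")) {
            if ("content".equalsIgnoreCase(category)) {
                continue; // "content" is not a metadata category, so it is ignored here.
            }
            if ("metadata".equalsIgnoreCase(category)) {
                set.add(DocumentManager.Metadata.ALL); // "metadata" means every metadata category.
            } else {
                set.add(DocumentManager.Metadata.valueOf(category.toUpperCase()));
            }
        }
        return set;
    }

    public static void main(String[] args) {
        System.out.println(parseCategories("content,collections")); // [COLLECTIONS]
        System.out.println(parseCategories("metadata"));            // [ALL]
    }
}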

src/main/java/com/marklogic/spark/reader/document/DocumentContext.java

Lines changed: 0 additions & 20 deletions
@@ -4,16 +4,13 @@
 package com.marklogic.spark.reader.document;
 
 import com.marklogic.client.DatabaseClient;
-import com.marklogic.client.document.DocumentManager;
 import com.marklogic.client.query.SearchQueryDefinition;
 import com.marklogic.spark.ContextSupport;
 import com.marklogic.spark.Options;
 import org.apache.spark.sql.types.StructType;
 import org.apache.spark.sql.util.CaseInsensitiveStringMap;
 
-import java.util.HashSet;
 import java.util.Map;
-import java.util.Set;
 
 class DocumentContext extends ContextSupport {
 
@@ -25,23 +22,6 @@ class DocumentContext extends ContextSupport {
         this.schema = schema;
     }
 
-    Set<DocumentManager.Metadata> getRequestedMetadata() {
-        Set<DocumentManager.Metadata> set = new HashSet<>();
-        if (hasOption(Options.READ_DOCUMENTS_CATEGORIES)) {
-            for (String category : getStringOption(Options.READ_DOCUMENTS_CATEGORIES).split(",")) {
-                if ("content".equalsIgnoreCase(category)) {
-                    continue;
-                }
-                if ("metadata".equalsIgnoreCase(category)) {
-                    set.add(DocumentManager.Metadata.ALL);
-                } else {
-                    set.add(DocumentManager.Metadata.valueOf(category.toUpperCase()));
-                }
-            }
-        }
-        return set;
-    }
-
     boolean contentWasRequested() {
         if (isStreamingFiles()) {
             return false;

src/main/java/com/marklogic/spark/reader/document/DocumentScanBuilder.java

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@
  */
 package com.marklogic.spark.reader.document;
 
-import com.marklogic.spark.Options;
 import com.marklogic.spark.Util;
 import org.apache.spark.sql.connector.read.Scan;
 import org.apache.spark.sql.connector.read.ScanBuilder;

src/main/java/com/marklogic/spark/reader/document/ForestReader.java

Lines changed: 2 additions & 1 deletion
@@ -16,6 +16,7 @@
 import com.marklogic.client.query.StructuredQueryBuilder;
 import com.marklogic.spark.Options;
 import com.marklogic.spark.ReadProgressLogger;
+import com.marklogic.spark.Util;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.connector.read.PartitionReader;
 import org.slf4j.Logger;
@@ -74,7 +75,7 @@ class ForestReader implements PartitionReader<InternalRow> {
         this.documentManager = client.newDocumentManager();
         this.documentManager.setReadTransform(query.getResponseTransform());
         this.contentWasRequested = context.contentWasRequested();
-        this.requestedMetadata = context.getRequestedMetadata();
+        this.requestedMetadata = Util.getRequestedMetadata(context);
         this.documentManager.setMetadataCategories(this.requestedMetadata);
         this.queryBuilder = client.newQueryManager().newStructuredQueryBuilder();
     }

src/main/java/com/marklogic/spark/reader/file/GenericFileReader.java

Lines changed: 0 additions & 1 deletion
@@ -4,7 +4,6 @@
 package com.marklogic.spark.reader.file;
 
 import com.marklogic.spark.ConnectorException;
-import com.marklogic.spark.Options;
 import com.marklogic.spark.Util;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;

src/main/java/com/marklogic/spark/writer/file/ContentWriter.java

Lines changed: 26 additions & 2 deletions
@@ -6,10 +6,12 @@
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.marklogic.client.document.GenericDocumentManager;
+import com.marklogic.client.io.DocumentMetadataHandle;
 import com.marklogic.client.io.InputStreamHandle;
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.ContextSupport;
 import com.marklogic.spark.Options;
+import com.marklogic.spark.Util;
 import com.marklogic.spark.reader.document.DocumentRowSchema;
 import org.apache.commons.io.IOUtils;
 import org.apache.spark.sql.catalyst.InternalRow;
@@ -38,7 +40,7 @@ class ContentWriter {
     private final Charset encoding;
 
     private final boolean isStreamingFiles;
-    // Only set when streaming.
+    // Only used when streaming.
     private final GenericDocumentManager documentManager;
 
     ContentWriter(Map<String, String> properties) {
@@ -54,7 +56,14 @@ class ContentWriter {
         }
 
         this.isStreamingFiles = context.isStreamingFiles();
-        this.documentManager = this.isStreamingFiles ? context.connectToMarkLogic().newDocumentManager() : null;
+        if (this.isStreamingFiles) {
+            this.documentManager = context.connectToMarkLogic().newDocumentManager();
+            if (context.hasOption(Options.READ_DOCUMENTS_CATEGORIES)) {
+                this.documentManager.setMetadataCategories(Util.getRequestedMetadata(context));
+            }
+        } else {
+            this.documentManager = null;
+        }
     }
 
     void writeContent(InternalRow row, OutputStream outputStream) throws IOException {
@@ -73,6 +82,21 @@ void writeContent(InternalRow row, OutputStream outputStream) throws IOException
 
     void writeMetadata(InternalRow row, OutputStream outputStream) throws IOException {
         String metadataXml = DocumentRowSchema.makeDocumentMetadata(row).toString();
+        writeMetadata(metadataXml, outputStream);
+    }
+
+    /**
+     * When streaming documents to an archive, the metadata unfortunately has to be retrieved in a separate request
+     * per document. This is due to the Java Client hardcoding "content" as a category in a POST to v1/search. A
+     * future fix to the Java Client to not hardcode this will allow for the metadata to be retrieved during the
+     * reader phase.
+     */
+    void writeMetadataWhileStreaming(String documentUri, OutputStream outputStream) throws IOException {
+        DocumentMetadataHandle metadata = this.documentManager.readMetadata(documentUri, new DocumentMetadataHandle());
+        writeMetadata(metadata.toString(), outputStream);
+    }
+
+    private void writeMetadata(String metadataXml, OutputStream outputStream) throws IOException {
         // Must honor the encoding here as well, as a user could easily have values that require encoding in metadata
         // values or in a properties fragment.
         if (this.encoding != null) {
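
For readers less familiar with the Java Client, the per-document fetch performed by `writeMetadataWhileStreaming` amounts to the following (a minimal sketch; the connection details and document URI are placeholders, and the connector reuses its already-configured client rather than creating a new one):

import com.marklogic.client.DatabaseClient;
import com.marklogic.client.DatabaseClientFactory;
import com.marklogic.client.document.DocumentManager;
import com.marklogic.client.document.GenericDocumentManager;
import com.marklogic.client.io.DocumentMetadataHandle;

public class ReadMetadataSketch {
    public static void main(String[] args) {
        // Placeholder connection; ContentWriter builds its client from the connector options.
        DatabaseClient client = DatabaseClientFactory.newClient("localhost", 8000,
            new DatabaseClientFactory.DigestAuthContext("user", "password"));
        try {
            GenericDocumentManager documentManager = client.newDocumentManager();

            // Limit what the server returns, mirroring setMetadataCategories(...) in the constructor above.
            documentManager.setMetadataCategories(DocumentManager.Metadata.COLLECTIONS);

            // One extra request per document while streaming; toString() yields the rapi:metadata XML
            // that ends up in the "<uri>.metadata" zip entry.
            DocumentMetadataHandle metadata =
                documentManager.readMetadata("/example/doc1.json", new DocumentMetadataHandle());
            System.out.println(metadata.toString());
        } finally {
            client.release();
        }
    }
}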

src/main/java/com/marklogic/spark/writer/file/ZipFileWriter.java

Lines changed: 19 additions & 9 deletions
@@ -4,6 +4,8 @@
 package com.marklogic.spark.writer.file;
 
 import com.marklogic.spark.ConnectorException;
+import com.marklogic.spark.ContextSupport;
+import com.marklogic.spark.Options;
 import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -26,7 +28,7 @@ public class ZipFileWriter implements DataWriter<InternalRow> {
 
     private static final Logger logger = LoggerFactory.getLogger(ZipFileWriter.class);
 
-    private final Map<String, String> properties;
+    private final ContextSupport context;
     private final SerializableConfiguration hadoopConfiguration;
 
     private final String zipPath;
@@ -44,7 +46,7 @@ public class ZipFileWriter implements DataWriter<InternalRow> {
     public ZipFileWriter(String path, Map<String, String> properties, SerializableConfiguration hadoopConfiguration,
                          int partitionId, boolean createZipFileImmediately) {
         this.zipPath = makeFilePath(path, partitionId);
-        this.properties = properties;
+        this.context = new ContextSupport(properties);
         this.hadoopConfiguration = hadoopConfiguration;
         if (createZipFileImmediately) {
             createZipFileAndContentWriter();
@@ -56,15 +58,11 @@ public void write(InternalRow row) throws IOException {
         if (contentWriter == null) {
             createZipFileAndContentWriter();
         }
+
         final String uri = row.getString(0);
         final String entryName = FileUtil.makePathFromDocumentURI(uri);
 
-        if (hasMetadata(row)) {
-            zipOutputStream.putNextEntry(new ZipEntry(entryName + ".metadata"));
-            this.contentWriter.writeMetadata(row, zipOutputStream);
-            zipEntryCounter++;
-        }
-
+        writeMetadataEntryIfNecessary(row, uri, entryName);
         zipOutputStream.putNextEntry(new ZipEntry(entryName));
         this.contentWriter.writeContent(row, zipOutputStream);
         zipEntryCounter++;
@@ -90,7 +88,7 @@ private void createZipFileAndContentWriter() {
         if (logger.isDebugEnabled()) {
             logger.debug("Will write to: {}", filePath);
         }
-        this.contentWriter = new ContentWriter(properties);
+        this.contentWriter = new ContentWriter(context.getProperties());
         try {
             FileSystem fileSystem = filePath.getFileSystem(hadoopConfiguration.value());
             fileSystem.setWriteChecksum(false);
@@ -100,6 +98,18 @@ private void createZipFileAndContentWriter() {
         }
     }
 
+    private void writeMetadataEntryIfNecessary(InternalRow row, String uri, String entryName) throws IOException {
+        if (this.context.isStreamingFiles() && context.hasOption(Options.READ_DOCUMENTS_CATEGORIES)) {
+            zipOutputStream.putNextEntry(new ZipEntry(entryName + ".metadata"));
+            this.contentWriter.writeMetadataWhileStreaming(uri, zipOutputStream);
+            zipEntryCounter++;
+        } else if (hasMetadata(row)) {
+            zipOutputStream.putNextEntry(new ZipEntry(entryName + ".metadata"));
+            this.contentWriter.writeMetadata(row, zipOutputStream);
+            zipEntryCounter++;
+        }
+    }
+
     private boolean hasMetadata(InternalRow row) {
         return !row.isNullAt(3) || !row.isNullAt(4) || !row.isNullAt(5) || !row.isNullAt(6) || !row.isNullAt(7);
     }
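
When metadata is requested, the net effect on the archive layout is a "<uri>.metadata" entry written immediately before each content entry. A throwaway sketch for inspecting a produced zip (the path and file name are placeholders; ZipFileWriter chooses its own file names):

import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;

public class InspectArchiveSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder path to a zip produced by ZipFileWriter.
        try (ZipFile zip = new ZipFile("/tmp/archive/part-0.zip")) {
            // Expected pattern per document: "<uri>.metadata" followed by "<uri>".
            zip.stream().map(ZipEntry::getName).forEach(System.out::println);
        }
    }
}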

src/test/java/com/marklogic/spark/writer/file/WriteArchiveTest.java

Lines changed: 45 additions & 2 deletions
@@ -7,19 +7,21 @@
 import com.marklogic.spark.AbstractIntegrationTest;
 import com.marklogic.spark.Options;
 import com.marklogic.spark.TestUtil;
+import com.marklogic.spark.reader.document.DocumentRowSchema;
+import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SaveMode;
 import org.jdom2.Namespace;
 import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.io.TempDir;
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.ValueSource;
 
 import java.nio.file.Path;
 import java.util.List;
 
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.*;
 
 class WriteArchiveTest extends AbstractIntegrationTest {
 
@@ -55,6 +57,38 @@ void writeAllMetadata(String metadata, @TempDir Path tempDir) {
         verifyMetadataFiles(tempDir, metadata);
     }
 
+    @Test
+    void streaming(@TempDir Path tempDir) {
+        Dataset<Row> dataset = newSparkSession().read()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.READ_DOCUMENTS_COLLECTIONS, "collection1")
+            .option(Options.STREAM_FILES, true)
+            .load();
+
+        dataset.collectAsList().forEach(row -> {
+            assertNotNull(row.getString(0), "The URI column should have the URI of the document to retrieve during the writer phase.");
+            for (int i = 1; i < DocumentRowSchema.SCHEMA.size(); i++) {
+                assertTrue(row.isNullAt(i), "Every other column in the row should be null. We don't want the content, " +
+                    "as that will be retrieved by the writer. And we unfortunately can't get the metadata without " +
+                    "getting the content as well via a POST to v1/documents. So the writer will get the metadata " +
+                    "as well.");
+            }
+        });
+
+        dataset.repartition(1)
+            .write()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.WRITE_FILES_COMPRESSION, "zip")
+            .option(Options.READ_DOCUMENTS_CATEGORIES, "content,collections")
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.STREAM_FILES, true)
+            .mode(SaveMode.Append)
+            .save(tempDir.toFile().getAbsolutePath());
+
+        verifyMetadataFiles(tempDir, "collections");
+    }
+
     private void verifyMetadataFiles(Path tempDir, String metadataValue) {
         final List<Row> rows = newSparkSession().read()
             .format(CONNECTOR_IDENTIFIER)
@@ -96,18 +130,22 @@ private void verifyMetadata(Row row, String metadataValue) {
         switch (metadataValue) {
             case "collections":
                 verifyCollections(metadata);
+                verifyPermissionsMissing(metadata);
                 break;
             case "permissions":
                 verifyPermissions(metadata);
                 break;
             case "quality":
                 verifyQuality(metadata);
+                verifyPermissionsMissing(metadata);
                 break;
             case "properties":
                 verifyProperties(metadata);
+                verifyPermissionsMissing(metadata);
                 break;
             case "metadatavalues":
                 verifyMetadataValues(metadata);
+                verifyPermissionsMissing(metadata);
                 break;
             case "metadata":
                 verifyCollections(metadata);
@@ -131,6 +169,11 @@ private void verifyPermissions(XmlNode metadata) {
         metadata.assertElementExists(path + "[rapi:role-name = 'qconsole-user' and rapi:capability='read']");
     }
 
+    private void verifyPermissionsMissing(XmlNode metadata) {
+        metadata.assertElementMissing("Permissions should not exist since they were not in the set of " +
+            "metadata categories.", "/rapi:metadata/rapi:permissions");
+    }
+
     private void verifyQuality(XmlNode metadata) {
         metadata.assertElementValue("/rapi:metadata/rapi:quality", "10");
     }
