Commit b194f06

Merge pull request #253 from marklogic/feature/naked-properties
MLE-13889 Can now read and write naked properties
2 parents cab2669 + 9dece45 commit b194f06
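
For orientation, here is a minimal sketch of the round trip this merge enables, modeled on the new tests in this diff. The connection string and archive path are hypothetical placeholders, not values from the PR; the option keys come from the connector's com.marklogic.spark.Options constants as used in the tests below.

import com.marklogic.spark.Options;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class NakedPropertiesRoundTrip {
    public static void main(String[] args) {
        SparkSession session = SparkSession.builder().master("local[*]").getOrCreate();

        // Read an MLCP archive; entries like "example.xml.naked" (a properties
        // fragment with no document content) now produce rows with null content.
        Dataset<Row> rows = session.read().format("marklogic")
            .option(Options.READ_FILES_TYPE, "mlcp_archive")
            .load("path/to/archive.zip"); // hypothetical path

        // As of this change, rows with null content are written as naked
        // properties fragments instead of being skipped with a warning.
        rows.write().format("marklogic")
            .option(Options.CLIENT_URI, "spark-user:password@localhost:8000") // hypothetical
            .option(Options.WRITE_COLLECTIONS, "naked")
            .option(Options.WRITE_PERMISSIONS, "spark-user-role,read,spark-user-role,update")
            .mode(SaveMode.Append)
            .save();
    }
}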

File tree: 13 files changed, +144 −72 lines

build.gradle

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ dependencies {
         exclude module: "rocksdbjni"
     }

-    shadowDependencies ("com.marklogic:marklogic-client-api:6.6.0") {
+    shadowDependencies ("com.marklogic:marklogic-client-api:6.6.1") {
         // The Java Client uses Jackson 2.15.2; Scala 3.4.x does not yet support that and will throw the following error:
         // Scala module 2.14.2 requires Jackson Databind version >= 2.14.0 and < 2.15.0 - Found jackson-databind version 2.15.2
         // So the 4 Jackson modules are excluded to allow for Spark's to be used.

src/main/java/com/marklogic/spark/reader/file/MlcpArchiveFileReader.java

Lines changed: 1 addition & 7 deletions
@@ -1,6 +1,5 @@
 package com.marklogic.spark.reader.file;

-import com.marklogic.client.io.DocumentMetadataHandle;
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.Options;
 import com.marklogic.spark.Util;
@@ -200,14 +199,9 @@ private boolean readNakedEntry(ZipEntry metadataZipEntry, MlcpMetadata mlcpMetad
     }

     private InternalRow makeNakedRow(ZipEntry metadataZipEntry, MlcpMetadata mlcpMetadata) {
-        DocumentMetadataHandle metadata = mlcpMetadata.getMetadata();
-        metadata.getCollections().clear();
-        metadata.getPermissions().clear();
-        metadata.getMetadataValues().clear();
-        metadata.setQuality(0);
         return new DocumentRowBuilder(metadataCategories)
             .withUri(metadataZipEntry.getName())
-            .withMetadata(metadata)
+            .withMetadata(mlcpMetadata.getMetadata())
             .buildRow();
     }


src/main/java/com/marklogic/spark/writer/DocBuilder.java

Lines changed: 8 additions & 0 deletions
@@ -87,6 +87,14 @@ DocumentWriteOperation build(DocumentInputs inputs) {
         final String graph = inputs.getGraph();
         final DocumentMetadataHandle initialMetadata = inputs.getInitialMetadata();

+        final boolean isNakedProperties = inputs.getContent() == null;
+        if (isNakedProperties) {
+            if (initialMetadata != null) {
+                overrideInitialMetadata(initialMetadata);
+            }
+            return new DocumentWriteOperationImpl(uri, initialMetadata, null);
+        }
+
         if (initialMetadata != null) {
             overrideInitialMetadata(initialMetadata);
             if (graph != null) {
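
The null third argument in the new branch above is the content handle. A standalone sketch of the same idea, assuming the Java Client 6.6.1 behavior this PR depends on; the URI, collection, and property value are hypothetical.

import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.impl.DocumentWriteOperationImpl;
import com.marklogic.client.io.DocumentMetadataHandle;

import javax.xml.namespace.QName;

public class NakedPropertiesWriteOp {
    public static DocumentWriteOperation makeNakedWriteOp() {
        // Hypothetical metadata for illustration.
        DocumentMetadataHandle metadata = new DocumentMetadataHandle();
        metadata.getCollections().add("naked");
        metadata.getProperties().put(new QName("priority"), "1");

        // A null content handle means only the properties fragment is created
        // at this URI; as of Java Client 6.6.1, a DMSDK WriteBatcher accepts
        // such an operation instead of rejecting it.
        return new DocumentWriteOperationImpl("example.xml.naked", metadata, null);
    }
}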

src/main/java/com/marklogic/spark/writer/DocumentRowConverter.java

Lines changed: 5 additions & 5 deletions
@@ -7,7 +7,6 @@
 import com.marklogic.client.io.DocumentMetadataHandle;
 import com.marklogic.client.io.Format;
 import com.marklogic.spark.Options;
-import com.marklogic.spark.Util;
 import com.marklogic.spark.reader.document.DocumentRowSchema;
 import org.apache.spark.sql.catalyst.InternalRow;

@@ -34,10 +33,11 @@ class DocumentRowConverter implements RowConverter {
     @Override
     public Optional<DocBuilder.DocumentInputs> convertRow(InternalRow row) {
         final String uri = row.getString(0);
-        if (row.isNullAt(1)) {
-            Util.MAIN_LOGGER.warn("Not writing document with URI {} as it has null content; this will be supported " +
-                "once the MarkLogic Java Client 6.6.1 is available.", uri);
-            return Optional.empty();
+
+        final boolean isNakedProperties = row.isNullAt(1);
+        if (isNakedProperties) {
+            DocumentMetadataHandle metadata = DocumentRowSchema.makeDocumentMetadata(row);
+            return Optional.of(new DocBuilder.DocumentInputs(uri, null, null, metadata));
         }

         final BytesHandle content = new BytesHandle(row.getBinary(1));
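
A small companion sketch of the same null-content convention from a caller's point of view; the column indices (0 = URI, 1 = content, 6 = serialized properties) are taken from the tests in this PR, and the class name is hypothetical.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class NakedRowInspector {
    // Prints the URIs and properties XML of naked properties rows, i.e. rows
    // whose content column is null, after a read from an MLCP archive.
    public static void printNakedRows(Dataset<Row> rows) {
        for (Row row : rows.collectAsList()) {
            if (row.isNullAt(1)) {
                System.out.println("Naked properties at " + row.getString(0)
                    + ": " + row.getString(6));
            }
        }
    }
}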

src/test/java/com/marklogic/spark/AbstractIntegrationTest.java

Lines changed: 11 additions & 0 deletions
@@ -17,11 +17,13 @@

 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.marklogic.client.io.DocumentMetadataHandle;
+import com.marklogic.junit5.XmlNode;
 import com.marklogic.junit5.spring.AbstractSpringMarkLogicTest;
 import com.marklogic.junit5.spring.SimpleTestConfig;
 import org.apache.spark.SparkException;
 import org.apache.spark.sql.*;
 import org.apache.spark.util.VersionUtils;
+import org.jdom2.Namespace;
 import org.junit.jupiter.api.AfterEach;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.core.io.ClassPathResource;
@@ -49,6 +51,7 @@ public abstract class AbstractIntegrationTest extends AbstractSpringMarkLogicTes
     protected static final String CONNECTOR_IDENTIFIER = "marklogic";
     protected static final String NO_AUTHORS_QUERY = "op.fromView('Medical', 'NoAuthors', '')";
     protected static final String DEFAULT_PERMISSIONS = "spark-user-role,read,spark-user-role,update";
+    protected static final Namespace PROPERTIES_NAMESPACE = Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property");

     protected static final ObjectMapper objectMapper = new ObjectMapper();

@@ -176,4 +179,12 @@ protected final DocumentMetadataHandle readMetadata(String uri) {
         // This should really be in marklogic-unit-test.
         return getDatabaseClient().newDocumentManager().readMetadata(uri, new DocumentMetadataHandle());
     }
+
+    @Override
+    protected XmlNode readDocumentProperties(String uri) {
+        // This should be fixed in marklogic-unit-test to include the properties namespace by default.
+        XmlNode props = super.readDocumentProperties(uri);
+        props.setNamespaces(new Namespace[]{PROPERTIES_NAMESPACE});
+        return props;
+    }
 }

src/test/java/com/marklogic/spark/reader/document/ReadDocumentRowsWithMetadataTest.java

Lines changed: 1 addition & 1 deletion
@@ -126,7 +126,7 @@ private void verifyAllMetadataColumnsArePopulated(Row row) {
         assertEquals(10, row.getInt(5));

         XmlNode properties = new XmlNode(row.getString(6), Namespace.getNamespace("ex", "org:example"),
-            Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"));
+            PROPERTIES_NAMESPACE);
         properties.assertElementValue("/prop:properties/ex:key1", "value1");
         properties.assertElementValue("/prop:properties/key2", "value2");


src/test/java/com/marklogic/spark/reader/file/ReadArchiveFileTest.java

Lines changed: 1 addition & 2 deletions
@@ -259,8 +259,7 @@ private void verifyPermissions(Row row) {
     }

     private void verifyProperties(Row row) {
-        XmlNode properties = new XmlNode(row.getString(6), Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"),
-            Namespace.getNamespace("ex", "org:example"));
+        XmlNode properties = new XmlNode(row.getString(6), PROPERTIES_NAMESPACE, Namespace.getNamespace("ex", "org:example"));
         properties.assertElementValue("/prop:properties/ex:key1", "value1");
         properties.assertElementValue("/prop:properties/key2", "value2");
     }

src/test/java/com/marklogic/spark/reader/file/ReadMlcpArchiveFilesTest.java

Lines changed: 4 additions & 55 deletions
@@ -7,7 +7,6 @@
 import org.apache.spark.sql.Column;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SaveMode;
 import org.jdom2.Namespace;
 import org.junit.jupiter.api.Test;
 import scala.collection.mutable.WrappedArray;
@@ -16,7 +15,8 @@
 import java.util.List;
 import java.util.Map;

-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;

 class ReadMlcpArchiveFilesTest extends AbstractIntegrationTest {

@@ -149,8 +149,7 @@ void complexProperties() {
         assertEquals(1, rows.size());

         XmlNode properties = new XmlNode(rows.get(0).getString(PROPERTIES_COLUMN),
-            Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"),
-            Namespace.getNamespace("flexrep", "http://marklogic.com/xdmp/flexible-replication"));
+            PROPERTIES_NAMESPACE, Namespace.getNamespace("flexrep", "http://marklogic.com/xdmp/flexible-replication"));
         properties.assertElementValue(
             "This verifies that the properties column can contain any serialized string of XML. This is necessary so " +
                 "that complex XML structures can be read from and written to MarkLogic.",
@@ -217,56 +216,6 @@ void dontAbortOnArchiveFileMissingContentEntry() {
             "not thrown.");
     }

-    @Test
-    void nakedProperties() {
-        Dataset<Row> dataset = newSparkSession().read()
-            .format(CONNECTOR_IDENTIFIER)
-            .option(Options.READ_FILES_TYPE, "mlcp_archive")
-            .load("src/test/resources/mlcp-archive-files/naked1.zip");
-
-        List<Row> rows = dataset.collectAsList();
-        assertEquals(2, rows.size(), "The example.xml.naked entry should have produced 1 row.");
-
-        Row nakedRow = rows.get(0);
-        final String expectedNakedPropertiesUrl = "mlcp/naked/example.xml.naked";
-        assertEquals(expectedNakedPropertiesUrl, nakedRow.getString(0));
-        assertTrue(nakedRow.isNullAt(1), "Content should be null.");
-        assertTrue(nakedRow.isNullAt(2), "Format should be null, since there's no content.");
-        assertEquals(0, nakedRow.getList(COLLECTIONS_COLUMN).size(), "Collections list should be empty since they " +
-            "cannot be written for a naked properties fragment.");
-        assertEquals(0, nakedRow.getJavaMap(PERMISSIONS_COLUMN).size(), "Permissions should be empty since they " +
-            "cannot be written for a naked properties fragment.");
-        assertEquals(0, nakedRow.getInt(QUALITY_COLUMN));
-        assertEquals(0, nakedRow.getJavaMap(METADATAVALUES_COLUMN).size(), "Metadata values should be empty since " +
-            "they cannot be written for a naked properties fragment");
-
-        XmlNode properties = new XmlNode(nakedRow.getString(PROPERTIES_COLUMN),
-            Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"));
-        properties.assertElementValue("/prop:properties/priority", "1");
-
-        Row normalRow = rows.get(1);
-        assertEquals("mlcp/xml/1.xml", normalRow.getString(0));
-
-        // Now write it to verify the outcome.
-        dataset.write().format(CONNECTOR_IDENTIFIER)
-            .option(Options.CLIENT_URI, makeClientUri())
-            .option(Options.WRITE_COLLECTIONS, "naked-test")
-            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
-            .mode(SaveMode.Append)
-            .save();
-
-        List<String> uris = getUrisInCollection("naked-test", 1);
-        assertEquals("mlcp/xml/1.xml", uris.get(0));
-
-        String nakedProperties = getDatabaseClient().newServerEval()
-            .xquery(String.format("xdmp:document-properties('%s')", expectedNakedPropertiesUrl))
-            .evalAs(String.class);
-        assertNull(nakedProperties, "The naked properties row should have been ignored during the write, as " +
-            "Java Client 6.6.0 and earlier do not support writing a document with null content via WriteBatcher. " +
-            "This will be fixed in the Java Client 6.6.1 release, at which point we can start writing naked " +
-            "properties fragments correctly.");
-    }
-
     private void verifyFirstRow(Row row) {
         assertEquals("/test/1.xml", row.getString(0));
         XmlNode doc = new XmlNode(new String((byte[]) row.get(1)));
@@ -327,7 +276,7 @@ private void verifyQuality(Row row) {

     private void verifyProperties(Row row) {
         XmlNode properties = new XmlNode(row.getString(PROPERTIES_COLUMN),
-            Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"), Namespace.getNamespace("ex", "org:example"));
+            PROPERTIES_NAMESPACE, Namespace.getNamespace("ex", "org:example"));
         properties.assertElementValue("/prop:properties/ex:key1", "value1");
         properties.assertElementValue("/prop:properties/key2", "value2");
     }
src/test/java/com/marklogic/spark/reader/file/ReadMlcpArchiveWithNakedPropertiesTest.java

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+package com.marklogic.spark.reader.file;
+
+import com.marklogic.junit5.XmlNode;
+import com.marklogic.spark.AbstractIntegrationTest;
+import com.marklogic.spark.Options;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SaveMode;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+import java.util.stream.Stream;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * A "naked properties" URI in MarkLogic is possible by creating a properties fragment at a URI but not
+ * assigning any document content to it. MLCP archives can contain these, and thus we need to support them when reading
+ * an MLCP archive. However, because v1/search cannot find these documents, it's not possible for the archives created
+ * by this connector to contain them.
+ */
+class ReadMlcpArchiveWithNakedPropertiesTest extends AbstractIntegrationTest {
+
+    private static final int PROPERTIES_COLUMN = 6;
+
+    /**
+     * The plumbing in the parent class for deleting documents before a test runs won't catch naked properties created
+     * by this test, so we ensure they're deleted here.
+     */
+    @BeforeEach
+    void deleteNakedPropertiesFromPreviousTestRuns() {
+        Stream.of("example.xml.naked", "example2.xml.naked", "naked/example.xml.naked").forEach(uri -> {
+            String query = String.format("xdmp:document-delete('%s')", uri);
+            try {
+                getDatabaseClient().newServerEval().xquery(query).evalAs(String.class);
+            } catch (Exception e) {
+                logger.debug("Ignoring this error because it's only due to the naked properties fragment not existing");
+            }
+        });
+    }
+
+    @Test
+    void twoNakedEntries() {
+        newSparkSession().read().format(CONNECTOR_IDENTIFIER)
+            .option(Options.READ_FILES_TYPE, "mlcp_archive")
+            .load("src/test/resources/mlcp-archive-files/two-naked-entries.zip")
+            .write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.WRITE_COLLECTIONS, "naked")
+            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
+            .mode(SaveMode.Append)
+            .save();
+
+        assertCollectionSize("Using v1/search should not find the naked URIs since they do not have a document " +
+            "associated with them", "naked", 0);
+
+        Stream.of("example.xml.naked", "example2.xml.naked").forEach(uri -> {
+            String collection = getDatabaseClient().newServerEval()
+                .javascript(String.format("xdmp.documentGetCollections('%s')[0]", uri))
+                .evalAs(String.class);
+            assertEquals("naked", collection, "Each naked properties document should still be assigned to the " +
+                "collection found in its MLCP metadata entry from the archive file. But these URIs aren't returned " +
+                "by v1/search since there are no documents associated with them.");
+        });
+
+        XmlNode props = readDocumentProperties("example.xml.naked");
+        props.assertElementValue("/prop:properties/priority", "1");
+        props = readDocumentProperties("example2.xml.naked");
+        props.assertElementValue("/prop:properties/priority", "2");
+    }
+
+    @Test
+    void normalAndNakedEntry() {
+        Dataset<Row> dataset = newSparkSession().read()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.READ_FILES_TYPE, "mlcp_archive")
+            .load("src/test/resources/mlcp-archive-files/normal-and-naked-entry.zip");
+
+        List<Row> rows = dataset.collectAsList();
+        assertEquals(2, rows.size(), "The example.xml.naked entry should have produced 1 row.");
+        assertEquals("xml/1.xml", rows.get(1).getString(0));
+
+        final String expectedNakedPropertiesUrl = "naked/example.xml.naked";
+        Row nakedRow = rows.get(0);
+        assertEquals(expectedNakedPropertiesUrl, nakedRow.getString(0));
+        assertTrue(nakedRow.isNullAt(1), "Content should be null.");
+        assertTrue(nakedRow.isNullAt(2), "Format should be null, since there's no content.");
+        XmlNode properties = new XmlNode(nakedRow.getString(PROPERTIES_COLUMN), PROPERTIES_NAMESPACE);
+        properties.assertElementValue("/prop:properties/priority", "1");
+
+        // Write the rows to verify that the naked document is created correctly.
+        dataset.write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.WRITE_COLLECTIONS, "naked-test")
+            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
+            .mode(SaveMode.Append)
+            .save();
+
+        List<String> uris = getUrisInCollection("naked-test", 1);
+        assertEquals("xml/1.xml", uris.get(0), "getUrisInCollection uses v1/search to find URIs, and thus it " +
+            "should only find the URI of the normal document and not the one of the naked properties document.");
+
+        XmlNode nakedProperties = readDocumentProperties(expectedNakedPropertiesUrl);
+        nakedProperties.assertElementValue(
+            "As of Java Client 6.6.1, a DMSDK WriteBatcher now allows for a document to have a null content handle, " +
+                "which allows for 'naked properties' URIs to be written.",
+            "/prop:properties/priority", "1");
+    }
+}
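
As the class Javadoc above notes, a naked properties fragment is a properties fragment at a URI with no document, which is why v1/search never returns it. For reference, a hedged sketch of creating and inspecting one directly via the Java Client's server-side eval, mirroring the eval calls these tests use; host, port, credentials, and URI are placeholders.

import com.marklogic.client.DatabaseClient;
import com.marklogic.client.DatabaseClientFactory;

public class NakedPropertiesEval {
    public static void main(String[] args) {
        // Hypothetical connection details.
        DatabaseClient client = DatabaseClientFactory.newClient("localhost", 8000,
            new DatabaseClientFactory.DigestAuthContext("spark-user", "password"));

        // Setting properties on a URI that has no document creates a naked
        // properties fragment; this is typically how such URIs come about.
        client.newServerEval()
            .xquery("xdmp:document-set-properties('example.xml.naked', <priority>1</priority>)")
            .evalAs(String.class);

        // v1/search will not return this URI, but its properties fragment
        // is still readable:
        String props = client.newServerEval()
            .xquery("xdmp:document-properties('example.xml.naked')")
            .evalAs(String.class);
        System.out.println(props);

        client.release();
    }
}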

src/test/java/com/marklogic/spark/writer/file/WriteArchiveTest.java

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ private void verifyMetadata(Row row, String metadataValue) {
         String xml = new String((byte[]) row.get(1));
         XmlNode metadata = new XmlNode(xml,
             Namespace.getNamespace("rapi", "http://marklogic.com/rest-api"),
-            Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"),
+            PROPERTIES_NAMESPACE,
             Namespace.getNamespace("ex", "org:example"));

         switch (metadataValue) {
