Skip to content

Commit 9dece45

Browse files
committed
MLE-13889 Can now read and write naked properties
It turns out this mostly "just worked" once the Java Client was upgraded to 6.6.1, which allows WriteBatcher to write documents with null content. We also learned that MarkLogic permits assigning collections and permissions to a naked properties URI.
1 parent cab2669 commit 9dece45

File tree

13 files changed

+144
-72
lines changed

13 files changed

+144
-72
lines changed

build.gradle

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ dependencies {
4242
exclude module: "rocksdbjni"
4343
}
4444

45-
shadowDependencies ("com.marklogic:marklogic-client-api:6.6.0") {
45+
shadowDependencies ("com.marklogic:marklogic-client-api:6.6.1") {
4646
// The Java Client uses Jackson 2.15.2; Scala 3.4.x does not yet support that and will throw the following error:
4747
// Scala module 2.14.2 requires Jackson Databind version >= 2.14.0 and < 2.15.0 - Found jackson-databind version 2.15.2
4848
// So the 4 Jackson modules are excluded to allow for Spark's to be used.

src/main/java/com/marklogic/spark/reader/file/MlcpArchiveFileReader.java

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
package com.marklogic.spark.reader.file;
22

3-
import com.marklogic.client.io.DocumentMetadataHandle;
43
import com.marklogic.spark.ConnectorException;
54
import com.marklogic.spark.Options;
65
import com.marklogic.spark.Util;
@@ -200,14 +199,9 @@ private boolean readNakedEntry(ZipEntry metadataZipEntry, MlcpMetadata mlcpMetad
200199
}
201200

202201
private InternalRow makeNakedRow(ZipEntry metadataZipEntry, MlcpMetadata mlcpMetadata) {
203-
DocumentMetadataHandle metadata = mlcpMetadata.getMetadata();
204-
metadata.getCollections().clear();
205-
metadata.getPermissions().clear();
206-
metadata.getMetadataValues().clear();
207-
metadata.setQuality(0);
208202
return new DocumentRowBuilder(metadataCategories)
209203
.withUri(metadataZipEntry.getName())
210-
.withMetadata(metadata)
204+
.withMetadata(mlcpMetadata.getMetadata())
211205
.buildRow();
212206
}
213207

src/main/java/com/marklogic/spark/writer/DocBuilder.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,14 @@ DocumentWriteOperation build(DocumentInputs inputs) {
8787
final String graph = inputs.getGraph();
8888
final DocumentMetadataHandle initialMetadata = inputs.getInitialMetadata();
8989

90+
final boolean isNakedProperties = inputs.getContent() == null;
91+
if (isNakedProperties) {
92+
if (initialMetadata != null) {
93+
overrideInitialMetadata(initialMetadata);
94+
}
95+
return new DocumentWriteOperationImpl(uri, initialMetadata, null);
96+
}
97+
9098
if (initialMetadata != null) {
9199
overrideInitialMetadata(initialMetadata);
92100
if (graph != null) {

src/main/java/com/marklogic/spark/writer/DocumentRowConverter.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import com.marklogic.client.io.DocumentMetadataHandle;
88
import com.marklogic.client.io.Format;
99
import com.marklogic.spark.Options;
10-
import com.marklogic.spark.Util;
1110
import com.marklogic.spark.reader.document.DocumentRowSchema;
1211
import org.apache.spark.sql.catalyst.InternalRow;
1312

@@ -34,10 +33,11 @@ class DocumentRowConverter implements RowConverter {
3433
@Override
3534
public Optional<DocBuilder.DocumentInputs> convertRow(InternalRow row) {
3635
final String uri = row.getString(0);
37-
if (row.isNullAt(1)) {
38-
Util.MAIN_LOGGER.warn("Not writing document with URI {} as it has null content; this will be supported " +
39-
"once the MarkLogic Java Client 6.6.1 is available.", uri);
40-
return Optional.empty();
36+
37+
final boolean isNakedProperties = row.isNullAt(1);
38+
if (isNakedProperties) {
39+
DocumentMetadataHandle metadata = DocumentRowSchema.makeDocumentMetadata(row);
40+
return Optional.of(new DocBuilder.DocumentInputs(uri, null, null, metadata));
4141
}
4242

4343
final BytesHandle content = new BytesHandle(row.getBinary(1));

src/test/java/com/marklogic/spark/AbstractIntegrationTest.java

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,13 @@
1717

1818
import com.fasterxml.jackson.databind.ObjectMapper;
1919
import com.marklogic.client.io.DocumentMetadataHandle;
20+
import com.marklogic.junit5.XmlNode;
2021
import com.marklogic.junit5.spring.AbstractSpringMarkLogicTest;
2122
import com.marklogic.junit5.spring.SimpleTestConfig;
2223
import org.apache.spark.SparkException;
2324
import org.apache.spark.sql.*;
2425
import org.apache.spark.util.VersionUtils;
26+
import org.jdom2.Namespace;
2527
import org.junit.jupiter.api.AfterEach;
2628
import org.springframework.beans.factory.annotation.Autowired;
2729
import org.springframework.core.io.ClassPathResource;
@@ -49,6 +51,7 @@ public abstract class AbstractIntegrationTest extends AbstractSpringMarkLogicTes
4951
protected static final String CONNECTOR_IDENTIFIER = "marklogic";
5052
protected static final String NO_AUTHORS_QUERY = "op.fromView('Medical', 'NoAuthors', '')";
5153
protected static final String DEFAULT_PERMISSIONS = "spark-user-role,read,spark-user-role,update";
54+
protected static final Namespace PROPERTIES_NAMESPACE = Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property");
5255

5356
protected static final ObjectMapper objectMapper = new ObjectMapper();
5457

@@ -176,4 +179,12 @@ protected final DocumentMetadataHandle readMetadata(String uri) {
176179
// This should really be in marklogic-unit-test.
177180
return getDatabaseClient().newDocumentManager().readMetadata(uri, new DocumentMetadataHandle());
178181
}
182+
183+
@Override
184+
protected XmlNode readDocumentProperties(String uri) {
185+
// This should be fixed in marklogic-unit-test to include the properties namespace by default.
186+
XmlNode props = super.readDocumentProperties(uri);
187+
props.setNamespaces(new Namespace[]{PROPERTIES_NAMESPACE});
188+
return props;
189+
}
179190
}

src/test/java/com/marklogic/spark/reader/document/ReadDocumentRowsWithMetadataTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ private void verifyAllMetadataColumnsArePopulated(Row row) {
126126
assertEquals(10, row.getInt(5));
127127

128128
XmlNode properties = new XmlNode(row.getString(6), Namespace.getNamespace("ex", "org:example"),
129-
Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"));
129+
PROPERTIES_NAMESPACE);
130130
properties.assertElementValue("/prop:properties/ex:key1", "value1");
131131
properties.assertElementValue("/prop:properties/key2", "value2");
132132

src/test/java/com/marklogic/spark/reader/file/ReadArchiveFileTest.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -259,8 +259,7 @@ private void verifyPermissions(Row row) {
259259
}
260260

261261
private void verifyProperties(Row row) {
262-
XmlNode properties = new XmlNode(row.getString(6), Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"),
263-
Namespace.getNamespace("ex", "org:example"));
262+
XmlNode properties = new XmlNode(row.getString(6), PROPERTIES_NAMESPACE, Namespace.getNamespace("ex", "org:example"));
264263
properties.assertElementValue("/prop:properties/ex:key1", "value1");
265264
properties.assertElementValue("/prop:properties/key2", "value2");
266265
}

src/test/java/com/marklogic/spark/reader/file/ReadMlcpArchiveFilesTest.java

Lines changed: 4 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
import org.apache.spark.sql.Column;
88
import org.apache.spark.sql.Dataset;
99
import org.apache.spark.sql.Row;
10-
import org.apache.spark.sql.SaveMode;
1110
import org.jdom2.Namespace;
1211
import org.junit.jupiter.api.Test;
1312
import scala.collection.mutable.WrappedArray;
@@ -16,7 +15,8 @@
1615
import java.util.List;
1716
import java.util.Map;
1817

19-
import static org.junit.jupiter.api.Assertions.*;
18+
import static org.junit.jupiter.api.Assertions.assertEquals;
19+
import static org.junit.jupiter.api.Assertions.assertTrue;
2020

2121
class ReadMlcpArchiveFilesTest extends AbstractIntegrationTest {
2222

@@ -149,8 +149,7 @@ void complexProperties() {
149149
assertEquals(1, rows.size());
150150

151151
XmlNode properties = new XmlNode(rows.get(0).getString(PROPERTIES_COLUMN),
152-
Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"),
153-
Namespace.getNamespace("flexrep", "http://marklogic.com/xdmp/flexible-replication"));
152+
PROPERTIES_NAMESPACE, Namespace.getNamespace("flexrep", "http://marklogic.com/xdmp/flexible-replication"));
154153
properties.assertElementValue(
155154
"This verifies that the properties column can contain any serialized string of XML. This is necessary so " +
156155
"that complex XML structures can be read from and written to MarkLogic.",
@@ -217,56 +216,6 @@ void dontAbortOnArchiveFileMissingContentEntry() {
217216
"not thrown.");
218217
}
219218

220-
@Test
221-
void nakedProperties() {
222-
Dataset<Row> dataset = newSparkSession().read()
223-
.format(CONNECTOR_IDENTIFIER)
224-
.option(Options.READ_FILES_TYPE, "mlcp_archive")
225-
.load("src/test/resources/mlcp-archive-files/naked1.zip");
226-
227-
List<Row> rows = dataset.collectAsList();
228-
assertEquals(2, rows.size(), "The example.xml.naked entry should have produced 1 row.");
229-
230-
Row nakedRow = rows.get(0);
231-
final String expectedNakedPropertiesUrl = "mlcp/naked/example.xml.naked";
232-
assertEquals(expectedNakedPropertiesUrl, nakedRow.getString(0));
233-
assertTrue(nakedRow.isNullAt(1), "Content should be null.");
234-
assertTrue(nakedRow.isNullAt(2), "Format should be null, since there's no content.");
235-
assertEquals(0, nakedRow.getList(COLLECTIONS_COLUMN).size(), "Collections list should be empty since they " +
236-
"cannot be written for a naked properties fragment.");
237-
assertEquals(0, nakedRow.getJavaMap(PERMISSIONS_COLUMN).size(), "Permissions should be empty since they " +
238-
"cannot be written for a naked properties fragment.");
239-
assertEquals(0, nakedRow.getInt(QUALITY_COLUMN));
240-
assertEquals(0, nakedRow.getJavaMap(METADATAVALUES_COLUMN).size(), "Metadata values should be empty since " +
241-
"they cannot be written for a naked properties fragment");
242-
243-
XmlNode properties = new XmlNode(nakedRow.getString(PROPERTIES_COLUMN),
244-
Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"));
245-
properties.assertElementValue("/prop:properties/priority", "1");
246-
247-
Row normalRow = rows.get(1);
248-
assertEquals("mlcp/xml/1.xml", normalRow.getString(0));
249-
250-
// Now write it to verify the outcome.
251-
dataset.write().format(CONNECTOR_IDENTIFIER)
252-
.option(Options.CLIENT_URI, makeClientUri())
253-
.option(Options.WRITE_COLLECTIONS, "naked-test")
254-
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
255-
.mode(SaveMode.Append)
256-
.save();
257-
258-
List<String> uris = getUrisInCollection("naked-test", 1);
259-
assertEquals("mlcp/xml/1.xml", uris.get(0));
260-
261-
String nakedProperties = getDatabaseClient().newServerEval()
262-
.xquery(String.format("xdmp:document-properties('%s')", expectedNakedPropertiesUrl))
263-
.evalAs(String.class);
264-
assertNull(nakedProperties, "The naked properties row should have been ignored during the write, as " +
265-
"Java Client 6.6.0 and earlier do not support writing a document with null content via WriteBatcher. " +
266-
"This will be fixed in the Java Client 6.6.1 release, at which point we can start writing naked " +
267-
"properties fragments correctly.");
268-
}
269-
270219
private void verifyFirstRow(Row row) {
271220
assertEquals("/test/1.xml", row.getString(0));
272221
XmlNode doc = new XmlNode(new String((byte[]) row.get(1)));
@@ -327,7 +276,7 @@ private void verifyQuality(Row row) {
327276

328277
private void verifyProperties(Row row) {
329278
XmlNode properties = new XmlNode(row.getString(PROPERTIES_COLUMN),
330-
Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"), Namespace.getNamespace("ex", "org:example"));
279+
PROPERTIES_NAMESPACE, Namespace.getNamespace("ex", "org:example"));
331280
properties.assertElementValue("/prop:properties/ex:key1", "value1");
332281
properties.assertElementValue("/prop:properties/key2", "value2");
333282
}
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
package com.marklogic.spark.reader.file;

import com.marklogic.junit5.XmlNode;
import com.marklogic.spark.AbstractIntegrationTest;
import com.marklogic.spark.Options;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

import java.util.List;
import java.util.stream.Stream;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

/**
 * A "naked properties" URI in MarkLogic is possible by creating a properties fragment at a URI but not
 * assigning any document content to it. MLCP archives can contain these, and thus we need to support them when reading
 * an MLCP archive. However, because v1/search cannot find these documents, it's not possible for the archives created
 * by this connector to contain them.
 */
class ReadMlcpArchiveWithNakedPropertiesTest extends AbstractIntegrationTest {

    // Index of the serialized-properties column in the connector's document row schema.
    private static final int PROPERTIES_COLUMN = 6;

    /**
     * The plumbing in the parent class for deleting documents before a test runs won't catch naked properties created
     * by this test, so we ensure they're deleted here.
     */
    @BeforeEach
    void deleteNakedPropertiesFromPreviousTestRuns() {
        Stream.of("example.xml.naked", "example2.xml.naked", "naked/example.xml.naked").forEach(uri -> {
            String query = String.format("xdmp:document-delete('%s')", uri);
            try {
                getDatabaseClient().newServerEval().xquery(query).evalAs(String.class);
            } catch (Exception e) {
                logger.debug("Ignoring this error because it's only due to the naked properties fragment not existing");
            }
        });
    }

    /**
     * Reads an archive containing only naked-properties entries and writes it back, verifying that collections
     * and properties are applied to each naked URI even though v1/search cannot find them.
     */
    @Test
    void twoNakedEntries() {
        Dataset<Row> dataset = newSparkSession().read().format(CONNECTOR_IDENTIFIER)
            .option(Options.READ_FILES_TYPE, "mlcp_archive")
            .load("src/test/resources/mlcp-archive-files/two-naked-entries.zip");

        writeRows(dataset, "naked");

        assertCollectionSize("Using v1/search should not find the naked URIs since they do not have a document " +
            "associated with them", "naked", 0);

        Stream.of("example.xml.naked", "example2.xml.naked").forEach(uri -> {
            // Server-side eval is used here because v1/search cannot see naked properties URIs.
            String collection = getDatabaseClient().newServerEval()
                .javascript(String.format("xdmp.documentGetCollections('%s')[0]", uri))
                .evalAs(String.class);
            assertEquals("naked", collection, "Each naked properties document should still be assigned to the " +
                "collection found in its MLCP metadata entry from the archive file. But these URIs aren't returned " +
                "by v1/search since there are no documents associated with them.");
        });

        XmlNode props = readDocumentProperties("example.xml.naked");
        props.assertElementValue("/prop:properties/priority", "1");
        props = readDocumentProperties("example2.xml.naked");
        props.assertElementValue("/prop:properties/priority", "2");
    }

    /**
     * Reads an archive containing one normal entry and one naked-properties entry, verifying that the naked entry
     * produces a row with null content/format, and that writing the rows creates the naked properties fragment.
     */
    @Test
    void normalAndNakedEntry() {
        Dataset<Row> dataset = newSparkSession().read()
            .format(CONNECTOR_IDENTIFIER)
            .option(Options.READ_FILES_TYPE, "mlcp_archive")
            .load("src/test/resources/mlcp-archive-files/normal-and-naked-entry.zip");

        List<Row> rows = dataset.collectAsList();
        assertEquals(2, rows.size(), "Expecting 2 rows - one for the normal entry and one for the " +
            "example.xml.naked naked-properties entry.");
        assertEquals("xml/1.xml", rows.get(1).getString(0));

        final String expectedNakedPropertiesUrl = "naked/example.xml.naked";
        Row nakedRow = rows.get(0);
        assertEquals(expectedNakedPropertiesUrl, nakedRow.getString(0));
        assertTrue(nakedRow.isNullAt(1), "Content should be null.");
        assertTrue(nakedRow.isNullAt(2), "Format should be null, since there's no content.");
        XmlNode properties = new XmlNode(nakedRow.getString(PROPERTIES_COLUMN), PROPERTIES_NAMESPACE);
        properties.assertElementValue("/prop:properties/priority", "1");

        // Write the rows to verify that the naked document is created correctly.
        writeRows(dataset, "naked-test");

        List<String> uris = getUrisInCollection("naked-test", 1);
        assertEquals("xml/1.xml", uris.get(0), "getUrisInCollection uses v1/search to find URIs, and thus it " +
            "should only find the URI of the normal document and not the one of the naked properties document.");

        XmlNode nakedProperties = readDocumentProperties(expectedNakedPropertiesUrl);
        nakedProperties.assertElementValue(
            "As of Java Client 6.6.1, a DMSDK WriteBatcher now allows for a document to have a null content handle, " +
                "which allows for 'naked properties' URIs to be written.",
            "/prop:properties/priority", "1");
    }

    /**
     * Writes the given rows to MarkLogic via the connector, assigning each document to the given collection
     * with the default test permissions. Extracted to avoid duplicating the writer options in each test.
     */
    private void writeRows(Dataset<Row> dataset, String collection) {
        dataset.write().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.WRITE_COLLECTIONS, collection)
            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
            .mode(SaveMode.Append)
            .save();
    }
}

src/test/java/com/marklogic/spark/writer/file/WriteArchiveTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ private void verifyMetadata(Row row, String metadataValue) {
8181
String xml = new String((byte[]) row.get(1));
8282
XmlNode metadata = new XmlNode(xml,
8383
Namespace.getNamespace("rapi", "http://marklogic.com/rest-api"),
84-
Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"),
84+
PROPERTIES_NAMESPACE,
8585
Namespace.getNamespace("ex", "org:example"));
8686

8787
switch (metadataValue) {

0 commit comments

Comments
 (0)