Commit b194f06

Merge pull request #253 from marklogic/feature/naked-properties
MLE-13889 Can now read and write naked properties
2 parents cab2669 + 9dece45 commit b194f06
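
For orientation, here is a minimal sketch of the round trip this merge enables, modeled on the new tests in this diff. The connection string and archive path are hypothetical placeholders, not values from the PR; the option keys come from the connector's com.marklogic.spark.Options constants as used in the tests below.

import com.marklogic.spark.Options;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

public class NakedPropertiesRoundTrip {
    public static void main(String[] args) {
        SparkSession session = SparkSession.builder().master("local[*]").getOrCreate();

        // Read an MLCP archive; entries like "example.xml.naked" (a properties
        // fragment with no document content) now produce rows with null content.
        Dataset<Row> rows = session.read().format("marklogic")
            .option(Options.READ_FILES_TYPE, "mlcp_archive")
            .load("path/to/archive.zip"); // hypothetical path

        // As of this change, rows with null content are written as naked
        // properties fragments instead of being skipped with a warning.
        rows.write().format("marklogic")
            .option(Options.CLIENT_URI, "spark-user:password@localhost:8000") // hypothetical
            .option(Options.WRITE_COLLECTIONS, "naked")
            .option(Options.WRITE_PERMISSIONS, "spark-user-role,read,spark-user-role,update")
            .mode(SaveMode.Append)
            .save();
    }
}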

File tree: 13 files changed, +144 −72 lines

build.gradle

Lines changed: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ dependencies {
         exclude module: "rocksdbjni"
     }

-    shadowDependencies ("com.marklogic:marklogic-client-api:6.6.0") {
+    shadowDependencies ("com.marklogic:marklogic-client-api:6.6.1") {
         // The Java Client uses Jackson 2.15.2; Scala 3.4.x does not yet support that and will throw the following error:
         // Scala module 2.14.2 requires Jackson Databind version >= 2.14.0 and < 2.15.0 - Found jackson-databind version 2.15.2
         // So the 4 Jackson modules are excluded to allow for Spark's to be used.

src/main/java/com/marklogic/spark/reader/file/MlcpArchiveFileReader.java

Lines changed: 1 addition & 7 deletions
@@ -1,6 +1,5 @@
 package com.marklogic.spark.reader.file;

-import com.marklogic.client.io.DocumentMetadataHandle;
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.Options;
 import com.marklogic.spark.Util;
@@ -200,14 +199,9 @@ private boolean readNakedEntry(ZipEntry metadataZipEntry, MlcpMetadata mlcpMetad
     }

     private InternalRow makeNakedRow(ZipEntry metadataZipEntry, MlcpMetadata mlcpMetadata) {
-        DocumentMetadataHandle metadata = mlcpMetadata.getMetadata();
-        metadata.getCollections().clear();
-        metadata.getPermissions().clear();
-        metadata.getMetadataValues().clear();
-        metadata.setQuality(0);
         return new DocumentRowBuilder(metadataCategories)
             .withUri(metadataZipEntry.getName())
-            .withMetadata(metadata)
+            .withMetadata(mlcpMetadata.getMetadata())
             .buildRow();
     }


src/main/java/com/marklogic/spark/writer/DocBuilder.java

Lines changed: 8 additions & 0 deletions
@@ -87,6 +87,14 @@ DocumentWriteOperation build(DocumentInputs inputs) {
         final String graph = inputs.getGraph();
         final DocumentMetadataHandle initialMetadata = inputs.getInitialMetadata();

+        final boolean isNakedProperties = inputs.getContent() == null;
+        if (isNakedProperties) {
+            if (initialMetadata != null) {
+                overrideInitialMetadata(initialMetadata);
+            }
+            return new DocumentWriteOperationImpl(uri, initialMetadata, null);
+        }
+
         if (initialMetadata != null) {
             overrideInitialMetadata(initialMetadata);
             if (graph != null) {
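
The null third argument in the new branch above is the content handle. A standalone sketch of the same idea, assuming the Java Client 6.6.1 behavior this PR depends on; the URI, collection, and property value are hypothetical.

import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.impl.DocumentWriteOperationImpl;
import com.marklogic.client.io.DocumentMetadataHandle;

import javax.xml.namespace.QName;

public class NakedPropertiesWriteOp {
    public static DocumentWriteOperation makeNakedWriteOp() {
        // Hypothetical metadata for illustration.
        DocumentMetadataHandle metadata = new DocumentMetadataHandle();
        metadata.getCollections().add("naked");
        metadata.getProperties().put(new QName("priority"), "1");

        // A null content handle means only the properties fragment is created
        // at this URI; as of Java Client 6.6.1, a DMSDK WriteBatcher accepts
        // such an operation instead of rejecting it.
        return new DocumentWriteOperationImpl("example.xml.naked", metadata, null);
    }
}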

src/main/java/com/marklogic/spark/writer/DocumentRowConverter.java

Lines changed: 5 additions & 5 deletions
@@ -7,7 +7,6 @@
 import com.marklogic.client.io.DocumentMetadataHandle;
 import com.marklogic.client.io.Format;
 import com.marklogic.spark.Options;
-import com.marklogic.spark.Util;
 import com.marklogic.spark.reader.document.DocumentRowSchema;
 import org.apache.spark.sql.catalyst.InternalRow;

@@ -34,10 +33,11 @@ class DocumentRowConverter implements RowConverter {
     @Override
     public Optional<DocBuilder.DocumentInputs> convertRow(InternalRow row) {
         final String uri = row.getString(0);
-        if (row.isNullAt(1)) {
-            Util.MAIN_LOGGER.warn("Not writing document with URI {} as it has null content; this will be supported " +
-                "once the MarkLogic Java Client 6.6.1 is available.", uri);
-            return Optional.empty();
+
+        final boolean isNakedProperties = row.isNullAt(1);
+        if (isNakedProperties) {
+            DocumentMetadataHandle metadata = DocumentRowSchema.makeDocumentMetadata(row);
+            return Optional.of(new DocBuilder.DocumentInputs(uri, null, null, metadata));
         }

         final BytesHandle content = new BytesHandle(row.getBinary(1));
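
A small companion sketch of the same null-content convention from a caller's point of view; the column indices (0 = URI, 1 = content, 6 = serialized properties) are taken from the tests in this PR, and the class name is hypothetical.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;

public class NakedRowInspector {
    // Prints the URIs and properties XML of naked properties rows, i.e. rows
    // whose content column is null, after a read from an MLCP archive.
    public static void printNakedRows(Dataset<Row> rows) {
        for (Row row : rows.collectAsList()) {
            if (row.isNullAt(1)) {
                System.out.println("Naked properties at " + row.getString(0)
                    + ": " + row.getString(6));
            }
        }
    }
}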

src/test/java/com/marklogic/spark/AbstractIntegrationTest.java

Lines changed: 11 additions & 0 deletions
@@ -17,11 +17,13 @@

 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.marklogic.client.io.DocumentMetadataHandle;
+import com.marklogic.junit5.XmlNode;
 import com.marklogic.junit5.spring.AbstractSpringMarkLogicTest;
 import com.marklogic.junit5.spring.SimpleTestConfig;
 import org.apache.spark.SparkException;
 import org.apache.spark.sql.*;
 import org.apache.spark.util.VersionUtils;
+import org.jdom2.Namespace;
 import org.junit.jupiter.api.AfterEach;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.core.io.ClassPathResource;
@@ -49,6 +51,7 @@ public abstract class AbstractIntegrationTest extends AbstractSpringMarkLogicTes
     protected static final String CONNECTOR_IDENTIFIER = "marklogic";
     protected static final String NO_AUTHORS_QUERY = "op.fromView('Medical', 'NoAuthors', '')";
     protected static final String DEFAULT_PERMISSIONS = "spark-user-role,read,spark-user-role,update";
+    protected static final Namespace PROPERTIES_NAMESPACE = Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property");

     protected static final ObjectMapper objectMapper = new ObjectMapper();

@@ -176,4 +179,12 @@ protected final DocumentMetadataHandle readMetadata(String uri) {
         // This should really be in marklogic-unit-test.
         return getDatabaseClient().newDocumentManager().readMetadata(uri, new DocumentMetadataHandle());
     }
+
+    @Override
+    protected XmlNode readDocumentProperties(String uri) {
+        // This should be fixed in marklogic-unit-test to include the properties namespace by default.
+        XmlNode props = super.readDocumentProperties(uri);
+        props.setNamespaces(new Namespace[]{PROPERTIES_NAMESPACE});
+        return props;
+    }
 }

src/test/java/com/marklogic/spark/reader/document/ReadDocumentRowsWithMetadataTest.java

Lines changed: 1 addition & 1 deletion
@@ -126,7 +126,7 @@ private void verifyAllMetadataColumnsArePopulated(Row row) {
         assertEquals(10, row.getInt(5));

         XmlNode properties = new XmlNode(row.getString(6), Namespace.getNamespace("ex", "org:example"),
-            Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"));
+            PROPERTIES_NAMESPACE);
         properties.assertElementValue("/prop:properties/ex:key1", "value1");
         properties.assertElementValue("/prop:properties/key2", "value2");


src/test/java/com/marklogic/spark/reader/file/ReadArchiveFileTest.java

Lines changed: 1 addition & 2 deletions
@@ -259,8 +259,7 @@ private void verifyPermissions(Row row) {
     }

     private void verifyProperties(Row row) {
-        XmlNode properties = new XmlNode(row.getString(6), Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"),
-            Namespace.getNamespace("ex", "org:example"));
+        XmlNode properties = new XmlNode(row.getString(6), PROPERTIES_NAMESPACE, Namespace.getNamespace("ex", "org:example"));
         properties.assertElementValue("/prop:properties/ex:key1", "value1");
         properties.assertElementValue("/prop:properties/key2", "value2");
     }

src/test/java/com/marklogic/spark/reader/file/ReadMlcpArchiveFilesTest.java

Lines changed: 4 additions & 55 deletions
@@ -7,7 +7,6 @@
 import org.apache.spark.sql.Column;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SaveMode;
 import org.jdom2.Namespace;
 import org.junit.jupiter.api.Test;
 import scala.collection.mutable.WrappedArray;
@@ -16,7 +15,8 @@
 import java.util.List;
 import java.util.Map;

-import static org.junit.jupiter.api.Assertions.*;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;

 class ReadMlcpArchiveFilesTest extends AbstractIntegrationTest {

@@ -149,8 +149,7 @@ void complexProperties() {
         assertEquals(1, rows.size());

         XmlNode properties = new XmlNode(rows.get(0).getString(PROPERTIES_COLUMN),
-            Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"),
-            Namespace.getNamespace("flexrep", "http://marklogic.com/xdmp/flexible-replication"));
+            PROPERTIES_NAMESPACE, Namespace.getNamespace("flexrep", "http://marklogic.com/xdmp/flexible-replication"));
         properties.assertElementValue(
             "This verifies that the properties column can contain any serialized string of XML. This is necessary so " +
                 "that complex XML structures can be read from and written to MarkLogic.",
@@ -217,56 +216,6 @@ void dontAbortOnArchiveFileMissingContentEntry() {
             "not thrown.");
     }

-    @Test
-    void nakedProperties() {
-        Dataset<Row> dataset = newSparkSession().read()
-            .format(CONNECTOR_IDENTIFIER)
-            .option(Options.READ_FILES_TYPE, "mlcp_archive")
-            .load("src/test/resources/mlcp-archive-files/naked1.zip");
-
-        List<Row> rows = dataset.collectAsList();
-        assertEquals(2, rows.size(), "The example.xml.naked entry should have produced 1 row.");
-
-        Row nakedRow = rows.get(0);
-        final String expectedNakedPropertiesUrl = "mlcp/naked/example.xml.naked";
-        assertEquals(expectedNakedPropertiesUrl, nakedRow.getString(0));
-        assertTrue(nakedRow.isNullAt(1), "Content should be null.");
-        assertTrue(nakedRow.isNullAt(2), "Format should be null, since there's no content.");
-        assertEquals(0, nakedRow.getList(COLLECTIONS_COLUMN).size(), "Collections list should be empty since they " +
-            "cannot be written for a naked properties fragment.");
-        assertEquals(0, nakedRow.getJavaMap(PERMISSIONS_COLUMN).size(), "Permissions should be empty since they " +
-            "cannot be written for a naked properties fragment.");
-        assertEquals(0, nakedRow.getInt(QUALITY_COLUMN));
-        assertEquals(0, nakedRow.getJavaMap(METADATAVALUES_COLUMN).size(), "Metadata values should be empty since " +
-            "they cannot be written for a naked properties fragment");
-
-        XmlNode properties = new XmlNode(nakedRow.getString(PROPERTIES_COLUMN),
-            Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"));
-        properties.assertElementValue("/prop:properties/priority", "1");
-
-        Row normalRow = rows.get(1);
-        assertEquals("mlcp/xml/1.xml", normalRow.getString(0));
-
-        // Now write it to verify the outcome.
-        dataset.write().format(CONNECTOR_IDENTIFIER)
-            .option(Options.CLIENT_URI, makeClientUri())
-            .option(Options.WRITE_COLLECTIONS, "naked-test")
-            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
-            .mode(SaveMode.Append)
-            .save();
-
-        List<String> uris = getUrisInCollection("naked-test", 1);
-        assertEquals("mlcp/xml/1.xml", uris.get(0));
-
-        String nakedProperties = getDatabaseClient().newServerEval()
-            .xquery(String.format("xdmp:document-properties('%s')", expectedNakedPropertiesUrl))
-            .evalAs(String.class);
-        assertNull(nakedProperties, "The naked properties row should have been ignored during the write, as " +
-            "Java Client 6.6.0 and earlier do not support writing a document with null content via WriteBatcher. " +
-            "This will be fixed in the Java Client 6.6.1 release, at which point we can start writing naked " +
-            "properties fragments correctly.");
-    }
-
     private void verifyFirstRow(Row row) {
         assertEquals("/test/1.xml", row.getString(0));
         XmlNode doc = new XmlNode(new String((byte[]) row.get(1)));
@@ -327,7 +276,7 @@ private void verifyQuality(Row row) {

     private void verifyProperties(Row row) {
         XmlNode properties = new XmlNode(row.getString(PROPERTIES_COLUMN),
-            Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"), Namespace.getNamespace("ex", "org:example"));
+            PROPERTIES_NAMESPACE, Namespace.getNamespace("ex", "org:example"));
         properties.assertElementValue("/prop:properties/ex:key1", "value1");
         properties.assertElementValue("/prop:properties/key2", "value2");
     }
src/test/java/com/marklogic/spark/reader/file/ReadMlcpArchiveWithNakedPropertiesTest.java

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+package com.marklogic.spark.reader.file;
+
+import com.marklogic.junit5.XmlNode;
+import com.marklogic.spark.AbstractIntegrationTest;
+import com.marklogic.spark.Options;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SaveMode;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.util.List;
+import java.util.stream.Stream;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * A "naked properties" URI in MarkLogic is possible by creating a properties fragment at a URI but not
+ * assigning any document content to it. MLCP archives can contain these, and thus we need to support them when reading
+ * an MLCP archive. However, because v1/search cannot find these documents, it's not possible for the archives created
+ * by this connector to contain them.
+ */
+class ReadMlcpArchiveWithNakedPropertiesTest extends AbstractIntegrationTest {
+
+    private static final int PROPERTIES_COLUMN = 6;
+
+    /**
+     * The plumbing in the parent class for deleting documents before a test runs won't catch naked properties created
+     * by this test, so we ensure they're deleted here.
+     */
+    @BeforeEach
+    void deleteNakedPropertiesFromPreviousTestRuns() {
+        Stream.of("example.xml.naked", "example2.xml.naked", "naked/example.xml.naked").forEach(uri -> {
+            String query = String.format("xdmp:document-delete('%s')", uri);
+            try {
+                getDatabaseClient().newServerEval().xquery(query).evalAs(String.class);
+            } catch (Exception e) {
+                logger.debug("Ignoring this error because it's only due to the naked properties fragment not existing");
+            }
+        });
+    }
+
+    @Test
+    void twoNakedEntries() {
+        newSparkSession().read().format(CONNECTOR_IDENTIFIER)
+            .option(Options.READ_FILES_TYPE, "mlcp_archive")
+            .load("src/test/resources/mlcp-archive-files/two-naked-entries.zip")
+            .write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.WRITE_COLLECTIONS, "naked")
+            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
+            .mode(SaveMode.Append)
+            .save();
+
+        assertCollectionSize("Using v1/search should not find the naked URIs since they do not have a document " +
+            "associated with them", "naked", 0);
+
+        Stream.of("example.xml.naked", "example2.xml.naked").forEach(uri -> {
+            String collection = getDatabaseClient().newServerEval()
+                .javascript(String.format("xdmp.documentGetCollections('%s')[0]", uri))
+                .evalAs(String.class);
+            assertEquals("naked", collection, "Each naked properties document should still be assigned to the " +
+                "collection found in its MLCP metadata entry from the archive file. But these URIs aren't returned " +
+                "by v1/search since there are no documents associated with them.");
+        });
+
+        XmlNode props = readDocumentProperties("example.xml.naked");
+        props.assertElementValue("/prop:properties/priority", "1");
+        props = readDocumentProperties("example2.xml.naked");
+        props.assertElementValue("/prop:properties/priority", "2");
+    }
+
+    @Test
+    void normalAndNakedEntry() {
+        Dataset<Row> dataset = newSparkSession().read()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.READ_FILES_TYPE, "mlcp_archive")
+            .load("src/test/resources/mlcp-archive-files/normal-and-naked-entry.zip");
+
+        List<Row> rows = dataset.collectAsList();
+        assertEquals(2, rows.size(), "The example.xml.naked entry should have produced 1 row.");
+        assertEquals("xml/1.xml", rows.get(1).getString(0));
+
+        final String expectedNakedPropertiesUrl = "naked/example.xml.naked";
+        Row nakedRow = rows.get(0);
+        assertEquals(expectedNakedPropertiesUrl, nakedRow.getString(0));
+        assertTrue(nakedRow.isNullAt(1), "Content should be null.");
+        assertTrue(nakedRow.isNullAt(2), "Format should be null, since there's no content.");
+        XmlNode properties = new XmlNode(nakedRow.getString(PROPERTIES_COLUMN), PROPERTIES_NAMESPACE);
+        properties.assertElementValue("/prop:properties/priority", "1");
+
+        // Write the rows to verify that the naked document is created correctly.
+        dataset.write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.WRITE_COLLECTIONS, "naked-test")
+            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
+            .mode(SaveMode.Append)
+            .save();
+
+        List<String> uris = getUrisInCollection("naked-test", 1);
+        assertEquals("xml/1.xml", uris.get(0), "getUrisInCollection uses v1/search to find URIs, and thus it " +
+            "should only find the URI of the normal document and not the one of the naked properties document.");
+
+        XmlNode nakedProperties = readDocumentProperties(expectedNakedPropertiesUrl);
+        nakedProperties.assertElementValue(
+            "As of Java Client 6.6.1, a DMSDK WriteBatcher now allows for a document to have a null content handle, " +
+                "which allows for 'naked properties' URIs to be written.",
+            "/prop:properties/priority", "1");
+    }
+}
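
As the class Javadoc above notes, a naked properties fragment is a properties fragment at a URI with no document, which is why v1/search never returns it. For reference, a hedged sketch of creating and inspecting one directly via the Java Client's server-side eval, mirroring the eval calls these tests use; host, port, credentials, and URI are placeholders.

import com.marklogic.client.DatabaseClient;
import com.marklogic.client.DatabaseClientFactory;

public class NakedPropertiesEval {
    public static void main(String[] args) {
        // Hypothetical connection details.
        DatabaseClient client = DatabaseClientFactory.newClient("localhost", 8000,
            new DatabaseClientFactory.DigestAuthContext("spark-user", "password"));

        // Setting properties on a URI that has no document creates a naked
        // properties fragment; this is typically how such URIs come about.
        client.newServerEval()
            .xquery("xdmp:document-set-properties('example.xml.naked', <priority>1</priority>)")
            .evalAs(String.class);

        // v1/search will not return this URI, but its properties fragment
        // is still readable:
        String props = client.newServerEval()
            .xquery("xdmp:document-properties('example.xml.naked')")
            .evalAs(String.class);
        System.out.println(props);

        client.release();
    }
}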

src/test/java/com/marklogic/spark/writer/file/WriteArchiveTest.java

Lines changed: 1 addition & 1 deletion
@@ -81,7 +81,7 @@ private void verifyMetadata(Row row, String metadataValue) {
         String xml = new String((byte[]) row.get(1));
         XmlNode metadata = new XmlNode(xml,
             Namespace.getNamespace("rapi", "http://marklogic.com/rest-api"),
-            Namespace.getNamespace("prop", "http://marklogic.com/xdmp/property"),
+            PROPERTIES_NAMESPACE,
             Namespace.getNamespace("ex", "org:example"));

         switch (metadataValue) {
