Skip to content

Commit 228340a

Browse files
authored
Merge pull request #350 from marklogic/feature/ignore-bad-file
Logging warning when text cannot be split from document
2 parents 9c97e20 + dcc1af4 commit 228340a

File tree

4 files changed

+53
-3
lines changed

4 files changed

+53
-3
lines changed

src/main/java/com/marklogic/spark/writer/splitter/DOMTextSelector.java

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55

66
import com.marklogic.client.document.DocumentWriteOperation;
77
import com.marklogic.spark.ConnectorException;
8+
import com.marklogic.spark.Util;
89
import com.marklogic.spark.writer.dom.DOMHelper;
910
import org.w3c.dom.Document;
1011
import org.w3c.dom.NodeList;
@@ -28,7 +29,14 @@ public DOMTextSelector(String textExpression, NamespaceContext namespaceContext)
2829

2930
@Override
3031
public String selectTextToSplit(DocumentWriteOperation sourceDocument) {
31-
Document doc = domHelper.extractDocument(sourceDocument);
32+
Document doc;
33+
try {
34+
doc = domHelper.extractDocument(sourceDocument);
35+
} catch (Exception ex) {
36+
Util.MAIN_LOGGER.warn("Unable to select text to split in document: {}; cause: {}", sourceDocument.getUri(), ex.getMessage());
37+
return null;
38+
}
39+
3240
NodeList items;
3341
try {
3442
items = (NodeList) this.textExpression.evaluate(doc, XPathConstants.NODESET);

src/main/java/com/marklogic/spark/writer/splitter/JsonPointerTextSelector.java

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77
import com.fasterxml.jackson.databind.JsonNode;
88
import com.marklogic.client.document.DocumentWriteOperation;
99
import com.marklogic.spark.ConnectorException;
10+
import com.marklogic.spark.Util;
1011
import com.marklogic.spark.writer.JsonUtil;
1112

1213
import java.util.ArrayList;
@@ -33,8 +34,14 @@ public JsonPointerTextSelector(String[] jsonPointerArray, String joinDelimiter)
3334
}
3435

3536
@Override
36-
public String selectTextToSplit(DocumentWriteOperation operation) {
37-
JsonNode doc = JsonUtil.getJsonFromHandle(operation.getContent());
37+
public String selectTextToSplit(DocumentWriteOperation sourceDocument) {
38+
JsonNode doc;
39+
try {
40+
doc = JsonUtil.getJsonFromHandle(sourceDocument.getContent());
41+
} catch (Exception ex) {
42+
Util.MAIN_LOGGER.warn("Unable to select text to split in document: {}; cause: {}", sourceDocument.getUri(), ex.getMessage());
43+
return null;
44+
}
3845

3946
return jsonPointers.stream()
4047
.map(jsonPointer -> {

src/test/java/com/marklogic/spark/writer/splitter/SplitJsonDocumentTest.java

Lines changed: 17 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -315,6 +315,23 @@ void chunksFieldAlreadyExists() {
315315
"already, such that we do not need to provide a configuration option for defining the chunks array name.");
316316
}
317317

318+
@Test
319+
void xpathOnJsonDocument() {
320+
readDocument("/marklogic-docs/java-client-intro.json")
321+
.write().format(CONNECTOR_IDENTIFIER)
322+
.option(Options.CLIENT_URI, makeClientUri())
323+
.option(Options.WRITE_SPLITTER_XPATH, "/text")
324+
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
325+
.option(Options.WRITE_URI_TEMPLATE, "/split-test.json")
326+
.mode(SaveMode.Append)
327+
.save();
328+
329+
JsonNode doc = readJsonDocument("/split-test.json");
330+
assertFalse(doc.has("chunks"), "If a user specifies an XPath split expression and the connector encounters a " +
331+
"non-XML document, a warning should be logged and no chunks should be added. This scenario could happen " +
332+
"when e.g. processing a zip file that contains mostly XML documents, but also a few non-XML documents.");
333+
}
334+
318335
private Dataset<Row> readDocument(String uri) {
319336
return newSparkSession().read().format(CONNECTOR_IDENTIFIER)
320337
.option(Options.CLIENT_URI, makeClientUri())

src/test/java/com/marklogic/spark/writer/splitter/SplitXmlDocumentTest.java

Lines changed: 18 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,24 @@ void undeclaredNamespace() {
7878
);
7979
}
8080

81+
@Test
82+
void jsonPointerOnXmlDocument() {
83+
readDocument("/marklogic-docs/java-client-intro.xml")
84+
.write().format(CONNECTOR_IDENTIFIER)
85+
.option(Options.CLIENT_URI, makeClientUri())
86+
.option(Options.WRITE_SPLITTER_JSON_POINTERS, "/text")
87+
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
88+
.option(Options.WRITE_URI_TEMPLATE, "/split-test.xml")
89+
.mode(SaveMode.Append)
90+
.save();
91+
92+
XmlNode doc = readXmlDocument("/split-test.xml");
93+
doc.assertElementMissing("If a user specifies a JSON Pointer split expression and the connector encounters a " +
94+
"non-JSON document, a warning should be logged and no chunks should be added. This scenario could happen " +
95+
"when e.g. processing a zip file that contains mostly JSON documents, but also a few non-JSON documents.",
96+
"//chunks");
97+
}
98+
8199
@Test
82100
void overlapSizeGreaterThanChunkSize() {
83101
DataFrameWriter writer = readDocument("/marklogic-docs/java-client-intro.xml")

0 commit comments

Comments
 (0)