Skip to content

Commit 9ee92d3

Browse files
authored
Merge pull request #251 from marklogic/feature/15472-archive-encoding
MLE-15472 Metadata is now encoded in archive files
2 parents f343694 + 3fb664a commit 9ee92d3

File tree

2 files changed

+87
-1
lines changed

2 files changed

+87
-1
lines changed

src/main/java/com/marklogic/spark/writer/file/ContentWriter.java

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,14 @@ void writeContent(InternalRow row, OutputStream outputStream) throws IOException
5757
}
5858

5959
void writeMetadata(InternalRow row, OutputStream outputStream) throws IOException {
60-
outputStream.write(DocumentRowSchema.makeDocumentMetadata(row).toString().getBytes());
60+
String metadataXml = DocumentRowSchema.makeDocumentMetadata(row).toString();
61+
// Must honor the encoding here as well, as a user could easily have values that require encoding in metadata
62+
// values or in a properties fragment.
63+
if (this.encoding != null) {
64+
outputStream.write(metadataXml.getBytes(this.encoding));
65+
} else {
66+
outputStream.write(metadataXml.getBytes());
67+
}
6168
}
6269

6370
private Charset determineEncoding(Map<String, String> properties) {
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
package com.marklogic.spark.writer.file;
2+
3+
import com.fasterxml.jackson.databind.JsonNode;
4+
import com.marklogic.spark.AbstractIntegrationTest;
5+
import com.marklogic.spark.Options;
6+
import org.apache.spark.sql.Row;
7+
import org.apache.spark.sql.SaveMode;
8+
import org.junit.jupiter.api.Test;
9+
import org.junit.jupiter.api.io.TempDir;
10+
11+
import java.nio.file.Path;
12+
import java.util.List;
13+
import java.util.Map;
14+
15+
import static org.junit.jupiter.api.Assertions.assertEquals;
16+
17+
class WriteArchiveWithEncodingTest extends AbstractIntegrationTest {
18+
19+
@Test
20+
void test(@TempDir Path tempDir) {
21+
addMetadataToTestDocument();
22+
23+
// Write the JSON test document to an archive with ISO encoding, including its metadata.
24+
newSparkSession().read()
25+
.format(CONNECTOR_IDENTIFIER)
26+
.option(Options.CLIENT_URI, makeClientUri())
27+
.option(Options.READ_DOCUMENTS_URIS, "/utf8-sample.json")
28+
.option(Options.READ_DOCUMENTS_CATEGORIES, "content,metadata")
29+
.load()
30+
.repartition(1)
31+
.write()
32+
.format(CONNECTOR_IDENTIFIER)
33+
.option(Options.WRITE_FILES_ENCODING, "ISO-8859-1")
34+
.option(Options.WRITE_FILES_COMPRESSION, "zip")
35+
.mode(SaveMode.Append)
36+
.save(tempDir.toFile().getAbsolutePath());
37+
38+
// Read the archive with ISO encoding and loading it into MarkLogic.
39+
sparkSession.read()
40+
.format(CONNECTOR_IDENTIFIER)
41+
.option(Options.READ_FILES_ENCODING, "ISO-8859-1")
42+
.option(Options.READ_FILES_COMPRESSION, "zip")
43+
.option(Options.READ_FILES_TYPE, "archive")
44+
.load(tempDir.toAbsolutePath().toString())
45+
.write().format(CONNECTOR_IDENTIFIER)
46+
.option(Options.CLIENT_URI, makeClientUri())
47+
.option(Options.WRITE_COLLECTIONS, "loaded-data")
48+
.option(Options.WRITE_URI_PREFIX, "/loaded")
49+
.mode(SaveMode.Append)
50+
.save();
51+
52+
JsonNode doc = readJsonDocument("/loaded/utf8-sample.json");
53+
assertEquals("MaryZhengäöüß??", doc.get("text").asText(), "The value should be mostly the same as the " +
54+
"original value, except for the last two characters which are replaced when encoded to ISO-8859-1.");
55+
56+
// Read the loaded document.
57+
List<Row> rows = sparkSession.read().format(CONNECTOR_IDENTIFIER)
58+
.option(Options.CLIENT_URI, makeClientUri())
59+
.option(Options.READ_DOCUMENTS_URIS, "/loaded/utf8-sample.json")
60+
.option(Options.READ_DOCUMENTS_CATEGORIES, "metadatavalues")
61+
.load().collectAsList();
62+
63+
Map<String, String> metadata = rows.get(0).getJavaMap(7);
64+
assertEquals("MaryZhengäöüß??", metadata.get("text"), "The user-defined encoding should be applied to " +
65+
"each metadata entry in the archive file as well. This ensures that the encoding is applied to things " +
66+
"like metadata values and properties fragments, where a user is free to capture any text they want.");
67+
}
68+
69+
/**
70+
* It's fine to add this to the test document, which is created by the test app. It won't impact any other tests,
71+
* and it can be run repeatedly without any ill effects.
72+
*/
73+
private void addMetadataToTestDocument() {
74+
getDatabaseClient().newServerEval()
75+
.javascript("declareUpdate(); " +
76+
"xdmp.documentSetMetadata('/utf8-sample.json', {\"text\": \"MaryZhengäöüß测试\"})")
77+
.evalAs(String.class);
78+
}
79+
}

0 commit comments

Comments
 (0)