Skip to content

Commit f343694

Browse files
authored
Merge pull request #250 from marklogic/feature/15472-write-encoding
MLE-15472 Can now specify encoding when writing files
2 parents 9192a17 + fa5fbd7 commit f343694

File tree

8 files changed

+217
-195
lines changed

8 files changed

+217
-195
lines changed

src/main/java/com/marklogic/spark/Options.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,9 @@ public abstract class Options {
140140
// Applies to XML and JSON documents.
141141
public static final String WRITE_FILES_PRETTY_PRINT = "spark.marklogic.write.files.prettyPrint";
142142

143+
// Applies to writing documents as files, gzipped files, and as entries in zips/archives.
144+
public static final String WRITE_FILES_ENCODING = "spark.marklogic.write.files.encoding";
145+
143146
public static final String WRITE_RDF_FILES_FORMAT = "spark.marklogic.write.files.rdf.format";
144147
public static final String WRITE_RDF_FILES_GRAPH = "spark.marklogic.write.files.rdf.graph";
145148

src/main/java/com/marklogic/spark/reader/file/FileContext.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,6 @@ boolean isReadAbortOnFailure() {
6363

6464
byte[] readBytes(InputStream inputStream) throws IOException {
6565
byte[] bytes = FileUtil.readBytes(inputStream);
66-
return this.encoding != null ? new String(bytes).getBytes(encoding) : bytes;
66+
return this.encoding != null ? new String(bytes, this.encoding).getBytes() : bytes;
6767
}
6868
}

src/main/java/com/marklogic/spark/writer/file/ContentWriter.java

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import java.io.ByteArrayInputStream;
1515
import java.io.IOException;
1616
import java.io.OutputStream;
17+
import java.nio.charset.Charset;
1718
import java.util.Map;
1819

1920
/**
@@ -26,8 +27,10 @@ class ContentWriter {
2627
private final Transformer transformer;
2728
private final ObjectMapper objectMapper;
2829
private final boolean prettyPrint;
30+
private final Charset encoding;
2931

3032
ContentWriter(Map<String, String> properties) {
33+
this.encoding = determineEncoding(properties);
3134
this.prettyPrint = "true".equalsIgnoreCase(properties.get(Options.WRITE_FILES_PRETTY_PRINT));
3235
if (this.prettyPrint) {
3336
this.objectMapper = new ObjectMapper();
@@ -42,12 +45,31 @@ void writeContent(InternalRow row, OutputStream outputStream) throws IOException
4245
if (this.prettyPrint) {
4346
prettyPrintContent(row, outputStream);
4447
} else {
45-
outputStream.write(row.getBinary(1));
48+
byte[] bytes = row.getBinary(1);
49+
if (this.encoding != null) {
50+
// We know the string from MarkLogic is UTF-8, so we use getBytes to convert it to the user's
51+
// specified encoding. NOTE(review): new String(bytes) decodes using the platform default
// charset, not necessarily UTF-8 — prefer new String(bytes, StandardCharsets.UTF_8) here.
52+
outputStream.write(new String(bytes).getBytes(this.encoding));
53+
} else {
54+
outputStream.write(row.getBinary(1));
55+
}
4656
}
4757
}
4858

4959
void writeMetadata(InternalRow row, OutputStream outputStream) throws IOException {
50-
outputStream.write(DocumentRowSchema.makeDocumentMetadata(row).toString().getBytes());
60+
outputStream.write(DocumentRowSchema.makeDocumentMetadata(row).toString().getBytes());
61+
}
62+
63+
private Charset determineEncoding(Map<String, String> properties) {
64+
String encodingValue = properties.get(Options.WRITE_FILES_ENCODING);
65+
if (encodingValue != null && encodingValue.trim().length() > 0) {
66+
try {
67+
return Charset.forName(encodingValue);
68+
} catch (Exception ex) {
69+
throw new ConnectorException(String.format("Unsupported encoding value: %s", encodingValue), ex);
70+
}
71+
}
72+
return null;
5173
}
5274

5375
private Transformer newTransformer() {
@@ -59,7 +81,11 @@ private Transformer newTransformer() {
5981
factory.setAttribute(XMLConstants.ACCESS_EXTERNAL_DTD, "");
6082
factory.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");
6183
final Transformer t = factory.newTransformer();
62-
t.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
84+
if (this.encoding != null) {
85+
t.setOutputProperty(OutputKeys.ENCODING, this.encoding.name());
86+
} else {
87+
t.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
88+
}
6389
t.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
6490
t.setOutputProperty(OutputKeys.INDENT, "yes");
6591
return t;
@@ -78,13 +104,22 @@ private void prettyPrintContent(InternalRow row, OutputStream outputStream) thro
78104
} else if ("XML".equalsIgnoreCase(format)) {
79105
prettyPrintXml(content, outputStream);
80106
} else {
81-
outputStream.write(content);
107+
if (this.encoding != null) {
108+
outputStream.write(new String(content).getBytes(this.encoding));
109+
} else {
110+
outputStream.write(content);
111+
}
82112
}
83113
}
84114

85115
private void prettyPrintJson(byte[] content, OutputStream outputStream) throws IOException {
86116
JsonNode node = this.objectMapper.readTree(content);
87-
outputStream.write(node.toPrettyString().getBytes());
117+
String prettyJson = node.toPrettyString();
118+
if (this.encoding != null) {
119+
outputStream.write(prettyJson.getBytes(this.encoding));
120+
} else {
121+
outputStream.write(prettyJson.getBytes());
122+
}
88123
}
89124

90125
private void prettyPrintXml(byte[] content, OutputStream outputStream) {

src/test/java/com/marklogic/spark/reader/file/ReadGenericFilesTest.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,13 +81,14 @@ void customEncoding() {
8181
.write().format(CONNECTOR_IDENTIFIER)
8282
.option(Options.CLIENT_URI, makeClientUri())
8383
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
84-
.option(Options.WRITE_COLLECTIONS, "encoding-test")
84+
.option(Options.WRITE_URI_TEMPLATE, "/iso-doc.xml")
8585
.mode(SaveMode.Append)
8686
.save();
8787

88-
String uri = getUrisInCollection("encoding-test", 1).get(0);
89-
XmlNode doc = readXmlDocument(uri);
88+
XmlNode doc = readXmlDocument("/iso-doc.xml");
9089
doc.assertElementExists("/MedlineCitationSet");
90+
doc.assertElementValue("/MedlineCitationSet/MedlineCitation/Affiliation",
91+
"Istituto di Anatomia e Istologia Patologica, Università di Ferrara, Italy.");
9192
}
9293

9394
@Test
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
package com.marklogic.spark.writer.file;
2+
3+
import com.fasterxml.jackson.databind.JsonNode;
4+
import com.marklogic.junit5.XmlNode;
5+
import com.marklogic.spark.AbstractIntegrationTest;
6+
import com.marklogic.spark.ConnectorException;
7+
import com.marklogic.spark.Options;
8+
import org.apache.spark.sql.DataFrameWriter;
9+
import org.apache.spark.sql.SaveMode;
10+
import org.junit.jupiter.api.Test;
11+
import org.junit.jupiter.api.io.TempDir;
12+
import org.springframework.util.FileCopyUtils;
13+
14+
import java.io.File;
15+
import java.io.IOException;
16+
import java.nio.file.Path;
17+
18+
import static org.junit.jupiter.api.Assertions.assertEquals;
19+
import static org.junit.jupiter.api.Assertions.assertTrue;
20+
21+
/**
22+
* These tests are simpler than they look at first glance. Each one reads a doc from MarkLogic that contains characters
23+
* supported by UTF-8 but not supported by ISO-8859-1. The test then writes the doc to a file using ISO-8859-1. It then
24+
* reads the file and loads it back into MarkLogic and verifies that the contents of both the written file and written
25+
* document meet the expectations for ISO-8859-1 encoding.
26+
*/
27+
class WriteFilesWithEncodingTest extends AbstractIntegrationTest {
28+
29+
private static final String ISO_ENCODING = "ISO-8859-1";
30+
private static final String SAMPLE_XML_DOC_URI = "/utf8-sample.xml";
31+
private static final String SAMPLE_JSON_DOC_URI = "/utf8-sample.json";
32+
private static final String ORIGINAL_XML_TEXT = "UTF-8 Text: MaryZhengäöüß测试";
33+
34+
@Test
35+
void writeXmlFile(@TempDir Path tempDir) {
36+
XmlNode sampleDoc = readXmlDocument(SAMPLE_XML_DOC_URI);
37+
sampleDoc.assertElementValue(
38+
"Verifying that the sample doc was loaded correctly in the test app; also showing what the text looks like " +
39+
"to make this test easier to understand.",
40+
"/doc", ORIGINAL_XML_TEXT);
41+
42+
newSparkSession().read().format(CONNECTOR_IDENTIFIER)
43+
.option(Options.CLIENT_URI, makeClientUri())
44+
.option(Options.READ_DOCUMENTS_URIS, SAMPLE_XML_DOC_URI)
45+
.load()
46+
.write().format(CONNECTOR_IDENTIFIER)
47+
.option(Options.WRITE_FILES_ENCODING, ISO_ENCODING)
48+
.mode(SaveMode.Append)
49+
.save(tempDir.toAbsolutePath().toString());
50+
51+
String fileContent = readFileContents(tempDir, "utf8-sample.xml");
52+
assertTrue(fileContent.contains("<doc>UTF-8 Text: MaryZheng����??</doc>"),
53+
"Unexpected file content: " + fileContent);
54+
55+
newSparkSession().read()
56+
.format(CONNECTOR_IDENTIFIER)
57+
.option(Options.READ_FILES_ENCODING, ISO_ENCODING)
58+
.load(tempDir.toAbsolutePath().toString())
59+
.write().format(CONNECTOR_IDENTIFIER)
60+
.option(Options.CLIENT_URI, makeClientUri())
61+
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
62+
.option(Options.WRITE_URI_TEMPLATE, "/iso-doc.xml")
63+
.mode(SaveMode.Append)
64+
.save();
65+
66+
XmlNode doc = readXmlDocument("/iso-doc.xml");
67+
doc.assertElementValue(
68+
"Verifies that the ISO-encoded text is then converted back to UTF-8 when stored in MarkLogic, but the " +
69+
"value is slightly different due to the use of replacement characters in ISO-8859-1.",
70+
"/doc", "UTF-8 Text: MaryZhengäöüß??");
71+
}
72+
73+
@Test
74+
void prettyPrintXmlFile(@TempDir Path tempDir) {
75+
newSparkSession().read().format(CONNECTOR_IDENTIFIER)
76+
.option(Options.CLIENT_URI, makeClientUri())
77+
.option(Options.READ_DOCUMENTS_URIS, SAMPLE_XML_DOC_URI)
78+
.load()
79+
.write().format(CONNECTOR_IDENTIFIER)
80+
.option(Options.WRITE_FILES_ENCODING, ISO_ENCODING)
81+
.option(Options.WRITE_FILES_PRETTY_PRINT, true)
82+
.mode(SaveMode.Append)
83+
.save(tempDir.toAbsolutePath().toString());
84+
85+
String fileContent = readFileContents(tempDir, "utf8-sample.xml");
86+
assertTrue(fileContent.contains("<doc>UTF-8 Text: MaryZheng����&#27979;&#35797;</doc>"),
87+
"Pretty-printing results in some of the characters being escaped by the Java Transformer class, " +
88+
"even though it's been configured to use the user-specified encoding. Unexpected text: " + fileContent);
89+
90+
newSparkSession().read()
91+
.format(CONNECTOR_IDENTIFIER)
92+
.option(Options.READ_FILES_ENCODING, ISO_ENCODING)
93+
.load(tempDir.toAbsolutePath().toString())
94+
.write().format(CONNECTOR_IDENTIFIER)
95+
.option(Options.CLIENT_URI, makeClientUri())
96+
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
97+
.option(Options.WRITE_URI_TEMPLATE, "/iso-doc.xml")
98+
.mode(SaveMode.Append)
99+
.save();
100+
101+
XmlNode doc = readXmlDocument("/iso-doc.xml");
102+
doc.assertElementValue(
103+
"The written doc should have the original XML text, as the problematic characters for ISO-8859-1 were " +
104+
"escaped by the Java Transformer class during the pretty-printing process. This shows that " +
105+
"pretty-printing can actually result in fewer characters being altered via replacement tokens.",
106+
"/doc", ORIGINAL_XML_TEXT);
107+
}
108+
109+
@Test
110+
void prettyPrintJsonFile(@TempDir Path tempDir) {
111+
newSparkSession().read().format(CONNECTOR_IDENTIFIER)
112+
.option(Options.CLIENT_URI, makeClientUri())
113+
.option(Options.READ_DOCUMENTS_URIS, SAMPLE_JSON_DOC_URI)
114+
.load()
115+
.write().format(CONNECTOR_IDENTIFIER)
116+
.option(Options.WRITE_FILES_ENCODING, ISO_ENCODING)
117+
.option(Options.WRITE_FILES_PRETTY_PRINT, true)
118+
.mode(SaveMode.Append)
119+
.save(tempDir.toAbsolutePath().toString());
120+
121+
String fileContent = readFileContents(tempDir, "utf8-sample.json");
122+
assertTrue(fileContent.contains("MaryZheng����??"),
123+
"Pretty-printing JSON doesn't impact the encoding at all since the underlying Jackson library " +
124+
"doesn't need to escape any of the characters. Unexpected text: " + fileContent);
125+
126+
newSparkSession().read()
127+
.format(CONNECTOR_IDENTIFIER)
128+
.option(Options.READ_FILES_ENCODING, ISO_ENCODING)
129+
.load(tempDir.toAbsolutePath().toString())
130+
.write().format(CONNECTOR_IDENTIFIER)
131+
.option(Options.CLIENT_URI, makeClientUri())
132+
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
133+
.option(Options.WRITE_URI_TEMPLATE, "/iso-doc.json")
134+
.mode(SaveMode.Append)
135+
.save();
136+
137+
JsonNode doc = readJsonDocument("/iso-doc.json");
138+
assertEquals("MaryZhengäöüß??", doc.get("text").asText());
139+
}
140+
141+
@Test
142+
void invalidEncoding(@TempDir Path tempDir) {
143+
DataFrameWriter writer = newSparkSession().read().format(CONNECTOR_IDENTIFIER)
144+
.option(Options.CLIENT_URI, makeClientUri())
145+
.option(Options.READ_DOCUMENTS_URIS, SAMPLE_JSON_DOC_URI)
146+
.load()
147+
.write().format(CONNECTOR_IDENTIFIER)
148+
.option(Options.WRITE_FILES_ENCODING, "not-valid-encoding")
149+
.mode(SaveMode.Append);
150+
151+
ConnectorException ex = assertThrowsConnectorException(() -> writer.save(tempDir.toAbsolutePath().toString()));
152+
assertEquals("Unsupported encoding value: not-valid-encoding", ex.getMessage());
153+
}
154+
155+
private String readFileContents(Path tempDir, String filename) {
156+
File file = new File(tempDir.toFile(), filename);
157+
try {
158+
return new String(FileCopyUtils.copyToByteArray(file));
159+
} catch (IOException e) {
160+
throw new RuntimeException(e);
161+
}
162+
}
163+
}

src/test/ml-data/utf8-sample.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"text": "MaryZhengäöüß测试"
3+
}

src/test/ml-data/utf8-sample.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<doc>UTF-8 Text: MaryZhengäöüß测试</doc>

0 commit comments

Comments
 (0)