Skip to content

Commit f343694

Browse files
authored
Merge pull request #250 from marklogic/feature/15472-write-encoding
MLE-15472 Can now specify encoding when writing files
2 parents 9192a17 + fa5fbd7 commit f343694

File tree

8 files changed

+217
-195
lines changed

8 files changed

+217
-195
lines changed

src/main/java/com/marklogic/spark/Options.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,9 @@ public abstract class Options {
140140
// Applies to XML and JSON documents.
141141
public static final String WRITE_FILES_PRETTY_PRINT = "spark.marklogic.write.files.prettyPrint";
142142

143+
// Applies to writing documents as files, gzipped files, and as entries in zips/archives.
144+
public static final String WRITE_FILES_ENCODING = "spark.marklogic.write.files.encoding";
145+
143146
public static final String WRITE_RDF_FILES_FORMAT = "spark.marklogic.write.files.rdf.format";
144147
public static final String WRITE_RDF_FILES_GRAPH = "spark.marklogic.write.files.rdf.graph";
145148

src/main/java/com/marklogic/spark/reader/file/FileContext.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,6 @@ boolean isReadAbortOnFailure() {
6363

6464
byte[] readBytes(InputStream inputStream) throws IOException {
6565
byte[] bytes = FileUtil.readBytes(inputStream);
66-
return this.encoding != null ? new String(bytes).getBytes(encoding) : bytes;
66+
return this.encoding != null ? new String(bytes, this.encoding).getBytes() : bytes;
6767
}
6868
}

src/main/java/com/marklogic/spark/writer/file/ContentWriter.java

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
import java.io.ByteArrayInputStream;
1515
import java.io.IOException;
1616
import java.io.OutputStream;
17+
import java.nio.charset.Charset;
1718
import java.util.Map;
1819

1920
/**
@@ -26,8 +27,10 @@ class ContentWriter {
2627
private final Transformer transformer;
2728
private final ObjectMapper objectMapper;
2829
private final boolean prettyPrint;
30+
private final Charset encoding;
2931

3032
ContentWriter(Map<String, String> properties) {
33+
this.encoding = determineEncoding(properties);
3134
this.prettyPrint = "true".equalsIgnoreCase(properties.get(Options.WRITE_FILES_PRETTY_PRINT));
3235
if (this.prettyPrint) {
3336
this.objectMapper = new ObjectMapper();
@@ -42,12 +45,31 @@ void writeContent(InternalRow row, OutputStream outputStream) throws IOException
4245
if (this.prettyPrint) {
4346
prettyPrintContent(row, outputStream);
4447
} else {
45-
outputStream.write(row.getBinary(1));
48+
byte[] bytes = row.getBinary(1);
49+
if (this.encoding != null) {
50+
// We know the string from MarkLogic is UTF-8, so we use getBytes to convert it to the user's
51+
// specified encoding. NOTE(review): new String(bytes) decodes using the platform default
// charset, not necessarily UTF-8 — prefer new String(bytes, StandardCharsets.UTF_8) here.
52+
outputStream.write(new String(bytes).getBytes(this.encoding));
53+
} else {
54+
outputStream.write(row.getBinary(1));
55+
}
4656
}
4757
}
4858

4959
void writeMetadata(InternalRow row, OutputStream outputStream) throws IOException {
50-
outputStream.write(DocumentRowSchema.makeDocumentMetadata(row).toString().getBytes());
60+
outputStream.write(DocumentRowSchema.makeDocumentMetadata(row).toString().getBytes());
61+
}
62+
63+
private Charset determineEncoding(Map<String, String> properties) {
64+
String encodingValue = properties.get(Options.WRITE_FILES_ENCODING);
65+
if (encodingValue != null && encodingValue.trim().length() > 0) {
66+
try {
67+
return Charset.forName(encodingValue);
68+
} catch (Exception ex) {
69+
throw new ConnectorException(String.format("Unsupported encoding value: %s", encodingValue), ex);
70+
}
71+
}
72+
return null;
5173
}
5274

5375
private Transformer newTransformer() {
@@ -59,7 +81,11 @@ private Transformer newTransformer() {
5981
factory.setAttribute(XMLConstants.ACCESS_EXTERNAL_DTD, "");
6082
factory.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");
6183
final Transformer t = factory.newTransformer();
62-
t.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
84+
if (this.encoding != null) {
85+
t.setOutputProperty(OutputKeys.ENCODING, this.encoding.name());
86+
} else {
87+
t.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
88+
}
6389
t.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
6490
t.setOutputProperty(OutputKeys.INDENT, "yes");
6591
return t;
@@ -78,13 +104,22 @@ private void prettyPrintContent(InternalRow row, OutputStream outputStream) thro
78104
} else if ("XML".equalsIgnoreCase(format)) {
79105
prettyPrintXml(content, outputStream);
80106
} else {
81-
outputStream.write(content);
107+
if (this.encoding != null) {
108+
outputStream.write(new String(content).getBytes(this.encoding));
109+
} else {
110+
outputStream.write(content);
111+
}
82112
}
83113
}
84114

85115
private void prettyPrintJson(byte[] content, OutputStream outputStream) throws IOException {
86116
JsonNode node = this.objectMapper.readTree(content);
87-
outputStream.write(node.toPrettyString().getBytes());
117+
String prettyJson = node.toPrettyString();
118+
if (this.encoding != null) {
119+
outputStream.write(prettyJson.getBytes(this.encoding));
120+
} else {
121+
outputStream.write(prettyJson.getBytes());
122+
}
88123
}
89124

90125
private void prettyPrintXml(byte[] content, OutputStream outputStream) {

src/test/java/com/marklogic/spark/reader/file/ReadGenericFilesTest.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -81,13 +81,14 @@ void customEncoding() {
8181
.write().format(CONNECTOR_IDENTIFIER)
8282
.option(Options.CLIENT_URI, makeClientUri())
8383
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
84-
.option(Options.WRITE_COLLECTIONS, "encoding-test")
84+
.option(Options.WRITE_URI_TEMPLATE, "/iso-doc.xml")
8585
.mode(SaveMode.Append)
8686
.save();
8787

88-
String uri = getUrisInCollection("encoding-test", 1).get(0);
89-
XmlNode doc = readXmlDocument(uri);
88+
XmlNode doc = readXmlDocument("/iso-doc.xml");
9089
doc.assertElementExists("/MedlineCitationSet");
90+
doc.assertElementValue("/MedlineCitationSet/MedlineCitation/Affiliation",
91+
"Istituto di Anatomia e Istologia Patologica, Università di Ferrara, Italy.");
9192
}
9293

9394
@Test
Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
package com.marklogic.spark.writer.file;
2+
3+
import com.fasterxml.jackson.databind.JsonNode;
4+
import com.marklogic.junit5.XmlNode;
5+
import com.marklogic.spark.AbstractIntegrationTest;
6+
import com.marklogic.spark.ConnectorException;
7+
import com.marklogic.spark.Options;
8+
import org.apache.spark.sql.DataFrameWriter;
9+
import org.apache.spark.sql.SaveMode;
10+
import org.junit.jupiter.api.Test;
11+
import org.junit.jupiter.api.io.TempDir;
12+
import org.springframework.util.FileCopyUtils;
13+
14+
import java.io.File;
15+
import java.io.IOException;
16+
import java.nio.file.Path;
17+
18+
import static org.junit.jupiter.api.Assertions.assertEquals;
19+
import static org.junit.jupiter.api.Assertions.assertTrue;
20+
21+
/**
22+
* These tests are simpler than they look at first glance. Each one reads a doc from MarkLogic that contains characters
23+
* supported by UTF-8 but not supported by ISO-8859-1. The test then writes the doc to a file using ISO-8859-1. It then
24+
* reads the file and loads it back into MarkLogic and verifies that the contents of both the written file and written
25+
* document meet the expectations for ISO-8859-1 encoding.
26+
*/
27+
class WriteFilesWithEncodingTest extends AbstractIntegrationTest {
28+
29+
private static final String ISO_ENCODING = "ISO-8859-1";
30+
private static final String SAMPLE_XML_DOC_URI = "/utf8-sample.xml";
31+
private static final String SAMPLE_JSON_DOC_URI = "/utf8-sample.json";
32+
private static final String ORIGINAL_XML_TEXT = "UTF-8 Text: MaryZhengäöüß测试";
33+
34+
@Test
35+
void writeXmlFile(@TempDir Path tempDir) {
36+
XmlNode sampleDoc = readXmlDocument(SAMPLE_XML_DOC_URI);
37+
sampleDoc.assertElementValue(
38+
"Verifying that the sample doc was loaded correctly in the test app; also showing what the text looks like " +
39+
"to make this test easier to understand.",
40+
"/doc", ORIGINAL_XML_TEXT);
41+
42+
newSparkSession().read().format(CONNECTOR_IDENTIFIER)
43+
.option(Options.CLIENT_URI, makeClientUri())
44+
.option(Options.READ_DOCUMENTS_URIS, SAMPLE_XML_DOC_URI)
45+
.load()
46+
.write().format(CONNECTOR_IDENTIFIER)
47+
.option(Options.WRITE_FILES_ENCODING, ISO_ENCODING)
48+
.mode(SaveMode.Append)
49+
.save(tempDir.toAbsolutePath().toString());
50+
51+
String fileContent = readFileContents(tempDir, "utf8-sample.xml");
52+
assertTrue(fileContent.contains("<doc>UTF-8 Text: MaryZheng����??</doc>"),
53+
"Unexpected file content: " + fileContent);
54+
55+
newSparkSession().read()
56+
.format(CONNECTOR_IDENTIFIER)
57+
.option(Options.READ_FILES_ENCODING, ISO_ENCODING)
58+
.load(tempDir.toAbsolutePath().toString())
59+
.write().format(CONNECTOR_IDENTIFIER)
60+
.option(Options.CLIENT_URI, makeClientUri())
61+
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
62+
.option(Options.WRITE_URI_TEMPLATE, "/iso-doc.xml")
63+
.mode(SaveMode.Append)
64+
.save();
65+
66+
XmlNode doc = readXmlDocument("/iso-doc.xml");
67+
doc.assertElementValue(
68+
"Verifies that the ISO-encoded text is then converted back to UTF-8 when stored in MarkLogic, but the " +
69+
"value is slightly different due to the use of replacement characters in ISO-8859-1.",
70+
"/doc", "UTF-8 Text: MaryZhengäöüß??");
71+
}
72+
73+
@Test
74+
void prettyPrintXmlFile(@TempDir Path tempDir) {
75+
newSparkSession().read().format(CONNECTOR_IDENTIFIER)
76+
.option(Options.CLIENT_URI, makeClientUri())
77+
.option(Options.READ_DOCUMENTS_URIS, SAMPLE_XML_DOC_URI)
78+
.load()
79+
.write().format(CONNECTOR_IDENTIFIER)
80+
.option(Options.WRITE_FILES_ENCODING, ISO_ENCODING)
81+
.option(Options.WRITE_FILES_PRETTY_PRINT, true)
82+
.mode(SaveMode.Append)
83+
.save(tempDir.toAbsolutePath().toString());
84+
85+
String fileContent = readFileContents(tempDir, "utf8-sample.xml");
86+
assertTrue(fileContent.contains("<doc>UTF-8 Text: MaryZheng����&#27979;&#35797;</doc>"),
87+
"Pretty-printing results in some of the characters being escaped by the Java Transformer class, " +
88+
"even though it's been configured to use the user-specified encoding. Unexpected text: " + fileContent);
89+
90+
newSparkSession().read()
91+
.format(CONNECTOR_IDENTIFIER)
92+
.option(Options.READ_FILES_ENCODING, ISO_ENCODING)
93+
.load(tempDir.toAbsolutePath().toString())
94+
.write().format(CONNECTOR_IDENTIFIER)
95+
.option(Options.CLIENT_URI, makeClientUri())
96+
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
97+
.option(Options.WRITE_URI_TEMPLATE, "/iso-doc.xml")
98+
.mode(SaveMode.Append)
99+
.save();
100+
101+
XmlNode doc = readXmlDocument("/iso-doc.xml");
102+
doc.assertElementValue(
103+
"The written doc should have the original XML text, as the problematic characters for ISO-8859-1 were " +
104+
"escaped by the Java Transformer class during the pretty-printing process. This shows that " +
105+
"pretty-printing can actually result in fewer characters being altered via replacement tokens.",
106+
"/doc", ORIGINAL_XML_TEXT);
107+
}
108+
109+
@Test
110+
void prettyPrintJsonFile(@TempDir Path tempDir) {
111+
newSparkSession().read().format(CONNECTOR_IDENTIFIER)
112+
.option(Options.CLIENT_URI, makeClientUri())
113+
.option(Options.READ_DOCUMENTS_URIS, SAMPLE_JSON_DOC_URI)
114+
.load()
115+
.write().format(CONNECTOR_IDENTIFIER)
116+
.option(Options.WRITE_FILES_ENCODING, ISO_ENCODING)
117+
.option(Options.WRITE_FILES_PRETTY_PRINT, true)
118+
.mode(SaveMode.Append)
119+
.save(tempDir.toAbsolutePath().toString());
120+
121+
String fileContent = readFileContents(tempDir, "utf8-sample.json");
122+
assertTrue(fileContent.contains("MaryZheng����??"),
123+
"Pretty-printing JSON doesn't impact the encoding at all since the underlying Jackson library " +
124+
"doesn't need to escape any of the characters. Unexpected text: " + fileContent);
125+
126+
newSparkSession().read()
127+
.format(CONNECTOR_IDENTIFIER)
128+
.option(Options.READ_FILES_ENCODING, ISO_ENCODING)
129+
.load(tempDir.toAbsolutePath().toString())
130+
.write().format(CONNECTOR_IDENTIFIER)
131+
.option(Options.CLIENT_URI, makeClientUri())
132+
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
133+
.option(Options.WRITE_URI_TEMPLATE, "/iso-doc.json")
134+
.mode(SaveMode.Append)
135+
.save();
136+
137+
JsonNode doc = readJsonDocument("/iso-doc.json");
138+
assertEquals("MaryZhengäöüß??", doc.get("text").asText());
139+
}
140+
141+
@Test
142+
void invalidEncoding(@TempDir Path tempDir) {
143+
DataFrameWriter writer = newSparkSession().read().format(CONNECTOR_IDENTIFIER)
144+
.option(Options.CLIENT_URI, makeClientUri())
145+
.option(Options.READ_DOCUMENTS_URIS, SAMPLE_JSON_DOC_URI)
146+
.load()
147+
.write().format(CONNECTOR_IDENTIFIER)
148+
.option(Options.WRITE_FILES_ENCODING, "not-valid-encoding")
149+
.mode(SaveMode.Append);
150+
151+
ConnectorException ex = assertThrowsConnectorException(() -> writer.save(tempDir.toAbsolutePath().toString()));
152+
assertEquals("Unsupported encoding value: not-valid-encoding", ex.getMessage());
153+
}
154+
155+
private String readFileContents(Path tempDir, String filename) {
156+
File file = new File(tempDir.toFile(), filename);
157+
try {
158+
return new String(FileCopyUtils.copyToByteArray(file));
159+
} catch (IOException e) {
160+
throw new RuntimeException(e);
161+
}
162+
}
163+
}

src/test/ml-data/utf8-sample.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"text": "MaryZhengäöüß测试"
3+
}

src/test/ml-data/utf8-sample.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
<doc>UTF-8 Text: MaryZhengäöüß测试</doc>

0 commit comments

Comments
 (0)