package com.marklogic.spark.writer.file;

import com.fasterxml.jackson.databind.JsonNode;
import com.marklogic.junit5.XmlNode;
import com.marklogic.spark.AbstractIntegrationTest;
import com.marklogic.spark.ConnectorException;
import com.marklogic.spark.Options;
import org.apache.spark.sql.DataFrameWriter;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import org.springframework.util.FileCopyUtils;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;
/**
 * These tests are simpler than they look at first glance. Each one reads a doc from MarkLogic that contains characters
 * supported by UTF-8 but not supported by ISO-8859-1. The test then writes the doc to a file using ISO-8859-1. It then
 * reads the file and loads it back into MarkLogic and verifies that the contents of both the written file and written
 * document meet the expectations for ISO-8859-1 encoding.
 */
class WriteFilesWithEncodingTest extends AbstractIntegrationTest {

    private static final String ISO_ENCODING = "ISO-8859-1";
    private static final String SAMPLE_XML_DOC_URI = "/utf8-sample.xml";
    private static final String SAMPLE_JSON_DOC_URI = "/utf8-sample.json";
    private static final String ORIGINAL_XML_TEXT = "UTF-8 Text: MaryZhengäöüß测试";

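    // Background sketch (not exercised by the tests): ISO-8859-1 can represent the accented Latin characters
    // "äöüß" but not the CJK characters "测试", which is why the latter degrade to "?" (or to character references,
    // when pretty-printing XML) in the files written below. The standard JDK charset API illustrates this; the
    // snippet below is illustrative only and is not how the connector performs the encoding:
    //
    //   CharsetEncoder iso = StandardCharsets.ISO_8859_1.newEncoder();
    //   iso.canEncode("äöüß"); // true - these characters survive the encoding
    //   iso.canEncode("测试"); // false - these are replaced during encoding
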
    @Test
    void writeXmlFile(@TempDir Path tempDir) {
        XmlNode sampleDoc = readXmlDocument(SAMPLE_XML_DOC_URI);
        sampleDoc.assertElementValue(
            "Verifying that the sample doc was loaded correctly in the test app; also showing what the text looks " +
                "like to make this test easier to understand.",
            "/doc", ORIGINAL_XML_TEXT);

        newSparkSession().read().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.READ_DOCUMENTS_URIS, SAMPLE_XML_DOC_URI)
            .load()
            .write().format(CONNECTOR_IDENTIFIER)
            .option(Options.WRITE_FILES_ENCODING, ISO_ENCODING)
            .mode(SaveMode.Append)
            .save(tempDir.toAbsolutePath().toString());

        String fileContent = readFileContents(tempDir, "utf8-sample.xml");
        assertTrue(fileContent.contains("<doc>UTF-8 Text: MaryZheng����??</doc>"),
            "Unexpected file content: " + fileContent);

        newSparkSession().read()
            .format(CONNECTOR_IDENTIFIER)
            .option(Options.READ_FILES_ENCODING, ISO_ENCODING)
            .load(tempDir.toAbsolutePath().toString())
            .write().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
            .option(Options.WRITE_URI_TEMPLATE, "/iso-doc.xml")
            .mode(SaveMode.Append)
            .save();

        XmlNode doc = readXmlDocument("/iso-doc.xml");
        doc.assertElementValue(
            "Verifies that the ISO-encoded text is then converted back to UTF-8 when stored in MarkLogic, but the " +
                "value is slightly different due to the use of replacement characters in ISO-8859-1.",
            "/doc", "UTF-8 Text: MaryZhengäöüß??");
    }

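    // The pretty-printing tests below depend on how javax.xml.transform behaves when the output encoding cannot
    // represent a character: the Transformer emits a numeric character reference instead of a replacement
    // character. A rough illustration of that behavior (assumed here; the connector's actual writer configuration
    // may differ):
    //
    //   Transformer t = TransformerFactory.newInstance().newTransformer();
    //   t.setOutputProperty(OutputKeys.INDENT, "yes");
    //   t.setOutputProperty(OutputKeys.ENCODING, "ISO-8859-1");
    //   t.transform(new StreamSource(new StringReader(xml)), new StreamResult(outputStream));
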
    @Test
    void prettyPrintXmlFile(@TempDir Path tempDir) {
        newSparkSession().read().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.READ_DOCUMENTS_URIS, SAMPLE_XML_DOC_URI)
            .load()
            .write().format(CONNECTOR_IDENTIFIER)
            .option(Options.WRITE_FILES_ENCODING, ISO_ENCODING)
            .option(Options.WRITE_FILES_PRETTY_PRINT, true)
            .mode(SaveMode.Append)
            .save(tempDir.toAbsolutePath().toString());

        String fileContent = readFileContents(tempDir, "utf8-sample.xml");
        assertTrue(fileContent.contains("<doc>UTF-8 Text: MaryZheng����测试</doc>"),
            "Pretty-printing results in some of the characters being escaped by the Java Transformer class, " +
                "even though it's been configured to use the user-specified encoding. Unexpected text: " + fileContent);

        newSparkSession().read()
            .format(CONNECTOR_IDENTIFIER)
            .option(Options.READ_FILES_ENCODING, ISO_ENCODING)
            .load(tempDir.toAbsolutePath().toString())
            .write().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
            .option(Options.WRITE_URI_TEMPLATE, "/iso-doc.xml")
            .mode(SaveMode.Append)
            .save();

        XmlNode doc = readXmlDocument("/iso-doc.xml");
        doc.assertElementValue(
            "The written doc should have the original XML text, as the problematic characters for ISO-8859-1 were " +
                "escaped by the Java Transformer class during the pretty-printing process. This shows that " +
                "pretty-printing can actually result in fewer characters being altered via replacement tokens.",
            "/doc", ORIGINAL_XML_TEXT);
    }

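    // For JSON, pretty-printing is handled by Jackson rather than a Transformer. As the assertion message below
    // notes, Jackson does not need to escape any of these characters itself, so the encoded bytes match the
    // non-pretty-printed output. An illustrative (assumed) equivalent of that write path, not the connector's
    // actual code:
    //
    //   new ObjectMapper().writerWithDefaultPrettyPrinter()
    //       .writeValue(new OutputStreamWriter(outputStream, "ISO-8859-1"), jsonNode);
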
    @Test
    void prettyPrintJsonFile(@TempDir Path tempDir) {
        newSparkSession().read().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.READ_DOCUMENTS_URIS, SAMPLE_JSON_DOC_URI)
            .load()
            .write().format(CONNECTOR_IDENTIFIER)
            .option(Options.WRITE_FILES_ENCODING, ISO_ENCODING)
            .option(Options.WRITE_FILES_PRETTY_PRINT, true)
            .mode(SaveMode.Append)
            .save(tempDir.toAbsolutePath().toString());

        String fileContent = readFileContents(tempDir, "utf8-sample.json");
        assertTrue(fileContent.contains("MaryZheng����??"),
            "Pretty-printing JSON doesn't impact the encoding at all since the underlying Jackson library " +
                "doesn't need to escape any of the characters. Unexpected text: " + fileContent);

        newSparkSession().read()
            .format(CONNECTOR_IDENTIFIER)
            .option(Options.READ_FILES_ENCODING, ISO_ENCODING)
            .load(tempDir.toAbsolutePath().toString())
            .write().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
            .option(Options.WRITE_URI_TEMPLATE, "/iso-doc.json")
            .mode(SaveMode.Append)
            .save();

        JsonNode doc = readJsonDocument("/iso-doc.json");
        assertEquals("MaryZhengäöüß??", doc.get("text").asText());
    }

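    // How the connector validates the encoding name is an implementation detail; a plausible sketch is a standard
    // JDK charset lookup whose failure is surfaced via the ConnectorException message asserted below:
    //
    //   if (!Charset.isSupported(encoding)) {
    //       // reported to the caller as: "Unsupported encoding value: " + encoding
    //   }
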
    @Test
    void invalidEncoding(@TempDir Path tempDir) {
        DataFrameWriter<Row> writer = newSparkSession().read().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.READ_DOCUMENTS_URIS, SAMPLE_JSON_DOC_URI)
            .load()
            .write().format(CONNECTOR_IDENTIFIER)
            .option(Options.WRITE_FILES_ENCODING, "not-valid-encoding")
            .mode(SaveMode.Append);

        ConnectorException ex = assertThrowsConnectorException(() -> writer.save(tempDir.toAbsolutePath().toString()));
        assertEquals("Unsupported encoding value: not-valid-encoding", ex.getMessage());
    }

    /**
     * Reads the file using the JVM's default charset - expected to be UTF-8 in the test environment - so that
     * ISO-8859-1 bytes that are not valid UTF-8 show up as replacement characters in the assertions above.
     */
    private String readFileContents(Path tempDir, String filename) {
        File file = new File(tempDir.toFile(), filename);
        try {
            return new String(FileCopyUtils.copyToByteArray(file));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }
}