package com.marklogic.spark.writer.file;

import com.fasterxml.jackson.databind.JsonNode;
import com.marklogic.spark.AbstractIntegrationTest;
import com.marklogic.spark.Options;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;

import java.nio.file.Path;
import java.util.List;
import java.util.Map;

import static org.junit.jupiter.api.Assertions.assertEquals;

class WriteArchiveWithEncodingTest extends AbstractIntegrationTest {

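    /**
     * Verifies that a document and its metadata survive a round trip through a zip archive written with
     * ISO-8859-1 encoding, with characters that ISO-8859-1 cannot represent being replaced along the way.
     */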
    @Test
    void test(@TempDir Path tempDir) {
        addMetadataToTestDocument();

        // Write the JSON test document to an archive with ISO encoding, including its metadata.
        newSparkSession().read()
            .format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.READ_DOCUMENTS_URIS, "/utf8-sample.json")
            .option(Options.READ_DOCUMENTS_CATEGORIES, "content,metadata")
            .load()
            .repartition(1)
            .write()
            .format(CONNECTOR_IDENTIFIER)
            .option(Options.WRITE_FILES_ENCODING, "ISO-8859-1")
            .option(Options.WRITE_FILES_COMPRESSION, "zip")
            .mode(SaveMode.Append)
            .save(tempDir.toFile().getAbsolutePath());

        // Read the archive with ISO encoding and load it into MarkLogic.
        sparkSession.read()
            .format(CONNECTOR_IDENTIFIER)
            .option(Options.READ_FILES_ENCODING, "ISO-8859-1")
            .option(Options.READ_FILES_COMPRESSION, "zip")
            .option(Options.READ_FILES_TYPE, "archive")
            .load(tempDir.toAbsolutePath().toString())
            .write().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.WRITE_COLLECTIONS, "loaded-data")
            .option(Options.WRITE_URI_PREFIX, "/loaded")
            .mode(SaveMode.Append)
            .save();

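        // The last two characters of the original value (测试) cannot be represented in ISO-8859-1, so the
        // encoder replaces each of them with a question mark.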
        JsonNode doc = readJsonDocument("/loaded/utf8-sample.json");
        assertEquals("MaryZhengäöüß??", doc.get("text").asText(), "The value should be mostly the same as the " +
            "original value, except for the last two characters which are replaced when encoded to ISO-8859-1.");

        // Read the metadata values of the loaded document.
        List<Row> rows = sparkSession.read().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.READ_DOCUMENTS_URIS, "/loaded/utf8-sample.json")
            .option(Options.READ_DOCUMENTS_CATEGORIES, "metadatavalues")
            .load().collectAsList();

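        // Column 7 of the document row is expected to hold the metadataValues map returned by the connector.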
        Map<String, String> metadata = rows.get(0).getJavaMap(7);
        assertEquals("MaryZhengäöüß??", metadata.get("text"), "The user-defined encoding should be applied to " +
            "each metadata entry in the archive file as well. This ensures that the encoding is applied to things " +
            "like metadata values and properties fragments, where a user is free to capture any text they want.");
    }

    /**
     * It's fine to add this metadata to the test document, which is created by the test app. It won't impact any
     * other tests, and this method can be run repeatedly without any ill effects.
     */
    private void addMetadataToTestDocument() {
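        // The metadata value intentionally contains characters (测试) that cannot be represented in ISO-8859-1.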
        getDatabaseClient().newServerEval()
            .javascript("declareUpdate(); " +
                "xdmp.documentSetMetadata('/utf8-sample.json', {\"text\": \"MaryZhengäöüß测试\"})")
            .evalAs(String.class);
    }
}