|
1 | 1 | package com.marklogic.spark.reader.document;
|
2 | 2 |
|
| 3 | +import com.marklogic.junit5.XmlNode; |
3 | 4 | import com.marklogic.spark.AbstractIntegrationTest;
|
4 | 5 | import com.marklogic.spark.Options;
|
5 | 6 | import org.apache.spark.sql.DataFrameReader;
|
| 7 | +import org.apache.spark.sql.Dataset; |
6 | 8 | import org.apache.spark.sql.Row;
|
| 9 | +import org.apache.spark.sql.SaveMode; |
7 | 10 | import org.junit.jupiter.api.Test;
|
8 | 11 |
|
9 | 12 | import java.util.List;
|
@@ -79,6 +82,35 @@ void urisWithWrongCollection() {
|
79 | 82 | assertEquals(0, count, "This verifies that the collection impacts the list of URIs.");
|
80 | 83 | }
|
81 | 84 |
|
// Round-trip test for a document whose URI contains non-US-ASCII (Japanese) characters:
// the file is written through the connector, verified as XML, then read back through the
// connector by URI.
| 85 | + @Test |
| 86 | + void nonUsAsciiUri() { |
// Step 1: load the local test file and write it using the connector.
| 87 | + newSparkSession().read().format(CONNECTOR_IDENTIFIER) |
| 88 | + .load("src/test/resources/encoding/太田佳伸のXMLファイル.xml") |
| 89 | + .write().format(CONNECTOR_IDENTIFIER) |
| 90 | + .option(Options.CLIENT_URI, makeClientUri()) |
| 91 | + .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS) |
// NOTE(review): presumably a "pattern,replacement" pair that strips everything up to and
// including "encoding" from the generated URI, leaving "/<filename>.xml" - confirm against
// the WRITE_URI_REPLACE option documentation.
| 92 | + .option(Options.WRITE_URI_REPLACE, ".*encoding,''") |
| 93 | + .mode(SaveMode.Append) |
| 94 | + .save(); |
| 95 | + |
// Step 2: fetch the document at the expected URI and check its /root/filename element.
| 96 | + final String expectedUri = "/太田佳伸のXMLファイル.xml"; |
| 97 | + XmlNode doc = readXmlDocument(expectedUri); |
| 98 | + doc.assertElementValue("/root/filename", "太田佳伸のXMLファイル"); |
| 99 | + |
// Step 3: read the same document back through the connector, selecting it by URI.
// NOTE(review): uses the sparkSession field rather than the value returned by
// newSparkSession() above - presumably newSparkSession() assigns that field; confirm in
// AbstractIntegrationTest.
| 100 | + Dataset<Row> dataset = sparkSession.read().format(CONNECTOR_IDENTIFIER) |
| 101 | + .option(Options.CLIENT_URI, makeClientUri()) |
| 102 | + .option(Options.READ_DOCUMENTS_URIS, expectedUri) |
| 103 | + .load(); |
| 104 | + |
// Exactly one row is expected, and its first column must equal the UTF-8 URI. The
// assertion message below records why this works (Jakarta Mail UTF-8 header setting).
| 105 | + assertEquals(1, dataset.count()); |
| 106 | + Row row = dataset.collectAsList().get(0); |
| 107 | + assertEquals(expectedUri, row.getString(0), |
| 108 | + "As of 7.0.0, the Java Client should default to setting mail.mime.allowutf8=true so that the " + |
| 109 | + "Jakarta Mail library allows UTF-8 characters in the header names of multipart response parts. " + |
| 110 | + "Normally, it only allows US-ASCII characters. But since MarkLogic allows UTF-8 characters in " + |
| 111 | + "URIs, we need the Jakarta Mail library (used by the Java Client) to be more permissive."); |
| 112 | + } |
| 113 | + |
82 | 114 | private DataFrameReader startRead() {
|
83 | 115 | return newSparkSession().read()
|
84 | 116 | .format(CONNECTOR_IDENTIFIER)
|
|
0 commit comments