Skip to content

Commit 825e6e0

Browse files
authored
Merge pull request #274 from marklogic/feature/read-japanese-uris
Added test for URI with non-US-ASCII characters
2 parents 7c53696 + acdb1cb commit 825e6e0

File tree

3 files changed

+47
-2
lines changed

3 files changed

+47
-2
lines changed

build.gradle

Lines changed: 12 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,9 @@ java {
2121

2222
repositories {
2323
mavenCentral()
24+
maven {
25+
url "https://bed-artifactory.bedford.progress.com:443/artifactory/ml-maven-snapshots/"
26+
}
2427
}
2528

2629
configurations {
@@ -42,7 +45,7 @@ dependencies {
4245
exclude module: "rocksdbjni"
4346
}
4447

45-
shadowDependencies ("com.marklogic:marklogic-client-api:6.6.1") {
48+
shadowDependencies ("com.marklogic:marklogic-client-api:7.0-SNAPSHOT") {
4649
// The Java Client uses Jackson 2.15.2; Spark 3.4.x does not yet support that and will throw the following error:
4750
// Scala module 2.14.2 requires Jackson Databind version >= 2.14.0 and < 2.15.0 - Found jackson-databind version 2.15.2
4851
// So the 4 Jackson modules are excluded to allow for Spark's to be used.
@@ -63,13 +66,20 @@ dependencies {
6366

6467
shadowDependencies "org.jdom:jdom2:2.0.6.1"
6568

66-
testImplementation ('com.marklogic:ml-app-deployer:4.7.0') {
69+
testImplementation ('com.marklogic:ml-app-deployer:4.8.0') {
6770
exclude group: "com.fasterxml.jackson.core"
6871
exclude group: "com.fasterxml.jackson.dataformat"
72+
73+
// Use the Java Client declared above.
74+
exclude module: "marklogic-client-api"
6975
}
76+
7077
testImplementation ('com.marklogic:marklogic-junit5:1.4.0') {
7178
exclude group: "com.fasterxml.jackson.core"
7279
exclude group: "com.fasterxml.jackson.dataformat"
80+
81+
// Use the Java Client declared above.
82+
exclude module: "marklogic-client-api"
7383
}
7484

7585
testImplementation "ch.qos.logback:logback-classic:1.3.14"

src/test/java/com/marklogic/spark/reader/document/ReadDocumentRowsByUrisTest.java

Lines changed: 32 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,12 @@
11
package com.marklogic.spark.reader.document;
22

3+
import com.marklogic.junit5.XmlNode;
34
import com.marklogic.spark.AbstractIntegrationTest;
45
import com.marklogic.spark.Options;
56
import org.apache.spark.sql.DataFrameReader;
7+
import org.apache.spark.sql.Dataset;
68
import org.apache.spark.sql.Row;
9+
import org.apache.spark.sql.SaveMode;
710
import org.junit.jupiter.api.Test;
811

912
import java.util.List;
@@ -79,6 +82,35 @@ void urisWithWrongCollection() {
7982
assertEquals(0, count, "This verifies that the collection impacts the list of URIs.");
8083
}
8184

85+
@Test
86+
void nonUsAsciiUri() {
87+
newSparkSession().read().format(CONNECTOR_IDENTIFIER)
88+
.load("src/test/resources/encoding/太田佳伸のXMLファイル.xml")
89+
.write().format(CONNECTOR_IDENTIFIER)
90+
.option(Options.CLIENT_URI, makeClientUri())
91+
.option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
92+
.option(Options.WRITE_URI_REPLACE, ".*encoding,''")
93+
.mode(SaveMode.Append)
94+
.save();
95+
96+
final String expectedUri = "/太田佳伸のXMLファイル.xml";
97+
XmlNode doc = readXmlDocument(expectedUri);
98+
doc.assertElementValue("/root/filename", "太田佳伸のXMLファイル");
99+
100+
Dataset<Row> dataset = sparkSession.read().format(CONNECTOR_IDENTIFIER)
101+
.option(Options.CLIENT_URI, makeClientUri())
102+
.option(Options.READ_DOCUMENTS_URIS, expectedUri)
103+
.load();
104+
105+
assertEquals(1, dataset.count());
106+
Row row = dataset.collectAsList().get(0);
107+
assertEquals(expectedUri, row.getString(0),
108+
"As of 7.0.0, the Java Client should default to setting mail.mime.allowutf8=true so that the " +
109+
"Jakarta Mail library allows UTF-8 characters in the header names of multipart response parts. " +
110+
"Normally, it only allows US-ASCII characters. But since MarkLogic allows UTF-8 characters in " +
111+
"URIs, we need the Jakarta Mail library (used by the Java Client) to be more permissive.");
112+
}
113+
82114
private DataFrameReader startRead() {
83115
return newSparkSession().read()
84116
.format(CONNECTOR_IDENTIFIER)
src/test/resources/encoding/太田佳伸のXMLファイル.xml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,3 @@
1+
<root>
2+
<filename>太田佳伸のXMLファイル</filename>
3+
</root>

0 commit comments

Comments
 (0)