Merge pull request #294 from marklogic/feature/space-in-filename

rjrudin · web-flow · commit ca1c50377064 · 2024-09-25T11:15:15.000-04:00
MLE-17084 Initial fix for handling spaces in filenames
diff --git a/src/main/java/com/marklogic/spark/reader/file/ArchiveFileReader.java b/src/main/java/com/marklogic/spark/reader/file/ArchiveFileReader.java
@@ -87,7 +87,7 @@ public void close() {
     }
 
     private void openNextFile() {
-        this.currentFilePath = filePartition.getPaths().get(nextFilePathIndex);
+        this.currentFilePath = fileContext.getDecodedFilePath(filePartition, nextFilePathIndex);
         nextFilePathIndex++;
         this.currentZipInputStream = new ZipInputStream(fileContext.openFile(this.currentFilePath));
     }
diff --git a/src/main/java/com/marklogic/spark/reader/file/FileContext.java b/src/main/java/com/marklogic/spark/reader/file/FileContext.java
@@ -6,6 +6,7 @@
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.ContextSupport;
 import com.marklogic.spark.Options;
+import com.marklogic.spark.Util;
 import org.apache.hadoop.fs.FSDataInputStream;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
@@ -14,6 +15,8 @@
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Serializable;
+import java.io.UnsupportedEncodingException;
+import java.net.URLDecoder;
 import java.nio.charset.Charset;
 import java.nio.charset.UnsupportedCharsetException;
 import java.util.Map;
@@ -68,4 +71,19 @@ byte[] readBytes(InputStream inputStream) throws IOException {
         byte[] bytes = FileUtil.readBytes(inputStream);
         return this.encoding != null ? new String(bytes, this.encoding).getBytes() : bytes;
     }
+
+    /**
+     * Returns the path at the given index of the partition after percent-decoding it, so that a path such as
+     * {@code three%20uris.csv} is returned as {@code three uris.csv}.
+     * <p>
+     * Decoding is best-effort: if the path cannot be decoded, the original path is returned unchanged.
+     *
+     * @param filePartition the partition holding the list of file paths
+     * @param index         position of the desired path within {@code filePartition.getPaths()}
+     * @return the decoded path, or the path as-is when decoding fails
+     */
+    public String getDecodedFilePath(FilePartition filePartition, int index) {
+        final String path = filePartition.getPaths().get(index);
+        // URLDecoder performs form decoding, which turns a literal '+' into a space. Escaping '+' first keeps
+        // a plus sign in a filename intact while still decoding sequences such as %20.
+        final String pathToDecode = path.replace("+", "%2B");
+        try {
+            // NOTE(review): the configured content encoding is reused for decoding the path; confirm that this
+            // matches the charset that was used when the path was percent-encoded.
+            if (this.encoding != null) {
+                return URLDecoder.decode(pathToDecode, this.encoding);
+            }
+            return URLDecoder.decode(pathToDecode, Charset.defaultCharset());
+        } catch (UnsupportedEncodingException | IllegalArgumentException e) {
+            // URLDecoder throws IllegalArgumentException for a malformed escape, e.g. a '%' that is not
+            // followed by two hex digits; such a path is used as-is, same as for an unsupported encoding.
+            if (Util.MAIN_LOGGER.isDebugEnabled()) {
+                Util.MAIN_LOGGER.debug("Cannot decode path '{}', so will use path as-is. Error: {}", path, e.getMessage());
+            }
+            return path;
+        }
+    }
 }
diff --git a/src/main/java/com/marklogic/spark/reader/file/GenericFileReader.java b/src/main/java/com/marklogic/spark/reader/file/GenericFileReader.java
@@ -41,11 +41,11 @@ public boolean next() {
             return false;
         }
 
-        final String path = filePartition.getPaths().get(filePathIndex);
+        final String path = fileContext.getDecodedFilePath(filePartition, filePathIndex);
         filePathIndex++;
         try {
             byte[] content = this.isStreaming ? serializeFileContext() : readFileIntoByteArray(path);
-            
+
             nextRowToReturn = new GenericInternalRow(new Object[]{
                 UTF8String.fromString(path),
                 ByteArray.concat(content),
diff --git a/src/main/java/com/marklogic/spark/reader/file/GzipFileReader.java b/src/main/java/com/marklogic/spark/reader/file/GzipFileReader.java
@@ -38,7 +38,7 @@ public boolean next() {
             return false;
         }
 
-        String currentFilePath = filePartition.getPaths().get(nextFilePathIndex);
+        String currentFilePath = fileContext.getDecodedFilePath(filePartition, nextFilePathIndex);
         nextFilePathIndex++;
         InputStream gzipInputStream = null;
         try {
diff --git a/src/main/java/com/marklogic/spark/reader/file/MlcpArchiveFileReader.java b/src/main/java/com/marklogic/spark/reader/file/MlcpArchiveFileReader.java
@@ -99,7 +99,7 @@ public void close() {
     }
 
     private void openNextFile() {
-        this.currentFilePath = filePartition.getPaths().get(nextFilePathIndex);
+        this.currentFilePath = fileContext.getDecodedFilePath(filePartition, nextFilePathIndex);
         nextFilePathIndex++;
         this.currentZipInputStream = new ZipInputStream(fileContext.openFile(this.currentFilePath));
     }
diff --git a/src/main/java/com/marklogic/spark/reader/file/RdfFileReader.java b/src/main/java/com/marklogic/spark/reader/file/RdfFileReader.java
@@ -76,7 +76,7 @@ public void close() throws IOException {
     }
 
     private boolean initializeRdfStreamReader() {
-        this.currentFilePath = this.filePartition.getPaths().get(nextFilePathIndex);
+        this.currentFilePath = fileContext.getDecodedFilePath(filePartition, nextFilePathIndex);
         if (logger.isDebugEnabled()) {
             logger.debug("Reading file {}", this.currentFilePath);
         }
diff --git a/src/main/java/com/marklogic/spark/reader/file/RdfZipFileReader.java b/src/main/java/com/marklogic/spark/reader/file/RdfZipFileReader.java
@@ -68,7 +68,7 @@ public boolean next() {
             }
 
             // Open up the next zip.
-            this.currentFilePath = filePartition.getPaths().get(nextFilePathIndex);
+            this.currentFilePath = fileContext.getDecodedFilePath(filePartition, nextFilePathIndex);
             nextFilePathIndex++;
             this.currentZipInputStream = new CustomZipInputStream(fileContext.openFile(currentFilePath));
             return next();
diff --git a/src/main/java/com/marklogic/spark/reader/file/ZipFileReader.java b/src/main/java/com/marklogic/spark/reader/file/ZipFileReader.java
@@ -71,7 +71,7 @@ public void close() {
     }
 
     private void openNextFile() {
-        this.currentFilePath = this.filePartition.getPaths().get(nextFilePathIndex);
+        this.currentFilePath = fileContext.getDecodedFilePath(filePartition, nextFilePathIndex);
         nextFilePathIndex++;
         this.currentZipInputStream = new ZipInputStream(fileContext.openFile(this.currentFilePath));
     }
diff --git a/src/main/java/com/marklogic/spark/reader/file/xml/AggregateXmlFileReader.java b/src/main/java/com/marklogic/spark/reader/file/xml/AggregateXmlFileReader.java
@@ -53,7 +53,8 @@ public boolean next() {
             }
 
             try {
-                nextRowToReturn = this.aggregateXMLSplitter.nextRow(filePartition.getPaths().get(filePathIndex));
+                String path = fileContext.getDecodedFilePath(filePartition, filePathIndex);
+                nextRowToReturn = this.aggregateXMLSplitter.nextRow(path);
                 return true;
             } catch (RuntimeException ex) {
                 // Error is expected to be friendly already.
@@ -80,7 +81,7 @@ private boolean initializeAggregateXMLSplitter() {
             return false;
         }
 
-        final String filePath = filePartition.getPaths().get(filePathIndex);
+        final String filePath = fileContext.getDecodedFilePath(filePartition, filePathIndex);
         try {
             this.inputStream = fileContext.openFile(filePath);
             String identifierForError = "file " + filePath;
diff --git a/src/main/java/com/marklogic/spark/reader/file/xml/ZipAggregateXmlFileReader.java b/src/main/java/com/marklogic/spark/reader/file/xml/ZipAggregateXmlFileReader.java
@@ -83,7 +83,7 @@ public void close() {
     }
 
     private void openNextFile() {
-        this.currentFilePath = filePartition.getPaths().get(nextFilePathIndex);
+        this.currentFilePath = fileContext.getDecodedFilePath(filePartition, nextFilePathIndex);
         nextFilePathIndex++;
         this.currentZipInputStream = new ZipInputStream(fileContext.openFile(this.currentFilePath));
     }
diff --git a/src/test/java/com/marklogic/spark/reader/file/ReadGenericFilesStreamingTest.java b/src/test/java/com/marklogic/spark/reader/file/ReadGenericFilesStreamingTest.java
@@ -10,6 +10,7 @@
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SaveMode;
+import org.junit.jupiter.api.Disabled;
 import org.junit.jupiter.api.Test;
 
 import java.io.ByteArrayInputStream;
@@ -43,6 +44,22 @@ void stream() throws Exception {
             "of tests.", "streamed-files", 4);
     }
 
+    @Test
+    @Disabled("Doesn't work yet, will fix as part of MLE-17084 in a follow-up PR")
+    void streamFileWithSpacesInFilename() {
+        Dataset<Row> dataset = newSparkSession().read().format(CONNECTOR_IDENTIFIER)
+            .option(Options.STREAM_FILES, true)
+            .load("src/test/resources/generic-files/with-spaces/three uris.csv");
+
+        defaultWrite(dataset.write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.STREAM_FILES, true)
+            .option(Options.WRITE_COLLECTIONS, "streamed-files")
+            .option(Options.WRITE_URI_REPLACE, ".*/with-spaces,''"));
+
+        String uri = getUrisInCollection("streamed-files", 1).get(0);
+        assertEquals("/three uris.csv", uri);
+    }
+
     @Test
     void handleFailureWhileStreaming() {
         DataFrameWriter writer = newSparkSession()
diff --git a/src/test/java/com/marklogic/spark/reader/file/ReadGenericFilesTest.java b/src/test/java/com/marklogic/spark/reader/file/ReadGenericFilesTest.java
@@ -128,4 +128,65 @@ void gzippedCustomEncoding() {
         XmlNode doc = readXmlDocument(uri);
         doc.assertElementExists("/MedlineCitationSet");
     }
+
+    @Test
+    void filenameHasSpace() {
+        Dataset<Row> dataset = newSparkSession().read().format(CONNECTOR_IDENTIFIER)
+            .load("src/test/resources/generic-files/with-spaces/three uris.csv");
+
+        Row row = dataset.collectAsList().get(0);
+        String path = row.getString(0);
+        assertTrue(path.endsWith("generic-files/with-spaces/three uris.csv"),
+            "The file path should be decoded by default. Under the hood, Spark builds up a set of file paths " +
+                "that are URL-encoded. But those will fail when trying to read the file, so they need to be " +
+                "decoded. Actual path: " + path);
+
+        String content = new String((byte[]) row.get(1));
+        assertEquals("URI\n/process-test1\n/process-test2\n/process-test3\n", content);
+
+        // Write the dataset to verify the URI has a space in it.
+        defaultWrite(dataset.write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.WRITE_COLLECTIONS, "space-test")
+            .option(Options.WRITE_URI_REPLACE, ".*generic-files,''"));
+
+        String uri = getUrisInCollection("space-test", 1).get(0);
+        assertEquals("/with-spaces/three uris.csv", uri);
+    }
+
+    @Test
+    void filenameWithSpaceAndCustomEncoding() {
+        Dataset<Row> dataset = newSparkSession().read().format(CONNECTOR_IDENTIFIER)
+            .option(Options.READ_FILES_ENCODING, "ISO-8859-1")
+            .load("src/test/resources/generic-files/with-spaces/medline iso 8859 1.txt");
+
+        Row row = dataset.collectAsList().get(0);
+        String path = row.getString(0);
+        assertTrue(path.endsWith("/with-spaces/medline iso 8859 1.txt"),
+            "Verifying that when a custom encoding is specified, the path can still be decoded correctly.");
+    }
+
+    @Test
+    void filenameWithEncodedSpace() {
+        List<Row> rows = newSparkSession().read().format(CONNECTOR_IDENTIFIER)
+            .load("src/test/resources/generic-files/with-spaces")
+            .select("URI")
+            .orderBy("URI")
+            .collectAsList();
+
+        assertEquals(3, rows.size());
+        assertTrue(rows.get(0).getString(0).endsWith("/hello%20world.json"));
+        assertTrue(rows.get(1).getString(0).endsWith("/medline iso 8859 1.txt"));
+        assertTrue(rows.get(2).getString(0).endsWith("/three uris.csv"));
+    }
+
+    @Test
+    void csvFileWithSpaces() {
+        List<Row> rows = newSparkSession().read()
+            .option("header", true)
+            .csv("src/test/resources/generic-files/with-spaces/three uris.csv")
+            .collectAsList();
+
+        assertEquals(3, rows.size(), "This doesn't test our connector, but rather demonstrates that the OOTB " +
+            "Spark file data sources correctly handle file paths with spaces in them.");
+    }
 }
diff --git a/src/test/resources/generic-files/with-spaces/hello%20world.json b/src/test/resources/generic-files/with-spaces/hello%20world.json
@@ -0,0 +1,3 @@
+{
+  "hello": "world"
+}
diff --git a/src/test/resources/generic-files/with-spaces/medline iso 8859 1.txt b/src/test/resources/generic-files/with-spaces/medline iso 8859 1.txt
@@ -0,0 +1,12 @@
+<MedlineCitationSet>
+<MedlineCitation>
+<PMID>10605436</PMID>
+<ArticleTitle>Concerning the localization of steroids in centrioles and basal bodies by immunofluorescence.</ArticleTitle>
+<Affiliation>Istituto di Anatomia e Istologia Patologica, Universit� di Ferrara, Italy.</Affiliation>
+</MedlineCitation>
+<MedlineCitation>
+<PMID>12261559</PMID>
+<ArticleTitle>[An attempt to study, through genealogies, family structures in the case of a non-noble family]</ArticleTitle>
+<VernacularTitle>Un tentativo di studio, tramite, genealogie, di strutture familiari nel caso di una famiglia non nobile</VernacularTitle>
+</MedlineCitation>
+</MedlineCitationSet>
diff --git a/src/test/resources/generic-files/with-spaces/three uris.csv b/src/test/resources/generic-files/with-spaces/three uris.csv
@@ -0,0 +1,4 @@
+URI
+/process-test1
+/process-test2
+/process-test3

Original file line number	Diff line number	Diff line change
`@@ -87,7 +87,7 @@ public void close() {`
`87`	`87`	`}`
`88`	`88`
`89`	`89`	`private void openNextFile() {`
`90`		`- this.currentFilePath = filePartition.getPaths().get(nextFilePathIndex);`
	`90`	`+ this.currentFilePath = fileContext.getDecodedFilePath(filePartition, nextFilePathIndex);`
`91`	`91`	`nextFilePathIndex++;`
`92`	`92`	`this.currentZipInputStream = new ZipInputStream(fileContext.openFile(this.currentFilePath));`
`93`	`93`	`}`
Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,7 @@ public boolean next() {`
`38`	`38`	`return false;`
`39`	`39`	`}`
`40`	`40`
`41`		`- String currentFilePath = filePartition.getPaths().get(nextFilePathIndex);`
	`41`	`+ String currentFilePath = fileContext.getDecodedFilePath(filePartition, nextFilePathIndex);`
`42`	`42`	`nextFilePathIndex++;`
`43`	`43`	`InputStream gzipInputStream = null;`
`44`	`44`	`try {`
Original file line number	Diff line number	Diff line change
`@@ -99,7 +99,7 @@ public void close() {`
`99`	`99`	`}`
`100`	`100`
`101`	`101`	`private void openNextFile() {`
`102`		`- this.currentFilePath = filePartition.getPaths().get(nextFilePathIndex);`
	`102`	`+ this.currentFilePath = fileContext.getDecodedFilePath(filePartition, nextFilePathIndex);`
`103`	`103`	`nextFilePathIndex++;`
`104`	`104`	`this.currentZipInputStream = new ZipInputStream(fileContext.openFile(this.currentFilePath));`
`105`	`105`	`}`
Original file line number	Diff line number	Diff line change
`@@ -76,7 +76,7 @@ public void close() throws IOException {`
`76`	`76`	`}`
`77`	`77`
`78`	`78`	`private boolean initializeRdfStreamReader() {`
`79`		`- this.currentFilePath = this.filePartition.getPaths().get(nextFilePathIndex);`
	`79`	`+ this.currentFilePath = fileContext.getDecodedFilePath(filePartition, nextFilePathIndex);`
`80`	`80`	`if (logger.isDebugEnabled()) {`
`81`	`81`	`logger.debug("Reading file {}", this.currentFilePath);`
`82`	`82`	`}`
Original file line number	Diff line number	Diff line change
`@@ -68,7 +68,7 @@ public boolean next() {`
`68`	`68`	`}`
`69`	`69`
`70`	`70`	`// Open up the next zip.`
`71`		`- this.currentFilePath = filePartition.getPaths().get(nextFilePathIndex);`
	`71`	`+ this.currentFilePath = fileContext.getDecodedFilePath(filePartition, nextFilePathIndex);`
`72`	`72`	`nextFilePathIndex++;`
`73`	`73`	`this.currentZipInputStream = new CustomZipInputStream(fileContext.openFile(currentFilePath));`
`74`	`74`	`return next();`
Original file line number	Diff line number	Diff line change
`@@ -71,7 +71,7 @@ public void close() {`
`71`	`71`	`}`
`72`	`72`
`73`	`73`	`private void openNextFile() {`
`74`		`- this.currentFilePath = this.filePartition.getPaths().get(nextFilePathIndex);`
	`74`	`+ this.currentFilePath = fileContext.getDecodedFilePath(filePartition, nextFilePathIndex);`
`75`	`75`	`nextFilePathIndex++;`
`76`	`76`	`this.currentZipInputStream = new ZipInputStream(fileContext.openFile(this.currentFilePath));`
`77`	`77`	`}`
Original file line number	Diff line number	Diff line change
`@@ -83,7 +83,7 @@ public void close() {`
`83`	`83`	`}`
`84`	`84`
`85`	`85`	`private void openNextFile() {`
`86`		`- this.currentFilePath = filePartition.getPaths().get(nextFilePathIndex);`
	`86`	`+ this.currentFilePath = fileContext.getDecodedFilePath(filePartition, nextFilePathIndex);`
`87`	`87`	`nextFilePathIndex++;`
`88`	`88`	`this.currentZipInputStream = new ZipInputStream(fileContext.openFile(this.currentFilePath));`
`89`	`89`	`}`