Skip to content

Commit 45d30dc

Browse files
authored
Merge pull request #351 from marklogic/feature/17730-plus-sign-filename
MLE-17730 Fixed file path encoding issues
2 parents 228340a + 77355de commit 45d30dc

20 files changed

+56
-54
lines changed

src/main/java/com/marklogic/spark/reader/file/ArchiveFileReader.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -189,9 +189,7 @@ private boolean readMetadataFollowedByContent() throws IOException {
189189

190190
private void openNextFile() {
191191
final boolean isStreamingDuringRead = StreamingMode.STREAM_DURING_READER_PHASE.equals(this.streamingMode);
192-
final String nextFilePath = filePartition.getPaths().get(nextFilePathIndex);
193-
194-
this.currentFilePath = isStreamingDuringRead ? nextFilePath : fileContext.decodeFilePath(nextFilePath);
192+
this.currentFilePath = filePartition.getPaths().get(nextFilePathIndex);
195193
nextFilePathIndex++;
196194

197195
if (!isStreamingDuringRead) {

src/main/java/com/marklogic/spark/reader/file/FileBatch.java

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex;
1414
import org.apache.spark.util.SerializableConfiguration;
1515

16+
import java.util.ArrayList;
17+
import java.util.List;
1618
import java.util.Map;
1719

1820
class FileBatch implements Batch {
@@ -27,17 +29,14 @@ class FileBatch implements Batch {
2729

2830
@Override
2931
public InputPartition[] planInputPartitions() {
30-
String[] inputFiles = fileIndex.inputFiles();
31-
int numPartitions = inputFiles.length;
32-
if (properties.containsKey(Options.READ_NUM_PARTITIONS)) {
33-
String value = properties.get(Options.READ_NUM_PARTITIONS);
34-
try {
35-
numPartitions = Integer.parseInt(value);
36-
} catch (NumberFormatException e) {
37-
throw new ConnectorException(String.format("Invalid value for number of partitions: %s", value));
38-
}
39-
}
40-
return FileUtil.makeFilePartitions(inputFiles, numPartitions);
32+
List<String> filePaths = new ArrayList<>();
33+
// Need to use allFiles and not inputFiles; the latter surprisingly URL-encodes each file path.
34+
// Would likely be better to soon refactor the FilePartition class to hold a FileStatus instead of a String so
35+
// that we don't need to convert it at all.
36+
fileIndex.allFiles().iterator().foreach(fileStatus -> filePaths.add(fileStatus.getPath().toString()));
37+
38+
int numPartitions = getNumberOfPartitions(filePaths);
39+
return FileUtil.makeFilePartitions(filePaths.toArray(new String[0]), numPartitions);
4140
}
4241

4342
@Override
@@ -48,4 +47,16 @@ public PartitionReaderFactory createReaderFactory() {
4847
FileContext fileContext = new FileContext(properties, new SerializableConfiguration(config));
4948
return new FilePartitionReaderFactory(fileContext);
5049
}
50+
51+
private int getNumberOfPartitions(List<String> filePaths) {
52+
if (properties.containsKey(Options.READ_NUM_PARTITIONS)) {
53+
String value = properties.get(Options.READ_NUM_PARTITIONS);
54+
try {
55+
return Integer.parseInt(value);
56+
} catch (NumberFormatException e) {
57+
throw new ConnectorException(String.format("Invalid value for number of partitions: %s", value));
58+
}
59+
}
60+
return filePaths.size();
61+
}
5162
}

src/main/java/com/marklogic/spark/reader/file/FileContext.java

Lines changed: 4 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -6,14 +6,13 @@
66
import com.marklogic.spark.ConnectorException;
77
import com.marklogic.spark.ContextSupport;
88
import com.marklogic.spark.Options;
9-
import com.marklogic.spark.Util;
109
import org.apache.hadoop.fs.FSDataInputStream;
10+
import org.apache.hadoop.fs.FileStatus;
1111
import org.apache.hadoop.fs.FileSystem;
1212
import org.apache.hadoop.fs.Path;
1313
import org.apache.spark.util.SerializableConfiguration;
1414

1515
import java.io.*;
16-
import java.net.URLDecoder;
1716
import java.nio.charset.Charset;
1817
import java.nio.charset.UnsupportedCharsetException;
1918
import java.util.Map;
@@ -53,7 +52,9 @@ public InputStream openFile(String filePath, boolean guessIfGzipped) {
5352
try {
5453
Path hadoopPath = new Path(filePath);
5554
FileSystem fileSystem = hadoopPath.getFileSystem(hadoopConfiguration.value());
56-
FSDataInputStream inputStream = fileSystem.open(hadoopPath);
55+
// Per the Spark BinaryFileFormat source code - calling getFileStatus seems to handle encoding in the file path.
56+
FileStatus fileStatus = fileSystem.getFileStatus(hadoopPath);
57+
FSDataInputStream inputStream = fileSystem.open(fileStatus.getPath());
5758
return isFileGzipped(filePath, guessIfGzipped) ? new GZIPInputStream(inputStream) : inputStream;
5859
} catch (Exception e) {
5960
throw new ConnectorException(String.format(
@@ -83,20 +84,6 @@ byte[] readBytes(InputStream inputStream) throws IOException {
8384
return this.encoding != null ? new String(bytes, this.encoding).getBytes() : bytes;
8485
}
8586

86-
public String decodeFilePath(String path) {
87-
try {
88-
if (this.encoding != null) {
89-
return URLDecoder.decode(path, this.encoding);
90-
}
91-
return URLDecoder.decode(path, Charset.defaultCharset());
92-
} catch (UnsupportedEncodingException e) {
93-
if (Util.MAIN_LOGGER.isDebugEnabled()) {
94-
Util.MAIN_LOGGER.debug("Cannot decode path '{}', so will use path as-is. Error: {}", path, e.getMessage());
95-
}
96-
return path;
97-
}
98-
}
99-
10087
private boolean isFileGzipped(String filePath, boolean guessIfGzipped) {
10188
if (this.isGzip()) {
10289
return true;

src/main/java/com/marklogic/spark/reader/file/GenericFileReader.java

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,12 +38,9 @@ public boolean next() {
3838
return false;
3939
}
4040

41-
// If streaming, we want to put the unaltered file path in the row. The writer can then decode it and also use
42-
// its original value as the URI, as the PUT v1/documents endpoint does not allow e.g. spaces.
43-
final String originalFilePath = filePartition.getPaths().get(filePathIndex);
44-
final String path = this.isStreaming ? originalFilePath : fileContext.decodeFilePath(originalFilePath);
45-
41+
final String path = filePartition.getPaths().get(filePathIndex);
4642
filePathIndex++;
43+
4744
try {
4845
byte[] content = this.isStreaming ?
4946
FileUtil.serializeFileContext(fileContext, path) :

src/main/java/com/marklogic/spark/reader/file/GzipFileReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ public boolean next() {
4848
return false;
4949
}
5050

51-
String currentFilePath = fileContext.decodeFilePath(filePartition.getPaths().get(nextFilePathIndex));
51+
String currentFilePath = filePartition.getPaths().get(nextFilePathIndex);
5252
nextFilePathIndex++;
5353
String uri = makeURI(currentFilePath);
5454

src/main/java/com/marklogic/spark/reader/file/JsonLinesFileReader.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,7 @@ public void close() {
6161
}
6262

6363
private void openNextFile() {
64-
final String originalFilePath = filePartition.getPaths().get(filePathIndex);
65-
this.currentFilePath = fileContext.decodeFilePath(originalFilePath);
64+
this.currentFilePath = filePartition.getPaths().get(filePathIndex);
6665
this.lineCounter = 1;
6766
this.filePathIndex++;
6867
// To mimic the behavior of the Spark JSON data source, this will guess if the file is gzipped based on its

src/main/java/com/marklogic/spark/reader/file/MlcpArchiveFileReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ public void close() {
9999
}
100100

101101
private void openNextFile() {
102-
this.currentFilePath = fileContext.decodeFilePath(filePartition.getPaths().get(nextFilePathIndex));
102+
this.currentFilePath = filePartition.getPaths().get(nextFilePathIndex);
103103
nextFilePathIndex++;
104104
this.currentZipInputStream = new ZipInputStream(fileContext.openFile(this.currentFilePath));
105105
}

src/main/java/com/marklogic/spark/reader/file/RdfFileReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ public void close() throws IOException {
7676
}
7777

7878
private boolean initializeRdfStreamReader() {
79-
this.currentFilePath = fileContext.decodeFilePath(filePartition.getPaths().get(nextFilePathIndex));
79+
this.currentFilePath = filePartition.getPaths().get(nextFilePathIndex);
8080
if (logger.isDebugEnabled()) {
8181
logger.debug("Reading file {}", this.currentFilePath);
8282
}

src/main/java/com/marklogic/spark/reader/file/RdfZipFileReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ public boolean next() {
6868
}
6969

7070
// Open up the next zip.
71-
this.currentFilePath = fileContext.decodeFilePath(filePartition.getPaths().get(nextFilePathIndex));
71+
this.currentFilePath = filePartition.getPaths().get(nextFilePathIndex);
7272
nextFilePathIndex++;
7373
this.currentZipInputStream = new CustomZipInputStream(fileContext.openFile(currentFilePath));
7474
return next();

src/main/java/com/marklogic/spark/reader/file/ZipFileReader.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ private boolean nextWhileStreamingDuringReaderPhase() {
122122
}
123123

124124
private void openNextFile() {
125-
this.currentFilePath = fileContext.decodeFilePath(filePartition.getPaths().get(nextFilePathIndex));
125+
this.currentFilePath = filePartition.getPaths().get(nextFilePathIndex);
126126
nextFilePathIndex++;
127127
this.currentZipInputStream = new ZipInputStream(fileContext.openFile(this.currentFilePath));
128128
}

0 commit comments

Comments (0)