
Commit 94004ec

Merge pull request #384 from marklogic/feature/mkdir-write-files
MLE-19124: Fixed a bug where writing files failed when the export path doesn't exist
2 parents 532ba7d + f5e2a08

File tree: 4 files changed (+82, -2 lines changed)
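For context, the user-facing behavior this commit fixes is exporting documents to a filesystem path that does not yet exist. Below is a minimal sketch of such a write, assuming the connector's "marklogic" format name and spark.marklogic.* option keys (the tests in this commit use the CONNECTOR_IDENTIFIER and Options constants for these); the connection details and paths are hypothetical placeholders.

    import org.apache.spark.sql.SaveMode;
    import org.apache.spark.sql.SparkSession;

    public class ExportToNewDirectory {
        public static void main(String[] args) {
            SparkSession session = SparkSession.builder().master("local[*]").getOrCreate();
            // Read one document from MarkLogic and export it as a file. Before this
            // commit, save() failed when the target directory was missing; the
            // connector now creates the directory first.
            session.read().format("marklogic")
                .option("spark.marklogic.client.uri", "user:password@localhost:8000")
                .option("spark.marklogic.read.documents.uris", "/author/author1.json")
                .load()
                .write().format("marklogic")
                .mode(SaveMode.Append)
                .save("/tmp/export/dir-that-does-not-exist");
        }
    }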

marklogic-spark-connector/src/main/java/com/marklogic/spark/MarkLogicFileTable.java

Lines changed: 38 additions & 0 deletions
@@ -5,7 +5,9 @@
 
 import com.marklogic.spark.reader.file.FileScanBuilder;
 import com.marklogic.spark.writer.file.DocumentFileWriteBuilder;
+import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
+import org.apache.hadoop.fs.Path;
 import org.apache.spark.sql.SparkSession;
 import org.apache.spark.sql.connector.read.ScanBuilder;
 import org.apache.spark.sql.connector.write.LogicalWriteInfo;
@@ -17,6 +19,8 @@
 import scala.Option;
 import scala.collection.Seq;
 
+import java.io.IOException;
+
 /**
  * Extends Spark's FileTable class so that it can make use of that class's file index capabilities, which includes
  * support for Spark options like recursiveFileLookup and pathGlobFilter as defined at
@@ -36,6 +40,9 @@ class MarkLogicFileTable extends FileTable {
 
     MarkLogicFileTable(SparkSession sparkSession, CaseInsensitiveStringMap options, Seq<String> paths, StructType schema) {
         super(sparkSession, options, paths, Option.apply(schema));
+        if (isWriteFilesOperation(options, paths)) {
+            makeWritePath(paths.head(), sparkSession);
+        }
         this.options = options;
         this.schema = schema;
     }
@@ -77,4 +84,35 @@ public Class<? extends FileFormat> fallbackFileFormat() {
         // so null is returned.
         return null;
     }
+
+    private boolean isWriteFilesOperation(CaseInsensitiveStringMap options, Seq<String> paths) {
+        // When writing files, a user is limited to a single path. So if the user provides multiple paths when
+        // reading files, we immediately know it's not a write operation.
+        if (paths.size() != 1) {
+            return false;
+        }
+        // Unfortunately, not all "read files" options have a common base. The worst case, though, of
+        // mis-identifying this as a "read" operation and making a directory automatically is that the
+        // user doesn't get an expected error for trying to read a path that doesn't exist.
+        return options.keySet()
+            .stream()
+            .noneMatch(key -> key.startsWith("spark.marklogic.read.files")
+                || key.startsWith("spark.marklogic.read.aggregates.xml")
+            );
+    }
+
+    private void makeWritePath(String path, SparkSession sparkSession) {
+        if (Util.MAIN_LOGGER.isDebugEnabled()) {
+            Util.MAIN_LOGGER.debug("Calling mkdirs on path: {}", path);
+        }
+        Configuration config = sparkSession.sparkContext().hadoopConfiguration();
+        Path hadoopPath = new Path(path);
+        try {
+            hadoopPath.getFileSystem(config).mkdirs(hadoopPath);
+        } catch (Exception ex) {
+            // The user is likely to get an AnalysisException from Spark due to the path not existing, which is the
+            // better error to be propagated.
+            Util.MAIN_LOGGER.error("Unable to call mkdirs on path: {}; cause: {}", path, ex.getMessage());
+        }
+    }
 }
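A note on the approach: makeWritePath goes through Hadoop's filesystem abstraction rather than java.io.File, so the same mkdirs call works for local paths as well as schemes like hdfs:// or s3a://. A standalone sketch of that API, assuming hadoop-common is on the classpath and using a hypothetical /tmp/export/new-dir target:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.FileSystem;
    import org.apache.hadoop.fs.Path;

    public class MkdirsSketch {
        public static void main(String[] args) throws Exception {
            // The path's scheme (none here, so the local filesystem) selects
            // the FileSystem implementation.
            Path path = new Path("/tmp/export/new-dir");
            FileSystem fs = path.getFileSystem(new Configuration());
            // mkdirs creates any missing parent directories and returns true
            // if the directory exists once the call completes.
            boolean ok = fs.mkdirs(path);
            System.out.println("mkdirs result: " + ok);
        }
    }

This is also why makeWritePath swallows exceptions with a broad catch: if mkdirs fails, the write proceeds and Spark's own AnalysisException about the missing path is the error that surfaces.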

marklogic-spark-connector/src/test/java/com/marklogic/spark/reader/file/ReadAggregateXmlFilesTest.java

Lines changed: 11 additions & 0 deletions
@@ -7,6 +7,7 @@
 import com.marklogic.spark.AbstractIntegrationTest;
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.Options;
+import org.apache.spark.sql.AnalysisException;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.jdom2.Namespace;
@@ -216,6 +217,16 @@ void invalidEncoding() {
             "Actual error: " + ex.getMessage());
     }
 
+    @Test
+    void pathDoesntExist() {
+        AnalysisException ex = assertThrows(AnalysisException.class, () -> newSparkSession().read()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.READ_AGGREGATES_XML_ELEMENT, "MedlineCitation")
+            .load("path-doesnt-exist"));
+
+        assertTrue(ex.getMessage().contains("Path does not exist"), "Unexpected error: " + ex.getMessage());
+    }
+
     private void verifyRow(Row row, String expectedUriSuffix, String rootPath, String name, int age) {
         String uri = row.getString(0);
         assertTrue(uri.endsWith(expectedUriSuffix), format("URI %s doesn't end with %s", uri, expectedUriSuffix));

marklogic-spark-connector/src/test/java/com/marklogic/spark/reader/file/ReadArchiveFileTest.java

Lines changed: 11 additions & 0 deletions
@@ -8,6 +8,7 @@
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.Options;
 import com.marklogic.spark.TestUtil;
+import org.apache.spark.sql.AnalysisException;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SaveMode;
@@ -222,6 +223,16 @@ void customEncoding() {
         doc.assertElementExists("/MedlineCitationSet");
     }
 
+    @Test
+    void pathDoesntExist() {
+        AnalysisException ex = assertThrows(AnalysisException.class, () -> newSparkSession().read()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.READ_FILES_TYPE, "archive")
+            .load("path-doesnt-exist"));
+
+        assertTrue(ex.getMessage().contains("Path does not exist"), "Unexpected error: " + ex.getMessage());
+    }
+
     private void verifyAllMetadata(Path tempDir, int rowCount) {
         List<Row> rows = sparkSession.read().format(CONNECTOR_IDENTIFIER)
             .option(Options.READ_FILES_TYPE, "archive")

marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/file/WriteDocumentFilesTest.java

Lines changed: 22 additions & 2 deletions
@@ -16,6 +16,8 @@
 import org.apache.spark.sql.SaveMode;
 import org.junit.jupiter.api.Test;
 import org.junit.jupiter.api.io.TempDir;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
 import org.springframework.util.FileCopyUtils;
 
 import java.io.File;
@@ -25,8 +27,7 @@
 import java.util.List;
 import java.util.stream.Collectors;
 
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.*;
 
 class WriteDocumentFilesTest extends AbstractIntegrationTest {
 
@@ -45,6 +46,25 @@ void writeFifteenAuthorFiles(@TempDir Path tempDir) throws Exception {
         verifyAuthorFilesWereCorrectlyWritten(tempDir);
     }
 
+    @ParameterizedTest
+    @ValueSource(strings = {"doesntexist", "has space", "has+plus"})
+    void pathDoesntExist(String directoryName, @TempDir Path tempDir) {
+        File dir = new File(tempDir.toFile(), directoryName);
+        assertFalse(dir.exists());
+
+        newSparkSession().read().format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.READ_DOCUMENTS_URIS, "/author/author1.json")
+            .load()
+            .write().format(CONNECTOR_IDENTIFIER)
+            .mode(SaveMode.Append)
+            .save(dir.getAbsolutePath());
+
+        assertTrue(dir.exists(), "Directory was not created: " + dir.getAbsolutePath());
+        assertEquals(1, dir.listFiles().length);
+        assertTrue(new File(dir, "author").exists());
+    }
+
     @Test
     void streamAuthorDocuments(@TempDir Path tempDir) throws Exception {
         Dataset<Row> dataset = newSparkSession().read()