import com.marklogic.spark.Util;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.json.JacksonGenerator;
+ import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;
import org.json.JSONObject;
import org.json.XML;

import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
+ import java.util.UUID;

/**
 * Handles building a document from an "arbitrary" row - i.e. one with an unknown schema, where the row will be
 * serialized by Spark to a JSON object.
 */
class ArbitraryRowConverter implements RowConverter {

+     private static final String MARKLOGIC_SPARK_FILE_PATH_COLUMN_NAME = "marklogic_spark_file_path";
+
    private final ObjectMapper objectMapper;

    private final StructType schema;
@@ -35,8 +39,12 @@ class ArbitraryRowConverter implements RowConverter {
    private final String xmlRootName;
    private final String xmlNamespace;

+     private final int filePathIndex;
+
    ArbitraryRowConverter(WriteContext writeContext) {
        this.schema = writeContext.getSchema();
+         this.filePathIndex = determineFilePathIndex();
+
        this.uriTemplate = writeContext.getStringOption(Options.WRITE_URI_TEMPLATE);
        this.jsonRootName = writeContext.getStringOption(Options.WRITE_JSON_ROOT_NAME);
        this.xmlRootName = writeContext.getStringOption(Options.WRITE_XML_ROOT_NAME);
@@ -46,6 +54,12 @@ class ArbitraryRowConverter implements RowConverter {

    @Override
    public Optional<DocBuilder.DocumentInputs> convertRow(InternalRow row) {
+         String initialUri = null;
+         if (this.filePathIndex > -1) {
+             initialUri = row.getString(this.filePathIndex) + "/" + UUID.randomUUID();
+             row.setNullAt(this.filePathIndex);
+         }
+
        final String json = convertRowToJSONString(row);
        AbstractWriteHandle contentHandle = this.xmlRootName != null ?
            new StringHandle(convertJsonToXml(json)).withFormat(Format.XML) :
@@ -66,14 +80,34 @@ public Optional<DocBuilder.DocumentInputs> convertRow(InternalRow row) {
                }
            }
        }
-         return Optional.of(new DocBuilder.DocumentInputs(null, contentHandle, uriTemplateValues, null));
+         return Optional.of(new DocBuilder.DocumentInputs(initialUri, contentHandle, uriTemplateValues, null));
    }

    @Override
    public List<DocBuilder.DocumentInputs> getRemainingDocumentInputs() {
        return new ArrayList<>();
    }

+     /**
+      * A Spark user can add a column via:
+      * withColumn("marklogic_spark_file_path", new Column("_metadata.file_path"))
+      * <p>
+      * This allows access to the file path when using a Spark data source - e.g. CSV, Parquet - to read a file.
+      * The column will be used to generate an initial URI for the corresponding document, and the column will then
+      * be removed so that it is not included in the document.
+      *
+      * @return the index of the file path column in the schema, or -1 if the column is not present
+      */
+     private int determineFilePathIndex() {
+         StructField[] fields = schema.fields();
+         for (int i = 0; i < fields.length; i++) {
+             if (MARKLOGIC_SPARK_FILE_PATH_COLUMN_NAME.equals(fields[i].name())) {
+                 return i;
+             }
+         }
+         return -1;
+     }
+
    private ObjectNode readTree(String json) {
        // We don't ever expect this to fail, as the JSON is produced by Spark's JacksonGenerator and should always
        // be valid JSON. But Jackson throws a checked exception, so we have to handle it.
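
As the new Javadoc above describes, a caller opts into this behavior by projecting Spark's hidden
_metadata.file_path value into a column named marklogic_spark_file_path. The following sketch shows
what that could look like end-to-end; the input path and connection string are hypothetical, and it
assumes the connector is registered under the "marklogic" format name:

import org.apache.spark.sql.Column;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class FilePathColumnExample {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").getOrCreate();

        // Read CSV files; Spark exposes the source file path via the hidden _metadata struct.
        Dataset<Row> rows = spark.read()
            .option("header", "true")
            .csv("path/to/csv-files") // hypothetical input directory
            .withColumn("marklogic_spark_file_path", new Column("_metadata.file_path"));

        // ArbitraryRowConverter finds this column, builds an initial URI of the form
        // "<file path>/<random UUID>", and nulls the column out so the value does not
        // end up in the serialized JSON document.
        rows.write()
            .format("marklogic")
            .option("spark.marklogic.client.uri", "user:password@localhost:8000") // hypothetical connection string
            .mode("append")
            .save();
    }
}

Deriving the initial URI from the source file path keeps documents from the same input file grouped
under a common URI prefix, while the appended random UUID keeps URIs unique across rows.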