
Commit bf13ef7

Merge pull request #261 from marklogic/feature/json-ignore-null

Can now include null fields

2 parents da6f02b + 7ec7c10
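
By default, Spark drops null-valued fields when serializing rows to JSON; this change lets a user opt into keeping them. A minimal usage sketch of the new behavior (the dataset, format name, and option values here are illustrative, not part of this commit; the option name comes from the Options.java change below):

    // Hypothetical write that keeps null fields in the serialized JSON documents.
    dataset.write()
        .format("marklogic")
        .option("spark.marklogic.write.json.ignoreNullFields", "false")
        .mode(SaveMode.Append)
        .save();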

File tree

10 files changed: +272 −83 lines


build.gradle

Lines changed: 2 additions & 3 deletions

@@ -50,9 +50,8 @@ dependencies {
         exclude group: "com.fasterxml.jackson.dataformat"
     }

-    // For XML support; supports converting a string of JSON into a string of XML.
-    // See ArbitraryRowConverter for more information.
-    shadowDependencies "org.json:json:20240303"
+    // Required for converting JSON to XML. Using 2.14.2 to align with Spark 3.4.1.
+    shadowDependencies "com.fasterxml.jackson.dataformat:jackson-dataformat-xml:2.14.2"

     // Need this so that an OkHttpClientConfigurator can be created.
     shadowDependencies 'com.squareup.okhttp3:okhttp:4.12.0'
src/main/java/com/marklogic/spark/JsonRowSerializer.java

Lines changed: 80 additions & 0 deletions

@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2024 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
+ */
+package com.marklogic.spark;
+
+import org.apache.spark.sql.catalyst.InternalRow;
+import org.apache.spark.sql.catalyst.json.JSONOptions;
+import org.apache.spark.sql.catalyst.json.JacksonGenerator;
+import org.apache.spark.sql.types.StructType;
+import scala.Predef;
+import scala.collection.JavaConverters;
+
+import java.io.StringWriter;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Handles serializing a Spark row into a JSON string. Includes support for all the options defined in Spark's
+ * JSONOptions.scala class.
+ */
+public class JsonRowSerializer {
+
+    private final StructType schema;
+    private final JSONOptions jsonOptions;
+    private final boolean includeNullFields;
+
+    public JsonRowSerializer(StructType schema, Map<String, String> connectorProperties) {
+        this.schema = schema;
+
+        final Map<String, String> options = buildOptionsForJsonOptions(connectorProperties);
+        this.includeNullFields = "false".equalsIgnoreCase(options.get("ignoreNullFields"));
+
+        this.jsonOptions = new JSONOptions(
+            // Funky code to convert a Java map into a Scala immutable Map.
+            JavaConverters.mapAsScalaMapConverter(options).asScala().toMap(Predef.$conforms()),
+
+            // As verified via tests, this default timezone ID is overridden by a user via
+            // the spark.sql.session.timeZone option.
+            "Z",
+
+            // We don't expect corrupted records - i.e. corrupted values - to be present in the index. But Spark
+            // requires this to be set. See
+            // https://medium.com/@sasidharan-r/how-to-handle-corrupt-or-bad-record-in-apache-spark-custom-logic-pyspark-aws-430ddec9bb41
+            // for more information.
+            "_corrupt_record"
+        );
+    }
+
+    public String serializeRowToJson(InternalRow row) {
+        StringWriter writer = new StringWriter();
+        JacksonGenerator jacksonGenerator = new JacksonGenerator(this.schema, writer, this.jsonOptions);
+        jacksonGenerator.write(row);
+        jacksonGenerator.flush();
+        return writer.toString();
+    }
+
+    /**
+     * A user can specify any of the options found in the JSONOptions.scala class - though it's not yet clear where
+     * a user finds out about these except via the Spark source code. "ignoreNullFields", however, is expected to be
+     * the primary one that is configured.
+     */
+    private Map<String, String> buildOptionsForJsonOptions(Map<String, String> connectorProperties) {
+        Map<String, String> options = new HashMap<>();
+        connectorProperties.forEach((key, value) -> {
+            if (key.startsWith(Options.WRITE_JSON_SERIALIZATION_OPTION_PREFIX)) {
+                String optionName = key.substring(Options.WRITE_JSON_SERIALIZATION_OPTION_PREFIX.length());
+                options.put(optionName, value);
+            }
+        });
+        return options;
+    }
+
+    public JSONOptions getJsonOptions() {
+        return jsonOptions;
+    }
+
+    public boolean isIncludeNullFields() {
+        return this.includeNullFields;
+    }
+}
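
A rough sketch of how the new class behaves (the schema, row, and expected output below are assumptions for illustration; only the class itself is part of this commit):

    import org.apache.spark.sql.catalyst.InternalRow;
    import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
    import org.apache.spark.sql.types.DataTypes;
    import org.apache.spark.sql.types.StructType;
    import org.apache.spark.unsafe.types.UTF8String;

    import java.util.HashMap;
    import java.util.Map;

    StructType schema = new StructType()
        .add("name", DataTypes.StringType)
        .add("age", DataTypes.IntegerType);

    Map<String, String> props = new HashMap<>();
    // The prefix is stripped, so "ignoreNullFields=false" reaches Spark's JSONOptions.
    props.put("spark.marklogic.write.json.ignoreNullFields", "false");

    JsonRowSerializer serializer = new JsonRowSerializer(schema, props);
    InternalRow row = new GenericInternalRow(new Object[]{UTF8String.fromString("jane"), null});

    // Expected to produce {"name":"jane","age":null}; with Spark's default of
    // ignoreNullFields=true, the null "age" field would be omitted instead.
    String json = serializer.serializeRowToJson(row);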

src/main/java/com/marklogic/spark/Options.java

Lines changed: 6 additions & 0 deletions

@@ -119,6 +119,12 @@ public abstract class Options {
     public static final String WRITE_XML_ROOT_NAME = "spark.marklogic.write.xmlRootName";
     public static final String WRITE_XML_NAMESPACE = "spark.marklogic.write.xmlNamespace";

+    // For serializing a row into JSON. Intent is to allow for other constants defined in the Spark
+    // JSONOptions.scala class to be used after "spark.marklogic.write.json."
+    // Example - "spark.marklogic.write.json.ignoreNullFields=false".
+    public static final String WRITE_JSON_SERIALIZATION_OPTION_PREFIX = "spark.marklogic.write.json.";
+
     // For writing RDF
     public static final String WRITE_GRAPH = "spark.marklogic.write.graph";
     public static final String WRITE_GRAPH_OVERRIDE = "spark.marklogic.write.graphOverride";
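
Because the prefix is stripped before the remaining key is handed to Spark, any JSONOptions key should pass through the same way. A hedged sketch (ignoreNullFields is the case this commit targets; dateFormat is a standard Spark JSONOptions key, but its use with this connector is an untested assumption):

    Map<String, String> connectorProperties = new HashMap<>();
    connectorProperties.put(Options.WRITE_JSON_SERIALIZATION_OPTION_PREFIX + "ignoreNullFields", "false");
    connectorProperties.put(Options.WRITE_JSON_SERIALIZATION_OPTION_PREFIX + "dateFormat", "yyyy-MM-dd");
    // JsonRowSerializer.buildOptionsForJsonOptions reduces these to
    // {"ignoreNullFields": "false", "dateFormat": "yyyy-MM-dd"} for JSONOptions.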

src/main/java/com/marklogic/spark/Util.java

Lines changed: 0 additions & 15 deletions

@@ -15,10 +15,8 @@
  */
 package com.marklogic.spark;

-import org.apache.spark.sql.catalyst.json.JSONOptions;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import scala.collection.immutable.HashMap;

 import java.util.*;
 import java.util.stream.Stream;

@@ -31,19 +29,6 @@ public interface Util {
      */
     Logger MAIN_LOGGER = LoggerFactory.getLogger("com.marklogic.spark");

-    JSONOptions DEFAULT_JSON_OPTIONS = new JSONOptions(
-        new HashMap<>(),
-
-        // As verified via tests, this default timezone ID is overridden by a user via the spark.sql.session.timeZone option.
-        "Z",
-
-        // We don't expect corrupted records - i.e. corrupted values - to be present in the index. But Spark
-        // requires this to be set. See
-        // https://medium.com/@sasidharan-r/how-to-handle-corrupt-or-bad-record-in-apache-spark-custom-logic-pyspark-aws-430ddec9bb41
-        // for more information.
-        "_corrupt_record"
-    );
-
     static boolean hasOption(Map<String, String> properties, String... options) {
         return Stream.of(options)
             .anyMatch(option -> properties.get(option) != null && properties.get(option).trim().length() > 0);

src/main/java/com/marklogic/spark/reader/JsonRowDeserializer.java

Lines changed: 5 additions & 2 deletions

@@ -2,9 +2,10 @@

 import com.fasterxml.jackson.core.JsonFactory;
 import com.fasterxml.jackson.core.JsonParser;
-import com.marklogic.spark.Util;
+import com.marklogic.spark.JsonRowSerializer;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.json.CreateJacksonParser;
+import org.apache.spark.sql.catalyst.json.JSONOptions;
 import org.apache.spark.sql.catalyst.json.JacksonParser;
 import org.apache.spark.sql.sources.Filter;
 import org.apache.spark.sql.types.StructType;

@@ -15,6 +16,7 @@
 import scala.collection.Seq;

 import java.util.ArrayList;
+import java.util.HashMap;

 /**
  * Handles deserializing a JSON object into a Spark InternalRow. This is accomplished via Spark's JacksonParser.

@@ -45,6 +47,7 @@ public InternalRow deserializeJson(String json) {
     private JacksonParser newJacksonParser(StructType schema) {
         final boolean allowArraysAsStructs = true;
         final Seq<Filter> filters = JavaConverters.asScalaIterator(new ArrayList<Filter>().iterator()).toSeq();
-        return new JacksonParser(schema, Util.DEFAULT_JSON_OPTIONS, allowArraysAsStructs, filters);
+        JSONOptions jsonOptions = new JsonRowSerializer(schema, new HashMap<>()).getJsonOptions();
+        return new JacksonParser(schema, jsonOptions, allowArraysAsStructs, filters);
     }
 }
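
The practical effect is that reads and writes now share a single source of default JSONOptions. A sketch of the round trip this supports (the JsonRowDeserializer constructor signature shown here is assumed from this diff's context, not confirmed by it):

    // Hypothetical: deserialize a JSON document back into a Spark InternalRow
    // using the same default JSONOptions that JsonRowSerializer produces.
    JsonRowDeserializer deserializer = new JsonRowDeserializer(schema);
    InternalRow row = deserializer.deserializeJson("{\"name\":\"jane\",\"age\":null}");
    // Serializing that row back out through JsonRowSerializer with
    // ignoreNullFields=false should reproduce the null-valued field.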
src/main/java/com/marklogic/spark/writer/ArbitraryRowConverter.java

Lines changed: 63 additions & 47 deletions

@@ -1,23 +1,21 @@
 package com.marklogic.spark.writer;

 import com.fasterxml.jackson.core.JsonProcessingException;
+import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.node.ObjectNode;
+import com.fasterxml.jackson.dataformat.xml.XmlMapper;
 import com.marklogic.client.io.Format;
 import com.marklogic.client.io.JacksonHandle;
 import com.marklogic.client.io.StringHandle;
 import com.marklogic.client.io.marker.AbstractWriteHandle;
 import com.marklogic.spark.ConnectorException;
+import com.marklogic.spark.JsonRowSerializer;
 import com.marklogic.spark.Options;
-import com.marklogic.spark.Util;
 import org.apache.spark.sql.catalyst.InternalRow;
-import org.apache.spark.sql.catalyst.json.JacksonGenerator;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
-import org.json.JSONObject;
-import org.json.XML;

-import java.io.StringWriter;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Optional;

@@ -32,24 +30,23 @@ class ArbitraryRowConverter implements RowConverter {
     private static final String MARKLOGIC_SPARK_FILE_PATH_COLUMN_NAME = "marklogic_spark_file_path";

     private final ObjectMapper objectMapper;
-
-    private final StructType schema;
+    private final XmlMapper xmlMapper;
+    private final JsonRowSerializer jsonRowSerializer;
     private final String uriTemplate;
     private final String jsonRootName;
     private final String xmlRootName;
     private final String xmlNamespace;
-
     private final int filePathIndex;

     ArbitraryRowConverter(WriteContext writeContext) {
-        this.schema = writeContext.getSchema();
-        this.filePathIndex = determineFilePathIndex();
-
+        this.filePathIndex = determineFilePathIndex(writeContext.getSchema());
         this.uriTemplate = writeContext.getStringOption(Options.WRITE_URI_TEMPLATE);
         this.jsonRootName = writeContext.getStringOption(Options.WRITE_JSON_ROOT_NAME);
         this.xmlRootName = writeContext.getStringOption(Options.WRITE_XML_ROOT_NAME);
         this.xmlNamespace = writeContext.getStringOption(Options.WRITE_XML_NAMESPACE);
         this.objectMapper = new ObjectMapper();
+        this.xmlMapper = this.xmlRootName != null ? new XmlMapper() : null;
+        this.jsonRowSerializer = new JsonRowSerializer(writeContext.getSchema(), writeContext.getProperties());
     }

     @Override

@@ -60,26 +57,49 @@ public Optional<DocBuilder.DocumentInputs> convertRow(InternalRow row) {
             row.setNullAt(this.filePathIndex);
         }

-        final String json = convertRowToJSONString(row);
-        AbstractWriteHandle contentHandle = this.xmlRootName != null ?
-            new StringHandle(convertJsonToXml(json)).withFormat(Format.XML) :
-            new StringHandle(json).withFormat(Format.JSON);
+        final String json = this.jsonRowSerializer.serializeRowToJson(row);

+        AbstractWriteHandle contentHandle = null;
+        ObjectNode deserializedJson = null;
         ObjectNode uriTemplateValues = null;
-        if (this.uriTemplate != null || this.jsonRootName != null) {
-            ObjectNode jsonObject = readTree(json);
+        final boolean mustRemoveFilePathField = this.filePathIndex > -1 && jsonRowSerializer.isIncludeNullFields();
+
+        if (this.jsonRootName != null || this.xmlRootName != null || this.uriTemplate != null || mustRemoveFilePathField) {
+            deserializedJson = readTree(json);
+            if (mustRemoveFilePathField) {
+                deserializedJson.remove(MARKLOGIC_SPARK_FILE_PATH_COLUMN_NAME);
+            }
+        }
+
+        if (this.uriTemplate != null) {
+            uriTemplateValues = deserializedJson;
+        }
+
+        if (this.jsonRootName != null) {
+            ObjectNode jsonObjectWithRootName = objectMapper.createObjectNode();
+            jsonObjectWithRootName.set(jsonRootName, deserializedJson);
+            contentHandle = new JacksonHandle(jsonObjectWithRootName);
             if (this.uriTemplate != null) {
-                uriTemplateValues = jsonObject;
+                uriTemplateValues = jsonObjectWithRootName;
             }
-            if (this.jsonRootName != null) {
-                ObjectNode root = objectMapper.createObjectNode();
-                root.set(jsonRootName, jsonObject);
-                contentHandle = new JacksonHandle(root);
-                if (this.uriTemplate != null) {
-                    uriTemplateValues = root;
-                }
+        }
+
+        if (contentHandle == null) {
+            // If the user wants XML, then we've definitely deserialized the JSON and removed the file path if
+            // needed. So use that JsonNode to produce an XML string.
+            if (xmlRootName != null) {
+                contentHandle = new StringHandle(convertJsonToXml(deserializedJson)).withFormat(Format.XML);
+            }
+            // If we've already gone to the effort of creating deserializedJson, use it for the content.
+            else if (deserializedJson != null) {
+                contentHandle = new JacksonHandle(deserializedJson);
+            } else {
+                // Simplest scenario where we never have a reason to incur the expense of deserializing the JSON
+                // string, so we can just use StringHandle.
+                contentHandle = new StringHandle(json).withFormat(Format.JSON);
             }
         }
+
         return Optional.of(new DocBuilder.DocumentInputs(initialUri, contentHandle, uriTemplateValues, null));
     }

@@ -98,7 +118,7 @@ public List<DocBuilder.DocumentInputs> getRemainingDocumentInputs() {
      *
      * @return
      */
-    private int determineFilePathIndex() {
+    private int determineFilePathIndex(StructType schema) {
         StructField[] fields = schema.fields();
         for (int i = 0; i < fields.length; i++) {
             if (MARKLOGIC_SPARK_FILE_PATH_COLUMN_NAME.equals(fields[i].name())) {

@@ -118,33 +138,29 @@ private ObjectNode readTree(String json) {
         }
     }

-    private String convertRowToJSONString(InternalRow row) {
-        StringWriter writer = new StringWriter();
-        JacksonGenerator jacksonGenerator = new JacksonGenerator(this.schema, writer, Util.DEFAULT_JSON_OPTIONS);
-        jacksonGenerator.write(row);
-        jacksonGenerator.flush();
-        return writer.toString();
-    }
-
     /**
      * jackson-xml-mapper unfortunately does not yet support a root namespace. Nor does it allow for the root element
      * to be omitted. So we always end up with "ObjectNode" as a root element. See
-     * https://github.com/FasterXML/jackson-dataformat-xml/issues/541 for more information.
-     * <p>
-     * While JSON-Java does not support a root namespace, it does allow for the root element to be omitted. That is
-     * sufficient for us, as we can then generate our own root element - albeit via string concatenation - that
-     * includes a user-defined namespace.
+     * https://github.com/FasterXML/jackson-dataformat-xml/issues/541 for more information. So this method does some
+     * work to replace that root element with one based on user inputs.
      *
-     * @param json
+     * @param doc
      * @return
      */
-    private String convertJsonToXml(String json) {
-        JSONObject jsonObject = new JSONObject(json);
-        if (this.xmlNamespace != null) {
-            StringBuilder xml = new StringBuilder(String.format("<%s xmlns='%s'>", this.xmlRootName, this.xmlNamespace));
-            xml.append(XML.toString(jsonObject, null));
-            return xml.append(String.format("</%s>", this.xmlRootName)).toString();
+    private String convertJsonToXml(JsonNode doc) {
+        try {
+            String xml = xmlMapper.writer().writeValueAsString(doc);
+            String startTag = this.xmlNamespace != null ?
+                String.format("<%s xmlns='%s'>", this.xmlRootName, this.xmlNamespace) :
+                String.format("<%s>", this.xmlRootName);
+            return new StringBuilder(startTag)
+                .append(xml.substring("<ObjectNode>".length(), xml.length() - "</ObjectNode>".length()))
+                .append(String.format("</%s>", this.xmlRootName))
+                .toString();
+        } catch (JsonProcessingException e) {
+            // We don't expect this to occur; Jackson should be able to convert any JSON object that it created into
+            // a valid XML document.
+            throw new ConnectorException(String.format("Unable to convert JSON to XML for doc: %s", doc), e);
         }
-        return XML.toString(jsonObject, this.xmlRootName);
     }
 }
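
To make the root-element replacement concrete, a standalone sketch of the technique convertJsonToXml relies on (XmlRootRename and the root/namespace values are illustrative; the <ObjectNode> wrapper behavior is per the linked jackson-dataformat-xml issue):

    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.databind.node.ObjectNode;
    import com.fasterxml.jackson.dataformat.xml.XmlMapper;

    public class XmlRootRename {
        public static void main(String[] args) throws Exception {
            ObjectNode doc = new ObjectMapper().createObjectNode().put("hello", "world");

            // XmlMapper names the root element after the node's class: "ObjectNode".
            String xml = new XmlMapper().writer().writeValueAsString(doc);
            // xml == "<ObjectNode><hello>world</hello></ObjectNode>"

            // Strip the generated root element and wrap the body in a user-defined
            // root that can carry a namespace - the same string surgery performed above.
            String body = xml.substring("<ObjectNode>".length(), xml.length() - "</ObjectNode>".length());
            System.out.println("<myRoot xmlns='org:example'>" + body + "</myRoot>");
            // Prints: <myRoot xmlns='org:example'><hello>world</hello></myRoot>
        }
    }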
