Merge pull request #277 from marklogic/feature/15791-agg-xml-perf

rjrudin · web-flow · commit e33ac58baeee · 2024-08-02T12:29:38.000-04:00
MLE-15791 Performance improvement for extracting URI element value
diff --git a/src/main/java/com/marklogic/spark/reader/file/xml/AggregateXmlSplitter.java b/src/main/java/com/marklogic/spark/reader/file/xml/AggregateXmlSplitter.java
@@ -13,7 +13,6 @@
 import javax.xml.stream.XMLInputFactory;
 import javax.xml.stream.XMLStreamException;
 import javax.xml.stream.XMLStreamReader;
-import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.Iterator;
@@ -61,9 +60,13 @@ class AggregateXmlSplitter {
         final String element = fileContext.getStringOption(Options.READ_AGGREGATES_XML_ELEMENT);
         final String encoding = fileContext.getStringOption(Options.READ_FILES_ENCODING);
 
+        final XMLSplitter<StringHandle> splitter = this.uriElement != null ?
+            new XMLSplitter<>(new UriElementExtractingVisitor(namespace, element, uriNamespace, uriElement)) :
+            XMLSplitter.makeSplitter(namespace, element);
+
         try {
             XMLStreamReader reader = xmlInputFactory.createXMLStreamReader(inputStream, encoding);
-            this.contentStream = XMLSplitter.makeSplitter(namespace, element).split(reader).iterator();
+            this.contentStream = splitter.split(reader).iterator();
         } catch (IOException | XMLStreamException e) {
             throw new ConnectorException(
                 String.format("Unable to read XML at %s; cause: %s", this.identifierForErrors, e.getMessage()), e
@@ -81,88 +84,39 @@ boolean hasNext() {
     }
 
     /**
-     * @param pathPrefix used to construct a path if no uriElement was specified
+     * @param uriPrefix used to construct a URI if no uriElement was specified
      * @return a row corresponding to the {@code DocumentRowSchema}
      */
-    InternalRow nextRow(String pathPrefix) {
-        String xml;
+    InternalRow nextRow(String uriPrefix) {
+        StringHandle stringHandle;
         try {
-            xml = this.contentStream.next().get();
+            stringHandle = this.contentStream.next();
         } catch (RuntimeException ex) {
             String message = String.format("Unable to read XML from %s; cause: %s",
                 this.identifierForErrors, ex.getMessage());
             throw new ConnectorException(message, ex);
         }
 
-        final String path = this.uriElement != null && !this.uriElement.trim().isEmpty() ?
-            extractUriElementValue(xml) :
-            pathPrefix + "-" + rowCounter + ".xml";
-
+        final String initialUri = determineInitialUri(stringHandle, uriPrefix);
         rowCounter++;
-
-        byte[] content = xml.getBytes();
         return new GenericInternalRow(new Object[]{
-            UTF8String.fromString(path),
-            ByteArray.concat(content),
+            UTF8String.fromString(initialUri),
+            ByteArray.concat(stringHandle.get().getBytes()),
             UTF8String.fromString("xml"),
             null, null, null, null, null
         });
     }
 
-    /**
-     * MLCP has undocumented support for attribute references via "@(attribute-name)". We are not supporting this yet
-     * as we are using XMLSplitter to find the user-defined element, and XMLSplitter does not support finding
-     * attributes. Additionally, this feature is still fairly limited in comparison to the "URI template" that the
-     * connector supports. Ultimately, we'd want to support N path expressions against both Spark columns and against
-     * a JSON or XML tree in a single Spark column.
-     *
-     * @param xml
-     * @return
-     */
-    private String extractUriElementValue(String xml) {
-        Iterator<StringHandle> iterator;
-        XMLSplitter<StringHandle> splitter = XMLSplitter.makeSplitter(this.uriNamespace, this.uriElement);
-        splitter.setVisitor(new UriElementVisitor(this.uriNamespace, this.uriElement));
-        try {
-            iterator = splitter.split(new ByteArrayInputStream(xml.getBytes())).iterator();
-        } catch (Exception e) {
-            // We don't expect this to ever occur, as if the XML couldn't be parsed, an error would have been thrown
-            // when the child element was originally extracted. But still have to catch an exception.
-            String message = String.format("Unable to parse XML in aggregate element %d in %s; cause: %s",
-                rowCounter, this.identifierForErrors, e.getMessage());
-            throw new ConnectorException(message, e);
-        }
-
-        if (!iterator.hasNext()) {
-            String message = String.format("No occurrence of URI element '%s' found in aggregate element %d in %s",
-                this.uriElement, rowCounter, this.identifierForErrors);
-            throw new ConnectorException(message);
-        }
-        return iterator.next().get();
-    }
-
-    /**
-     * Extends the Java Client visitor class so that it can return a handle containing the text of the
-     * user-defined URI element.
-     */
-    private class UriElementVisitor extends XMLSplitter.BasicElementVisitor {
-        public UriElementVisitor(String nsUri, String localName) {
-            super(nsUri, localName);
-        }
-
-        @Override
-        public StringHandle makeBufferedHandle(XMLStreamReader xmlStreamReader) {
-            String text;
-            try {
-                text = xmlStreamReader.getElementText();
-            } catch (XMLStreamException e) {
-                String message = String.format(
-                    "Unable to get text from URI element '%s' found in aggregate element %d in %s; cause: %s",
-                    uriElement, rowCounter, identifierForErrors, e.getMessage()
-                );
-                throw new ConnectorException(message, e);
+    private String determineInitialUri(StringHandle stringHandle, String uriPrefix) {
+        if (stringHandle instanceof StringHandleWithUriValue) {
+            String uriValue = ((StringHandleWithUriValue) stringHandle).getUriValue();
+            if (uriValue == null) {
+                String message = String.format("No occurrence of URI element '%s' found in aggregate element %d in %s",
+                    this.uriElement, rowCounter, this.identifierForErrors);
+                throw new ConnectorException(message);
             }
-            return new StringHandle(text);
+            return uriValue;
         }
+        return String.format("%s-%d.xml", uriPrefix, rowCounter);
     }
 }
diff --git a/src/main/java/com/marklogic/spark/reader/file/xml/StringHandleWithUriValue.java b/src/main/java/com/marklogic/spark/reader/file/xml/StringHandleWithUriValue.java
@@ -0,0 +1,23 @@
+/*
+ * Copyright © 2024 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
+ */
+package com.marklogic.spark.reader.file.xml;
+
+import com.marklogic.client.io.StringHandle;
+
+/**
+ * Captures a URI value based on the user-defined URI element.
+ */
+class StringHandleWithUriValue extends StringHandle {
+
+    private final String uriValue;
+
+    StringHandleWithUriValue(String content, String uriValue) {
+        super(content);
+        this.uriValue = uriValue;
+    }
+
+    String getUriValue() {
+        return uriValue;
+    }
+}
diff --git a/src/main/java/com/marklogic/spark/reader/file/xml/UriElementExtractingReader.java b/src/main/java/com/marklogic/spark/reader/file/xml/UriElementExtractingReader.java
@@ -0,0 +1,58 @@
+/*
+ * Copyright © 2024 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
+ */
+package com.marklogic.spark.reader.file.xml;
+
+import javax.xml.stream.XMLStreamConstants;
+import javax.xml.stream.XMLStreamException;
+import javax.xml.stream.XMLStreamReader;
+import javax.xml.stream.util.StreamReaderDelegate;
+
+/**
+ * Knows how to extract a URI element value while the XML for an aggregate element is being read and serialized.
+ */
+class UriElementExtractingReader extends StreamReaderDelegate {
+
+    private XMLStreamReader source;
+    private final String uriNamespace;
+    private final String uriElement;
+
+    // Used to track when the URI element is detected.
+    private boolean isReadingUriElement;
+    private String uriValue;
+
+    UriElementExtractingReader(XMLStreamReader source, String uriNamespace, String uriElement) {
+        super(source);
+        this.source = source;
+        this.uriNamespace = uriNamespace;
+        this.uriElement = uriElement;
+    }
+
+    @Override
+    public int next() throws XMLStreamException {
+        int value = source.next();
+        if (value == XMLStreamConstants.START_ELEMENT) {
+            // Only use the first instance of the URI element that is found.
+            if (matchesUriElement() && this.uriValue == null) {
+                this.isReadingUriElement = true;
+                this.uriValue = "";
+            }
+        } else if (value == XMLStreamConstants.CHARACTERS) {
+            if (this.isReadingUriElement) {
+                this.uriValue += source.getText();
+            }
+        } else if (value == XMLStreamConstants.END_ELEMENT && this.isReadingUriElement && matchesUriElement()) {
+            this.isReadingUriElement = false;
+        }
+        return value;
+    }
+
+    private boolean matchesUriElement() {
+        return source.getLocalName().equals(uriElement) &&
+            (this.uriNamespace == null || this.uriNamespace.equals(source.getNamespaceURI()));
+    }
+
+    String getUriValue() {
+        return uriValue;
+    }
+}
diff --git a/src/main/java/com/marklogic/spark/reader/file/xml/UriElementExtractingVisitor.java b/src/main/java/com/marklogic/spark/reader/file/xml/UriElementExtractingVisitor.java
@@ -0,0 +1,33 @@
+/*
+ * Copyright © 2024 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
+ */
+package com.marklogic.spark.reader.file.xml;
+
+import com.marklogic.client.datamovement.XMLSplitter;
+import com.marklogic.client.io.Format;
+import com.marklogic.client.io.StringHandle;
+
+import javax.xml.stream.XMLStreamReader;
+
+/**
+ * Supports extracting a URI element value for each aggregate element.
+ */
+class UriElementExtractingVisitor extends XMLSplitter.BasicElementVisitor {
+
+    private final String uriNamespace;
+    private final String uriElement;
+
+    UriElementExtractingVisitor(String nsUri, String localName, String uriNamespace, String uriElement) {
+        super(nsUri, localName);
+        this.uriNamespace = uriNamespace;
+        this.uriElement = uriElement;
+    }
+
+    @Override
+    public StringHandle makeBufferedHandle(XMLStreamReader xmlStreamReader) {
+        UriElementExtractingReader reader = new UriElementExtractingReader(xmlStreamReader, uriNamespace, uriElement);
+        String content = serialize(reader);
+        String uriValue = reader.getUriValue();
+        return new StringHandleWithUriValue(content, uriValue).withFormat(Format.XML);
+    }
+}
diff --git a/src/test/java/com/marklogic/spark/reader/file/ReadAggregateXmlFilesTest.java b/src/test/java/com/marklogic/spark/reader/file/ReadAggregateXmlFilesTest.java
@@ -100,17 +100,20 @@ void uriElementHasNamespace() {
 
     @Test
     void uriElementHasMixedContent() {
-        Dataset<Row> dataset = newSparkSession().read()
+        List<Row> rows = newSparkSession().read()
             .format(CONNECTOR_IDENTIFIER)
             .option(Options.READ_AGGREGATES_XML_ELEMENT, "Employee")
             .option(Options.READ_AGGREGATES_XML_URI_ELEMENT, "mixed")
-            .load("src/test/resources/aggregates/employees.xml");
+            .load("src/test/resources/aggregates/employees.xml")
+            .collectAsList();
 
-        ConnectorException ex = assertThrowsConnectorException(() -> dataset.count());
-        String message = ex.getMessage();
-        assertTrue(message.startsWith("Unable to get text from URI element 'mixed' found in aggregate element 1 in file"),
-            "The error should identify the URI element that text could not be retrieved from along with which aggregate " +
-                "element produced the failure; actual message: " + message);
+        rows.forEach(row -> {
+            String uri = row.getString(0);
+            assertEquals("has mixed content", uri, "We don't have a good reason to throw an exception when the user " +
+                "specifies a URI element with mixed content. While MLCP carefully reconstructs the XML, and thus may " +
+                "not want to deal with the complexity of mixed content, our connector plucks the URI element value " +
+                "while the element is transformed into a string via a standard Java Transformer.");
+        });
     }
 
     @Test
diff --git a/src/test/java/com/marklogic/spark/reader/file/ReadAggregateXmlZipFilesTest.java b/src/test/java/com/marklogic/spark/reader/file/ReadAggregateXmlZipFilesTest.java
@@ -53,21 +53,17 @@ void twoZipsOnePartition() {
 
     @Test
     void uriElementHasMixedContent() {
-        Dataset<Row> dataset = newSparkSession().read()
+        List<Row> rows = newSparkSession().read()
             .format(CONNECTOR_IDENTIFIER)
             .option(Options.READ_AGGREGATES_XML_ELEMENT, "Employee")
             .option(Options.READ_AGGREGATES_XML_URI_ELEMENT, "mixed")
             .option(Options.READ_FILES_COMPRESSION, "zip")
-            .load("src/test/resources/aggregate-zips/employee-aggregates.zip");
+            .load("src/test/resources/aggregate-zips/employee-aggregates.zip")
+            .collectAsList();
 
-        ConnectorException ex = assertThrowsConnectorException(() -> dataset.count());
-        String message = ex.getMessage();
-        assertTrue(
-            message.startsWith(
-                "Unable to get text from URI element 'mixed' found in aggregate element 1 in entry employees.xml in file:///"
-            ),
-            "The error should identify the URI element that text could not be retrieved from along with which aggregate " +
-                "element and which zip entry produced the failure; actual message: " + message
+        rows.forEach(row ->
+            assertEquals("has mixed content", row.getString(0),
+                "Mixed content is supported by the connector in a URI element.")
         );
     }