
Commit 7512637

MLE-12332 Can now read RDF/XML files
Main things of note:
1. New `TripleRowSchema` instead of using the document row schema, as we'll need to capture the graph for quads.
2. `TripleSerializer` is mostly copy/pasted (and cleaned up a bunch) from MLCP.
1 parent 0c4b1aa commit 7512637
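
For orientation, here is a minimal sketch of how the new RDF support is used from Spark. It mirrors the new ReadRdfFilesTest further down; the "marklogic" format name and the local SparkSession setup are assumptions for the sketch, not part of this commit (the tests reference the format via a CONNECTOR_IDENTIFIER constant).

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadRdfExample {
    public static void main(String[] args) {
        // Assumes the connector jar is on the classpath and that "marklogic" is its short name.
        SparkSession session = SparkSession.builder()
            .master("local[*]")
            .getOrCreate();

        Dataset<Row> triples = session.read()
            .format("marklogic")
            // New option introduced by this commit; "rdf" routes file reading to RdfFileReader.
            .option("spark.marklogic.read.files.type", "rdf")
            .load("src/test/resources/rdf/mini-taxonomy.xml");

        // Each row follows TripleRowSchema: subject, predicate, object, datatype, lang, graph.
        triples.show(10, false);
    }
}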

12 files changed (+307, -9 lines)

build.gradle

Lines changed: 7 additions & 3 deletions
@@ -13,8 +13,10 @@ group 'com.marklogic'
 version '2.2-SNAPSHOT'
 
 java {
-    sourceCompatibility = 1.8
-    targetCompatibility = 1.8
+    // To support reading RDF files, Apache Jena is used - but that requires Java 11. If we want to do a 2.2.0 release
+    // without requiring Java 11, we'll remove the support for reading RDF files along with the Jena dependency.
+    sourceCompatibility = 11
+    targetCompatibility = 11
 }
 
 repositories {
@@ -40,6 +42,8 @@ dependencies {
         exclude module: "scala-library"
     }
 
+    implementation "org.apache.jena:jena-arq:4.10.0"
+
     testImplementation 'org.apache.spark:spark-sql_2.12:' + sparkVersion
 
     // The exclusions in these two modules ensure that we use the Jackson libraries from spark-sql when running the tests.
@@ -56,7 +60,7 @@ dependencies {
         exclude module: 'jackson-dataformat-csv'
     }
 
-    testImplementation "ch.qos.logback:logback-classic:1.3.5"
+    testImplementation "ch.qos.logback:logback-classic:1.3.14"
     testImplementation "org.slf4j:jcl-over-slf4j:1.7.36"
     testImplementation "org.skyscreamer:jsonassert:1.5.1"
 }

src/main/java/com/marklogic/spark/DefaultSource.java

Lines changed: 3 additions & 3 deletions
@@ -21,6 +21,7 @@
 import com.marklogic.spark.reader.document.DocumentRowSchema;
 import com.marklogic.spark.reader.document.DocumentTable;
 import com.marklogic.spark.reader.file.FileRowSchema;
+import com.marklogic.spark.reader.file.TripleRowSchema;
 import com.marklogic.spark.reader.optic.SchemaInferrer;
 import com.marklogic.spark.writer.WriteContext;
 import org.apache.spark.sql.SparkSession;
@@ -63,7 +64,7 @@ public String shortName() {
     public StructType inferSchema(CaseInsensitiveStringMap options) {
         final Map<String, String> properties = options.asCaseSensitiveMap();
         if (isFileOperation(properties)) {
-            return FileRowSchema.SCHEMA;
+            return "rdf".equals(properties.get(Options.READ_FILES_TYPE)) ? TripleRowSchema.SCHEMA : FileRowSchema.SCHEMA;
         }
         if (isReadDocumentsOperation(properties)) {
             return DocumentRowSchema.SCHEMA;
@@ -85,8 +86,7 @@ public Table getTable(StructType schema, Transform[] partitioning, Map<String, S
 
         if (isReadDocumentsOperation(properties)) {
             return new DocumentTable();
-        }
-        else if (isReadOperation(properties)) {
+        } else if (isReadOperation(properties)) {
             if (logger.isDebugEnabled()) {
                 logger.debug("Creating new table for reading");
             }

src/main/java/com/marklogic/spark/Options.java

Lines changed: 1 addition & 0 deletions
@@ -55,6 +55,7 @@ public abstract class Options {
     public static final String READ_DOCUMENTS_TRANSFORM_PARAMS_DELIMITER = "spark.marklogic.read.documents.transformParamsDelimiter";
     public static final String READ_DOCUMENTS_PARTITIONS_PER_FOREST = "spark.marklogic.read.documents.partitionsPerForest";
 
+    public static final String READ_FILES_TYPE = "spark.marklogic.read.files.type";
     public static final String READ_FILES_COMPRESSION = "spark.marklogic.read.files.compression";
 
     // "Aggregate" = an XML document containing N child elements, each of which should become a row / document.

src/main/java/com/marklogic/spark/reader/document/DocumentRowSchema.java

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 public abstract class DocumentRowSchema {
 
     public static final StructType SCHEMA = new StructType()
-        .add("URI", DataTypes.StringType)
+        .add("URI", DataTypes.StringType, false)
         .add("content", DataTypes.BinaryType)
         .add("format", DataTypes.StringType)
         .add("collections", DataTypes.createArrayType(DataTypes.StringType))

src/main/java/com/marklogic/spark/reader/file/FilePartitionReaderFactory.java

Lines changed: 5 additions & 0 deletions
@@ -25,10 +25,15 @@ class FilePartitionReaderFactory implements PartitionReaderFactory {
     @Override
     public PartitionReader<InternalRow> createReader(InputPartition partition) {
         FilePartition filePartition = (FilePartition) partition;
+
         String compression = this.properties.get(Options.READ_FILES_COMPRESSION);
         final boolean isZip = "zip".equalsIgnoreCase(compression);
         final boolean isGzip = "gzip".equalsIgnoreCase(compression);
 
+        if ("rdf".equalsIgnoreCase(this.properties.get(Options.READ_FILES_TYPE))) {
+            return new RdfFileReader(filePartition, hadoopConfiguration);
+        }
+
         String aggregateXmlElement = this.properties.get(Options.READ_AGGREGATES_XML_ELEMENT);
         if (aggregateXmlElement != null && !aggregateXmlElement.trim().isEmpty()) {
             if (isZip) {

src/main/java/com/marklogic/spark/reader/file/FileRowSchema.java

Lines changed: 2 additions & 2 deletions
@@ -8,10 +8,10 @@ public abstract class FileRowSchema {
     // Same as Spark's binaryType.
     // See https://spark.apache.org/docs/latest/sql-data-sources-binaryFile.html .
     public static final StructType SCHEMA = new StructType()
-        .add("path", DataTypes.StringType)
+        .add("path", DataTypes.StringType, false)
         .add("modificationTime", DataTypes.TimestampType)
         .add("length", DataTypes.LongType)
-        .add("content", DataTypes.BinaryType);
+        .add("content", DataTypes.BinaryType, false);
 
     private FileRowSchema() {
     }
src/main/java/com/marklogic/spark/reader/file/RdfFileReader.java

Lines changed: 61 additions & 0 deletions

package com.marklogic.spark.reader.file;

import com.marklogic.spark.ConnectorException;
import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.Path;
import org.apache.jena.graph.Triple;
import org.apache.jena.riot.Lang;
import org.apache.jena.riot.RDFParserBuilder;
import org.apache.jena.riot.system.AsyncParser;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.connector.read.PartitionReader;
import org.apache.spark.util.SerializableConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;

class RdfFileReader implements PartitionReader<InternalRow> {

    private static final Logger logger = LoggerFactory.getLogger(RdfFileReader.class);

    private final InputStream inputStream;
    private final Iterator<Triple> tripleStream;

    private final TripleSerializer tripleSerializer = new TripleSerializer();

    RdfFileReader(FilePartition partition, SerializableConfiguration hadoopConfiguration) {
        if (logger.isDebugEnabled()) {
            logger.debug("Reading RDF file {}", partition.getPath());
        }
        Path path = new Path(partition.getPath());
        try {
            this.inputStream = path.getFileSystem(hadoopConfiguration.value()).open(path);
            this.tripleStream = AsyncParser.of(RDFParserBuilder.create()
                .source(this.inputStream)
                .lang(Lang.RDFXML)
                .base(partition.getPath())
            ).streamTriples().iterator();
        } catch (Exception e) {
            throw new ConnectorException(String.format("Unable to read RDF file at %s; cause: %s", path, e.getMessage()), e);
        }
    }

    @Override
    public boolean next() throws IOException {
        return this.tripleStream.hasNext();
    }

    @Override
    public InternalRow get() {
        Triple triple = this.tripleStream.next();
        return tripleSerializer.serialize(triple);
    }

    @Override
    public void close() throws IOException {
        IOUtils.closeQuietly(this.inputStream);
    }
}
src/main/java/com/marklogic/spark/reader/file/TripleRowSchema.java

Lines changed: 22 additions & 0 deletions

package com.marklogic.spark.reader.file;

import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

/**
 * Represents a triple as read from an RDF file and serialized into the 3 XML elements comprising
 * a MarkLogic triple.
 */
public abstract class TripleRowSchema {

    public static final StructType SCHEMA = new StructType()
        .add("subject", DataTypes.StringType, false)
        .add("predicate", DataTypes.StringType, false)
        .add("object", DataTypes.StringType, false)
        .add("datatype", DataTypes.StringType)
        .add("lang", DataTypes.StringType)
        .add("graph", DataTypes.StringType);

    private TripleRowSchema() {
    }
}
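
As a concrete illustration of this schema, the language-tagged triple verified in ReadRdfFilesTest further down maps onto the columns as follows. The values come from that test, not from running the connector here; "triples" refers to the dataset in the usage sketch near the top of this page.

Row prefLabel = triples.collectAsList().get(4);
prefLabel.getString(0); // "http://vocabulary.worldbank.org/taxonomy/451"  -> subject
prefLabel.getString(1); // "http://www.w3.org/2004/02/skos/core#prefLabel" -> predicate
prefLabel.getString(2); // "Debt Management"                               -> object
prefLabel.get(3);       // null -> datatype (null because the literal carries a language tag)
prefLabel.getString(4); // "en" -> lang
prefLabel.get(5);       // null -> graph (reserved for quad support; always null in this commit)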
src/main/java/com/marklogic/spark/reader/file/TripleSerializer.java

Lines changed: 100 additions & 0 deletions

package com.marklogic.spark.reader.file;

import org.apache.jena.graph.Node;
import org.apache.jena.graph.Triple;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.unsafe.types.UTF8String;

import java.util.Random;

/**
 * Captures the logic from Content Pump for serializing a Jena Triple into a string representation. Note that this does
 * not contain the "escape XML" logic in the MLCP code as we don't care about an XML representation of the triples yet.
 * We just want to return the raw values so they can be added to a Spark row.
 */
class TripleSerializer {

    // These are both used in the MLCP-specific code below for generating a "blank" value.
    private final static long HASH64_STEP = 15485863L;
    private final Random random = new Random();

    public InternalRow serialize(Triple triple) {
        String[] objectValues = serializeObject(triple);
        return new GenericInternalRow(new Object[]{
            UTF8String.fromString(serialize(triple.getSubject())),
            UTF8String.fromString(serialize(triple.getPredicate())),
            UTF8String.fromString(objectValues[0]),
            objectValues[1] != null ? UTF8String.fromString(objectValues[1]) : null,
            objectValues[2] != null ? UTF8String.fromString(objectValues[2]) : null,
            null
        });
    }

    private String serialize(Node node) {
        return node.isBlank() ? generateBlankValue(node) : node.toString();
    }

    /**
     * @param triple
     * @return an array containing a string serialization of the object; an optional datatype; and an optional "lang" value.
     */
    private String[] serializeObject(Triple triple) {
        Node node = triple.getObject();
        if (node.isLiteral()) {
            String type = node.getLiteralDatatypeURI();
            String lang = node.getLiteralLanguage();
            if ("".equals(lang)) {
                lang = null;
            }
            if ("".equals(lang) || lang == null) {
                if (type == null) {
                    type = "http://www.w3.org/2001/XMLSchema#string";
                }
            } else {
                type = null;
            }
            return new String[]{node.getLiteralLexicalForm(), type, lang};
        } else if (node.isBlank()) {
            return new String[]{generateBlankValue(node), null, null};
        } else {
            return new String[]{node.toString(), null, null};
        }
    }

    /**
     * Reuses copy/pasted code from the MLCP codebase for generating a blank value for a "blank node" - see
     * https://en.wikipedia.org/wiki/Blank_node for more details. It is not known why a UUID isn't used.
     *
     * @return
     */
    private String generateBlankValue(Node blankNode) {
        String value = Long.toHexString(
            hash64(
                fuse(scramble(System.currentTimeMillis()), random.nextLong()),
                blankNode.getBlankNodeLabel()
            )
        );
        return "http://marklogic.com/semantics/blank/" + value;
    }

    private long hash64(long value, String str) {
        char[] arr = str.toCharArray();
        for (int i = 0; i < str.length(); i++) {
            value = (value + Character.getNumericValue(arr[i])) * HASH64_STEP;
        }
        return value;
    }

    private long fuse(long a, long b) {
        return rotl(a, 8) ^ b;
    }

    private long scramble(long x) {
        return x ^ rotl(x, 20) ^ rotl(x, 40);
    }

    private long rotl(long x, long y) {
        return (x << y) ^ (x >> (64 - y));
    }
}
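
To make the datatype/lang rules in serializeObject concrete, here is a rough sketch of how a few object node types would come out. This is hypothetical test-style code, not part of the commit; it would have to live in the com.marklogic.spark.reader.file package since TripleSerializer is package-private, and the expected values in the comments follow the rules in serializeObject above.

package com.marklogic.spark.reader.file;

import org.apache.jena.datatypes.xsd.XSDDatatype;
import org.apache.jena.graph.Node;
import org.apache.jena.graph.NodeFactory;
import org.apache.jena.graph.Triple;
import org.apache.spark.sql.catalyst.InternalRow;

class TripleSerializerSketch {

    public static void main(String[] args) {
        TripleSerializer serializer = new TripleSerializer();
        Node subject = NodeFactory.createURI("http://example.org/s");
        Node predicate = NodeFactory.createURI("http://example.org/p");

        // Plain literal: object "hello", datatype xsd:string, lang null.
        InternalRow plain = serializer.serialize(
            Triple.create(subject, predicate, NodeFactory.createLiteral("hello")));

        // Language-tagged literal: object "bonjour", datatype null, lang "fr".
        InternalRow tagged = serializer.serialize(
            Triple.create(subject, predicate, NodeFactory.createLiteral("bonjour", "fr")));

        // Typed literal with no language: object "42", datatype xsd:integer, lang null.
        InternalRow typed = serializer.serialize(
            Triple.create(subject, predicate, NodeFactory.createLiteral("42", XSDDatatype.XSDinteger)));

        // Blank-node subjects or objects get a generated http://marklogic.com/semantics/blank/...
        // value; the graph column is always null in this commit.
    }
}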
src/test/java/com/marklogic/spark/reader/file/ReadRdfFilesTest.java

Lines changed: 76 additions & 0 deletions

package com.marklogic.spark.reader.file;

import com.marklogic.spark.AbstractIntegrationTest;
import com.marklogic.spark.Options;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.junit.jupiter.api.Test;

import java.util.List;

import static org.junit.jupiter.api.Assertions.*;

class ReadRdfFilesTest extends AbstractIntegrationTest {

    @Test
    void rdfXml() {
        Dataset<Row> dataset = newSparkSession()
            .read()
            .format(CONNECTOR_IDENTIFIER)
            .option(Options.READ_FILES_TYPE, "rdf")
            .load("src/test/resources/rdf/mini-taxonomy.xml");

        List<Row> rows = dataset.collectAsList();
        assertEquals(8, rows.size(), "Expecting 8 triples, as there are 8 child elements in the " +
            "single rdf:Description element in the test file.");

        // Verify a few triples to make sure things look good.
        final String subject = "http://vocabulary.worldbank.org/taxonomy/451";
        verifyRow(rows.get(0), subject, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", "http://www.w3.org/2004/02/skos/core#Concept");
        verifyRow(rows.get(1), subject, "http://purl.org/dc/terms/creator", "wb", "http://www.w3.org/2001/XMLSchema#string", null);
        verifyRow(rows.get(4), subject, "http://www.w3.org/2004/02/skos/core#prefLabel", "Debt Management", null, "en");
    }

    /**
     * Verifies that blank nodes are generated in the same manner as with MLCP.
     */
    @Test
    void blankNodes() {
        Dataset<Row> dataset = newSparkSession()
            .read()
            .format(CONNECTOR_IDENTIFIER)
            .option(Options.READ_FILES_TYPE, "rdf")
            .load("src/test/resources/rdf/blank-nodes.xml");

        dataset.show(10, 0, true);
        List<Row> rows = dataset.collectAsList();
        assertEquals(4, rows.size());

        verifyRow(rows.get(0), "http://example.org/web-data", "http://example.org/data#title", "Web Data",
            "http://www.w3.org/2001/XMLSchema#string", null);

        assertBlankValue(rows.get(1).getString(2));
        assertBlankValue(rows.get(2).getString(0));
        assertBlankValue(rows.get(3).getString(0));
    }

    private void verifyRow(Row row, String subject, String predicate, String object) {
        verifyRow(row, subject, predicate, object, null, null);
    }

    private void verifyRow(Row row, String subject, String predicate, String object, String datatype, String lang) {
        assertEquals(subject, row.getString(0));
        assertEquals(predicate, row.getString(1));
        assertEquals(object, row.getString(2));
        assertEquals(datatype, row.get(3));
        assertEquals(lang, row.getString(4));
        assertNull(row.get(5), "The graph is expected to be null since these are triples and not quads.");
    }

    private void assertBlankValue(String value) {
        assertTrue(value.startsWith("http://marklogic.com/semantics/blank/"),
            "We are reusing copy/pasted code from MLCP for generating a 'blank' value, which is expected to end with " +
                "a random hex value. It is not known why this isn't just a Java-generated UUID; we're simply reusing " +
                "the code because it's what MLCP does. Actual value: " + value);
    }
}
