Skip to content

Commit 29eefe1

Browse files
authored
Merge pull request #327 from marklogic/feature/2.4.2-read-json-lines
MLE-17412 Added JSON Lines file reader
2 parents 67cebe8 + 66cc33c commit 29eefe1

File tree

6 files changed

+237
-5
lines changed

6 files changed

+237
-5
lines changed

src/main/java/com/marklogic/spark/reader/file/FileContext.java

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,7 @@
1212
import org.apache.hadoop.fs.Path;
1313
import org.apache.spark.util.SerializableConfiguration;
1414

15-
import java.io.IOException;
16-
import java.io.InputStream;
17-
import java.io.Serializable;
18-
import java.io.UnsupportedEncodingException;
15+
import java.io.*;
1916
import java.net.URLDecoder;
2017
import java.nio.charset.Charset;
2118
import java.nio.charset.UnsupportedCharsetException;
@@ -49,11 +46,28 @@ public boolean isGzip() {
4946
}
5047

5148
public InputStream openFile(String filePath) {
49+
return openFile(filePath, false);
50+
}
51+
52+
public InputStream openFile(String filePath, boolean guessIfGzipped) {
5253
try {
5354
Path hadoopPath = new Path(filePath);
5455
FileSystem fileSystem = hadoopPath.getFileSystem(hadoopConfiguration.value());
5556
FSDataInputStream inputStream = fileSystem.open(hadoopPath);
56-
return this.isGzip() ? new GZIPInputStream(inputStream) : inputStream;
57+
return isFileGzipped(filePath, guessIfGzipped) ? new GZIPInputStream(inputStream) : inputStream;
58+
} catch (Exception e) {
59+
throw new ConnectorException(String.format(
60+
"Unable to read file at %s; cause: %s", filePath, e.getMessage()), e);
61+
}
62+
}
63+
64+
BufferedReader openFileReader(String filePath, boolean guessIfGzipped) {
65+
try {
66+
InputStream inputStream = openFile(filePath, guessIfGzipped);
67+
InputStreamReader inputStreamReader = this.encoding != null ?
68+
new InputStreamReader(inputStream, encoding) :
69+
new InputStreamReader(inputStream);
70+
return new BufferedReader(inputStreamReader);
5771
} catch (Exception e) {
5872
throw new ConnectorException(String.format(
5973
"Unable to read file at %s; cause: %s", filePath, e.getMessage()), e);
@@ -82,4 +96,11 @@ public String decodeFilePath(String path) {
8296
return path;
8397
}
8498
}
99+
100+
private boolean isFileGzipped(String filePath, boolean guessIfGzipped) {
101+
if (this.isGzip()) {
102+
return true;
103+
}
104+
return guessIfGzipped && filePath != null && (filePath.endsWith(".gz") || filePath.endsWith(".gzip"));
105+
}
85106
}

src/main/java/com/marklogic/spark/reader/file/FilePartitionReaderFactory.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ public PartitionReader<InternalRow> createReader(InputPartition partition) {
3535
return new MlcpArchiveFileReader(filePartition, fileContext);
3636
} else if ("archive".equalsIgnoreCase(fileType)) {
3737
return new ArchiveFileReader(filePartition, fileContext);
38+
} else if ("json_lines".equalsIgnoreCase(fileType)) {
39+
return new JsonLinesFileReader(filePartition, fileContext);
3840
} else if (fileContext.hasOption(Options.READ_AGGREGATES_XML_ELEMENT)) {
3941
return fileContext.isZip() ?
4042
new ZipAggregateXmlFileReader(filePartition, fileContext) :
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
/*
2+
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
3+
*/
4+
package com.marklogic.spark.reader.file;
5+
6+
import org.apache.commons.io.IOUtils;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.connector.read.PartitionReader;
import org.apache.spark.unsafe.types.ByteArray;
import org.apache.spark.unsafe.types.UTF8String;

import java.io.BufferedReader;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
15+
16+
class JsonLinesFileReader implements PartitionReader<InternalRow> {
17+
18+
private final FilePartition filePartition;
19+
private final FileContext fileContext;
20+
21+
private BufferedReader bufferedReader;
22+
private Iterator<String> bufferedLines;
23+
24+
private InternalRow nextRowToReturn;
25+
private String currentFilePath;
26+
private int lineCounter;
27+
private int filePathIndex;
28+
29+
JsonLinesFileReader(FilePartition filePartition, FileContext fileContext) {
30+
this.filePartition = filePartition;
31+
this.fileContext = fileContext;
32+
}
33+
34+
@Override
35+
public boolean next() {
36+
if (bufferedLines != null && bufferedLines.hasNext()) {
37+
this.nextRowToReturn = createRowFromNextJsonLine();
38+
return true;
39+
}
40+
41+
if (bufferedReader != null) {
42+
IOUtils.closeQuietly(bufferedReader);
43+
}
44+
45+
if (filePathIndex >= filePartition.getPaths().size()) {
46+
return false;
47+
}
48+
49+
openNextFile();
50+
return next();
51+
}
52+
53+
@Override
54+
public InternalRow get() {
55+
return nextRowToReturn;
56+
}
57+
58+
@Override
59+
public void close() {
60+
IOUtils.closeQuietly(bufferedReader);
61+
}
62+
63+
private void openNextFile() {
64+
final String originalFilePath = filePartition.getPaths().get(filePathIndex);
65+
this.currentFilePath = fileContext.decodeFilePath(originalFilePath);
66+
this.lineCounter = 1;
67+
this.filePathIndex++;
68+
// To mimic the behavior of the Spark JSON data source, this will guess if the file is gzipped based on its
69+
// file extension. This allows for .gz/.gzip files to be supported without the user having to specify the
70+
// compression option, which is the same behavior as Spark JSON provides.
71+
this.bufferedReader = fileContext.openFileReader(currentFilePath, true);
72+
this.bufferedLines = bufferedReader.lines().iterator();
73+
}
74+
75+
private InternalRow createRowFromNextJsonLine() {
76+
String line = bufferedLines.next();
77+
String uri = String.format("%s-%d.json", UTF8String.fromString(currentFilePath), lineCounter);
78+
lineCounter++;
79+
return new GenericInternalRow(new Object[]{
80+
UTF8String.fromString(uri),
81+
ByteArray.concat(line.getBytes()),
82+
null, null, null, null, null, null
83+
});
84+
}
85+
}
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
/*
2+
* Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
3+
*/
4+
package com.marklogic.spark.reader.file;
5+
6+
import com.fasterxml.jackson.databind.JsonNode;
7+
import com.marklogic.spark.AbstractIntegrationTest;
8+
import com.marklogic.spark.Options;
9+
import org.apache.spark.sql.Dataset;
10+
import org.apache.spark.sql.Row;
11+
import org.apache.spark.sql.SaveMode;
12+
import org.junit.jupiter.api.Test;
13+
14+
import static org.junit.jupiter.api.Assertions.assertEquals;
15+
import static org.junit.jupiter.api.Assertions.assertFalse;
16+
17+
/**
 * Integration tests for the "json_lines" file reader: each reads a JSON Lines file via the
 * connector and writes the resulting rows to MarkLogic, then verifies the written documents.
 */
class ReadJsonLinesFilesTest extends AbstractIntegrationTest {

    @Test
    void test() {
        Dataset<Row> dataset = newSparkSession().read().format(CONNECTOR_IDENTIFIER)
            .option(Options.READ_FILES_TYPE, "json_lines")
            .load("src/test/resources/json-lines/nested-objects.txt");

        assertEquals(2, dataset.count(), "Should have one row for each line in the file.");

        dataset.write().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
            .option(Options.WRITE_COLLECTIONS, "json-lines")
            // Strip the local directory prefix so URIs are just "/<file>-<line>.json".
            .option(Options.WRITE_URI_REPLACE, ".*json-lines,''")
            .mode(SaveMode.Append)
            .save();

        assertCollectionSize("json-lines", 2);

        // URIs are expected to be "<file path>-<1-based line number>.json".
        JsonNode doc = readJsonDocument("/nested-objects.txt-1.json");
        assertEquals(1, doc.get("id").asInt());
        assertEquals("blue", doc.at("/data/color").asText());
        assertEquals("world", doc.get("hello").asText());

        doc = readJsonDocument("/nested-objects.txt-2.json");
        assertEquals(2, doc.get("id").asInt());
        assertFalse(doc.has("hello"));
    }

    @Test
    void withUriTemplate() {
        // Verifies that a URI template can draw values from each line's own JSON content.
        newSparkSession().read().format(CONNECTOR_IDENTIFIER)
            .option(Options.READ_FILES_TYPE, "json_lines")
            .load("src/test/resources/json-lines/nested-objects.txt")
            .write().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
            .option(Options.WRITE_COLLECTIONS, "json-lines")
            .option(Options.WRITE_URI_TEMPLATE, "/a/{id}.json")
            .mode(SaveMode.Append)
            .save();

        assertCollectionSize("json-lines", 2);

        JsonNode doc = readJsonDocument("/a/1.json");
        assertEquals(1, doc.get("id").asInt());

        doc = readJsonDocument("/a/2.json");
        assertEquals(2, doc.get("id").asInt());
    }

    @Test
    void encoding() {
        // The fixture file is ISO-8859-1 encoded; the encoding option should decode it correctly.
        newSparkSession().read().format(CONNECTOR_IDENTIFIER)
            .option(Options.READ_FILES_TYPE, "json_lines")
            .option(Options.READ_FILES_ENCODING, "ISO-8859-1")
            .load("src/test/resources/json-lines/objects-iso-8859-1.txt")
            .write().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
            .option(Options.WRITE_COLLECTIONS, "json-lines")
            .option(Options.WRITE_URI_REPLACE, ".*json-lines,''")
            .mode(SaveMode.Append)
            .save();

        assertCollectionSize("json-lines", 2);

        JsonNode doc = readJsonDocument("/objects-iso-8859-1.txt-1.json");
        assertEquals("Istituto di Anatomia e Istologia Patologica, Università di Ferrara, Italy",
            doc.get("text").asText(), "Verifying that the encoded text is correctly read and written to MarkLogic.");
    }

    @Test
    void gzip() {
        // Explicit gzip compression option.
        newSparkSession().read().format(CONNECTOR_IDENTIFIER)
            .option(Options.READ_FILES_TYPE, "json_lines")
            .option(Options.READ_FILES_COMPRESSION, "gzip")
            .load("src/test/resources/json-lines/nested-objects.txt.gz")
            .write().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
            .option(Options.WRITE_COLLECTIONS, "json-lines")
            .option(Options.WRITE_URI_REPLACE, ".*json-lines,''")
            .mode(SaveMode.Append)
            .save();

        assertCollectionSize("json-lines", 2);
    }

    @Test
    void gzipWithoutCompressionOption() {
        // No compression option: the reader should guess gzip from the .gz file extension.
        newSparkSession().read().format(CONNECTOR_IDENTIFIER)
            .option(Options.READ_FILES_TYPE, "json_lines")
            .load("src/test/resources/json-lines/nested-objects.txt.gz")
            .write().format(CONNECTOR_IDENTIFIER)
            .option(Options.CLIENT_URI, makeClientUri())
            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
            .option(Options.WRITE_COLLECTIONS, "json-lines")
            .option(Options.WRITE_URI_REPLACE, ".*json-lines,''")
            .mode(SaveMode.Append)
            .save();

        assertCollectionSize("json-lines", 2);
    }
}
126 Bytes
Binary file not shown.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
{"text": "Istituto di Anatomia e Istologia Patologica, Universit� di Ferrara, Italy"}
2+
{"text": "This doesn't require encoding."}

0 commit comments

Comments
 (0)