
Commit ea415a2

Merge pull request #307 from marklogic/feature/closeable-fix

MLE-17142 Ensuring input streams are closed

2 parents: 7338f54 + 21d880a

File tree: 10 files changed, +197 −64 lines changed


CONTRIBUTING.md

Lines changed: 4 additions & 0 deletions

```diff
@@ -83,6 +83,10 @@ you've introduced on the feature branch you're working on. You can then click on
 Note that if you only need results on code smells and vulnerabilities, you can repeatedly run `./gradlew sonar`
 without having to re-run the tests.
 
+You can also force Gradle to run `sonar` if any tests fail:
+
+    ./gradlew clean test sonar --continue
+
 ## Accessing MarkLogic logs in Grafana
 
 This project's `docker-compose-3nodes.yaml` file includes
```
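Gradle's `--continue` flag keeps executing as many tasks as possible after a failure instead of stopping at the first failed task, which is why `sonar` still runs even when some tests fail.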

src/main/java/com/marklogic/spark/writer/DocumentRowConverter.java

Lines changed: 6 additions & 10 deletions

```diff
@@ -15,6 +15,8 @@
 import com.marklogic.spark.reader.document.DocumentRowSchema;
 import com.marklogic.spark.reader.file.*;
 import com.marklogic.spark.writer.file.ArchiveFileIterator;
+import com.marklogic.spark.writer.file.FileIterator;
+import com.marklogic.spark.writer.file.GzipFileIterator;
 import com.marklogic.spark.writer.file.ZipFileIterator;
 import org.apache.spark.sql.catalyst.InternalRow;
 
@@ -114,12 +116,12 @@ private Iterator<DocBuilder.DocumentInputs> streamContentFromFile(String filePat
 
     private Iterator<DocBuilder.DocumentInputs> buildIteratorForGenericFile(InternalRow row, String filePath, FileContext fileContext) {
         final String decodedPath = fileContext.decodeFilePath(filePath);
-        InputStreamHandle streamHandle = new InputStreamHandle(fileContext.openFile(decodedPath));
+        InputStreamHandle contentHandle = new InputStreamHandle(fileContext.openFile(decodedPath));
         if (this.documentFormat != null) {
-            streamHandle.withFormat(this.documentFormat);
+            contentHandle.withFormat(this.documentFormat);
         }
         DocumentMetadataHandle metadata = DocumentRowSchema.makeDocumentMetadata(row);
-        return Stream.of(new DocBuilder.DocumentInputs(filePath, streamHandle, null, metadata)).iterator();
+        return new FileIterator(contentHandle, new DocBuilder.DocumentInputs(filePath, contentHandle, null, metadata));
     }
 
     private Iterator<DocBuilder.DocumentInputs> buildIteratorForArchiveFile(String filePath, FileContext fileContext) {
@@ -138,12 +140,6 @@ private Iterator<DocBuilder.DocumentInputs> buildIteratorForZipFile(String fileP
 
     private Iterator<DocBuilder.DocumentInputs> buildIteratorForGzipFile(String filePath, FileContext fileContext) {
         GzipFileReader reader = new GzipFileReader(new FilePartition(filePath), fileContext, StreamingMode.STREAM_DURING_WRITER_PHASE);
-        reader.next();
-        String uri = reader.get().getString(0);
-        InputStreamHandle contentHandle = reader.getStreamingContentHandle();
-        if (this.documentFormat != null) {
-            contentHandle.withFormat(this.documentFormat);
-        }
-        return Stream.of(new DocBuilder.DocumentInputs(uri, contentHandle, null, null)).iterator();
+        return new GzipFileIterator(reader, this.documentFormat);
     }
 }
```
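Both methods now return iterators that also implement `Closeable` (the new `FileIterator` and `GzipFileIterator` classes later in this commit), giving the writer a hook to close the underlying `InputStream` once the document is written; the previous `Stream.of(...).iterator()` approach provided no such hook.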

src/main/java/com/marklogic/spark/writer/WriteBatcherDataWriter.java

Lines changed: 16 additions & 6 deletions

```diff
@@ -20,6 +20,7 @@
 import com.marklogic.spark.reader.file.TripleRowSchema;
 import com.marklogic.spark.writer.file.ZipFileWriter;
 import com.marklogic.spark.writer.rdf.RdfRowConverter;
+import org.apache.commons.io.IOUtils;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
 import org.apache.spark.sql.connector.write.DataWriter;
@@ -29,6 +30,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.io.Closeable;
 import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
@@ -94,12 +96,20 @@ public void write(InternalRow row) {
         throwWriteFailureIfExists();
 
         Iterator<DocBuilder.DocumentInputs> iterator = rowConverter.convertRow(row);
-        while (iterator.hasNext()) {
-            DocumentWriteOperation writeOp = this.docBuilder.build(iterator.next());
-            if (this.isStreamingFiles) {
-                writeDocumentViaPutOperation(writeOp);
-            } else {
-                this.writeBatcher.add(writeOp);
+        try {
+            while (iterator.hasNext()) {
+                DocumentWriteOperation writeOp = this.docBuilder.build(iterator.next());
+                if (this.isStreamingFiles) {
+                    writeDocumentViaPutOperation(writeOp);
+                } else {
+                    this.writeBatcher.add(writeOp);
+                }
+            }
+        } finally {
+            // This is needed for when files are being streamed into MarkLogic; gives a chance for the file reader to
+            // close the associated InputStream.
+            if (iterator instanceof Closeable) {
+                IOUtils.closeQuietly((Closeable) iterator);
             }
         }
     }
```
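The `try`/`finally` shape above is the heart of the fix. Below is a minimal sketch of the same pattern in isolation; the class and method names are hypothetical and not part of the connector:

```java
import org.apache.commons.io.IOUtils;

import java.io.Closeable;
import java.util.Iterator;
import java.util.function.Consumer;

public final class ClosingIteratorSketch {

    // Drains the iterator, then closes it if it holds an open resource.
    // Because the close happens in a finally block, a failure while
    // processing an element cannot leak the iterator's underlying stream.
    public static <T> void drainAndClose(Iterator<T> iterator, Consumer<T> processor) {
        try {
            while (iterator.hasNext()) {
                processor.accept(iterator.next());
            }
        } finally {
            if (iterator instanceof Closeable) {
                // closeQuietly swallows any IOException thrown on close, so a
                // close-time failure never masks an exception from the try block.
                IOUtils.closeQuietly((Closeable) iterator);
            }
        }
    }

    private ClosingIteratorSketch() {
    }
}
```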

src/main/java/com/marklogic/spark/writer/file/ArchiveFileIterator.java

Lines changed: 8 additions & 1 deletion

```diff
@@ -10,16 +10,18 @@
 import com.marklogic.spark.reader.document.DocumentRowSchema;
 import com.marklogic.spark.reader.file.ArchiveFileReader;
 import com.marklogic.spark.writer.DocBuilder;
+import org.apache.commons.io.IOUtils;
 import org.apache.spark.sql.catalyst.InternalRow;
 
+import java.io.Closeable;
 import java.util.Iterator;
 
 /**
  * Provides an {@code Iterator} interface on top of an {@code ArchiveFileReader}, thereby allowing a
  * {@code DocumentRowConverter} to build sets of document inputs from an archive file without reading any content entry
  * into memory - thus supporting streaming of an archive.
  */
-public class ArchiveFileIterator implements Iterator<DocBuilder.DocumentInputs> {
+public class ArchiveFileIterator implements Iterator<DocBuilder.DocumentInputs>, Closeable {
 
     private final ArchiveFileReader archiveFileReader;
     private final Format documentFormat;
@@ -51,4 +53,9 @@ public DocBuilder.DocumentInputs next() {
         }
         return new DocBuilder.DocumentInputs(uri, contentHandle, null, metadata);
     }
+
+    @Override
+    public void close() {
+        IOUtils.closeQuietly(archiveFileReader);
+    }
 }
```
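Only the `Closeable` declaration and the `close()` method are new here; the iteration logic is unchanged, and closing delegates to the `ArchiveFileReader`, which owns the archive's open stream.
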
src/main/java/com/marklogic/spark/writer/file/FileIterator.java

Lines changed: 42 additions & 0 deletions

```diff
@@ -0,0 +1,42 @@
+/*
+ * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
+ */
+package com.marklogic.spark.writer.file;
+
+import com.marklogic.client.io.InputStreamHandle;
+import com.marklogic.spark.writer.DocBuilder;
+import org.apache.commons.io.IOUtils;
+
+import java.io.Closeable;
+import java.util.Iterator;
+import java.util.stream.Stream;
+
+/**
+ * Exists solely to provide an implementation of {@code Closeable} so that the {@code InputStreamHandle} can be closed
+ * after the corresponding document is written to MarkLogic.
+ */
+public class FileIterator implements Iterator<DocBuilder.DocumentInputs>, Closeable {
+
+    private final InputStreamHandle contentHandle;
+    private final Iterator<DocBuilder.DocumentInputs> iterator;
+
+    public FileIterator(InputStreamHandle contentHandle, DocBuilder.DocumentInputs inputs) {
+        this.contentHandle = contentHandle;
+        this.iterator = Stream.of(inputs).iterator();
+    }
+
+    @Override
+    public boolean hasNext() {
+        return this.iterator.hasNext();
+    }
+
+    @Override
+    public DocBuilder.DocumentInputs next() {
+        return this.iterator.next();
+    }
+
+    @Override
+    public void close() {
+        IOUtils.closeQuietly(contentHandle);
+    }
+}
```
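For illustration only, here is a hedged sketch of driving a `FileIterator` by hand, outside Spark. The `example.json` path and the sketch class are hypothetical; the four-argument `DocumentInputs` constructor call mirrors its usage in `DocumentRowConverter` above:

```java
import com.marklogic.client.io.InputStreamHandle;
import com.marklogic.spark.writer.DocBuilder;
import com.marklogic.spark.writer.file.FileIterator;

import java.io.FileInputStream;
import java.io.IOException;

public class FileIteratorSketch {

    public static void main(String[] args) throws IOException {
        // Hypothetical local file; any InputStream-backed handle works here.
        InputStreamHandle handle = new InputStreamHandle(new FileInputStream("example.json"));
        FileIterator iterator = new FileIterator(handle,
            new DocBuilder.DocumentInputs("/example.json", handle, null, null));
        try {
            while (iterator.hasNext()) {
                DocBuilder.DocumentInputs inputs = iterator.next();
                // A real writer would turn inputs into a DocumentWriteOperation here.
            }
        } finally {
            iterator.close(); // releases the InputStreamHandle and its underlying stream
        }
    }
}
```
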
src/main/java/com/marklogic/spark/writer/file/GzipFileIterator.java

Lines changed: 50 additions & 0 deletions

```diff
@@ -0,0 +1,50 @@
+/*
+ * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
+ */
+package com.marklogic.spark.writer.file;
+
+import com.marklogic.client.io.Format;
+import com.marklogic.client.io.InputStreamHandle;
+import com.marklogic.spark.reader.file.GzipFileReader;
+import com.marklogic.spark.writer.DocBuilder;
+import org.apache.commons.io.IOUtils;
+
+import java.io.Closeable;
+import java.util.Iterator;
+import java.util.stream.Stream;
+
+/**
+ * Exists solely to provide an implementation of {@code Closeable} so that the {@code GzipFileReader} can be closed
+ * after the corresponding document is written to MarkLogic.
+ */
+public class GzipFileIterator implements Iterator<DocBuilder.DocumentInputs>, Closeable {
+
+    private final GzipFileReader gzipFileReader;
+    private Iterator<DocBuilder.DocumentInputs> iterator;
+
+    public GzipFileIterator(GzipFileReader reader, Format documentFormat) {
+        this.gzipFileReader = reader;
+        reader.next();
+        String uri = reader.get().getString(0);
+        InputStreamHandle contentHandle = reader.getStreamingContentHandle();
+        if (documentFormat != null) {
+            contentHandle.withFormat(documentFormat);
+        }
+        this.iterator = Stream.of(new DocBuilder.DocumentInputs(uri, contentHandle, null, null)).iterator();
+    }
+
+    @Override
+    public boolean hasNext() {
+        return this.iterator.hasNext();
+    }
+
+    @Override
+    public DocBuilder.DocumentInputs next() {
+        return this.iterator.next();
+    }
+
+    @Override
+    public void close() {
+        IOUtils.closeQuietly(gzipFileReader);
+    }
+}
```
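The constructor takes over the logic removed from `DocumentRowConverter`: it eagerly advances the reader, extracts the URI, and configures the content handle. Holding on to the `GzipFileReader` is what allows `close()` to release it after the document is written.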

src/main/java/com/marklogic/spark/writer/file/ZipFileIterator.java

Lines changed: 8 additions & 1 deletion

```diff
@@ -8,11 +8,13 @@
 import com.marklogic.spark.Util;
 import com.marklogic.spark.reader.file.ZipFileReader;
 import com.marklogic.spark.writer.DocBuilder;
+import org.apache.commons.crypto.utils.IoUtils;
 import org.apache.spark.sql.catalyst.InternalRow;
 
+import java.io.Closeable;
 import java.util.Iterator;
 
-public class ZipFileIterator implements Iterator<DocBuilder.DocumentInputs> {
+public class ZipFileIterator implements Iterator<DocBuilder.DocumentInputs>, Closeable {
 
     private final ZipFileReader zipFileReader;
     private final Format documentFormat;
@@ -43,4 +45,9 @@ public DocBuilder.DocumentInputs next() {
         }
         return new DocBuilder.DocumentInputs(uri, contentHandle, null, null);
     }
+
+    @Override
+    public void close() {
+        IoUtils.closeQuietly(zipFileReader);
+    }
 }
```
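Unlike its siblings, this class closes through `org.apache.commons.crypto.utils.IoUtils` rather than `org.apache.commons.io.IOUtils`, possibly an accidental import; both provide a `closeQuietly(Closeable)` that ignores close-time errors, so the effect is the same.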

src/test/java/com/marklogic/spark/reader/file/ReadGzipFilesTest.java

Lines changed: 0 additions & 45 deletions

```diff
@@ -3,19 +3,14 @@
  */
 package com.marklogic.spark.reader.file;
 
-import com.fasterxml.jackson.databind.JsonNode;
-import com.marklogic.junit5.XmlNode;
 import com.marklogic.spark.AbstractIntegrationTest;
 import com.marklogic.spark.ConnectorException;
 import com.marklogic.spark.Options;
 import org.apache.spark.SparkException;
 import org.apache.spark.sql.Dataset;
 import org.apache.spark.sql.Row;
-import org.apache.spark.sql.SaveMode;
 import org.junit.jupiter.api.Test;
 
-import java.io.ByteArrayInputStream;
-import java.io.ObjectInputStream;
 import java.util.List;
 
 import static org.junit.jupiter.api.Assertions.*;
@@ -78,46 +73,6 @@ void dontAbortOnFailure() {
             "error for the non-gzipped mixed-files.zip file being logged as a warning but not causing a failure.");
     }
 
-    @Test
-    void streamThreeGZIPFiles() throws Exception {
-        Dataset<Row> dataset = newSparkSession().read()
-            .format(CONNECTOR_IDENTIFIER)
-            .option(Options.READ_FILES_COMPRESSION, "gzip")
-            .option("recursiveFileLookup", "true")
-            .option(Options.STREAM_FILES, true)
-            .load("src/test/resources/gzip-files");
-
-        List<Row> rows = dataset.collectAsList();
-        assertEquals(3, rows.size());
-        for (Row row : rows) {
-            assertFalse(row.isNullAt(0), "The URI column should be populated.");
-            byte[] content = (byte[]) row.get(1);
-            try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(content))) {
-                FileContext fileContext = (FileContext) ois.readObject();
-                assertNotNull(fileContext);
-            }
-        }
-
-        // Write the streaming files to MarkLogic.
-        dataset.write().format(CONNECTOR_IDENTIFIER)
-            .option(Options.STREAM_FILES, true)
-            .option(Options.CLIENT_URI, makeClientUri())
-            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
-            .option(Options.WRITE_COLLECTIONS, "streamed-files")
-            .option(Options.WRITE_URI_REPLACE, ".*gzip-files,'/gzip-files'")
-            .mode(SaveMode.Append)
-            .save();
-
-        assertCollectionSize("streamed-files", 3);
-        XmlNode doc = readXmlDocument("/gzip-files/hello.xml");
-        doc.assertElementValue("/hello", "world");
-
-        // Because each streamed file has to be sent via a PUT request, and the PUT endpoint does not allow spaces -
-        // see MLE-17088 - the URI will be encoded.
-        JsonNode node = readJsonDocument("/gzip-files/level1/level2/hello%20world.json");
-        assertEquals("world", node.get("hello").asText());
-    }
-
     private void verifyRow(Row row, String expectedUriSuffix, String expectedContent) {
         String uri = row.getString(0);
         assertTrue(uri.endsWith(expectedUriSuffix), "Unexpected URI: " + uri);
```
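The `streamThreeGZIPFiles` test removed above is not lost; it moves essentially verbatim into the new `StreamGzipFilesTest` class shown below.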

src/test/java/com/marklogic/spark/reader/file/ReadGenericFilesStreamingTest.java renamed to src/test/java/com/marklogic/spark/reader/file/StreamGenericFilesTest.java

Lines changed: 1 addition & 1 deletion

```diff
@@ -22,7 +22,7 @@
  * into memory by postponing reading of the file until the writer phase, where it can then be streamed from disk into
  * MarkLogic.
  */
-class ReadGenericFilesStreamingTest extends AbstractIntegrationTest {
+class StreamGenericFilesTest extends AbstractIntegrationTest {
 
     @Test
     void stream() throws Exception {
```
src/test/java/com/marklogic/spark/reader/file/StreamGzipFilesTest.java

Lines changed: 62 additions & 0 deletions

```diff
@@ -0,0 +1,62 @@
+/*
+ * Copyright © 2024 MarkLogic Corporation. All Rights Reserved.
+ */
+package com.marklogic.spark.reader.file;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.marklogic.junit5.XmlNode;
+import com.marklogic.spark.AbstractIntegrationTest;
+import com.marklogic.spark.Options;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.apache.spark.sql.SaveMode;
+import org.junit.jupiter.api.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.ObjectInputStream;
+import java.util.List;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+class StreamGzipFilesTest extends AbstractIntegrationTest {
+
+    @Test
+    void streamThreeGZIPFiles() throws Exception {
+        Dataset<Row> dataset = newSparkSession().read()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.READ_FILES_COMPRESSION, "gzip")
+            .option("recursiveFileLookup", "true")
+            .option(Options.STREAM_FILES, true)
+            .load("src/test/resources/gzip-files");
+
+        List<Row> rows = dataset.collectAsList();
+        assertEquals(3, rows.size());
+        for (Row row : rows) {
+            assertFalse(row.isNullAt(0), "The URI column should be populated.");
+            byte[] content = (byte[]) row.get(1);
+            try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(content))) {
+                FileContext fileContext = (FileContext) ois.readObject();
+                assertNotNull(fileContext);
+            }
+        }
+
+        // Write the streaming files to MarkLogic.
+        dataset.write().format(CONNECTOR_IDENTIFIER)
+            .option(Options.STREAM_FILES, true)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.WRITE_PERMISSIONS, DEFAULT_PERMISSIONS)
+            .option(Options.WRITE_COLLECTIONS, "streamed-files")
+            .option(Options.WRITE_URI_REPLACE, ".*gzip-files,'/gzip-files'")
+            .mode(SaveMode.Append)
+            .save();
+
+        assertCollectionSize("streamed-files", 3);
+        XmlNode doc = readXmlDocument("/gzip-files/hello.xml");
+        doc.assertElementValue("/hello", "world");
+
+        // Because each streamed file has to be sent via a PUT request, and the PUT endpoint does not allow spaces -
+        // see MLE-17088 - the URI will be encoded.
+        JsonNode node = readJsonDocument("/gzip-files/level1/level2/hello%20world.json");
+        assertEquals("world", node.get("hello").asText());
+    }
+}
```
