
Commit d08b0e1

Merge pull request #353 from marklogic/feature/row-converter-fix
Refactored RowConverter so that an Iterator is always returned
2 parents: d4751c1 + c82e851

File tree

7 files changed: +73 additions, -57 deletions

src/main/java/com/marklogic/spark/writer/ArbitraryRowConverter.java
src/main/java/com/marklogic/spark/writer/DocumentRowConverter.java
src/main/java/com/marklogic/spark/writer/FileRowConverter.java
src/main/java/com/marklogic/spark/writer/RowConverter.java
src/main/java/com/marklogic/spark/writer/WriteBatcherDataWriter.java
src/main/java/com/marklogic/spark/writer/rdf/RdfRowConverter.java
src/test/java/com/marklogic/spark/AbstractIntegrationTest.java
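The central change, condensed from the per-file diffs below: every RowConverter implementation now returns an Iterator from getRemainingDocumentInputs() instead of a List, so the writer consumes "pending" inputs the same way it consumes the inputs produced by convertRow(). A before/after sketch of the interface method, assembled from the diffs rather than copied verbatim from the file:

    // Before: pending document inputs were materialized into a List
    List<DocBuilder.DocumentInputs> getRemainingDocumentInputs();

    // After: pending document inputs are exposed as an Iterator, matching convertRow(InternalRow)
    Iterator<DocBuilder.DocumentInputs> getRemainingDocumentInputs();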

src/main/java/com/marklogic/spark/writer/ArbitraryRowConverter.java

Lines changed: 2 additions & 4 deletions

@@ -19,9 +19,7 @@
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;

-import java.util.ArrayList;
 import java.util.Iterator;
-import java.util.List;
 import java.util.UUID;
 import java.util.stream.Stream;

@@ -108,8 +106,8 @@ else if (deserializedJson != null) {
     }

     @Override
-    public List<DocBuilder.DocumentInputs> getRemainingDocumentInputs() {
-        return new ArrayList<>();
+    public Iterator<DocBuilder.DocumentInputs> getRemainingDocumentInputs() {
+        return Stream.<DocBuilder.DocumentInputs>empty().iterator();
     }

     /**
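The non-RDF converters (this one, DocumentRowConverter, and FileRowConverter below) have no pending state, so each now returns an empty Iterator. Not part of the commit, but a minimal, self-contained illustration of the pattern they use; the class name is made up, and java.util.Collections.emptyIterator() would be an equivalent JDK shortcut:

    import java.util.Collections;
    import java.util.Iterator;
    import java.util.stream.Stream;

    public class EmptyIteratorExample {
        public static void main(String[] args) {
            // Pattern used by the converters: the iterator of an empty stream.
            Iterator<String> viaStream = Stream.<String>empty().iterator();
            // Equivalent JDK shortcut.
            Iterator<String> viaCollections = Collections.emptyIterator();
            System.out.println(viaStream.hasNext());       // false
            System.out.println(viaCollections.hasNext());  // false
        }
    }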

src/main/java/com/marklogic/spark/writer/DocumentRowConverter.java

Lines changed: 2 additions & 4 deletions

@@ -24,9 +24,7 @@
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.ObjectInputStream;
-import java.util.ArrayList;
 import java.util.Iterator;
-import java.util.List;
 import java.util.stream.Stream;

 /**
@@ -60,8 +58,8 @@ public Iterator<DocBuilder.DocumentInputs> convertRow(InternalRow row) {
     }

     @Override
-    public List<DocBuilder.DocumentInputs> getRemainingDocumentInputs() {
-        return new ArrayList<>();
+    public Iterator<DocBuilder.DocumentInputs> getRemainingDocumentInputs() {
+        return Stream.<DocBuilder.DocumentInputs>empty().iterator();
     }

     private Iterator<DocBuilder.DocumentInputs> readContentFromRow(String uri, InternalRow row) {

src/main/java/com/marklogic/spark/writer/FileRowConverter.java

Lines changed: 2 additions & 4 deletions

@@ -13,9 +13,7 @@
 import org.apache.spark.sql.types.DataTypes;

 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Iterator;
-import java.util.List;
 import java.util.Optional;
 import java.util.stream.Stream;

@@ -44,8 +42,8 @@ public Iterator<DocBuilder.DocumentInputs> convertRow(InternalRow row) {
     }

     @Override
-    public List<DocBuilder.DocumentInputs> getRemainingDocumentInputs() {
-        return new ArrayList<>();
+    public Iterator<DocBuilder.DocumentInputs> getRemainingDocumentInputs() {
+        return Stream.<DocBuilder.DocumentInputs>empty().iterator();
     }

     // Telling Sonar to not tell us to remove this code, since we can't until 3.0.

src/main/java/com/marklogic/spark/writer/RowConverter.java

Lines changed: 1 addition & 2 deletions

@@ -6,7 +6,6 @@
 import org.apache.spark.sql.catalyst.InternalRow;

 import java.util.Iterator;
-import java.util.List;

 /**
  * Strategy interface for how a Spark row is converted into a set of inputs for writing a document to MarkLogic.
@@ -26,5 +25,5 @@ public interface RowConverter {
     *
     * @return
     */
-    List<DocBuilder.DocumentInputs> getRemainingDocumentInputs();
+    Iterator<DocBuilder.DocumentInputs> getRemainingDocumentInputs();
 }

src/main/java/com/marklogic/spark/writer/WriteBatcherDataWriter.java

Lines changed: 51 additions & 40 deletions

@@ -33,7 +33,6 @@
 import java.io.Closeable;
 import java.util.ArrayList;
 import java.util.Iterator;
-import java.util.List;
 import java.util.Set;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicReference;
@@ -99,9 +98,49 @@ class WriteBatcherDataWriter implements DataWriter<InternalRow> {

     @Override
     public void write(InternalRow row) {
+        throwWriteFailureIfExists();
+        buildAndWriteDocuments(rowConverter.convertRow(row));
+    }
+
+    @Override
+    public WriterCommitMessage commit() {
+        // The RDF row converter may have "pending" rows as it has not yet reached the max number of triples to include
+        // in a document. Those are retrieved here.
+        buildAndWriteDocuments(rowConverter.getRemainingDocumentInputs());
+
+        this.writeBatcher.flushAndWait();
+
         throwWriteFailureIfExists();

-        Iterator<DocBuilder.DocumentInputs> iterator = rowConverter.convertRow(row);
+        Set<String> graphs = getGraphNames();
+        return new CommitMessage(successItemCount.get(), failedItemCount.get(), graphs);
+    }
+
+    @Override
+    public void abort() {
+        Util.MAIN_LOGGER.warn("Abort called.");
+        stopJobAndRelease();
+        closeArchiveWriter();
+        Util.MAIN_LOGGER.info("Finished abort.");
+    }
+
+    @Override
+    public void close() {
+        if (logger.isDebugEnabled()) {
+            logger.debug("Close called.");
+        }
+        stopJobAndRelease();
+        closeArchiveWriter();
+    }
+
+    /**
+     * Processes the document inputs returned by the RowConverter for a single row. A row can return multiple instances
+     * of document inputs. Each instance is run through the document processor if it's not null, which can produce
+     * additional documents.
+     *
+     * @param iterator
+     */
+    private void buildAndWriteDocuments(Iterator<DocBuilder.DocumentInputs> iterator) {
         try {
             iterator.forEachRemaining(documentInputs -> {
                 DocumentWriteOperation sourceDocument = this.docBuilder.build(documentInputs);
@@ -128,44 +167,16 @@ private void writeDocument(DocumentWriteOperation writeOp) {
         }
     }

-    @Override
-    public WriterCommitMessage commit() {
-        List<DocBuilder.DocumentInputs> documentInputs = rowConverter.getRemainingDocumentInputs();
-        if (documentInputs != null) {
-            documentInputs.forEach(inputs -> {
-                DocumentWriteOperation writeOp = this.docBuilder.build(inputs);
-                this.writeBatcher.add(writeOp);
-            });
-        }
-        this.writeBatcher.flushAndWait();
-
-        throwWriteFailureIfExists();
-
-        // Need this hack so that the complete set of graphs can be reported back to MarkLogicWrite, which handles
-        // creating the graphs after all documents have been written.
-        Set<String> graphs = null;
-        if (this.rowConverter instanceof RdfRowConverter) {
-            graphs = ((RdfRowConverter) rowConverter).getGraphs();
-        }
-
-        return new CommitMessage(successItemCount.get(), failedItemCount.get(), graphs);
-    }
-
-    @Override
-    public void abort() {
-        Util.MAIN_LOGGER.warn("Abort called.");
-        stopJobAndRelease();
-        closeArchiveWriter();
-        Util.MAIN_LOGGER.info("Finished abort.");
-    }
-
-    @Override
-    public void close() {
-        if (logger.isDebugEnabled()) {
-            logger.debug("Close called.");
-        }
-        stopJobAndRelease();
-        closeArchiveWriter();
+    /**
+     * This provides a mechanism for capturing the list of graph names detected while processing RDF rows. These need
+     * to be sent back to MarkLogicWrite, where each graph is written to MarkLogic as a graph document.
+     *
+     * @return
+     */
+    private Set<String> getGraphNames() {
+        return this.rowConverter instanceof RdfRowConverter ?
+            ((RdfRowConverter) rowConverter).getGraphs() :
+            null;
     }

     private void addBatchListeners(WriteBatcher writeBatcher) {
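With both write() and commit() delegating to buildAndWriteDocuments(Iterator), per-row inputs and end-of-task pending inputs flow through a single code path. Below is a stripped-down, compilable sketch of that shape only; the type and method names (Converter, SimplifiedWriter, buildAndWrite) are hypothetical stand-ins, and the real class additionally wires in a DocBuilder, WriteBatcher, failure tracking, and graph reporting:

    import java.util.Iterator;

    // Hypothetical stand-in for RowConverter and its document-inputs type.
    interface Converter<R, D> {
        Iterator<D> convertRow(R row);
        Iterator<D> getRemainingDocumentInputs();
    }

    class SimplifiedWriter<R, D> {
        private final Converter<R, D> converter;

        SimplifiedWriter(Converter<R, D> converter) {
            this.converter = converter;
        }

        void write(R row) {
            // Per-row inputs and pending inputs share one consumption path.
            buildAndWrite(converter.convertRow(row));
        }

        void commit() {
            // Converters with buffered state (e.g. the RDF converter) emit it here.
            buildAndWrite(converter.getRemainingDocumentInputs());
        }

        private void buildAndWrite(Iterator<D> inputs) {
            // In the real writer: docBuilder.build(input) followed by writeBatcher.add(...).
            inputs.forEachRemaining(input -> System.out.println("writing " + input));
        }
    }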

src/main/java/com/marklogic/spark/writer/rdf/RdfRowConverter.java

Lines changed: 2 additions & 3 deletions

@@ -13,7 +13,6 @@
 import org.slf4j.LoggerFactory;

 import java.util.*;
-import java.util.stream.Collectors;
 import java.util.stream.Stream;

 /**
@@ -87,10 +86,10 @@ public Iterator<DocBuilder.DocumentInputs> convertRow(InternalRow row) {
     * @return
     */
    @Override
-    public List<DocBuilder.DocumentInputs> getRemainingDocumentInputs() {
+    public Iterator<DocBuilder.DocumentInputs> getRemainingDocumentInputs() {
        return this.triplesDocuments.values().stream()
            .map(TriplesDocument::buildDocument)
-            .collect(Collectors.toList());
+            .iterator();
    }

    /**
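One effect of swapping collect(Collectors.toList()) for iterator() here is that Stream.iterator() is a terminal operation whose upstream steps run lazily as the iterator is consumed, so the pending triples documents should be built one at a time as the writer drains them rather than being materialized into a List up front. A small self-contained demonstration of that JDK behavior, unrelated to the MarkLogic types (class name is illustrative):

    import java.util.Iterator;
    import java.util.stream.Stream;

    public class LazyStreamIteratorExample {
        public static void main(String[] args) {
            Iterator<String> it = Stream.of("a", "b", "c")
                .map(value -> {
                    System.out.println("mapping " + value); // runs only while the iterator is consumed
                    return value.toUpperCase();
                })
                .iterator();

            System.out.println("iterator created; nothing mapped yet");
            it.forEachRemaining(System.out::println);
        }
    }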

src/test/java/com/marklogic/spark/AbstractIntegrationTest.java

Lines changed: 13 additions & 0 deletions

@@ -58,6 +58,19 @@ public void closeSparkSession() {
         if (sparkSession != null) {
             sparkSession.close();
         }
+        smallDelayUntilNextTest();
+    }
+
+    // Tell Sonar not to worry about this for now.
+    @SuppressWarnings({"java:S2925"})
+    private void smallDelayUntilNextTest() {
+        // Hopefully a temporary hack to see if we get fewer random failures on Jenkins due to connectivity issues that
+        // are likely due to Docker restarting MarkLogic due to insufficient memory.
+        try {
+            Thread.sleep(100);
+        } catch (InterruptedException e) {
+            // No need to handle.
+        }
     }

     @Override
