
Commit 8379ebe

Merge pull request #158 from marklogic/feature/12257-push-limit
MLE-12257 Pushing down limit
2 parents b4ec55b + c5bdae5 · commit 8379ebe

6 files changed: +97 −21 lines changed

src/main/java/com/marklogic/spark/reader/document/DocumentBatch.java

Lines changed: 3 additions & 4 deletions

@@ -8,18 +8,17 @@
 import org.apache.spark.sql.connector.read.Batch;
 import org.apache.spark.sql.connector.read.InputPartition;
 import org.apache.spark.sql.connector.read.PartitionReaderFactory;
-import org.apache.spark.sql.util.CaseInsensitiveStringMap;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
 class DocumentBatch implements Batch {
 
     private static final Logger logger = LoggerFactory.getLogger(DocumentBatch.class);
 
-    private DocumentContext context;
+    private final DocumentContext context;
 
-    DocumentBatch(CaseInsensitiveStringMap options) {
-        this.context = new DocumentContext(options);
+    DocumentBatch(DocumentContext context) {
+        this.context = context;
     }
 
     /**

src/main/java/com/marklogic/spark/reader/document/DocumentContext.java

Lines changed: 10 additions & 0 deletions

@@ -13,6 +13,8 @@
 
 class DocumentContext extends ContextSupport {
 
+    private Integer limit;
+
     DocumentContext(CaseInsensitiveStringMap options) {
         super(options.asCaseSensitiveMap());
     }
@@ -72,4 +74,12 @@ int getPartitionsPerForest() {
         int defaultPartitionsPerForest = 4;
         return (int) getNumericOption(Options.READ_DOCUMENTS_PARTITIONS_PER_FOREST, defaultPartitionsPerForest, 1);
     }
+
+    void setLimit(Integer limit) {
+        this.limit = limit;
+    }
+
+    Integer getLimit() {
+        return limit;
+    }
 }

src/main/java/com/marklogic/spark/reader/document/DocumentScan.java

Lines changed: 4 additions & 5 deletions

@@ -3,14 +3,13 @@
 import org.apache.spark.sql.connector.read.Batch;
 import org.apache.spark.sql.connector.read.Scan;
 import org.apache.spark.sql.types.StructType;
-import org.apache.spark.sql.util.CaseInsensitiveStringMap;
 
 class DocumentScan implements Scan {
 
-    private CaseInsensitiveStringMap options;
+    private final DocumentContext context;
 
-    DocumentScan(CaseInsensitiveStringMap options) {
-        this.options = options;
+    DocumentScan(DocumentContext context) {
+        this.context = context;
     }
 
     @Override
@@ -20,6 +19,6 @@ public StructType readSchema() {
 
     @Override
     public Batch toBatch() {
-        return new DocumentBatch(options);
+        return new DocumentBatch(context);
     }
 }

src/main/java/com/marklogic/spark/reader/document/DocumentScanBuilder.java

Lines changed: 19 additions & 4 deletions

@@ -2,18 +2,33 @@
 
 import org.apache.spark.sql.connector.read.Scan;
 import org.apache.spark.sql.connector.read.ScanBuilder;
+import org.apache.spark.sql.connector.read.SupportsPushDownLimit;
 import org.apache.spark.sql.util.CaseInsensitiveStringMap;
 
-class DocumentScanBuilder implements ScanBuilder {
+class DocumentScanBuilder implements ScanBuilder, SupportsPushDownLimit {
 
-    private CaseInsensitiveStringMap options;
+    private final DocumentContext context;
 
     DocumentScanBuilder(CaseInsensitiveStringMap options) {
-        this.options = options;
+        this.context = new DocumentContext(options);
     }
 
     @Override
     public Scan build() {
-        return new DocumentScan(options);
+        return new DocumentScan(context);
+    }
+
+    @Override
+    public boolean pushLimit(int limit) {
+        this.context.setLimit(limit);
+        return true;
+    }
+
+    @Override
+    public boolean isPartiallyPushed() {
+        // A partition reader can only ensure that it doesn't exceed the limit. In a worst case scenario, every reader
+        // will return "limit" rows. So must return true here to ensure that Spark reduces the dataset to the
+        // appropriate limit.
+        return true;
    }
 }
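
Note: pushLimit and isPartiallyPushed come from Spark's SupportsPushDownLimit interface. When a query ends in a LIMIT, Spark's optimizer calls pushLimit on the ScanBuilder before build(); because isPartiallyPushed() returns true, Spark still applies the final LIMIT to the combined output of all partitions. The following is a minimal sketch of a user-side query that would exercise this path; the connector short name, option keys, and connection details are illustrative assumptions, not values taken from this commit.

// Hedged sketch: a plain Spark job whose .limit(2) call triggers DocumentScanBuilder.pushLimit(2).
// The format name "marklogic" and the option keys/values below are assumptions for illustration.
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class LimitPushDownExample {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .master("local[*]")
            .getOrCreate();

        Dataset<Row> docs = spark.read()
            .format("marklogic")                                                   // assumed connector short name
            .option("spark.marklogic.client.uri", "user:password@localhost:8000")  // assumed option key and value
            .option("spark.marklogic.read.documents.collections", "author")        // assumed option key
            .load()
            .limit(2); // Spark calls pushLimit(2) on the ScanBuilder during planning

        // Each partition reader returns at most 2 documents; Spark then applies the final LIMIT,
        // so this prints a number no greater than 2.
        System.out.println(docs.count());

        spark.stop();
    }
}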

src/main/java/com/marklogic/spark/reader/document/ForestReader.java

Lines changed: 17 additions & 8 deletions

@@ -42,31 +42,32 @@ class ForestReader implements PartitionReader<InternalRow> {
     private final StructuredQueryBuilder queryBuilder;
     private final Set<DocumentManager.Metadata> requestedMetadata;
     private final boolean contentWasRequested;
+    private final Integer limit;
 
     // Only used for logging.
     private final ForestPartition forestPartition;
     private long startTime;
 
     private DocumentPage currentDocumentPage;
 
+    // Used for logging and for ensuring a non-null limit is not exceeded.
     private int docCount;
 
-    ForestReader(ForestPartition forestPartition, DocumentContext documentContext) {
+    ForestReader(ForestPartition forestPartition, DocumentContext context) {
         if (logger.isDebugEnabled()) {
             logger.debug("Will read from partition: {}", forestPartition);
         }
         this.forestPartition = forestPartition;
+        this.limit = context.getLimit();
 
-        DatabaseClient client = documentContext.connectToMarkLogic();
-
-        SearchQueryDefinition query = documentContext.buildSearchQuery(client);
-        int batchSize = documentContext.getBatchSize();
-        this.uriBatcher = new UriBatcher(client, query, forestPartition, batchSize, false);
+        DatabaseClient client = context.connectToMarkLogic();
+        SearchQueryDefinition query = context.buildSearchQuery(client);
+        this.uriBatcher = new UriBatcher(client, query, forestPartition, context.getBatchSize(), false);
 
         this.documentManager = client.newDocumentManager();
         this.documentManager.setReadTransform(query.getResponseTransform());
-        this.contentWasRequested = documentContext.contentWasRequested();
-        this.requestedMetadata = documentContext.getRequestedMetadata();
+        this.contentWasRequested = context.contentWasRequested();
+        this.requestedMetadata = context.getRequestedMetadata();
         this.documentManager.setMetadataCategories(this.requestedMetadata);
         this.queryBuilder = client.newQueryManager().newStructuredQueryBuilder();
     }
@@ -76,6 +77,13 @@ public boolean next() {
         if (startTime == 0) {
            startTime = System.currentTimeMillis();
        }
+
+        if (limit != null && docCount >= limit) {
+            // No logging here as this block may never be hit, depending on whether Spark first detects that the limit
+            // has been reached.
+            return false;
+        }
+
         if (currentDocumentPage == null || !currentDocumentPage.hasNext()) {
             closeCurrentDocumentPage();
             List<String> uris = getNextBatchOfUris();
@@ -89,6 +97,7 @@ public boolean next() {
             }
             this.currentDocumentPage = readPage(uris);
         }
+
         return currentDocumentPage.hasNext();
     }
 
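
Note: the limit check added to next() only caps what a single ForestReader returns; it cannot coordinate with the readers handling other forest partitions. The standalone sketch below (not connector code; the class name and sample URIs are made up for illustration) shows why the combined output can still exceed the limit, which is the scenario isPartiallyPushed() accounts for.

// Hedged sketch, plain Java with no Spark or MarkLogic dependencies: three "readers" each honor
// a limit of 2 locally, yet together they emit 6 rows, so Spark must still apply the final LIMIT.
import java.util.ArrayList;
import java.util.List;

public class PartitionLimitSketch {

    // Mirrors the idea of the docCount check in ForestReader.next(): stop once "limit" rows have been produced.
    static List<String> readPartition(List<String> uris, Integer limit) {
        List<String> rows = new ArrayList<>();
        for (String uri : uris) {
            if (limit != null && rows.size() >= limit) {
                break;
            }
            rows.add(uri);
        }
        return rows;
    }

    public static void main(String[] args) {
        // Made-up URIs standing in for documents spread across three forest partitions.
        List<List<String>> partitions = List.of(
            List.of("/author/1.json", "/author/2.json", "/author/3.json"),
            List.of("/author/4.json", "/author/5.json"),
            List.of("/author/6.json", "/author/7.json", "/author/8.json")
        );

        int limit = 2;
        int total = 0;
        for (List<String> partition : partitions) {
            total += readPartition(partition, limit).size();
        }

        // Prints 6: each reader stayed within the limit, but the union did not, hence isPartiallyPushed() == true.
        System.out.println(total);
    }
}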

src/test/java/com/marklogic/spark/reader/document/PushDownLimitTest.java

Lines changed: 44 additions & 0 deletions

@@ -0,0 +1,44 @@
+package com.marklogic.spark.reader.document;
+
+import com.marklogic.spark.AbstractIntegrationTest;
+import com.marklogic.spark.Options;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+class PushDownLimitTest extends AbstractIntegrationTest {
+
+    @Test
+    void two() {
+        long count = readAuthors().limit(2).count();
+        assertTrue(count <= 6, "With a limit of 2, each reader should read at most 2 docs; they can't do " +
+            "any fewer than that because each one has no idea how many documents any other reader will get. " +
+            "Unexpected count: " + count);
+    }
+
+    @Test
+    void zero() {
+        long count = readAuthors().limit(0).count();
+        assertEquals(0, count);
+    }
+
+    @Test
+    void limitIsMoreThanTotal() {
+        long count = readAuthors().limit(20).count();
+        assertEquals(15, count, "A limit greater than the number of matching documents has no impact on the results.");
+    }
+
+    private Dataset<Row> readAuthors() {
+        return newSparkSession().read()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.READ_DOCUMENTS_COLLECTIONS, "author")
+            // Using a single partition to increase the chance that a reader will hit the limit.
+            .option(Options.READ_DOCUMENTS_PARTITIONS_PER_FOREST, 1)
+            .load();
+    }
+
+}
