Merge pull request #171 from marklogic/feature/12259-filtered-query

rjrudin · web-flow · commit c02f5fb05790 · 2024-02-21T14:56:46.000-05:00
MLE-12259 Supporting filtered queries
diff --git a/docs/configuration.md b/docs/configuration.md
@@ -149,6 +149,7 @@ The following options control how the connector reads document rows from MarkLog
 | spark.marklogic.read.documents.categories | Controls which metadata is returned for each document. Defaults to `content`. Allowable values are `content`, `metadata`, `collections`, `permissions`, `quality`, `properties`, and `metadatavalues`. |
 | spark.marklogic.read.documents.collections | Comma-delimited string of zero to many collections to constrain the query. |
 | spark.marklogic.read.documents.directory | Database directory - e.g. "/company/employees/" - to constrain the query. |
+| spark.marklogic.read.documents.filtered | Set to `true` to run [filtered searches](https://docs.marklogic.com/guide/performance/unfiltered). Defaults to `false`, as unfiltered searches are significantly faster and will produce accurate results when your application indexes are sufficient for your query. |
 | spark.marklogic.read.documents.options | Name of a set of [MarkLogic search options](https://docs.marklogic.com/guide/search-dev/query-options) to be applied against a string query. |
 | spark.marklogic.read.documents.partitionsPerForest | Number of Spark partition readers to create per forest; defaults to 4. |
 | spark.marklogic.read.documents.transform | Name of a [MarkLogic REST transform](https://docs.marklogic.com/guide/rest-dev/transforms) to apply to each matching document. |
diff --git a/docs/reading-data/documents.md b/docs/reading-data/documents.md
@@ -205,6 +205,23 @@ doc = json.loads(df2.head()['content'])
 doc['Department']
 ```
 
+## Filtered searches
+
+The connector defaults to [unfiltered searches in MarkLogic](https://docs.marklogic.com/guide/performance/unfiltered). 
+Assuming you have sufficient indexes configured for your query, an unfiltered search will return correct results with
+optimal performance.
+
+However, as noted in the documentation linked above, a query may need to be "filtered" to ensure that the returned
+results are accurate. If your query and index configuration require filtering to produce accurate results, you can
+use the following option to request a filtered search:
+
+    .option("spark.marklogic.read.documents.filtered", "true")
+
+Filtered searches are generally slower, and you should be careful with this setting for larger result sets. However,
+the cost of a filtered search may be outweighed by the connector having to return far fewer results. In that scenario,
+a filtered search will return accurate results and may also be faster overall. Ideally, though, you can configure
+indexes on your database to allow for an unfiltered search, which will return accurate results and be faster than a
+filtered search.
+
 ## Tuning performance
 
 The connector mimics the behavior of the [MarkLogic Data Movement SDK](https://docs.marklogic.com/guide/java/data-movement)
diff --git a/src/main/java/com/marklogic/spark/Options.java b/src/main/java/com/marklogic/spark/Options.java
@@ -45,6 +45,7 @@ public abstract class Options {
     public static final String READ_DOCUMENTS_CATEGORIES = "spark.marklogic.read.documents.categories";
     public static final String READ_DOCUMENTS_COLLECTIONS = "spark.marklogic.read.documents.collections";
     public static final String READ_DOCUMENTS_DIRECTORY = "spark.marklogic.read.documents.directory";
+    public static final String READ_DOCUMENTS_FILTERED = "spark.marklogic.read.documents.filtered";
     public static final String READ_DOCUMENTS_OPTIONS = "spark.marklogic.read.documents.options";
     public static final String READ_DOCUMENTS_PARTITIONS_PER_FOREST = "spark.marklogic.read.documents.partitionsPerForest";
     // Corresponds to "q" at https://docs.marklogic.com/REST/POST/v1/search, known as a "string query".
diff --git a/src/main/java/com/marklogic/spark/reader/document/ForestReader.java b/src/main/java/com/marklogic/spark/reader/document/ForestReader.java
@@ -11,6 +11,7 @@
 import com.marklogic.client.query.QueryDefinition;
 import com.marklogic.client.query.SearchQueryDefinition;
 import com.marklogic.client.query.StructuredQueryBuilder;
+import com.marklogic.spark.Options;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
 import org.apache.spark.sql.catalyst.util.ArrayBasedMapData;
@@ -66,7 +67,11 @@ class ForestReader implements PartitionReader<InternalRow> {
         }
 
         SearchQueryDefinition query = context.buildSearchQuery(client);
-        this.uriBatcher = new UriBatcher(client, query, forestPartition, context.getBatchSize(), false);
+        boolean filtered = false;
+        if (context.hasOption(Options.READ_DOCUMENTS_FILTERED)) {
+            filtered = Boolean.parseBoolean(context.getProperties().get(Options.READ_DOCUMENTS_FILTERED));
+        }
+        this.uriBatcher = new UriBatcher(client, query, forestPartition, context.getBatchSize(), filtered);
 
         this.documentManager = client.newDocumentManager();
         this.documentManager.setReadTransform(query.getResponseTransform());
diff --git a/src/test/java/com/marklogic/spark/reader/document/ReadFilteredDocumentRowsTest.java b/src/test/java/com/marklogic/spark/reader/document/ReadFilteredDocumentRowsTest.java
@@ -0,0 +1,79 @@
+package com.marklogic.spark.reader.document;
+
+import com.marklogic.spark.AbstractIntegrationTest;
+import com.marklogic.spark.Options;
+import org.apache.spark.sql.DataFrameReader;
+import org.apache.spark.sql.Dataset;
+import org.apache.spark.sql.Row;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * As touched on in the documentation for this feature, filtering can in some scenarios significantly improve
+ * performance by not retrieving a large number of false positives. Generally, as the percentage of false positives
+ * increases, the benefit from filtering increases as well, because the connector retrieves fewer documents. Overall
+ * though, we would still recommend that customers configure their indexes so that they can use an unfiltered query
+ * that is both fast and accurate.
+ */
+class ReadFilteredDocumentRowsTest extends AbstractIntegrationTest {
+
+    private static final String FALSE_POSITIVE_QUERY = "<json-property-word-query xmlns='http://marklogic.com/cts'>" +
+        "<property>ForeName</property>" +
+        "<text xml:lang='en'>Wool*</text>" +
+        "</json-property-word-query>";
+
+    private static final String CORRECT_WILDCARD_QUERY = "<json-property-word-query xmlns='http://marklogic.com/cts'>" +
+        "<property>LastName</property>" +
+        "<text xml:lang='en'>Wool*</text>" +
+        "</json-property-word-query>";
+
+    @Test
+    void falsePositive() {
+        DataFrameReader reader = newSparkSession().read()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.READ_DOCUMENTS_COLLECTIONS, "author")
+            .option(Options.READ_DOCUMENTS_QUERY, FALSE_POSITIVE_QUERY);
+
+        Dataset<Row> dataset = reader.load();
+
+        assertEquals(1, dataset.count(), "The database has trailing-wildcard-searches enabled, which allows for " +
+            "'Wool*' to work. But since the search is unfiltered, we get a false positive as 'Wooles' appears in " +
+            "the LastName property, not the ForeName property.");
+
+        dataset = reader.option(Options.READ_DOCUMENTS_FILTERED, "true").load();
+
+        assertEquals(0, dataset.count(), "Now that the search is filtered, the false positive will be omitted.");
+    }
+
+    @Test
+    void correctWildcardQuery() {
+        DataFrameReader reader = newSparkSession().read()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.READ_DOCUMENTS_COLLECTIONS, "author")
+            .option(Options.READ_DOCUMENTS_QUERY, CORRECT_WILDCARD_QUERY);
+
+        Dataset<Row> dataset = reader.load();
+        assertEquals(1, dataset.count());
+
+        dataset = reader.option(Options.READ_DOCUMENTS_FILTERED, "true").load();
+        assertEquals(1, dataset.count(), "This test just verifies that a valid wildcard query works correctly on " +
+            "our test database.");
+    }
+
+    @Test
+    void invalidValue() {
+        Dataset<Row> dataset = newSparkSession().read()
+            .format(CONNECTOR_IDENTIFIER)
+            .option(Options.CLIENT_URI, makeClientUri())
+            .option(Options.READ_DOCUMENTS_COLLECTIONS, "author")
+            .option(Options.READ_DOCUMENTS_QUERY, FALSE_POSITIVE_QUERY)
+            .option(Options.READ_DOCUMENTS_FILTERED, "not-valid")
+            .load();
+
+        assertEquals(1, dataset.count(), "Boolean.parseBoolean interprets a non-true/false value as false, so we " +
+            "expect the query to be unfiltered and thus we get back a count of 1 due to the false positive.");
+    }
+}
diff --git a/src/test/ml-config/databases/content-database.json b/src/test/ml-config/databases/content-database.json
@@ -1,6 +1,7 @@
 {
   "database-name": "%%DATABASE%%",
   "schema-database": "%%SCHEMAS_DATABASE%%",
+  "trailing-wildcard-searches": true,
   "range-element-index": [
     {
       "collation": "",

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,7 @@`
`1`	`1`	`{`
`2`	`2`	`"database-name": "%%DATABASE%%",`
`3`	`3`	`"schema-database": "%%SCHEMAS_DATABASE%%",`
	`4`	`+ "trailing-wildcard-searches": true,`
`4`	`5`	`"range-element-index": [`
`5`	`6`	`{`
`6`	`7`	`"collation": "",`