
Commit 17580b7

Merge pull request #65 from marklogic/feature/spark-3.3
Testing against Spark 3.3.2 now
2 parents: d8eb4da + a3e7824

16 files changed: +88 -146 lines

build.gradle

Lines changed: 2 additions & 2 deletions

@@ -20,7 +20,7 @@ repositories {
 }
 
 dependencies {
-    compileOnly 'org.apache.spark:spark-sql_2.12:3.4.0'
+    compileOnly 'org.apache.spark:spark-sql_2.12:' + sparkVersion
     implementation "com.marklogic:marklogic-client-api:6.2.1"
 
     // Makes it possible to use lambdas in Java 8 to implement Spark's Function1 and Function2 interfaces
@@ -30,7 +30,7 @@ dependencies {
         exclude module: "scala-library"
     }
 
-    testImplementation 'org.apache.spark:spark-sql_2.12:3.4.0'
+    testImplementation 'org.apache.spark:spark-sql_2.12:' + sparkVersion
     testImplementation 'com.marklogic:ml-app-deployer:4.5.2'
     testImplementation 'com.marklogic:marklogic-junit5:1.3.0'
     testImplementation "ch.qos.logback:logback-classic:1.3.5"

docs/index.md

Lines changed: 2 additions & 1 deletion

@@ -9,7 +9,8 @@ reading data from and writing data to MarkLogic.
 
 The connector has the following system requirements:
 
-* Apache Spark 3.4.0 or higher; earlier versions of Spark 3.x may work but have not been tested.
+* Apache Spark 3.3.0 or higher; Spark 3.3.2 and Spark 3.4.0 are recommended to better leverage the connector's support
+  for pushing operations down when reading data.
 * For writing data, MarkLogic 9.0-9 or higher.
 * For reading data, MarkLogic 10.0-9 or higher.

docs/reading.md

Lines changed: 0 additions & 1 deletion

@@ -118,7 +118,6 @@ down the following operations to MarkLogic:
 - `filter` and `where`
 - `groupBy` when followed by `count`
 - `limit`
-- `offset`
 - `orderBy`
 
 For each of the above operations, the user's Optic query is enhanced to include the associated Optic function.
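To illustrate how these push-downs surface in application code, here is a minimal sketch of a read that exercises each of them. The format name and option keys follow the connector's documentation; the host, port, credentials, and view names are hypothetical:

    import org.apache.spark.sql.Dataset;
    import org.apache.spark.sql.Row;
    import org.apache.spark.sql.SparkSession;

    SparkSession session = SparkSession.builder().master("local[*]").getOrCreate();
    Dataset<Row> rows = session.read()
        .format("com.marklogic.spark")
        .option("spark.marklogic.client.uri", "spark-user:password@localhost:8003")
        .option("spark.marklogic.read.opticQuery", "op.fromView('example', 'employee')")
        .load()
        .filter("State = 'CA'")    // pushed down via Optic's where
        .groupBy("City").count()   // pushed down as groupBy + count
        .orderBy("count")          // pushed down via orderBy
        .limit(10);                // pushed down via limit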

examples/java-dependency/build.gradle

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@ repositories {
 }
 
 dependencies {
-    implementation 'org.apache.spark:spark-sql_2.12:3.4.0'
+    implementation 'org.apache.spark:spark-sql_2.12:3.3.2'
     implementation 'com.marklogic:marklogic-spark-connector:2.0-SNAPSHOT'
 }

gradle.properties

Lines changed: 6 additions & 0 deletions

@@ -1,3 +1,9 @@
+# Testing against 3.3.2 for the 2.0.0 release as 3.3.0 was released in June 2022 and 3.3.2 in February 2023, while
+# 3.4.0 is fairly new - April 2023. And at least AWS Glue and EMR are only on 3.3.0. But 3.3.2 has bug fixes that
+# affect some of our tests - see PushDownGroupByCountTest for an example. So we're choosing to build and test
+# against the latest 3.3.x release so we're not writing assertions based on buggy behavior in Spark 3.3.0.
+sparkVersion=3.3.2
+
 # Only used for the test app and for running tests.
 mlHost=localhost
 mlAppName=spark-test
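Since sparkVersion is an ordinary Gradle project property, it can also be overridden per invocation without editing this file, e.g. ./gradlew test -PsparkVersion=3.4.0, because properties passed with -P take precedence over gradle.properties.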

src/main/java/com/marklogic/spark/reader/MarkLogicScanBuilder.java

Lines changed: 2 additions & 15 deletions

@@ -27,7 +27,6 @@
 import org.apache.spark.sql.connector.read.SupportsPushDownAggregates;
 import org.apache.spark.sql.connector.read.SupportsPushDownFilters;
 import org.apache.spark.sql.connector.read.SupportsPushDownLimit;
-import org.apache.spark.sql.connector.read.SupportsPushDownOffset;
 import org.apache.spark.sql.connector.read.SupportsPushDownRequiredColumns;
 import org.apache.spark.sql.connector.read.SupportsPushDownTopN;
 import org.apache.spark.sql.sources.Filter;
@@ -40,7 +39,7 @@
 import java.util.List;
 
 public class MarkLogicScanBuilder implements ScanBuilder, SupportsPushDownFilters, SupportsPushDownLimit,
-    SupportsPushDownOffset, SupportsPushDownTopN, SupportsPushDownAggregates, SupportsPushDownRequiredColumns {
+    SupportsPushDownTopN, SupportsPushDownAggregates, SupportsPushDownRequiredColumns {
 
     private final static Logger logger = LoggerFactory.getLogger(MarkLogicScanBuilder.class);
 
@@ -139,18 +138,6 @@ public boolean isPartiallyPushed() {
         return readContext.getBucketCount() > 1;
     }
 
-    @Override
-    public boolean pushOffset(int offset) {
-        if (readContext.planAnalysisFoundNoRows()) {
-            return false;
-        }
-        if (logger.isDebugEnabled()) {
-            logger.debug("Pushing down offset: {}", offset);
-        }
-        readContext.pushDownOffset(offset);
-        return true;
-    }
-
     @Override
     public boolean pushAggregation(Aggregation aggregation) {
         if (readContext.planAnalysisFoundNoRows()) {
@@ -160,7 +147,7 @@ public boolean pushAggregation(Aggregation aggregation) {
         if (aggregation.groupByExpressions().length > 0) {
             Expression expr = aggregation.groupByExpressions()[0];
             if (logger.isDebugEnabled()) {
-                logger.debug("Pushing down by groupBy + count on: {}", expr.describe());
+                logger.debug("Pushing down groupBy + count on: {}", expr.describe());
            }
            readContext.pushDownGroupByCount(expr);
        } else {
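The SupportsPushDownOffset interface only exists as of Spark 3.4.0, which is why it has to be dropped here to compile against Spark 3.3.2. The retained pushLimit method is not shown in this diff, but it presumably mirrors the removed pushOffset above; a hypothetical sketch, assuming the ReadContext.pushDownLimit method shown later in this commit:

    @Override
    public boolean pushLimit(int limit) {
        // Nothing to push down if plan analysis already found zero matching rows.
        if (readContext.planAnalysisFoundNoRows()) {
            return false;
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Pushing down limit: {}", limit);
        }
        readContext.pushDownLimit(limit);
        return true;
    }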

src/main/java/com/marklogic/spark/reader/PlanUtil.java

Lines changed: 0 additions & 4 deletions

@@ -67,10 +67,6 @@ static ObjectNode buildLimit(int limit) {
         return newOperation("limit", args -> args.add(limit));
     }
 
-    static ObjectNode buildOffset(int offset) {
-        return newOperation("offset", args -> args.add(offset));
-    }
-
     static ObjectNode buildOrderBy(SortOrder sortOrder) {
         final String direction = SortDirection.ASCENDING.equals(sortOrder.direction()) ? "asc" : "desc";
         final String columnName = expressionToColumnName(sortOrder.expression());
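Each of these build* helpers produces one serialized Optic operator that ReadContext appends to the plan. Below is a minimal sketch of the JSON that buildLimit(10) is assumed to emit; the private newOperation helper is not shown in this diff, so the exact "ns"/"fn"/"args" shape is an assumption based on MarkLogic's exported Optic plan format:

    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.databind.node.ObjectNode;

    // Hypothetical reconstruction of newOperation("limit", args -> args.add(10)).
    ObjectNode op = new ObjectMapper().createObjectNode();
    op.put("ns", "op");
    op.put("fn", "limit");
    op.putArray("args").add(10);
    // op.toString() => {"ns":"op","fn":"limit","args":[10]}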

src/main/java/com/marklogic/spark/reader/ReadContext.java

Lines changed: 0 additions & 4 deletions

@@ -152,10 +152,6 @@ void pushDownLimit(int limit) {
         addOperatorToPlan(PlanUtil.buildLimit(limit));
     }
 
-    void pushDownOffset(int offset) {
-        addOperatorToPlan(PlanUtil.buildOffset(offset));
-    }
-
     void pushDownTopN(SortOrder[] orders, int limit) {
         for (SortOrder sortOrder : orders) {
             addOperatorToPlan(PlanUtil.buildOrderBy(sortOrder));

src/test/java/com/marklogic/spark/AbstractIntegrationTest.java

Lines changed: 10 additions & 0 deletions

@@ -19,13 +19,16 @@
 import com.marklogic.junit5.spring.SimpleTestConfig;
 import org.apache.spark.sql.DataFrameReader;
 import org.apache.spark.sql.SparkSession;
+import org.apache.spark.util.VersionUtils;
 import org.junit.jupiter.api.AfterEach;
 import org.springframework.beans.factory.annotation.Autowired;
 import org.springframework.core.io.ClassPathResource;
 import org.springframework.util.FileCopyUtils;
 
 import java.io.IOException;
 
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
 /**
  * Uses marklogic-junit (from marklogic-unit-test) to construct a DatabaseClient
  * based on the properties in gradle.properties and gradle-local.properties.
@@ -120,4 +123,11 @@ protected final boolean isMarkLogic10() {
         return markLogicVersion.getMajor() == 10;
     }
 
+    protected final boolean isSpark340OrHigher() {
+        assertNotNull(sparkSession, "Cannot check Spark version until a Spark Session has been created.");
+        final String version = sparkSession.version();
+        int major = VersionUtils.majorVersion(version);
+        int minor = VersionUtils.minorVersion(version);
+        return major > 3 || (major == 3 && minor >= 4);
+    }
 }

src/test/java/com/marklogic/spark/reader/PushDownGroupByCountTest.java

Lines changed: 14 additions & 1 deletion

@@ -114,12 +114,25 @@ void groupByCountOrderByLimit() {
             .collectAsList();
 
         assertEquals(4, rows.size());
-        assertEquals(4, countOfRowsReadFromMarkLogic);
+        if (isSpark340OrHigher()) {
+            assertEquals(4, countOfRowsReadFromMarkLogic);
+        } else {
+            assertEquals(5, countOfRowsReadFromMarkLogic, "With Spark 3.3.x, the limit is not pushed down, perhaps " +
+                "when groupBy is called as well. Spark 3.4.0 fixes this so that the limit is pushed down. So for 3.3.x, " +
+                "we expect all 5 rows - one per CitationID.");
+        }
         assertEquals(4l, (long) rows.get(0).getAs("CitationID"));
         assertEquals(1l, (long) rows.get(0).getAs("count"));
     }
 
     private void verifyGroupByWasPushedDown(List<Row> rows) {
+        /**
+         * Note that in Spark 3.3.0, there seems to be a bug where groupBy+count are not always pushed down. That's not
+         * an issue in Spark 3.3.2, so the behavior in 3.3.0 seems to be considered buggy and thus fixed in 3.3.2.
+         * While AWS Glue and EMR are both currently using 3.3.0 and not 3.3.2, we'd rather test against the latest
+         * bugfix release to ensure we're in sync with that and not writing test assertions against what's considered
+         * buggy behavior in 3.3.0.
+         */
         assertEquals(5, countOfRowsReadFromMarkLogic, "groupBy should be pushed down to MarkLogic when used with " +
             "count, and since there are 5 CitationID values, 5 rows should be returned.");