
Commit 7ece631

DEVEXP-484 Now supporting all aggregates
1 parent f6d76ac commit 7ece631

16 files changed: +722 additions, −85 deletions

docs/reading.md

Lines changed: 6 additions & 10 deletions

@@ -110,24 +110,20 @@ fixed via changes to the options passed to the connector should be reported as n
 
 The Spark connector framework supports pushing down multiple operations to the connector data source. This can
 often provide a significant performance boost by allowing the data source to perform the operation, which can result in
-both fewer rows returned to Spark and less work for Spark to perform. The connector supports pushing
+both fewer rows returned to Spark and less work for Spark to perform. The MarkLogic Spark connector supports pushing
 down the following operations to MarkLogic:
 
 - `count`
 - `drop` and `select`
 - `filter` and `where`
-- `groupBy` when followed by `count`
+- `groupBy` plus any of `avg`, `count`, `max`, `mean`, `min`, or `sum`
 - `limit`
 - `orderBy` and `sort`
 
-For each of the above operations, the user's Optic query is enhanced to include the associated Optic function.
-Note that if multiple partitions are used to perform the `read` operation, each
-partition will apply the above functions on the rows that it retrieves from MarkLogic. Spark will then merge the results
-from each partition and re-apply the function calls as necessary to ensure that the correct response is returned.
-
-If either `count` or `groupBy` and `count` are pushed down, the connector will make a single request to MarkLogic to
-resolve the query (thus ignoring the number of partitions and batch size that may have been configured; see below
-for more information on these options), ensuring that a single count or set of counts is returned to Spark.
+For each of the above operations, the user's Optic query is enhanced to include the associated Optic function. Note
+that if multiple partitions are used to perform the `read` operation, each partition will apply the above
+functions on the rows that it retrieves from MarkLogic. Spark will then merge the results from each partition and
+apply the aggregation to ensure that the correct response is returned.
 
 In the following example, every operation after `load()` is pushed down to MarkLogic, thereby resulting in far fewer
 rows being returned to Spark and far less work having to be done by Spark:
src/main/java/com/marklogic/spark/reader/MarkLogicScanBuilder.java

Lines changed: 59 additions & 24 deletions

@@ -17,11 +17,14 @@
 
 import com.marklogic.spark.reader.filter.FilterFactory;
 import com.marklogic.spark.reader.filter.OpticFilter;
-import org.apache.spark.sql.connector.expressions.Expression;
 import org.apache.spark.sql.connector.expressions.SortOrder;
-import org.apache.spark.sql.connector.expressions.aggregate.AggregateFunc;
 import org.apache.spark.sql.connector.expressions.aggregate.Aggregation;
+import org.apache.spark.sql.connector.expressions.aggregate.Avg;
+import org.apache.spark.sql.connector.expressions.aggregate.Count;
 import org.apache.spark.sql.connector.expressions.aggregate.CountStar;
+import org.apache.spark.sql.connector.expressions.aggregate.Max;
+import org.apache.spark.sql.connector.expressions.aggregate.Min;
+import org.apache.spark.sql.connector.expressions.aggregate.Sum;
 import org.apache.spark.sql.connector.read.Scan;
 import org.apache.spark.sql.connector.read.ScanBuilder;
 import org.apache.spark.sql.connector.read.SupportsPushDownAggregates;
@@ -36,7 +39,10 @@
 
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
+import java.util.stream.Stream;
 
 public class MarkLogicScanBuilder implements ScanBuilder, SupportsPushDownFilters, SupportsPushDownLimit,
     SupportsPushDownTopN, SupportsPushDownAggregates, SupportsPushDownRequiredColumns {
@@ -46,6 +52,15 @@ public class MarkLogicScanBuilder implements ScanBuilder, SupportsPushDownFilter
     private ReadContext readContext;
     private List<Filter> pushedFilters;
 
+    private final static Set<Class> SUPPORTED_AGGREGATE_FUNCTIONS = new HashSet() {{
+        add(Avg.class);
+        add(Count.class);
+        add(CountStar.class);
+        add(Max.class);
+        add(Min.class);
+        add(Sum.class);
+    }};
+
     public MarkLogicScanBuilder(ReadContext readContext) {
         this.readContext = readContext;
     }
@@ -138,38 +153,46 @@ public boolean isPartiallyPushed() {
         return readContext.getBucketCount() > 1;
     }
 
+    /**
+     * Per the Spark javadocs, this should return true if we can push down the entire aggregation. This is only
+     * possible if every aggregation function is supported and if only one request will be made to MarkLogic. If
+     * multiple requests are made to MarkLogic (based on the user-defined partition count and batch size), then
+     * Spark has to apply the aggregation against the combined set of rows returned from all requests to MarkLogic.
+     *
+     * @param aggregation
+     * @return
+     */
     @Override
-    public boolean pushAggregation(Aggregation aggregation) {
+    public boolean supportCompletePushDown(Aggregation aggregation) {
         if (readContext.planAnalysisFoundNoRows()) {
             return false;
         }
-        if (supportCompletePushDown(aggregation)) {
-            if (aggregation.groupByExpressions().length > 0) {
-                if (logger.isInfoEnabled()) {
-                    logger.info("Pushing down groupBy + count on: {}", Arrays.asList(aggregation.groupByExpressions()));
-                }
-                readContext.pushDownGroupByCount(aggregation.groupByExpressions());
-            } else {
-                if (logger.isInfoEnabled()) {
-                    logger.info("Pushing down count()");
-                }
-                readContext.pushDownCount();
-            }
-            return true;
+
+        if (hasUnsupportedAggregateFunction(aggregation)) {
+            logger.info("Aggregation contains one or more unsupported functions, " +
                "so not pushing aggregation to MarkLogic: {}", describeAggregation(aggregation));
+            return false;
         }
-        return false;
+
+        if (readContext.getBucketCount() > 1) {
+            logger.info("Multiple requests will be made to MarkLogic; aggregation will be applied by Spark as well: {}",
                describeAggregation(aggregation));
+            return false;
+        }
+        return true;
     }
 
     @Override
-    public boolean supportCompletePushDown(Aggregation aggregation) {
-        if (readContext.planAnalysisFoundNoRows()) {
+    public boolean pushAggregation(Aggregation aggregation) {
+        // For the initial 2.0 release, there aren't any known unsupported aggregate functions that can be called
+        // after a "groupBy". If one is detected though, the aggregation won't be pushed down as it's uncertain if
+        // pushing it down would produce the correct results.
+        if (readContext.planAnalysisFoundNoRows() || hasUnsupportedAggregateFunction(aggregation)) {
            return false;
        }
-        AggregateFunc[] expressions = aggregation.aggregateExpressions();
-        // If a count() is used, it's supported if there's no groupBy - i.e. just doing a count() by itself -
-        // and supported with 1 to many groupBy's - e.g. groupBy("column", "someOtherColumn").count().
-        // Other aggregate functions will be supported in the near future.
-        return expressions.length == 1 && expressions[0] instanceof CountStar;
+        logger.info("Pushing down aggregation: {}", describeAggregation(aggregation));
+        readContext.pushDownAggregation(aggregation);
+        return true;
    }
 
    @Override
@@ -189,4 +212,16 @@ public void pruneColumns(StructType requiredSchema) {
            readContext.pushDownRequiredSchema(requiredSchema);
        }
    }
+
+    private boolean hasUnsupportedAggregateFunction(Aggregation aggregation) {
+        return Stream
+            .of(aggregation.aggregateExpressions())
+            .anyMatch(func -> !SUPPORTED_AGGREGATE_FUNCTIONS.contains(func.getClass()));
+    }
+
+    private String describeAggregation(Aggregation aggregation) {
+        return String.format("groupBy: %s; aggregates: %s",
+            Arrays.asList(aggregation.groupByExpressions()),
+            Arrays.asList(aggregation.aggregateExpressions()));
+    }
 }
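
For context, the sketch below shows how an `Aggregation` from Spark's DataSource V2 API can be checked against a supported-function set, mirroring `hasUnsupportedAggregateFunction` above. It is a standalone illustration, not the connector's class: the `SUPPORTED` set is trimmed for brevity, `decide` only approximates the `supportCompletePushDown`/`pushAggregation` handshake, and Spark 3.3+ expression classes are assumed.

```java
import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import java.util.stream.Stream;

import org.apache.spark.sql.connector.expressions.Expressions;
import org.apache.spark.sql.connector.expressions.NamedReference;
import org.apache.spark.sql.connector.expressions.aggregate.AggregateFunc;
import org.apache.spark.sql.connector.expressions.aggregate.Aggregation;
import org.apache.spark.sql.connector.expressions.aggregate.CountStar;
import org.apache.spark.sql.connector.expressions.aggregate.Sum;

public class AggregationPushDownSketch {

    // Trimmed version of the commit's supported-function set; Avg, Count, Max, and Min are omitted for brevity.
    private static final Set<Class<? extends AggregateFunc>> SUPPORTED =
        new HashSet<>(Arrays.asList(CountStar.class, Sum.class));

    // Same shape as hasUnsupportedAggregateFunction() in the diff above.
    static boolean hasUnsupportedAggregateFunction(Aggregation aggregation) {
        return Stream.of(aggregation.aggregateExpressions())
            .anyMatch(func -> !SUPPORTED.contains(func.getClass()));
    }

    // Rough approximation of the decision the connector makes: reject unsupported functions,
    // and only claim a complete pushdown when a single request (bucket) will be made.
    static String decide(Aggregation aggregation, int bucketCount) {
        if (hasUnsupportedAggregateFunction(aggregation)) {
            return "not pushed down";
        }
        return bucketCount > 1 ? "pushed down, Spark re-aggregates" : "completely pushed down";
    }

    public static void main(String[] args) {
        NamedReference salary = Expressions.column("Salary");
        Aggregation aggregation = new Aggregation(
            new AggregateFunc[]{new CountStar(), new Sum(salary, false)},
            new NamedReference[]{Expressions.column("City")});
        System.out.println(decide(aggregation, 1)); // completely pushed down
        System.out.println(decide(aggregation, 4)); // pushed down, Spark re-aggregates
    }
}
```
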

src/main/java/com/marklogic/spark/reader/PlanUtil.java

Lines changed: 80 additions & 17 deletions

@@ -23,13 +23,24 @@
 import org.apache.spark.sql.connector.expressions.NamedReference;
 import org.apache.spark.sql.connector.expressions.SortDirection;
 import org.apache.spark.sql.connector.expressions.SortOrder;
+import org.apache.spark.sql.connector.expressions.aggregate.AggregateFunc;
+import org.apache.spark.sql.connector.expressions.aggregate.Aggregation;
+import org.apache.spark.sql.connector.expressions.aggregate.Avg;
+import org.apache.spark.sql.connector.expressions.aggregate.Count;
+import org.apache.spark.sql.connector.expressions.aggregate.CountStar;
+import org.apache.spark.sql.connector.expressions.aggregate.Max;
+import org.apache.spark.sql.connector.expressions.aggregate.Min;
+import org.apache.spark.sql.connector.expressions.aggregate.Sum;
 import org.apache.spark.sql.types.StructField;
 import org.apache.spark.sql.types.StructType;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.function.Consumer;
+import java.util.function.Function;
 
 /**
  * Methods for modifying a serialized Optic plan. These were moved here both to facilitate unit testing for some of them
@@ -41,27 +52,59 @@ public abstract class PlanUtil {
 
     private final static ObjectMapper objectMapper = new ObjectMapper();
 
-    static ObjectNode buildGroupByCount() {
-        return newOperation("group-by", args -> {
-            args.add(objectMapper.nullNode());
-            addCountArg(args);
+    private static Map<Class<? extends AggregateFunc>, Function<AggregateFunc, OpticFunction>> aggregateFunctionHandlers;
+
+    // Construct the mapping of Spark aggregate function instances to OpticFunction instances that are used to build
+    // the corresponding serialized Optic function reference.
+    static {
+        aggregateFunctionHandlers = new HashMap<>();
+        aggregateFunctionHandlers.put(Avg.class, func -> {
+            Avg avg = (Avg) func;
+            return new OpticFunction("avg", avg.column(), avg.isDistinct());
+        });
+        aggregateFunctionHandlers.put(Count.class, func -> {
+            Count count = (Count)func;
+            return new OpticFunction("count", count.column(), count.isDistinct());
+        });
+        aggregateFunctionHandlers.put(Max.class, func -> new OpticFunction("max", ((Max) func).column()));
+        aggregateFunctionHandlers.put(Min.class, func -> new OpticFunction("min", ((Min) func).column()));
+        aggregateFunctionHandlers.put(Sum.class, func -> {
+            Sum sum = (Sum) func;
+            return new OpticFunction("sum", sum.column(), sum.isDistinct());
         });
     }
 
-    static ObjectNode buildGroupByCount(List<String> columnNames) {
-        return newOperation("group-by", args -> {
-            ArrayNode columns = args.addArray();
+    static ObjectNode buildGroupByAggregation(List<String> columnNames, Aggregation aggregation) {
+        return newOperation("group-by", groupByArgs -> {
+            ArrayNode columns = groupByArgs.addArray();
             columnNames.forEach(columnName -> populateSchemaCol(columns.addObject(), columnName));
-            addCountArg(args);
-        });
-    }
 
-    private static void addCountArg(ArrayNode args) {
-        args.addObject().put("ns", "op").put("fn", "count").putArray("args")
-            // "count" is used as the column name as that's what Spark uses when the operation is not pushed down.
-            .add("count")
-            // Using "null" is the equivalent of "count(*)" - it counts rows, not values.
-            .add(objectMapper.nullNode());
+            ArrayNode aggregates = groupByArgs.addArray();
+            for (AggregateFunc func : aggregation.aggregateExpressions()) {
+                // Need special handling for CountStar, as it does not have a column name with it.
+                if (func instanceof CountStar) {
+                    aggregates.addObject().put("ns", "op").put("fn", "count").putArray("args")
+                        // "count" is used as the column name as that's what Spark uses when the operation is not pushed down.
+                        .add("count")
+                        // Using "null" is the equivalent of "count(*)" - it counts rows, not values.
+                        .add(objectMapper.nullNode());
+                } else if (aggregateFunctionHandlers.containsKey(func.getClass())) {
+                    OpticFunction opticFunction = aggregateFunctionHandlers.get(func.getClass()).apply(func);
+                    ArrayNode aggregateArgs = aggregates
+                        .addObject().put("ns", "op").put("fn", opticFunction.functionName)
+                        .putArray("args");
+                    aggregateArgs.add(func.toString());
+                    populateSchemaCol(aggregateArgs.addObject(), opticFunction.columnName);
+                    // TODO This is the correct JSON to add, but have not found a way to create an AggregateFunc that
+                    // returns "true" for isDistinct().
+                    if (opticFunction.distinct) {
+                        aggregateArgs.addObject().put("values", "distinct");
+                    }
+                } else {
+                    logger.info("Unsupported aggregate function, will not be pushed to Optic: {}", func);
+                }
+            }
+        });
     }
 
     static ObjectNode buildLimit(int limit) {
@@ -71,7 +114,7 @@ static ObjectNode buildLimit(int limit) {
     static ObjectNode buildOrderBy(SortOrder[] sortOrders) {
         return newOperation("order-by", args -> {
             ArrayNode innerArgs = args.addArray();
-            for (SortOrder sortOrder: sortOrders) {
+            for (SortOrder sortOrder : sortOrders) {
                 final String direction = SortDirection.ASCENDING.equals(sortOrder.direction()) ? "asc" : "desc";
                 ArrayNode orderByArgs = innerArgs.addObject().put("ns", "op").put("fn", direction).putArray("args");
                 String columnName = expressionToColumnName(sortOrder.expression());
@@ -170,4 +213,24 @@ static String expressionToColumnName(Expression expression) {
         }
         return fieldNames[0];
     }
+
+    /**
+     * Captures the name of an Optic function and the column name based on a Spark AggregateFunc's Expression. Used
+     * to simplify building a serialized Optic function reference.
+     */
+    private static class OpticFunction {
+        final String functionName;
+        final String columnName;
+        final boolean distinct;
+
+        OpticFunction(String functionName, Expression column) {
+            this(functionName, column, false);
+        }
+
+        OpticFunction(String functionName, Expression column, boolean distinct) {
+            this.functionName = functionName;
+            this.columnName = expressionToColumnName(column);
+            this.distinct = distinct;
+        }
+    }
 }
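
To make the serialized plan easier to picture, the following standalone Jackson sketch approximates the `group-by` operation JSON that `buildGroupByAggregation` could emit for a `groupBy("City").avg("Salary")` pushdown. The plain `op`/`col` column references are an assumption for illustration only; the actual structure comes from `newOperation` and `populateSchemaCol`, which may emit schema/view-qualified references.

```java
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;

public class GroupByPlanSketch {

    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();

        // The group-by operation itself: {"ns": "op", "fn": "group-by", "args": [...]}.
        ObjectNode groupBy = mapper.createObjectNode();
        groupBy.put("ns", "op").put("fn", "group-by");
        ArrayNode args = groupBy.putArray("args");

        // First argument: the grouping columns. A simple op.col reference is assumed here;
        // the connector's populateSchemaCol may produce a schema/view-qualified reference instead.
        ArrayNode groupingColumns = args.addArray();
        groupingColumns.addObject().put("ns", "op").put("fn", "col").putArray("args").add("City");

        // Second argument: the aggregate expressions. Spark's name for the result column,
        // e.g. "avg(Salary)", is used as the output column name, mirroring aggregateArgs.add(func.toString()).
        ArrayNode aggregates = args.addArray();
        ArrayNode avgArgs = aggregates.addObject().put("ns", "op").put("fn", "avg").putArray("args");
        avgArgs.add("avg(Salary)");
        avgArgs.addObject().put("ns", "op").put("fn", "col").putArray("args").add("Salary");

        System.out.println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(groupBy));
    }
}
```
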
