diff --git a/src/sqlancer/datafusion/DataFusionErrors.java b/src/sqlancer/datafusion/DataFusionErrors.java index d4b208f5..981ab913 100644 --- a/src/sqlancer/datafusion/DataFusionErrors.java +++ b/src/sqlancer/datafusion/DataFusionErrors.java @@ -2,6 +2,8 @@ import static sqlancer.datafusion.DataFusionUtil.dfAssert; +import java.util.regex.Pattern; + import sqlancer.common.query.ExpectedErrors; public final class DataFusionErrors { @@ -31,12 +33,15 @@ public static void registerExpectedExecutionErrors(ExpectedErrors errors) { /* * Known bugs */ - errors.add("to type Int64"); // https://github.com/apache/datafusion/issues/11252 + errors.add("to type Int64"); // https://github.com/apache/datafusion/issues/11249 errors.add("bitwise"); // https://github.com/apache/datafusion/issues/11260 - errors.add("NestedLoopJoinExec"); // https://github.com/apache/datafusion/issues/11269 + errors.add(" Not all InterleaveExec children have a consistent hash partitioning."); // https://github.com/apache/datafusion/issues/11409 + Pattern pattern = Pattern.compile("JOIN.*NULL", Pattern.CASE_INSENSITIVE); + errors.addRegex(pattern); // https://github.com/apache/datafusion/issues/11414 /* * False positives */ + errors.add("Cannot cast string"); // ifnull() is passed two incompatible types, which causes an execution error errors.add("Physical plan does not support logical expression AggregateFunction"); // False positive: when aggr // is generated in where // clause diff --git a/src/sqlancer/datafusion/DataFusionToStringVisitor.java b/src/sqlancer/datafusion/DataFusionToStringVisitor.java index 1f303055..1f0276d4 100644 --- a/src/sqlancer/datafusion/DataFusionToStringVisitor.java +++ b/src/sqlancer/datafusion/DataFusionToStringVisitor.java @@ -1,13 +1,16 @@ package sqlancer.datafusion; +import static sqlancer.datafusion.DataFusionUtil.dfAssert; + import java.util.List; +import sqlancer.Randomly; import sqlancer.common.ast.newast.NewToStringVisitor; import sqlancer.common.ast.newast.Node; 
import sqlancer.datafusion.ast.DataFusionConstant; import sqlancer.datafusion.ast.DataFusionExpression; -import sqlancer.datafusion.ast.DataFusionJoin; import sqlancer.datafusion.ast.DataFusionSelect; +import sqlancer.datafusion.ast.DataFusionSelect.DataFusionFrom; public class DataFusionToStringVisitor extends NewToStringVisitor { @@ -29,25 +32,61 @@ public void visitSpecific(Node expr) { visit((DataFusionConstant) expr); } else if (expr instanceof DataFusionSelect) { visit((DataFusionSelect) expr); - } else if (expr instanceof DataFusionJoin) { - visit((DataFusionJoin) expr); + } else if (expr instanceof DataFusionFrom) { + visit((DataFusionFrom) expr); } else { throw new AssertionError(expr.getClass()); } } - private void visit(DataFusionJoin join) { - visit(join.getLeftTable()); - sb.append(" "); - sb.append(join.getJoinType()); - sb.append(" "); - - sb.append(" JOIN "); - visit(join.getRightTable()); - if (join.getOnCondition() != null) { - sb.append(" ON "); - visit(join.getOnCondition()); + private void visit(DataFusionFrom from) { + sb.append(" FROM "); + + dfAssert(from.joinTypeList.size() == from.joinConditionList.size(), "Validate from"); + + /* e.g. from t1, t2, t3 */ + if (from.joinConditionList.isEmpty()) { + visit(from.tableList); + return; + } + + dfAssert(from.joinConditionList.size() == from.tableList.size() - 1, "Validate from"); + /* e.g. 
from t1 join t2 on t1.v1=t2.v1 */ + visit(from.tableList.get(0)); + for (int i = 0; i < from.joinConditionList.size(); i++) { + switch (from.joinTypeList.get(i)) { + case INNER: + sb.append(Randomly.fromOptions(" JOIN ", " INNER JOIN ")); + break; + case LEFT: + sb.append(Randomly.fromOptions(" LEFT JOIN ", " LEFT OUTER JOIN ")); + break; + case RIGHT: + sb.append(Randomly.fromOptions(" RIGHT JOIN ", " RIGHT OUTER JOIN ")); + break; + case FULL: + sb.append(Randomly.fromOptions(" FULL JOIN ", " FULL OUTER JOIN ")); + break; + case CROSS: + sb.append(" CROSS JOIN "); + break; + case NATURAL: + sb.append(" NATURAL JOIN "); + break; + default: + dfAssert(false, "Unreachable"); + } + + visit(from.tableList.get(i + 1)); // ti + + /* ON ... */ + Node cond = from.joinConditionList.get(i); + if (cond != null) { + sb.append(" ON "); + visit(cond); + } } + } private void visit(DataFusionConstant constant) { @@ -62,14 +101,7 @@ private void visit(DataFusionSelect select) { visit(select.getFetchColumns()); } - sb.append(" FROM "); - visit(select.getFromList()); - if (!select.getFromList().isEmpty() && !select.getJoinList().isEmpty()) { - sb.append(", "); - } - if (!select.getJoinList().isEmpty()) { - visit(select.getJoinList()); - } + visit(select.from); if (select.getWhereClause() != null) { sb.append(" WHERE "); visit(select.getWhereClause()); diff --git a/src/sqlancer/datafusion/ast/DataFusionJoin.java b/src/sqlancer/datafusion/ast/DataFusionJoin.java deleted file mode 100644 index 61cd9f55..00000000 --- a/src/sqlancer/datafusion/ast/DataFusionJoin.java +++ /dev/null @@ -1,92 +0,0 @@ -package sqlancer.datafusion.ast; - -import java.util.ArrayList; -import java.util.List; -import java.util.stream.Collectors; - -import sqlancer.Randomly; -import sqlancer.common.ast.newast.Node; -import sqlancer.common.ast.newast.TableReferenceNode; -import sqlancer.datafusion.DataFusionProvider.DataFusionGlobalState; -import sqlancer.datafusion.DataFusionSchema; -import 
sqlancer.datafusion.DataFusionSchema.DataFusionColumn; -import sqlancer.datafusion.DataFusionSchema.DataFusionTable; -import sqlancer.datafusion.gen.DataFusionExpressionGenerator; - -/* - NOT IMPLEMENTED YET - */ -public class DataFusionJoin implements Node { - - private final TableReferenceNode leftTable; - private final TableReferenceNode rightTable; - private final JoinType joinType; - private final Node onCondition; - - public DataFusionJoin(TableReferenceNode leftTable, - TableReferenceNode rightTable, JoinType joinType, - Node whereCondition) { - this.leftTable = leftTable; - this.rightTable = rightTable; - this.joinType = joinType; - this.onCondition = whereCondition; - } - - public static List> getJoins(List tables, - DataFusionGlobalState globalState) { - // [t1_join_t2, t1_join_t3, ...] - List> tableList = tables.stream() - .map(t -> new TableReferenceNode(t)) - .collect(Collectors.toList()); - List> joinExpressions = new ArrayList<>(); - while (tableList.size() >= 2 && Randomly.getBooleanWithRatherLowProbability()) { - TableReferenceNode leftTable = tableList.remove(0); - TableReferenceNode rightTable = tableList.remove(0); - List columns = new ArrayList<>(leftTable.getTable().getColumns()); - columns.addAll(rightTable.getTable().getColumns()); - // TODO(datafusion) this `joinGen` can generate super chaotic exprsions, maybe we should make it more like a - // normal join expression - DataFusionExpressionGenerator joinGen = new DataFusionExpressionGenerator(globalState).setColumns(columns); - switch (DataFusionJoin.JoinType.getRandom()) { - case INNER: - joinExpressions.add(DataFusionJoin.createInnerJoin(leftTable, rightTable, - joinGen.generateExpression(DataFusionSchema.DataFusionDataType.BOOLEAN))); - break; - default: - throw new AssertionError(); - } - } - return joinExpressions; - } - - public static DataFusionJoin createInnerJoin(TableReferenceNode left, - TableReferenceNode right, Node predicate) { - return new DataFusionJoin(left, right, 
JoinType.INNER, predicate); - } - - public TableReferenceNode getLeftTable() { - return leftTable; - } - - public TableReferenceNode getRightTable() { - return rightTable; - } - - public JoinType getJoinType() { - return joinType; - } - - public Node getOnCondition() { - return onCondition; - } - - public enum JoinType { - INNER; - // NATURAL, LEFT, RIGHT; - - public static JoinType getRandom() { - return Randomly.fromOptions(values()); - } - } - -} diff --git a/src/sqlancer/datafusion/ast/DataFusionSelect.java b/src/sqlancer/datafusion/ast/DataFusionSelect.java index a80758c4..817f17eb 100644 --- a/src/sqlancer/datafusion/ast/DataFusionSelect.java +++ b/src/sqlancer/datafusion/ast/DataFusionSelect.java @@ -1,5 +1,8 @@ package sqlancer.datafusion.ast; +import static sqlancer.datafusion.DataFusionUtil.dfAssert; + +import java.util.ArrayList; import java.util.List; import java.util.Optional; import java.util.stream.Collectors; @@ -18,39 +21,166 @@ public class DataFusionSelect extends SelectBase> implements Node { public Optional fetchColumnsString = Optional.empty(); // When available, override `fetchColumns` in base // class's `Node` representation (for display) + + // `from` is used to represent from table list and join clause + // `fromList` and `joinList` in base class should always be empty + public DataFusionFrom from; public DataFusionExpressionGenerator exprGen; - // Construct a `DataFusionSelect` with random SELECT, FROM, WHERE + public enum JoinType { + INNER, LEFT, RIGHT, FULL, CROSS, NATURAL + } + + // DataFusionFrom can be used to represent from table list or join list + // 1. When `joinConditionList` is empty, then it's a table list (implicit cross join) + // join condition can be generated in `WHERE` clause (outside `FromClause`) + // e.g. select * from [expr], [expr] is t1, t3, t2 + // - tableList -> {t1, t3, t2} + // - joinTypeList, joinConditionList -> {} (both empty) + // 2. When `joinConditionList` is not empty, the from-clause is a join list + // e.g. 
+ // select * from t1 + // join t2 on t1.v1=t2.v1 + // left join t3 on t1.v1=t2.v1 and t1.v2=t3.v2 + // - tableList -> {t1, t2, t3} + // - joinTypeList -> {INNER, LEFT} + // - joinConditionList -> {[expr_with_t1_t2], [expr_with_t1_t2_t3]} + public static class DataFusionFrom implements Node { + public List> tableList; + public List joinTypeList; + public List> joinConditionList; + + public DataFusionFrom() { + tableList = new ArrayList<>(); + joinTypeList = new ArrayList<>(); + joinConditionList = new ArrayList<>(); + } + + public boolean isExplicitJoin() { + // For an explicit join, joinTypeList and joinConditionList should both have length tableList.size() - 1; + // otherwise both are empty + dfAssert(joinTypeList.size() == joinConditionList.size(), "Validate FromClause"); + return !joinTypeList.isEmpty(); + } + + // Randomly generate a FromClause + // TODO(datafusion) support self join 'select * from t1, t1 as t1a' + // TODO(datafusion) support using 'select * from t1 join t2 using(v0)' + public static DataFusionFrom generateFromClause(DataFusionGlobalState state, + List randomTables) { + DataFusionFrom fromClause = new DataFusionFrom(); // return result + + /* Setup tableList */ + dfAssert(!randomTables.isEmpty(), "Must have some tables"); + List> randomTableNodes = randomTables.stream() + .map(t -> new TableReferenceNode(t)) + .collect(Collectors.toList()); + fromClause.tableList = randomTableNodes; + + /* If JoinConditionList is empty, FromClause will be interpreted as from list */ + if (Randomly.getBoolean() && Randomly.getBoolean()) { + return fromClause; + } + + /* Set fromClause's joinTypeList and joinConditionList */ + List possibleColsToGenExpr = new ArrayList<>(); + possibleColsToGenExpr.addAll(randomTables.get(0).getColumns()); // first table + // Generate join conditions (see class-level comment example's joinConditionList) + // + // Join Type | `ON` Clause Requirement + // INNER JOIN | Required + // LEFT OUTER JOIN | Required + // RIGHT OUTER JOIN | 
Required + // FULL OUTER JOIN | Required + // CROSS JOIN | Not allowed + // NATURAL JOIN | Not allowed + // JOIN with USING | Optional + for (int i = 1; i < randomTables.size(); i++) { + JoinType randomJoinType = Randomly.fromOptions(JoinType.values()); + fromClause.joinTypeList.add(randomJoinType); + possibleColsToGenExpr.addAll(randomTables.get(i).getColumns()); + DataFusionExpressionGenerator exprGen = new DataFusionExpressionGenerator(state) + .setColumns(possibleColsToGenExpr); + if (randomJoinType == JoinType.CROSS || randomJoinType == JoinType.NATURAL) { + if (Randomly.getBooleanWithSmallProbability()) { + fromClause.joinConditionList + .add(exprGen.generateExpression(DataFusionSchema.DataFusionDataType.BOOLEAN)); + } else { + fromClause.joinConditionList.add(null); + } + } else { + if (Randomly.getBooleanWithSmallProbability()) { + fromClause.joinConditionList.add(null); + } else { + fromClause.joinConditionList + .add(exprGen.generateExpression(DataFusionSchema.DataFusionDataType.BOOLEAN)); + } + } + // TODO(datafusion) make join conditions more likely to be 'col1=col2', also some join types don't have + // 'ON' condition + } + + return fromClause; + } + } + + // Generate SELECT statement according to the dependency of exprs, e.g.: + // SELECT [expr_groupby_cols], [expr_aggr_cols] + // FROM [from_clause] + // WHERE [expr_all_cols] + // GROUP BY [expr_groupby_cols] + // HAVING [expr_groupby_cols], [expr_aggr_cols] + // ORDER BY [expr_groupby_cols], [expr_aggr_cols] + // LIMIT [constant] + // + // The generation order will be: + // 1. [from_clause] - Pick tables like t1, t2, t3 and get a join clause + // 2. [expr_all_cols] - Generate a non-aggregate expression with all columns in t1, t2, t3. e.g.: + // - t1.v1 = t2.v1 and t1.v2 > t3.v2 + // 3. 
[expr_groupby_cols], [expr_aggr_cols] - Randomly pick some cols in t1, t2, t3 as group by columns, and pick + // some other columns as aggregation columns, and generate non-aggr expression [expr_groupby_cols] on group by + // columns, finally generate aggregation expressions [expr_aggr_cols] on non-group-by/aggregation columns. + // For example, group by column is t1.v1, and aggregate columns is t2.v1, t3.v1, generated expressions can be: + // - [expr_groupby_cols] t1.v1 + 1 + // - [expr_aggr_cols] SUM(t3.v1 + t2.v1) public static DataFusionSelect getRandomSelect(DataFusionGlobalState state) { DataFusionSelect randomSelect = new DataFusionSelect(); - // Randomly pick up to 4 tables to select from + /* Setup FROM clause */ DataFusionSchema schema = state.getSchema(); // schema of all tables List allTables = schema.getDatabaseTables(); List randomTables = Randomly.nonEmptySubset(allTables); - int maxSize = Randomly.fromOptions(1, 2, 3, 4); + int maxSize = Randomly.fromOptions(1, 2, 3); if (randomTables.size() > maxSize) { randomTables = randomTables.subList(0, maxSize); } + DataFusionFrom randomFrom = DataFusionFrom.generateFromClause(state, randomTables); - // Randomly choose some columns from `randomTables` - // And generate a random expression which might contain those columns - List randomColumns = DataFusionTable.getRandomColumns(randomTables); + /* Setup WHERE clause */ + List randomColumns = DataFusionTable.getRandomColumns(randomTables); randomSelect.exprGen = new DataFusionExpressionGenerator(state).setColumns(randomColumns); Node whereExpr = randomSelect.exprGen .generateExpression(DataFusionSchema.DataFusionDataType.BOOLEAN); - // Constructing result - List> randomTableNodes = randomTables.stream() - .map(t -> new TableReferenceNode(t)) - .collect(Collectors.toList()); + /* Constructing result */ List> randomColumnNodes = randomColumns.stream() .map((c) -> new ColumnReferenceNode(c)) .collect(Collectors.toList()); - 
randomSelect.setFetchColumns(randomColumnNodes); - randomSelect.setFromList(randomTableNodes); + randomSelect.setFetchColumns(randomColumnNodes); // TODO(datafusion) make it more random like 'select *' + randomSelect.from = randomFrom; randomSelect.setWhereClause(whereExpr); + // // if explicit join (from t1 join t2), 50% case generate where clause + // // if join is implicit (from t1, t2), 90% case generate where + // if (randomFrom.isExplicitJoin()) { + // if (Randomly.getBoolean()) { + // randomSelect.setWhereClause(whereExpr); + // } + // } else { + // if (!Randomly.getBooleanWithRatherLowProbability()) { + // randomSelect.setWhereClause(whereExpr); + // } + // } return randomSelect; } diff --git a/src/sqlancer/datafusion/test/DataFusionNoRECOracle.java b/src/sqlancer/datafusion/test/DataFusionNoRECOracle.java index 55f18fec..11733a73 100644 --- a/src/sqlancer/datafusion/test/DataFusionNoRECOracle.java +++ b/src/sqlancer/datafusion/test/DataFusionNoRECOracle.java @@ -27,8 +27,11 @@ public DataFusionNoRECOracle(DataFusionGlobalState globalState) { } /* - * Non-Optimizing Reference Engine Construction q1: SELECT [expr1] FROM [expr2] WHERE [expr3] q2: SELECT [expr3] - * FROM [expr2] + * Non-Optimizing Reference Engine Construction + * + * q1: SELECT [expr1] FROM [expr2] WHERE [expr3] + * + * q2: SELECT [expr3] FROM [expr2] * * Oracle Check: q1's result size equals to `true` count in q2's result set */ @@ -43,14 +46,14 @@ public void check() throws SQLException { // Q1: SELECT count(*) FROM [expr2] WHERE [expr3] DataFusionSelect q1 = new DataFusionSelect(); q1.setFetchColumnsString("COUNT(*)"); - q1.setFromList(randomSelect.getFromList()); + q1.from = randomSelect.from; q1.setWhereClause(randomSelect.getWhereClause()); // Q2: SELECT count(case when [expr3] then 1 else null end) FROM [expr2] DataFusionSelect q2 = new DataFusionSelect(); String selectExpr = String.format("COUNT(CASE WHEN %S THEN 1 ELSE NULL END)", 
DataFusionToStringVisitor.asString(randomSelect.getWhereClause())); q2.setFetchColumnsString(selectExpr); - q2.setFromList(randomSelect.getFromList()); + q2.from = randomSelect.from; q2.setWhereClause(null); /* diff --git a/src/sqlancer/datafusion/test/DataFusionQueryPartitioningWhereTester.java b/src/sqlancer/datafusion/test/DataFusionQueryPartitioningWhereTester.java index 3235c3d6..03948d05 100644 --- a/src/sqlancer/datafusion/test/DataFusionQueryPartitioningWhereTester.java +++ b/src/sqlancer/datafusion/test/DataFusionQueryPartitioningWhereTester.java @@ -20,8 +20,15 @@ public DataFusionQueryPartitioningWhereTester(DataFusionGlobalState state) { } /* - * Query Partitioning - Where q: SELECT [expr1] FROM [expr2] qp1: SELECT [expr1] FROM [expr2] WHERE [expr3] qp2: - * SELECT [expr1] FROM [expr2] WHERE NOT [expr3] qp3: SELECT [expr1] FROM [expr2] WHERE [expr3] IS NULL + * Query Partitioning - Where + * + * q: SELECT [expr1] FROM [expr2] + * + * qp1: SELECT [expr1] FROM [expr2] WHERE [expr3] + * + * qp2: SELECT [expr1] FROM [expr2] WHERE NOT [expr3] + * + * qp3: SELECT [expr1] FROM [expr2] WHERE [expr3] IS NULL * * Oracle check: q's result equals to union(qp1, qp2, qp3) */