Skip to content

Commit fda2ec7

Browse files
committed
fix merge
1 parent 943c384 commit fda2ec7

File tree

1 file changed

+14
-145
lines changed

1 file changed

+14
-145
lines changed

datafusion/physical-optimizer/src/pruning.rs

Lines changed: 14 additions & 145 deletions
Original file line number | Diff line number | Diff line change
@@ -1607,138 +1607,11 @@ fn build_statistics_expr(
16071607
Ok(statistics_expr)
16081608
}
16091609

1610-
<<<<<<< Updated upstream
1611-
/// Wrap the statistics expression in a case expression.
1612-
/// This is necessary to handle the case where the column is known
1613-
/// to be all nulls.
1614-
=======
1615-
/// Convert `column LIKE literal` where P is a constant prefix of the literal
1616-
/// to a range check on the column: `P <= column && column < P'`, where P' is the
1617-
/// lowest string after all P* strings.
1618-
fn build_like_match(
1619-
expr_builder: &mut PruningExpressionBuilder,
1620-
) -> Option<Arc<dyn PhysicalExpr>> {
1621-
// column LIKE literal => (min, max) LIKE literal split at % => min <= split literal && split literal <= max
1622-
// column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
1623-
// column LIKE '%foo' => min <= '' && '' <= max => true
1624-
// column LIKE '%foo%' => min <= '' && '' <= max => true
1625-
// column LIKE 'foo' => min <= 'foo' && 'foo' <= max
1626-
1627-
fn unpack_string(s: &ScalarValue) -> Option<&String> {
1628-
match s {
1629-
ScalarValue::Utf8(Some(s)) => Some(s),
1630-
ScalarValue::LargeUtf8(Some(s)) => Some(s),
1631-
ScalarValue::Utf8View(Some(s)) => Some(s),
1632-
ScalarValue::Dictionary(_, value) => unpack_string(value),
1633-
_ => None,
1634-
}
1635-
}
1636-
1637-
fn extract_string_literal(expr: &Arc<dyn PhysicalExpr>) -> Option<&String> {
1638-
if let Some(lit) = expr.as_any().downcast_ref::<phys_expr::Literal>() {
1639-
let s = unpack_string(lit.value())?;
1640-
return Some(s);
1641-
}
1642-
None
1643-
}
1644-
1645-
// TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
1646-
// this may involve building the physical expressions that call lower() and upper()
1647-
let min_column_expr = expr_builder.min_column_expr().ok()?;
1648-
let max_column_expr = expr_builder.max_column_expr().ok()?;
1649-
let scalar_expr = expr_builder.scalar_expr();
1650-
// check that the scalar is a string literal
1651-
let s = extract_string_literal(scalar_expr)?;
1652-
// ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
1653-
let first_wildcard_index = s.find(['%', '_']);
1654-
if first_wildcard_index == Some(0) {
1655-
// there's no filtering we could possibly do, return `None` and have this be handled by the unhandled hook
1656-
return None;
1657-
}
1658-
let (lower_bound, upper_bound) = if let Some(wildcard_index) = first_wildcard_index {
1659-
let prefix = &s[..wildcard_index];
1660-
let lower_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
1661-
prefix.to_string(),
1662-
))));
1663-
let upper_bound_lit = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(
1664-
increment_utf8(prefix)?,
1665-
))));
1666-
(lower_bound_lit, upper_bound_lit)
1667-
} else {
1668-
// the like expression is a literal and can be converted into a comparison
1669-
let bound = Arc::new(phys_expr::Literal::new(ScalarValue::Utf8(Some(s.clone()))));
1670-
(bound.clone(), bound)
1671-
};
1672-
let lower_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
1673-
lower_bound,
1674-
Operator::LtEq,
1675-
max_column_expr.clone(),
1676-
));
1677-
let upper_bound_expr = Arc::new(phys_expr::BinaryExpr::new(
1678-
min_column_expr.clone(),
1679-
Operator::LtEq,
1680-
upper_bound,
1681-
));
1682-
let combined = Arc::new(phys_expr::BinaryExpr::new(
1683-
upper_bound_expr,
1684-
Operator::And,
1685-
lower_bound_expr,
1686-
));
1687-
Some(combined)
1688-
}
1689-
1690-
/// Increment a UTF8 string by one, returning `None` if it can't be incremented.
1691-
/// This makes it so that the returned string will always compare greater than the input string
1692-
/// or any other string with the same prefix.
1693-
/// This is necessary since the statistics may have been truncated: if we have a min statistic
1694-
/// of "fo" that may have originally been "foz" or anything else with the prefix "fo".
1695-
/// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"`
1696-
/// In this example `increment_utf8("foo") == "fop"`
1697-
fn increment_utf8(data: &str) -> Option<String> {
1698-
// Helper function to check if a character is valid to use
1699-
fn is_valid_unicode(c: char) -> bool {
1700-
let cp = c as u32;
1701-
1702-
// Filter out non-characters (https://www.unicode.org/versions/corrigendum9.html)
1703-
if [0xFFFE, 0xFFFF].contains(&cp) || (0xFDD0..=0xFDEF).contains(&cp) {
1704-
return false;
1705-
}
1706-
1707-
// Filter out values beyond the Unicode code point range (greater than U+10FFFF)
1708-
if cp >= 0x110000 {
1709-
return false;
1710-
}
1711-
1712-
true
1713-
}
1714-
1715-
// Convert string to vector of code points
1716-
let mut code_points: Vec<char> = data.chars().collect();
1717-
1718-
// Work backwards through code points
1719-
for idx in (0..code_points.len()).rev() {
1720-
let original = code_points[idx] as u32;
1721-
1722-
// Try incrementing the code point
1723-
if let Some(next_char) = char::from_u32(original + 1) {
1724-
if is_valid_unicode(next_char) {
1725-
code_points[idx] = next_char;
1726-
// truncate the string to the current index
1727-
code_points.truncate(idx + 1);
1728-
return Some(code_points.into_iter().collect());
1729-
}
1730-
}
1731-
}
1732-
1733-
None
1734-
}
1735-
17361610
/// Wrap the statistics expression in a check that skips the expression if the column is all nulls.
17371611
/// This is important not only as an optimization but also because statistics may not be
17381612
/// accurate for columns that are all nulls.
17391613
/// For example, for an `int` column `x` with all nulls, the min/max/null_count statistics
17401614
/// might be set to 0 and evaluating `x = 0` would incorrectly include the column.
1741-
>>>>>>> Stashed changes
17421615
///
17431616
/// For example:
17441617
///
@@ -1767,15 +1640,11 @@ fn wrap_null_count_check_expr(
17671640
let not_when_null_count_eq_row_count = phys_expr::not(when_null_count_eq_row_count)?;
17681641

17691642
// NOT (x_null_count = x_row_count) AND (<statistics_expr>)
1770-
Ok(
1771-
Arc::new(
1772-
phys_expr::BinaryExpr::new(
1773-
not_when_null_count_eq_row_count,
1774-
Operator::And,
1775-
statistics_expr,
1776-
)
1777-
)
1778-
)
1643+
Ok(Arc::new(phys_expr::BinaryExpr::new(
1644+
not_when_null_count_eq_row_count,
1645+
Operator::And,
1646+
statistics_expr,
1647+
)))
17791648
}
17801649

17811650
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
@@ -2364,7 +2233,8 @@ mod tests {
23642233
#[test]
23652234
fn row_group_predicate_eq() -> Result<()> {
23662235
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
2367-
let expected_expr = "NOT c1_null_count@2 = c1_row_count@3 AND c1_min@0 <= 1 AND 1 <= c1_max@1";
2236+
let expected_expr =
2237+
"NOT c1_null_count@2 = c1_row_count@3 AND c1_min@0 <= 1 AND 1 <= c1_max@1";
23682238

23692239
// test column on the left
23702240
let expr = col("c1").eq(lit(1));
@@ -2384,7 +2254,8 @@ mod tests {
23842254
#[test]
23852255
fn row_group_predicate_not_eq() -> Result<()> {
23862256
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
2387-
let expected_expr = "NOT c1_null_count@2 = c1_row_count@3 AND (c1_min@0 != 1 OR 1 != c1_max@1)";
2257+
let expected_expr =
2258+
"NOT c1_null_count@2 = c1_row_count@3 AND (c1_min@0 != 1 OR 1 != c1_max@1)";
23882259

23892260
// test column on the left
23902261
let expr = col("c1").not_eq(lit(1));
@@ -2404,8 +2275,7 @@ mod tests {
24042275
#[test]
24052276
fn row_group_predicate_gt() -> Result<()> {
24062277
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
2407-
let expected_expr =
2408-
"NOT c1_null_count@1 = c1_row_count@2 AND c1_max@0 > 1";
2278+
let expected_expr = "NOT c1_null_count@1 = c1_row_count@2 AND c1_max@0 > 1";
24092279

24102280
// test column on the left
24112281
let expr = col("c1").gt(lit(1));
@@ -2444,8 +2314,7 @@ mod tests {
24442314
#[test]
24452315
fn row_group_predicate_lt() -> Result<()> {
24462316
let schema = Schema::new(vec![Field::new("c1", DataType::Int32, false)]);
2447-
let expected_expr =
2448-
"NOT c1_null_count@1 = c1_row_count@2 AND c1_min@0 < 1";
2317+
let expected_expr = "NOT c1_null_count@1 = c1_row_count@2 AND c1_min@0 < 1";
24492318

24502319
// test column on the left
24512320
let expr = col("c1").lt(lit(1));
@@ -2490,8 +2359,7 @@ mod tests {
24902359
]);
24912360
// test AND operator joining supported c1 < 1 expression and unsupported c2 > c3 expression
24922361
let expr = col("c1").lt(lit(1)).and(col("c2").lt(col("c3")));
2493-
let expected_expr =
2494-
"NOT c1_null_count@1 = c1_row_count@2 AND c1_min@0 < 1";
2362+
let expected_expr = "NOT c1_null_count@1 = c1_row_count@2 AND c1_min@0 < 1";
24952363
let predicate_expr =
24962364
test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
24972365
assert_eq!(predicate_expr.to_string(), expected_expr);
@@ -2800,7 +2668,8 @@ mod tests {
28002668
test_build_predicate_expression(&expr, &schema, &mut RequiredColumns::new());
28012669
assert_eq!(predicate_expr.to_string(), expected_expr);
28022670

2803-
let expected_expr = "NOT c1_null_count@1 = c1_row_count@2 AND TRY_CAST(c1_max@0 AS Int64) > 1";
2671+
let expected_expr =
2672+
"NOT c1_null_count@1 = c1_row_count@2 AND TRY_CAST(c1_max@0 AS Int64) > 1";
28042673

28052674
// test column on the left
28062675
let expr =

0 commit comments

Comments
 (0)