@@ -1607,138 +1607,11 @@ fn build_statistics_expr(
1607
1607
Ok ( statistics_expr)
1608
1608
}
1609
1609
1610
- <<<<<<< Updated upstream
1611
- /// Wrap the statistics expression in a case expression.
1612
- /// This is necessary to handle the case where the column is known
1613
- /// to be all nulls.
1614
- =======
1615
- /// Convert `column LIKE literal` where P is a constant prefix of the literal
1616
- /// to a range check on the column : `P <= column && column < P ' `, where P ' is the
1617
- /// lowest string after all P* strings.
1618
- fn build_like_match (
1619
- expr_builder : & mut PruningExpressionBuilder ,
1620
- ) -> Option < Arc < dyn PhysicalExpr > > {
1621
- // column LIKE literal => (min, max) LIKE literal split at % => min <= split literal && split literal <= max
1622
- // column LIKE 'foo%' => min <= 'foo' && 'foo' <= max
1623
- // column LIKE '%foo' => min <= '' && '' <= max => true
1624
- // column LIKE '%foo%' => min <= '' && '' <= max => true
1625
- // column LIKE 'foo' => min <= 'foo' && 'foo' <= max
1626
-
1627
- fn unpack_string ( s : & ScalarValue ) -> Option < & String > {
1628
- match s {
1629
- ScalarValue :: Utf8 ( Some ( s) ) => Some ( s) ,
1630
- ScalarValue :: LargeUtf8 ( Some ( s) ) => Some ( s) ,
1631
- ScalarValue :: Utf8View ( Some ( s) ) => Some ( s) ,
1632
- ScalarValue :: Dictionary ( _, value) => unpack_string ( value) ,
1633
- _ => None ,
1634
- }
1635
- }
1636
-
1637
- fn extract_string_literal ( expr : & Arc < dyn PhysicalExpr > ) -> Option < & String > {
1638
- if let Some ( lit) = expr. as_any ( ) . downcast_ref :: < phys_expr:: Literal > ( ) {
1639
- let s = unpack_string ( lit. value ( ) ) ?;
1640
- return Some ( s) ;
1641
- }
1642
- None
1643
- }
1644
-
1645
- // TODO Handle ILIKE perhaps by making the min lowercase and max uppercase
1646
- // this may involve building the physical expressions that call lower() and upper()
1647
- let min_column_expr = expr_builder. min_column_expr ( ) . ok ( ) ?;
1648
- let max_column_expr = expr_builder. max_column_expr ( ) . ok ( ) ?;
1649
- let scalar_expr = expr_builder. scalar_expr ( ) ;
1650
- // check that the scalar is a string literal
1651
- let s = extract_string_literal ( scalar_expr) ?;
1652
- // ANSI SQL specifies two wildcards: % and _. % matches zero or more characters, _ matches exactly one character.
1653
- let first_wildcard_index = s. find ( [ '%' , '_' ] ) ;
1654
- if first_wildcard_index == Some ( 0 ) {
1655
- // there's no filtering we could possibly do, return an error and have this be handled by the unhandled hook
1656
- return None ;
1657
- }
1658
- let ( lower_bound, upper_bound) = if let Some ( wildcard_index) = first_wildcard_index {
1659
- let prefix = & s[ ..wildcard_index] ;
1660
- let lower_bound_lit = Arc :: new ( phys_expr:: Literal :: new ( ScalarValue :: Utf8 ( Some (
1661
- prefix. to_string ( ) ,
1662
- ) ) ) ) ;
1663
- let upper_bound_lit = Arc :: new ( phys_expr:: Literal :: new ( ScalarValue :: Utf8 ( Some (
1664
- increment_utf8 ( prefix) ?,
1665
- ) ) ) ) ;
1666
- ( lower_bound_lit, upper_bound_lit)
1667
- } else {
1668
- // the like expression is a literal and can be converted into a comparison
1669
- let bound = Arc :: new ( phys_expr:: Literal :: new ( ScalarValue :: Utf8 ( Some ( s. clone ( ) ) ) ) ) ;
1670
- ( bound. clone ( ) , bound)
1671
- } ;
1672
- let lower_bound_expr = Arc :: new ( phys_expr:: BinaryExpr :: new (
1673
- lower_bound,
1674
- Operator :: LtEq ,
1675
- max_column_expr. clone ( ) ,
1676
- ) ) ;
1677
- let upper_bound_expr = Arc :: new ( phys_expr:: BinaryExpr :: new (
1678
- min_column_expr. clone ( ) ,
1679
- Operator :: LtEq ,
1680
- upper_bound,
1681
- ) ) ;
1682
- let combined = Arc :: new ( phys_expr:: BinaryExpr :: new (
1683
- upper_bound_expr,
1684
- Operator :: And ,
1685
- lower_bound_expr,
1686
- ) ) ;
1687
- Some ( combined)
1688
- }
1689
-
1690
- /// Increment a UTF8 string by one, returning `None` if it can't be incremented.
1691
- /// This makes it so that the returned string will always compare greater than the input string
1692
- /// or any other string with the same prefix.
1693
- /// This is necessary since the statistics may have been truncated: if we have a min statistic
1694
- /// of "fo" that may have originally been "foz" or anything else with the prefix "fo".
1695
- /// E.g. `increment_utf8("foo") >= "foo"` and `increment_utf8("foo") >= "fooz"`
1696
- /// In this example `increment_utf8("foo") == "fop"
1697
- fn increment_utf8( data: & str ) -> Option < String > {
1698
- // Helper function to check if a character is valid to use
1699
- fn is_valid_unicode( c: char ) -> bool {
1700
- let cp = c as u32 ;
1701
-
1702
- // Filter out non-characters (https://www.unicode.org/versions/corrigendum9.html)
1703
- if [ 0xFFFE , 0xFFFF ] . contains ( & cp ) || ( 0xFDD0 ..=0xFDEF ) . contains ( & cp ) {
1704
- return false ;
1705
- }
1706
-
1707
- // Filter out private use area
1708
- if cp >= 0x110000 {
1709
- return false ;
1710
- }
1711
-
1712
- true
1713
- }
1714
-
1715
- // Convert string to vector of code points
1716
- let mut code_points: Vec < char > = data. chars( ) . collect( ) ;
1717
-
1718
- // Work backwards through code points
1719
- for idx in ( 0 ..code_points. len( ) ) . rev( ) {
1720
- let original = code_points[ idx] as u32 ;
1721
-
1722
- // Try incrementing the code point
1723
- if let Some ( next_char) = char:: from_u32 ( original + 1 ) {
1724
- if is_valid_unicode ( next_char ) {
1725
- code_points[ idx] = next_char;
1726
- // truncate the string to the current index
1727
- code_points. truncate( idx + 1 ) ;
1728
- return Some ( code_points. into_iter( ) . collect( ) ) ;
1729
- }
1730
- }
1731
- }
1732
-
1733
- None
1734
- }
1735
-
1736
1610
/// Wrap the statistics expression in a check that skips the expression if the column is all nulls.
1737
1611
/// This is important not only as an optimization but also because statistics may not be
1738
1612
/// accurate for columns that are all nulls.
1739
1613
/// For example, for an `int` column `x` with all nulls, the min/max/null_count statistics
1740
1614
/// might be set to 0 and evaluating `x = 0` would incorrectly include the column.
1741
- >>>>>>> Stashed changes
1742
1615
///
1743
1616
/// For example:
1744
1617
///
@@ -1767,15 +1640,11 @@ fn wrap_null_count_check_expr(
1767
1640
let not_when_null_count_eq_row_count = phys_expr:: not ( when_null_count_eq_row_count) ?;
1768
1641
1769
1642
// NOT (x_null_count = x_row_count) AND (<statistics_expr>)
1770
- Ok (
1771
- Arc :: new (
1772
- phys_expr:: BinaryExpr :: new (
1773
- not_when_null_count_eq_row_count ,
1774
- Operator :: And ,
1775
- statistics_expr ,
1776
- )
1777
- )
1778
- )
1643
+ Ok ( Arc :: new ( phys_expr:: BinaryExpr :: new (
1644
+ not_when_null_count_eq_row_count,
1645
+ Operator :: And ,
1646
+ statistics_expr,
1647
+ ) ) )
1779
1648
}
1780
1649
1781
1650
#[ derive( Debug , Copy , Clone , PartialEq , Eq ) ]
@@ -2364,7 +2233,8 @@ mod tests {
2364
2233
#[ test]
2365
2234
fn row_group_predicate_eq ( ) -> Result < ( ) > {
2366
2235
let schema = Schema :: new ( vec ! [ Field :: new( "c1" , DataType :: Int32 , false ) ] ) ;
2367
- let expected_expr = "NOT c1_null_count@2 = c1_row_count@3 AND c1_min@0 <= 1 AND 1 <= c1_max@1 ";
2236
+ let expected_expr =
2237
+ "NOT c1_null_count@2 = c1_row_count@3 AND c1_min@0 <= 1 AND 1 <= c1_max@1" ;
2368
2238
2369
2239
// test column on the left
2370
2240
let expr = col ( "c1" ) . eq ( lit ( 1 ) ) ;
@@ -2384,7 +2254,8 @@ mod tests {
2384
2254
#[ test]
2385
2255
fn row_group_predicate_not_eq ( ) -> Result < ( ) > {
2386
2256
let schema = Schema :: new ( vec ! [ Field :: new( "c1" , DataType :: Int32 , false ) ] ) ;
2387
- let expected_expr = "NOT c1_null_count@2 = c1_row_count@3 AND (c1_min@0 != 1 OR 1 != c1_max@1)" ;
2257
+ let expected_expr =
2258
+ "NOT c1_null_count@2 = c1_row_count@3 AND (c1_min@0 != 1 OR 1 != c1_max@1)" ;
2388
2259
2389
2260
// test column on the left
2390
2261
let expr = col ( "c1" ) . not_eq ( lit ( 1 ) ) ;
@@ -2404,8 +2275,7 @@ mod tests {
2404
2275
#[ test]
2405
2276
fn row_group_predicate_gt ( ) -> Result < ( ) > {
2406
2277
let schema = Schema :: new ( vec ! [ Field :: new( "c1" , DataType :: Int32 , false ) ] ) ;
2407
- let expected_expr =
2408
- "NOT c1_null_count@1 = c1_row_count@2 AND c1_max@0 > 1" ;
2278
+ let expected_expr = "NOT c1_null_count@1 = c1_row_count@2 AND c1_max@0 > 1" ;
2409
2279
2410
2280
// test column on the left
2411
2281
let expr = col ( "c1" ) . gt ( lit ( 1 ) ) ;
@@ -2444,8 +2314,7 @@ mod tests {
2444
2314
#[ test]
2445
2315
fn row_group_predicate_lt ( ) -> Result < ( ) > {
2446
2316
let schema = Schema :: new ( vec ! [ Field :: new( "c1" , DataType :: Int32 , false ) ] ) ;
2447
- let expected_expr =
2448
- "NOT c1_null_count@1 = c1_row_count@2 AND c1_min@0 < 1" ;
2317
+ let expected_expr = "NOT c1_null_count@1 = c1_row_count@2 AND c1_min@0 < 1" ;
2449
2318
2450
2319
// test column on the left
2451
2320
let expr = col ( "c1" ) . lt ( lit ( 1 ) ) ;
@@ -2490,8 +2359,7 @@ mod tests {
2490
2359
] ) ;
2491
2360
// test AND operator joining supported c1 < 1 expression and unsupported c2 < c3 expression
2492
2361
let expr = col ( "c1" ) . lt ( lit ( 1 ) ) . and ( col ( "c2" ) . lt ( col ( "c3" ) ) ) ;
2493
- let expected_expr =
2494
- "NOT c1_null_count@1 = c1_row_count@2 AND c1_min@0 < 1" ;
2362
+ let expected_expr = "NOT c1_null_count@1 = c1_row_count@2 AND c1_min@0 < 1" ;
2495
2363
let predicate_expr =
2496
2364
test_build_predicate_expression ( & expr, & schema, & mut RequiredColumns :: new ( ) ) ;
2497
2365
assert_eq ! ( predicate_expr. to_string( ) , expected_expr) ;
@@ -2800,7 +2668,8 @@ mod tests {
2800
2668
test_build_predicate_expression ( & expr, & schema, & mut RequiredColumns :: new ( ) ) ;
2801
2669
assert_eq ! ( predicate_expr. to_string( ) , expected_expr) ;
2802
2670
2803
- let expected_expr = "NOT c1_null_count@1 = c1_row_count@2 AND TRY_CAST(c1_max@0 AS Int64) > 1" ;
2671
+ let expected_expr =
2672
+ "NOT c1_null_count@1 = c1_row_count@2 AND TRY_CAST(c1_max@0 AS Int64) > 1" ;
2804
2673
2805
2674
// test column on the left
2806
2675
let expr =
0 commit comments