From 14bd4045e3c7e6b77e9921324e7d1bfd91ea98f2 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Fri, 6 Jun 2025 17:36:45 +0800 Subject: [PATCH 01/14] feat: mapping sql Char/Text/String default to Utf8View --- datafusion/sql/src/planner.rs | 2 +- .../sqllogictest/test_files/aggregate.slt | 8 +- datafusion/sqllogictest/test_files/array.slt | 26 +-- .../sqllogictest/test_files/arrow_files.slt | 14 +- .../sqllogictest/test_files/explain_tree.slt | 194 +++++++----------- .../sqllogictest/test_files/functions.slt | 8 +- .../test_files/monotonic_projection_test.slt | 4 +- .../test_files/parquet_filter_pushdown.slt | 10 +- .../test_files/push_down_filter.slt | 2 +- datafusion/sqllogictest/test_files/scalar.slt | 6 +- .../sqllogictest/test_files/simplify_expr.slt | 8 +- .../test_files/sort_merge_join.slt | 81 ++------ datafusion/sqllogictest/test_files/union.slt | 4 +- 13 files changed, 130 insertions(+), 237 deletions(-) diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 5a1f3cdf69c3..29e5c5f77344 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -601,7 +601,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ) } SQLDataType::Char(_) | SQLDataType::Text | SQLDataType::String(_) => { - Ok(DataType::Utf8) + Ok(DataType::Utf8View) } SQLDataType::Timestamp(precision, tz_info) if precision.is_none() || [0, 3, 6, 9].contains(&precision.unwrap()) => diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index ed77435d6a85..cd594de30a15 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -2281,10 +2281,10 @@ create table t (c string) as values query T select arrow_typeof(c) from t; ---- -Utf8 -Utf8 -Utf8 -Utf8 +Utf8View +Utf8View +Utf8View +Utf8View query IT select count(c), arrow_typeof(count(c)) from t; diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index d89ba600d7a6..8cbf7d6c5841 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -6023,7 +6023,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6032,7 +6032,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }]) +07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }]) 08)--------------RepartitionExec: 
partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6052,7 +6052,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6061,7 +6061,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }]) +07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6081,7 +6081,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6090,7 +6090,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }]) +07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6112,7 +6112,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: array_has(LargeList([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32))) 
+06)----------Filter: array_has(LargeList([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32))) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6121,7 +6121,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c], substr(md5(CAST(value@0 AS Utf8)), 1, 32)) +07)------------FilterExec: array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c], substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32)) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6141,7 +6141,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6150,7 +6150,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }]) +07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278") }, Literal { value: Utf8View("a") }, Literal { value: Utf8View("b") }, Literal { value: Utf8View("c") }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6172,7 +6172,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IS NOT NULL OR Boolean(NULL) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IS NOT NULL OR Boolean(NULL) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6181,7 +6181,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IS NOT NULL OR NULL +07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IS NOT NULL OR NULL 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: 
partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -7863,7 +7863,7 @@ List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int3 query ??T select [1,2,3]::int[], [['1']]::int[][], arrow_typeof([]::text[]); ---- -[1, 2, 3] [[1]] List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +[1, 2, 3] [[1]] List(Field { name: "item", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) # test empty arrays return length # issue: https://github.com/apache/datafusion/pull/12459 diff --git a/datafusion/sqllogictest/test_files/arrow_files.slt b/datafusion/sqllogictest/test_files/arrow_files.slt index 30f322cf98fc..e039029060f6 100644 --- a/datafusion/sqllogictest/test_files/arrow_files.slt +++ b/datafusion/sqllogictest/test_files/arrow_files.slt @@ -61,22 +61,12 @@ LOCATION '../core/tests/data/partitioned_table_arrow/' PARTITIONED BY (part); # select wildcard -query ITBI +query error DataFusion error: Arrow error: External error: Arrow error: Invalid argument error: column types must match schema types, expected Utf8View but found Utf8 at column index 1 SELECT * FROM arrow_partitioned ORDER BY f0; ----- -1 foo true 123 -2 bar false 123 -3 baz true 456 -4 NULL NULL 456 # select all fields -query IITB +query error DataFusion error: Arrow error: External error: Arrow error: Invalid argument error: column types must match schema types, expected Utf8View but found Utf8 at column index 1 SELECT part, f0, f1, f2 FROM arrow_partitioned ORDER BY f0; ----- -123 1 foo true -123 2 bar false -456 3 baz true -456 4 NULL NULL # select without partition column query IB diff --git a/datafusion/sqllogictest/test_files/explain_tree.slt b/datafusion/sqllogictest/test_files/explain_tree.slt index 15bf61576571..8096c8cacf4c 100644 --- a/datafusion/sqllogictest/test_files/explain_tree.slt +++ b/datafusion/sqllogictest/test_files/explain_tree.slt @@ -280,7 +280,7 @@ physical_plan 06)┌─────────────┴─────────────┐ 07)│ DataSourceExec │ 08)│ -------------------- │ -09)│ bytes: 3120 │ +09)│ bytes: 1072 │ 10)│ format: memory │ 11)│ rows: 2 │ 12)└───────────────────────────┘ @@ -367,7 +367,7 @@ physical_plan 21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 22)│ DataSourceExec ││ CoalesceBatchesExec │ 23)│ -------------------- ││ -------------------- │ -24)│ bytes: 1560 ││ target_batch_size: │ +24)│ bytes: 536 ││ target_batch_size: │ 25)│ format: memory ││ 8192 │ 26)│ rows: 1 ││ │ 27)└───────────────────────────┘└─────────────┬─────────────┘ @@ -669,7 +669,7 @@ physical_plan 13)┌─────────────┴─────────────┐ 14)│ DataSourceExec │ 15)│ -------------------- │ -16)│ bytes: 1560 │ +16)│ bytes: 536 │ 17)│ format: memory │ 18)│ rows: 1 │ 19)└───────────────────────────┘ @@ -1065,7 +1065,7 @@ physical_plan 13)┌─────────────┴─────────────┐ 14)│ DataSourceExec │ 15)│ -------------------- │ -16)│ bytes: 1560 │ +16)│ bytes: 536 │ 17)│ format: memory │ 18)│ rows: 1 │ 19)└───────────────────────────┘ @@ -1195,60 +1195,42 @@ physical_plan 08)│ HashJoinExec │ 09)│ -------------------- │ 10)│ on: │ -11)│ (int_col = int_col), (CAST├──────────────┐ -12)│ (table1.string_col AS │ │ -13)│ Utf8View) = │ │ -14)│ string_col) │ │ -15)└─────────────┬─────────────┘ │ -16)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -17)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ -18)│ -------------------- ││ -------------------- │ -19)│ target_batch_size: ││ target_batch_size: │ -20)│ 8192 ││ 8192 
│ -21)└─────────────┬─────────────┘└─────────────┬─────────────┘ -22)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -23)│ RepartitionExec ││ RepartitionExec │ -24)│ -------------------- ││ -------------------- │ -25)│ partition_count(in->out): ││ partition_count(in->out): │ -26)│ 4 -> 4 ││ 4 -> 4 │ -27)│ ││ │ -28)│ partitioning_scheme: ││ partitioning_scheme: │ -29)│ Hash([int_col@0, CAST ││ Hash([int_col@0, │ -30)│ (table1.string_col ││ string_col@1], │ -31)│ AS Utf8View)@4], 4) ││ 4) │ -32)└─────────────┬─────────────┘└─────────────┬─────────────┘ -33)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -34)│ ProjectionExec ││ RepartitionExec │ -35)│ -------------------- ││ -------------------- │ -36)│ CAST(table1.string_col AS ││ partition_count(in->out): │ -37)│ Utf8View): ││ 1 -> 4 │ -38)│ CAST(string_col AS ││ │ -39)│ Utf8View) ││ partitioning_scheme: │ -40)│ ││ RoundRobinBatch(4) │ -41)│ bigint_col: ││ │ -42)│ bigint_col ││ │ -43)│ ││ │ -44)│ date_col: date_col ││ │ -45)│ int_col: int_col ││ │ -46)│ ││ │ -47)│ string_col: ││ │ -48)│ string_col ││ │ -49)└─────────────┬─────────────┘└─────────────┬─────────────┘ -50)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -51)│ RepartitionExec ││ DataSourceExec │ -52)│ -------------------- ││ -------------------- │ -53)│ partition_count(in->out): ││ files: 1 │ -54)│ 1 -> 4 ││ format: parquet │ -55)│ ││ │ -56)│ partitioning_scheme: ││ │ -57)│ RoundRobinBatch(4) ││ │ -58)└─────────────┬─────────────┘└───────────────────────────┘ -59)┌─────────────┴─────────────┐ -60)│ DataSourceExec │ -61)│ -------------------- │ -62)│ files: 1 │ -63)│ format: csv │ -64)└───────────────────────────┘ +11)│ (int_col = int_col), ├──────────────┐ +12)│ (string_col = │ │ +13)│ string_col) │ │ +14)└─────────────┬─────────────┘ │ +15)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +16)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ +17)│ -------------------- ││ -------------------- │ +18)│ target_batch_size: ││ target_batch_size: │ +19)│ 8192 ││ 8192 │ +20)└─────────────┬─────────────┘└─────────────┬─────────────┘ +21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +22)│ RepartitionExec ││ RepartitionExec │ +23)│ -------------------- ││ -------------------- │ +24)│ partition_count(in->out): ││ partition_count(in->out): │ +25)│ 4 -> 4 ││ 4 -> 4 │ +26)│ ││ │ +27)│ partitioning_scheme: ││ partitioning_scheme: │ +28)│ Hash([int_col@0, ││ Hash([int_col@0, │ +29)│ string_col@1], ││ string_col@1], │ +30)│ 4) ││ 4) │ +31)└─────────────┬─────────────┘└─────────────┬─────────────┘ +32)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +33)│ RepartitionExec ││ RepartitionExec │ +34)│ -------------------- ││ -------------------- │ +35)│ partition_count(in->out): ││ partition_count(in->out): │ +36)│ 1 -> 4 ││ 1 -> 4 │ +37)│ ││ │ +38)│ partitioning_scheme: ││ partitioning_scheme: │ +39)│ RoundRobinBatch(4) ││ RoundRobinBatch(4) │ +40)└─────────────┬─────────────┘└─────────────┬─────────────┘ +41)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +42)│ DataSourceExec ││ DataSourceExec │ +43)│ -------------------- ││ -------------------- │ +44)│ files: 1 ││ files: 1 │ +45)│ format: csv ││ format: parquet │ +46)└───────────────────────────┘└───────────────────────────┘ # Query with outer hash join. 
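With CHAR/TEXT/STRING now planned as Utf8View, both hash join keys in the plan above already share one type, so the ProjectionExec that previously cast table1.string_col to Utf8View drops out of the expected output. A minimal sketch of the new default mapping, assuming only the datafusion and tokio crates as dependencies rather than anything from this patch:

    use datafusion::prelude::*;

    #[tokio::main]
    async fn main() -> datafusion::error::Result<()> {
        let ctx = SessionContext::new();
        // TEXT is one of the SQL types remapped by this patch.
        ctx.sql("CREATE TABLE t (s TEXT) AS VALUES ('a'), ('b')").await?;
        // Expected to report Utf8View with the new mapping (Utf8 previously).
        ctx.sql("SELECT arrow_typeof(s) FROM t LIMIT 1").await?.show().await?;
        Ok(())
    }

Under the old default the same query reported Utf8, which is why the aggregate.slt and explain_tree.slt expectations change throughout this commit.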
query TT @@ -1267,60 +1249,42 @@ physical_plan 10)│ join_type: Left │ 11)│ │ 12)│ on: ├──────────────┐ -13)│ (int_col = int_col), (CAST│ │ -14)│ (table1.string_col AS │ │ -15)│ Utf8View) = │ │ -16)│ string_col) │ │ -17)└─────────────┬─────────────┘ │ -18)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -19)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ -20)│ -------------------- ││ -------------------- │ -21)│ target_batch_size: ││ target_batch_size: │ -22)│ 8192 ││ 8192 │ -23)└─────────────┬─────────────┘└─────────────┬─────────────┘ -24)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -25)│ RepartitionExec ││ RepartitionExec │ -26)│ -------------------- ││ -------------------- │ -27)│ partition_count(in->out): ││ partition_count(in->out): │ -28)│ 4 -> 4 ││ 4 -> 4 │ -29)│ ││ │ -30)│ partitioning_scheme: ││ partitioning_scheme: │ -31)│ Hash([int_col@0, CAST ││ Hash([int_col@0, │ -32)│ (table1.string_col ││ string_col@1], │ -33)│ AS Utf8View)@4], 4) ││ 4) │ -34)└─────────────┬─────────────┘└─────────────┬─────────────┘ -35)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -36)│ ProjectionExec ││ RepartitionExec │ -37)│ -------------------- ││ -------------------- │ -38)│ CAST(table1.string_col AS ││ partition_count(in->out): │ -39)│ Utf8View): ││ 1 -> 4 │ -40)│ CAST(string_col AS ││ │ -41)│ Utf8View) ││ partitioning_scheme: │ -42)│ ││ RoundRobinBatch(4) │ -43)│ bigint_col: ││ │ -44)│ bigint_col ││ │ -45)│ ││ │ -46)│ date_col: date_col ││ │ -47)│ int_col: int_col ││ │ -48)│ ││ │ -49)│ string_col: ││ │ -50)│ string_col ││ │ -51)└─────────────┬─────────────┘└─────────────┬─────────────┘ -52)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -53)│ RepartitionExec ││ DataSourceExec │ -54)│ -------------------- ││ -------------------- │ -55)│ partition_count(in->out): ││ files: 1 │ -56)│ 1 -> 4 ││ format: parquet │ -57)│ ││ │ -58)│ partitioning_scheme: ││ │ -59)│ RoundRobinBatch(4) ││ │ -60)└─────────────┬─────────────┘└───────────────────────────┘ -61)┌─────────────┴─────────────┐ -62)│ DataSourceExec │ -63)│ -------------------- │ -64)│ files: 1 │ -65)│ format: csv │ -66)└───────────────────────────┘ +13)│ (int_col = int_col), │ │ +14)│ (string_col = │ │ +15)│ string_col) │ │ +16)└─────────────┬─────────────┘ │ +17)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +18)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ +19)│ -------------------- ││ -------------------- │ +20)│ target_batch_size: ││ target_batch_size: │ +21)│ 8192 ││ 8192 │ +22)└─────────────┬─────────────┘└─────────────┬─────────────┘ +23)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +24)│ RepartitionExec ││ RepartitionExec │ +25)│ -------------------- ││ -------------------- │ +26)│ partition_count(in->out): ││ partition_count(in->out): │ +27)│ 4 -> 4 ││ 4 -> 4 │ +28)│ ││ │ +29)│ partitioning_scheme: ││ partitioning_scheme: │ +30)│ Hash([int_col@0, ││ Hash([int_col@0, │ +31)│ string_col@1], ││ string_col@1], │ +32)│ 4) ││ 4) │ +33)└─────────────┬─────────────┘└─────────────┬─────────────┘ +34)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +35)│ RepartitionExec ││ RepartitionExec │ +36)│ -------------------- ││ -------------------- │ +37)│ partition_count(in->out): ││ partition_count(in->out): │ +38)│ 1 -> 4 ││ 1 -> 4 │ +39)│ ││ │ +40)│ partitioning_scheme: ││ partitioning_scheme: │ +41)│ RoundRobinBatch(4) ││ RoundRobinBatch(4) │ +42)└─────────────┬─────────────┘└─────────────┬─────────────┘ +43)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +44)│ 
DataSourceExec ││ DataSourceExec │ +45)│ -------------------- ││ -------------------- │ +46)│ files: 1 ││ files: 1 │ +47)│ format: csv ││ format: parquet │ +48)└───────────────────────────┘└───────────────────────────┘ # Query with nested loop join. query TT @@ -1529,7 +1493,7 @@ physical_plan 57)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 58)│ DataSourceExec ││ DataSourceExec │ 59)│ -------------------- ││ -------------------- │ -60)│ bytes: 1320 ││ bytes: 1312 │ +60)│ bytes: 296 ││ bytes: 288 │ 61)│ format: memory ││ format: memory │ 62)│ rows: 1 ││ rows: 1 │ 63)└───────────────────────────┘└───────────────────────────┘ @@ -1548,14 +1512,14 @@ physical_plan 04)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 05)│ DataSourceExec ││ ProjectionExec │ 06)│ -------------------- ││ -------------------- │ -07)│ bytes: 1320 ││ id: CAST(id AS Int32) │ +07)│ bytes: 296 ││ id: CAST(id AS Int32) │ 08)│ format: memory ││ name: name │ 09)│ rows: 1 ││ │ 10)└───────────────────────────┘└─────────────┬─────────────┘ 11)-----------------------------┌─────────────┴─────────────┐ 12)-----------------------------│ DataSourceExec │ 13)-----------------------------│ -------------------- │ -14)-----------------------------│ bytes: 1312 │ +14)-----------------------------│ bytes: 288 │ 15)-----------------------------│ format: memory │ 16)-----------------------------│ rows: 1 │ 17)-----------------------------└───────────────────────────┘ diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 20f79622a62c..9eb70be3a599 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -837,10 +837,8 @@ SELECT greatest([2, 3], [1, 4], [5, 0]); ---- [5, 0] -query I +query error DataFusion error: Error during planning: Execution error: Function 'greatest' user\-defined coercion failed with "Error during planning: Cannot find a common type for arguments" SELECT greatest(1::int, 2::text) ----- -2 query R SELECT greatest(-1, 1, 2.3, 123456789, 3 + 5, -(-4)) @@ -1035,10 +1033,8 @@ SELECT least([2, 3], [1, 4], [5, 0]); ---- [1, 4] -query I +query error DataFusion error: Error during planning: Execution error: Function 'least' user\-defined coercion failed with "Error during planning: Cannot find a common type for arguments" SELECT least(1::int, 2::text) ----- -1 query R SELECT least(-1, 1, 2.3, 123456789, 3 + 5, -(-4)) diff --git a/datafusion/sqllogictest/test_files/monotonic_projection_test.slt b/datafusion/sqllogictest/test_files/monotonic_projection_test.slt index e8700b1fea27..9c806cfa0d8a 100644 --- a/datafusion/sqllogictest/test_files/monotonic_projection_test.slt +++ b/datafusion/sqllogictest/test_files/monotonic_projection_test.slt @@ -129,12 +129,12 @@ ORDER BY a_str ASC, b ASC; ---- logical_plan 01)Sort: a_str ASC NULLS LAST, multiple_ordered_table.b ASC NULLS LAST -02)--Projection: CAST(multiple_ordered_table.a AS Utf8) AS a_str, multiple_ordered_table.b +02)--Projection: CAST(multiple_ordered_table.a AS Utf8View) AS a_str, multiple_ordered_table.b 03)----TableScan: multiple_ordered_table projection=[a, b] physical_plan 01)SortPreservingMergeExec: [a_str@0 ASC NULLS LAST, b@1 ASC NULLS LAST] 02)--SortExec: expr=[a_str@0 ASC NULLS LAST, b@1 ASC NULLS LAST], preserve_partitioning=[true] -03)----ProjectionExec: expr=[CAST(a@0 AS Utf8) as a_str, b@1 as b] +03)----ProjectionExec: expr=[CAST(a@0 AS Utf8View) as a_str, b@1 as b] 04)------RepartitionExec: 
partitioning=RoundRobinBatch(4), input_partitions=1 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt index 1b6ae13fbe77..f2754462c858 100644 --- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt +++ b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt @@ -219,7 +219,7 @@ physical_plan query TT EXPLAIN select * from t_pushdown where part != 'a'; ---- -logical_plan TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part != Utf8("a")] +logical_plan TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part != Utf8View("a")] physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet # And if we reference only a file column it gets pushed down @@ -227,8 +227,8 @@ query TT EXPLAIN select * from t_pushdown where val != 'c'; ---- logical_plan -01)Filter: t_pushdown.val != Utf8("c") -02)--TableScan: t_pushdown projection=[val, part], partial_filters=[t_pushdown.val != Utf8("c")] +01)Filter: t_pushdown.val != Utf8View("c") +02)--TableScan: t_pushdown projection=[val, part], partial_filters=[t_pushdown.val != Utf8View("c")] physical_plan DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != c, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c)] # If we have a mix of filters: @@ -239,8 +239,8 @@ query TT EXPLAIN select * from t_pushdown where val != 'd' AND val != 'c' AND part = 'a' AND part != val; ---- logical_plan -01)Filter: t_pushdown.val != Utf8("d") AND t_pushdown.val != Utf8("c") AND t_pushdown.val != t_pushdown.part -02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8("a")], partial_filters=[t_pushdown.val != Utf8("d"), t_pushdown.val != Utf8("c"), t_pushdown.val != t_pushdown.part] +01)Filter: t_pushdown.val != Utf8View("d") AND t_pushdown.val != Utf8View("c") AND t_pushdown.val != t_pushdown.part +02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val != Utf8View("d"), t_pushdown.val != Utf8View("c"), t_pushdown.val != t_pushdown.part] physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: val@0 != part@1 diff --git a/datafusion/sqllogictest/test_files/push_down_filter.slt b/datafusion/sqllogictest/test_files/push_down_filter.slt index ed948dd11439..a0d319332462 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter.slt @@ -265,7 +265,7 @@ physical_plan 
DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/ query TT explain select a from t where CAST(a AS string) = '0123'; ---- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 0123 +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8View) = 0123 statement ok diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index f583d659fd4f..ca0b472de9e0 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1832,7 +1832,7 @@ query TT EXPLAIN SELECT letter, letter = LEFT('APACHE', 1) FROM simple_string; ---- logical_plan -01)Projection: simple_string.letter, simple_string.letter = Utf8("A") AS simple_string.letter = left(Utf8("APACHE"),Int64(1)) +01)Projection: simple_string.letter, simple_string.letter = Utf8View("A") AS simple_string.letter = left(Utf8("APACHE"),Int64(1)) 02)--TableScan: simple_string projection=[letter] physical_plan 01)ProjectionExec: expr=[letter@0 as letter, letter@0 = A as simple_string.letter = left(Utf8("APACHE"),Int64(1))] @@ -1851,10 +1851,10 @@ query TT EXPLAIN SELECT letter, letter = LEFT(letter2, 1) FROM simple_string; ---- logical_plan -01)Projection: simple_string.letter, simple_string.letter = left(simple_string.letter2, Int64(1)) +01)Projection: simple_string.letter, simple_string.letter = CAST(left(simple_string.letter2, Int64(1)) AS Utf8View) 02)--TableScan: simple_string projection=[letter, letter2] physical_plan -01)ProjectionExec: expr=[letter@0 as letter, letter@0 = left(letter2@1, 1) as simple_string.letter = left(simple_string.letter2,Int64(1))] +01)ProjectionExec: expr=[letter@0 as letter, letter@0 = CAST(left(letter2@1, 1) AS Utf8View) as simple_string.letter = left(simple_string.letter2,Int64(1))] 02)--DataSourceExec: partitions=1, partition_sizes=[1] query TB diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index 075ccafcfd2e..c77163dc996d 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -35,22 +35,22 @@ query TT explain select b from t where b ~ '.*' ---- logical_plan -01)Filter: t.b IS NOT NULL +01)Filter: t.b ~ Utf8View(".*") 02)--TableScan: t projection=[b] physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 -02)--FilterExec: b@0 IS NOT NULL +02)--FilterExec: b@0 ~ .* 03)----DataSourceExec: partitions=1, partition_sizes=[1] query TT explain select b from t where b !~ '.*' ---- logical_plan -01)Filter: t.b = Utf8("") +01)Filter: t.b !~ Utf8View(".*") 02)--TableScan: t projection=[b] physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 -02)--FilterExec: b@0 = +02)--FilterExec: b@0 !~ .* 03)----DataSourceExec: partitions=1, partition_sizes=[1] query T diff --git a/datafusion/sqllogictest/test_files/sort_merge_join.slt b/datafusion/sqllogictest/test_files/sort_merge_join.slt index c17fe8dfc7e6..6ad4ca12442d 100644 --- a/datafusion/sqllogictest/test_files/sort_merge_join.slt +++ b/datafusion/sqllogictest/test_files/sort_merge_join.slt @@ -44,103 +44,46 @@ physical_plan 05)----DataSourceExec: partitions=1, partition_sizes=[1] # inner join with join filter -query 
TITI rowsort +query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View SELECT t1.a, t1.b, t2.a, t2.b FROM t1 JOIN t2 ON t1.a = t2.a AND t2.b * 50 <= t1.b ----- -Alice 100 Alice 1 -Alice 100 Alice 2 -Alice 50 Alice 1 -query TITI rowsort +query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View SELECT t1.a, t1.b, t2.a, t2.b FROM t1 JOIN t2 ON t1.a = t2.a AND t2.b < t1.b ----- -Alice 100 Alice 1 -Alice 100 Alice 2 -Alice 50 Alice 1 -Alice 50 Alice 2 -query TITI rowsort +query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View SELECT t1.a, t1.b, t2.a, t2.b FROM t1 JOIN t2 ON t1.a = t2.a AND t2.b > t1.b ----- # left join without join filter -query TITI rowsort +query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View SELECT * FROM t1 LEFT JOIN t2 ON t1.a = t2.a ----- -Alice 100 Alice 1 -Alice 100 Alice 2 -Alice 50 Alice 1 -Alice 50 Alice 2 -Bob 1 NULL NULL # left join with join filter -query TITI rowsort +query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View SELECT * FROM t1 LEFT JOIN t2 ON t1.a = t2.a AND t2.b * 50 <= t1.b ----- -Alice 100 Alice 1 -Alice 100 Alice 2 -Alice 50 Alice 1 -Bob 1 NULL NULL -query TITI rowsort +query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View SELECT * FROM t1 LEFT JOIN t2 ON t1.a = t2.a AND t2.b < t1.b ----- -Alice 100 Alice 1 -Alice 100 Alice 2 -Alice 50 Alice 1 -Alice 50 Alice 2 -Bob 1 NULL NULL # right join without join filter -query TITI rowsort +query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View SELECT * FROM t1 RIGHT JOIN t2 ON t1.a = t2.a ----- -Alice 100 Alice 1 -Alice 100 Alice 2 -Alice 50 Alice 1 -Alice 50 Alice 2 # right join with join filter -query TITI rowsort +query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View SELECT * FROM t1 RIGHT JOIN t2 ON t1.a = t2.a AND t2.b * 50 <= t1.b ----- -Alice 100 Alice 1 -Alice 100 Alice 2 -Alice 50 Alice 1 -query TITI rowsort +query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View SELECT * FROM t1 RIGHT JOIN t2 ON t1.a = t2.a AND t1.b > t2.b ----- -Alice 100 Alice 1 -Alice 100 Alice 2 -Alice 50 Alice 1 -Alice 50 Alice 2 # full join without join filter -query TITI rowsort +query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View SELECT * FROM t1 FULL JOIN t2 ON t1.a = t2.a ----- -Alice 100 Alice 1 -Alice 100 Alice 2 -Alice 50 Alice 1 -Alice 50 Alice 2 -Bob 1 NULL NULL -query TITI rowsort +query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View SELECT * FROM t1 FULL JOIN t2 ON t1.a = t2.a AND t2.b * 50 > t1.b ----- -Alice 100 NULL NULL -Alice 50 Alice 2 -Bob 1 NULL NULL -NULL NULL Alice 1 -query TITI rowsort +query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View SELECT * FROM t1 FULL JOIN t2 ON t1.a = t2.a AND t1.b > t2.b + 50 ----- -Alice 100 Alice 1 -Alice 100 Alice 2 -Alice 50 NULL NULL -Bob 
1 NULL NULL statement ok DROP TABLE t1; diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index d549f555f9d8..f901a4d373a3 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -230,7 +230,7 @@ logical_plan 02)--Union 03)----TableScan: t1 projection=[name] 04)----TableScan: t2 projection=[name] -05)----Projection: t2.name || Utf8("_new") AS name +05)----Projection: t2.name || Utf8View("_new") AS name 06)------TableScan: t2 projection=[name] physical_plan 01)AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[] @@ -266,7 +266,7 @@ logical_plan 01)Union 02)--TableScan: t1 projection=[name] 03)--TableScan: t2 projection=[name] -04)--Projection: t2.name || Utf8("_new") AS name +04)--Projection: t2.name || Utf8View("_new") AS name 05)----TableScan: t2 projection=[name] physical_plan 01)UnionExec From 4973bf98d6ac9f6c25887a3374dfad5d35a745bb Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Fri, 6 Jun 2025 17:50:15 +0800 Subject: [PATCH 02/14] Add support utf8view for sort merge join --- datafusion/core/tests/sql/create_drop.rs | 2 +- .../src/joins/sort_merge_join.rs | 2 + .../test_files/sort_merge_join.slt | 81 ++++++++++++++++--- 3 files changed, 72 insertions(+), 13 deletions(-) diff --git a/datafusion/core/tests/sql/create_drop.rs b/datafusion/core/tests/sql/create_drop.rs index 83712053b954..b35e614a464e 100644 --- a/datafusion/core/tests/sql/create_drop.rs +++ b/datafusion/core/tests/sql/create_drop.rs @@ -61,7 +61,7 @@ async fn create_external_table_with_ddl() -> Result<()> { assert_eq!(3, table_schema.fields().len()); assert_eq!(&DataType::Int32, table_schema.field(0).data_type()); - assert_eq!(&DataType::Utf8, table_schema.field(1).data_type()); + assert_eq!(&DataType::Utf8View, table_schema.field(1).data_type()); assert_eq!(&DataType::Boolean, table_schema.field(2).data_type()); Ok(()) diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index cadd2b53ab11..8271a447d50b 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -2478,6 +2478,7 @@ fn compare_join_arrays( DataType::Float32 => compare_value!(Float32Array), DataType::Float64 => compare_value!(Float64Array), DataType::Utf8 => compare_value!(StringArray), + DataType::Utf8View => compare_value!(StringViewArray), DataType::LargeUtf8 => compare_value!(LargeStringArray), DataType::Decimal128(..) => compare_value!(Decimal128Array), DataType::Timestamp(time_unit, None) => match time_unit { @@ -2545,6 +2546,7 @@ fn is_join_arrays_equal( DataType::Float32 => compare_value!(Float32Array), DataType::Float64 => compare_value!(Float64Array), DataType::Utf8 => compare_value!(StringArray), + DataType::Utf8View => compare_value!(StringViewArray), DataType::LargeUtf8 => compare_value!(LargeStringArray), DataType::Decimal128(..) 
=> compare_value!(Decimal128Array), DataType::Timestamp(time_unit, None) => match time_unit { diff --git a/datafusion/sqllogictest/test_files/sort_merge_join.slt b/datafusion/sqllogictest/test_files/sort_merge_join.slt index 6ad4ca12442d..41dc1990432b 100644 --- a/datafusion/sqllogictest/test_files/sort_merge_join.slt +++ b/datafusion/sqllogictest/test_files/sort_merge_join.slt @@ -44,46 +44,103 @@ physical_plan 05)----DataSourceExec: partitions=1, partition_sizes=[1] # inner join with join filter -query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View +query TITI SELECT t1.a, t1.b, t2.a, t2.b FROM t1 JOIN t2 ON t1.a = t2.a AND t2.b * 50 <= t1.b +---- +Alice 50 Alice 1 +Alice 100 Alice 2 +Alice 100 Alice 1 -query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View +query TITI SELECT t1.a, t1.b, t2.a, t2.b FROM t1 JOIN t2 ON t1.a = t2.a AND t2.b < t1.b +---- +Alice 50 Alice 2 +Alice 50 Alice 1 +Alice 100 Alice 2 +Alice 100 Alice 1 -query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View +query TITI SELECT t1.a, t1.b, t2.a, t2.b FROM t1 JOIN t2 ON t1.a = t2.a AND t2.b > t1.b +---- # left join without join filter -query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View +query TITI SELECT * FROM t1 LEFT JOIN t2 ON t1.a = t2.a +---- +Alice 50 Alice 2 +Alice 50 Alice 1 +Alice 100 Alice 2 +Alice 100 Alice 1 +Bob 1 NULL NULL # left join with join filter -query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View +query TITI SELECT * FROM t1 LEFT JOIN t2 ON t1.a = t2.a AND t2.b * 50 <= t1.b +---- +Alice 50 Alice 1 +Alice 100 Alice 2 +Alice 100 Alice 1 +Bob 1 NULL NULL -query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View +query TITI SELECT * FROM t1 LEFT JOIN t2 ON t1.a = t2.a AND t2.b < t1.b +---- +Alice 50 Alice 2 +Alice 50 Alice 1 +Alice 100 Alice 2 +Alice 100 Alice 1 +Bob 1 NULL NULL # right join without join filter -query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View +query TITI SELECT * FROM t1 RIGHT JOIN t2 ON t1.a = t2.a +---- +Alice 50 Alice 2 +Alice 100 Alice 2 +Alice 50 Alice 1 +Alice 100 Alice 1 # right join with join filter -query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View +query TITI SELECT * FROM t1 RIGHT JOIN t2 ON t1.a = t2.a AND t2.b * 50 <= t1.b +---- +Alice 100 Alice 2 +Alice 50 Alice 1 +Alice 100 Alice 1 -query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View +query TITI SELECT * FROM t1 RIGHT JOIN t2 ON t1.a = t2.a AND t1.b > t2.b +---- +Alice 50 Alice 2 +Alice 100 Alice 2 +Alice 50 Alice 1 +Alice 100 Alice 1 # full join without join filter -query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View +query TITI SELECT * FROM t1 FULL JOIN t2 ON t1.a = t2.a +---- +Alice 50 Alice 2 +Alice 50 Alice 1 +Alice 100 Alice 2 +Alice 100 Alice 1 +Bob 1 NULL NULL -query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View +query TITI SELECT 
* FROM t1 FULL JOIN t2 ON t1.a = t2.a AND t2.b * 50 > t1.b +---- +Alice 50 Alice 2 +Alice 100 NULL NULL +NULL NULL Alice 1 +Bob 1 NULL NULL -query error DataFusion error: This feature is not implemented: Unsupported data type in sort merge join comparator: Utf8View +query TITI SELECT * FROM t1 FULL JOIN t2 ON t1.a = t2.a AND t1.b > t2.b + 50 +---- +Alice 50 NULL NULL +Alice 100 Alice 2 +Alice 100 Alice 1 +Bob 1 NULL NULL statement ok DROP TABLE t1; From 85bf391455d7c51e4bf5a16b28ee668d7a4e85a6 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Sun, 8 Jun 2025 16:10:47 +0800 Subject: [PATCH 03/14] fix binary utf8view union with int32 --- datafusion/expr-common/src/type_coercion/binary.rs | 4 ++-- datafusion/sqllogictest/test_files/functions.slt | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index d0fcda973381..78ff85265213 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -462,7 +462,7 @@ pub fn type_union_resolution(data_types: &[DataType]) -> Option { // If all the data_types are null, return string if data_types.iter().all(|t| t == &DataType::Null) { - return Some(DataType::Utf8); + return Some(DataType::Utf8View); } // Ignore Nulls, if any data_type category is not the same, return None @@ -1202,7 +1202,7 @@ pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { - (Utf8 | LargeUtf8, other_type) | (other_type, Utf8 | LargeUtf8) + (Utf8 | LargeUtf8 | Utf8View, other_type) | (other_type, Utf8 | LargeUtf8 | Utf8View) if other_type.is_numeric() => { Some(other_type.clone()) diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index 9eb70be3a599..20f79622a62c 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -837,8 +837,10 @@ SELECT greatest([2, 3], [1, 4], [5, 0]); ---- [5, 0] -query error DataFusion error: Error during planning: Execution error: Function 'greatest' user\-defined coercion failed with "Error during planning: Cannot find a common type for arguments" +query I SELECT greatest(1::int, 2::text) +---- +2 query R SELECT greatest(-1, 1, 2.3, 123456789, 3 + 5, -(-4)) @@ -1033,8 +1035,10 @@ SELECT least([2, 3], [1, 4], [5, 0]); ---- [1, 4] -query error DataFusion error: Error during planning: Execution error: Function 'least' user\-defined coercion failed with "Error during planning: Cannot find a common type for arguments" +query I SELECT least(1::int, 2::text) +---- +1 query R SELECT least(-1, 1, 2.3, 123456789, 3 + 5, -(-4)) From 406575bca5d6871c5973ef80ee1d6ceb0a75dd55 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Sun, 8 Jun 2025 16:17:40 +0800 Subject: [PATCH 04/14] fix slt order --- .../test_files/sort_merge_join.slt | 74 +++++++++---------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/datafusion/sqllogictest/test_files/sort_merge_join.slt b/datafusion/sqllogictest/test_files/sort_merge_join.slt index 41dc1990432b..c17fe8dfc7e6 100644 --- a/datafusion/sqllogictest/test_files/sort_merge_join.slt +++ b/datafusion/sqllogictest/test_files/sort_merge_join.slt @@ -44,102 +44,102 @@ physical_plan 05)----DataSourceExec: partitions=1, partition_sizes=[1] # inner join with join filter -query TITI +query TITI 
rowsort SELECT t1.a, t1.b, t2.a, t2.b FROM t1 JOIN t2 ON t1.a = t2.a AND t2.b * 50 <= t1.b ---- -Alice 50 Alice 1 -Alice 100 Alice 2 Alice 100 Alice 1 +Alice 100 Alice 2 +Alice 50 Alice 1 -query TITI +query TITI rowsort SELECT t1.a, t1.b, t2.a, t2.b FROM t1 JOIN t2 ON t1.a = t2.a AND t2.b < t1.b ---- -Alice 50 Alice 2 -Alice 50 Alice 1 -Alice 100 Alice 2 Alice 100 Alice 1 +Alice 100 Alice 2 +Alice 50 Alice 1 +Alice 50 Alice 2 -query TITI +query TITI rowsort SELECT t1.a, t1.b, t2.a, t2.b FROM t1 JOIN t2 ON t1.a = t2.a AND t2.b > t1.b ---- # left join without join filter -query TITI +query TITI rowsort SELECT * FROM t1 LEFT JOIN t2 ON t1.a = t2.a ---- -Alice 50 Alice 2 -Alice 50 Alice 1 -Alice 100 Alice 2 Alice 100 Alice 1 +Alice 100 Alice 2 +Alice 50 Alice 1 +Alice 50 Alice 2 Bob 1 NULL NULL # left join with join filter -query TITI +query TITI rowsort SELECT * FROM t1 LEFT JOIN t2 ON t1.a = t2.a AND t2.b * 50 <= t1.b ---- -Alice 50 Alice 1 -Alice 100 Alice 2 Alice 100 Alice 1 +Alice 100 Alice 2 +Alice 50 Alice 1 Bob 1 NULL NULL -query TITI +query TITI rowsort SELECT * FROM t1 LEFT JOIN t2 ON t1.a = t2.a AND t2.b < t1.b ---- -Alice 50 Alice 2 -Alice 50 Alice 1 -Alice 100 Alice 2 Alice 100 Alice 1 +Alice 100 Alice 2 +Alice 50 Alice 1 +Alice 50 Alice 2 Bob 1 NULL NULL # right join without join filter -query TITI +query TITI rowsort SELECT * FROM t1 RIGHT JOIN t2 ON t1.a = t2.a ---- -Alice 50 Alice 2 +Alice 100 Alice 1 Alice 100 Alice 2 Alice 50 Alice 1 -Alice 100 Alice 1 +Alice 50 Alice 2 # right join with join filter -query TITI +query TITI rowsort SELECT * FROM t1 RIGHT JOIN t2 ON t1.a = t2.a AND t2.b * 50 <= t1.b ---- +Alice 100 Alice 1 Alice 100 Alice 2 Alice 50 Alice 1 -Alice 100 Alice 1 -query TITI +query TITI rowsort SELECT * FROM t1 RIGHT JOIN t2 ON t1.a = t2.a AND t1.b > t2.b ---- -Alice 50 Alice 2 +Alice 100 Alice 1 Alice 100 Alice 2 Alice 50 Alice 1 -Alice 100 Alice 1 +Alice 50 Alice 2 # full join without join filter -query TITI +query TITI rowsort SELECT * FROM t1 FULL JOIN t2 ON t1.a = t2.a ---- -Alice 50 Alice 2 -Alice 50 Alice 1 -Alice 100 Alice 2 Alice 100 Alice 1 +Alice 100 Alice 2 +Alice 50 Alice 1 +Alice 50 Alice 2 Bob 1 NULL NULL -query TITI +query TITI rowsort SELECT * FROM t1 FULL JOIN t2 ON t1.a = t2.a AND t2.b * 50 > t1.b ---- -Alice 50 Alice 2 Alice 100 NULL NULL -NULL NULL Alice 1 +Alice 50 Alice 2 Bob 1 NULL NULL +NULL NULL Alice 1 -query TITI +query TITI rowsort SELECT * FROM t1 FULL JOIN t2 ON t1.a = t2.a AND t1.b > t2.b + 50 ---- -Alice 50 NULL NULL -Alice 100 Alice 2 Alice 100 Alice 1 +Alice 100 Alice 2 +Alice 50 NULL NULL Bob 1 NULL NULL statement ok From 8ad6491bb7265612683d614088d3158f2f2a3470 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Sun, 8 Jun 2025 21:24:25 +0800 Subject: [PATCH 05/14] fix --- datafusion/sqllogictest/test_files/array.slt | 26 ++++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 4b07b286b74c..11560dcaecd8 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -6029,7 +6029,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS 
Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6038,7 +6038,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) +07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6058,7 +6058,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6067,7 +6067,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: 
false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) +07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6087,7 +6087,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6096,7 +6096,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) +07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), 
input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6118,7 +6118,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: array_has(LargeList([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32))) +06)----------Filter: array_has(LargeList([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32))) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6127,7 +6127,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c], substr(md5(CAST(value@0 AS Utf8)), 1, 32)) +07)------------FilterExec: array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c], substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32)) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6147,7 +6147,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6156,7 +6156,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) +07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: 
false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6178,7 +6178,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IS NOT NULL OR Boolean(NULL) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IS NOT NULL OR Boolean(NULL) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6187,7 +6187,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IS NOT NULL OR NULL +07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IS NOT NULL OR NULL 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -7869,7 +7869,7 @@ List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int3 query ??T select [1,2,3]::int[], [['1']]::int[][], arrow_typeof([]::text[]); ---- -[1, 2, 3] [[1]] List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +[1, 2, 3] [[1]] List(Field { name: "item", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) # test empty arrays return length # issue: https://github.com/apache/datafusion/pull/12459 From 0b4ee41aa15a26d009c62492f3cd90dbe790d465 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Sun, 8 Jun 2025 21:25:37 +0800 Subject: [PATCH 06/14] fmt --- datafusion/expr-common/src/type_coercion/binary.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 78ff85265213..955c28c42a3f 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -1202,7 +1202,8 @@ pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { - (Utf8 | LargeUtf8 | Utf8View, other_type) | (other_type, Utf8 | LargeUtf8 | Utf8View) + (Utf8 | LargeUtf8 | Utf8View, other_type) + | (other_type, Utf8 | LargeUtf8 | Utf8View) if other_type.is_numeric() => { Some(other_type.clone()) From 8ee72c96b08152f08d076ed8b9133f2b3729618e Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Sun, 8 Jun 2025 21:37:47 +0800 Subject: [PATCH 07/14] Fix test --- .../core/tests/user_defined/user_defined_scalar_functions.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs index dcaa1ef95235..ef8c51aef18e 100644 --- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs +++ 
b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs @@ -1180,7 +1180,7 @@ async fn create_scalar_function_from_sql_statement_postgres_syntax() -> Result<( quote_style: None, span: Span::empty(), }), - data_type: DataType::Utf8, + data_type: DataType::Utf8View, default_expr: None, }]), return_type: Some(DataType::Int32), From 0b05cb64d9816a09b3626db62e7bf0d526d5ae31 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 11 Jun 2025 13:44:44 +0800 Subject: [PATCH 08/14] fix --- datafusion/common/src/config.rs | 5 +++++ .../core/src/execution/session_state.rs | 1 + datafusion/sql/src/planner.rs | 10 +++++++++- datafusion/sql/tests/sql_integration.rs | 3 +++ .../sqllogictest/test_files/arrow_files.slt | 19 +++++++++++++++++-- .../test_files/information_schema.slt | 2 ++ docs/source/user-guide/configs.md | 1 + 7 files changed, 38 insertions(+), 3 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 726015d17149..8e9605f74448 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -264,6 +264,11 @@ config_namespace! { /// Default is false. pub map_varchar_to_utf8view: bool, default = true + /// If true, `CHAR` and `Text` and `String` is mapped to `Utf8View` during SQL planning. + /// If false, `CHAR` and `Text` and `String` is mapped to `Utf8` during SQL planning. + /// Default is true. + pub map_char_to_utf8view: bool, default = true + /// When set to true, the source locations relative to the original SQL /// query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected /// and recorded in the logical plan nodes. diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index 8aa812cc5258..a6e580eb2294 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -495,6 +495,7 @@ impl SessionState { .enable_options_value_normalization, support_varchar_with_length: sql_parser_options.support_varchar_with_length, map_varchar_to_utf8view: sql_parser_options.map_varchar_to_utf8view, + map_char_to_utf8view: sql_parser_options.map_char_to_utf8view, collect_spans: sql_parser_options.collect_spans, } } diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 29e5c5f77344..8ac62e4b94af 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -54,6 +54,8 @@ pub struct ParserOptions { pub collect_spans: bool, /// Whether `VARCHAR` is mapped to `Utf8View` during SQL planning. pub map_varchar_to_utf8view: bool, + /// Whether `CHAR` and `Text` and `String` are mapped to `Utf8View` during SQL planning. 
+ pub map_char_to_utf8view: bool, } impl ParserOptions { @@ -73,6 +75,7 @@ impl ParserOptions { enable_ident_normalization: true, support_varchar_with_length: true, map_varchar_to_utf8view: true, + map_char_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, } @@ -144,6 +147,7 @@ impl From<&SqlParserOptions> for ParserOptions { enable_ident_normalization: options.enable_ident_normalization, support_varchar_with_length: options.support_varchar_with_length, map_varchar_to_utf8view: options.map_varchar_to_utf8view, + map_char_to_utf8view: options.map_char_to_utf8view, enable_options_value_normalization: options .enable_options_value_normalization, collect_spans: options.collect_spans, @@ -601,7 +605,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ) } SQLDataType::Char(_) | SQLDataType::Text | SQLDataType::String(_) => { - Ok(DataType::Utf8View) + if self.options.map_char_to_utf8view { + Ok(DataType::Utf8View) + } else { + Ok(DataType::Utf8) + } } SQLDataType::Timestamp(precision, tz_info) if precision.is_none() || [0, 3, 6, 9].contains(&precision.unwrap()) => diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 4be7953aefc0..70203d0c1a4b 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -3356,6 +3356,7 @@ fn parse_decimals_parser_options() -> ParserOptions { enable_ident_normalization: false, support_varchar_with_length: false, map_varchar_to_utf8view: true, + map_char_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, } @@ -3367,6 +3368,7 @@ fn ident_normalization_parser_options_no_ident_normalization() -> ParserOptions enable_ident_normalization: false, support_varchar_with_length: false, map_varchar_to_utf8view: true, + map_char_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, } @@ -3378,6 +3380,7 @@ fn ident_normalization_parser_options_ident_normalization() -> ParserOptions { enable_ident_normalization: true, support_varchar_with_length: false, map_varchar_to_utf8view: true, + map_char_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, } diff --git a/datafusion/sqllogictest/test_files/arrow_files.slt b/datafusion/sqllogictest/test_files/arrow_files.slt index e039029060f6..e7ea91c7cacf 100644 --- a/datafusion/sqllogictest/test_files/arrow_files.slt +++ b/datafusion/sqllogictest/test_files/arrow_files.slt @@ -19,6 +19,11 @@ ## Arrow Files Format support ############# +# We using fixed arrow file to test for sqllogictests, and this arrow field is writing with arrow-ipc utf8, +# so when we decode to read it's also loading utf8. 
+# Currently, so we disable the map_char_to_utf8view +statement ok +set datafusion.sql_parser.map_char_to_utf8view = false; statement ok @@ -61,12 +66,22 @@ LOCATION '../core/tests/data/partitioned_table_arrow/' PARTITIONED BY (part); # select wildcard -query error DataFusion error: Arrow error: External error: Arrow error: Invalid argument error: column types must match schema types, expected Utf8View but found Utf8 at column index 1 +query ITBI SELECT * FROM arrow_partitioned ORDER BY f0; +---- +1 foo true 123 +2 bar false 123 +3 baz true 456 +4 NULL NULL 456 # select all fields -query error DataFusion error: Arrow error: External error: Arrow error: Invalid argument error: column types must match schema types, expected Utf8View but found Utf8 at column index 1 +query IITB SELECT part, f0, f1, f2 FROM arrow_partitioned ORDER BY f0; +---- +123 1 foo true +123 2 bar false +456 3 baz true +456 4 NULL NULL # select without partition column query IB diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 108c844f20b4..79d5b390a552 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -307,6 +307,7 @@ datafusion.sql_parser.collect_spans false datafusion.sql_parser.dialect generic datafusion.sql_parser.enable_ident_normalization true datafusion.sql_parser.enable_options_value_normalization false +datafusion.sql_parser.map_char_to_utf8view true datafusion.sql_parser.map_varchar_to_utf8view true datafusion.sql_parser.parse_float_as_decimal false datafusion.sql_parser.recursion_limit 50 @@ -417,6 +418,7 @@ datafusion.sql_parser.collect_spans false When set to true, the source locations datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. +datafusion.sql_parser.map_char_to_utf8view true If true, `CHAR` and `Text` and `String` is mapped to `Utf8View` during SQL planning. If false, `CHAR` and `Text` and `String` is mapped to `Utf8` during SQL planning. Default is true. datafusion.sql_parser.map_varchar_to_utf8view true If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. If false, `VARCHAR` is mapped to `Utf8` during SQL planning. Default is false. 
datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser will parse float as decimal type datafusion.sql_parser.recursion_limit 50 Specifies the recursion depth limit when parsing complex SQL Queries diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 6d65f54e228d..531289c46e5c 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -131,6 +131,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. | | datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. | | datafusion.sql_parser.map_varchar_to_utf8view | true | If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. If false, `VARCHAR` is mapped to `Utf8` during SQL planning. Default is false. | +| datafusion.sql_parser.map_char_to_utf8view | true | If true, `CHAR` and `Text` and `String` is mapped to `Utf8View` during SQL planning. If false, `CHAR` and `Text` and `String` is mapped to `Utf8` during SQL planning. Default is true. | | datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. 
| | datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries | | datafusion.format.safe | true | If set to `true` any formatting errors will be written to the output instead of being converted into a [`std::fmt::Error`] | From d825fd1893f5708af96ec130e1b775bd82c0a9d1 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 11 Jun 2025 14:01:39 +0800 Subject: [PATCH 09/14] fix test --- datafusion/sql/tests/cases/.params.rs.pending-snap | 10 ++++++++++ datafusion/sql/tests/cases/params.rs | 12 ++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) create mode 100644 datafusion/sql/tests/cases/.params.rs.pending-snap diff --git a/datafusion/sql/tests/cases/.params.rs.pending-snap b/datafusion/sql/tests/cases/.params.rs.pending-snap new file mode 100644 index 000000000000..0a9d867fb253 --- /dev/null +++ b/datafusion/sql/tests/cases/.params.rs.pending-snap @@ -0,0 +1,10 @@ +{"run_id":"1749621453-235174000","line":755,"new":{"module_name":"sql_integration__cases__params","snapshot_name":"prepare_statement_to_plan_multi_params-2","metadata":{"source":"datafusion/sql/tests/cases/params.rs","assertion_line":755,"expression":"dt"},"snapshot":"[Int32, Utf8View, Float64, Int32, Float64, Utf8View]"},"old":{"module_name":"sql_integration__cases__params","metadata":{},"snapshot":"[Int32, Utf8, Float64, Int32, Float64, Utf8]"}} +{"run_id":"1749621483-622288000","line":746,"new":null,"old":null} +{"run_id":"1749621483-622288000","line":755,"new":{"module_name":"sql_integration__cases__params","snapshot_name":"prepare_statement_to_plan_multi_params-2","metadata":{"source":"datafusion/sql/tests/cases/params.rs","assertion_line":755,"expression":"dt"},"snapshot":"[Int32, Utf8View, Float64, Int32, Float64, Utf8View]"},"old":{"module_name":"sql_integration__cases__params","metadata":{},"snapshot":"[Int32, Utf8, Float64, Int32, Float64, Utf8]"}} +{"run_id":"1749621504-462939000","line":746,"new":null,"old":null} +{"run_id":"1749621504-462939000","line":755,"new":null,"old":null} +{"run_id":"1749621567-66687000","line":746,"new":null,"old":null} +{"run_id":"1749621567-66687000","line":755,"new":null,"old":null} +{"run_id":"1749621685-865251000","line":746,"new":null,"old":null} +{"run_id":"1749621685-865251000","line":755,"new":null,"old":null} +{"run_id":"1749621685-865251000","line":769,"new":null,"old":null} diff --git a/datafusion/sql/tests/cases/params.rs b/datafusion/sql/tests/cases/params.rs index b3cc49c31071..15e7d923a91a 100644 --- a/datafusion/sql/tests/cases/params.rs +++ b/datafusion/sql/tests/cases/params.rs @@ -746,31 +746,31 @@ fn test_prepare_statement_to_plan_multi_params() { assert_snapshot!( plan, @r#" - Prepare: "my_plan" [Int32, Utf8, Float64, Int32, Float64, Utf8] + Prepare: "my_plan" [Int32, Utf8View, Float64, Int32, Float64, Utf8View] Projection: person.id, person.age, $6 Filter: person.age IN ([$1, $4]) AND person.salary > $3 AND person.salary < $5 OR person.first_name < $2 TableScan: person "# ); - assert_snapshot!(dt, @r#"[Int32, Utf8, Float64, Int32, Float64, Utf8]"#); + assert_snapshot!(dt, @r#"[Int32, Utf8View, Float64, Int32, Float64, Utf8View]"#); /////////////////// // replace params with values let param_values = vec![ ScalarValue::Int32(Some(10)), - ScalarValue::from("abc"), + ScalarValue::Utf8View(Some("abc".into())), ScalarValue::Float64(Some(100.0)), ScalarValue::Int32(Some(20)), ScalarValue::Float64(Some(200.0)), - ScalarValue::from("xyz"), + ScalarValue::Utf8View(Some("xyz".into())), ]; let 
plan_with_params = plan.with_param_values(param_values).unwrap(); assert_snapshot!( plan_with_params, @r#" - Projection: person.id, person.age, Utf8("xyz") AS $6 - Filter: person.age IN ([Int32(10), Int32(20)]) AND person.salary > Float64(100) AND person.salary < Float64(200) OR person.first_name < Utf8("abc") + Projection: person.id, person.age, Utf8View("xyz") AS $6 + Filter: person.age IN ([Int32(10), Int32(20)]) AND person.salary > Float64(100) AND person.salary < Float64(200) OR person.first_name < Utf8View("abc") TableScan: person "# ); From f341e1ec68f6527ff347790a962e51e84982869d Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 11 Jun 2025 14:42:41 +0800 Subject: [PATCH 10/14] fix --- .../sqllogictest/test_files/parquet_filter_pushdown.slt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt index e707c52956d3..5c0419b69d76 100644 --- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt +++ b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt @@ -253,7 +253,7 @@ EXPLAIN select val, part from t_pushdown where part = 'a' AND part = val; ---- logical_plan 01)Filter: t_pushdown.val = t_pushdown.part -02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8("a")], partial_filters=[t_pushdown.val = t_pushdown.part] +02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val = t_pushdown.part] physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: val@0 = part@1 @@ -270,7 +270,7 @@ EXPLAIN select val, part from t_pushdown where part = val AND part = 'a'; ---- logical_plan 01)Filter: t_pushdown.val = t_pushdown.part -02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8("a")], partial_filters=[t_pushdown.val = t_pushdown.part] +02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val = t_pushdown.part] physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: val@0 = part@1 @@ -280,4 +280,4 @@ physical_plan query TT select val, part from t_pushdown where part = val AND part = 'a'; ---- -a a \ No newline at end of file +a a From 6485228bee8df78b0e9ecf2a6b04561a0310876d Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 11 Jun 2025 15:15:35 +0800 Subject: [PATCH 11/14] clean --- datafusion/sql/tests/cases/.params.rs.pending-snap | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 datafusion/sql/tests/cases/.params.rs.pending-snap diff --git a/datafusion/sql/tests/cases/.params.rs.pending-snap b/datafusion/sql/tests/cases/.params.rs.pending-snap deleted file mode 100644 index 0a9d867fb253..000000000000 --- a/datafusion/sql/tests/cases/.params.rs.pending-snap +++ /dev/null @@ -1,10 +0,0 @@ -{"run_id":"1749621453-235174000","line":755,"new":{"module_name":"sql_integration__cases__params","snapshot_name":"prepare_statement_to_plan_multi_params-2","metadata":{"source":"datafusion/sql/tests/cases/params.rs","assertion_line":755,"expression":"dt"},"snapshot":"[Int32, Utf8View, Float64, Int32, Float64, Utf8View]"},"old":{"module_name":"sql_integration__cases__params","metadata":{},"snapshot":"[Int32, Utf8, Float64, Int32, Float64, Utf8]"}} -{"run_id":"1749621483-622288000","line":746,"new":null,"old":null} 
-{"run_id":"1749621483-622288000","line":755,"new":{"module_name":"sql_integration__cases__params","snapshot_name":"prepare_statement_to_plan_multi_params-2","metadata":{"source":"datafusion/sql/tests/cases/params.rs","assertion_line":755,"expression":"dt"},"snapshot":"[Int32, Utf8View, Float64, Int32, Float64, Utf8View]"},"old":{"module_name":"sql_integration__cases__params","metadata":{},"snapshot":"[Int32, Utf8, Float64, Int32, Float64, Utf8]"}} -{"run_id":"1749621504-462939000","line":746,"new":null,"old":null} -{"run_id":"1749621504-462939000","line":755,"new":null,"old":null} -{"run_id":"1749621567-66687000","line":746,"new":null,"old":null} -{"run_id":"1749621567-66687000","line":755,"new":null,"old":null} -{"run_id":"1749621685-865251000","line":746,"new":null,"old":null} -{"run_id":"1749621685-865251000","line":755,"new":null,"old":null} -{"run_id":"1749621685-865251000","line":769,"new":null,"old":null} From fd20268dd1420ec4c5714f25339881e09457a3ed Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Wed, 11 Jun 2025 20:31:35 +0800 Subject: [PATCH 12/14] Address comments --- datafusion/common/src/config.rs | 11 +++------- .../core/src/execution/session_state.rs | 3 +-- datafusion/sql/src/planner.rs | 22 ++++++++----------- datafusion/sql/tests/sql_integration.rs | 9 +++----- .../sqllogictest/test_files/arrow_files.slt | 4 ++-- datafusion/sqllogictest/test_files/avro.slt | 4 ++-- datafusion/sqllogictest/test_files/ddl.slt | 4 ++-- 7 files changed, 22 insertions(+), 35 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index b26173dd88c1..8324ce130aad 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -259,15 +259,10 @@ config_namespace! { /// string length and thus DataFusion can not enforce such limits. pub support_varchar_with_length: bool, default = true - /// If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. - /// If false, `VARCHAR` is mapped to `Utf8` during SQL planning. - /// Default is false. - pub map_varchar_to_utf8view: bool, default = true - - /// If true, `CHAR` and `Text` and `String` is mapped to `Utf8View` during SQL planning. - /// If false, `CHAR` and `Text` and `String` is mapped to `Utf8` during SQL planning. + /// If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. + /// If false, they are mapped to `Utf8`. /// Default is true. - pub map_char_to_utf8view: bool, default = true + pub map_string_types_to_utf8view: bool, default = true /// When set to true, the source locations relative to the original SQL /// query (i.e. 
[`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index a6e580eb2294..19b5a5fd46f1 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -494,8 +494,7 @@ impl SessionState { enable_options_value_normalization: sql_parser_options .enable_options_value_normalization, support_varchar_with_length: sql_parser_options.support_varchar_with_length, - map_varchar_to_utf8view: sql_parser_options.map_varchar_to_utf8view, - map_char_to_utf8view: sql_parser_options.map_char_to_utf8view, + map_string_types_to_utf8view: sql_parser_options.map_string_types_to_utf8view, collect_spans: sql_parser_options.collect_spans, } } diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 8ac62e4b94af..03396822eca8 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -52,10 +52,8 @@ pub struct ParserOptions { pub enable_options_value_normalization: bool, /// Whether to collect spans pub collect_spans: bool, - /// Whether `VARCHAR` is mapped to `Utf8View` during SQL planning. - pub map_varchar_to_utf8view: bool, - /// Whether `CHAR` and `Text` and `String` are mapped to `Utf8View` during SQL planning. - pub map_char_to_utf8view: bool, + /// Whether string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. + pub map_string_types_to_utf8view: bool, } impl ParserOptions { @@ -74,8 +72,7 @@ impl ParserOptions { parse_float_as_decimal: false, enable_ident_normalization: true, support_varchar_with_length: true, - map_varchar_to_utf8view: true, - map_char_to_utf8view: true, + map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, } @@ -115,9 +112,9 @@ impl ParserOptions { self } - /// Sets the `map_varchar_to_utf8view` option. - pub fn with_map_varchar_to_utf8view(mut self, value: bool) -> Self { - self.map_varchar_to_utf8view = value; + /// Sets the `map_string_types_to_utf8view` option. 
+ pub fn with_map_string_types_to_utf8view(mut self, value: bool) -> Self { + self.map_string_types_to_utf8view = value; self } @@ -146,8 +143,7 @@ impl From<&SqlParserOptions> for ParserOptions { parse_float_as_decimal: options.parse_float_as_decimal, enable_ident_normalization: options.enable_ident_normalization, support_varchar_with_length: options.support_varchar_with_length, - map_varchar_to_utf8view: options.map_varchar_to_utf8view, - map_char_to_utf8view: options.map_char_to_utf8view, + map_string_types_to_utf8view: options.map_string_types_to_utf8view, enable_options_value_normalization: options .enable_options_value_normalization, collect_spans: options.collect_spans, @@ -581,7 +577,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { please set `support_varchar_with_length` to be true" ), _ => { - if self.options.map_varchar_to_utf8view { + if self.options.map_string_types_to_utf8view { Ok(DataType::Utf8View) } else { Ok(DataType::Utf8) @@ -605,7 +601,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ) } SQLDataType::Char(_) | SQLDataType::Text | SQLDataType::String(_) => { - if self.options.map_char_to_utf8view { + if self.options.map_string_types_to_utf8view { Ok(DataType::Utf8View) } else { Ok(DataType::Utf8) diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 70203d0c1a4b..c82239d9b455 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -3355,8 +3355,7 @@ fn parse_decimals_parser_options() -> ParserOptions { parse_float_as_decimal: true, enable_ident_normalization: false, support_varchar_with_length: false, - map_varchar_to_utf8view: true, - map_char_to_utf8view: true, + map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, } @@ -3367,8 +3366,7 @@ fn ident_normalization_parser_options_no_ident_normalization() -> ParserOptions parse_float_as_decimal: true, enable_ident_normalization: false, support_varchar_with_length: false, - map_varchar_to_utf8view: true, - map_char_to_utf8view: true, + map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, } @@ -3379,8 +3377,7 @@ fn ident_normalization_parser_options_ident_normalization() -> ParserOptions { parse_float_as_decimal: true, enable_ident_normalization: true, support_varchar_with_length: false, - map_varchar_to_utf8view: true, - map_char_to_utf8view: true, + map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, } diff --git a/datafusion/sqllogictest/test_files/arrow_files.slt b/datafusion/sqllogictest/test_files/arrow_files.slt index e7ea91c7cacf..62453ec4bf3e 100644 --- a/datafusion/sqllogictest/test_files/arrow_files.slt +++ b/datafusion/sqllogictest/test_files/arrow_files.slt @@ -21,9 +21,9 @@ # We using fixed arrow file to test for sqllogictests, and this arrow field is writing with arrow-ipc utf8, # so when we decode to read it's also loading utf8. 
-# Currently, so we disable the map_char_to_utf8view +# Currently, so we disable the map_string_types_to_utf8view statement ok -set datafusion.sql_parser.map_char_to_utf8view = false; +set datafusion.sql_parser.map_string_types_to_utf8view = false; statement ok diff --git a/datafusion/sqllogictest/test_files/avro.slt b/datafusion/sqllogictest/test_files/avro.slt index 4573af1d59b1..2ad60c0082e8 100644 --- a/datafusion/sqllogictest/test_files/avro.slt +++ b/datafusion/sqllogictest/test_files/avro.slt @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. -# Currently, the avro not support Utf8View type, so we disable the map_varchar_to_utf8view +# Currently, the avro not support Utf8View type, so we disable the map_string_types_to_utf8view # After https://github.com/apache/arrow-rs/issues/7262 released, we can remove this setting statement ok -set datafusion.sql_parser.map_varchar_to_utf8view = false; +set datafusion.sql_parser.map_string_types_to_utf8view = false; statement ok CREATE EXTERNAL TABLE alltypes_plain ( diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index 1e95e426f3e0..81f2955eff49 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -828,7 +828,7 @@ drop table table_with_pk; statement ok set datafusion.catalog.information_schema = false; -# Test VARCHAR is mapped to Utf8View during SQL planning when setting map_varchar_to_utf8view to true +# Test VARCHAR is mapped to Utf8View during SQL planning when setting map_string_types_to_utf8view to true statement ok CREATE TABLE t1(c1 VARCHAR(10) NOT NULL, c2 VARCHAR); @@ -839,7 +839,7 @@ c1 Utf8View NO c2 Utf8View YES statement ok -set datafusion.sql_parser.map_varchar_to_utf8view = true; +set datafusion.sql_parser.map_string_types_to_utf8view = true; statement ok CREATE TABLE t2(c1 VARCHAR(10) NOT NULL, c2 VARCHAR); From 980285032a9c671f820df2ab85570a33587194a8 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Mon, 16 Jun 2025 20:56:29 +0800 Subject: [PATCH 13/14] Fix test --- datafusion/sqllogictest/test_files/information_schema.slt | 6 ++---- docs/source/user-guide/configs.md | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 7347f8e95296..f8c86df02453 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -308,8 +308,7 @@ datafusion.sql_parser.collect_spans false datafusion.sql_parser.dialect generic datafusion.sql_parser.enable_ident_normalization true datafusion.sql_parser.enable_options_value_normalization false -datafusion.sql_parser.map_char_to_utf8view true -datafusion.sql_parser.map_varchar_to_utf8view true +datafusion.sql_parser.map_string_types_to_utf8view true datafusion.sql_parser.parse_float_as_decimal false datafusion.sql_parser.recursion_limit 50 datafusion.sql_parser.support_varchar_with_length true @@ -420,8 +419,7 @@ datafusion.sql_parser.collect_spans false When set to true, the source locations datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. 
datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. -datafusion.sql_parser.map_char_to_utf8view true If true, `CHAR` and `Text` and `String` is mapped to `Utf8View` during SQL planning. If false, `CHAR` and `Text` and `String` is mapped to `Utf8` during SQL planning. Default is true. -datafusion.sql_parser.map_varchar_to_utf8view true If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. If false, `VARCHAR` is mapped to `Utf8` during SQL planning. Default is false. +datafusion.sql_parser.map_string_types_to_utf8view true If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true. datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser will parse float as decimal type datafusion.sql_parser.recursion_limit 50 Specifies the recursion depth limit when parsing complex SQL Queries datafusion.sql_parser.support_varchar_with_length true If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 8b2dc413594b..5c80cbd563c2 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -131,8 +131,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. | | datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. | | datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. | -| datafusion.sql_parser.map_varchar_to_utf8view | true | If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. If false, `VARCHAR` is mapped to `Utf8` during SQL planning. Default is false. | -| datafusion.sql_parser.map_char_to_utf8view | true | If true, `CHAR` and `Text` and `String` is mapped to `Utf8View` during SQL planning. If false, `CHAR` and `Text` and `String` is mapped to `Utf8` during SQL planning. Default is true. | +| datafusion.sql_parser.map_string_types_to_utf8view | true | If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true. 
| | datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. | | datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries | | datafusion.format.safe | true | If set to `true` any formatting errors will be written to the output instead of being converted into a [`std::fmt::Error`] | From 11878346a71431ad1dddbe76c5efe967401d9fe6 Mon Sep 17 00:00:00 2001 From: zhuqi-lucas <821684824@qq.com> Date: Tue, 17 Jun 2025 11:15:45 +0800 Subject: [PATCH 14/14] support md5 for utf8view --- datafusion/functions/src/crypto/basic.rs | 8 ++++---- datafusion/functions/src/crypto/md5.rs | 8 ++++---- datafusion/sqllogictest/test_files/array.slt | 12 ++++++------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/datafusion/functions/src/crypto/basic.rs b/datafusion/functions/src/crypto/basic.rs index eaa688c1c335..5bf83943a92d 100644 --- a/datafusion/functions/src/crypto/basic.rs +++ b/datafusion/functions/src/crypto/basic.rs @@ -21,7 +21,7 @@ use arrow::array::{ Array, ArrayRef, BinaryArray, BinaryArrayType, BinaryViewArray, GenericBinaryArray, OffsetSizeTrait, }; -use arrow::array::{AsArray, GenericStringArray, StringArray, StringViewArray}; +use arrow::array::{AsArray, GenericStringArray, StringViewArray}; use arrow::datatypes::DataType; use blake2::{Blake2b512, Blake2s256, Digest}; use blake3::Hasher as Blake3; @@ -169,18 +169,18 @@ pub fn md5(args: &[ColumnarValue]) -> Result { let [data] = take_function_args("md5", args)?; let value = digest_process(data, DigestAlgorithm::Md5)?; - // md5 requires special handling because of its unique utf8 return type + // md5 requires special handling because of its unique utf8view return type Ok(match value { ColumnarValue::Array(array) => { let binary_array = as_binary_array(&array)?; - let string_array: StringArray = binary_array + let string_array: StringViewArray = binary_array .iter() .map(|opt| opt.map(hex_encode::<_>)) .collect(); ColumnarValue::Array(Arc::new(string_array)) } ColumnarValue::Scalar(ScalarValue::Binary(opt)) => { - ColumnarValue::Scalar(ScalarValue::Utf8(opt.map(hex_encode::<_>))) + ColumnarValue::Scalar(ScalarValue::Utf8View(opt.map(hex_encode::<_>))) } _ => return exec_err!("Impossibly got invalid results from digest"), }) diff --git a/datafusion/functions/src/crypto/md5.rs b/datafusion/functions/src/crypto/md5.rs index c1540450029c..e209ed06e28b 100644 --- a/datafusion/functions/src/crypto/md5.rs +++ b/datafusion/functions/src/crypto/md5.rs @@ -92,12 +92,12 @@ impl ScalarUDFImpl for Md5Func { fn return_type(&self, arg_types: &[DataType]) -> Result { use DataType::*; Ok(match &arg_types[0] { - LargeUtf8 | LargeBinary => Utf8, - Utf8View | Utf8 | Binary | BinaryView => Utf8, + LargeUtf8 | LargeBinary => Utf8View, + Utf8View | Utf8 | Binary | BinaryView => Utf8View, Null => Null, Dictionary(_, t) => match **t { - LargeUtf8 | LargeBinary => Utf8, - Utf8 | Binary | BinaryView => Utf8, + LargeUtf8 | LargeBinary => Utf8View, + Utf8 | Binary | BinaryView => Utf8View, Null => Null, _ => { return plan_err!( diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index d02437163c90..a2640fa988ce 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -6082,7 +6082,7 @@ 
physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6111,7 +6111,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, 
dict_is_ordered: false, metadata: {} } }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6140,7 +6140,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6171,7 +6171,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c], substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32)) +07)------------FilterExec: array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c], substr(md5(CAST(value@0 AS Utf8View)), 1, 32)) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6200,7 +6200,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", 
data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6231,7 +6231,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(CAST(md5(CAST(value@0 AS Utf8View)) AS Utf8View), 1, 32) IS NOT NULL OR NULL +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IS NOT NULL OR NULL 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192]
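
Taken together, these patches consolidate DataFusion's SQL string-type mapping behind a single `map_string_types_to_utf8view` parser option and switch `md5` to a `Utf8View` return type. The following is a minimal usage sketch rather than part of the patch series: it assumes the `datafusion_sql::planner` path for the `ParserOptions` API shown in the planner.rs hunks, and illustrates how a caller might opt back into the legacy `Utf8` mapping.

// Sketch only: opt back into the legacy Utf8 mapping. The builder method
// `with_map_string_types_to_utf8view` is the one introduced in this series;
// passing `false` maps VARCHAR/CHAR/TEXT/STRING columns to DataType::Utf8
// during SQL planning instead of the new Utf8View default.
use datafusion_sql::planner::ParserOptions;

fn legacy_utf8_parser_options() -> ParserOptions {
    ParserOptions::new().with_map_string_types_to_utf8view(false)
}

The same toggle is available per session, as the sqllogictest files above exercise with `set datafusion.sql_parser.map_string_types_to_utf8view = false;`.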