diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 4efb67a37c99..8324ce130aad 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -259,10 +259,10 @@ config_namespace! { /// string length and thus DataFusion can not enforce such limits. pub support_varchar_with_length: bool, default = true - /// If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. - /// If false, `VARCHAR` is mapped to `Utf8` during SQL planning. - /// Default is false. - pub map_varchar_to_utf8view: bool, default = true + /// If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. + /// If false, they are mapped to `Utf8`. + /// Default is true. + pub map_string_types_to_utf8view: bool, default = true /// When set to true, the source locations relative to the original SQL /// query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index edf116b00a05..1c0363f421af 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -494,7 +494,7 @@ impl SessionState { enable_options_value_normalization: sql_parser_options .enable_options_value_normalization, support_varchar_with_length: sql_parser_options.support_varchar_with_length, - map_varchar_to_utf8view: sql_parser_options.map_varchar_to_utf8view, + map_string_types_to_utf8view: sql_parser_options.map_string_types_to_utf8view, collect_spans: sql_parser_options.collect_spans, } } diff --git a/datafusion/core/tests/sql/create_drop.rs b/datafusion/core/tests/sql/create_drop.rs index 83712053b954..b35e614a464e 100644 --- a/datafusion/core/tests/sql/create_drop.rs +++ b/datafusion/core/tests/sql/create_drop.rs @@ -61,7 +61,7 @@ async fn create_external_table_with_ddl() -> Result<()> { assert_eq!(3, 
table_schema.fields().len()); assert_eq!(&DataType::Int32, table_schema.field(0).data_type()); - assert_eq!(&DataType::Utf8, table_schema.field(1).data_type()); + assert_eq!(&DataType::Utf8View, table_schema.field(1).data_type()); assert_eq!(&DataType::Boolean, table_schema.field(2).data_type()); Ok(()) diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs index d7dd65deab5f..a4e278e51196 100644 --- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs @@ -1181,7 +1181,7 @@ async fn create_scalar_function_from_sql_statement_postgres_syntax() -> Result<( quote_style: None, span: Span::empty(), }), - data_type: DataType::Utf8, + data_type: DataType::Utf8View, default_expr: None, }]), return_type: Some(DataType::Int32), diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index d0fcda973381..955c28c42a3f 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -462,7 +462,7 @@ pub fn type_union_resolution(data_types: &[DataType]) -> Option { // If all the data_types are null, return string if data_types.iter().all(|t| t == &DataType::Null) { - return Some(DataType::Utf8); + return Some(DataType::Utf8View); } // Ignore Nulls, if any data_type category is not the same, return None @@ -1202,7 +1202,8 @@ pub fn string_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { use arrow::datatypes::DataType::*; match (lhs_type, rhs_type) { - (Utf8 | LargeUtf8, other_type) | (other_type, Utf8 | LargeUtf8) + (Utf8 | LargeUtf8 | Utf8View, other_type) + | (other_type, Utf8 | LargeUtf8 | Utf8View) if other_type.is_numeric() => { Some(other_type.clone()) diff --git a/datafusion/functions/src/crypto/basic.rs b/datafusion/functions/src/crypto/basic.rs index 
eaa688c1c335..5bf83943a92d 100644 --- a/datafusion/functions/src/crypto/basic.rs +++ b/datafusion/functions/src/crypto/basic.rs @@ -21,7 +21,7 @@ use arrow::array::{ Array, ArrayRef, BinaryArray, BinaryArrayType, BinaryViewArray, GenericBinaryArray, OffsetSizeTrait, }; -use arrow::array::{AsArray, GenericStringArray, StringArray, StringViewArray}; +use arrow::array::{AsArray, GenericStringArray, StringViewArray}; use arrow::datatypes::DataType; use blake2::{Blake2b512, Blake2s256, Digest}; use blake3::Hasher as Blake3; @@ -169,18 +169,18 @@ pub fn md5(args: &[ColumnarValue]) -> Result { let [data] = take_function_args("md5", args)?; let value = digest_process(data, DigestAlgorithm::Md5)?; - // md5 requires special handling because of its unique utf8 return type + // md5 requires special handling because of its unique utf8view return type Ok(match value { ColumnarValue::Array(array) => { let binary_array = as_binary_array(&array)?; - let string_array: StringArray = binary_array + let string_array: StringViewArray = binary_array .iter() .map(|opt| opt.map(hex_encode::<_>)) .collect(); ColumnarValue::Array(Arc::new(string_array)) } ColumnarValue::Scalar(ScalarValue::Binary(opt)) => { - ColumnarValue::Scalar(ScalarValue::Utf8(opt.map(hex_encode::<_>))) + ColumnarValue::Scalar(ScalarValue::Utf8View(opt.map(hex_encode::<_>))) } _ => return exec_err!("Impossibly got invalid results from digest"), }) diff --git a/datafusion/functions/src/crypto/md5.rs b/datafusion/functions/src/crypto/md5.rs index c1540450029c..e209ed06e28b 100644 --- a/datafusion/functions/src/crypto/md5.rs +++ b/datafusion/functions/src/crypto/md5.rs @@ -92,12 +92,12 @@ impl ScalarUDFImpl for Md5Func { fn return_type(&self, arg_types: &[DataType]) -> Result { use DataType::*; Ok(match &arg_types[0] { - LargeUtf8 | LargeBinary => Utf8, - Utf8View | Utf8 | Binary | BinaryView => Utf8, + LargeUtf8 | LargeBinary => Utf8View, + Utf8View | Utf8 | Binary | BinaryView => Utf8View, Null => Null, Dictionary(_, t) 
=> match **t { - LargeUtf8 | LargeBinary => Utf8, - Utf8 | Binary | BinaryView => Utf8, + LargeUtf8 | LargeBinary => Utf8View, + Utf8 | Binary | BinaryView => Utf8View, Null => Null, _ => { return plan_err!( diff --git a/datafusion/physical-plan/src/joins/sort_merge_join.rs b/datafusion/physical-plan/src/joins/sort_merge_join.rs index f361992caa4c..6ab069aaf4f6 100644 --- a/datafusion/physical-plan/src/joins/sort_merge_join.rs +++ b/datafusion/physical-plan/src/joins/sort_merge_join.rs @@ -2492,6 +2492,7 @@ fn compare_join_arrays( DataType::Float32 => compare_value!(Float32Array), DataType::Float64 => compare_value!(Float64Array), DataType::Utf8 => compare_value!(StringArray), + DataType::Utf8View => compare_value!(StringViewArray), DataType::LargeUtf8 => compare_value!(LargeStringArray), DataType::Decimal128(..) => compare_value!(Decimal128Array), DataType::Timestamp(time_unit, None) => match time_unit { @@ -2559,6 +2560,7 @@ fn is_join_arrays_equal( DataType::Float32 => compare_value!(Float32Array), DataType::Float64 => compare_value!(Float64Array), DataType::Utf8 => compare_value!(StringArray), + DataType::Utf8View => compare_value!(StringViewArray), DataType::LargeUtf8 => compare_value!(LargeStringArray), DataType::Decimal128(..) => compare_value!(Decimal128Array), DataType::Timestamp(time_unit, None) => match time_unit { diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 5a1f3cdf69c3..03396822eca8 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -52,8 +52,8 @@ pub struct ParserOptions { pub enable_options_value_normalization: bool, /// Whether to collect spans pub collect_spans: bool, - /// Whether `VARCHAR` is mapped to `Utf8View` during SQL planning. - pub map_varchar_to_utf8view: bool, + /// Whether string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. 
+ pub map_string_types_to_utf8view: bool, } impl ParserOptions { @@ -72,7 +72,7 @@ impl ParserOptions { parse_float_as_decimal: false, enable_ident_normalization: true, support_varchar_with_length: true, - map_varchar_to_utf8view: true, + map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, } @@ -112,9 +112,9 @@ impl ParserOptions { self } - /// Sets the `map_varchar_to_utf8view` option. - pub fn with_map_varchar_to_utf8view(mut self, value: bool) -> Self { - self.map_varchar_to_utf8view = value; + /// Sets the `map_string_types_to_utf8view` option. + pub fn with_map_string_types_to_utf8view(mut self, value: bool) -> Self { + self.map_string_types_to_utf8view = value; self } @@ -143,7 +143,7 @@ impl From<&SqlParserOptions> for ParserOptions { parse_float_as_decimal: options.parse_float_as_decimal, enable_ident_normalization: options.enable_ident_normalization, support_varchar_with_length: options.support_varchar_with_length, - map_varchar_to_utf8view: options.map_varchar_to_utf8view, + map_string_types_to_utf8view: options.map_string_types_to_utf8view, enable_options_value_normalization: options .enable_options_value_normalization, collect_spans: options.collect_spans, @@ -577,7 +577,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { please set `support_varchar_with_length` to be true" ), _ => { - if self.options.map_varchar_to_utf8view { + if self.options.map_string_types_to_utf8view { Ok(DataType::Utf8View) } else { Ok(DataType::Utf8) @@ -601,7 +601,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ) } SQLDataType::Char(_) | SQLDataType::Text | SQLDataType::String(_) => { - Ok(DataType::Utf8) + if self.options.map_string_types_to_utf8view { + Ok(DataType::Utf8View) + } else { + Ok(DataType::Utf8) + } } SQLDataType::Timestamp(precision, tz_info) if precision.is_none() || [0, 3, 6, 9].contains(&precision.unwrap()) => diff --git a/datafusion/sql/tests/cases/params.rs b/datafusion/sql/tests/cases/params.rs index 
b3cc49c31071..15e7d923a91a 100644 --- a/datafusion/sql/tests/cases/params.rs +++ b/datafusion/sql/tests/cases/params.rs @@ -746,31 +746,31 @@ fn test_prepare_statement_to_plan_multi_params() { assert_snapshot!( plan, @r#" - Prepare: "my_plan" [Int32, Utf8, Float64, Int32, Float64, Utf8] + Prepare: "my_plan" [Int32, Utf8View, Float64, Int32, Float64, Utf8View] Projection: person.id, person.age, $6 Filter: person.age IN ([$1, $4]) AND person.salary > $3 AND person.salary < $5 OR person.first_name < $2 TableScan: person "# ); - assert_snapshot!(dt, @r#"[Int32, Utf8, Float64, Int32, Float64, Utf8]"#); + assert_snapshot!(dt, @r#"[Int32, Utf8View, Float64, Int32, Float64, Utf8View]"#); /////////////////// // replace params with values let param_values = vec![ ScalarValue::Int32(Some(10)), - ScalarValue::from("abc"), + ScalarValue::Utf8View(Some("abc".into())), ScalarValue::Float64(Some(100.0)), ScalarValue::Int32(Some(20)), ScalarValue::Float64(Some(200.0)), - ScalarValue::from("xyz"), + ScalarValue::Utf8View(Some("xyz".into())), ]; let plan_with_params = plan.with_param_values(param_values).unwrap(); assert_snapshot!( plan_with_params, @r#" - Projection: person.id, person.age, Utf8("xyz") AS $6 - Filter: person.age IN ([Int32(10), Int32(20)]) AND person.salary > Float64(100) AND person.salary < Float64(200) OR person.first_name < Utf8("abc") + Projection: person.id, person.age, Utf8View("xyz") AS $6 + Filter: person.age IN ([Int32(10), Int32(20)]) AND person.salary > Float64(100) AND person.salary < Float64(200) OR person.first_name < Utf8View("abc") TableScan: person "# ); diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 4be7953aefc0..c82239d9b455 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -3355,7 +3355,7 @@ fn parse_decimals_parser_options() -> ParserOptions { parse_float_as_decimal: true, enable_ident_normalization: false, support_varchar_with_length: false, 
- map_varchar_to_utf8view: true, + map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, } @@ -3366,7 +3366,7 @@ fn ident_normalization_parser_options_no_ident_normalization() -> ParserOptions parse_float_as_decimal: true, enable_ident_normalization: false, support_varchar_with_length: false, - map_varchar_to_utf8view: true, + map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, } @@ -3377,7 +3377,7 @@ fn ident_normalization_parser_options_ident_normalization() -> ParserOptions { parse_float_as_decimal: true, enable_ident_normalization: true, support_varchar_with_length: false, - map_varchar_to_utf8view: true, + map_string_types_to_utf8view: true, enable_options_value_normalization: false, collect_spans: false, } diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index bd9d8b1f43d2..f9dc872a3c5e 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -2278,10 +2278,10 @@ create table t (c string) as values query T select arrow_typeof(c) from t; ---- -Utf8 -Utf8 -Utf8 -Utf8 +Utf8View +Utf8View +Utf8View +Utf8View query IT select count(c), arrow_typeof(count(c)) from t; diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 0139daecca1f..a2640fa988ce 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -6073,7 +6073,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN 
([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6082,7 +6082,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 
09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6102,7 +6102,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6111,7 +6111,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { 
value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6131,7 +6131,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6140,7 +6140,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, 
nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6162,7 +6162,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: array_has(LargeList([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32))) +06)----------Filter: array_has(LargeList([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c]), substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32))) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6171,7 +6171,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: 
array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c], substr(md5(CAST(value@0 AS Utf8)), 1, 32)) +07)------------FilterExec: array_has([7f4b18de3cfeb9b4ac78c381ee2ad278, a, b, c], substr(md5(CAST(value@0 AS Utf8View)), 1, 32)) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6191,7 +6191,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IN ([Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), Utf8View("a"), Utf8View("b"), Utf8View("c")]) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6200,7 +6200,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: 
{} } }]) +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IN ([Literal { value: Utf8View("7f4b18de3cfeb9b4ac78c381ee2ad278"), field: Field { name: "7f4b18de3cfeb9b4ac78c381ee2ad278", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("a"), field: Field { name: "a", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("b"), field: Field { name: "b", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, Literal { value: Utf8View("c"), field: Field { name: "c", data_type: Utf8View, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }]) 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -6222,7 +6222,7 @@ logical_plan 03)----SubqueryAlias: test 04)------SubqueryAlias: t 05)--------Projection: -06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8)) AS Utf8), Int64(1), Int64(32)) IS NOT NULL OR Boolean(NULL) +06)----------Filter: substr(CAST(md5(CAST(tmp_table.value AS Utf8View)) AS Utf8View), Int64(1), Int64(32)) IS NOT NULL OR Boolean(NULL) 07)------------TableScan: tmp_table projection=[value] physical_plan 01)ProjectionExec: expr=[count(Int64(1))@0 as count(*)] @@ -6231,7 +6231,7 @@ physical_plan 04)------AggregateExec: mode=Partial, gby=[], aggr=[count(Int64(1))] 05)--------ProjectionExec: expr=[] 06)----------CoalesceBatchesExec: target_batch_size=8192 -07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8)), 1, 32) IS NOT NULL OR NULL +07)------------FilterExec: substr(md5(CAST(value@0 AS Utf8View)), 1, 32) IS NOT NULL OR NULL 08)--------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 09)----------------LazyMemoryExec: partitions=1, 
batch_generators=[generate_series: start=1, end=100000, batch_size=8192] @@ -7913,7 +7913,7 @@ List(Field { name: "item", data_type: List(Field { name: "item", data_type: Int3 query ??T select [1,2,3]::int[], [['1']]::int[][], arrow_typeof([]::text[]); ---- -[1, 2, 3] [[1]] List(Field { name: "item", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) +[1, 2, 3] [[1]] List(Field { name: "item", data_type: Utf8View, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }) # test empty arrays return length # issue: https://github.com/apache/datafusion/pull/12459 diff --git a/datafusion/sqllogictest/test_files/arrow_files.slt b/datafusion/sqllogictest/test_files/arrow_files.slt index 30f322cf98fc..62453ec4bf3e 100644 --- a/datafusion/sqllogictest/test_files/arrow_files.slt +++ b/datafusion/sqllogictest/test_files/arrow_files.slt @@ -19,6 +19,11 @@ ## Arrow Files Format support ############# +# These sqllogictests use a fixed Arrow file that was written with the arrow-ipc Utf8 type, +# so its string columns are decoded as Utf8 when the file is read back. +# Therefore we disable map_string_types_to_utf8view for this file. +statement ok +set datafusion.sql_parser.map_string_types_to_utf8view = false; statement ok diff --git a/datafusion/sqllogictest/test_files/avro.slt b/datafusion/sqllogictest/test_files/avro.slt index 4573af1d59b1..2ad60c0082e8 100644 --- a/datafusion/sqllogictest/test_files/avro.slt +++ b/datafusion/sqllogictest/test_files/avro.slt @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License.
-# Currently, the avro not support Utf8View type, so we disable the map_varchar_to_utf8view +# Currently, the avro not support Utf8View type, so we disable the map_string_types_to_utf8view # After https://github.com/apache/arrow-rs/issues/7262 released, we can remove this setting statement ok -set datafusion.sql_parser.map_varchar_to_utf8view = false; +set datafusion.sql_parser.map_string_types_to_utf8view = false; statement ok CREATE EXTERNAL TABLE alltypes_plain ( diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index 1e95e426f3e0..81f2955eff49 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -828,7 +828,7 @@ drop table table_with_pk; statement ok set datafusion.catalog.information_schema = false; -# Test VARCHAR is mapped to Utf8View during SQL planning when setting map_varchar_to_utf8view to true +# Test VARCHAR is mapped to Utf8View during SQL planning when setting map_string_types_to_utf8view to true statement ok CREATE TABLE t1(c1 VARCHAR(10) NOT NULL, c2 VARCHAR); @@ -839,7 +839,7 @@ c1 Utf8View NO c2 Utf8View YES statement ok -set datafusion.sql_parser.map_varchar_to_utf8view = true; +set datafusion.sql_parser.map_string_types_to_utf8view = true; statement ok CREATE TABLE t2(c1 VARCHAR(10) NOT NULL, c2 VARCHAR); diff --git a/datafusion/sqllogictest/test_files/explain_tree.slt b/datafusion/sqllogictest/test_files/explain_tree.slt index 15bf61576571..8096c8cacf4c 100644 --- a/datafusion/sqllogictest/test_files/explain_tree.slt +++ b/datafusion/sqllogictest/test_files/explain_tree.slt @@ -280,7 +280,7 @@ physical_plan 06)┌─────────────┴─────────────┐ 07)│ DataSourceExec │ 08)│ -------------------- │ -09)│ bytes: 3120 │ +09)│ bytes: 1072 │ 10)│ format: memory │ 11)│ rows: 2 │ 12)└───────────────────────────┘ @@ -367,7 +367,7 @@ physical_plan 21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 22)│ DataSourceExec ││ CoalesceBatchesExec │ 23)│ 
-------------------- ││ -------------------- │ -24)│ bytes: 1560 ││ target_batch_size: │ +24)│ bytes: 536 ││ target_batch_size: │ 25)│ format: memory ││ 8192 │ 26)│ rows: 1 ││ │ 27)└───────────────────────────┘└─────────────┬─────────────┘ @@ -669,7 +669,7 @@ physical_plan 13)┌─────────────┴─────────────┐ 14)│ DataSourceExec │ 15)│ -------------------- │ -16)│ bytes: 1560 │ +16)│ bytes: 536 │ 17)│ format: memory │ 18)│ rows: 1 │ 19)└───────────────────────────┘ @@ -1065,7 +1065,7 @@ physical_plan 13)┌─────────────┴─────────────┐ 14)│ DataSourceExec │ 15)│ -------------------- │ -16)│ bytes: 1560 │ +16)│ bytes: 536 │ 17)│ format: memory │ 18)│ rows: 1 │ 19)└───────────────────────────┘ @@ -1195,60 +1195,42 @@ physical_plan 08)│ HashJoinExec │ 09)│ -------------------- │ 10)│ on: │ -11)│ (int_col = int_col), (CAST├──────────────┐ -12)│ (table1.string_col AS │ │ -13)│ Utf8View) = │ │ -14)│ string_col) │ │ -15)└─────────────┬─────────────┘ │ -16)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -17)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ -18)│ -------------------- ││ -------------------- │ -19)│ target_batch_size: ││ target_batch_size: │ -20)│ 8192 ││ 8192 │ -21)└─────────────┬─────────────┘└─────────────┬─────────────┘ -22)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -23)│ RepartitionExec ││ RepartitionExec │ -24)│ -------------------- ││ -------------------- │ -25)│ partition_count(in->out): ││ partition_count(in->out): │ -26)│ 4 -> 4 ││ 4 -> 4 │ -27)│ ││ │ -28)│ partitioning_scheme: ││ partitioning_scheme: │ -29)│ Hash([int_col@0, CAST ││ Hash([int_col@0, │ -30)│ (table1.string_col ││ string_col@1], │ -31)│ AS Utf8View)@4], 4) ││ 4) │ -32)└─────────────┬─────────────┘└─────────────┬─────────────┘ -33)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -34)│ ProjectionExec ││ RepartitionExec │ -35)│ -------------------- ││ -------------------- │ -36)│ CAST(table1.string_col AS ││ partition_count(in->out): │ -37)│ Utf8View): ││ 1 
-> 4 │ -38)│ CAST(string_col AS ││ │ -39)│ Utf8View) ││ partitioning_scheme: │ -40)│ ││ RoundRobinBatch(4) │ -41)│ bigint_col: ││ │ -42)│ bigint_col ││ │ -43)│ ││ │ -44)│ date_col: date_col ││ │ -45)│ int_col: int_col ││ │ -46)│ ││ │ -47)│ string_col: ││ │ -48)│ string_col ││ │ -49)└─────────────┬─────────────┘└─────────────┬─────────────┘ -50)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -51)│ RepartitionExec ││ DataSourceExec │ -52)│ -------------------- ││ -------------------- │ -53)│ partition_count(in->out): ││ files: 1 │ -54)│ 1 -> 4 ││ format: parquet │ -55)│ ││ │ -56)│ partitioning_scheme: ││ │ -57)│ RoundRobinBatch(4) ││ │ -58)└─────────────┬─────────────┘└───────────────────────────┘ -59)┌─────────────┴─────────────┐ -60)│ DataSourceExec │ -61)│ -------------------- │ -62)│ files: 1 │ -63)│ format: csv │ -64)└───────────────────────────┘ +11)│ (int_col = int_col), ├──────────────┐ +12)│ (string_col = │ │ +13)│ string_col) │ │ +14)└─────────────┬─────────────┘ │ +15)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +16)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ +17)│ -------------------- ││ -------------------- │ +18)│ target_batch_size: ││ target_batch_size: │ +19)│ 8192 ││ 8192 │ +20)└─────────────┬─────────────┘└─────────────┬─────────────┘ +21)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +22)│ RepartitionExec ││ RepartitionExec │ +23)│ -------------------- ││ -------------------- │ +24)│ partition_count(in->out): ││ partition_count(in->out): │ +25)│ 4 -> 4 ││ 4 -> 4 │ +26)│ ││ │ +27)│ partitioning_scheme: ││ partitioning_scheme: │ +28)│ Hash([int_col@0, ││ Hash([int_col@0, │ +29)│ string_col@1], ││ string_col@1], │ +30)│ 4) ││ 4) │ +31)└─────────────┬─────────────┘└─────────────┬─────────────┘ +32)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +33)│ RepartitionExec ││ RepartitionExec │ +34)│ -------------------- ││ -------------------- │ +35)│ partition_count(in->out): ││ partition_count(in->out): │ 
+36)│ 1 -> 4 ││ 1 -> 4 │ +37)│ ││ │ +38)│ partitioning_scheme: ││ partitioning_scheme: │ +39)│ RoundRobinBatch(4) ││ RoundRobinBatch(4) │ +40)└─────────────┬─────────────┘└─────────────┬─────────────┘ +41)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +42)│ DataSourceExec ││ DataSourceExec │ +43)│ -------------------- ││ -------------------- │ +44)│ files: 1 ││ files: 1 │ +45)│ format: csv ││ format: parquet │ +46)└───────────────────────────┘└───────────────────────────┘ # Query with outer hash join. query TT @@ -1267,60 +1249,42 @@ physical_plan 10)│ join_type: Left │ 11)│ │ 12)│ on: ├──────────────┐ -13)│ (int_col = int_col), (CAST│ │ -14)│ (table1.string_col AS │ │ -15)│ Utf8View) = │ │ -16)│ string_col) │ │ -17)└─────────────┬─────────────┘ │ -18)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -19)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ -20)│ -------------------- ││ -------------------- │ -21)│ target_batch_size: ││ target_batch_size: │ -22)│ 8192 ││ 8192 │ -23)└─────────────┬─────────────┘└─────────────┬─────────────┘ -24)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -25)│ RepartitionExec ││ RepartitionExec │ -26)│ -------------------- ││ -------------------- │ -27)│ partition_count(in->out): ││ partition_count(in->out): │ -28)│ 4 -> 4 ││ 4 -> 4 │ -29)│ ││ │ -30)│ partitioning_scheme: ││ partitioning_scheme: │ -31)│ Hash([int_col@0, CAST ││ Hash([int_col@0, │ -32)│ (table1.string_col ││ string_col@1], │ -33)│ AS Utf8View)@4], 4) ││ 4) │ -34)└─────────────┬─────────────┘└─────────────┬─────────────┘ -35)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -36)│ ProjectionExec ││ RepartitionExec │ -37)│ -------------------- ││ -------------------- │ -38)│ CAST(table1.string_col AS ││ partition_count(in->out): │ -39)│ Utf8View): ││ 1 -> 4 │ -40)│ CAST(string_col AS ││ │ -41)│ Utf8View) ││ partitioning_scheme: │ -42)│ ││ RoundRobinBatch(4) │ -43)│ bigint_col: ││ │ -44)│ bigint_col ││ │ -45)│ ││ │ -46)│ date_col: 
date_col ││ │ -47)│ int_col: int_col ││ │ -48)│ ││ │ -49)│ string_col: ││ │ -50)│ string_col ││ │ -51)└─────────────┬─────────────┘└─────────────┬─────────────┘ -52)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ -53)│ RepartitionExec ││ DataSourceExec │ -54)│ -------------------- ││ -------------------- │ -55)│ partition_count(in->out): ││ files: 1 │ -56)│ 1 -> 4 ││ format: parquet │ -57)│ ││ │ -58)│ partitioning_scheme: ││ │ -59)│ RoundRobinBatch(4) ││ │ -60)└─────────────┬─────────────┘└───────────────────────────┘ -61)┌─────────────┴─────────────┐ -62)│ DataSourceExec │ -63)│ -------------------- │ -64)│ files: 1 │ -65)│ format: csv │ -66)└───────────────────────────┘ +13)│ (int_col = int_col), │ │ +14)│ (string_col = │ │ +15)│ string_col) │ │ +16)└─────────────┬─────────────┘ │ +17)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +18)│ CoalesceBatchesExec ││ CoalesceBatchesExec │ +19)│ -------------------- ││ -------------------- │ +20)│ target_batch_size: ││ target_batch_size: │ +21)│ 8192 ││ 8192 │ +22)└─────────────┬─────────────┘└─────────────┬─────────────┘ +23)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +24)│ RepartitionExec ││ RepartitionExec │ +25)│ -------------------- ││ -------------------- │ +26)│ partition_count(in->out): ││ partition_count(in->out): │ +27)│ 4 -> 4 ││ 4 -> 4 │ +28)│ ││ │ +29)│ partitioning_scheme: ││ partitioning_scheme: │ +30)│ Hash([int_col@0, ││ Hash([int_col@0, │ +31)│ string_col@1], ││ string_col@1], │ +32)│ 4) ││ 4) │ +33)└─────────────┬─────────────┘└─────────────┬─────────────┘ +34)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +35)│ RepartitionExec ││ RepartitionExec │ +36)│ -------------------- ││ -------------------- │ +37)│ partition_count(in->out): ││ partition_count(in->out): │ +38)│ 1 -> 4 ││ 1 -> 4 │ +39)│ ││ │ +40)│ partitioning_scheme: ││ partitioning_scheme: │ +41)│ RoundRobinBatch(4) ││ RoundRobinBatch(4) │ 
+42)└─────────────┬─────────────┘└─────────────┬─────────────┘ +43)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ +44)│ DataSourceExec ││ DataSourceExec │ +45)│ -------------------- ││ -------------------- │ +46)│ files: 1 ││ files: 1 │ +47)│ format: csv ││ format: parquet │ +48)└───────────────────────────┘└───────────────────────────┘ # Query with nested loop join. query TT @@ -1529,7 +1493,7 @@ physical_plan 57)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 58)│ DataSourceExec ││ DataSourceExec │ 59)│ -------------------- ││ -------------------- │ -60)│ bytes: 1320 ││ bytes: 1312 │ +60)│ bytes: 296 ││ bytes: 288 │ 61)│ format: memory ││ format: memory │ 62)│ rows: 1 ││ rows: 1 │ 63)└───────────────────────────┘└───────────────────────────┘ @@ -1548,14 +1512,14 @@ physical_plan 04)┌─────────────┴─────────────┐┌─────────────┴─────────────┐ 05)│ DataSourceExec ││ ProjectionExec │ 06)│ -------------------- ││ -------------------- │ -07)│ bytes: 1320 ││ id: CAST(id AS Int32) │ +07)│ bytes: 296 ││ id: CAST(id AS Int32) │ 08)│ format: memory ││ name: name │ 09)│ rows: 1 ││ │ 10)└───────────────────────────┘└─────────────┬─────────────┘ 11)-----------------------------┌─────────────┴─────────────┐ 12)-----------------------------│ DataSourceExec │ 13)-----------------------------│ -------------------- │ -14)-----------------------------│ bytes: 1312 │ +14)-----------------------------│ bytes: 288 │ 15)-----------------------------│ format: memory │ 16)-----------------------------│ rows: 1 │ 17)-----------------------------└───────────────────────────┘ diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index dc8b7680d83e..f8c86df02453 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -308,7 +308,7 @@ datafusion.sql_parser.collect_spans false datafusion.sql_parser.dialect generic 
datafusion.sql_parser.enable_ident_normalization true datafusion.sql_parser.enable_options_value_normalization false -datafusion.sql_parser.map_varchar_to_utf8view true +datafusion.sql_parser.map_string_types_to_utf8view true datafusion.sql_parser.parse_float_as_decimal false datafusion.sql_parser.recursion_limit 50 datafusion.sql_parser.support_varchar_with_length true @@ -419,7 +419,7 @@ datafusion.sql_parser.collect_spans false When set to true, the source locations datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. -datafusion.sql_parser.map_varchar_to_utf8view true If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. If false, `VARCHAR` is mapped to `Utf8` during SQL planning. Default is false. +datafusion.sql_parser.map_string_types_to_utf8view true If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true. datafusion.sql_parser.parse_float_as_decimal false When set to true, SQL parser will parse float as decimal type datafusion.sql_parser.recursion_limit 50 Specifies the recursion depth limit when parsing complex SQL Queries datafusion.sql_parser.support_varchar_with_length true If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. 
The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. diff --git a/datafusion/sqllogictest/test_files/monotonic_projection_test.slt b/datafusion/sqllogictest/test_files/monotonic_projection_test.slt index e8700b1fea27..9c806cfa0d8a 100644 --- a/datafusion/sqllogictest/test_files/monotonic_projection_test.slt +++ b/datafusion/sqllogictest/test_files/monotonic_projection_test.slt @@ -129,12 +129,12 @@ ORDER BY a_str ASC, b ASC; ---- logical_plan 01)Sort: a_str ASC NULLS LAST, multiple_ordered_table.b ASC NULLS LAST -02)--Projection: CAST(multiple_ordered_table.a AS Utf8) AS a_str, multiple_ordered_table.b +02)--Projection: CAST(multiple_ordered_table.a AS Utf8View) AS a_str, multiple_ordered_table.b 03)----TableScan: multiple_ordered_table projection=[a, b] physical_plan 01)SortPreservingMergeExec: [a_str@0 ASC NULLS LAST, b@1 ASC NULLS LAST] 02)--SortExec: expr=[a_str@0 ASC NULLS LAST, b@1 ASC NULLS LAST], preserve_partitioning=[true] -03)----ProjectionExec: expr=[CAST(a@0 AS Utf8) as a_str, b@1 as b] +03)----ProjectionExec: expr=[CAST(a@0 AS Utf8View) as a_str, b@1 as b] 04)------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 05)--------DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/window_2.csv]]}, projection=[a, b], output_ordering=[a@0 ASC NULLS LAST, b@1 ASC NULLS LAST], file_type=csv, has_header=true diff --git a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt index e5b5f5ac878a..5c0419b69d76 100644 --- a/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt +++ b/datafusion/sqllogictest/test_files/parquet_filter_pushdown.slt @@ -219,7 +219,7 @@ physical_plan query TT EXPLAIN select * from t_pushdown where part != 'a'; ---- -logical_plan TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part != Utf8("a")] +logical_plan TableScan: 
t_pushdown projection=[val, part], full_filters=[t_pushdown.part != Utf8View("a")] physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet # And if we reference only a file column it gets pushed down @@ -227,8 +227,8 @@ query TT EXPLAIN select * from t_pushdown where val != 'c'; ---- logical_plan -01)Filter: t_pushdown.val != Utf8("c") -02)--TableScan: t_pushdown projection=[val, part], partial_filters=[t_pushdown.val != Utf8("c")] +01)Filter: t_pushdown.val != Utf8View("c") +02)--TableScan: t_pushdown projection=[val, part], partial_filters=[t_pushdown.val != Utf8View("c")] physical_plan DataSourceExec: file_groups={3 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=a/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=b/file.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_filter_pushdown/parquet_part_test/part=c/file.parquet]]}, projection=[val, part], file_type=parquet, predicate=val@0 != c, pruning_predicate=val_null_count@2 != row_count@3 AND (val_min@0 != c OR c != val_max@1), required_guarantees=[val not in (c)] # If we have a mix of filters: @@ -239,8 +239,8 @@ query TT EXPLAIN select * from t_pushdown where val != 'd' AND val != 'c' AND part = 'a' AND part != val; ---- logical_plan -01)Filter: t_pushdown.val != Utf8("d") AND t_pushdown.val != Utf8("c") AND t_pushdown.val != t_pushdown.part -02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8("a")], partial_filters=[t_pushdown.val != Utf8("d"), t_pushdown.val != Utf8("c"), t_pushdown.val != t_pushdown.part] +01)Filter: 
t_pushdown.val != Utf8View("d") AND t_pushdown.val != Utf8View("c") AND t_pushdown.val != t_pushdown.part +02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val != Utf8View("d"), t_pushdown.val != Utf8View("c"), t_pushdown.val != t_pushdown.part] physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: val@0 != part@1 @@ -253,7 +253,7 @@ EXPLAIN select val, part from t_pushdown where part = 'a' AND part = val; ---- logical_plan 01)Filter: t_pushdown.val = t_pushdown.part -02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8("a")], partial_filters=[t_pushdown.val = t_pushdown.part] +02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val = t_pushdown.part] physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: val@0 = part@1 @@ -270,7 +270,7 @@ EXPLAIN select val, part from t_pushdown where part = val AND part = 'a'; ---- logical_plan 01)Filter: t_pushdown.val = t_pushdown.part -02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8("a")], partial_filters=[t_pushdown.val = t_pushdown.part] +02)--TableScan: t_pushdown projection=[val, part], full_filters=[t_pushdown.part = Utf8View("a")], partial_filters=[t_pushdown.val = t_pushdown.part] physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: val@0 = part@1 @@ -280,4 +280,4 @@ physical_plan query TT select val, part from t_pushdown where part = val AND part = 'a'; ---- -a a \ No newline at end of file +a a diff --git a/datafusion/sqllogictest/test_files/push_down_filter.slt b/datafusion/sqllogictest/test_files/push_down_filter.slt index ed948dd11439..a0d319332462 100644 --- a/datafusion/sqllogictest/test_files/push_down_filter.slt +++ b/datafusion/sqllogictest/test_files/push_down_filter.slt @@ -265,7 +265,7 @@ physical_plan 
DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/ query TT explain select a from t where CAST(a AS string) = '0123'; ---- -physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8) = 0123 +physical_plan DataSourceExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/push_down_filter/t.parquet]]}, projection=[a], file_type=parquet, predicate=CAST(a@0 AS Utf8View) = 0123 statement ok diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index f583d659fd4f..ca0b472de9e0 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1832,7 +1832,7 @@ query TT EXPLAIN SELECT letter, letter = LEFT('APACHE', 1) FROM simple_string; ---- logical_plan -01)Projection: simple_string.letter, simple_string.letter = Utf8("A") AS simple_string.letter = left(Utf8("APACHE"),Int64(1)) +01)Projection: simple_string.letter, simple_string.letter = Utf8View("A") AS simple_string.letter = left(Utf8("APACHE"),Int64(1)) 02)--TableScan: simple_string projection=[letter] physical_plan 01)ProjectionExec: expr=[letter@0 as letter, letter@0 = A as simple_string.letter = left(Utf8("APACHE"),Int64(1))] @@ -1851,10 +1851,10 @@ query TT EXPLAIN SELECT letter, letter = LEFT(letter2, 1) FROM simple_string; ---- logical_plan -01)Projection: simple_string.letter, simple_string.letter = left(simple_string.letter2, Int64(1)) +01)Projection: simple_string.letter, simple_string.letter = CAST(left(simple_string.letter2, Int64(1)) AS Utf8View) 02)--TableScan: simple_string projection=[letter, letter2] physical_plan -01)ProjectionExec: expr=[letter@0 as letter, letter@0 = left(letter2@1, 1) as simple_string.letter = left(simple_string.letter2,Int64(1))] +01)ProjectionExec: expr=[letter@0 as letter, letter@0 = 
CAST(left(letter2@1, 1) AS Utf8View) as simple_string.letter = left(simple_string.letter2,Int64(1))] 02)--DataSourceExec: partitions=1, partition_sizes=[1] query TB diff --git a/datafusion/sqllogictest/test_files/simplify_expr.slt b/datafusion/sqllogictest/test_files/simplify_expr.slt index 075ccafcfd2e..c77163dc996d 100644 --- a/datafusion/sqllogictest/test_files/simplify_expr.slt +++ b/datafusion/sqllogictest/test_files/simplify_expr.slt @@ -35,22 +35,22 @@ query TT explain select b from t where b ~ '.*' ---- logical_plan -01)Filter: t.b IS NOT NULL +01)Filter: t.b ~ Utf8View(".*") 02)--TableScan: t projection=[b] physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 -02)--FilterExec: b@0 IS NOT NULL +02)--FilterExec: b@0 ~ .* 03)----DataSourceExec: partitions=1, partition_sizes=[1] query TT explain select b from t where b !~ '.*' ---- logical_plan -01)Filter: t.b = Utf8("") +01)Filter: t.b !~ Utf8View(".*") 02)--TableScan: t projection=[b] physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 -02)--FilterExec: b@0 = +02)--FilterExec: b@0 !~ .* 03)----DataSourceExec: partitions=1, partition_sizes=[1] query T diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index d549f555f9d8..f901a4d373a3 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -230,7 +230,7 @@ logical_plan 02)--Union 03)----TableScan: t1 projection=[name] 04)----TableScan: t2 projection=[name] -05)----Projection: t2.name || Utf8("_new") AS name +05)----Projection: t2.name || Utf8View("_new") AS name 06)------TableScan: t2 projection=[name] physical_plan 01)AggregateExec: mode=FinalPartitioned, gby=[name@0 as name], aggr=[] @@ -266,7 +266,7 @@ logical_plan 01)Union 02)--TableScan: t1 projection=[name] 03)--TableScan: t2 projection=[name] -04)--Projection: t2.name || Utf8("_new") AS name +04)--Projection: t2.name || Utf8View("_new") AS name 05)----TableScan: t2 
projection=[name] physical_plan 01)UnionExec diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 42282e39e41f..5c80cbd563c2 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -131,7 +131,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. | | datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, Ansi, DuckDB and Databricks. | | datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. | -| datafusion.sql_parser.map_varchar_to_utf8view | true | If true, `VARCHAR` is mapped to `Utf8View` during SQL planning. If false, `VARCHAR` is mapped to `Utf8` during SQL planning. Default is false. | +| datafusion.sql_parser.map_string_types_to_utf8view | true | If true, string types (VARCHAR, CHAR, Text, and String) are mapped to `Utf8View` during SQL planning. If false, they are mapped to `Utf8`. Default is true. | | datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](https://docs.rs/sqlparser/latest/sqlparser/tokenizer/struct.Span.html)) will be collected and recorded in the logical plan nodes. 
| | datafusion.sql_parser.recursion_limit | 50 | Specifies the recursion depth limit when parsing complex SQL Queries | | datafusion.format.safe | true | If set to `true` any formatting errors will be written to the output instead of being converted into a [`std::fmt::Error`] |