From a7ad13aed2ee39b2a62676af417ab35220bf6b88 Mon Sep 17 00:00:00 2001 From: Mehmet Ozan Kabak Date: Mon, 13 May 2024 12:31:11 +0300 Subject: [PATCH 1/2] Don't require quotations for simple namespaced keys like foo.bar --- datafusion/common/src/config.rs | 65 ++++++++----------- datafusion/core/src/execution/context/mod.rs | 24 ++++--- .../tests/cases/roundtrip_logical_plan.rs | 18 ++--- datafusion/sql/src/parser.rs | 21 ++++-- .../test_files/create_external_table.slt | 21 ++++-- .../test_files/tpch/create_tables.slt.part | 2 +- 6 files changed, 81 insertions(+), 70 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index c60f843393f8..dee82890a003 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -130,9 +130,9 @@ macro_rules! config_namespace { $( stringify!($field_name) => self.$field_name.set(rem, value), )* - _ => return Err(DataFusionError::Configuration(format!( + _ => return _config_err!( "Config value \"{}\" not found on {}", key, stringify!($struct_name) - ))) + ) } } @@ -675,22 +675,17 @@ impl ConfigOptions { /// Set a configuration option pub fn set(&mut self, key: &str, value: &str) -> Result<()> { - let (prefix, key) = key.split_once('.').ok_or_else(|| { - DataFusionError::Configuration(format!( - "could not find config namespace for key \"{key}\"", - )) - })?; + let Some((prefix, key)) = key.split_once('.') else { + return _config_err!("could not find config namespace for key \"{key}\""); + }; if prefix == "datafusion" { return ConfigField::set(self, key, value); } - let e = self.extensions.0.get_mut(prefix); - let e = e.ok_or_else(|| { - DataFusionError::Configuration(format!( - "Could not find config namespace \"{prefix}\"" - )) - })?; + let Some(e) = self.extensions.0.get_mut(prefix) else { + return _config_err!("Could not find config namespace \"{prefix}\""); + }; e.0.set(key, value) } @@ -1278,22 +1273,17 @@ impl TableOptions { /// /// A result indicating success or failure in setting the configuration option. pub fn set(&mut self, key: &str, value: &str) -> Result<()> { - let (prefix, _) = key.split_once('.').ok_or_else(|| { - DataFusionError::Configuration(format!( - "could not find config namespace for key \"{key}\"" - )) - })?; + let Some((prefix, _)) = key.split_once('.') else { + return _config_err!("could not find config namespace for key \"{key}\""); + }; if prefix == "format" { return ConfigField::set(self, key, value); } - let e = self.extensions.0.get_mut(prefix); - let e = e.ok_or_else(|| { - DataFusionError::Configuration(format!( - "Could not find config namespace \"{prefix}\"" - )) - })?; + let Some(e) = self.extensions.0.get_mut(prefix) else { + return _config_err!("Could not find config namespace \"{prefix}\""); + }; e.0.set(key, value) } @@ -1412,19 +1402,19 @@ impl ConfigField for TableParquetOptions { fn set(&mut self, key: &str, value: &str) -> Result<()> { // Determine if the key is a global, metadata, or column-specific setting if key.starts_with("metadata::") { - let k = - match key.split("::").collect::>()[..] { - [_meta] | [_meta, ""] => return Err(DataFusionError::Configuration( + let k = match key.split("::").collect::>()[..] { + [_meta] | [_meta, ""] => { + return _config_err!( "Invalid metadata key provided, missing key in metadata::" - .to_string(), - )), - [_meta, k] => k.into(), - _ => { - return Err(DataFusionError::Configuration(format!( + ) + } + [_meta, k] => k.into(), + _ => { + return _config_err!( "Invalid metadata key provided, found too many '::' in \"{key}\"" - ))) - } - }; + ) + } + }; self.key_value_metadata.insert(k, Some(value.into())); Ok(()) } else if key.contains("::") { @@ -1497,10 +1487,7 @@ macro_rules! config_namespace_with_hashmap { inner_value.set(inner_key, value) } - _ => Err(DataFusionError::Configuration(format!( - "Unrecognized key '{}'.", - key - ))), + _ => _config_err!("Unrecognized key '{key}'."), } } diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index e69a249410b1..2fc1a19c3386 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -23,6 +23,8 @@ use std::ops::ControlFlow; use std::sync::{Arc, Weak}; use super::options::ReadOptions; +#[cfg(feature = "array_expressions")] +use crate::functions_array; use crate::{ catalog::information_schema::{InformationSchemaProvider, INFORMATION_SCHEMA}, catalog::listing_schema::ListingSchemaProvider, @@ -53,53 +55,49 @@ use crate::{ }, optimizer::analyzer::{Analyzer, AnalyzerRule}, optimizer::optimizer::{Optimizer, OptimizerConfig, OptimizerRule}, + physical_expr::{create_physical_expr, PhysicalExpr}, physical_optimizer::optimizer::{PhysicalOptimizer, PhysicalOptimizerRule}, physical_plan::ExecutionPlan, physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner}, variable::{VarProvider, VarType}, }; - -#[cfg(feature = "array_expressions")] -use crate::functions_array; use crate::{functions, functions_aggregate}; use arrow::datatypes::{DataType, SchemaRef}; use arrow::record_batch::RecordBatch; use arrow_schema::Schema; -use async_trait::async_trait; -use chrono::{DateTime, Utc}; -use datafusion_common::tree_node::TreeNode; use datafusion_common::{ alias::AliasGenerator, config::{ConfigExtension, TableOptions}, exec_err, not_impl_err, plan_datafusion_err, plan_err, - tree_node::{TreeNodeRecursion, TreeNodeVisitor}, + tree_node::{TreeNode, TreeNodeRecursion, TreeNodeVisitor}, DFSchema, SchemaReference, TableReference, }; use datafusion_execution::registry::SerializerRegistry; use datafusion_expr::{ + expr_rewriter::FunctionRewrite, logical_plan::{DdlStatement, Statement}, + simplify::SimplifyInfo, var_provider::is_system_variables, Expr, ExprSchemable, StringifiedPlan, UserDefinedLogicalNode, WindowUDF, }; +use datafusion_optimizer::simplify_expressions::ExprSimplifier; use datafusion_sql::{ parser::{CopyToSource, CopyToStatement, DFParser}, planner::{object_name_to_table_reference, ContextProvider, ParserOptions, SqlToRel}, ResolvedTableReference, }; -use parking_lot::RwLock; use sqlparser::dialect::dialect_from_str; + +use async_trait::async_trait; +use chrono::{DateTime, Utc}; +use parking_lot::RwLock; use url::Url; use uuid::Uuid; -use crate::physical_expr::PhysicalExpr; pub use datafusion_execution::config::SessionConfig; pub use datafusion_execution::TaskContext; pub use datafusion_expr::execution_props::ExecutionProps; -use datafusion_expr::expr_rewriter::FunctionRewrite; -use datafusion_expr::simplify::SimplifyInfo; -use datafusion_optimizer::simplify_expressions::ExprSimplifier; -use datafusion_physical_expr::create_physical_expr; mod avro; mod csv; diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index e5e57c0bc893..2927fd01d1b3 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -15,6 +15,12 @@ // specific language governing permissions and limitations // under the License. +use std::any::Any; +use std::collections::HashMap; +use std::fmt::{self, Debug, Formatter}; +use std::sync::Arc; +use std::vec; + use arrow::array::{ArrayRef, FixedSizeListArray}; use arrow::datatypes::{ DataType, Field, Fields, Int32Type, IntervalDayTimeType, IntervalMonthDayNanoType, @@ -24,6 +30,7 @@ use datafusion::datasource::provider::TableProviderFactory; use datafusion::datasource::TableProvider; use datafusion::execution::context::SessionState; use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; +use datafusion::execution::FunctionRegistry; use datafusion::functions_aggregate::covariance::{covar_pop, covar_samp}; use datafusion::functions_aggregate::expr_fn::first_value; use datafusion::prelude::*; @@ -51,16 +58,11 @@ use datafusion_proto::bytes::{ logical_plan_to_bytes, logical_plan_to_bytes_with_extension_codec, }; use datafusion_proto::logical_plan::to_proto::serialize_expr; -use datafusion_proto::logical_plan::LogicalExtensionCodec; -use datafusion_proto::logical_plan::{from_proto, DefaultLogicalExtensionCodec}; +use datafusion_proto::logical_plan::{ + from_proto, DefaultLogicalExtensionCodec, LogicalExtensionCodec, +}; use datafusion_proto::protobuf; -use std::any::Any; -use std::collections::HashMap; -use std::fmt::{self, Debug, Formatter}; -use std::sync::Arc; -use std::vec; -use datafusion::execution::FunctionRegistry; use prost::Message; #[cfg(feature = "json")] diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index f61c9cda6345..4ffdfa59cd2d 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -462,7 +462,18 @@ impl<'a> DFParser<'a> { pub fn parse_option_key(&mut self) -> Result { let next_token = self.parser.next_token(); match next_token.token { - Token::Word(Word { value, .. }) => Ok(value), + Token::Word(Word { value, .. }) => { + let mut parts = vec![value]; + while self.parser.consume_token(&Token::Period) { + let next_token = self.parser.next_token(); + if let Token::Word(Word { value, .. }) = next_token.token { + parts.push(value); + } else { + return self.parser.expected("key name", next_token); + } + } + Ok(parts.join(".")) + } Token::SingleQuotedString(s) => Ok(s), Token::DoubleQuotedString(s) => Ok(s), Token::EscapedStringLiteral(s) => Ok(s), @@ -712,15 +723,15 @@ impl<'a> DFParser<'a> { } else { self.parser.expect_keyword(Keyword::HEADER)?; self.parser.expect_keyword(Keyword::ROW)?; - return parser_err!("WITH HEADER ROW clause is no longer in use. Please use the OPTIONS clause with 'format.has_header' set appropriately, e.g., OPTIONS ('format.has_header' 'true')"); + return parser_err!("WITH HEADER ROW clause is no longer in use. Please use the OPTIONS clause with 'format.has_header' set appropriately, e.g., OPTIONS (format.has_header true)"); } } Keyword::DELIMITER => { - return parser_err!("DELIMITER clause is no longer in use. Please use the OPTIONS clause with 'format.delimiter' set appropriately, e.g., OPTIONS ('format.delimiter' ',')"); + return parser_err!("DELIMITER clause is no longer in use. Please use the OPTIONS clause with 'format.delimiter' set appropriately, e.g., OPTIONS (format.delimiter ',')"); } Keyword::COMPRESSION => { self.parser.expect_keyword(Keyword::TYPE)?; - return parser_err!("COMPRESSION TYPE clause is no longer in use. Please use the OPTIONS clause with 'format.compression' set appropriately, e.g., OPTIONS ('format.compression' 'gzip')"); + return parser_err!("COMPRESSION TYPE clause is no longer in use. Please use the OPTIONS clause with 'format.compression' set appropriately, e.g., OPTIONS (format.compression gzip)"); } Keyword::PARTITIONED => { self.parser.expect_keyword(Keyword::BY)?; @@ -933,7 +944,7 @@ mod tests { expect_parse_ok(sql, expected)?; // positive case with delimiter - let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS ('format.delimiter' '|')"; + let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS (format.delimiter '|')"; let display = None; let expected = Statement::CreateExternalTable(CreateExternalTable { name: "t".into(), diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt index fca177bb61f0..607c909fd63d 100644 --- a/datafusion/sqllogictest/test_files/create_external_table.slt +++ b/datafusion/sqllogictest/test_files/create_external_table.slt @@ -190,8 +190,8 @@ LOCATION 'test_files/scratch/create_external_table/manual_partitioning/'; statement error DataFusion error: Error during planning: Option format.delimiter is specified multiple times CREATE EXTERNAL TABLE t STORED AS CSV OPTIONS ( 'format.delimiter' '*', - 'format.has_header' 'true', - 'format.delimiter' '|') + 'format.has_header' 'true', + 'format.delimiter' '|') LOCATION 'foo.csv'; # If a config does not belong to any namespace, we assume it is a 'format' option and apply the 'format' prefix for backwards compatibility. @@ -201,7 +201,20 @@ CREATE EXTERNAL TABLE IF NOT EXISTS region ( r_name VARCHAR, r_comment VARCHAR, r_rev VARCHAR, -) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' +) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' OPTIONS ( 'format.delimiter' '|', - 'has_header' 'false'); \ No newline at end of file + 'has_header' 'false'); + +# Verify that we do not need quotations for simple namespaced keys. +statement ok +CREATE EXTERNAL TABLE IF NOT EXISTS region ( + r_regionkey BIGINT, + r_name VARCHAR, + r_comment VARCHAR, + r_rev VARCHAR, +) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' +OPTIONS ( + format.delimiter '|', + has_header false, + compression gzip); diff --git a/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part b/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part index 111d24773055..75bcbc198bef 100644 --- a/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part @@ -121,4 +121,4 @@ CREATE EXTERNAL TABLE IF NOT EXISTS region ( r_name VARCHAR, r_comment VARCHAR, r_rev VARCHAR, -) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' OPTIONS ('format.delimiter' '|'); \ No newline at end of file +) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' OPTIONS ('format.delimiter' '|'); From 669c3743cfddf4f591b44e14b9c64f3ffac2a080 Mon Sep 17 00:00:00 2001 From: Mehmet Ozan Kabak Date: Mon, 13 May 2024 18:52:42 +0300 Subject: [PATCH 2/2] Add comments clarifying parse error cases for unquoted namespaced keys --- datafusion/sql/src/parser.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 4ffdfa59cd2d..d09317271d23 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -469,6 +469,9 @@ impl<'a> DFParser<'a> { if let Token::Word(Word { value, .. }) = next_token.token { parts.push(value); } else { + // Unquoted namespaced keys have to conform to the syntax + // "[\.]*". If we have a key that breaks this + // pattern, error out: return self.parser.expected("key name", next_token); } }