From ec3876159977bb1171fa9d1b3efe8418d570a0ce Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Tue, 7 May 2024 13:59:16 +0300 Subject: [PATCH 01/15] Simplify format options --- datafusion-cli/src/helper.rs | 69 +++++++++------ datafusion/common/src/config.rs | 6 +- .../common/src/file_options/csv_writer.rs | 2 +- datafusion/core/src/catalog/listing_schema.rs | 3 - .../core/src/datasource/file_format/csv.rs | 44 +++++++--- .../src/datasource/listing_table_factory.rs | 21 ++--- .../core/src/datasource/physical_plan/csv.rs | 16 ++-- datafusion/core/tests/path_partition.rs | 17 +++- datafusion/expr/src/logical_plan/ddl.rs | 7 -- datafusion/proto/proto/datafusion.proto | 21 ++--- datafusion/proto/src/generated/pbjson.rs | 48 ++--------- datafusion/proto/src/generated/prost.rs | 20 ++--- datafusion/proto/src/logical_plan/mod.rs | 60 +++++-------- .../proto/src/physical_plan/from_proto.rs | 2 +- .../proto/src/physical_plan/to_proto.rs | 5 +- datafusion/sql/src/parser.rs | 85 ++++++------------- datafusion/sql/src/statement.rs | 12 +-- datafusion/sql/tests/sql_integration.rs | 24 +++--- datafusion/sqllogictest/test_files/copy.slt | 6 +- .../sqllogictest/test_files/csv_files.slt | 20 ++--- datafusion/sqllogictest/test_files/ddl.slt | 2 +- datafusion/sqllogictest/test_files/limit.slt | 3 +- datafusion/sqllogictest/test_files/order.slt | 3 +- .../test_files/pg_compat/pg_compat_null.slt | 3 +- .../test_files/pg_compat/pg_compat_simple.slt | 3 +- .../test_files/pg_compat/pg_compat_union.slt | 3 +- .../test_files/pg_compat/pg_compat_window.slt | 3 +- .../sqllogictest/test_files/predicates.slt | 4 +- .../test_files/repartition_scan.slt | 6 +- .../sqllogictest/test_files/subquery.slt | 6 +- .../test_files/tpch/create_tables.slt.part | 16 ++-- 31 files changed, 243 insertions(+), 297 deletions(-) diff --git a/datafusion-cli/src/helper.rs b/datafusion-cli/src/helper.rs index 85f14c1736dc..28cba06723aa 100644 --- a/datafusion-cli/src/helper.rs +++ b/datafusion-cli/src/helper.rs @@ -259,52 +259,69 @@ mod tests { // should be valid let result = readline_direct( - Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter ',';".as_bytes()), - &validator, - )?; + Cursor::new( + r"create external table test stored as csv location 'data.csv' options ('format.delimiter' ',');" + .as_bytes(), + ), + &validator, + )?; assert!(matches!(result, ValidationResult::Valid(None))); let result = readline_direct( - Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\0';".as_bytes()), - &validator, - )?; + Cursor::new( + r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\0');" + .as_bytes()), + &validator, + )?; assert!(matches!(result, ValidationResult::Valid(None))); let result = readline_direct( - Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\n';".as_bytes()), - &validator, - )?; + Cursor::new( + r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\n');" + .as_bytes()), + &validator, + )?; assert!(matches!(result, ValidationResult::Valid(None))); let result = readline_direct( - Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\r';".as_bytes()), - &validator, - )?; + Cursor::new( + r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\r');" + .as_bytes()), + &validator, + )?; assert!(matches!(result, ValidationResult::Valid(None))); let result = readline_direct( -
Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\t';".as_bytes()), - &validator, - )?; + Cursor::new( + r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\t');" + .as_bytes()), + &validator, + )?; assert!(matches!(result, ValidationResult::Valid(None))); let result = readline_direct( - Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\\';".as_bytes()), - &validator, - )?; + Cursor::new( + r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\\');" + .as_bytes()), + &validator, + )?; assert!(matches!(result, ValidationResult::Valid(None))); - // should be invalid let result = readline_direct( - Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter ',,';".as_bytes()), - &validator, - )?; - assert!(matches!(result, ValidationResult::Invalid(Some(_)))); + Cursor::new( + r"create external table test stored as csv location 'data.csv' options ('format.delimiter' ',,');" + .as_bytes()), + &validator, + )?; + assert!(matches!(result, ValidationResult::Valid(None))); + // should be invalid let result = readline_direct( - Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\u{07}';".as_bytes()), - &validator, - )?; + Cursor::new( + r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\u{07}');" + .as_bytes()), + &validator, + )?; assert!(matches!(result, ValidationResult::Invalid(Some(_)))); Ok(()) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 18cd83a47097..1afac3549117 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -1564,7 +1564,7 @@ config_namespace_with_hashmap! { config_namespace! { /// Options controlling CSV format pub struct CsvOptions { - pub has_header: bool, default = true + pub has_header: Option<bool>, default = None pub delimiter: u8, default = b',' pub quote: u8, default = b'"' pub escape: Option<u8>, default = None @@ -1600,13 +1600,13 @@ impl CsvOptions { /// Set true to indicate that the first line is a header. /// - default to true pub fn with_has_header(mut self, has_header: bool) -> Self { - self.has_header = has_header; + self.has_header = Some(has_header); self } /// True if the first line is a header. pub fn has_header(&self) -> bool { - self.has_header + self.has_header.unwrap_or(false) } /// The character separating values within a row.
diff --git a/datafusion/common/src/file_options/csv_writer.rs b/datafusion/common/src/file_options/csv_writer.rs index 5f1a62682f8d..2904ea0f8f03 100644 --- a/datafusion/common/src/file_options/csv_writer.rs +++ b/datafusion/common/src/file_options/csv_writer.rs @@ -50,7 +50,7 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions { fn try_from(value: &CsvOptions) -> Result<Self> { let mut builder = WriterBuilder::default() - .with_header(value.has_header) + .with_header(value.has_header.unwrap_or(false)) .with_delimiter(value.delimiter); if let Some(v) = &value.date_format { diff --git a/datafusion/core/src/catalog/listing_schema.rs b/datafusion/core/src/catalog/listing_schema.rs index a5960b21dff5..15e6764b1033 100644 --- a/datafusion/core/src/catalog/listing_schema.rs +++ b/datafusion/core/src/catalog/listing_schema.rs @@ -27,7 +27,6 @@ use crate::datasource::provider::TableProviderFactory; use crate::datasource::TableProvider; use crate::execution::context::SessionState; -use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{Constraints, DFSchema, DataFusionError, TableReference}; use datafusion_expr::CreateExternalTable; @@ -140,11 +139,9 @@ impl ListingSchemaProvider { location: table_url, file_type: self.format.clone(), has_header: self.has_header, - delimiter: ',', table_partition_cols: vec![], if_not_exists: false, definition: None, - file_compression_type: CompressionTypeVariant::UNCOMPRESSED, order_exprs: vec![], unbounded: false, options: Default::default(), diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index 645f98cd3fb0..2e5054d4366c 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -136,13 +136,13 @@ impl CsvFormat { /// Set true to indicate that the first line is a header. /// - default to true pub fn with_has_header(mut self, has_header: bool) -> Self { - self.options.has_header = has_header; + self.options.has_header = Some(has_header); self } /// True if the first line is a header. pub fn has_header(&self) -> bool { - self.options.has_header + self.options.has_header.unwrap_or(false) } /// The character separating values within a row. @@ -200,7 +200,7 @@ impl FileFormat for CsvFormat { async fn infer_schema( &self, - _state: &SessionState, + state: &SessionState, store: &Arc<dyn ObjectStore>, objects: &[ObjectMeta], ) -> Result<SchemaRef> { for object in objects { let stream = self.read_to_delimited_chunks(store, object).await; let (schema, records_read) = self - .infer_schema_from_stream(records_to_read, stream) + .infer_schema_from_stream(state, records_to_read, stream) .await?; records_to_read -= records_read; schemas.push(schema); @@ -236,13 +236,15 @@ impl FileFormat for CsvFormat { async fn create_physical_plan( &self, - _state: &SessionState, + state: &SessionState, conf: FileScanConfig, _filters: Option<&Arc<dyn PhysicalExpr>>, ) -> Result<Arc<dyn ExecutionPlan>> { let exec = CsvExec::new( conf, - self.options.has_header, + self.options + .has_header + .unwrap_or(state.config().options().catalog.has_header), self.options.delimiter, self.options.quote, self.options.escape, @@ -286,6 +288,7 @@ impl CsvFormat { /// number of lines that were read async fn infer_schema_from_stream( &self, + state: &SessionState, mut records_to_read: usize, stream: impl Stream<Item = Result<Bytes>>, ) -> Result<(Schema, usize)> { @@ -298,7 +301,12 @@ while let Some(chunk) = stream.next().await.transpose()?
{ let format = arrow::csv::reader::Format::default() - .with_header(self.options.has_header && first_chunk) + .with_header( + self.options + .has_header + .unwrap_or(state.config_options().catalog.has_header) + && first_chunk, + ) .with_delimiter(self.options.delimiter); let (Schema { fields, .. }, records_read) = @@ -538,6 +546,7 @@ mod tests { use datafusion_common::cast::as_string_array; use datafusion_common::stats::Precision; use datafusion_common::{internal_err, GetExt}; + use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; use datafusion_expr::{col, lit}; use chrono::DateTime; @@ -554,7 +563,8 @@ mod tests { let task_ctx = state.task_ctx(); // skip column 9 that overflows the automatically discovered column type of i64 (u64 would work) let projection = Some(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12]); - let exec = get_exec(&state, "aggregate_test_100.csv", projection, None).await?; + let exec = + get_exec(&state, "aggregate_test_100.csv", projection, None, true).await?; let stream = exec.execute(0, task_ctx)?; let tt_batches: i32 = stream @@ -582,7 +592,7 @@ let task_ctx = session_ctx.task_ctx(); let projection = Some(vec![0, 1, 2, 3]); let exec = - get_exec(&state, "aggregate_test_100.csv", projection, Some(1)).await?; + get_exec(&state, "aggregate_test_100.csv", projection, Some(1), true).await?; let batches = collect(exec, task_ctx).await?; assert_eq!(1, batches.len()); assert_eq!(4, batches[0].num_columns()); @@ -597,7 +607,8 @@ let state = session_ctx.state(); let projection = None; - let exec = get_exec(&state, "aggregate_test_100.csv", projection, None).await?; + let exec = + get_exec(&state, "aggregate_test_100.csv", projection, None, true).await?; let x: Vec<String> = exec .schema() @@ -633,7 +644,8 @@ let state = session_ctx.state(); let task_ctx = session_ctx.task_ctx(); let projection = Some(vec![0]); - let exec = get_exec(&state, "aggregate_test_100.csv", projection, None).await?; + let exec = + get_exec(&state, "aggregate_test_100.csv", projection, None, true).await?; let batches = collect(exec, task_ctx).await.expect("Collect batches"); @@ -716,6 +728,10 @@ async fn query_compress_data( file_compression_type: FileCompressionType, ) -> Result<()> { + let runtime = Arc::new(RuntimeEnv::new(RuntimeConfig::new()).unwrap()); + let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); + let session_state = SessionState::new_with_config_rt(cfg, runtime); + let integration = LocalFileSystem::new_with_prefix(arrow_test_data()).unwrap(); let path = Path::from("csv/aggregate_test_100.csv"); @@ -757,7 +773,7 @@ .read_to_delimited_chunks_from_stream(compressed_stream.unwrap()) .await; let (schema, records_read) = compressed_csv - .infer_schema_from_stream(records_to_read, decoded_stream) + .infer_schema_from_stream(&session_state, records_to_read, decoded_stream) .await?; assert_eq!(expected, schema); @@ -803,9 +819,11 @@ file_name: &str, projection: Option<Vec<usize>>, limit: Option<usize>, + has_header: bool, ) -> Result<Arc<dyn ExecutionPlan>> { let root = format!("{}/csv", crate::test_util::arrow_test_data()); - let format = CsvFormat::default(); + let mut format = CsvFormat::default(); + format = format.with_has_header(has_header); scan_format(state, &format, &root, file_name, projection, limit).await } diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs index 1a0eb34d1234..8380808d5f3b 100644 ---
a/datafusion/core/src/datasource/listing_table_factory.rs +++ b/datafusion/core/src/datasource/listing_table_factory.rs @@ -34,7 +34,7 @@ use crate::datasource::TableProvider; use crate::execution::context::SessionState; use arrow::datatypes::{DataType, SchemaRef}; -use datafusion_common::{arrow_datafusion_err, DataFusionError, FileType}; +use datafusion_common::{arrow_datafusion_err, plan_err, DataFusionError, FileType}; use datafusion_expr::CreateExternalTable; use async_trait::async_trait; @@ -67,9 +67,13 @@ impl TableProviderFactory for ListingTableFactory { let file_format: Arc<dyn FileFormat> = match file_type { FileType::CSV => { let mut csv_options = table_options.csv; - csv_options.has_header = cmd.has_header; - csv_options.delimiter = cmd.delimiter as u8; - csv_options.compression = cmd.file_compression_type; + if let Some(has_header) = csv_options.has_header { + if cmd.has_header && !has_header { + return plan_err!("Conflicting header options for CSV file"); + } + } else { + csv_options.has_header = Some(cmd.has_header); + } Arc::new(CsvFormat::default().with_options(csv_options)) } #[cfg(feature = "parquet")] @@ -78,9 +82,7 @@ } FileType::AVRO => Arc::new(AvroFormat), FileType::JSON => { - let mut json_options = table_options.json; - json_options.compression = cmd.file_compression_type; - Arc::new(JsonFormat::default().with_options(json_options)) + Arc::new(JsonFormat::default().with_options(table_options.json)) } FileType::ARROW => Arc::new(ArrowFormat), }; @@ -172,7 +174,6 @@ mod tests { use super::*; use crate::execution::context::SessionContext; - use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{Constraints, DFSchema, TableReference}; #[tokio::test] @@ -192,11 +193,9 @@ location: csv_file.path().to_str().unwrap().to_string(), file_type: "csv".to_string(), has_header: true, - delimiter: ',', schema: Arc::new(DFSchema::empty()), table_partition_cols: vec![], if_not_exists: false, - file_compression_type: CompressionTypeVariant::UNCOMPRESSED, definition: None, order_exprs: vec![], unbounded: false, @@ -233,11 +232,9 @@ location: csv_file.path().to_str().unwrap().to_string(), file_type: "csv".to_string(), has_header: true, - delimiter: ',', schema: Arc::new(DFSchema::empty()), table_partition_cols: vec![], if_not_exists: false, - file_compression_type: CompressionTypeVariant::UNCOMPRESSED, definition: None, order_exprs: vec![], unbounded: false, diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 879461c2eb1e..cc7c837e471e 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -610,7 +610,8 @@ mod tests { async fn csv_exec_with_mixed_order_projection( file_compression_type: FileCompressionType, ) -> Result<()> { - let session_ctx = SessionContext::new(); + let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); + let session_ctx = SessionContext::new_with_config(cfg); let task_ctx = session_ctx.task_ctx(); let file_schema = aggr_test_schema(); let path = format!("{}/csv", arrow_test_data()); @@ -675,7 +676,8 @@ async fn csv_exec_with_limit( file_compression_type: FileCompressionType, ) -> Result<()> { - let session_ctx = SessionContext::new(); + let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); + let session_ctx = SessionContext::new_with_config(cfg); let task_ctx =
session_ctx.task_ctx(); let file_schema = aggr_test_schema(); let path = format!("{}/csv", arrow_test_data()); @@ -1017,7 +1019,9 @@ mod tests { // create partitioned input file and context let tmp_dir = TempDir::new()?; let ctx = SessionContext::new_with_config( - SessionConfig::new().with_target_partitions(8), + SessionConfig::new() + .with_target_partitions(8) + .set_str("datafusion.catalog.has_header", "false"), ); let schema = populate_csv_partitions(&tmp_dir, 8, ".csv")?; @@ -1045,7 +1049,9 @@ .await?; // create a new context and verify that the results were saved to a partitioned csv file - let ctx = SessionContext::new(); + let ctx = SessionContext::new_with_config( + SessionConfig::new().set_str("datafusion.catalog.has_header", "false"), + ); let schema = Arc::new(Schema::new(vec![ Field::new("c1", DataType::UInt32, false), @@ -1074,7 +1080,7 @@ panic!("Did not find part_0 in csv output files!") } // register each partition as well as the top level dir - let csv_read_option = CsvReadOptions::new().schema(&schema); + let csv_read_option = CsvReadOptions::new().schema(&schema).has_header(false); ctx.register_csv( "part0", &format!("{out_dir}/{part_0_name}"), diff --git a/datafusion/core/tests/path_partition.rs b/datafusion/core/tests/path_partition.rs index dd8eb52f67c7..6dfed4be795b 100644 --- a/datafusion/core/tests/path_partition.rs +++ b/datafusion/core/tests/path_partition.rs @@ -42,6 +42,7 @@ use datafusion_common::ScalarValue; use async_trait::async_trait; use bytes::Bytes; use chrono::{TimeZone, Utc}; +use datafusion_execution::config::SessionConfig; use futures::stream; use futures::stream::BoxStream; use object_store::{ @@ -202,7 +203,9 @@ fn extract_as_utf(v: &ScalarValue) -> Option<String> { #[tokio::test] async fn csv_filter_with_file_col() -> Result<()> { - let ctx = SessionContext::new(); + let ctx = SessionContext::new_with_config( + SessionConfig::new().set_str("datafusion.catalog.has_header", "true"), + ); register_partitioned_aggregate_csv( &ctx, @@ -238,7 +241,9 @@ #[tokio::test] async fn csv_filter_with_file_nonstring_col() -> Result<()> { - let ctx = SessionContext::new(); + let ctx = SessionContext::new_with_config( + SessionConfig::new().set_str("datafusion.catalog.has_header", "true"), + ); register_partitioned_aggregate_csv( &ctx, @@ -274,7 +279,9 @@ #[tokio::test] async fn csv_projection_on_partition() -> Result<()> { - let ctx = SessionContext::new(); + let ctx = SessionContext::new_with_config( + SessionConfig::new().set_str("datafusion.catalog.has_header", "true"), + ); register_partitioned_aggregate_csv( &ctx, @@ -310,7 +317,9 @@ #[tokio::test] async fn csv_grouping_by_partition() -> Result<()> { - let ctx = SessionContext::new(); + let ctx = SessionContext::new_with_config( + SessionConfig::new().set_str("datafusion.catalog.has_header", "true"), + ); register_partitioned_aggregate_csv( &ctx, diff --git a/datafusion/expr/src/logical_plan/ddl.rs b/datafusion/expr/src/logical_plan/ddl.rs index 8d72c9a8b036..5b1c377dc1e3 100644 --- a/datafusion/expr/src/logical_plan/ddl.rs +++ b/datafusion/expr/src/logical_plan/ddl.rs @@ -25,7 +25,6 @@ use std::{ use crate::{Expr, LogicalPlan, Volatility}; use arrow::datatypes::DataType; -use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{Constraints, DFSchemaRef, SchemaReference, TableReference}; use
sqlparser::ast::Ident; @@ -192,8 +191,6 @@ pub struct CreateExternalTable { pub file_type: String, /// Whether the CSV file contains a header pub has_header: bool, - /// Delimiter for CSV - pub delimiter: char, /// Partition Columns pub table_partition_cols: Vec<String>, /// Option to not error if table already exists @@ -202,8 +199,6 @@ pub definition: Option<String>, /// Order expressions supplied by user pub order_exprs: Vec<Vec<Expr>>, - /// File compression type (GZIP, BZIP2, XZ, ZSTD) - pub file_compression_type: CompressionTypeVariant, /// Whether the table is an infinite stream pub unbounded: bool, /// Table(provider) specific options @@ -222,11 +217,9 @@ impl Hash for CreateExternalTable { self.location.hash(state); self.file_type.hash(state); self.has_header.hash(state); - self.delimiter.hash(state); self.table_partition_cols.hash(state); self.if_not_exists.hash(state); self.definition.hash(state); - self.file_compression_type.hash(state); self.order_exprs.hash(state); self.unbounded.hash(state); self.options.len().hash(state); // HashMap is not hashable diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index c057ab8acda7..3da326ff37fe 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -201,23 +201,20 @@ message Constraints{ message CreateExternalTableNode { reserved 1; // was string name - TableReference name = 12; + TableReference name = 10; string location = 2; string file_type = 3; bool has_header = 4; DfSchema schema = 5; repeated string table_partition_cols = 6; bool if_not_exists = 7; - string delimiter = 8; - string definition = 9; - reserved 10; // was string file_compression_type - CompressionTypeVariant file_compression_type = 17; - repeated LogicalExprNodeCollection order_exprs = 13; - bool unbounded = 14; - map<string, string> options = 11; - Constraints constraints = 15; - map<string, LogicalExprNode> column_defaults = 16; -} + string definition = 8; + repeated LogicalExprNodeCollection order_exprs = 11; + bool unbounded = 12; + map<string, string> options = 9; + Constraints constraints = 13; + map<string, LogicalExprNode> column_defaults = 14; + } message PrepareNode { string name = 1; @@ -1106,7 +1103,7 @@ message CsvWriterOptions { // Options controlling CSV format message CsvOptions { - bool has_header = 1; // Indicates if the CSV has a header row + bytes has_header = 1; // Indicates if the CSV has a header row bytes delimiter = 2; // Delimiter character as a byte bytes quote = 3; // Quote character as a byte bytes escape = 4; // Optional escape character as a byte diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 994703c5fcfb..a836ed5abcc5 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -4688,15 +4688,9 @@ impl serde::Serialize for CreateExternalTableNode { if self.if_not_exists { len += 1; } - if !self.delimiter.is_empty() { - len += 1; - } if !self.definition.is_empty() { len += 1; } - if self.file_compression_type != 0 { - len += 1; - } if !self.order_exprs.is_empty() { len += 1; } @@ -4734,17 +4728,9 @@ if self.if_not_exists { struct_ser.serialize_field("ifNotExists", &self.if_not_exists)?; } - if !self.delimiter.is_empty() { - struct_ser.serialize_field("delimiter", &self.delimiter)?; - } if !self.definition.is_empty() { struct_ser.serialize_field("definition", &self.definition)?; } - if self.file_compression_type != 0 { - let v =
CompressionTypeVariant::try_from(self.file_compression_type) - .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.file_compression_type)))?; - struct_ser.serialize_field("fileCompressionType", &v)?; - } if !self.order_exprs.is_empty() { struct_ser.serialize_field("orderExprs", &self.order_exprs)?; } @@ -4781,10 +4767,7 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode { "tablePartitionCols", "if_not_exists", "ifNotExists", - "delimiter", "definition", - "file_compression_type", - "fileCompressionType", "order_exprs", "orderExprs", "unbounded", @@ -4803,9 +4786,7 @@ Schema, TablePartitionCols, IfNotExists, - Delimiter, Definition, - FileCompressionType, OrderExprs, Unbounded, Options, @@ -4839,9 +4820,7 @@ "schema" => Ok(GeneratedField::Schema), "tablePartitionCols" | "table_partition_cols" => Ok(GeneratedField::TablePartitionCols), "ifNotExists" | "if_not_exists" => Ok(GeneratedField::IfNotExists), - "delimiter" => Ok(GeneratedField::Delimiter), "definition" => Ok(GeneratedField::Definition), - "fileCompressionType" | "file_compression_type" => Ok(GeneratedField::FileCompressionType), "orderExprs" | "order_exprs" => Ok(GeneratedField::OrderExprs), "unbounded" => Ok(GeneratedField::Unbounded), "options" => Ok(GeneratedField::Options), @@ -4873,9 +4852,7 @@ let mut schema__ = None; let mut table_partition_cols__ = None; let mut if_not_exists__ = None; - let mut delimiter__ = None; let mut definition__ = None; - let mut file_compression_type__ = None; let mut order_exprs__ = None; let mut unbounded__ = None; let mut options__ = None; @@ -4925,24 +4902,12 @@ } if_not_exists__ = Some(map_.next_value()?); } - GeneratedField::Delimiter => { - if delimiter__.is_some() { - return Err(serde::de::Error::duplicate_field("delimiter")); - } - delimiter__ = Some(map_.next_value()?); - } GeneratedField::Definition => { if definition__.is_some() { return Err(serde::de::Error::duplicate_field("definition")); } definition__ = Some(map_.next_value()?); } - GeneratedField::FileCompressionType => { - if file_compression_type__.is_some() { - return Err(serde::de::Error::duplicate_field("fileCompressionType")); - } - file_compression_type__ = Some(map_.next_value::<CompressionTypeVariant>()?
as i32); - } GeneratedField::OrderExprs => { if order_exprs__.is_some() { return Err(serde::de::Error::duplicate_field("orderExprs")); } @@ -4987,9 +4952,7 @@ schema: schema__, table_partition_cols: table_partition_cols__.unwrap_or_default(), if_not_exists: if_not_exists__.unwrap_or_default(), - delimiter: delimiter__.unwrap_or_default(), definition: definition__.unwrap_or_default(), - file_compression_type: file_compression_type__.unwrap_or_default(), order_exprs: order_exprs__.unwrap_or_default(), unbounded: unbounded__.unwrap_or_default(), options: options__.unwrap_or_default(), @@ -5459,7 +5422,7 @@ impl serde::Serialize for CsvOptions { { use serde::ser::SerializeStruct; let mut len = 0; - if self.has_header { + if !self.has_header.is_empty() { len += 1; } if !self.delimiter.is_empty() { len += 1; } len += 1; } len += 1; } len += 1; } len += 1; } len += 1; } len += 1; } len += 1; } len += 1; } let mut struct_ser = serializer.serialize_struct("datafusion.CsvOptions", len)?; - if self.has_header { - struct_ser.serialize_field("hasHeader", &self.has_header)?; + if !self.has_header.is_empty() { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("hasHeader", pbjson::private::base64::encode(&self.has_header).as_str())?; } if !self.delimiter.is_empty() { #[allow(clippy::needless_borrow)] @@ -5654,7 +5618,9 @@ impl<'de> serde::Deserialize<'de> for CsvOptions { if has_header__.is_some() { return Err(serde::de::Error::duplicate_field("hasHeader")); } - has_header__ = Some(map_.next_value()?); + has_header__ = + Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) + ; } GeneratedField::Delimiter => { if delimiter__.is_some() { diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index fc23a9ea05f7..3b48a8050031 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -320,7 +320,7 @@ pub struct Constraints { #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] pub struct CreateExternalTableNode { - #[prost(message, optional, tag = "12")] + #[prost(message, optional, tag = "10")] pub name: ::core::option::Option<TableReference>, #[prost(string, tag = "2")] pub location: ::prost::alloc::string::String, @@ -335,23 +335,19 @@ pub if_not_exists: bool, #[prost(string, tag = "8")] - pub delimiter: ::prost::alloc::string::String, - #[prost(string, tag = "9")] pub definition: ::prost::alloc::string::String, - #[prost(enumeration = "CompressionTypeVariant", tag = "17")] - pub file_compression_type: i32, - #[prost(message, repeated, tag = "13")] + #[prost(message, repeated, tag = "11")] pub order_exprs: ::prost::alloc::vec::Vec<LogicalExprNodeCollection>, - #[prost(bool, tag = "14")] + #[prost(bool, tag = "12")] pub unbounded: bool, - #[prost(map = "string, string", tag = "11")] + #[prost(map = "string, string", tag = "9")] pub options: ::std::collections::HashMap< ::prost::alloc::string::String, ::prost::alloc::string::String, >, - #[prost(message, optional, tag = "15")] + #[prost(message, optional, tag = "13")] pub constraints: ::core::option::Option<Constraints>, - #[prost(map = "string, message", tag = "16")] + #[prost(map = "string, message", tag = "14")] pub column_defaults: ::std::collections::HashMap< ::prost::alloc::string::String, LogicalExprNode, >, @@ -1684,8 +1680,8 @@ pub struct CsvWriterOptions { #[derive(Clone, PartialEq, ::prost::Message)] pub struct CsvOptions { /// Indicates if the CSV has a header row
- #[prost(bool, tag = "1")] - pub has_header: bool, + #[prost(bytes = "vec", tag = "1")] + pub has_header: ::prost::alloc::vec::Vec<u8>, /// Delimiter character as a byte #[prost(bytes = "vec", tag = "2")] pub delimiter: ::prost::alloc::vec::Vec<u8>, diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index cccfb0c27307..474fc93ac104 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -539,37 +539,28 @@ impl AsLogicalPlan for LogicalPlanNode { column_defaults.insert(col_name.clone(), expr); } - let file_compression_type = protobuf::CompressionTypeVariant::try_from( - create_extern_table.file_compression_type, - ) - .map_err(|_| { - proto_error(format!( - "Unknown file compression type {}", - create_extern_table.file_compression_type - )) - })?; - - Ok(LogicalPlan::Ddl(DdlStatement::CreateExternalTable(CreateExternalTable { - schema: pb_schema.try_into()?, - name: from_table_reference(create_extern_table.name.as_ref(), "CreateExternalTable")?, - location: create_extern_table.location.clone(), - file_type: create_extern_table.file_type.clone(), - has_header: create_extern_table.has_header, - delimiter: create_extern_table.delimiter.chars().next().ok_or_else(|| { - DataFusionError::Internal(String::from("Protobuf deserialization error, unable to parse CSV delimiter")) - })?, - table_partition_cols: create_extern_table - .table_partition_cols - .clone(), - order_exprs, - if_not_exists: create_extern_table.if_not_exists, - file_compression_type: file_compression_type.into(), - definition, - unbounded: create_extern_table.unbounded, - options: create_extern_table.options.clone(), - constraints: constraints.into(), - column_defaults, - }))) + Ok(LogicalPlan::Ddl(DdlStatement::CreateExternalTable( + CreateExternalTable { + schema: pb_schema.try_into()?, + name: from_table_reference( + create_extern_table.name.as_ref(), + "CreateExternalTable", + )?, + location: create_extern_table.location.clone(), + file_type: create_extern_table.file_type.clone(), + has_header: create_extern_table.has_header, + table_partition_cols: create_extern_table + .table_partition_cols + .clone(), + order_exprs, + if_not_exists: create_extern_table.if_not_exists, + definition, + unbounded: create_extern_table.unbounded, + options: create_extern_table.options.clone(), + constraints: constraints.into(), + column_defaults, + }, + ))) } LogicalPlanType::CreateView(create_view) => { let plan = create_view @@ -1328,12 +1319,10 @@ location, file_type, has_header, - delimiter, schema: df_schema, table_partition_cols, if_not_exists, definition, - file_compression_type, order_exprs, unbounded, options, @@ -1359,9 +1348,6 @@ .insert(col_name.clone(), serialize_expr(expr, extension_codec)?); } - let file_compression_type = - protobuf::CompressionTypeVariant::from(file_compression_type); - Ok(protobuf::LogicalPlanNode { logical_plan_type: Some(LogicalPlanType::CreateExternalTable( protobuf::CreateExternalTableNode { name: Some(name.clone().into()), location: location.clone(), file_type: file_type.clone(), has_header: *has_header, schema: Some(df_schema.try_into()?), table_partition_cols: table_partition_cols.clone(), if_not_exists: *if_not_exists, - delimiter: String::from(*delimiter), order_exprs: converted_order_exprs, definition: definition.clone().unwrap_or_default(), - file_compression_type: file_compression_type.into(), unbounded: *unbounded, options: options.clone(), constraints:
Some(constraints.clone().into()), diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index 33e632b0d942..6e568c7fb872 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -806,7 +806,7 @@ impl TryFrom<&protobuf::CsvOptions> for CsvOptions { fn try_from(proto_opts: &protobuf::CsvOptions) -> Result<Self, Self::Error> { Ok(CsvOptions { - has_header: proto_opts.has_header, + has_header: proto_opts.has_header.first().map(|h| *h != 0), delimiter: proto_opts.delimiter[0], quote: proto_opts.quote[0], escape: proto_opts.escape.first().copied(), diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index a0a0ee72054b..cddc8ddcc227 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -980,7 +980,10 @@ impl TryFrom<&CsvOptions> for protobuf::CsvOptions { fn try_from(opts: &CsvOptions) -> Result<Self, Self::Error> { let compression: protobuf::CompressionTypeVariant = opts.compression.into(); Ok(protobuf::CsvOptions { - has_header: opts.has_header, + has_header: opts + .has_header + .map(|h| h as u8) + .map_or_else(Vec::new, |e| vec![e]), delimiter: vec![opts.delimiter], quote: vec![opts.quote], escape: opts.escape.map_or_else(Vec::new, |e| vec![e]), diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 5a999ab21d30..607276f47b73 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -198,8 +198,6 @@ pub struct CreateExternalTable { pub file_type: String, /// CSV Header row? pub has_header: bool, - /// User defined delimiter for CSVs - pub delimiter: char, /// Path to file pub location: String, /// Partition Columns pub table_partition_cols: Vec<String>, /// Ordered expressions pub order_exprs: Vec<LexOrdering>, /// Option to not error if table already exists pub if_not_exists: bool, - /// File compression type (GZIP, BZIP2, XZ) - pub file_compression_type: CompressionTypeVariant, /// Infinite streams?
pub unbounded: bool, /// Table(provider) specific options @@ -816,14 +812,10 @@ impl<'a> DFParser<'a> { columns, file_type: builder.file_type.unwrap(), has_header: builder.has_header.unwrap_or(false), - delimiter: builder.delimiter.unwrap_or(','), location: builder.location.unwrap(), table_partition_cols: builder.table_partition_cols.unwrap_or(vec![]), order_exprs: builder.order_exprs, if_not_exists, - file_compression_type: builder - .file_compression_type - .unwrap_or(CompressionTypeVariant::UNCOMPRESSED), unbounded, options: builder.options.unwrap_or(HashMap::new()), constraints, @@ -917,7 +909,6 @@ mod tests { use super::*; use sqlparser::ast::Expr::Identifier; use sqlparser::ast::{BinaryOperator, DataType, Expr, Ident}; - use CompressionTypeVariant::UNCOMPRESSED; fn expect_parse_ok(sql: &str, expected: Statement) -> Result<(), ParserError> { let statements = DFParser::parse_sql(sql)?; @@ -970,12 +961,10 @@ mod tests { columns: vec![make_column_def("c1", DataType::Int(display))], file_type: "CSV".to_string(), has_header: false, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::new(), constraints: vec![], @@ -989,12 +978,10 @@ mod tests { columns: vec![make_column_def("c1", DataType::Int(None))], file_type: "CSV".to_string(), has_header: false, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::new(), constraints: vec![], @@ -1009,12 +996,10 @@ mod tests { columns: vec![make_column_def("c1", DataType::Int(None))], file_type: "CSV".to_string(), has_header: false, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::new(), constraints: vec![], @@ -1022,21 +1007,22 @@ mod tests { expect_parse_ok(sql, expected)?; // positive case with delimiter - let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV DELIMITER '|' LOCATION 'foo.csv'"; + let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS ('format.delimiter' '|')"; let display = None; let expected = Statement::CreateExternalTable(CreateExternalTable { name: "t".into(), columns: vec![make_column_def("c1", DataType::Int(display))], file_type: "CSV".to_string(), has_header: false, - delimiter: '|', location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, - options: HashMap::new(), + options: vec!["format.delimiter".to_string()] + .into_iter() + .zip(vec!["|".to_string()]) + .collect(), constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1049,12 +1035,10 @@ mod tests { columns: vec![make_column_def("c1", DataType::Int(display))], file_type: "CSV".to_string(), has_header: false, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec!["p1".to_string(), "p2".to_string()], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::new(), constraints: vec![], @@ -1072,12 +1056,10 @@ mod tests { columns: vec![make_column_def("c1", DataType::Int(display))], file_type: "CSV".to_string(), has_header: true, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: 
false, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::new(), constraints: vec![], @@ -1087,27 +1069,30 @@ mod tests { // positive case: it is ok for sql stmt with `COMPRESSION TYPE GZIP` tokens let sqls = vec![ - ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE GZIP LOCATION 'foo.csv'", "GZIP"), - ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE BZIP2 LOCATION 'foo.csv'", "BZIP2"), - ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE XZ LOCATION 'foo.csv'", "XZ"), - ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE ZSTD LOCATION 'foo.csv'", "ZSTD"), - ]; - for (sql, file_compression_type) in sqls { + ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS + ('format.key.compression' 'GZIP')", "GZIP"), + ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS + ('format.key.compression' 'BZIP2')", "BZIP2"), + ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS + ('format.key.compression' 'XZ')", "XZ"), + ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS + ('format.key.compression' 'ZSTD')", "ZSTD"), + ]; + for (sql, compression) in sqls { let expected = Statement::CreateExternalTable(CreateExternalTable { name: "t".into(), columns: vec![make_column_def("c1", DataType::Int(display))], file_type: "CSV".to_string(), has_header: false, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: CompressionTypeVariant::from_str( - file_compression_type, - )?, unbounded: false, - options: HashMap::new(), + options: vec!["format.key.compression".to_string()] + .into_iter() + .zip(vec![compression.to_string()]) + .collect(), constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1120,12 +1105,10 @@ mod tests { columns: vec![], file_type: "PARQUET".to_string(), has_header: false, - delimiter: ',', location: "foo.parquet".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::new(), constraints: vec![], @@ -1139,12 +1122,10 @@ mod tests { columns: vec![], file_type: "PARQUET".to_string(), has_header: false, - delimiter: ',', location: "foo.parquet".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::new(), constraints: vec![], @@ -1158,12 +1139,10 @@ mod tests { columns: vec![], file_type: "AVRO".to_string(), has_header: false, - delimiter: ',', location: "foo.avro".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::new(), constraints: vec![], @@ -1178,12 +1157,10 @@ mod tests { columns: vec![], file_type: "PARQUET".to_string(), has_header: false, - delimiter: ',', location: "foo.parquet".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: true, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::new(), constraints: vec![], @@ -1201,12 +1178,10 @@ mod tests { ], file_type: "CSV".to_string(), has_header: false, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec!["p1".to_string()], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::new(), constraints: vec![], @@ -1231,12 +1206,10 @@ mod tests { columns: 
vec![], file_type: "X".to_string(), has_header: false, - delimiter: ',', location: "blahblah".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::from([("k1".into(), "v1".into())]), constraints: vec![], @@ -1251,12 +1224,10 @@ mod tests { columns: vec![], file_type: "X".to_string(), has_header: false, - delimiter: ',', location: "blahblah".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::from([ ("k1".into(), "v1".into()), @@ -1293,7 +1264,6 @@ mod tests { columns: vec![make_column_def("c1", DataType::Int(None))], file_type: "CSV".to_string(), has_header: false, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![vec![OrderByExpr { @@ -1305,7 +1275,6 @@ mod tests { nulls_first, }]], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::new(), constraints: vec![], @@ -1324,7 +1293,6 @@ mod tests { ], file_type: "CSV".to_string(), has_header: false, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![vec![ @@ -1346,7 +1314,6 @@ mod tests { }, ]], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::new(), constraints: vec![], @@ -1364,7 +1331,6 @@ mod tests { ], file_type: "CSV".to_string(), has_header: false, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![vec![OrderByExpr { @@ -1383,7 +1349,6 @@ mod tests { nulls_first: None, }]], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, options: HashMap::new(), constraints: vec![], @@ -1394,14 +1359,14 @@ mod tests { let sql = " CREATE UNBOUNDED EXTERNAL TABLE IF NOT EXISTS t (c1 int, c2 float) STORED AS PARQUET - DELIMITER '*' WITH HEADER ROW WITH ORDER (c1 - c2 ASC) - COMPRESSION TYPE zstd PARTITIONED BY (c1) LOCATION 'foo.parquet' - OPTIONS (ROW_GROUP_SIZE '1024', 'TRUNCATE' 'NO') - "; + OPTIONS (ROW_GROUP_SIZE '1024', 'TRUNCATE' 'NO', + 'format.compression' 'zstd', + 'format.delimiter' '*') + "; let expected = Statement::CreateExternalTable(CreateExternalTable { name: "t".into(), columns: vec![ @@ -1410,7 +1375,6 @@ mod tests { ], file_type: "PARQUET".to_string(), has_header: true, - delimiter: '*', location: "foo.parquet".into(), table_partition_cols: vec!["c1".into()], order_exprs: vec![vec![OrderByExpr { @@ -1429,9 +1393,10 @@ mod tests { nulls_first: None, }]], if_not_exists: true, - file_compression_type: CompressionTypeVariant::ZSTD, unbounded: true, options: HashMap::from([ + ("format.compression".into(), "zstd".into()), + ("format.delimiter".into(), "*".into()), ("ROW_GROUP_SIZE".into(), "1024".into()), ("TRUNCATE".into(), "NO".into()), ]), diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 137bb5fb20b7..ce440cace3cc 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -969,11 +969,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { columns, file_type, has_header, - delimiter, location, table_partition_cols, if_not_exists, - file_compression_type, order_exprs, unbounded, options, @@ -985,8 +983,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let inline_constraints = calc_inline_constraints_from_columns(&columns); all_constraints.extend(inline_constraints); + let compression = options + .get("format.compression") + 
.map(|comp| CompressionTypeVariant::from_str(comp)) + .transpose()?; if (file_type == "PARQUET" || file_type == "AVRO" || file_type == "ARROW") - && file_compression_type != CompressionTypeVariant::UNCOMPRESSED + && compression + .map(|comp| comp != CompressionTypeVariant::UNCOMPRESSED) + .unwrap_or(false) { plan_err!( "File compression type cannot be set for PARQUET, AVRO, or ARROW files." @@ -1018,11 +1022,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { location, file_type, has_header, - delimiter, table_partition_cols, if_not_exists, definition, - file_compression_type, order_exprs: ordered_exprs, unbounded, options, diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 1a300d11b0b2..781d2ce5bb06 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -1903,12 +1903,12 @@ fn create_external_table_csv_no_schema() { fn create_external_table_with_compression_type() { // positive case let sqls = vec![ - "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE GZIP LOCATION 'foo.csv.gz'", - "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE BZIP2 LOCATION 'foo.csv.bz2'", - "CREATE EXTERNAL TABLE t(c1 int) STORED AS JSON COMPRESSION TYPE GZIP LOCATION 'foo.json.gz'", - "CREATE EXTERNAL TABLE t(c1 int) STORED AS JSON COMPRESSION TYPE BZIP2 LOCATION 'foo.json.bz2'", - "CREATE EXTERNAL TABLE t(c1 int) STORED AS NONSTANDARD COMPRESSION TYPE GZIP LOCATION 'foo.unk'", - ]; + "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv.gz' OPTIONS ('format.compression' 'gzip')", + "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv.bz2' OPTIONS ('format.compression' 'bzip2')", + "CREATE EXTERNAL TABLE t(c1 int) STORED AS JSON LOCATION 'foo.json.gz' OPTIONS ('format.compression' 'gzip')", + "CREATE EXTERNAL TABLE t(c1 int) STORED AS JSON LOCATION 'foo.json.bz2' OPTIONS ('format.compression' 'bzip2')", + "CREATE EXTERNAL TABLE t(c1 int) STORED AS NONSTANDARD LOCATION 'foo.unk' OPTIONS ('format.compression' 'gzip')", + ]; for sql in sqls { let expected = "CreateExternalTable: Bare { table: \"t\" }"; quick_test(sql, expected); @@ -1916,12 +1916,12 @@ fn create_external_table_with_compression_type() { // negative case let sqls = vec![ - "CREATE EXTERNAL TABLE t STORED AS AVRO COMPRESSION TYPE GZIP LOCATION 'foo.avro'", - "CREATE EXTERNAL TABLE t STORED AS AVRO COMPRESSION TYPE BZIP2 LOCATION 'foo.avro'", - "CREATE EXTERNAL TABLE t STORED AS PARQUET COMPRESSION TYPE GZIP LOCATION 'foo.parquet'", - "CREATE EXTERNAL TABLE t STORED AS PARQUET COMPRESSION TYPE BZIP2 LOCATION 'foo.parquet'", - "CREATE EXTERNAL TABLE t STORED AS ARROW COMPRESSION TYPE GZIP LOCATION 'foo.arrow'", - "CREATE EXTERNAL TABLE t STORED AS ARROW COMPRESSION TYPE BZIP2 LOCATION 'foo.arrow'", + "CREATE EXTERNAL TABLE t STORED AS AVRO LOCATION 'foo.avro' OPTIONS ('format.compression' 'gzip')", + "CREATE EXTERNAL TABLE t STORED AS AVRO LOCATION 'foo.avro' OPTIONS ('format.compression' 'bzip2')", + "CREATE EXTERNAL TABLE t STORED AS PARQUET LOCATION 'foo.parquet' OPTIONS ('format.compression' 'gzip')", + "CREATE EXTERNAL TABLE t STORED AS PARQUET LOCATION 'foo.parquet' OPTIONS ('format.compression' 'bzip2')", + "CREATE EXTERNAL TABLE t STORED AS ARROW LOCATION 'foo.arrow' OPTIONS ('format.compression' 'gzip')", + "CREATE EXTERNAL TABLE t STORED AS ARROW LOCATION 'foo.arrow' OPTIONS ('format.compression' 'bzip2')", ]; for sql in sqls { let err = logical_plan(sql).expect_err("query should have failed"); diff 
--git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index 882a4e220758..00bcea7ec154 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -384,7 +384,7 @@ COPY source_table to 'test_files/scratch/copy/table_json_gz' STORED AS JSON OPT # validate folder of csv files statement ok -CREATE EXTERNAL TABLE validate_json_gz STORED AS json COMPRESSION TYPE gzip LOCATION 'test_files/scratch/copy/table_json_gz'; +CREATE EXTERNAL TABLE validate_json_gz STORED AS json LOCATION 'test_files/scratch/copy/table_json_gz' OPTIONS ('format.compression' 'gzip'); query IT select * from validate_json_gz; @@ -400,7 +400,7 @@ COPY source_table to 'test_files/scratch/copy/table_csv' STORED AS CSV OPTIONS # validate folder of csv files statement ok -CREATE EXTERNAL TABLE validate_csv STORED AS csv COMPRESSION TYPE gzip LOCATION 'test_files/scratch/copy/table_csv'; +CREATE EXTERNAL TABLE validate_csv STORED AS csv LOCATION 'test_files/scratch/copy/table_csv' OPTIONS ('format.compression' 'gzip'); query IT select * from validate_csv; @@ -416,7 +416,7 @@ COPY source_table to 'test_files/scratch/copy/table.csv'; # Validate single csv output statement ok -CREATE EXTERNAL TABLE validate_single_csv STORED AS csv WITH HEADER ROW LOCATION 'test_files/scratch/copy/table.csv'; +CREATE EXTERNAL TABLE validate_single_csv STORED AS csv LOCATION 'test_files/scratch/copy/table.csv' OPTIONS ('format.has_header' 'false'); query IT select * from validate_single_csv; diff --git a/datafusion/sqllogictest/test_files/csv_files.slt b/datafusion/sqllogictest/test_files/csv_files.slt index be846028d545..1c501634fef5 100644 --- a/datafusion/sqllogictest/test_files/csv_files.slt +++ b/datafusion/sqllogictest/test_files/csv_files.slt @@ -22,9 +22,9 @@ c1 VARCHAR, c2 VARCHAR ) STORED AS CSV WITH HEADER ROW -DELIMITER ',' -OPTIONS ('format.quote' '~') -LOCATION '../core/tests/data/quote.csv'; +LOCATION '../core/tests/data/quote.csv' +OPTIONS ('format.quote' '~', + 'format.delimiter' ','); statement ok CREATE EXTERNAL TABLE csv_with_escape ( @@ -32,8 +32,8 @@ c1 VARCHAR, c2 VARCHAR ) STORED AS CSV WITH HEADER ROW -DELIMITER ',' -OPTIONS ('format.escape' '\') +OPTIONS ('format.escape' '\', + 'format.delimiter' ',') LOCATION '../core/tests/data/escape.csv'; query TT @@ -70,8 +70,8 @@ c1 VARCHAR, c2 VARCHAR ) STORED AS CSV WITH HEADER ROW -DELIMITER ',' -OPTIONS ('format.escape' '"') +OPTIONS ('format.escape' '"', + 'format.delimiter' ',') LOCATION '../core/tests/data/escape.csv'; # TODO: Validate this with better data. 
@@ -136,8 +136,8 @@ CREATE EXTERNAL TABLE partitioned_table ( partition_col INT ) STORED AS CSV -WITH HEADER ROW -LOCATION 'test_files/scratch/csv_files/csv_partitions'; +LOCATION 'test_files/scratch/csv_files/csv_partitions' +OPTIONS ('format.has_header' 'false'); query ITII SELECT * FROM partitioned_table ORDER BY int_col; @@ -160,4 +160,4 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [int_col@0 ASC NULLS LAST] 02)--SortExec: expr=[int_col@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----CsvExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/csv_files/csv_partitions/1.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/csv_files/csv_partitions/2.csv]]}, projection=[int_col, string_col, bigint_col, partition_col], has_header=true +03)----CsvExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/csv_files/csv_partitions/1.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/csv_files/csv_partitions/2.csv]]}, projection=[int_col, string_col, bigint_col, partition_col], has_header=false \ No newline at end of file diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index 682972b5572a..26b0e5561806 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -535,7 +535,7 @@ DROP VIEW y; # create_pipe_delimited_csv_table() statement ok -CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV WITH HEADER ROW DELIMITER '|' LOCATION '../core/tests/data/aggregate_simple_pipe.csv'; +CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple_pipe.csv' OPTIONS ('format.delimiter' '|'); query RRB diff --git a/datafusion/sqllogictest/test_files/limit.slt b/datafusion/sqllogictest/test_files/limit.slt index faaa593d2c17..2238a4aaeaf4 100644 --- a/datafusion/sqllogictest/test_files/limit.slt +++ b/datafusion/sqllogictest/test_files/limit.slt @@ -529,9 +529,8 @@ CREATE UNBOUNDED EXTERNAL TABLE data ( "column1" INTEGER, "column2" VARCHAR, ) STORED AS CSV -WITH HEADER ROW WITH ORDER ("column1", "column2") -LOCATION 'test_files/scratch/limit/data.csv'; +LOCATION 'test_files/scratch/limit/data.csv' OPTIONS ('format.has_header' 'false'); query IT SELECT * from data LIMIT 3; diff --git a/datafusion/sqllogictest/test_files/order.slt b/datafusion/sqllogictest/test_files/order.slt index 0b43b6e31aaf..3f00bd2bb4ea 100644 --- a/datafusion/sqllogictest/test_files/order.slt +++ b/datafusion/sqllogictest/test_files/order.slt @@ -627,7 +627,8 @@ CREATE EXTERNAL TABLE IF NOT EXISTS orders ( o_clerk VARCHAR, o_shippriority INTEGER, o_comment VARCHAR, -) STORED AS CSV WITH ORDER (o_orderkey ASC) DELIMITER ',' WITH HEADER ROW LOCATION '../core/tests/tpch-csv/orders.csv'; +) STORED AS CSV WITH ORDER (o_orderkey ASC) WITH HEADER ROW LOCATION '../core/tests/tpch-csv/orders.csv' +OPTIONS ('format.delimiter' ','); query TT EXPLAIN SELECT o_orderkey, o_orderstatus FROM orders ORDER BY o_orderkey ASC diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt index f933c90acc73..50567a5ef5c1 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt @@ -41,8 +41,7 @@ onlyif postgres statement ok COPY aggregate_test_100_by_sql FROM '../../testing/data/csv/aggregate_test_100.csv' - DELIMITER ',' - CSV 
HEADER; + CSV HEADER OPTIONS ('format.delimiter' ','); ### diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt index b01ea73c8056..a0593433c16f 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt @@ -43,8 +43,7 @@ onlyif postgres statement ok COPY aggregate_test_100_by_sql FROM '../../testing/data/csv/aggregate_test_100.csv' - DELIMITER ',' - CSV HEADER; + CSV HEADER OPTIONS ('format.delimiter' ','); ### ## Setup test for datafusion diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt index 05343de32268..2641d2ee8379 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt @@ -40,8 +40,7 @@ onlyif postgres statement ok COPY aggregate_test_100_by_sql FROM '../../testing/data/csv/aggregate_test_100.csv' - DELIMITER ',' - CSV HEADER; + CSV HEADER OPTIONS ('format.delimiter' ','); ### ## Setup test for datafusion diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt index cec51d472075..de780adf6f32 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt @@ -40,8 +40,7 @@ onlyif postgres statement ok COPY aggregate_test_100_by_sql FROM '../../testing/data/csv/aggregate_test_100.csv' - DELIMITER ',' - CSV HEADER; + CSV HEADER OPTIONS ('format.delimiter' ','); ### ## Setup test for datafusion diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index caf79abcfa4e..12f3ebccea3c 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -621,7 +621,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS lineitem ( l_shipinstruct VARCHAR, l_shipmode VARCHAR, l_comment VARCHAR, -) STORED AS CSV DELIMITER ',' WITH HEADER ROW LOCATION '../core/tests/tpch-csv/lineitem.csv'; +) STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/tpch-csv/lineitem.csv' OPTIONS ('format.delimiter' ','); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS part ( @@ -634,7 +634,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS part ( p_container VARCHAR, p_retailprice DECIMAL(15, 2), p_comment VARCHAR, -) STORED AS CSV DELIMITER ',' WITH HEADER ROW LOCATION '../core/tests/tpch-csv/part.csv'; +) STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/tpch-csv/part.csv' OPTIONS ('format.delimiter' ','); query TT EXPLAIN SELECT l_partkey FROM diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 4a620a695fbf..6ce7d7211202 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -158,13 +158,13 @@ DROP TABLE parquet_table_with_order; # create a single csv file statement ok COPY (VALUES (1), (2), (3), (4), (5)) TO 'test_files/scratch/repartition_scan/csv_table/1.csv' -STORED AS CSV WITH HEADER ROW; +STORED AS CSV OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE csv_table(column1 int) STORED AS csv -WITH HEADER ROW -LOCATION 'test_files/scratch/repartition_scan/csv_table/'; +LOCATION 
'test_files/scratch/repartition_scan/csv_table/' OPTIONS + ('format.has_header' 'true'); query I select * from csv_table ORDER BY column1; diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index 7196418af1c7..85f85c3a524c 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -66,7 +66,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS customer ( c_acctbal DECIMAL(15, 2), c_mktsegment VARCHAR, c_comment VARCHAR, -) STORED AS CSV DELIMITER ',' WITH HEADER ROW LOCATION '../core/tests/tpch-csv/customer.csv'; +) STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/tpch-csv/customer.csv' OPTIONS ('format.delimiter' ','); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS orders ( @@ -79,7 +79,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS orders ( o_clerk VARCHAR, o_shippriority INTEGER, o_comment VARCHAR, -) STORED AS CSV DELIMITER ',' WITH HEADER ROW LOCATION '../core/tests/tpch-csv/orders.csv'; +) STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/tpch-csv/orders.csv' OPTIONS ('format.delimiter' ','); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS lineitem ( @@ -99,7 +99,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS lineitem ( l_shipinstruct VARCHAR, l_shipmode VARCHAR, l_comment VARCHAR, -) STORED AS CSV DELIMITER ',' WITH HEADER ROW LOCATION '../core/tests/tpch-csv/lineitem.csv'; +) STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/tpch-csv/lineitem.csv' OPTIONS ('format.delimiter' ','); # in_subquery_to_join_with_correlated_outer_filter query ITI rowsort diff --git a/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part b/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part index 2f5e2d5a7616..75bcbc198bef 100644 --- a/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part @@ -31,7 +31,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS supplier ( s_acctbal DECIMAL(15, 2), s_comment VARCHAR, s_rev VARCHAR, -) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/supplier.tbl'; +) STORED AS CSV LOCATION 'test_files/tpch/data/supplier.tbl' OPTIONS ('format.delimiter' '|'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS part ( @@ -45,7 +45,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS part ( p_retailprice DECIMAL(15, 2), p_comment VARCHAR, p_rev VARCHAR, -) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/part.tbl'; +) STORED AS CSV LOCATION 'test_files/tpch/data/part.tbl' OPTIONS ('format.delimiter' '|'); statement ok @@ -56,7 +56,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS partsupp ( ps_supplycost DECIMAL(15, 2), ps_comment VARCHAR, ps_rev VARCHAR, -) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/partsupp.tbl'; +) STORED AS CSV LOCATION 'test_files/tpch/data/partsupp.tbl' OPTIONS ('format.delimiter' '|'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS customer ( @@ -69,7 +69,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS customer ( c_mktsegment VARCHAR, c_comment VARCHAR, c_rev VARCHAR, -) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/customer.tbl'; +) STORED AS CSV LOCATION 'test_files/tpch/data/customer.tbl' OPTIONS ('format.delimiter' '|'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS orders ( @@ -83,7 +83,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS orders ( o_shippriority INTEGER, o_comment VARCHAR, o_rev VARCHAR, -) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/orders.tbl'; +) STORED AS CSV LOCATION 'test_files/tpch/data/orders.tbl' OPTIONS 
('format.delimiter' '|'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS lineitem ( @@ -104,7 +104,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS lineitem ( l_shipmode VARCHAR, l_comment VARCHAR, l_rev VARCHAR, -) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/lineitem.tbl'; +) STORED AS CSV LOCATION 'test_files/tpch/data/lineitem.tbl' OPTIONS ('format.delimiter' '|'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS nation ( @@ -113,7 +113,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS nation ( n_regionkey BIGINT, n_comment VARCHAR, n_rev VARCHAR, -) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/nation.tbl'; +) STORED AS CSV LOCATION 'test_files/tpch/data/nation.tbl' OPTIONS ('format.delimiter' '|'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS region ( @@ -121,4 +121,4 @@ CREATE EXTERNAL TABLE IF NOT EXISTS region ( r_name VARCHAR, r_comment VARCHAR, r_rev VARCHAR, -) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/region.tbl'; +) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' OPTIONS ('format.delimiter' '|'); From d1611a9b9b98ba40d0116da8e8b7e608c3befc30 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Tue, 7 May 2024 14:40:05 +0300 Subject: [PATCH 02/15] Keep PG copy from tests same --- .../sqllogictest/test_files/pg_compat/pg_compat_null.slt | 3 ++- .../sqllogictest/test_files/pg_compat/pg_compat_simple.slt | 3 ++- .../sqllogictest/test_files/pg_compat/pg_compat_union.slt | 3 ++- .../sqllogictest/test_files/pg_compat/pg_compat_window.slt | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt index 50567a5ef5c1..f933c90acc73 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt @@ -41,7 +41,8 @@ onlyif postgres statement ok COPY aggregate_test_100_by_sql FROM '../../testing/data/csv/aggregate_test_100.csv' - CSV HEADER OPTIONS ('format.delimiter' ','); + DELIMITER ',' + CSV HEADER; ### diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt index a0593433c16f..b01ea73c8056 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt @@ -43,7 +43,8 @@ onlyif postgres statement ok COPY aggregate_test_100_by_sql FROM '../../testing/data/csv/aggregate_test_100.csv' - CSV HEADER OPTIONS ('format.delimiter' ','); + DELIMITER ',' + CSV HEADER; ### ## Setup test for datafusion diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt index 2641d2ee8379..05343de32268 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt @@ -40,7 +40,8 @@ onlyif postgres statement ok COPY aggregate_test_100_by_sql FROM '../../testing/data/csv/aggregate_test_100.csv' - CSV HEADER OPTIONS ('format.delimiter' ','); + DELIMITER ',' + CSV HEADER; ### ## Setup test for datafusion diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt index de780adf6f32..cec51d472075 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt +++ 
b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt
@@ -40,7 +40,8 @@ onlyif postgres
 statement ok
 COPY aggregate_test_100_by_sql
 FROM '../../testing/data/csv/aggregate_test_100.csv'
-    CSV HEADER OPTIONS ('format.delimiter' ',');
+    DELIMITER ','
+    CSV HEADER;

 ###
 ## Setup test for datafusion

From 99fae814fb25375195738e8d4c348c8ad98672c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Berkay=20=C5=9Eahin?= <124376117+berkaysynnada@users.noreply.github.com>
Date: Wed, 8 May 2024 09:47:17 +0300
Subject: [PATCH 03/15] Update datafusion/common/src/config.rs

Co-authored-by: Andrew Lamb
---
 datafusion/common/src/config.rs | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 1afac3549117..547bcfac7df3 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -1564,6 +1564,8 @@ config_namespace_with_hashmap! {
 config_namespace! {
     /// Options controlling CSV format
     pub struct CsvOptions {
+        /// If the first line of the CSV is column names.
+        /// If not specified, uses default from `CREATE TABLE` command, if any.
         pub has_header: Option<bool>, default = None
         pub delimiter: u8, default = b','
         pub quote: u8, default = b'"'

From 13c4db64e787fb79a2e42bc76c376e0ed9eb86c8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Berkay=20=C5=9Eahin?= <124376117+berkaysynnada@users.noreply.github.com>
Date: Wed, 8 May 2024 09:48:24 +0300
Subject: [PATCH 04/15] Update datafusion/core/src/datasource/file_format/csv.rs

Co-authored-by: Andrew Lamb
---
 datafusion/core/src/datasource/file_format/csv.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs
index 2e5054d4366c..874bc2cf4e5d 100644
--- a/datafusion/core/src/datasource/file_format/csv.rs
+++ b/datafusion/core/src/datasource/file_format/csv.rs
@@ -819,7 +819,7 @@ mod tests {
         file_name: &str,
         projection: Option<Vec<usize>>,
         limit: Option<usize>,
-        hash_header: bool,
+        has_header: bool,
     ) -> Result<Vec<RecordBatch>> {

From c613a336f3dfb27d6f501f431e8da967b7a20801 Mon Sep 17 00:00:00 2001
From: berkaysynnada
Date: Thu, 9 May 2024 11:34:40 +0300
Subject: [PATCH 05/15] Remove WITH HEADER ROW

---
 datafusion/common/src/config.rs               | 13 +--
 datafusion/core/src/catalog/listing_schema.rs |  4 -
 .../core/src/datasource/file_format/csv.rs    | 19 ++--
 .../core/src/datasource/listing/table.rs      | 12 ---
 .../src/datasource/listing_table_factory.rs   | 18 +---
 datafusion/core/src/datasource/stream.rs      |  1 -
 datafusion/core/src/execution/context/mod.rs  | 11 +--
 datafusion/core/tests/fifo.rs                 |  8 +-
 datafusion/core/tests/sql/mod.rs              |  4 +-
 datafusion/expr/src/logical_plan/ddl.rs       |  3 -
 datafusion/proto/proto/datafusion.proto       | 21 ++--
 datafusion/proto/src/generated/pbjson.rs      | 18 ----
 datafusion/proto/src/generated/prost.rs       | 22 ++---
 datafusion/proto/src/logical_plan/mod.rs      |  3 -
 .../tests/cases/roundtrip_logical_plan.rs     |  8 +-
 datafusion/sql/src/parser.rs                  | 95 ++++---------------
 datafusion/sql/src/statement.rs               |  2 -
 .../test_files/agg_func_substitute.slt        |  4 +-
 .../sqllogictest/test_files/aggregate.slt     | 10 +-
 datafusion/sqllogictest/test_files/avro.slt   | 18 ++--
 .../test_files/create_external_table.slt      | 12 ---
 .../sqllogictest/test_files/csv_files.slt     | 12 +--
 datafusion/sqllogictest/test_files/cte.slt    | 10 +-
 datafusion/sqllogictest/test_files/ddl.slt    | 24 ++---
 .../sqllogictest/test_files/decimal.slt       |  8 +-
 .../sqllogictest/test_files/describe.slt      |  4 +-
 .../sqllogictest/test_files/distinct_on.slt   |  2 +-
 datafusion/sqllogictest/test_files/errors.slt |  2 +-
 .../sqllogictest/test_files/explain.slt       | 14 +--
 datafusion/sqllogictest/test_files/expr.slt   |  2 +-
 .../sqllogictest/test_files/functions.slt     |  2 +-
 datafusion/sqllogictest/test_files/group.slt  |  4 +-
 .../sqllogictest/test_files/group_by.slt      | 26 ++---
 .../sqllogictest/test_files/identifiers.slt   |  2 +-
 .../test_files/information_schema.slt         |  3 +-
 datafusion/sqllogictest/test_files/insert.slt |  2 +-
 .../test_files/insert_to_external.slt         |  2 +-
 .../join_disable_repartition_joins.slt        |  4 +-
 datafusion/sqllogictest/test_files/joins.slt  |  8 +-
 datafusion/sqllogictest/test_files/limit.slt  |  2 +-
 datafusion/sqllogictest/test_files/math.slt   |  2 +-
 .../test_files/monotonic_projection_test.slt  |  4 +-
 datafusion/sqllogictest/test_files/order.slt  | 12 +--
 .../sqllogictest/test_files/parquet.slt       | 30 +++---
 .../test_files/pg_compat/pg_compat_null.slt   |  2 +-
 .../test_files/pg_compat/pg_compat_simple.slt |  2 +-
 .../test_files/pg_compat/pg_compat_union.slt  |  2 +-
 .../test_files/pg_compat/pg_compat_window.slt |  2 +-
 .../sqllogictest/test_files/predicates.slt    |  8 +-
 .../sqllogictest/test_files/projection.slt    |  4 +-
 .../sqllogictest/test_files/references.slt    |  2 +-
 .../sqllogictest/test_files/repartition.slt   |  4 +-
 .../test_files/repartition_scan.slt           |  2 +-
 datafusion/sqllogictest/test_files/scalar.slt |  2 +-
 datafusion/sqllogictest/test_files/select.slt |  8 +-
 .../sqllogictest/test_files/subquery.slt      |  6 +-
 datafusion/sqllogictest/test_files/topk.slt   |  2 +-
 datafusion/sqllogictest/test_files/union.slt  | 10 +-
 .../sqllogictest/test_files/wildcard.slt      |  2 +-
 datafusion/sqllogictest/test_files/window.slt | 29 +++---
 docs/source/user-guide/cli/datasources.md     |  4 +-
 docs/source/user-guide/sql/ddl.md             | 23 ++---
 docs/source/user-guide/sql/write_options.md   |  4 +-
 63 files changed, 233 insertions(+), 372 deletions(-)

diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 547bcfac7df3..49d31257c8ba 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -1564,9 +1564,9 @@ config_namespace_with_hashmap! {
 config_namespace! {
     /// Options controlling CSV format
     pub struct CsvOptions {
-    /// If the first line of the CSV is column names.
-    /// If not specified, uses default from `CREATE TABLE` command, if any.
-    pub has_header: Option<bool>, default = None
+        /// If the first line of the CSV is column names.
+        /// If not specified, uses default from `CREATE TABLE` command, if any.
+        pub has_header: Option<bool>, default = None
         pub delimiter: u8, default = b','
         pub quote: u8, default = b'"'
         pub escape: Option<u8>, default = None
@@ -1606,9 +1606,10 @@ impl CsvOptions {
         self
     }

-    /// True if the first line is a header.
-    pub fn has_header(&self) -> bool {
-        self.has_header.unwrap_or(false)
+    /// True if the first line is a header. If not set by the format
+    /// options, the session state decides it.
+    pub fn has_header(&self, config_opt: &ConfigOptions) -> bool {
+        self.has_header.unwrap_or(config_opt.catalog.has_header)
     }

     /// The character separating values within a row.
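The `Option<bool>` above makes header detection two-level: an explicit 'format.has_header' option wins, otherwise the session-level `datafusion.catalog.has_header` default applies. A minimal, self-contained sketch of that resolution order (simplified stand-in types, not the real DataFusion definitions):

    // Stand-ins for the session-level and per-format option structs.
    struct CatalogOptions { has_header: bool }
    struct ConfigOptions { catalog: CatalogOptions }
    struct CsvOptions { has_header: Option<bool> }

    impl CsvOptions {
        // Per-format option wins; otherwise the session default decides.
        fn has_header(&self, config_opt: &ConfigOptions) -> bool {
            self.has_header.unwrap_or(config_opt.catalog.has_header)
        }
    }

    fn main() {
        let session = ConfigOptions { catalog: CatalogOptions { has_header: true } };
        let explicit = CsvOptions { has_header: Some(false) };
        let unset = CsvOptions { has_header: None };
        assert!(!explicit.has_header(&session)); // OPTIONS ('format.has_header' 'false') wins
        assert!(unset.has_header(&session));     // falls back to the session default
    }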
diff --git a/datafusion/core/src/catalog/listing_schema.rs b/datafusion/core/src/catalog/listing_schema.rs
index 15e6764b1033..29f3e4ad8181 100644
--- a/datafusion/core/src/catalog/listing_schema.rs
+++ b/datafusion/core/src/catalog/listing_schema.rs
@@ -57,7 +57,6 @@ pub struct ListingSchemaProvider {
     store: Arc<dyn ObjectStore>,
     tables: Arc<Mutex<HashMap<String, Arc<dyn TableProvider>>>>,
     format: String,
-    has_header: bool,
 }

 impl ListingSchemaProvider {
@@ -76,7 +75,6 @@ impl ListingSchemaProvider {
         factory: Arc<dyn TableProviderFactory>,
         store: Arc<dyn ObjectStore>,
         format: String,
-        has_header: bool,
     ) -> Self {
         Self {
             authority,
@@ -85,7 +83,6 @@ impl ListingSchemaProvider {
             store,
             tables: Arc::new(Mutex::new(HashMap::new())),
             format,
-            has_header,
         }
     }
@@ -138,7 +135,6 @@ impl ListingSchemaProvider {
                 name,
                 location: table_url,
                 file_type: self.format.clone(),
-                has_header: self.has_header,
                 table_partition_cols: vec![],
                 if_not_exists: false,
                 definition: None,
diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs
index 874bc2cf4e5d..ab2582b4e820 100644
--- a/datafusion/core/src/datasource/file_format/csv.rs
+++ b/datafusion/core/src/datasource/file_format/csv.rs
@@ -39,7 +39,7 @@ use arrow::array::RecordBatch;
 use arrow::csv::WriterBuilder;
 use arrow::datatypes::SchemaRef;
 use arrow::datatypes::{DataType, Field, Fields, Schema};
-use datafusion_common::config::CsvOptions;
+use datafusion_common::config::{ConfigOptions, CsvOptions};
 use datafusion_common::file_options::csv_writer::CsvWriterOptions;
 use datafusion_common::{exec_err, not_impl_err, DataFusionError, FileType};
 use datafusion_execution::TaskContext;
@@ -141,8 +141,8 @@ impl CsvFormat {
     }

     /// True if the first line is a header.
-    pub fn has_header(&self) -> bool {
-        self.options.has_header.unwrap_or(false)
+    pub fn has_header(&self, config_opt: &ConfigOptions) -> bool {
+        self.options.has_header(config_opt)
     }

     /// The character separating values within a row.
@@ -242,9 +242,9 @@ impl FileFormat for CsvFormat {
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let exec = CsvExec::new(
             conf,
-            self.options
-                .has_header
-                .unwrap_or(state.config().options().catalog.has_header),
+            // If not set by the format options, the session state
+            // decides whether the first line is a header.
+            self.options.has_header(state.config_options()),
             self.options.delimiter,
             self.options.quote,
             self.options.escape,
@@ -302,10 +302,7 @@ impl CsvFormat {
         while let Some(chunk) = stream.next().await.transpose()? {
             let format = arrow::csv::reader::Format::default()
                 .with_header(
-                    self.options
-                        .has_header
-                        .unwrap_or(state.config_options().catalog.has_header)
-                        && first_chunk,
+                    self.options.has_header(state.config_options()) && first_chunk,
                 )
                 .with_delimiter(self.options.delimiter);
@@ -823,7 +820,7 @@ mod tests {
     ) -> Result<Vec<RecordBatch>> {
         let root = format!("{}/csv", crate::test_util::arrow_test_data());
         let mut format = CsvFormat::default();
-        format = format.with_has_header(hash_header);
+        format = format.with_has_header(has_header);
         scan_format(state, &format, &root, file_name, projection, limit).await
     }

diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index e6f763f52426..7488091fff70 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -1561,18 +1561,6 @@ mod tests {
         Ok(())
     }

-    #[tokio::test]
-    async fn test_insert_into_sql_csv_defaults_header_row() -> Result<()> {
-        helper_test_insert_into_sql(
-            "csv",
-            FileCompressionType::UNCOMPRESSED,
-            "WITH HEADER ROW",
-            None,
-        )
-        .await?;
-        Ok(())
-    }
-
     #[tokio::test]
     async fn test_insert_into_sql_json_defaults() -> Result<()> {
         helper_test_insert_into_sql("json", FileCompressionType::UNCOMPRESSED, "", None)
diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs
index 8380808d5f3b..1d33944330da 100644
--- a/datafusion/core/src/datasource/listing_table_factory.rs
+++ b/datafusion/core/src/datasource/listing_table_factory.rs
@@ -34,7 +34,7 @@ use crate::datasource::TableProvider;
 use crate::execution::context::SessionState;

 use arrow::datatypes::{DataType, SchemaRef};
-use datafusion_common::{arrow_datafusion_err, plan_err, DataFusionError, FileType};
+use datafusion_common::{arrow_datafusion_err, DataFusionError, FileType};
 use datafusion_expr::CreateExternalTable;

 use async_trait::async_trait;
@@ -66,15 +66,7 @@ impl TableProviderFactory for ListingTableFactory {
         let file_extension = get_extension(cmd.location.as_str());
         let file_format: Arc<dyn FileFormat> = match file_type {
             FileType::CSV => {
-                let mut csv_options = table_options.csv;
-                if let Some(has_header) = csv_options.has_header {
-                    if cmd.has_header && !has_header {
-                        return plan_err!("Conflicting header options for CSV file");
-                    }
-                } else {
-                    csv_options.has_header = Some(cmd.has_header);
-                }
-                Arc::new(CsvFormat::default().with_options(csv_options))
+                Arc::new(CsvFormat::default().with_options(table_options.csv))
             }
             #[cfg(feature = "parquet")]
             FileType::PARQUET => {
@@ -192,14 +184,13 @@ mod tests {
             name,
             location: csv_file.path().to_str().unwrap().to_string(),
             file_type: "csv".to_string(),
-            has_header: true,
             schema: Arc::new(DFSchema::empty()),
             table_partition_cols: vec![],
             if_not_exists: false,
             definition: None,
             order_exprs: vec![],
             unbounded: false,
-            options: HashMap::new(),
+            options: HashMap::from([("format.has_header".into(), "true".into())]),
             constraints: Constraints::empty(),
             column_defaults: HashMap::new(),
         };
@@ -231,14 +222,13 @@ mod tests {
             name,
             location: csv_file.path().to_str().unwrap().to_string(),
             file_type: "csv".to_string(),
-            has_header: true,
             schema: Arc::new(DFSchema::empty()),
             table_partition_cols: vec![],
             if_not_exists: false,
             definition: None,
             order_exprs: vec![],
             unbounded: false,
-            options,
+            options: HashMap::from([("format.has_header".into(), "true".into())]),
             constraints: Constraints::empty(),
             column_defaults: HashMap::new(),
         };
diff --git a/datafusion/core/src/datasource/stream.rs b/datafusion/core/src/datasource/stream.rs
index 0a64e7c516ef..5e485f42b516 100644
--- a/datafusion/core/src/datasource/stream.rs
+++ b/datafusion/core/src/datasource/stream.rs
@@ -62,7 +62,6 @@ impl TableProviderFactory for StreamTableFactory {
         let config = StreamConfig::new_file(schema, location.into())
             .with_encoding(encoding)
             .with_order(cmd.order_exprs.clone())
-            .with_header(cmd.has_header)
             .with_batch_size(state.config().batch_size())
             .with_constraints(cmd.constraints.clone());
diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs
index f3af31f895f9..6c2dc692dae0 100644
--- a/datafusion/core/src/execution/context/mod.rs
+++ b/datafusion/core/src/execution/context/mod.rs
@@ -1560,7 +1560,6 @@ impl SessionState {
         let url = url.to_string();
         let format = format.to_string();
-        let has_header = config.options().catalog.has_header;
         let url = Url::parse(url.as_str()).expect("Invalid default catalog location!");
         let authority = match url.host_str() {
             Some(host) => format!("{}://{}", url.scheme(), host),
@@ -1578,14 +1577,8 @@ impl SessionState {
             Some(factory) => factory,
             _ => return,
         };
-        let schema = ListingSchemaProvider::new(
-            authority,
-            path,
-            factory.clone(),
-            store,
-            format,
-            has_header,
-        );
+        let schema =
+            ListingSchemaProvider::new(authority, path, factory.clone(), store, format);
         let _ = default_catalog
             .register_schema("default", Arc::new(schema))
             .expect("Failed to register default schema");
diff --git a/datafusion/core/tests/fifo.rs b/datafusion/core/tests/fifo.rs
index 9b132f18c7a5..a63240d03d94 100644
--- a/datafusion/core/tests/fifo.rs
+++ b/datafusion/core/tests/fifo.rs
@@ -384,8 +384,8 @@ mod unix_test {
                 a2 INT NOT NULL
             )
             STORED AS CSV
-            WITH HEADER ROW
-            LOCATION '{source_display_fifo_path}'"
+            LOCATION '{source_display_fifo_path}'
+            OPTIONS ('format.has_header' 'true')"
         ))
         .await?;
@@ -396,8 +396,8 @@ mod unix_test {
                 a2 INT NOT NULL
             )
             STORED AS CSV
-            WITH HEADER ROW
-            LOCATION '{sink_display_fifo_path}'"
+            LOCATION '{sink_display_fifo_path}'
+            OPTIONS ('format.has_header' 'true')"
         ))
         .await?;
diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs
index 9b7828a777c8..00f091d77cff 100644
--- a/datafusion/core/tests/sql/mod.rs
+++ b/datafusion/core/tests/sql/mod.rs
@@ -85,8 +85,8 @@ async fn register_aggregate_csv_by_sql(ctx: &SessionContext) {
             c13 VARCHAR NOT NULL
         )
         STORED AS CSV
-        WITH HEADER ROW
-        LOCATION '{testdata}/csv/aggregate_test_100.csv'
+        LOCATION '{testdata}/csv/aggregate_test_100.csv'
+        OPTIONS ('format.has_header' 'true')
         "
     ))
     .await
diff --git a/datafusion/expr/src/logical_plan/ddl.rs b/datafusion/expr/src/logical_plan/ddl.rs
index 5b1c377dc1e3..4538ff52c052 100644
--- a/datafusion/expr/src/logical_plan/ddl.rs
+++ b/datafusion/expr/src/logical_plan/ddl.rs
@@ -189,8 +189,6 @@ pub struct CreateExternalTable {
     pub location: String,
     /// The file type of physical file
     pub file_type: String,
-    /// Whether the CSV file contains a header
-    pub has_header: bool,
     /// Partition Columns
     pub table_partition_cols: Vec<String>,
     /// Option to not error if table already exists
@@ -216,7 +214,6 @@ impl Hash for CreateExternalTable {
         self.name.hash(state);
         self.location.hash(state);
         self.file_type.hash(state);
-        self.has_header.hash(state);
         self.table_partition_cols.hash(state);
         self.if_not_exists.hash(state);
         self.definition.hash(state);
diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto
index 3da326ff37fe..dcb6a983978c 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -201,19 +201,18 @@ message Constraints{

 message CreateExternalTableNode {
   reserved 1; // was string name
-  TableReference name = 10;
+  TableReference name = 9;
   string location = 2;
   string file_type = 3;
-  bool has_header = 4;
-  DfSchema schema = 5;
-  repeated string table_partition_cols = 6;
-  bool if_not_exists = 7;
-  string definition = 8;
-  repeated LogicalExprNodeCollection order_exprs = 11;
-  bool unbounded = 12;
-  map<string, string> options = 9;
-  Constraints constraints = 13;
-  map<string, LogicalExprNode> column_defaults = 14;
+  DfSchema schema = 4;
+  repeated string table_partition_cols = 5;
+  bool if_not_exists = 6;
+  string definition = 7;
+  repeated LogicalExprNodeCollection order_exprs = 10;
+  bool unbounded = 11;
+  map<string, string> options = 8;
+  Constraints constraints = 12;
+  map<string, LogicalExprNode> column_defaults = 13;
 }

 message PrepareNode {
diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs
index a836ed5abcc5..7284e52f3cdb 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -4676,9 +4676,6 @@ impl serde::Serialize for CreateExternalTableNode {
         if !self.file_type.is_empty() {
             len += 1;
         }
-        if self.has_header {
-            len += 1;
-        }
         if self.schema.is_some() {
             len += 1;
         }
@@ -4716,9 +4713,6 @@ impl serde::Serialize for CreateExternalTableNode {
         if !self.file_type.is_empty() {
             struct_ser.serialize_field("fileType", &self.file_type)?;
         }
-        if self.has_header {
-            struct_ser.serialize_field("hasHeader", &self.has_header)?;
-        }
         if let Some(v) = self.schema.as_ref() {
             struct_ser.serialize_field("schema", v)?;
         }
@@ -4760,8 +4754,6 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode {
             "location",
             "file_type",
             "fileType",
-            "has_header",
-            "hasHeader",
             "schema",
             "table_partition_cols",
             "tablePartitionCols",
@@ -4782,7 +4774,6 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode {
             Name,
             Location,
             FileType,
-            HasHeader,
             Schema,
             TablePartitionCols,
             IfNotExists,
@@ -4816,7 +4807,6 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode {
                             "name" => Ok(GeneratedField::Name),
                             "location" => Ok(GeneratedField::Location),
                             "fileType" | "file_type" => Ok(GeneratedField::FileType),
-                            "hasHeader" | "has_header" => Ok(GeneratedField::HasHeader),
                             "schema" => Ok(GeneratedField::Schema),
                             "tablePartitionCols" | "table_partition_cols" => Ok(GeneratedField::TablePartitionCols),
                             "ifNotExists" | "if_not_exists" => Ok(GeneratedField::IfNotExists),
@@ -4848,7 +4838,6 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode {
                 let mut name__ = None;
                 let mut location__ = None;
                 let mut file_type__ = None;
-                let mut has_header__ = None;
                 let mut schema__ = None;
                 let mut table_partition_cols__ = None;
                 let mut if_not_exists__ = None;
@@ -4878,12 +4867,6 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode {
                         }
                         file_type__ = Some(map_.next_value()?);
                     }
-                    GeneratedField::HasHeader => {
-                        if has_header__.is_some() {
-                            return Err(serde::de::Error::duplicate_field("hasHeader"));
-                        }
-                        has_header__ = Some(map_.next_value()?);
-                    }
                    GeneratedField::Schema => {
                        if schema__.is_some() {
                            return Err(serde::de::Error::duplicate_field("schema"));
@@ -4948,7 +4931,6 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode {
                    name: name__,
                    location: location__.unwrap_or_default(),
                    file_type: file_type__.unwrap_or_default(),
-                    has_header: has_header__.unwrap_or_default(),
                    schema: schema__,
                    table_partition_cols: table_partition_cols__.unwrap_or_default(),
                    if_not_exists: if_not_exists__.unwrap_or_default(),
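Alongside the generated-code churn, the user-facing shape of the change is worth seeing in one place. The statements below are illustrative only (table name and path invented), but the option keys and values are exactly the ones the new parser errors point to:

    -- Rejected after this series; each clause now errors with a migration hint:
    CREATE EXTERNAL TABLE t (c1 INT)
    STORED AS CSV WITH HEADER ROW DELIMITER '|' COMPRESSION TYPE GZIP
    LOCATION 'data/t.csv.gz';

    -- Equivalent statement using the consolidated OPTIONS clause:
    CREATE EXTERNAL TABLE t (c1 INT)
    STORED AS CSV
    LOCATION 'data/t.csv.gz'
    OPTIONS (
        'format.has_header' 'true',
        'format.delimiter' '|',
        'format.compression' 'gzip'
    );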
diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs
index 3b48a8050031..78a9f2bf34a7 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -320,34 +320,32 @@ pub struct Constraints {
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct CreateExternalTableNode {
-    #[prost(message, optional, tag = "10")]
+    #[prost(message, optional, tag = "9")]
     pub name: ::core::option::Option<TableReference>,
     #[prost(string, tag = "2")]
     pub location: ::prost::alloc::string::String,
     #[prost(string, tag = "3")]
     pub file_type: ::prost::alloc::string::String,
-    #[prost(bool, tag = "4")]
-    pub has_header: bool,
-    #[prost(message, optional, tag = "5")]
+    #[prost(message, optional, tag = "4")]
     pub schema: ::core::option::Option<DfSchema>,
-    #[prost(string, repeated, tag = "6")]
+    #[prost(string, repeated, tag = "5")]
     pub table_partition_cols: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
-    #[prost(bool, tag = "7")]
+    #[prost(bool, tag = "6")]
     pub if_not_exists: bool,
-    #[prost(string, tag = "8")]
+    #[prost(string, tag = "7")]
     pub definition: ::prost::alloc::string::String,
-    #[prost(message, repeated, tag = "11")]
+    #[prost(message, repeated, tag = "10")]
     pub order_exprs: ::prost::alloc::vec::Vec<LogicalExprNodeCollection>,
-    #[prost(bool, tag = "12")]
+    #[prost(bool, tag = "11")]
     pub unbounded: bool,
-    #[prost(map = "string, string", tag = "9")]
+    #[prost(map = "string, string", tag = "8")]
     pub options: ::std::collections::HashMap<
         ::prost::alloc::string::String,
         ::prost::alloc::string::String,
     >,
-    #[prost(message, optional, tag = "13")]
+    #[prost(message, optional, tag = "12")]
     pub constraints: ::core::option::Option<Constraints>,
-    #[prost(map = "string, message", tag = "14")]
+    #[prost(map = "string, message", tag = "13")]
     pub column_defaults: ::std::collections::HashMap<
         ::prost::alloc::string::String,
         LogicalExprNode,
diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs
index 474fc93ac104..a6352bcefc3e 100644
--- a/datafusion/proto/src/logical_plan/mod.rs
+++ b/datafusion/proto/src/logical_plan/mod.rs
@@ -548,7 +548,6 @@ impl AsLogicalPlan for LogicalPlanNode {
                     )?,
                     location: create_extern_table.location.clone(),
                     file_type: create_extern_table.file_type.clone(),
-                    has_header: create_extern_table.has_header,
                     table_partition_cols: create_extern_table
                         .table_partition_cols
                         .clone(),
@@ -1318,7 +1317,6 @@ impl AsLogicalPlan for LogicalPlanNode {
                 name,
                 location,
                 file_type,
-                has_header,
                 schema: df_schema,
                 table_partition_cols,
                 if_not_exists,
@@ -1354,7 +1352,6 @@ impl AsLogicalPlan for LogicalPlanNode {
                         name: Some(name.clone().into()),
                         location: location.clone(),
                         file_type: file_type.clone(),
-                        has_header: *has_header,
                         schema: Some(df_schema.try_into()?),
                         table_partition_cols: table_partition_cols.clone(),
                         if_not_exists: *if_not_exists,
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index 3800b672b5e2..74df7178b0f2 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -236,10 +236,10 @@ async fn roundtrip_custom_listing_tables() -> Result<()> {
               primary key(c)
             )
             STORED AS CSV
-            WITH HEADER ROW
             WITH ORDER (a ASC, b ASC)
             WITH ORDER (c ASC)
-            LOCATION '../core/tests/data/window_2.csv';";
+            LOCATION '../core/tests/data/window_2.csv'
+            OPTIONS ('format.has_header' 'true')";

     let plan = ctx.state().create_logical_plan(query).await?;
@@ -266,10 +266,10 @@ async fn roundtrip_logical_plan_aggregation_with_pk() -> Result<()> {
               primary key(c)
             )
             STORED AS CSV
-            WITH HEADER ROW
             WITH ORDER (a ASC, b ASC)
             WITH ORDER (c ASC)
-            LOCATION '../core/tests/data/window_2.csv';",
+            LOCATION '../core/tests/data/window_2.csv'
+            OPTIONS ('format.has_header' 'true')",
     )
     .await?;
diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs
index 607276f47b73..7468527662f6 100644
--- a/datafusion/sql/src/parser.rs
+++ b/datafusion/sql/src/parser.rs
@@ -19,9 +19,7 @@

 use std::collections::{HashMap, VecDeque};
 use std::fmt;
-use std::str::FromStr;

-use datafusion_common::parsers::CompressionTypeVariant;
 use sqlparser::{
     ast::{
         ColumnDef, ColumnOptionDef, ObjectName, OrderByExpr, Query,
@@ -130,10 +128,6 @@ impl fmt::Display for CopyToStatement {
             write!(f, " PARTITIONED BY ({})", partitioned_by.join(", "))?;
         }

-        if self.has_header {
-            write!(f, " WITH HEADER ROW")?;
-        }
-
         if !options.is_empty() {
             let opts: Vec<_> =
                 options.iter().map(|(k, v)| format!("{k} {v}")).collect();
             write!(f, " OPTIONS ({})", opts.join(", "))?;
@@ -172,9 +166,6 @@ pub(crate) type LexOrdering = Vec<OrderByExpr>;
/// [ IF NOT EXISTS ]
/// <TABLE_NAME>[ (<column_definition>) ]
/// STORED AS <file_type>
-/// [ WITH HEADER ROW ]
-/// [ DELIMITER <char> ]
-/// [ COMPRESSION TYPE <GZIP | BZIP2 | XZ | ZSTD> ]
/// [ PARTITIONED BY (<column_definition_list> | <column_list>) ]
/// [ WITH ORDER (<ordered column list>)
/// [ OPTIONS (<key_value_list>) ]
@@ -196,8 +187,6 @@ pub struct CreateExternalTable {
    pub columns: Vec<ColumnDef>,
    /// File type (Parquet, NDJSON, CSV, etc)
    pub file_type: String,
-    /// CSV Header row?
-    pub has_header: bool,
    /// Path to file
    pub location: String,
    /// Partition Columns
@@ -424,8 +413,8 @@ impl<'a> DFParser<'a> {
                Keyword::WITH => {
                    self.parser.expect_keyword(Keyword::HEADER)?;
                    self.parser.expect_keyword(Keyword::ROW)?;
-                    ensure_not_set(&builder.has_header, "WITH HEADER ROW")?;
-                    builder.has_header = Some(true);
+                    return parser_err!("\"WITH HEADER ROW\" clause is no longer in use. Please use the \"OPTIONS\"
+                    clause with 'format.has_header' set appropriately, e.g., OPTIONS ('format.has_header' 'true')");
                }
                Keyword::PARTITIONED => {
                    self.parser.expect_keyword(Keyword::BY)?;
@@ -695,9 +684,6 @@ impl<'a> DFParser<'a> {
        struct Builder {
            file_type: Option<String>,
            location: Option<String>,
-            has_header: Option<bool>,
-            delimiter: Option<char>,
-            file_compression_type: Option<CompressionTypeVariant>,
            table_partition_cols: Option<Vec<String>>,
            order_exprs: Vec<LexOrdering>,
            options: Option<HashMap<String, String>>,
@@ -730,22 +716,18 @@ impl<'a> DFParser<'a> {
                    } else {
                        self.parser.expect_keyword(Keyword::HEADER)?;
                        self.parser.expect_keyword(Keyword::ROW)?;
-                        ensure_not_set(&builder.has_header, "WITH HEADER ROW")?;
-                        builder.has_header = Some(true);
+                        return parser_err!("\"WITH HEADER ROW\" clause is no longer in use. Please use the \"OPTIONS\"
+                        clause with 'format.has_header' set appropriately, e.g., OPTIONS ('format.has_header' 'true')");
                    }
                }
                Keyword::DELIMITER => {
-                    ensure_not_set(&builder.delimiter, "DELIMITER")?;
-                    builder.delimiter = Some(self.parse_delimiter()?);
+                    return parser_err!("\"DELIMITER\" clause is no longer in use. Please use the \"OPTIONS\"
+                    clause with 'format.delimiter' set appropriately, e.g., OPTIONS ('format.delimiter' ',')");
                }
                Keyword::COMPRESSION => {
                    self.parser.expect_keyword(Keyword::TYPE)?;
-                    ensure_not_set(
-                        &builder.file_compression_type,
-                        "COMPRESSION TYPE",
-                    )?;
-                    builder.file_compression_type =
-                        Some(self.parse_file_compression_type()?);
+                    return parser_err!("\"COMPRESSION TYPE\" clause is no longer in use. Please use the \"OPTIONS\"
+                    clause with 'format.compression' set appropriately, e.g., OPTIONS ('format.compression' 'gzip')");
                }
                Keyword::PARTITIONED => {
                    self.parser.expect_keyword(Keyword::BY)?;
@@ -811,7 +793,6 @@ impl<'a> DFParser<'a> {
            name: table_name.to_string(),
            columns,
            file_type: builder.file_type.unwrap(),
-            has_header: builder.has_header.unwrap_or(false),
            location: builder.location.unwrap(),
            table_partition_cols: builder.table_partition_cols.unwrap_or(vec![]),
            order_exprs: builder.order_exprs,
@@ -832,17 +813,6 @@ impl<'a> DFParser<'a> {
        }
    }

-    /// Parses the set of
-    fn parse_file_compression_type(
-        &mut self,
-    ) -> Result<CompressionTypeVariant, ParserError> {
-        let token = self.parser.next_token();
-        match &token.token {
-            Token::Word(w) => CompressionTypeVariant::from_str(&w.value),
-            _ => self.expected("one of GZIP, BZIP2, XZ, ZSTD", token),
-        }
-    }
-
    /// Parses (key value) style options where the values are literal strings.
    fn parse_string_options(&mut self) -> Result<HashMap<String, String>, ParserError> {
        let mut options = HashMap::new();
@@ -892,16 +862,6 @@ impl<'a> DFParser<'a> {
        }
        Ok(options)
    }
-
-    fn parse_delimiter(&mut self) -> Result<char, ParserError> {
-        let token = self.parser.parse_literal_string()?;
-        match token.len() {
-            1 => Ok(token.chars().next().unwrap()),
-            _ => Err(ParserError::TokenizerError(
-                "Delimiter must be a single char".to_string(),
-            )),
-        }
-    }
 }

 #[cfg(test)]
 mod tests {
@@ -960,7 +920,6 @@ mod tests {
            name: "t".into(),
            columns: vec![make_column_def("c1", DataType::Int(display))],
            file_type: "CSV".to_string(),
-            has_header: false,
            location: "foo.csv".into(),
            table_partition_cols: vec![],
            order_exprs: vec![],
@@ -977,7 +936,6 @@ mod tests {
            name: "t".into(),
            columns: vec![make_column_def("c1", DataType::Int(None))],
            file_type: "CSV".to_string(),
-            has_header: false,
            location: "foo.csv".into(),
            table_partition_cols: vec![],
            order_exprs: vec![],
@@ -995,7 +953,6 @@ mod tests {
            name: "t".into(),
            columns: vec![make_column_def("c1", DataType::Int(None))],
            file_type: "CSV".to_string(),
-            has_header: false,
            location: "foo.csv".into(),
            table_partition_cols: vec![],
            order_exprs: vec![],
@@ -1013,16 +970,12 @@ mod tests {
            name: "t".into(),
            columns: vec![make_column_def("c1", DataType::Int(display))],
            file_type: "CSV".to_string(),
-            has_header: false,
            location: "foo.csv".into(),
            table_partition_cols: vec![],
            order_exprs: vec![],
            if_not_exists: false,
            unbounded: false,
-            options: vec!["format.delimiter".to_string()]
-                .into_iter()
-                .zip(vec!["|".to_string()])
-                .collect(),
+            options: HashMap::from([("format.delimiter".into(), "|".into())]),
            constraints: vec![],
        });
        expect_parse_ok(sql, expected)?;
@@ -1034,7 +987,6 @@ mod tests {
            name: "t".into(),
            columns: vec![make_column_def("c1", DataType::Int(display))],
            file_type: "CSV".to_string(),
-            has_header: false,
            location: "foo.csv".into(),
            table_partition_cols: vec!["p1".to_string(), "p2".to_string()],
            order_exprs: vec![],
            if_not_exists: false,
            unbounded: false,
            options: HashMap::new(),
            constraints: vec![],
        });
        expect_parse_ok(sql, expected)?;

-        // positive case: it is ok for case insensitive sql stmt with `WITH HEADER ROW` tokens
+        // positive case: it is ok for case insensitive sql stmt with has_header option tokens
        let sqls = vec![
-            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH HEADER ROW LOCATION 'foo.csv'",
-            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV with header row LOCATION 'foo.csv'"
+            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS ('FORMAT.HAS_HEADER' 'TRUE')",
+            "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS ('format.has_header' 'true')",
        ];
        for sql in sqls {
            let
expected = Statement::CreateExternalTable(CreateExternalTable { name: "t".into(), columns: vec![make_column_def("c1", DataType::Int(display))], file_type: "CSV".to_string(), - has_header: true, location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, unbounded: false, - options: HashMap::new(), + options: HashMap::from([("format.has_header".into(), "true".into())]), constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1083,7 +1034,6 @@ mod tests { name: "t".into(), columns: vec![make_column_def("c1", DataType::Int(display))], file_type: "CSV".to_string(), - has_header: false, location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![], @@ -1104,7 +1054,6 @@ mod tests { name: "t".into(), columns: vec![], file_type: "PARQUET".to_string(), - has_header: false, location: "foo.parquet".into(), table_partition_cols: vec![], order_exprs: vec![], @@ -1121,7 +1070,6 @@ mod tests { name: "t".into(), columns: vec![], file_type: "PARQUET".to_string(), - has_header: false, location: "foo.parquet".into(), table_partition_cols: vec![], order_exprs: vec![], @@ -1138,7 +1086,6 @@ mod tests { name: "t".into(), columns: vec![], file_type: "AVRO".to_string(), - has_header: false, location: "foo.avro".into(), table_partition_cols: vec![], order_exprs: vec![], @@ -1156,7 +1103,6 @@ mod tests { name: "t".into(), columns: vec![], file_type: "PARQUET".to_string(), - has_header: false, location: "foo.parquet".into(), table_partition_cols: vec![], order_exprs: vec![], @@ -1177,7 +1123,6 @@ mod tests { make_column_def("p1", DataType::Int(None)), ], file_type: "CSV".to_string(), - has_header: false, location: "foo.csv".into(), table_partition_cols: vec!["p1".to_string()], order_exprs: vec![], @@ -1205,7 +1150,6 @@ mod tests { name: "t".into(), columns: vec![], file_type: "X".to_string(), - has_header: false, location: "blahblah".into(), table_partition_cols: vec![], order_exprs: vec![], @@ -1223,7 +1167,6 @@ mod tests { name: "t".into(), columns: vec![], file_type: "X".to_string(), - has_header: false, location: "blahblah".into(), table_partition_cols: vec![], order_exprs: vec![], @@ -1263,7 +1206,6 @@ mod tests { name: "t".into(), columns: vec![make_column_def("c1", DataType::Int(None))], file_type: "CSV".to_string(), - has_header: false, location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![vec![OrderByExpr { @@ -1292,7 +1234,6 @@ mod tests { make_column_def("c2", DataType::Int(display)), ], file_type: "CSV".to_string(), - has_header: false, location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![vec![ @@ -1330,7 +1271,6 @@ mod tests { make_column_def("c2", DataType::Int(display)), ], file_type: "CSV".to_string(), - has_header: false, location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![vec![OrderByExpr { @@ -1359,13 +1299,13 @@ mod tests { let sql = " CREATE UNBOUNDED EXTERNAL TABLE IF NOT EXISTS t (c1 int, c2 float) STORED AS PARQUET - WITH HEADER ROW WITH ORDER (c1 - c2 ASC) PARTITIONED BY (c1) LOCATION 'foo.parquet' OPTIONS (ROW_GROUP_SIZE '1024', 'TRUNCATE' 'NO', 'format.compression' 'zstd', - 'format.delimiter' '*') + 'format.delimiter' '*', + 'format.has_header' 'true') "; let expected = Statement::CreateExternalTable(CreateExternalTable { name: "t".into(), @@ -1374,7 +1314,6 @@ mod tests { make_column_def("c2", DataType::Float(None)), ], file_type: "PARQUET".to_string(), - has_header: true, location: "foo.parquet".into(), table_partition_cols: vec!["c1".into()], order_exprs: 
vec![vec![OrderByExpr { @@ -1399,6 +1338,7 @@ mod tests { ("format.delimiter".into(), "*".into()), ("ROW_GROUP_SIZE".into(), "1024".into()), ("TRUNCATE".into(), "NO".into()), + ("format.has_header".into(), "true".into()), ]), constraints: vec![], }); @@ -1484,7 +1424,8 @@ mod tests { panic!("Expected query, got {statement:?}"); }; - let sql = "COPY (SELECT 1) TO bar STORED AS CSV WITH HEADER ROW"; + let sql = + "COPY (SELECT 1) TO bar STORED AS CSV OPTIONS ('format.has_header' 'true')"; let expected = Statement::CopyTo(CopyToStatement { source: CopyToSource::Query(query), target: "bar".to_string(), diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index ef5561335626..e6eda0b2787d 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -968,7 +968,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { name, columns, file_type, - has_header, location, table_partition_cols, if_not_exists, @@ -1021,7 +1020,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { name, location, file_type, - has_header, table_partition_cols, if_not_exists, definition, diff --git a/datafusion/sqllogictest/test_files/agg_func_substitute.slt b/datafusion/sqllogictest/test_files/agg_func_substitute.slt index c5cd78f1259b..342d45e7fb24 100644 --- a/datafusion/sqllogictest/test_files/agg_func_substitute.slt +++ b/datafusion/sqllogictest/test_files/agg_func_substitute.slt @@ -27,10 +27,10 @@ CREATE EXTERNAL TABLE multiple_ordered_table ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../../datafusion/core/tests/data/window_2.csv'; +LOCATION '../../datafusion/core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); query TT diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index bc677b73fb94..a8393e08630b 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -35,8 +35,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok CREATE TABLE d_table (c1 decimal(10,3), c2 varchar) @@ -116,8 +116,8 @@ c2 INT NOT NULL, c3 INT NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../core/tests/data/aggregate_agg_multi_order.csv'; +LOCATION '../core/tests/data/aggregate_agg_multi_order.csv' +OPTIONS ('format.has_header' 'true'); # test array_agg with order by multiple columns query ? 
@@ -3846,11 +3846,11 @@ c2 INT NOT NULL, c3 INT NOT NULL ) STORED AS CSV -WITH HEADER ROW WITH ORDER (c1 ASC) WITH ORDER (c2 DESC) WITH ORDER (c3 ASC) -LOCATION '../core/tests/data/convert_first_last.csv'; +LOCATION '../core/tests/data/convert_first_last.csv' +OPTIONS ('format.has_header' 'true'); # test first to last, the result does not show difference, we need to check the conversion by `explain` query TT diff --git a/datafusion/sqllogictest/test_files/avro.slt b/datafusion/sqllogictest/test_files/avro.slt index 7b9fbe556fee..f9f55f0dc153 100644 --- a/datafusion/sqllogictest/test_files/avro.slt +++ b/datafusion/sqllogictest/test_files/avro.slt @@ -31,8 +31,8 @@ CREATE EXTERNAL TABLE alltypes_plain ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -WITH HEADER ROW LOCATION '../../testing/data/avro/alltypes_plain.avro' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE alltypes_plain_snappy ( @@ -49,8 +49,8 @@ CREATE EXTERNAL TABLE alltypes_plain_snappy ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -WITH HEADER ROW LOCATION '../../testing/data/avro/alltypes_plain.snappy.avro' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE alltypes_plain_bzip2 ( @@ -67,8 +67,8 @@ CREATE EXTERNAL TABLE alltypes_plain_bzip2 ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -WITH HEADER ROW LOCATION '../../testing/data/avro/alltypes_plain.bzip2.avro' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE alltypes_plain_xz ( @@ -85,8 +85,8 @@ CREATE EXTERNAL TABLE alltypes_plain_xz ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -WITH HEADER ROW LOCATION '../../testing/data/avro/alltypes_plain.xz.avro' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE alltypes_plain_zstandard ( @@ -103,34 +103,34 @@ CREATE EXTERNAL TABLE alltypes_plain_zstandard ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -WITH HEADER ROW LOCATION '../../testing/data/avro/alltypes_plain.zstandard.avro' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE single_nan ( mycol FLOAT ) STORED AS AVRO -WITH HEADER ROW LOCATION '../../testing/data/avro/single_nan.avro' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE nested_records STORED AS AVRO -WITH HEADER ROW LOCATION '../../testing/data/avro/nested_records.avro' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE simple_enum STORED AS AVRO -WITH HEADER ROW LOCATION '../../testing/data/avro/simple_enum.avro' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE simple_fixed STORED AS AVRO -WITH HEADER ROW LOCATION '../../testing/data/avro/simple_fixed.avro' +OPTIONS ('format.has_header' 'true'); # test avro query query IT diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt index 9f1fc523f559..4724bc87a099 100644 --- a/datafusion/sqllogictest/test_files/create_external_table.slt +++ b/datafusion/sqllogictest/test_files/create_external_table.slt @@ -68,18 +68,6 @@ CREATE EXTERNAL TABLE t STORED AS CSV STORED AS PARQUET LOCATION 'foo.parquet' statement error DataFusion error: SQL error: ParserError\("LOCATION specified more than once"\) CREATE EXTERNAL TABLE t STORED AS CSV LOCATION 'foo.csv' LOCATION 'bar.csv' -# Duplicate `WITH HEADER ROW` clause -statement error DataFusion error: SQL error: ParserError\("WITH HEADER ROW specified more than once"\) -CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER ROW WITH 
HEADER ROW LOCATION 'foo.csv' - -# Duplicate `DELIMITER` clause -statement error DataFusion error: SQL error: ParserError\("DELIMITER specified more than once"\) -CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV DELIMITER '-' DELIMITER '+' LOCATION 'foo.csv' - -# Duplicate `COMPRESSION TYPE` clause -statement error DataFusion error: SQL error: ParserError\("COMPRESSION TYPE specified more than once"\) -CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE BZIP2 COMPRESSION TYPE XZ COMPRESSION TYPE ZSTD COMPRESSION TYPE GZIP LOCATION 'foo.csv' - # Duplicate `PARTITIONED BY` clause statement error DataFusion error: SQL error: ParserError\("PARTITIONED BY specified more than once"\) create EXTERNAL TABLE t(c1 int, c2 int) STORED AS CSV PARTITIONED BY (c1) partitioned by (c2) LOCATION 'foo.csv' diff --git a/datafusion/sqllogictest/test_files/csv_files.slt b/datafusion/sqllogictest/test_files/csv_files.slt index 1c501634fef5..5e30d6ed09ae 100644 --- a/datafusion/sqllogictest/test_files/csv_files.slt +++ b/datafusion/sqllogictest/test_files/csv_files.slt @@ -21,19 +21,19 @@ CREATE EXTERNAL TABLE csv_with_quote ( c1 VARCHAR, c2 VARCHAR ) STORED AS CSV -WITH HEADER ROW LOCATION '../core/tests/data/quote.csv' OPTIONS ('format.quote' '~', - 'format.delimiter' ','); + 'format.delimiter' ',', + 'format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE csv_with_escape ( c1 VARCHAR, c2 VARCHAR ) STORED AS CSV -WITH HEADER ROW OPTIONS ('format.escape' '\', - 'format.delimiter' ',') + 'format.delimiter' ',', + 'format.has_header' 'true') LOCATION '../core/tests/data/escape.csv'; query TT @@ -69,9 +69,9 @@ CREATE EXTERNAL TABLE csv_with_escape_2 ( c1 VARCHAR, c2 VARCHAR ) STORED AS CSV -WITH HEADER ROW OPTIONS ('format.escape' '"', - 'format.delimiter' ',') + 'format.delimiter' ',', + 'format.has_header' 'true') LOCATION '../core/tests/data/escape.csv'; # TODO: Validate this with better data. diff --git a/datafusion/sqllogictest/test_files/cte.slt b/datafusion/sqllogictest/test_files/cte.slt index e9c508cb27fb..8d086e1b2dd2 100644 --- a/datafusion/sqllogictest/test_files/cte.slt +++ b/datafusion/sqllogictest/test_files/cte.slt @@ -126,11 +126,11 @@ physical_plan # setup statement ok -CREATE EXTERNAL TABLE balance STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/balance.csv' +CREATE EXTERNAL TABLE balance STORED as CSV LOCATION '../core/tests/data/recursive_cte/balance.csv' OPTIONS ('format.has_header' 'true'); # setup statement ok -CREATE EXTERNAL TABLE growth STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/growth.csv' +CREATE EXTERNAL TABLE growth STORED as CSV LOCATION '../core/tests/data/recursive_cte/growth.csv' OPTIONS ('format.has_header' 'true'); # setup statement ok @@ -407,7 +407,7 @@ FROM # setup statement ok -CREATE EXTERNAL TABLE prices STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/prices.csv' +CREATE EXTERNAL TABLE prices STORED as CSV LOCATION '../core/tests/data/recursive_cte/prices.csv' OPTIONS ('format.has_header' 'true'); # CTE within window function inside nested CTE works. This test demonstrates using a nested window function to recursively iterate over a column. 
query RRII @@ -598,11 +598,11 @@ ORDER BY # setup statement ok -CREATE EXTERNAL TABLE sales STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/sales.csv' +CREATE EXTERNAL TABLE sales STORED as CSV LOCATION '../core/tests/data/recursive_cte/sales.csv' OPTIONS ('format.has_header' 'true'); # setup statement ok -CREATE EXTERNAL TABLE salespersons STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/salespersons.csv' +CREATE EXTERNAL TABLE salespersons STORED as CSV LOCATION '../core/tests/data/recursive_cte/salespersons.csv' OPTIONS ('format.has_header' 'true'); # group by works within recursive cte. This test case demonstrates rolling up a hierarchy of salespeople to their managers. diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index 26b0e5561806..a35e688479e7 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -256,7 +256,7 @@ DROP VIEW non_existent_view ########## statement ok -CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv'; +CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); # create_table_as statement ok @@ -455,8 +455,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../../testing/data/csv/aggregate_test_100.csv'; +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TIIIIIIIIIRRT SELECT c1, c2, c3, c4, c5, c6, c7, c8, c9, 10, c11, c12, c13 FROM aggregate_test_100 LIMIT 1; @@ -535,7 +535,7 @@ DROP VIEW y; # create_pipe_delimited_csv_table() statement ok -CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple_pipe.csv' OPTIONS ('format.delimiter' '|'); +CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV LOCATION '../core/tests/data/aggregate_simple_pipe.csv' OPTIONS ('format.delimiter' '|', 'format.has_header' 'true'); query RRB @@ -581,14 +581,14 @@ statement ok CREATE TABLE IF NOT EXISTS table_without_values(field1 BIGINT, field2 BIGINT); statement ok -CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv' +CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); # Should not recreate the same EXTERNAL table statement error Execution error: Table 'aggregate_simple' already exists -CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv' +CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); statement ok -CREATE EXTERNAL TABLE IF NOT EXISTS aggregate_simple STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv' +CREATE EXTERNAL TABLE IF NOT EXISTS aggregate_simple STORED AS CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); # create bad custom table statement error DataFusion error: Execution error: Unable to find factory for DELTATABLE @@ -690,7 +690,7 @@ drop table foo; # create csv table with empty csv file statement ok -CREATE EXTERNAL TABLE empty STORED AS CSV WITH HEADER ROW 
LOCATION '../core/tests/data/empty.csv'; +CREATE EXTERNAL TABLE empty STORED AS CSV LOCATION '../core/tests/data/empty.csv' OPTIONS ('format.has_header' 'true'); query TTI select column_name, data_type, ordinal_position from information_schema.columns where table_name='empty';; @@ -742,8 +742,8 @@ DROP SCHEMA empty_schema; statement ok CREATE UNBOUNDED external table t(c1 integer, c2 integer, c3 integer) STORED as CSV -WITH HEADER ROW -LOCATION '../core/tests/data/empty.csv'; +LOCATION '../core/tests/data/empty.csv' +OPTIONS ('format.has_header' 'true'); # should see infinite_source=true in the explain query TT @@ -760,8 +760,8 @@ drop table t; statement ok CREATE external table t(c1 integer, c2 integer, c3 integer) STORED as CSV -WITH HEADER ROW -LOCATION '../core/tests/data/empty.csv'; +LOCATION '../core/tests/data/empty.csv' +OPTIONS ('format.has_header' 'true'); # expect to see no infinite_source in the explain query TT diff --git a/datafusion/sqllogictest/test_files/decimal.slt b/datafusion/sqllogictest/test_files/decimal.slt index 3f75e42d9304..8db28c32f13b 100644 --- a/datafusion/sqllogictest/test_files/decimal.slt +++ b/datafusion/sqllogictest/test_files/decimal.slt @@ -44,8 +44,8 @@ c4 BOOLEAN NOT NULL, c5 DECIMAL(12,7) NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../core/tests/data/decimal_data.csv'; +LOCATION '../core/tests/data/decimal_data.csv' +OPTIONS ('format.has_header' 'true'); query TT @@ -639,8 +639,8 @@ c4 BOOLEAN NOT NULL, c5 DECIMAL(52,7) NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../core/tests/data/decimal_data.csv'; +LOCATION '../core/tests/data/decimal_data.csv' +OPTIONS ('format.has_header' 'true'); query TT select arrow_typeof(c1), arrow_typeof(c5) from decimal256_simple limit 1; diff --git a/datafusion/sqllogictest/test_files/describe.slt b/datafusion/sqllogictest/test_files/describe.slt index f94a2e453884..a15c3a109cab 100644 --- a/datafusion/sqllogictest/test_files/describe.slt +++ b/datafusion/sqllogictest/test_files/describe.slt @@ -24,7 +24,7 @@ statement ok set datafusion.catalog.information_schema = true statement ok -CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv'; +CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); query TTT rowsort DESCRIBE aggregate_simple; @@ -44,7 +44,7 @@ statement ok set datafusion.catalog.information_schema = false statement ok -CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv'; +CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); query TTT rowsort DESCRIBE aggregate_simple; diff --git a/datafusion/sqllogictest/test_files/distinct_on.slt b/datafusion/sqllogictest/test_files/distinct_on.slt index 0bb931c88cbc..cdef2990fa3c 100644 --- a/datafusion/sqllogictest/test_files/distinct_on.slt +++ b/datafusion/sqllogictest/test_files/distinct_on.slt @@ -32,8 +32,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); # Basic example: distinct on the first column project the second one, and # order by the third diff --git 
a/datafusion/sqllogictest/test_files/errors.slt b/datafusion/sqllogictest/test_files/errors.slt index ab281eac31f5..c9c96df2b6a9 100644 --- a/datafusion/sqllogictest/test_files/errors.slt +++ b/datafusion/sqllogictest/test_files/errors.slt @@ -34,8 +34,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); # csv_query_error statement error DataFusion error: Error during planning: No function matches the given name and argument types 'sin\(Utf8\)'. You might need to add explicit type casts.\n\tCandidate functions:\n\tsin\(Float64/Float32\) diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index 205306882b4a..3a4ac747ebd6 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -32,8 +32,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../../testing/data/csv/aggregate_test_100.csv'; +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TT explain SELECT c1 FROM aggregate_test_100 where c2 > 10 @@ -68,9 +68,9 @@ CREATE EXTERNAL TABLE aggregate_test_100_with_order ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW WITH ORDER (c1 ASC) -LOCATION '../core/tests/data/aggregate_test_100_order_by_c1_asc.csv'; +LOCATION '../core/tests/data/aggregate_test_100_order_by_c1_asc.csv' +OPTIONS ('format.has_header' 'true'); query TT explain SELECT c1 FROM aggregate_test_100_with_order order by c1 ASC limit 10 @@ -128,8 +128,8 @@ CREATE EXTERNAL TABLE simple_explain_test ( c INT ) STORED AS CSV -WITH HEADER ROW LOCATION '../core/tests/data/example.csv' +OPTIONS ('format.has_header' 'true'); query TT EXPLAIN SELECT a, b, c FROM simple_explain_test @@ -156,8 +156,8 @@ CREATE UNBOUNDED EXTERNAL TABLE sink_table ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../../testing/data/csv/aggregate_test_100.csv'; +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TT EXPLAIN INSERT INTO sink_table SELECT * FROM aggregate_test_100 ORDER by c1 diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 7e7ebd8529da..e3a7db7aef3c 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -1935,8 +1935,8 @@ CREATE EXTERNAL TABLE aggregate_test_100_by_sql ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index d03b33d0c8e5..4bb553767b45 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -549,8 +549,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); # sqrt_f32_vs_f64 diff --git a/datafusion/sqllogictest/test_files/group.slt b/datafusion/sqllogictest/test_files/group.slt index 2a28efa73a62..a6b5f9b72a53 100644 --- a/datafusion/sqllogictest/test_files/group.slt +++ b/datafusion/sqllogictest/test_files/group.slt @@ -32,11 +32,11 @@ CREATE EXTERNAL TABLE 
aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok -CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv'; +CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); # csv_query_group_by_int_min_max diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 5a605ea58b6c..43bbf6bed643 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -2042,9 +2042,9 @@ CREATE UNBOUNDED EXTERNAL TABLE annotated_data_infinite2 ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC, c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # Create a table with 2 ordered columns. # In the next step, we will expect to observe the removed sort execs. @@ -2057,10 +2057,10 @@ CREATE EXTERNAL TABLE multiple_ordered_table ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # Expected a sort exec for b DESC query TT @@ -3894,10 +3894,10 @@ CREATE EXTERNAL TABLE multiple_ordered_table_with_pk ( primary key(c) ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # We can use column b during selection # even if it is not among group by expressions @@ -3935,10 +3935,10 @@ CREATE EXTERNAL TABLE multiple_ordered_table_with_pk ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # We can use column b during selection # even if it is not among group by expressions @@ -4377,8 +4377,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TIIII SELECT c1, count(distinct c2), min(distinct c2), min(c3), max(c4) FROM aggregate_test_100 GROUP BY c1 ORDER BY c1; @@ -4454,10 +4454,10 @@ CREATE EXTERNAL TABLE unbounded_multiple_ordered_table_with_pk ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # Query below can be executed, since c is primary key. 
query III rowsort @@ -4538,8 +4538,8 @@ CREATE EXTERNAL TABLE timestamp_table ( c2 INT, ) STORED AS CSV -WITH HEADER ROW -LOCATION 'test_files/scratch/group_by/timestamp_table'; +LOCATION 'test_files/scratch/group_by/timestamp_table' +OPTIONS ('format.has_header' 'true'); # Group By using date_trunc query PI rowsort diff --git a/datafusion/sqllogictest/test_files/identifiers.slt b/datafusion/sqllogictest/test_files/identifiers.slt index f60d60b2bfe0..755d617e7a2a 100644 --- a/datafusion/sqllogictest/test_files/identifiers.slt +++ b/datafusion/sqllogictest/test_files/identifiers.slt @@ -22,8 +22,8 @@ CREATE EXTERNAL TABLE case_insensitive_test ( c INT ) STORED AS CSV -WITH HEADER ROW LOCATION '../core/tests/data/example.csv' +OPTIONS ('format.has_header' 'true'); # normalized column identifiers query II diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index f52147142cb7..de00cf9d0547 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -571,7 +571,8 @@ DROP VIEW test.xyz statement ok CREATE EXTERNAL TABLE abc STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv'; +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TTTT SHOW CREATE TABLE abc; diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt index 7c9bc4abe767..126e4120e9be 100644 --- a/datafusion/sqllogictest/test_files/insert.slt +++ b/datafusion/sqllogictest/test_files/insert.slt @@ -37,8 +37,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); # test_insert_into diff --git a/datafusion/sqllogictest/test_files/insert_to_external.slt b/datafusion/sqllogictest/test_files/insert_to_external.slt index 70eb2b75a75a..dfb333d16e23 100644 --- a/datafusion/sqllogictest/test_files/insert_to_external.slt +++ b/datafusion/sqllogictest/test_files/insert_to_external.slt @@ -37,8 +37,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok diff --git a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt index 60a14f78bdf5..8de8c478fbc4 100644 --- a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt +++ b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt @@ -34,9 +34,9 @@ CREATE EXTERNAL TABLE annotated_data ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC, c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); query TT EXPLAIN SELECT t2.a diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index f4a86fd02709..95834c4c6ebb 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -3162,9 +3162,9 @@ CREATE EXTERNAL TABLE annotated_data ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC NULLS FIRST, b ASC, c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS 
('format.has_header' 'true'); # sort merge join should propagate ordering equivalence of the left side # for inner join. Hence final requirement rn1 ASC is already satisfied at @@ -3379,10 +3379,10 @@ CREATE EXTERNAL TABLE multiple_ordered_table ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); query TT EXPLAIN SELECT LAST_VALUE(l.d ORDER BY l.a) AS amount_usd diff --git a/datafusion/sqllogictest/test_files/limit.slt b/datafusion/sqllogictest/test_files/limit.slt index 2238a4aaeaf4..2c65b1da4474 100644 --- a/datafusion/sqllogictest/test_files/limit.slt +++ b/datafusion/sqllogictest/test_files/limit.slt @@ -36,8 +36,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); # async fn csv_query_limit query T diff --git a/datafusion/sqllogictest/test_files/math.slt b/datafusion/sqllogictest/test_files/math.slt index 802323ca45ee..7ddf9d9d0522 100644 --- a/datafusion/sqllogictest/test_files/math.slt +++ b/datafusion/sqllogictest/test_files/math.slt @@ -20,7 +20,7 @@ ########## statement ok -CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv'; +CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); # Round query R diff --git a/datafusion/sqllogictest/test_files/monotonic_projection_test.slt b/datafusion/sqllogictest/test_files/monotonic_projection_test.slt index 0c89fb3bcdaa..d41b78dcd3f2 100644 --- a/datafusion/sqllogictest/test_files/monotonic_projection_test.slt +++ b/datafusion/sqllogictest/test_files/monotonic_projection_test.slt @@ -25,10 +25,10 @@ CREATE EXTERNAL TABLE multiple_ordered_table ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # test for substitute CAST scenario query TT diff --git a/datafusion/sqllogictest/test_files/order.slt b/datafusion/sqllogictest/test_files/order.slt index 3f00bd2bb4ea..0f869fc0b419 100644 --- a/datafusion/sqllogictest/test_files/order.slt +++ b/datafusion/sqllogictest/test_files/order.slt @@ -36,8 +36,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); # test_sort_unprojected_col query I @@ -422,11 +422,11 @@ CREATE EXTERNAL TABLE multiple_ordered_table ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC) WITH ORDER (b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); query TT EXPLAIN SELECT (b+a+c) AS result @@ -511,10 +511,10 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW WITH ORDER(c11) WITH ORDER(c12 DESC) LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TT EXPLAIN SELECT ATAN(c11) as atan_c11 @@ -627,8 +627,8 @@ CREATE EXTERNAL TABLE IF NOT EXISTS orders ( o_clerk VARCHAR, o_shippriority INTEGER, 
o_comment VARCHAR, -) STORED AS CSV WITH ORDER (o_orderkey ASC) WITH HEADER ROW LOCATION '../core/tests/tpch-csv/orders.csv' -OPTIONS ('format.delimiter' ','); +) STORED AS CSV WITH ORDER (o_orderkey ASC) LOCATION '../core/tests/tpch-csv/orders.csv' +OPTIONS ('format.delimiter' ',', 'format.has_header' 'true'); query TT EXPLAIN SELECT o_orderkey, o_orderstatus FROM orders ORDER BY o_orderkey ASC diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index aa1681de6201..9c90a1aaaeae 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -66,8 +66,8 @@ CREATE EXTERNAL TABLE test_table ( date_col DATE ) STORED AS PARQUET -WITH HEADER ROW -LOCATION 'test_files/scratch/parquet/test_table'; +LOCATION 'test_files/scratch/parquet/test_table' +OPTIONS ('format.has_header' 'true'); # Basic query: query ITID @@ -107,9 +107,9 @@ CREATE EXTERNAL TABLE test_table ( date_col DATE ) STORED AS PARQUET -WITH HEADER ROW WITH ORDER (string_col ASC NULLS LAST, int_col ASC NULLS LAST) -LOCATION 'test_files/scratch/parquet/test_table'; +LOCATION 'test_files/scratch/parquet/test_table' +OPTIONS ('format.has_header' 'true'); # Check output plan, expect an "output_ordering" clause in the physical_plan -> ParquetExec: query TT @@ -189,8 +189,8 @@ CREATE EXTERNAL TABLE alltypes_plain ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS PARQUET -WITH HEADER ROW LOCATION '../../parquet-testing/data/alltypes_plain.parquet' +OPTIONS ('format.has_header' 'true') # Test a basic query with a CAST: query IT @@ -214,8 +214,8 @@ DROP TABLE alltypes_plain; statement ok CREATE EXTERNAL TABLE test_binary STORED AS PARQUET -WITH HEADER ROW -LOCATION '../core/tests/data/test_binary.parquet'; +LOCATION '../core/tests/data/test_binary.parquet' +OPTIONS ('format.has_header' 'true'); # Check size of table: query I @@ -247,8 +247,8 @@ DROP TABLE test_binary; statement ok CREATE EXTERNAL TABLE timestamp_with_tz STORED AS PARQUET -WITH HEADER ROW -LOCATION '../core/tests/data/timestamp_with_tz.parquet'; +LOCATION '../core/tests/data/timestamp_with_tz.parquet' +OPTIONS ('format.has_header' 'true'); # Check size of table: query I @@ -288,8 +288,8 @@ STORED AS PARQUET; statement ok CREATE EXTERNAL TABLE listing_table STORED AS PARQUET -WITH HEADER ROW -LOCATION 'test_files/scratch/parquet/test_table/*.parquet'; +LOCATION 'test_files/scratch/parquet/test_table/*.parquet' +OPTIONS ('format.has_header' 'true'); statement ok set datafusion.execution.listing_table_ignore_subdirectory = true; @@ -317,8 +317,8 @@ DROP TABLE timestamp_with_tz; statement ok CREATE EXTERNAL TABLE single_nan STORED AS PARQUET -WITH HEADER ROW -LOCATION '../../parquet-testing/data/single_nan.parquet'; +LOCATION '../../parquet-testing/data/single_nan.parquet' +OPTIONS ('format.has_header' 'true'); # Check table size: query I @@ -339,8 +339,8 @@ DROP TABLE single_nan; statement ok CREATE EXTERNAL TABLE list_columns STORED AS PARQUET -WITH HEADER ROW -LOCATION '../../parquet-testing/data/list_columns.parquet'; +LOCATION '../../parquet-testing/data/list_columns.parquet' +OPTIONS ('format.has_header' 'true'); query ?? 
SELECT int64_list, utf8_list FROM list_columns diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt index f933c90acc73..d14b6ca81f67 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt @@ -66,8 +66,8 @@ CREATE EXTERNAL TABLE aggregate_test_100_by_sql ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt index b01ea73c8056..25b4924715ca 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt @@ -67,8 +67,8 @@ CREATE EXTERNAL TABLE aggregate_test_100_by_sql ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt index 05343de32268..e02c19016790 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt @@ -64,8 +64,8 @@ CREATE EXTERNAL TABLE aggregate_test_100_by_sql ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query I rowsort SELECT * FROM ( diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt index cec51d472075..edad3747a203 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt @@ -64,8 +64,8 @@ CREATE EXTERNAL TABLE aggregate_test_100_by_sql ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query IIIIIIIIII SELECT diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index 12f3ebccea3c..5483750c6a03 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -39,8 +39,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../../testing/data/csv/aggregate_test_100.csv' +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE alltypes_plain STORED AS PARQUET LOCATION '../../parquet-testing/data/alltypes_plain.parquet'; @@ -621,7 +621,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS lineitem ( l_shipinstruct VARCHAR, l_shipmode VARCHAR, l_comment VARCHAR, -) STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/tpch-csv/lineitem.csv' OPTIONS ('format.delimiter' ','); +) STORED AS CSV LOCATION '../core/tests/tpch-csv/lineitem.csv' OPTIONS ('format.delimiter' ',', 'format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS part ( @@ -634,7 +634,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS part ( p_container VARCHAR, p_retailprice DECIMAL(15, 2), p_comment VARCHAR, -) STORED AS CSV WITH HEADER 
ROW LOCATION '../core/tests/tpch-csv/part.csv' OPTIONS ('format.delimiter' ','); +) STORED AS CSV LOCATION '../core/tests/tpch-csv/part.csv' OPTIONS ('format.delimiter' ',', 'format.has_header' 'true'); query TT EXPLAIN SELECT l_partkey FROM diff --git a/datafusion/sqllogictest/test_files/projection.slt b/datafusion/sqllogictest/test_files/projection.slt index b752f5644b7f..843ab71091f5 100644 --- a/datafusion/sqllogictest/test_files/projection.slt +++ b/datafusion/sqllogictest/test_files/projection.slt @@ -37,8 +37,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE aggregate_simple ( @@ -47,8 +47,8 @@ CREATE EXTERNAL TABLE aggregate_simple ( c3 BOOLEAN NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv' +OPTIONS ('format.has_header' 'true'); statement ok CREATE TABLE memory_table(a INT NOT NULL, b INT NOT NULL, c INT NOT NULL) AS VALUES diff --git a/datafusion/sqllogictest/test_files/references.slt b/datafusion/sqllogictest/test_files/references.slt index fd77d57c06f1..4c3ac68aebd1 100644 --- a/datafusion/sqllogictest/test_files/references.slt +++ b/datafusion/sqllogictest/test_files/references.slt @@ -39,8 +39,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query I SELECT COUNT(*) FROM aggregate_test_100; diff --git a/datafusion/sqllogictest/test_files/repartition.slt b/datafusion/sqllogictest/test_files/repartition.slt index 3f9e6e61f1d0..a301982740c7 100644 --- a/datafusion/sqllogictest/test_files/repartition.slt +++ b/datafusion/sqllogictest/test_files/repartition.slt @@ -95,8 +95,8 @@ CREATE UNBOUNDED EXTERNAL TABLE sink_table ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../../testing/data/csv/aggregate_test_100.csv'; +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TII SELECT c1, c2, c3 FROM sink_table WHERE c3 > 0 LIMIT 5; diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 6ce7d7211202..2d4ca1baf23d 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -277,8 +277,8 @@ DROP TABLE arrow_table; statement ok CREATE EXTERNAL TABLE avro_table STORED AS AVRO -WITH HEADER ROW LOCATION '../../testing/data/avro/simple_enum.avro' +OPTIONS ('format.has_header' 'true'); # It would be great to see the file read as "4" groups with even sizes (offsets) eventually diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index 7fb2d55ff84a..2b1a1ef289aa 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ b/datafusion/sqllogictest/test_files/scalar.slt @@ -1352,8 +1352,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); # c8 = i32; c6 = i64 query TTT diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 24163e37dec3..d73157570d8a 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt 
@@ -33,8 +33,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE aggregate_simple ( @@ -43,8 +43,8 @@ CREATE EXTERNAL TABLE aggregate_simple ( c3 BOOLEAN NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv' +OPTIONS ('format.has_header' 'true'); ########## ## SELECT Tests ########## @@ -1067,9 +1067,9 @@ CREATE EXTERNAL TABLE annotated_data_finite2 ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC, c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # test_source_projection diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index 069404d6129d..ba4d644345d4 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -66,7 +66,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS customer ( c_acctbal DECIMAL(15, 2), c_mktsegment VARCHAR, c_comment VARCHAR, -) STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/tpch-csv/customer.csv' OPTIONS ('format.delimiter' ','); +) STORED AS CSV LOCATION '../core/tests/tpch-csv/customer.csv' OPTIONS ('format.delimiter' ',', 'format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS orders ( @@ -79,7 +79,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS orders ( o_clerk VARCHAR, o_shippriority INTEGER, o_comment VARCHAR, -) STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/tpch-csv/orders.csv' OPTIONS ('format.delimiter' ','); +) STORED AS CSV LOCATION '../core/tests/tpch-csv/orders.csv' OPTIONS ('format.delimiter' ',', 'format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS lineitem ( @@ -99,7 +99,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS lineitem ( l_shipinstruct VARCHAR, l_shipmode VARCHAR, l_comment VARCHAR, -) STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/tpch-csv/lineitem.csv' OPTIONS ('format.delimiter' ','); +) STORED AS CSV LOCATION '../core/tests/tpch-csv/lineitem.csv' OPTIONS ('format.delimiter' ',', 'format.has_header' 'true'); # in_subquery_to_join_with_correlated_outer_filter query ITI rowsort diff --git a/datafusion/sqllogictest/test_files/topk.slt b/datafusion/sqllogictest/test_files/topk.slt index 19412defcd40..616794f84918 100644 --- a/datafusion/sqllogictest/test_files/topk.slt +++ b/datafusion/sqllogictest/test_files/topk.slt @@ -69,8 +69,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TT explain select * from aggregate_test_100 ORDER BY c13 desc limit 5; diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt index c168890b7b00..36f024961875 100644 --- a/datafusion/sqllogictest/test_files/union.slt +++ b/datafusion/sqllogictest/test_files/union.slt @@ -105,8 +105,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query I select COUNT(*) from ( @@ -475,9 +475,9 @@ CREATE EXTERNAL TABLE t1 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW WITH ORDER (c1 ASC) -LOCATION '../../testing/data/csv/aggregate_test_100.csv'; +LOCATION
'../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE t2 ( @@ -496,9 +496,9 @@ CREATE EXTERNAL TABLE t2 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW WITH ORDER (c1a ASC) -LOCATION '../../testing/data/csv/aggregate_test_100.csv'; +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TT explain diff --git a/datafusion/sqllogictest/test_files/wildcard.slt b/datafusion/sqllogictest/test_files/wildcard.slt index f83e84804a37..9285bdbf2306 100644 --- a/datafusion/sqllogictest/test_files/wildcard.slt +++ b/datafusion/sqllogictest/test_files/wildcard.slt @@ -40,8 +40,8 @@ CREATE EXTERNAL TABLE aggregate_simple ( c3 BOOLEAN NOT NULL, ) STORED AS CSV -WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv' +OPTIONS ('format.has_header' 'true'); ########## diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index af09e644c9bf..be1517aa75c1 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -32,8 +32,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE null_cases( @@ -42,8 +42,8 @@ CREATE EXTERNAL TABLE null_cases( c3 BIGINT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../core/tests/data/null_cases.csv'; +LOCATION '../core/tests/data/null_cases.csv' +OPTIONS ('format.has_header' 'true'); ### This is the same table as ### execute_with_partition with 4 partitions @@ -2488,10 +2488,9 @@ CREATE EXTERNAL TABLE annotated_data_finite ( desc_col INTEGER, ) STORED AS CSV -WITH HEADER ROW WITH ORDER (ts ASC) LOCATION '../core/tests/data/window_1.csv' -; +OPTIONS ('format.has_header' 'true'); # 100 rows. Columns in the table are ts, inc_col, desc_col. # Source is CsvExec which is ordered by ts column. @@ -2503,9 +2502,9 @@ CREATE UNBOUNDED EXTERNAL TABLE annotated_data_infinite ( desc_col INTEGER, ) STORED AS CSV -WITH HEADER ROW WITH ORDER (ts ASC) -LOCATION '../core/tests/data/window_1.csv'; +LOCATION '../core/tests/data/window_1.csv' +OPTIONS ('format.has_header' 'true'); # test_source_sorted_aggregate @@ -2910,9 +2909,9 @@ CREATE EXTERNAL TABLE annotated_data_finite2 ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC, c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # Columns in the table are a,b,c,d. Source is CsvExec which is ordered by # a,b,c column. Column a has cardinality 2, column b has cardinality 4. 
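Every hunk in this and the neighboring test files applies the same mechanical rewrite, so its general shape is worth seeing once in isolation. The sketch below is illustrative only (the table name and path are invented, not taken from any one hunk) and follows the clause order used throughout this series, with `OPTIONS` last:

```sql
-- Before this series: the header flag was a dedicated clause.
-- CREATE EXTERNAL TABLE example (a INT, b INT) STORED AS CSV
-- WITH HEADER ROW
-- WITH ORDER (a ASC)
-- LOCATION '/path/to/example.csv';

-- After: the header flag becomes a key/value pair in OPTIONS.
CREATE EXTERNAL TABLE example (a INT, b INT)
STORED AS CSV
WITH ORDER (a ASC)
LOCATION '/path/to/example.csv'
OPTIONS ('format.has_header' 'true');
```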
@@ -2926,9 +2925,9 @@ CREATE UNBOUNDED EXTERNAL TABLE annotated_data_infinite2 ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC, c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # test_infinite_source_partition_by @@ -3402,10 +3401,10 @@ CREATE EXTERNAL TABLE multiple_ordered_table ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # Since column b is constant after filter b=0, # There should be no SortExec(b ASC) in the plan @@ -3455,10 +3454,10 @@ CREATE UNBOUNDED EXTERNAL TABLE multiple_ordered_table_inf ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # All of the window execs in the physical plan should work in the # sorted mode. diff --git a/docs/source/user-guide/cli/datasources.md b/docs/source/user-guide/cli/datasources.md index c2c00b633479..8b254a896f90 100644 --- a/docs/source/user-guide/cli/datasources.md +++ b/docs/source/user-guide/cli/datasources.md @@ -166,8 +166,8 @@ Register a single file csv datasource with a header row. ```sql CREATE EXTERNAL TABLE test STORED AS CSV -WITH HEADER ROW -LOCATION '/path/to/aggregate_test_100.csv'; +LOCATION '/path/to/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); ``` Register a single file csv datasource with explicitly defined schema. diff --git a/docs/source/user-guide/sql/ddl.md b/docs/source/user-guide/sql/ddl.md index 3d8b632f6ec2..41980851df1e 100644 --- a/docs/source/user-guide/sql/ddl.md +++ b/docs/source/user-guide/sql/ddl.md @@ -60,9 +60,6 @@ CREATE [UNBOUNDED] EXTERNAL TABLE [ IF NOT EXISTS ] [ () ] STORED AS -[ WITH HEADER ROW ] -[ DELIMITER ] -[ COMPRESSION TYPE ] [ PARTITIONED BY () ] [ WITH ORDER () ] [ OPTIONS () ] @@ -100,8 +97,8 @@ scanning a subset of the file. ```sql CREATE EXTERNAL TABLE test STORED AS CSV -WITH HEADER ROW -LOCATION '/path/to/aggregate_simple.csv'; +LOCATION '/path/to/aggregate_simple.csv' +OPTIONS ('format.has_header' 'true'); ``` It is also possible to use compressed files, such as `.csv.gz`: @@ -110,8 +107,8 @@ It is also possible to use compressed files, such as `.csv.gz`: CREATE EXTERNAL TABLE test STORED AS CSV COMPRESSION TYPE GZIP -WITH HEADER ROW -LOCATION '/path/to/aggregate_simple.csv.gz'; +LOCATION '/path/to/aggregate_simple.csv.gz' +OPTIONS ('format.has_header' 'true'); ``` It is also possible to specify the schema manually. @@ -133,8 +130,8 @@ CREATE EXTERNAL TABLE test ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '/path/to/aggregate_test_100.csv'; +LOCATION '/path/to/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); ``` It is also possible to specify a directory that contains a partitioned table (multiple files with the same schema) ```sql CREATE EXTERNAL TABLE test STORED AS CSV -WITH HEADER ROW -LOCATION '/path/to/directory/of/files'; +LOCATION '/path/to/directory/of/files' +OPTIONS ('format.has_header' 'true'); ``` With the `CREATE UNBOUNDED EXTERNAL TABLE` SQL statement,
we can create unbounded data sources such as the following: @@ -181,9 +178,9 @@ CREATE EXTERNAL TABLE test ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW WITH ORDER (c2 ASC, c5 + c8 DESC NULL FIRST) -LOCATION '/path/to/aggregate_test_100.csv'; +LOCATION '/path/to/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); ``` Where the `WITH ORDER` clause specifies the sort order: diff --git a/docs/source/user-guide/sql/write_options.md b/docs/source/user-guide/sql/write_options.md index 7631525f13e5..c496d8d48a02 100644 --- a/docs/source/user-guide/sql/write_options.md +++ b/docs/source/user-guide/sql/write_options.md @@ -38,11 +38,11 @@ CREATE EXTERNAL TABLE my_table(a bigint, b bigint) STORED AS csv COMPRESSION TYPE gzip - WITH HEADER ROW DELIMITER ';' LOCATION '/test/location/my_csv_table/' OPTIONS( - NULL_VALUE 'NAN' + NULL_VALUE 'NAN', + 'format.has_header' 'true' ) ``` From 4873d61d54800290b1523f89b421e400a91c11d5 Mon Sep 17 00:00:00 2001 From: Mehmet Ozan Kabak Date: Thu, 9 May 2024 12:58:15 +0300 Subject: [PATCH 06/15] Review Part 1 --- datafusion-cli/src/helper.rs | 17 +++++--------- datafusion/common/src/config.rs | 23 ++++++++++--------- .../core/src/datasource/file_format/csv.rs | 12 +++++----- 3 files changed, 24 insertions(+), 28 deletions(-) diff --git a/datafusion-cli/src/helper.rs b/datafusion-cli/src/helper.rs index 28cba06723aa..f93aaec4218d 100644 --- a/datafusion-cli/src/helper.rs +++ b/datafusion-cli/src/helper.rs @@ -20,25 +20,20 @@ use std::borrow::Cow; +use crate::highlighter::{NoSyntaxHighlighter, SyntaxHighlighter}; + use datafusion::common::sql_datafusion_err; use datafusion::error::DataFusionError; use datafusion::sql::parser::{DFParser, Statement}; use datafusion::sql::sqlparser::dialect::dialect_from_str; use datafusion::sql::sqlparser::parser::ParserError; + -use rustyline::completion::Completer; -use rustyline::completion::FilenameCompleter; -use rustyline::completion::Pair; + +use rustyline::completion::{Completer, FilenameCompleter, Pair}; use rustyline::error::ReadlineError; use rustyline::highlight::Highlighter; use rustyline::hint::Hinter; -use rustyline::validate::ValidationContext; -use rustyline::validate::ValidationResult; -use rustyline::validate::Validator; -use rustyline::Context; -use rustyline::Helper; -use rustyline::Result; - -use crate::highlighter::{NoSyntaxHighlighter, SyntaxHighlighter}; +use rustyline::validate::{ValidationContext, ValidationResult, Validator}; +use rustyline::{Context, Helper, Result}; pub struct CliHelper { completer: FilenameCompleter, diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 49d31257c8ba..a1445dfb2f40 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -1564,20 +1564,21 @@ config_namespace_with_hashmap! { config_namespace! { /// Options controlling CSV format pub struct CsvOptions { - /// If the first line of the CSV is column names. - /// If not specified, uses default from `CREATE TABLE` command, if any. - pub has_header: Option, default =None + /// Specifies whether there is a CSV header (i.e. the first line + /// consists of column names). If not specified, uses default from + /// the `CREATE TABLE` command, if any.
+ pub has_header: Option, default = None pub delimiter: u8, default = b',' pub quote: u8, default = b'"' pub escape: Option, default = None pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED pub schema_infer_max_rec: usize, default = 100 - pub date_format: Option, default = None - pub datetime_format: Option, default = None - pub timestamp_format: Option, default = None - pub timestamp_tz_format: Option, default = None - pub time_format: Option, default = None - pub null_value: Option, default = None + pub date_format: Option, default = None + pub datetime_format: Option, default = None + pub timestamp_format: Option, default = None + pub timestamp_tz_format: Option, default = None + pub time_format: Option, default = None + pub null_value: Option, default = None } } @@ -1606,8 +1607,8 @@ impl CsvOptions { self } - /// True if the first line is a header. If the condition of having header - /// is not set by the format options, session state decides it. + /// Returns true if the first line is a header. If format options does not + /// specify whether there is a header, consults the configuration. pub fn has_header(&self, config_opt: &ConfigOptions) -> bool { self.has_header.unwrap_or(config_opt.catalog.has_header) } diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index ab2582b4e820..3e5c7af1f82e 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -32,8 +32,9 @@ use crate::datasource::physical_plan::{ use crate::error::Result; use crate::execution::context::SessionState; use crate::physical_plan::insert::{DataSink, DataSinkExec}; -use crate::physical_plan::{DisplayAs, DisplayFormatType, Statistics}; -use crate::physical_plan::{ExecutionPlan, SendableRecordBatchStream}; +use crate::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, SendableRecordBatchStream, Statistics, +}; use arrow::array::RecordBatch; use arrow::csv::WriterBuilder; @@ -242,8 +243,8 @@ impl FileFormat for CsvFormat { ) -> Result> { let exec = CsvExec::new( conf, - // If the condition of having header is not set by - // the format options, session state decides it. + // If format options does not specify whether there is a header, + // we consult configuration options. self.options.has_header(state.config_options()), self.options.delimiter, self.options.quote, @@ -819,8 +820,7 @@ mod tests { has_header: bool, ) -> Result> { let root = format!("{}/csv", crate::test_util::arrow_test_data()); - let mut format = CsvFormat::default(); - format = format.with_has_header(has_header); + let format = CsvFormat::default().with_has_header(has_header); scan_format(state, &format, &root, file_name, projection, limit).await } From f0f6f09dcb966f97c50f54d62757932cd8b75177 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Thu, 9 May 2024 13:12:52 +0300 Subject: [PATCH 07/15] . 
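`format.has_header` is a CSV-specific format option: it belongs in the shared key/value options map, and it has no meaning for a self-describing format such as Parquet, which is why the Parquet table below loses it. An illustrative sketch of the resulting usage (paths are invented):

```sql
-- CSV: the header flag travels with the other format options.
CREATE EXTERNAL TABLE csv_table STORED AS CSV
LOCATION '/data/example.csv'
OPTIONS ('format.has_header' 'true');

-- Parquet: schema and layout come from the file itself,
-- so no header option applies.
CREATE EXTERNAL TABLE parquet_table STORED AS PARQUET
LOCATION '/data/example.parquet';
```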
--- datafusion/core/src/datasource/listing_table_factory.rs | 3 ++- datafusion/sqllogictest/test_files/parquet.slt | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs index 1d33944330da..987b9e12a424 100644 --- a/datafusion/core/src/datasource/listing_table_factory.rs +++ b/datafusion/core/src/datasource/listing_table_factory.rs @@ -218,6 +218,7 @@ mod tests { let mut options = HashMap::new(); options.insert("format.schema_infer_max_rec".to_owned(), "1000".to_owned()); + options.insert("format.has_header".into(), "true".into()); let cmd = CreateExternalTable { name, location: csv_file.path().to_str().unwrap().to_string(), @@ -228,7 +229,7 @@ mod tests { definition: None, order_exprs: vec![], unbounded: false, - options: HashMap::from([("format.has_header".into(), "true".into())]), + options, constraints: Constraints::empty(), column_defaults: HashMap::new(), }; diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index 9c90a1aaaeae..25463eb36273 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -66,8 +66,7 @@ CREATE EXTERNAL TABLE test_table ( date_col DATE ) STORED AS PARQUET -LOCATION 'test_files/scratch/parquet/test_table' -OPTIONS ('format.has_header' 'true'); +LOCATION 'test_files/scratch/parquet/test_table'; # Basic query: query ITID From 9d816017f2c11d0197c35e4d8a98c249840a6f96 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Thu, 9 May 2024 14:19:58 +0300 Subject: [PATCH 08/15] Fix failing tests --- datafusion/common/src/config.rs | 6 +- .../core/src/datasource/file_format/csv.rs | 15 ++- datafusion/core/src/datasource/stream.rs | 13 ++- datafusion/core/tests/sql/mod.rs | 2 +- .../tests/cases/roundtrip_logical_plan.rs | 4 +- datafusion/sql/src/parser.rs | 105 ++++-------------- datafusion/sql/src/statement.rs | 31 +----- .../test_files/create_external_table.slt | 8 -- .../test_files/repartition_scan.slt | 1 - 9 files changed, 49 insertions(+), 136 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index a1445dfb2f40..584e69cd8591 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -1566,7 +1566,7 @@ config_namespace! { pub struct CsvOptions { /// Specifies whether there is a CSV header (i.e. the first line /// consists of column names). If not specified, uses default from - /// the `CREATE TABLE` command, if any. + /// the session state, if any. pub has_header: Option, default = None pub delimiter: u8, default = b',' pub quote: u8, default = b'"' pub escape: Option, default = None pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED pub schema_infer_max_rec: usize, default = 100 @@ -1609,8 +1609,8 @@ impl CsvOptions { /// Returns true if the first line is a header. If format options does not /// specify whether there is a header, consults the configuration. - pub fn has_header(&self, config_opt: &ConfigOptions) -> bool { - self.has_header.unwrap_or(config_opt.catalog.has_header) + pub fn has_header(&self) -> Option { + self.has_header } /// The character separating values within a row.
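With `has_header` held as an optional value, a table definition that omits the option defers to the session-level `datafusion.catalog.has_header` setting rather than a hard-coded default; the CSV call sites below perform that fallback via `unwrap_or`. An illustrative SQL view of the intended behavior (paths are invented):

```sql
-- Session-wide default, consulted only when a table omits the option.
SET datafusion.catalog.has_header = true;

-- No per-table option: the header flag falls back to the session default.
CREATE EXTERNAL TABLE t1 STORED AS CSV
LOCATION '/data/t1.csv';

-- Explicit per-table option: overrides the session default.
CREATE EXTERNAL TABLE t2 STORED AS CSV
LOCATION '/data/t2.csv'
OPTIONS ('format.has_header' 'false');
```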
diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index 3e5c7af1f82e..bfd0975a52a9 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -40,7 +40,7 @@ use arrow::array::RecordBatch; use arrow::csv::WriterBuilder; use arrow::datatypes::SchemaRef; use arrow::datatypes::{DataType, Field, Fields, Schema}; -use datafusion_common::config::{ConfigOptions, CsvOptions}; +use datafusion_common::config::CsvOptions; use datafusion_common::file_options::csv_writer::CsvWriterOptions; use datafusion_common::{exec_err, not_impl_err, DataFusionError, FileType}; use datafusion_execution::TaskContext; @@ -142,8 +142,8 @@ impl CsvFormat { } /// True if the first line is a header. - pub fn has_header(&self, config_opt: &ConfigOptions) -> bool { - self.options.has_header(config_opt) + pub fn has_header(&self) -> Option { + self.options.has_header } /// The character separating values within a row. @@ -245,7 +245,9 @@ impl FileFormat for CsvFormat { conf, // If format options does not specify whether there is a header, // we consult configuration options. - self.options.has_header(state.config_options()), + self.options + .has_header + .unwrap_or(state.config_options().catalog.has_header), self.options.delimiter, self.options.quote, self.options.escape, @@ -303,7 +305,10 @@ impl CsvFormat { while let Some(chunk) = stream.next().await.transpose()? { let format = arrow::csv::reader::Format::default() .with_header( - self.options.has_header(state.config_options()) && first_chunk, + self.options + .has_header + .unwrap_or(state.config_options().catalog.has_header) + && first_chunk, ) .with_delimiter(self.options.delimiter); diff --git a/datafusion/core/src/datasource/stream.rs b/datafusion/core/src/datasource/stream.rs index 5e485f42b516..41296c1b8d23 100644 --- a/datafusion/core/src/datasource/stream.rs +++ b/datafusion/core/src/datasource/stream.rs @@ -30,7 +30,7 @@ use arrow_schema::SchemaRef; use async_trait::async_trait; use futures::StreamExt; -use datafusion_common::{plan_err, Constraints, DataFusionError, Result}; +use datafusion_common::{config_err, plan_err, Constraints, DataFusionError, Result}; use datafusion_common_runtime::SpawnedTask; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::{CreateExternalTable, Expr, TableType}; @@ -58,11 +58,22 @@ impl TableProviderFactory for StreamTableFactory { let schema: SchemaRef = Arc::new(cmd.schema.as_ref().into()); let location = cmd.location.clone(); let encoding = cmd.file_type.parse()?; + let header = if let Ok(opt) = cmd + .options + .get("format.has_header") + .map(|has_header| bool::from_str(has_header)) + .transpose() + { + opt.unwrap_or(false) + } else { + return config_err!("format.has_header can either be true or false"); + }; let config = StreamConfig::new_file(schema, location.into()) .with_encoding(encoding) .with_order(cmd.order_exprs.clone()) .with_batch_size(state.config().batch_size()) + .with_header(header) .with_constraints(cmd.constraints.clone()); Ok(Arc::new(StreamTable(Arc::new(config)))) diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 00f091d77cff..995ce35c5bc2 100644 --- a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -85,7 +85,7 @@ async fn register_aggregate_csv_by_sql(ctx: &SessionContext) { c13 VARCHAR NOT NULL ) STORED AS CSV - LOCATION '{testdata}/csv/aggregate_test_100.csv + LOCATION 
'{testdata}/csv/aggregate_test_100.csv' OPTIONS ('format.has_header' 'true') " )) diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 74df7178b0f2..63e9177ffaf8 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -238,7 +238,7 @@ async fn roundtrip_custom_listing_tables() -> Result<()> { STORED AS CSV WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) - LOCATION '../core/tests/data/window_2.csv'; + LOCATION '../core/tests/data/window_2.csv' OPTIONS ('format.has_header' 'true')"; let plan = ctx.state().create_logical_plan(query).await?; @@ -268,7 +268,7 @@ async fn roundtrip_logical_plan_aggregation_with_pk() -> Result<()> { STORED AS CSV WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) - LOCATION '../core/tests/data/window_2.csv'; + LOCATION '../core/tests/data/window_2.csv' OPTIONS ('format.has_header' 'true')", ) .await?; diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 7468527662f6..d3ae7a13ff3d 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -101,12 +101,10 @@ pub struct CopyToStatement { pub target: String, /// Partition keys pub partitioned_by: Vec, - /// Indicates whether there is a header row (e.g. CSV) - pub has_header: bool, /// File type (Parquet, NDJSON, CSV etc.) pub stored_as: Option, /// Target specific options - pub options: Vec<(String, Value)>, + pub options: HashMap, } impl fmt::Display for CopyToStatement { @@ -129,7 +127,10 @@ impl fmt::Display for CopyToStatement { } if !options.is_empty() { - let opts: Vec<_> = options.iter().map(|(k, v)| format!("{k} {v}")).collect(); + let opts: Vec<_> = options + .iter() + .map(|(k, v)| format!("'{k}' '{v}'")) + .collect(); write!(f, " OPTIONS ({})", opts.join(", "))?; } @@ -386,8 +387,7 @@ impl<'a> DFParser<'a> { stored_as: Option, target: Option, partitioned_by: Option>, - has_header: Option, - options: Option>, + options: Option>, } let mut builder = Builder::default(); @@ -423,7 +423,7 @@ impl<'a> DFParser<'a> { } Keyword::OPTIONS => { ensure_not_set(&builder.options, "OPTIONS")?; - builder.options = Some(self.parse_value_options()?); + builder.options = Some(self.parse_string_options()?); } _ => { unreachable!() @@ -451,9 +451,8 @@ impl<'a> DFParser<'a> { source, target, partitioned_by: builder.partitioned_by.unwrap_or(vec![]), - has_header: builder.has_header.unwrap_or(false), stored_as: builder.stored_as, - options: builder.options.unwrap_or(vec![]), + options: builder.options.unwrap_or(HashMap::new()), })) } @@ -835,33 +834,6 @@ impl<'a> DFParser<'a> { } Ok(options) } - - /// Parses (key value) style options into a map of String --> [`Value`]. - /// - /// Unlike [`Self::parse_string_options`], this method supports - /// keywords as key names as well as multiple value types such as - /// Numbers as well as Strings. 
- fn parse_value_options(&mut self) -> Result, ParserError> { - let mut options = vec![]; - self.parser.expect_token(&Token::LParen)?; - - loop { - let key = self.parse_option_key()?; - let value = self.parse_option_value()?; - options.push((key, value)); - let comma = self.parser.consume_token(&Token::Comma); - if self.parser.consume_token(&Token::RParen) { - // allow a trailing comma, even though it's not in standard - break; - } else if !comma { - return self.expected( - "',' or ')' after option definition", - self.parser.peek_token(), - ); - } - } - Ok(options) - } } #[cfg(test)] @@ -997,27 +969,6 @@ mod tests { }); expect_parse_ok(sql, expected)?; - // positive case: it is ok for case insensitive sql stmt with has_header option tokens - let sqls = vec![ - "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS ('FORMAT.HAS_HEADER' 'TRUE')", - "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS ('format.has_header' 'true')", - ]; - for sql in sqls { - let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), - columns: vec![make_column_def("c1", DataType::Int(display))], - file_type: "CSV".to_string(), - location: "foo.csv".into(), - table_partition_cols: vec![], - order_exprs: vec![], - if_not_exists: false, - unbounded: false, - options: HashMap::from([("format.has_header".into(), "true".into())]), - constraints: vec![], - }); - expect_parse_ok(sql, expected)?; - } - // positive case: it is ok for sql stmt with `COMPRESSION TYPE GZIP` tokens let sqls = vec![ ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS @@ -1357,9 +1308,8 @@ mod tests { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec![], - has_header: false, stored_as: Some("CSV".to_owned()), - options: vec![], + options: HashMap::new(), }); assert_eq!(verified_stmt(sql), expected); @@ -1393,9 +1343,8 @@ mod tests { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec![], - has_header: false, stored_as: Some("PARQUET".to_owned()), - options: vec![], + options: HashMap::new(), }); let expected = Statement::Explain(ExplainStatement { analyze, @@ -1430,9 +1379,8 @@ mod tests { source: CopyToSource::Query(query), target: "bar".to_string(), partitioned_by: vec![], - has_header: true, stored_as: Some("CSV".to_owned()), - options: vec![], + options: HashMap::from([("format.has_header".into(), "true".into())]), }); assert_eq!(verified_stmt(sql), expected); Ok(()) @@ -1445,12 +1393,8 @@ mod tests { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec![], - has_header: false, stored_as: Some("CSV".to_owned()), - options: vec![( - "row_group_size".to_string(), - Value::Number("55".to_string(), false), - )], + options: HashMap::from([("row_group_size".into(), "55".into())]), }); assert_eq!(verified_stmt(sql), expected); Ok(()) @@ -1458,17 +1402,13 @@ mod tests { #[test] fn copy_to_partitioned_by() -> Result<(), ParserError> { - let sql = "COPY foo TO bar STORED AS CSV PARTITIONED BY (a) OPTIONS (row_group_size 55)"; + let sql = "COPY foo TO bar STORED AS CSV PARTITIONED BY (a) OPTIONS ('row_group_size' '55')"; let expected = Statement::CopyTo(CopyToStatement { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec!["a".to_string()], - has_header: false, stored_as: Some("CSV".to_owned()), - options: vec![( - "row_group_size".to_string(), - Value::Number("55".to_string(), false), - )], + options: HashMap::from([("row_group_size".to_string(), "55".into())]), }); 
assert_eq!(verified_stmt(sql), expected); Ok(()) @@ -1478,18 +1418,12 @@ mod tests { fn copy_to_multi_options() -> Result<(), ParserError> { // order of options is preserved let sql = - "COPY foo TO bar STORED AS parquet OPTIONS ('format.row_group_size' 55, 'format.compression' snappy)"; + "COPY foo TO bar STORED AS parquet OPTIONS ('format.row_group_size' '55', 'format.compression' 'snappy')"; - let expected_options = vec![ - ( - "format.row_group_size".to_string(), - Value::Number("55".to_string(), false), - ), - ( - "format.compression".to_string(), - Value::UnQuotedString("snappy".to_string()), - ), - ]; + let expected_options = HashMap::from([ + ("format.row_group_size".to_string(), "55".into()), + ("format.compression".to_string(), "snappy".into()), + ]); let mut statements = DFParser::parse_sql(sql).unwrap(); assert_eq!(statements.len(), 1); @@ -1527,6 +1461,7 @@ mod tests { /// `canonical` sql string fn one_statement_parses_to(sql: &str, canonical: &str) -> Statement { let mut statements = DFParser::parse_sql(sql).unwrap(); + println!("{:?}", statements[0]); assert_eq!(statements.len(), 1); if sql != canonical { diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index e6eda0b2787d..e5eef838b301 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -849,36 +849,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } }; - let mut options = HashMap::new(); - for (key, value) in statement.options { - let value_string = match value { - Value::SingleQuotedString(s) => s.to_string(), - Value::DollarQuotedString(s) => s.to_string(), - Value::UnQuotedString(s) => s.to_string(), - Value::Number(_, _) | Value::Boolean(_) => value.to_string(), - Value::DoubleQuotedString(_) - | Value::EscapedStringLiteral(_) - | Value::NationalStringLiteral(_) - | Value::SingleQuotedByteStringLiteral(_) - | Value::DoubleQuotedByteStringLiteral(_) - | Value::RawStringLiteral(_) - | Value::HexStringLiteral(_) - | Value::Null - | Value::Placeholder(_) => { - return plan_err!("Unsupported Value in COPY statement {}", value); - } - }; - if !(&key.contains('.')) { - // If config does not belong to any namespace, assume it is - // a format option and apply the format prefix for backwards - // compatibility. 
- - let renamed_key = format!("format.{}", key); - options.insert(renamed_key.to_lowercase(), value_string.to_lowercase()); - } else { - options.insert(key.to_lowercase(), value_string.to_lowercase()); - } - } + let options = statement.options; let file_type = if let Some(file_type) = statement.stored_as { FileType::from_str(&file_type).map_err(|_| { diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt index 4724bc87a099..9068c77ab422 100644 --- a/datafusion/sqllogictest/test_files/create_external_table.slt +++ b/datafusion/sqllogictest/test_files/create_external_table.slt @@ -52,14 +52,6 @@ CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER LOCATION 'abc' statement error DataFusion error: SQL error: ParserError\("Expected BY, found: LOCATION"\) CREATE EXTERNAL TABLE t STORED AS CSV PARTITIONED LOCATION 'abc' -# Missing `TYPE` in COMPRESSION clause -statement error DataFusion error: SQL error: ParserError\("Expected TYPE, found: LOCATION"\) -CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION LOCATION 'abc' - -# Invalid compression type -statement error DataFusion error: SQL error: ParserError\("Unsupported file compression type ZZZ"\) -CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE ZZZ LOCATION 'blahblah' - # Duplicate `STORED AS` clause statement error DataFusion error: SQL error: ParserError\("STORED AS specified more than once"\) CREATE EXTERNAL TABLE t STORED AS CSV STORED AS PARQUET LOCATION 'foo.parquet' diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 2d4ca1baf23d..6484d437e179 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -277,8 +277,8 @@ statement ok CREATE EXTERNAL TABLE avro_table STORED AS AVRO LOCATION '../../testing/data/avro/simple_enum.avro' -OPTIONS ('format.has_header' 'true'); # It would be great to see the file read as "4" groups with even sizes (offsets) eventually From bb61cc182affe386ecb047aeaaef35751d7a83c5 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Thu, 9 May 2024 15:33:39 +0300 Subject: [PATCH 09/15] Revert "Fix failing tests" This reverts commit 9d816017f2c11d0197c35e4d8a98c249840a6f96. --- datafusion/common/src/config.rs | 6 +- .../core/src/datasource/file_format/csv.rs | 15 +-- datafusion/core/src/datasource/stream.rs | 13 +-- datafusion/core/tests/sql/mod.rs | 2 +- .../tests/cases/roundtrip_logical_plan.rs | 4 +- datafusion/sql/src/parser.rs | 105 ++++++++++++++---- datafusion/sql/src/statement.rs | 31 +++++- .../test_files/create_external_table.slt | 8 ++ .../test_files/repartition_scan.slt | 1 + 9 files changed, 136 insertions(+), 49 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 584e69cd8591..a1445dfb2f40 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -1566,7 +1566,7 @@ config_namespace! { pub struct CsvOptions { /// Specifies whether there is a CSV header (i.e. the first line /// consists of column names). If not specified, uses default from - /// the session state, if any. + /// the `CREATE TABLE` command, if any. pub has_header: Option, default = None pub delimiter: u8, default = b',' pub quote: u8, default = b'"' pub escape: Option, default = None pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED pub schema_infer_max_rec: usize, default = 100 @@ -1609,8 +1609,8 @@ impl CsvOptions { /// Returns true if the first line is a header. If format options does not /// specify whether there is a header, consults the configuration.
- pub fn has_header(&self) -> Option { - self.has_header + pub fn has_header(&self, config_opt: &ConfigOptions) -> bool { + self.has_header.unwrap_or(config_opt.catalog.has_header) } /// The character separating values within a row. diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index bfd0975a52a9..3e5c7af1f82e 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -40,7 +40,7 @@ use arrow::array::RecordBatch; use arrow::csv::WriterBuilder; use arrow::datatypes::SchemaRef; use arrow::datatypes::{DataType, Field, Fields, Schema}; -use datafusion_common::config::CsvOptions; +use datafusion_common::config::{ConfigOptions, CsvOptions}; use datafusion_common::file_options::csv_writer::CsvWriterOptions; use datafusion_common::{exec_err, not_impl_err, DataFusionError, FileType}; use datafusion_execution::TaskContext; @@ -142,8 +142,8 @@ impl CsvFormat { } /// True if the first line is a header. - pub fn has_header(&self) -> Option { - self.options.has_header + pub fn has_header(&self, config_opt: &ConfigOptions) -> bool { + self.options.has_header(config_opt) } /// The character separating values within a row. @@ -245,9 +245,7 @@ impl FileFormat for CsvFormat { conf, // If format options does not specify whether there is a header, // we consult configuration options. - self.options - .has_header - .unwrap_or(state.config_options().catalog.has_header), + self.options.has_header(state.config_options()), self.options.delimiter, self.options.quote, self.options.escape, @@ -305,10 +303,7 @@ impl CsvFormat { while let Some(chunk) = stream.next().await.transpose()? { let format = arrow::csv::reader::Format::default() .with_header( - self.options - .has_header - .unwrap_or(state.config_options().catalog.has_header) - && first_chunk, + self.options.has_header(state.config_options()) && first_chunk, ) .with_delimiter(self.options.delimiter); diff --git a/datafusion/core/src/datasource/stream.rs b/datafusion/core/src/datasource/stream.rs index 41296c1b8d23..5e485f42b516 100644 --- a/datafusion/core/src/datasource/stream.rs +++ b/datafusion/core/src/datasource/stream.rs @@ -30,7 +30,7 @@ use arrow_schema::SchemaRef; use async_trait::async_trait; use futures::StreamExt; -use datafusion_common::{config_err, plan_err, Constraints, DataFusionError, Result}; +use datafusion_common::{plan_err, Constraints, DataFusionError, Result}; use datafusion_common_runtime::SpawnedTask; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::{CreateExternalTable, Expr, TableType}; @@ -58,22 +58,11 @@ impl TableProviderFactory for StreamTableFactory { let schema: SchemaRef = Arc::new(cmd.schema.as_ref().into()); let location = cmd.location.clone(); let encoding = cmd.file_type.parse()?; - let header = if let Ok(opt) = cmd - .options - .get("format.has_header") - .map(|has_header| bool::from_str(has_header)) - .transpose() - { - opt.unwrap_or(false) - } else { - return config_err!("format.has_header can either be true or false"); - }; let config = StreamConfig::new_file(schema, location.into()) .with_encoding(encoding) .with_order(cmd.order_exprs.clone()) .with_batch_size(state.config().batch_size()) - .with_header(header) .with_constraints(cmd.constraints.clone()); Ok(Arc::new(StreamTable(Arc::new(config)))) diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 995ce35c5bc2..00f091d77cff 100644 --- 
a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -85,7 +85,7 @@ async fn register_aggregate_csv_by_sql(ctx: &SessionContext) { c13 VARCHAR NOT NULL ) STORED AS CSV - LOCATION '{testdata}/csv/aggregate_test_100.csv' + LOCATION '{testdata}/csv/aggregate_test_100.csv OPTIONS ('format.has_header' 'true') " )) diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 63e9177ffaf8..74df7178b0f2 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -238,7 +238,7 @@ async fn roundtrip_custom_listing_tables() -> Result<()> { STORED AS CSV WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) - LOCATION '../core/tests/data/window_2.csv' + LOCATION '../core/tests/data/window_2.csv'; OPTIONS ('format.has_header' 'true')"; let plan = ctx.state().create_logical_plan(query).await?; @@ -268,7 +268,7 @@ async fn roundtrip_logical_plan_aggregation_with_pk() -> Result<()> { STORED AS CSV WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) - LOCATION '../core/tests/data/window_2.csv' + LOCATION '../core/tests/data/window_2.csv'; OPTIONS ('format.has_header' 'true')", ) .await?; diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index d3ae7a13ff3d..7468527662f6 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -101,10 +101,12 @@ pub struct CopyToStatement { pub target: String, /// Partition keys pub partitioned_by: Vec, + /// Indicates whether there is a header row (e.g. CSV) + pub has_header: bool, /// File type (Parquet, NDJSON, CSV etc.) pub stored_as: Option, /// Target specific options - pub options: HashMap, + pub options: Vec<(String, Value)>, } impl fmt::Display for CopyToStatement { @@ -127,10 +129,7 @@ impl fmt::Display for CopyToStatement { } if !options.is_empty() { - let opts: Vec<_> = options - .iter() - .map(|(k, v)| format!("'{k}' '{v}'")) - .collect(); + let opts: Vec<_> = options.iter().map(|(k, v)| format!("{k} {v}")).collect(); write!(f, " OPTIONS ({})", opts.join(", "))?; } @@ -387,7 +386,8 @@ impl<'a> DFParser<'a> { stored_as: Option, target: Option, partitioned_by: Option>, - options: Option>, + has_header: Option, + options: Option>, } let mut builder = Builder::default(); @@ -423,7 +423,7 @@ impl<'a> DFParser<'a> { } Keyword::OPTIONS => { ensure_not_set(&builder.options, "OPTIONS")?; - builder.options = Some(self.parse_string_options()?); + builder.options = Some(self.parse_value_options()?); } _ => { unreachable!() @@ -451,8 +451,9 @@ impl<'a> DFParser<'a> { source, target, partitioned_by: builder.partitioned_by.unwrap_or(vec![]), + has_header: builder.has_header.unwrap_or(false), stored_as: builder.stored_as, - options: builder.options.unwrap_or(HashMap::new()), + options: builder.options.unwrap_or(vec![]), })) } @@ -834,6 +835,33 @@ impl<'a> DFParser<'a> { } Ok(options) } + + /// Parses (key value) style options into a map of String --> [`Value`]. + /// + /// Unlike [`Self::parse_string_options`], this method supports + /// keywords as key names as well as multiple value types such as + /// Numbers as well as Strings. 
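+    ///
+    /// For example (illustrative), `OPTIONS ('format.delimiter' '|', row_group_size 55)`
+    /// parses to `SingleQuotedString` and `Number` values, in the order written.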
+ fn parse_value_options(&mut self) -> Result, ParserError> { + let mut options = vec![]; + self.parser.expect_token(&Token::LParen)?; + + loop { + let key = self.parse_option_key()?; + let value = self.parse_option_value()?; + options.push((key, value)); + let comma = self.parser.consume_token(&Token::Comma); + if self.parser.consume_token(&Token::RParen) { + // allow a trailing comma, even though it's not in standard + break; + } else if !comma { + return self.expected( + "',' or ')' after option definition", + self.parser.peek_token(), + ); + } + } + Ok(options) + } } #[cfg(test)] @@ -969,6 +997,27 @@ mod tests { }); expect_parse_ok(sql, expected)?; + // positive case: it is ok for case insensitive sql stmt with has_header option tokens + let sqls = vec![ + "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS ('FORMAT.HAS_HEADER' 'TRUE')", + "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS ('format.has_header' 'true')", + ]; + for sql in sqls { + let expected = Statement::CreateExternalTable(CreateExternalTable { + name: "t".into(), + columns: vec![make_column_def("c1", DataType::Int(display))], + file_type: "CSV".to_string(), + location: "foo.csv".into(), + table_partition_cols: vec![], + order_exprs: vec![], + if_not_exists: false, + unbounded: false, + options: HashMap::from([("format.has_header".into(), "true".into())]), + constraints: vec![], + }); + expect_parse_ok(sql, expected)?; + } + // positive case: it is ok for sql stmt with `COMPRESSION TYPE GZIP` tokens let sqls = vec![ ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS @@ -1308,8 +1357,9 @@ mod tests { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec![], + has_header: false, stored_as: Some("CSV".to_owned()), - options: HashMap::new(), + options: vec![], }); assert_eq!(verified_stmt(sql), expected); @@ -1343,8 +1393,9 @@ mod tests { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec![], + has_header: false, stored_as: Some("PARQUET".to_owned()), - options: HashMap::new(), + options: vec![], }); let expected = Statement::Explain(ExplainStatement { analyze, @@ -1379,8 +1430,9 @@ mod tests { source: CopyToSource::Query(query), target: "bar".to_string(), partitioned_by: vec![], + has_header: true, stored_as: Some("CSV".to_owned()), - options: HashMap::from([("format.has_header".into(), "true".into())]), + options: vec![], }); assert_eq!(verified_stmt(sql), expected); Ok(()) @@ -1393,8 +1445,12 @@ mod tests { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec![], + has_header: false, stored_as: Some("CSV".to_owned()), - options: HashMap::from([("row_group_size".into(), "55".into())]), + options: vec![( + "row_group_size".to_string(), + Value::Number("55".to_string(), false), + )], }); assert_eq!(verified_stmt(sql), expected); Ok(()) @@ -1402,13 +1458,17 @@ mod tests { #[test] fn copy_to_partitioned_by() -> Result<(), ParserError> { - let sql = "COPY foo TO bar STORED AS CSV PARTITIONED BY (a) OPTIONS ('row_group_size' '55')"; + let sql = "COPY foo TO bar STORED AS CSV PARTITIONED BY (a) OPTIONS (row_group_size 55)"; let expected = Statement::CopyTo(CopyToStatement { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec!["a".to_string()], + has_header: false, stored_as: Some("CSV".to_owned()), - options: HashMap::from([("row_group_size".to_string(), "55".into())]), + options: vec![( + "row_group_size".to_string(), + Value::Number("55".to_string(), false), + )], }); 
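        // Round-trip note (illustrative): with the unquoted `key value` rendering
        // in Display, `verified_stmt` reproduces the input SQL exactly.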
assert_eq!(verified_stmt(sql), expected); Ok(()) @@ -1418,12 +1478,18 @@ mod tests { fn copy_to_multi_options() -> Result<(), ParserError> { // order of options is preserved let sql = - "COPY foo TO bar STORED AS parquet OPTIONS ('format.row_group_size' '55', 'format.compression' 'snappy')"; + "COPY foo TO bar STORED AS parquet OPTIONS ('format.row_group_size' 55, 'format.compression' snappy)"; - let expected_options = HashMap::from([ - ("format.row_group_size".to_string(), "55".into()), - ("format.compression".to_string(), "snappy".into()), - ]); + let expected_options = vec![ + ( + "format.row_group_size".to_string(), + Value::Number("55".to_string(), false), + ), + ( + "format.compression".to_string(), + Value::UnQuotedString("snappy".to_string()), + ), + ]; let mut statements = DFParser::parse_sql(sql).unwrap(); assert_eq!(statements.len(), 1); @@ -1461,7 +1527,6 @@ mod tests { /// `canonical` sql string fn one_statement_parses_to(sql: &str, canonical: &str) -> Statement { let mut statements = DFParser::parse_sql(sql).unwrap(); - println!("{:?}", statements[0]); assert_eq!(statements.len(), 1); if sql != canonical { diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index e5eef838b301..e6eda0b2787d 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -849,7 +849,36 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } }; - let options = statement.options; + let mut options = HashMap::new(); + for (key, value) in statement.options { + let value_string = match value { + Value::SingleQuotedString(s) => s.to_string(), + Value::DollarQuotedString(s) => s.to_string(), + Value::UnQuotedString(s) => s.to_string(), + Value::Number(_, _) | Value::Boolean(_) => value.to_string(), + Value::DoubleQuotedString(_) + | Value::EscapedStringLiteral(_) + | Value::NationalStringLiteral(_) + | Value::SingleQuotedByteStringLiteral(_) + | Value::DoubleQuotedByteStringLiteral(_) + | Value::RawStringLiteral(_) + | Value::HexStringLiteral(_) + | Value::Null + | Value::Placeholder(_) => { + return plan_err!("Unsupported Value in COPY statement {}", value); + } + }; + if !(&key.contains('.')) { + // If config does not belong to any namespace, assume it is + // a format option and apply the format prefix for backwards + // compatibility. 
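+                // For example, a bare `compression 'zstd'` is stored under the
+                // namespaced key 'format.compression' (illustrative).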
+ + let renamed_key = format!("format.{}", key); + options.insert(renamed_key.to_lowercase(), value_string.to_lowercase()); + } else { + options.insert(key.to_lowercase(), value_string.to_lowercase()); + } + } let file_type = if let Some(file_type) = statement.stored_as { FileType::from_str(&file_type).map_err(|_| { diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt index 9068c77ab422..4724bc87a099 100644 --- a/datafusion/sqllogictest/test_files/create_external_table.slt +++ b/datafusion/sqllogictest/test_files/create_external_table.slt @@ -52,6 +52,14 @@ CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER LOCATION 'abc' statement error DataFusion error: SQL error: ParserError\("Expected BY, found: LOCATION"\) CREATE EXTERNAL TABLE t STORED AS CSV PARTITIONED LOCATION 'abc' +# Missing `TYPE` in COMPRESSION clause +statement error DataFusion error: SQL error: ParserError\("Expected TYPE, found: LOCATION"\) +CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION LOCATION 'abc' + +# Invalid compression type +statement error DataFusion error: SQL error: ParserError\("Unsupported file compression type ZZZ"\) +CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE ZZZ LOCATION 'blahblah' + # Duplicate `STORED AS` clause statement error DataFusion error: SQL error: ParserError\("STORED AS specified more than once"\) CREATE EXTERNAL TABLE t STORED AS CSV STORED AS PARQUET LOCATION 'foo.parquet' diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 6484d437e179..2d4ca1baf23d 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -278,6 +278,7 @@ statement ok CREATE EXTERNAL TABLE avro_table STORED AS AVRO LOCATION '../../testing/data/avro/simple_enum.avro' +OPTIONS ('format.has_header' 'true'); # It would be great to see the file read as "4" groups with even sizes (offsets) eventually From 43be99b6f0d55f1336daa9b18e407c7c10cb90de Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Thu, 9 May 2024 17:17:04 +0300 Subject: [PATCH 10/15] Final commit --- datafusion/common/src/config.rs | 6 +- .../core/src/datasource/file_format/csv.rs | 15 +- datafusion/core/src/datasource/stream.rs | 13 +- datafusion/core/tests/sql/mod.rs | 2 +- .../tests/cases/roundtrip_logical_plan.rs | 4 +- datafusion/sql/src/parser.rs | 185 +++++++----------- datafusion/sql/src/statement.rs | 39 +++- .../test_files/create_external_table.slt | 10 +- .../sqllogictest/test_files/parquet.slt | 21 +- .../test_files/repartition_scan.slt | 3 +- .../sqllogictest/test_files/subquery.slt | 2 +- 11 files changed, 147 insertions(+), 153 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index a1445dfb2f40..86f35944700c 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -1566,7 +1566,7 @@ config_namespace! { pub struct CsvOptions { /// Specifies whether there is a CSV header (i.e. the first line /// consists of is column names). If not specified, uses default from - /// the `CREATE TABLE` command, if any. + /// the session state. pub has_header: Option, default = None pub delimiter: u8, default = b',' pub quote: u8, default = b'"' @@ -1609,8 +1609,8 @@ impl CsvOptions { /// Returns true if the first line is a header. If format options does not /// specify whether there is a header, consults the configuration. 
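    /// Illustrative caller-side resolution (names assumed):
    /// `opts.has_header().unwrap_or(config.catalog.has_header)`.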
- pub fn has_header(&self, config_opt: &ConfigOptions) -> bool { - self.has_header.unwrap_or(config_opt.catalog.has_header) + pub fn has_header(&self) -> Option { + self.has_header } /// The character separating values within a row. diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index 3e5c7af1f82e..bfd0975a52a9 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -40,7 +40,7 @@ use arrow::array::RecordBatch; use arrow::csv::WriterBuilder; use arrow::datatypes::SchemaRef; use arrow::datatypes::{DataType, Field, Fields, Schema}; -use datafusion_common::config::{ConfigOptions, CsvOptions}; +use datafusion_common::config::CsvOptions; use datafusion_common::file_options::csv_writer::CsvWriterOptions; use datafusion_common::{exec_err, not_impl_err, DataFusionError, FileType}; use datafusion_execution::TaskContext; @@ -142,8 +142,8 @@ impl CsvFormat { } /// True if the first line is a header. - pub fn has_header(&self, config_opt: &ConfigOptions) -> bool { - self.options.has_header(config_opt) + pub fn has_header(&self) -> Option { + self.options.has_header } /// The character separating values within a row. @@ -245,7 +245,9 @@ impl FileFormat for CsvFormat { conf, // If format options does not specify whether there is a header, // we consult configuration options. - self.options.has_header(state.config_options()), + self.options + .has_header + .unwrap_or(state.config_options().catalog.has_header), self.options.delimiter, self.options.quote, self.options.escape, @@ -303,7 +305,10 @@ impl CsvFormat { while let Some(chunk) = stream.next().await.transpose()? { let format = arrow::csv::reader::Format::default() .with_header( - self.options.has_header(state.config_options()) && first_chunk, + self.options + .has_header + .unwrap_or(state.config_options().catalog.has_header) + && first_chunk, ) .with_delimiter(self.options.delimiter); diff --git a/datafusion/core/src/datasource/stream.rs b/datafusion/core/src/datasource/stream.rs index 5e485f42b516..41296c1b8d23 100644 --- a/datafusion/core/src/datasource/stream.rs +++ b/datafusion/core/src/datasource/stream.rs @@ -30,7 +30,7 @@ use arrow_schema::SchemaRef; use async_trait::async_trait; use futures::StreamExt; -use datafusion_common::{plan_err, Constraints, DataFusionError, Result}; +use datafusion_common::{config_err, plan_err, Constraints, DataFusionError, Result}; use datafusion_common_runtime::SpawnedTask; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::{CreateExternalTable, Expr, TableType}; @@ -58,11 +58,22 @@ impl TableProviderFactory for StreamTableFactory { let schema: SchemaRef = Arc::new(cmd.schema.as_ref().into()); let location = cmd.location.clone(); let encoding = cmd.file_type.parse()?; + let header = if let Ok(opt) = cmd + .options + .get("format.has_header") + .map(|has_header| bool::from_str(has_header)) + .transpose() + { + opt.unwrap_or(false) + } else { + return config_err!("format.has_header can either be true or false"); + }; let config = StreamConfig::new_file(schema, location.into()) .with_encoding(encoding) .with_order(cmd.order_exprs.clone()) .with_batch_size(state.config().batch_size()) + .with_header(header) .with_constraints(cmd.constraints.clone()); Ok(Arc::new(StreamTable(Arc::new(config)))) diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs index 00f091d77cff..995ce35c5bc2 100644 --- 
a/datafusion/core/tests/sql/mod.rs +++ b/datafusion/core/tests/sql/mod.rs @@ -85,7 +85,7 @@ async fn register_aggregate_csv_by_sql(ctx: &SessionContext) { c13 VARCHAR NOT NULL ) STORED AS CSV - LOCATION '{testdata}/csv/aggregate_test_100.csv + LOCATION '{testdata}/csv/aggregate_test_100.csv' OPTIONS ('format.has_header' 'true') " )) diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 74df7178b0f2..63e9177ffaf8 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -238,7 +238,7 @@ async fn roundtrip_custom_listing_tables() -> Result<()> { STORED AS CSV WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) - LOCATION '../core/tests/data/window_2.csv'; + LOCATION '../core/tests/data/window_2.csv' OPTIONS ('format.has_header' 'true')"; let plan = ctx.state().create_logical_plan(query).await?; @@ -268,7 +268,7 @@ async fn roundtrip_logical_plan_aggregation_with_pk() -> Result<()> { STORED AS CSV WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) - LOCATION '../core/tests/data/window_2.csv'; + LOCATION '../core/tests/data/window_2.csv' OPTIONS ('format.has_header' 'true')", ) .await?; diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 7468527662f6..6a0798fbcfb4 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -17,7 +17,7 @@ //! [`DFParser`]: DataFusion SQL Parser based on [`sqlparser`] -use std::collections::{HashMap, VecDeque}; +use std::collections::VecDeque; use std::fmt; use sqlparser::{ @@ -101,8 +101,6 @@ pub struct CopyToStatement { pub target: String, /// Partition keys pub partitioned_by: Vec, - /// Indicates whether there is a header row (e.g. CSV) - pub has_header: bool, /// File type (Parquet, NDJSON, CSV etc.) pub stored_as: Option, /// Target specific options @@ -129,7 +127,8 @@ impl fmt::Display for CopyToStatement { } if !options.is_empty() { - let opts: Vec<_> = options.iter().map(|(k, v)| format!("{k} {v}")).collect(); + let opts: Vec<_> = + options.iter().map(|(k, v)| format!("'{k}' {v}")).collect(); write!(f, " OPTIONS ({})", opts.join(", "))?; } @@ -198,7 +197,7 @@ pub struct CreateExternalTable { /// Infinite streams? pub unbounded: bool, /// Table(provider) specific options - pub options: HashMap, + pub options: Vec<(String, Value)>, /// A table-level constraint pub constraints: Vec, } @@ -386,7 +385,6 @@ impl<'a> DFParser<'a> { stored_as: Option, target: Option, partitioned_by: Option>, - has_header: Option, options: Option>, } @@ -413,8 +411,7 @@ impl<'a> DFParser<'a> { Keyword::WITH => { self.parser.expect_keyword(Keyword::HEADER)?; self.parser.expect_keyword(Keyword::ROW)?; - return parser_err!("\"WITH HEADER ROW\" clause is no longer in use. Please use the \"OPTIONS\" - clause with 'format.has_header' set appropriately, e.g., OPTIONS ('format.has_header' 'true')"); + return parser_err!("WITH HEADER ROW clause is no longer in use. 
Please use the OPTIONS clause with 'format.has_header' set appropriately, e.g., OPTIONS ('format.has_header' 'true')"); } Keyword::PARTITIONED => { self.parser.expect_keyword(Keyword::BY)?; @@ -451,7 +448,6 @@ impl<'a> DFParser<'a> { source, target, partitioned_by: builder.partitioned_by.unwrap_or(vec![]), - has_header: builder.has_header.unwrap_or(false), stored_as: builder.stored_as, options: builder.options.unwrap_or(vec![]), })) @@ -686,7 +682,7 @@ impl<'a> DFParser<'a> { location: Option, table_partition_cols: Option>, order_exprs: Vec, - options: Option>, + options: Option>, } let mut builder = Builder::default(); @@ -716,18 +712,15 @@ impl<'a> DFParser<'a> { } else { self.parser.expect_keyword(Keyword::HEADER)?; self.parser.expect_keyword(Keyword::ROW)?; - return parser_err!("\"WITH HEADER ROW\" clause is no longer in use. Please use the \"OPTIONS\" - clause with 'format.has_header' set appropriately, e.g., OPTIONS ('format.has_header' 'true')"); + return parser_err!("WITH HEADER ROW clause is no longer in use. Please use the OPTIONS clause with 'format.has_header' set appropriately, e.g., OPTIONS ('format.has_header' 'true')"); } } Keyword::DELIMITER => { - return parser_err!("\"DELIMITER\" clause is no longer in use. Please use the \"OPTIONS\" - clause with 'format.delimiter' set appropriately, e.g., OPTIONS ('format.delimiter' ',')"); + return parser_err!("DELIMITER clause is no longer in use. Please use the OPTIONS clause with 'format.delimiter' set appropriately, e.g., OPTIONS ('format.delimiter' ',')"); } Keyword::COMPRESSION => { self.parser.expect_keyword(Keyword::TYPE)?; - return parser_err!("\"COMPRESSION TYPE\" clause is no longer in use. Please use the \"OPTIONS\" - clause with 'format.compression' set appropriately, e.g., OPTIONS ('format.compression' 'gzip')"); + return parser_err!("COMPRESSION TYPE clause is no longer in use. Please use the OPTIONS clause with 'format.compression' set appropriately, e.g., OPTIONS ('format.compression' 'gzip')"); } Keyword::PARTITIONED => { self.parser.expect_keyword(Keyword::BY)?; @@ -759,7 +752,7 @@ impl<'a> DFParser<'a> { } Keyword::OPTIONS => { ensure_not_set(&builder.options, "OPTIONS")?; - builder.options = Some(self.parse_string_options()?); + builder.options = Some(self.parse_value_options()?); } _ => { unreachable!() @@ -798,7 +791,7 @@ impl<'a> DFParser<'a> { order_exprs: builder.order_exprs, if_not_exists, unbounded, - options: builder.options.unwrap_or(HashMap::new()), + options: builder.options.unwrap_or(Vec::new()), constraints, }; Ok(Statement::CreateExternalTable(create)) @@ -813,34 +806,10 @@ impl<'a> DFParser<'a> { } } - /// Parses (key value) style options where the values are literal strings. - fn parse_string_options(&mut self) -> Result, ParserError> { - let mut options = HashMap::new(); - self.parser.expect_token(&Token::LParen)?; - - loop { - let key = self.parser.parse_literal_string()?; - let value = self.parser.parse_literal_string()?; - options.insert(key, value); - let comma = self.parser.consume_token(&Token::Comma); - if self.parser.consume_token(&Token::RParen) { - // allow a trailing comma, even though it's not in standard - break; - } else if !comma { - return self.expected( - "',' or ')' after option definition", - self.parser.peek_token(), - ); - } - } - Ok(options) - } - /// Parses (key value) style options into a map of String --> [`Value`]. 
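    /// The pairs are returned in the order they were written (a `Vec` of
    /// key/value pairs rather than an actual map).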
/// - /// Unlike [`Self::parse_string_options`], this method supports - /// keywords as key names as well as multiple value types such as - /// Numbers as well as Strings. + /// This method supports keywords as key names as well as multiple + /// value types such as Numbers as well as Strings. fn parse_value_options(&mut self) -> Result, ParserError> { let mut options = vec![]; self.parser.expect_token(&Token::LParen)?; @@ -925,7 +894,7 @@ mod tests { order_exprs: vec![], if_not_exists: false, unbounded: false, - options: HashMap::new(), + options: Vec::new(), constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -941,7 +910,7 @@ mod tests { order_exprs: vec![], if_not_exists: false, unbounded: false, - options: HashMap::new(), + options: Vec::new(), constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -958,7 +927,7 @@ mod tests { order_exprs: vec![], if_not_exists: false, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -975,7 +944,10 @@ mod tests { order_exprs: vec![], if_not_exists: false, unbounded: false, - options: HashMap::from([("format.delimiter".into(), "|".into())]), + options: vec![( + "format.delimiter".into(), + Value::SingleQuotedString("|".into()), + )], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -992,42 +964,21 @@ mod tests { order_exprs: vec![], if_not_exists: false, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; - // positive case: it is ok for case insensitive sql stmt with has_header option tokens - let sqls = vec![ - "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS ('FORMAT.HAS_HEADER' 'TRUE')", - "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS ('format.has_header' 'true')", - ]; - for sql in sqls { - let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), - columns: vec![make_column_def("c1", DataType::Int(display))], - file_type: "CSV".to_string(), - location: "foo.csv".into(), - table_partition_cols: vec![], - order_exprs: vec![], - if_not_exists: false, - unbounded: false, - options: HashMap::from([("format.has_header".into(), "true".into())]), - constraints: vec![], - }); - expect_parse_ok(sql, expected)?; - } - // positive case: it is ok for sql stmt with `COMPRESSION TYPE GZIP` tokens let sqls = vec![ ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS - ('format.key.compression' 'GZIP')", "GZIP"), + ('format.compression' 'GZIP')", "GZIP"), ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS - ('format.key.compression' 'BZIP2')", "BZIP2"), + ('format.compression' 'BZIP2')", "BZIP2"), ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS - ('format.key.compression' 'XZ')", "XZ"), + ('format.compression' 'XZ')", "XZ"), ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS - ('format.key.compression' 'ZSTD')", "ZSTD"), + ('format.compression' 'ZSTD')", "ZSTD"), ]; for (sql, compression) in sqls { let expected = Statement::CreateExternalTable(CreateExternalTable { @@ -1039,10 +990,10 @@ mod tests { order_exprs: vec![], if_not_exists: false, unbounded: false, - options: vec!["format.key.compression".to_string()] - .into_iter() - .zip(vec![compression.to_string()]) - .collect(), + options: vec![( + "format.compression".into(), + Value::SingleQuotedString(compression.into()), + )], constraints: vec![], }); 
expect_parse_ok(sql, expected)?; @@ -1059,7 +1010,7 @@ mod tests { order_exprs: vec![], if_not_exists: false, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1075,7 +1026,7 @@ mod tests { order_exprs: vec![], if_not_exists: false, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1091,7 +1042,7 @@ mod tests { order_exprs: vec![], if_not_exists: false, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1108,7 +1059,7 @@ mod tests { order_exprs: vec![], if_not_exists: true, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1128,7 +1079,7 @@ mod tests { order_exprs: vec![], if_not_exists: false, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1155,7 +1106,7 @@ mod tests { order_exprs: vec![], if_not_exists: false, unbounded: false, - options: HashMap::from([("k1".into(), "v1".into())]), + options: vec![("k1".into(), Value::SingleQuotedString("v1".into()))], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1172,10 +1123,10 @@ mod tests { order_exprs: vec![], if_not_exists: false, unbounded: false, - options: HashMap::from([ - ("k1".into(), "v1".into()), - ("k2".into(), "v2".into()), - ]), + options: vec![ + ("k1".into(), Value::SingleQuotedString("v1".into())), + ("k2".into(), Value::UnQuotedString("v2".into())), + ], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1218,7 +1169,7 @@ mod tests { }]], if_not_exists: false, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1256,7 +1207,7 @@ mod tests { ]], if_not_exists: false, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1290,7 +1241,7 @@ mod tests { }]], if_not_exists: false, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1302,11 +1253,11 @@ mod tests { WITH ORDER (c1 - c2 ASC) PARTITIONED BY (c1) LOCATION 'foo.parquet' - OPTIONS (ROW_GROUP_SIZE '1024', 'TRUNCATE' 'NO', - 'format.compression' 'zstd', + OPTIONS ('format.compression' 'zstd', 'format.delimiter' '*', - 'format.has_header' 'true') - "; + 'ROW_GROUP_SIZE' '1024', + 'TRUNCATE' 'NO', + 'format.has_header' 'true')"; let expected = Statement::CreateExternalTable(CreateExternalTable { name: "t".into(), columns: vec![ @@ -1333,13 +1284,25 @@ mod tests { }]], if_not_exists: true, unbounded: true, - options: HashMap::from([ - ("format.compression".into(), "zstd".into()), - ("format.delimiter".into(), "*".into()), - ("ROW_GROUP_SIZE".into(), "1024".into()), - ("TRUNCATE".into(), "NO".into()), - ("format.has_header".into(), "true".into()), - ]), + options: vec![ + ( + "format.compression".into(), + Value::SingleQuotedString("zstd".into()), + ), + ( + "format.delimiter".into(), + Value::SingleQuotedString("*".into()), + ), + ( + "ROW_GROUP_SIZE".into(), + Value::SingleQuotedString("1024".into()), + ), + ("TRUNCATE".into(), Value::SingleQuotedString("NO".into())), + ( + "format.has_header".into(), + Value::SingleQuotedString("true".into()), + ), + ], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1357,7 +1320,6 @@ mod tests { source: 
object_name("foo"), target: "bar".to_string(), partitioned_by: vec![], - has_header: false, stored_as: Some("CSV".to_owned()), options: vec![], }); @@ -1393,7 +1355,6 @@ mod tests { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec![], - has_header: false, stored_as: Some("PARQUET".to_owned()), options: vec![], }); @@ -1430,9 +1391,11 @@ mod tests { source: CopyToSource::Query(query), target: "bar".to_string(), partitioned_by: vec![], - has_header: true, stored_as: Some("CSV".to_owned()), - options: vec![], + options: vec![( + "format.has_header".into(), + Value::SingleQuotedString("true".into()), + )], }); assert_eq!(verified_stmt(sql), expected); Ok(()) @@ -1440,16 +1403,15 @@ mod tests { #[test] fn copy_to_options() -> Result<(), ParserError> { - let sql = "COPY foo TO bar STORED AS CSV OPTIONS (row_group_size 55)"; + let sql = "COPY foo TO bar STORED AS CSV OPTIONS ('row_group_size' '55')"; let expected = Statement::CopyTo(CopyToStatement { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec![], - has_header: false, stored_as: Some("CSV".to_owned()), options: vec![( "row_group_size".to_string(), - Value::Number("55".to_string(), false), + Value::SingleQuotedString("55".to_string()), )], }); assert_eq!(verified_stmt(sql), expected); @@ -1458,16 +1420,15 @@ mod tests { #[test] fn copy_to_partitioned_by() -> Result<(), ParserError> { - let sql = "COPY foo TO bar STORED AS CSV PARTITIONED BY (a) OPTIONS (row_group_size 55)"; + let sql = "COPY foo TO bar STORED AS CSV PARTITIONED BY (a) OPTIONS ('row_group_size' '55')"; let expected = Statement::CopyTo(CopyToStatement { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec!["a".to_string()], - has_header: false, stored_as: Some("CSV".to_owned()), options: vec![( "row_group_size".to_string(), - Value::Number("55".to_string(), false), + Value::SingleQuotedString("55".to_string()), )], }); assert_eq!(verified_stmt(sql), expected); diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index e6eda0b2787d..6bdca77f0b02 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -872,7 +872,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // If config does not belong to any namespace, assume it is // a format option and apply the format prefix for backwards // compatibility. 
- let renamed_key = format!("format.{}", key); options.insert(renamed_key.to_lowercase(), value_string.to_lowercase()); } else { @@ -982,7 +981,41 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let inline_constraints = calc_inline_constraints_from_columns(&columns); all_constraints.extend(inline_constraints); - let compression = options + let mut options_map = HashMap::::new(); + for (key, value) in options { + let value_string = match value { + Value::SingleQuotedString(s) => s.to_string(), + Value::DollarQuotedString(s) => s.to_string(), + Value::UnQuotedString(s) => s.to_string(), + Value::Number(_, _) | Value::Boolean(_) => value.to_string(), + Value::DoubleQuotedString(_) + | Value::EscapedStringLiteral(_) + | Value::NationalStringLiteral(_) + | Value::SingleQuotedByteStringLiteral(_) + | Value::DoubleQuotedByteStringLiteral(_) + | Value::RawStringLiteral(_) + | Value::HexStringLiteral(_) + | Value::Null + | Value::Placeholder(_) => { + return plan_err!( + "Unsupported Value in CREATE EXTERNAL TABLE statement {}", + value + ); + } + }; + if !(&key.contains('.')) { + // If config does not belong to any namespace, assume it is + // a format option and apply the format prefix for backwards + // compatibility. + let renamed_key = format!("format.{}", key); + options_map + .insert(renamed_key.to_lowercase(), value_string.to_lowercase()); + } else { + options_map.insert(key.to_lowercase(), value_string.to_lowercase()); + } + } + + let compression = options_map .get("format.compression") .map(|comp| CompressionTypeVariant::from_str(comp)) .transpose()?; @@ -1025,7 +1058,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { definition, order_exprs: ordered_exprs, unbounded, - options, + options: options_map, constraints, column_defaults, }, diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt index 4724bc87a099..ff96220dba60 100644 --- a/datafusion/sqllogictest/test_files/create_external_table.slt +++ b/datafusion/sqllogictest/test_files/create_external_table.slt @@ -33,7 +33,7 @@ statement error DataFusion error: SQL error: ParserError\("Missing LOCATION clau CREATE EXTERNAL TABLE t STORED AS CSV # Option value is missing -statement error DataFusion error: SQL error: ParserError\("Expected literal string, found: \)"\) +statement error DataFusion error: SQL error: ParserError\("Expected string or numeric value, found: \)"\) CREATE EXTERNAL TABLE t STORED AS x OPTIONS ('k1' 'v1', k2 v2, k3) LOCATION 'blahblah' # Missing `(` in WITH ORDER clause @@ -52,14 +52,6 @@ CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER LOCATION 'abc' statement error DataFusion error: SQL error: ParserError\("Expected BY, found: LOCATION"\) CREATE EXTERNAL TABLE t STORED AS CSV PARTITIONED LOCATION 'abc' -# Missing `TYPE` in COMPRESSION clause -statement error DataFusion error: SQL error: ParserError\("Expected TYPE, found: LOCATION"\) -CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION LOCATION 'abc' - -# Invalid compression type -statement error DataFusion error: SQL error: ParserError\("Unsupported file compression type ZZZ"\) -CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE ZZZ LOCATION 'blahblah' - # Duplicate `STORED AS` clause statement error DataFusion error: SQL error: ParserError\("STORED AS specified more than once"\) CREATE EXTERNAL TABLE t STORED AS CSV STORED AS PARQUET LOCATION 'foo.parquet' diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index 
25463eb36273..e70f800bde74 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -107,8 +107,7 @@ CREATE EXTERNAL TABLE test_table ( ) STORED AS PARQUET WITH ORDER (string_col ASC NULLS LAST, int_col ASC NULLS LAST) -LOCATION 'test_files/scratch/parquet/test_table' -OPTIONS ('format.has_header' 'true'); +LOCATION 'test_files/scratch/parquet/test_table'; # Check output plan, expect an "output_ordering" clause in the physical_plan -> ParquetExec: query TT @@ -188,8 +187,7 @@ CREATE EXTERNAL TABLE alltypes_plain ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS PARQUET -LOCATION '../../parquet-testing/data/alltypes_plain.parquet' -OPTIONS ('format.has_header' 'true') +LOCATION '../../parquet-testing/data/alltypes_plain.parquet'; # Test a basic query with a CAST: query IT @@ -213,8 +211,7 @@ DROP TABLE alltypes_plain; statement ok CREATE EXTERNAL TABLE test_binary STORED AS PARQUET -LOCATION '../core/tests/data/test_binary.parquet' -OPTIONS ('format.has_header' 'true'); +LOCATION '../core/tests/data/test_binary.parquet'; # Check size of table: query I @@ -246,8 +243,7 @@ DROP TABLE test_binary; statement ok CREATE EXTERNAL TABLE timestamp_with_tz STORED AS PARQUET -LOCATION '../core/tests/data/timestamp_with_tz.parquet' -OPTIONS ('format.has_header' 'true'); +LOCATION '../core/tests/data/timestamp_with_tz.parquet'; # Check size of table: query I @@ -287,8 +283,7 @@ STORED AS PARQUET; statement ok CREATE EXTERNAL TABLE listing_table STORED AS PARQUET -LOCATION 'test_files/scratch/parquet/test_table/*.parquet' -OPTIONS ('format.has_header' 'true'); +LOCATION 'test_files/scratch/parquet/test_table/*.parquet'; statement ok set datafusion.execution.listing_table_ignore_subdirectory = true; @@ -316,8 +311,7 @@ DROP TABLE timestamp_with_tz; statement ok CREATE EXTERNAL TABLE single_nan STORED AS PARQUET -LOCATION '../../parquet-testing/data/single_nan.parquet' -OPTIONS ('format.has_header' 'true'); +LOCATION '../../parquet-testing/data/single_nan.parquet'; # Check table size: query I @@ -338,8 +332,7 @@ DROP TABLE single_nan; statement ok CREATE EXTERNAL TABLE list_columns STORED AS PARQUET -LOCATION '../../parquet-testing/data/list_columns.parquet' -OPTIONS ('format.has_header' 'true'); +LOCATION '../../parquet-testing/data/list_columns.parquet'; query ?? 
SELECT int64_list, utf8_list FROM list_columns diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 2d4ca1baf23d..1e71696f296a 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -277,8 +277,7 @@ DROP TABLE arrow_table; statement ok CREATE EXTERNAL TABLE avro_table STORED AS AVRO -LOCATION '../../testing/data/avro/simple_enum.avro' -OPTIONS ('format.has_header' 'true'); +LOCATION '../../testing/data/avro/simple_enum.avro'; # It would be great to see the file read as "4" groups with even sizes (offsets) eventually diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index ba4d644345d4..4a9fb38e7db1 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -99,7 +99,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS lineitem ( l_shipinstruct VARCHAR, l_shipmode VARCHAR, l_comment VARCHAR, -) STORED AS CSV LOCATION '../core/tests/tpch-csv/lineitem.csv' OPTIONS ('format.delimiter' ',', format.has_header' 'true'); +) STORED AS CSV LOCATION '../core/tests/tpch-csv/lineitem.csv' OPTIONS ('format.delimiter' ',', 'format.has_header' 'true'); # in_subquery_to_join_with_correlated_outer_filter query ITI rowsort From fe505a2fad289e619fbe9d103955173eb382fe60 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Thu, 9 May 2024 17:59:12 +0300 Subject: [PATCH 11/15] Minor --- datafusion/core/src/datasource/listing/table.rs | 12 ++++++++++++ datafusion/sql/src/parser.rs | 4 ++-- datafusion/sql/src/statement.rs | 5 +++++ 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index 7488091fff70..b59ad439f6d8 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -1561,6 +1561,18 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_insert_into_sql_csv_defaults_header_row() -> Result<()> { + helper_test_insert_into_sql( + "csv", + FileCompressionType::UNCOMPRESSED, + "", + Some(HashMap::from([("has_header".into(), "true".into())])), + ) + .await?; + Ok(()) + } + #[tokio::test] async fn test_insert_into_sql_json_defaults() -> Result<()> { helper_test_insert_into_sql("json", FileCompressionType::UNCOMPRESSED, "", None) diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 6a0798fbcfb4..f61c9cda6345 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -894,7 +894,7 @@ mod tests { order_exprs: vec![], if_not_exists: false, unbounded: false, - options: Vec::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -910,7 +910,7 @@ mod tests { order_exprs: vec![], if_not_exists: false, unbounded: false, - options: Vec::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 6bdca77f0b02..79cd1b643654 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -983,6 +983,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let mut options_map = HashMap::::new(); for (key, value) in options { + if options_map.contains_key(&key) { + return plan_err!("Option {key} is specified multiple times"); + } + let value_string = match value { Value::SingleQuotedString(s) => 
s.to_string(), Value::DollarQuotedString(s) => s.to_string(), @@ -1003,6 +1007,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ); } }; + if !(&key.contains('.')) { // If config does not belong to any namespace, assume it is // a format option and apply the format prefix for backwards From 8ab014ce893988b0c87bbbdad27583526b7a4630 Mon Sep 17 00:00:00 2001 From: Mehmet Ozan Kabak Date: Fri, 10 May 2024 03:23:13 +0300 Subject: [PATCH 12/15] Review --- datafusion/common/src/config.rs | 7 ++++--- datafusion/core/src/datasource/file_format/csv.rs | 12 +++++++----- datafusion/core/src/datasource/listing/table.rs | 1 - datafusion/core/src/datasource/stream.rs | 12 ++++++------ datafusion/core/src/execution/context/mod.rs | 14 ++++++-------- datafusion/core/tests/path_partition.rs | 5 ++--- datafusion/proto/src/physical_plan/to_proto.rs | 5 +---- .../proto/tests/cases/roundtrip_logical_plan.rs | 7 ++++--- datafusion/sql/src/statement.rs | 11 +++++------ datafusion/sql/tests/sql_integration.rs | 2 +- datafusion/sqllogictest/test_files/csv_files.slt | 2 +- .../sqllogictest/test_files/repartition_scan.slt | 4 ++-- 12 files changed, 39 insertions(+), 43 deletions(-) diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 86f35944700c..c60f843393f8 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -1565,8 +1565,8 @@ config_namespace! { /// Options controlling CSV format pub struct CsvOptions { /// Specifies whether there is a CSV header (i.e. the first line - /// consists of is column names). If not specified, uses default from - /// the session state. + /// consists of is column names). The value `None` indicates that + /// the configuration should be consulted. pub has_header: Option, default = None pub delimiter: u8, default = b',' pub quote: u8, default = b'"' @@ -1608,7 +1608,8 @@ impl CsvOptions { } /// Returns true if the first line is a header. If format options does not - /// specify whether there is a header, consults the configuration. + /// specify whether there is a header, returns `None` (indicating that the + /// configuration should be consulted). pub fn has_header(&self) -> Option { self.has_header } diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index bfd0975a52a9..7cf510910b75 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -141,7 +141,8 @@ impl CsvFormat { self } - /// True if the first line is a header. + /// Returns `Some(true)` if the first line is a header, `Some(false)` if + /// it is not, and `None` if it is not specified. pub fn has_header(&self) -> Option { self.options.has_header } @@ -305,10 +306,11 @@ impl CsvFormat { while let Some(chunk) = stream.next().await.transpose()? 
{ let format = arrow::csv::reader::Format::default() .with_header( - self.options - .has_header - .unwrap_or(state.config_options().catalog.has_header) - && first_chunk, + first_chunk + && self + .options + .has_header + .unwrap_or(state.config_options().catalog.has_header), ) .with_delimiter(self.options.delimiter); diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index b59ad439f6d8..cf70894806a3 100644 --- a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -1033,7 +1033,6 @@ impl ListingTable { #[cfg(test)] mod tests { - use super::*; #[cfg(feature = "parquet")] use crate::datasource::{provider_as_source, MemTable}; diff --git a/datafusion/core/src/datasource/stream.rs b/datafusion/core/src/datasource/stream.rs index 41296c1b8d23..4962e410587e 100644 --- a/datafusion/core/src/datasource/stream.rs +++ b/datafusion/core/src/datasource/stream.rs @@ -25,11 +25,12 @@ use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; +use crate::datasource::provider::TableProviderFactory; +use crate::datasource::{create_ordering, TableProvider}; +use crate::execution::context::SessionState; + use arrow_array::{RecordBatch, RecordBatchReader, RecordBatchWriter}; use arrow_schema::SchemaRef; -use async_trait::async_trait; -use futures::StreamExt; - use datafusion_common::{config_err, plan_err, Constraints, DataFusionError, Result}; use datafusion_common_runtime::SpawnedTask; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; @@ -40,9 +41,8 @@ use datafusion_physical_plan::stream::RecordBatchReceiverStreamBuilder; use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; -use crate::datasource::provider::TableProviderFactory; -use crate::datasource::{create_ordering, TableProvider}; -use crate::execution::context::SessionState; +use async_trait::async_trait; +use futures::StreamExt; /// A [`TableProviderFactory`] for [`StreamTable`] #[derive(Debug, Default)] diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 6c2dc692dae0..6b29f352dc82 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -23,6 +23,8 @@ use std::ops::ControlFlow; use std::sync::{Arc, Weak}; use super::options::ReadOptions; +#[cfg(feature = "array_expressions")] +use crate::functions_array; use crate::{ catalog::information_schema::{InformationSchemaProvider, INFORMATION_SCHEMA}, catalog::listing_schema::ListingSchemaProvider, @@ -53,14 +55,12 @@ use crate::{ }, optimizer::analyzer::{Analyzer, AnalyzerRule}, optimizer::optimizer::{Optimizer, OptimizerConfig, OptimizerRule}, + physical_expr::{create_physical_expr, PhysicalExpr}, physical_optimizer::optimizer::{PhysicalOptimizer, PhysicalOptimizerRule}, physical_plan::ExecutionPlan, physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner}, variable::{VarProvider, VarType}, }; - -#[cfg(feature = "array_expressions")] -use crate::functions_array; use crate::{functions, functions_aggregate}; use arrow::datatypes::{DataType, SchemaRef}; @@ -75,10 +75,13 @@ use datafusion_common::{ }; use datafusion_execution::registry::SerializerRegistry; use datafusion_expr::{ + expr_rewriter::FunctionRewrite, logical_plan::{DdlStatement, Statement}, + simplify::SimplifyInfo, var_provider::is_system_variables, Expr, ExprSchemable, StringifiedPlan, 
UserDefinedLogicalNode, WindowUDF, }; +use datafusion_optimizer::simplify_expressions::ExprSimplifier; use datafusion_sql::{ parser::{CopyToSource, CopyToStatement, DFParser}, planner::{object_name_to_table_reference, ContextProvider, ParserOptions, SqlToRel}, @@ -93,14 +96,9 @@ use sqlparser::dialect::dialect_from_str; use url::Url; use uuid::Uuid; -use crate::physical_expr::PhysicalExpr; pub use datafusion_execution::config::SessionConfig; pub use datafusion_execution::TaskContext; pub use datafusion_expr::execution_props::ExecutionProps; -use datafusion_expr::expr_rewriter::FunctionRewrite; -use datafusion_expr::simplify::SimplifyInfo; -use datafusion_optimizer::simplify_expressions::ExprSimplifier; -use datafusion_physical_expr::create_physical_expr; mod avro; mod csv; diff --git a/datafusion/core/tests/path_partition.rs b/datafusion/core/tests/path_partition.rs index 6dfed4be795b..ffe0494dae99 100644 --- a/datafusion/core/tests/path_partition.rs +++ b/datafusion/core/tests/path_partition.rs @@ -38,13 +38,12 @@ use datafusion::{ }; use datafusion_common::stats::Precision; use datafusion_common::ScalarValue; +use datafusion_execution::config::SessionConfig; use async_trait::async_trait; use bytes::Bytes; use chrono::{TimeZone, Utc}; -use datafusion_execution::config::SessionConfig; -use futures::stream; -use futures::stream::BoxStream; +use futures::stream::{self, BoxStream}; use object_store::{ path::Path, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId, ObjectMeta, ObjectStore, PutOptions, PutResult, diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index 85ad6cb6ff4c..ffe2aeaaa25a 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -975,10 +975,7 @@ impl TryFrom<&CsvOptions> for protobuf::CsvOptions { fn try_from(opts: &CsvOptions) -> Result { let compression: protobuf::CompressionTypeVariant = opts.compression.into(); Ok(protobuf::CsvOptions { - has_header: opts - .has_header - .map(|h| h as u8) - .map_or_else(Vec::new, |e| vec![e]), + has_header: opts.has_header.map_or_else(Vec::new, |h| vec![h as u8]), delimiter: vec![opts.delimiter], quote: vec![opts.quote], escape: opts.escape.map_or_else(Vec::new, |e| vec![e]), diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 63e9177ffaf8..8a2032fd46cc 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -30,6 +30,7 @@ use datafusion::datasource::provider::TableProviderFactory; use datafusion::datasource::TableProvider; use datafusion::execution::context::SessionState; use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; +use datafusion::execution::FunctionRegistry; use datafusion::functions_aggregate::covariance::covar_samp; use datafusion::functions_aggregate::expr_fn::first_value; use datafusion::prelude::*; @@ -57,11 +58,11 @@ use datafusion_proto::bytes::{ logical_plan_to_bytes, logical_plan_to_bytes_with_extension_codec, }; use datafusion_proto::logical_plan::to_proto::serialize_expr; -use datafusion_proto::logical_plan::LogicalExtensionCodec; -use datafusion_proto::logical_plan::{from_proto, DefaultLogicalExtensionCodec}; +use datafusion_proto::logical_plan::{ + from_proto, DefaultLogicalExtensionCodec, LogicalExtensionCodec, +}; use datafusion_proto::protobuf; -use datafusion::execution::FunctionRegistry; use prost::Message; 
#[cfg(feature = "json")] diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 79cd1b643654..13d2e05661a8 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -1009,12 +1009,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { }; if !(&key.contains('.')) { - // If config does not belong to any namespace, assume it is + // If a config does not belong to any namespace, we assume it is // a format option and apply the format prefix for backwards // compatibility. - let renamed_key = format!("format.{}", key); - options_map - .insert(renamed_key.to_lowercase(), value_string.to_lowercase()); + let renamed_key = format!("format.{}", key.to_lowercase()); + options_map.insert(renamed_key, value_string.to_lowercase()); } else { options_map.insert(key.to_lowercase(), value_string.to_lowercase()); } @@ -1022,11 +1021,11 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let compression = options_map .get("format.compression") - .map(|comp| CompressionTypeVariant::from_str(comp)) + .map(|c| CompressionTypeVariant::from_str(c)) .transpose()?; if (file_type == "PARQUET" || file_type == "AVRO" || file_type == "ARROW") && compression - .map(|comp| comp != CompressionTypeVariant::UNCOMPRESSED) + .map(|c| c != CompressionTypeVariant::UNCOMPRESSED) .unwrap_or(false) { plan_err!( diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 781d2ce5bb06..af4dac5f3f89 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -32,13 +32,13 @@ use datafusion_expr::{ AggregateUDF, ColumnarValue, ScalarUDF, ScalarUDFImpl, Signature, TableSource, Volatility, WindowUDF, }; +use datafusion_functions::{string, unicode}; use datafusion_sql::unparser::{expr_to_sql, plan_to_sql}; use datafusion_sql::{ parser::DFParser, planner::{ContextProvider, ParserOptions, PlannerContext, SqlToRel}, }; -use datafusion_functions::{string, unicode}; use rstest::rstest; use sqlparser::dialect::{Dialect, GenericDialect, HiveDialect, MySqlDialect}; use sqlparser::parser::Parser; diff --git a/datafusion/sqllogictest/test_files/csv_files.slt b/datafusion/sqllogictest/test_files/csv_files.slt index 5e30d6ed09ae..50477e1dab45 100644 --- a/datafusion/sqllogictest/test_files/csv_files.slt +++ b/datafusion/sqllogictest/test_files/csv_files.slt @@ -160,4 +160,4 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [int_col@0 ASC NULLS LAST] 02)--SortExec: expr=[int_col@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----CsvExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/csv_files/csv_partitions/1.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/csv_files/csv_partitions/2.csv]]}, projection=[int_col, string_col, bigint_col, partition_col], has_header=false \ No newline at end of file +03)----CsvExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/csv_files/csv_partitions/1.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/csv_files/csv_partitions/2.csv]]}, projection=[int_col, string_col, bigint_col, partition_col], has_header=false diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 1e71696f296a..6b9cb521f5f8 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -163,8 +163,8 @@ STORED AS CSV OPTIONS ('format.has_header' 'true'); 
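# Alternatively (illustrative), the per-table option can be omitted in favor of
# the session default: SET datafusion.catalog.has_header = true;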
statement ok CREATE EXTERNAL TABLE csv_table(column1 int) STORED AS csv -LOCATION 'test_files/scratch/repartition_scan/csv_table/' OPTIONS - ('format.has_header' 'true'); +LOCATION 'test_files/scratch/repartition_scan/csv_table/' +OPTIONS ('format.has_header' 'true'); query I select * from csv_table ORDER BY column1; From 86e473e472706b2a3250ce329f94b0db7510a786 Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Fri, 10 May 2024 11:17:59 +0300 Subject: [PATCH 13/15] Update avro.slt --- datafusion/sqllogictest/test_files/avro.slt | 27 +++++++-------------- 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/datafusion/sqllogictest/test_files/avro.slt b/datafusion/sqllogictest/test_files/avro.slt index f9f55f0dc153..fced1924ced9 100644 --- a/datafusion/sqllogictest/test_files/avro.slt +++ b/datafusion/sqllogictest/test_files/avro.slt @@ -31,8 +31,7 @@ CREATE EXTERNAL TABLE alltypes_plain ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -LOCATION '../../testing/data/avro/alltypes_plain.avro' -OPTIONS ('format.has_header' 'true'); +LOCATION '../../testing/data/avro/alltypes_plain.avro'; statement ok CREATE EXTERNAL TABLE alltypes_plain_snappy ( @@ -49,8 +48,7 @@ CREATE EXTERNAL TABLE alltypes_plain_snappy ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -LOCATION '../../testing/data/avro/alltypes_plain.snappy.avro' -OPTIONS ('format.has_header' 'true'); +LOCATION '../../testing/data/avro/alltypes_plain.snappy.avro'; statement ok CREATE EXTERNAL TABLE alltypes_plain_bzip2 ( @@ -67,8 +65,7 @@ CREATE EXTERNAL TABLE alltypes_plain_bzip2 ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -LOCATION '../../testing/data/avro/alltypes_plain.bzip2.avro' -OPTIONS ('format.has_header' 'true'); +LOCATION '../../testing/data/avro/alltypes_plain.bzip2.avro'; statement ok CREATE EXTERNAL TABLE alltypes_plain_xz ( @@ -85,8 +82,7 @@ CREATE EXTERNAL TABLE alltypes_plain_xz ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -LOCATION '../../testing/data/avro/alltypes_plain.xz.avro' -OPTIONS ('format.has_header' 'true'); +LOCATION '../../testing/data/avro/alltypes_plain.xz.avro'; statement ok CREATE EXTERNAL TABLE alltypes_plain_zstandard ( @@ -103,34 +99,29 @@ CREATE EXTERNAL TABLE alltypes_plain_zstandard ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -LOCATION '../../testing/data/avro/alltypes_plain.zstandard.avro' -OPTIONS ('format.has_header' 'true'); +LOCATION '../../testing/data/avro/alltypes_plain.zstandard.avro'; statement ok CREATE EXTERNAL TABLE single_nan ( mycol FLOAT ) STORED AS AVRO -LOCATION '../../testing/data/avro/single_nan.avro' -OPTIONS ('format.has_header' 'true'); +LOCATION '../../testing/data/avro/single_nan.avro'; statement ok CREATE EXTERNAL TABLE nested_records STORED AS AVRO -LOCATION '../../testing/data/avro/nested_records.avro' -OPTIONS ('format.has_header' 'true'); +LOCATION '../../testing/data/avro/nested_records.avro'; statement ok CREATE EXTERNAL TABLE simple_enum STORED AS AVRO -LOCATION '../../testing/data/avro/simple_enum.avro' -OPTIONS ('format.has_header' 'true'); +LOCATION '../../testing/data/avro/simple_enum.avro'; statement ok CREATE EXTERNAL TABLE simple_fixed STORED AS AVRO -LOCATION '../../testing/data/avro/simple_fixed.avro' -OPTIONS ('format.has_header' 'true'); +LOCATION '../../testing/data/avro/simple_fixed.avro'; # test avro query query IT From 3af0bfe722f7c3508cc1c5cfd8ad766d92763f6d Mon Sep 17 00:00:00 2001 From: berkaysynnada Date: Mon, 13 May 2024 10:26:33 +0300 Subject: [PATCH 14/15] Apply suggestions --- 
.../core/src/datasource/file_format/csv.rs | 3 ++- datafusion/core/src/datasource/stream.rs | 2 +- .../test_files/create_external_table.slt | 20 +++++++++++++++++++ .../test_files/tpch/create_tables.slt.part | 2 +- docs/source/user-guide/cli/datasources.md | 2 +- docs/source/user-guide/sql/ddl.md | 10 +++++----- docs/source/user-guide/sql/write_options.md | 2 +- 7 files changed, 31 insertions(+), 10 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index 7cf510910b75..17bc7aafce85 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -734,7 +734,8 @@ mod tests { file_compression_type: FileCompressionType, ) -> Result<()> { let runtime = Arc::new(RuntimeEnv::new(RuntimeConfig::new()).unwrap()); - let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); + let mut cfg = SessionConfig::new(); + cfg.options_mut().catalog.has_header = true; let session_state = SessionState::new_with_config_rt(cfg, runtime); let integration = LocalFileSystem::new_with_prefix(arrow_test_data()).unwrap(); diff --git a/datafusion/core/src/datasource/stream.rs b/datafusion/core/src/datasource/stream.rs index 4962e410587e..f0795bdbb0d8 100644 --- a/datafusion/core/src/datasource/stream.rs +++ b/datafusion/core/src/datasource/stream.rs @@ -66,7 +66,7 @@ impl TableProviderFactory for StreamTableFactory { { opt.unwrap_or(false) } else { - return config_err!("format.has_header can either be true or false"); + return config_err!("Valid values for format.has_header option are 'true' or 'false'"); }; let config = StreamConfig::new_file(schema, location.into()) diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt index ff96220dba60..fca177bb61f0 100644 --- a/datafusion/sqllogictest/test_files/create_external_table.slt +++ b/datafusion/sqllogictest/test_files/create_external_table.slt @@ -185,3 +185,23 @@ CREATE EXTERNAL TABLE test3(name string) PARTITIONED BY (month string, year string) STORED AS parquet LOCATION 'test_files/scratch/create_external_table/manual_partitioning/'; + +# Duplicate key assignment in OPTIONS clause +statement error DataFusion error: Error during planning: Option format.delimiter is specified multiple times +CREATE EXTERNAL TABLE t STORED AS CSV OPTIONS ( + 'format.delimiter' '*', + 'format.has_header' 'true', + 'format.delimiter' '|') +LOCATION 'foo.csv'; + +# If a config does not belong to any namespace, we assume it is a 'format' option and apply the 'format' prefix for backwards compatibility. 
+statement ok +CREATE EXTERNAL TABLE IF NOT EXISTS region ( + r_regionkey BIGINT, + r_name VARCHAR, + r_comment VARCHAR, + r_rev VARCHAR, +) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' +OPTIONS ( + 'format.delimiter' '|', + 'has_header' 'false'); \ No newline at end of file diff --git a/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part b/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part index 75bcbc198bef..111d24773055 100644 --- a/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part @@ -121,4 +121,4 @@ CREATE EXTERNAL TABLE IF NOT EXISTS region ( r_name VARCHAR, r_comment VARCHAR, r_rev VARCHAR, -) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' OPTIONS ('format.delimiter' '|'); +) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' OPTIONS ('format.delimiter' '|'); \ No newline at end of file diff --git a/docs/source/user-guide/cli/datasources.md b/docs/source/user-guide/cli/datasources.md index 8b254a896f90..2b11645c471a 100644 --- a/docs/source/user-guide/cli/datasources.md +++ b/docs/source/user-guide/cli/datasources.md @@ -167,7 +167,7 @@ Register a single file csv datasource with a header row. CREATE EXTERNAL TABLE test STORED AS CSV LOCATION '/path/to/aggregate_test_100.csv' -OPTIONS ('format.has_header' 'true'); +OPTIONS ('has_header' 'true'); ``` Register a single file csv datasource with explicitly defined schema. diff --git a/docs/source/user-guide/sql/ddl.md b/docs/source/user-guide/sql/ddl.md index 41980851df1e..e16b9681eb80 100644 --- a/docs/source/user-guide/sql/ddl.md +++ b/docs/source/user-guide/sql/ddl.md @@ -98,7 +98,7 @@ scanning a subset of the file. CREATE EXTERNAL TABLE test STORED AS CSV LOCATION '/path/to/aggregate_simple.csv' -OPTIONS ('format.has_header' 'true'); +OPTIONS ('has_header' 'true'); ``` It is also possible to use compressed files, such as `.csv.gz`: @@ -108,7 +108,7 @@ CREATE EXTERNAL TABLE test STORED AS CSV COMPRESSION TYPE GZIP LOCATION '/path/to/aggregate_simple.csv.gz' -OPTIONS ('format.has_header' 'true'); +OPTIONS ('has_header' 'true'); ``` It is also possible to specify the schema manually. @@ -131,7 +131,7 @@ CREATE EXTERNAL TABLE test ( ) STORED AS CSV LOCATION '/path/to/aggregate_test_100.csv' -OPTIONS ('format.has_header' 'true'); +OPTIONS ('has_header' 'true'); ``` It is also possible to specify a directory that contains a partitioned @@ -141,7 +141,7 @@ table (multiple files with the same schema) CREATE EXTERNAL TABLE test STORED AS CSV LOCATION '/path/to/directory/of/files' -OPTIONS ('format.has_header' 'true'); +OPTIONS ('has_header' 'true'); ``` With `CREATE UNBOUNDED EXTERNAL TABLE` SQL statement. 
 We can create unbounded data sources as follows:

@@ -180,7 +180,7 @@ CREATE EXTERNAL TABLE test (
 STORED AS CSV
 WITH ORDER (c2 ASC, c5 + c8 DESC NULL FIRST)
 LOCATION '/path/to/aggregate_test_100.csv'
-OPTIONS ('format.has_header' 'true');
+OPTIONS ('has_header' 'true');
 ```

 Where the `WITH ORDER` clause specifies the sort order:
diff --git a/docs/source/user-guide/sql/write_options.md b/docs/source/user-guide/sql/write_options.md
index c496d8d48a02..3c4790dd0255 100644
--- a/docs/source/user-guide/sql/write_options.md
+++ b/docs/source/user-guide/sql/write_options.md
@@ -42,7 +42,7 @@ CREATE EXTERNAL TABLE
 LOCATION '/test/location/my_csv_table/'
 OPTIONS(
 NULL_VALUE 'NAN',
-'format.has_header' 'true'
+'has_header' 'true'
 )
 ```

From a817dc86d4622d70482d3e27351e390a6a39a878 Mon Sep 17 00:00:00 2001
From: berkaysynnada
Date: Mon, 13 May 2024 10:27:42 +0300
Subject: [PATCH 15/15] Fix imports

---
 datafusion/core/src/datasource/stream.rs     |  4 +++-
 datafusion/core/src/execution/context/mod.rs | 21 ++++++++++---------
 .../tests/cases/roundtrip_logical_plan.rs    | 17 +++++++--------
 3 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/datafusion/core/src/datasource/stream.rs b/datafusion/core/src/datasource/stream.rs
index f0795bdbb0d8..bcce3c1b6422 100644
--- a/datafusion/core/src/datasource/stream.rs
+++ b/datafusion/core/src/datasource/stream.rs
@@ -66,7 +66,9 @@ impl TableProviderFactory for StreamTableFactory {
         {
             opt.unwrap_or(false)
         } else {
-            return config_err!("Valid values for format.has_header option are 'true' or 'false'");
+            return config_err!(
+                "Valid values for format.has_header option are 'true' or 'false'"
+            );
         };

         let config = StreamConfig::new_file(schema, location.into())
diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs
index 6b29f352dc82..e69a249410b1 100644
--- a/datafusion/core/src/execution/context/mod.rs
+++ b/datafusion/core/src/execution/context/mod.rs
@@ -23,8 +23,6 @@ use std::ops::ControlFlow;
 use std::sync::{Arc, Weak};

 use super::options::ReadOptions;
-#[cfg(feature = "array_expressions")]
-use crate::functions_array;
 use crate::{
     catalog::information_schema::{InformationSchemaProvider, INFORMATION_SCHEMA},
     catalog::listing_schema::ListingSchemaProvider,
@@ -55,17 +53,22 @@ use crate::{
     },
     optimizer::analyzer::{Analyzer, AnalyzerRule},
     optimizer::optimizer::{Optimizer, OptimizerConfig, OptimizerRule},
-    physical_expr::{create_physical_expr, PhysicalExpr},
     physical_optimizer::optimizer::{PhysicalOptimizer, PhysicalOptimizerRule},
     physical_plan::ExecutionPlan,
     physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner},
     variable::{VarProvider, VarType},
 };
+
+#[cfg(feature = "array_expressions")]
+use crate::functions_array;
 use crate::{functions, functions_aggregate};

 use arrow::datatypes::{DataType, SchemaRef};
 use arrow::record_batch::RecordBatch;
 use arrow_schema::Schema;
+use async_trait::async_trait;
+use chrono::{DateTime, Utc};
+use datafusion_common::tree_node::TreeNode;
 use datafusion_common::{
     alias::AliasGenerator,
     config::{ConfigExtension, TableOptions},
@@ -75,30 +78,28 @@ use datafusion_common::{
 };
 use datafusion_execution::registry::SerializerRegistry;
 use datafusion_expr::{
-    expr_rewriter::FunctionRewrite,
     logical_plan::{DdlStatement, Statement},
-    simplify::SimplifyInfo,
     var_provider::is_system_variables,
     Expr, ExprSchemable, StringifiedPlan,
     UserDefinedLogicalNode, WindowUDF,
 };
-use datafusion_optimizer::simplify_expressions::ExprSimplifier;
 use datafusion_sql::{
parser::{CopyToSource, CopyToStatement, DFParser}, planner::{object_name_to_table_reference, ContextProvider, ParserOptions, SqlToRel}, ResolvedTableReference, }; - -use async_trait::async_trait; -use chrono::{DateTime, Utc}; -use datafusion_common::tree_node::TreeNode; use parking_lot::RwLock; use sqlparser::dialect::dialect_from_str; use url::Url; use uuid::Uuid; +use crate::physical_expr::PhysicalExpr; pub use datafusion_execution::config::SessionConfig; pub use datafusion_execution::TaskContext; pub use datafusion_expr::execution_props::ExecutionProps; +use datafusion_expr::expr_rewriter::FunctionRewrite; +use datafusion_expr::simplify::SimplifyInfo; +use datafusion_optimizer::simplify_expressions::ExprSimplifier; +use datafusion_physical_expr::create_physical_expr; mod avro; mod csv; diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 4ef7da0f04aa..e5e57c0bc893 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -15,12 +15,6 @@ // specific language governing permissions and limitations // under the License. -use std::any::Any; -use std::collections::HashMap; -use std::fmt::{self, Debug, Formatter}; -use std::sync::Arc; -use std::vec; - use arrow::array::{ArrayRef, FixedSizeListArray}; use arrow::datatypes::{ DataType, Field, Fields, Int32Type, IntervalDayTimeType, IntervalMonthDayNanoType, @@ -57,11 +51,16 @@ use datafusion_proto::bytes::{ logical_plan_to_bytes, logical_plan_to_bytes_with_extension_codec, }; use datafusion_proto::logical_plan::to_proto::serialize_expr; -use datafusion_proto::logical_plan::{ - from_proto, DefaultLogicalExtensionCodec, LogicalExtensionCodec, -}; +use datafusion_proto::logical_plan::LogicalExtensionCodec; +use datafusion_proto::logical_plan::{from_proto, DefaultLogicalExtensionCodec}; use datafusion_proto::protobuf; +use std::any::Any; +use std::collections::HashMap; +use std::fmt::{self, Debug, Formatter}; +use std::sync::Arc; +use std::vec; +use datafusion::execution::FunctionRegistry; use prost::Message; #[cfg(feature = "json")]
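
Taken together, the series implements two small rules: OPTIONS keys without a namespace are rewritten with a `format.` prefix (which is why the plain `has_header` and `delimiter` spellings in the docs and tests above keep working), and the now-optional CSV `has_header` flag is carried through protobuf as an empty or single-byte `bytes` field. The standalone sketch below uses hypothetical helper names, not DataFusion's public API, to illustrate both rules:

```rust
use std::collections::HashMap;

/// Key normalization mirroring the `statement.rs` hunk: a key without a
/// namespace (no '.') is assumed to be a format option and receives the
/// `format.` prefix for backwards compatibility; namespaced keys are only
/// lowercased. (Hypothetical helper, not DataFusion's actual function.)
fn normalize_option_key(key: &str) -> String {
    if !key.contains('.') {
        format!("format.{}", key.to_lowercase())
    } else {
        key.to_lowercase()
    }
}

/// Encoding mirroring the `to_proto.rs` hunk: an optional bool is stored in
/// a proto `bytes` field as an empty vector (unset) or one 0/1 byte.
fn encode_has_header(has_header: Option<bool>) -> Vec<u8> {
    has_header.map_or_else(Vec::new, |h| vec![h as u8])
}

fn main() {
    let mut options_map: HashMap<String, String> = HashMap::new();
    // 'has_header' has no namespace, so it becomes 'format.has_header'.
    options_map.insert(normalize_option_key("has_header"), "true".to_string());
    // An already-namespaced key passes through, lowercased.
    options_map.insert(normalize_option_key("format.DELIMITER"), "|".to_string());

    assert_eq!(
        options_map.get("format.has_header").map(String::as_str),
        Some("true")
    );
    assert_eq!(
        options_map.get("format.delimiter").map(String::as_str),
        Some("|")
    );

    // An unset header preference serializes to an empty bytes field;
    // Some(true) serializes to a single byte with value 1.
    assert_eq!(encode_has_header(None), Vec::<u8>::new());
    assert_eq!(encode_has_header(Some(true)), vec![1u8]);
}
```

The asserts encode the same expectations as the backwards-compatibility test added to `create_external_table.slt` and the `CsvOptions` serialization change in `to_proto.rs`.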