diff --git a/datafusion-cli/src/helper.rs b/datafusion-cli/src/helper.rs
index 85f14c1736dc..f93aaec4218d 100644
--- a/datafusion-cli/src/helper.rs
+++ b/datafusion-cli/src/helper.rs
@@ -20,25 +20,20 @@
 use std::borrow::Cow;

+use crate::highlighter::{NoSyntaxHighlighter, SyntaxHighlighter};
+
 use datafusion::common::sql_datafusion_err;
 use datafusion::error::DataFusionError;
 use datafusion::sql::parser::{DFParser, Statement};
 use datafusion::sql::sqlparser::dialect::dialect_from_str;
 use datafusion::sql::sqlparser::parser::ParserError;
-use rustyline::completion::Completer;
-use rustyline::completion::FilenameCompleter;
-use rustyline::completion::Pair;
+
+use rustyline::completion::{Completer, FilenameCompleter, Pair};
 use rustyline::error::ReadlineError;
 use rustyline::highlight::Highlighter;
 use rustyline::hint::Hinter;
-use rustyline::validate::ValidationContext;
-use rustyline::validate::ValidationResult;
-use rustyline::validate::Validator;
-use rustyline::Context;
-use rustyline::Helper;
-use rustyline::Result;
-
-use crate::highlighter::{NoSyntaxHighlighter, SyntaxHighlighter};
+use rustyline::validate::{ValidationContext, ValidationResult, Validator};
+use rustyline::{Context, Helper, Result};

 pub struct CliHelper {
     completer: FilenameCompleter,
@@ -259,52 +254,69 @@ mod tests {

         // should be valid
         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter ',';".as_bytes()),
-            &validator,
-        )?;
+            Cursor::new(
+                r"create external table test stored as csv location 'data.csv' options ('format.delimiter' ',');"
+                    .as_bytes(),
+            ),
+            &validator,
+        )?;
         assert!(matches!(result, ValidationResult::Valid(None)));

         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\0';".as_bytes()),
-            &validator,
-        )?;
+            Cursor::new(
+                r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\0');"
+                    .as_bytes()),
+            &validator,
+        )?;
         assert!(matches!(result, ValidationResult::Valid(None)));

         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\n';".as_bytes()),
-            &validator,
-        )?;
+            Cursor::new(
+                r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\n');"
+                    .as_bytes()),
+            &validator,
+        )?;
         assert!(matches!(result, ValidationResult::Valid(None)));

         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\r';".as_bytes()),
-            &validator,
-        )?;
+            Cursor::new(
+                r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\r');"
+                    .as_bytes()),
+            &validator,
+        )?;
         assert!(matches!(result, ValidationResult::Valid(None)));

         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\t';".as_bytes()),
-            &validator,
-        )?;
+            Cursor::new(
+                r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\t');"
+                    .as_bytes()),
+            &validator,
+        )?;
         assert!(matches!(result, ValidationResult::Valid(None)));

         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\\';".as_bytes()),
-            &validator,
-        )?;
+            Cursor::new(
+                r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\\');"
+                    .as_bytes()),
+            &validator,
+        )?;
         assert!(matches!(result, ValidationResult::Valid(None)));

-        // should be invalid
        let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter ',,';".as_bytes()),
-            &validator,
-        )?;
-        assert!(matches!(result, ValidationResult::Invalid(Some(_))));
+            Cursor::new(
+                r"create external table test stored as csv location 'data.csv' options ('format.delimiter' ',,');"
+                    .as_bytes()),
+            &validator,
+        )?;
+        assert!(matches!(result, ValidationResult::Valid(None)));

+        // should be invalid
         let result = readline_direct(
-            Cursor::new(r"create external table test stored as csv location 'data.csv' delimiter '\u{07}';".as_bytes()),
-            &validator,
-        )?;
+            Cursor::new(
+                r"create external table test stored as csv location 'data.csv' options ('format.delimiter' '\u{07}');"
+                    .as_bytes()),
+            &validator,
+        )?;
         assert!(matches!(result, ValidationResult::Invalid(Some(_))));

         Ok(())
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 18cd83a47097..c60f843393f8 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -1564,18 +1564,21 @@ config_namespace_with_hashmap! {
 config_namespace! {
     /// Options controlling CSV format
     pub struct CsvOptions {
-        pub has_header: bool, default = true
+        /// Specifies whether there is a CSV header (i.e. the first line
+        /// consists of column names). The value `None` indicates that
+        /// the configuration should be consulted.
+        pub has_header: Option<bool>, default = None
         pub delimiter: u8, default = b','
         pub quote: u8, default = b'"'
         pub escape: Option<u8>, default = None
         pub compression: CompressionTypeVariant, default = CompressionTypeVariant::UNCOMPRESSED
         pub schema_infer_max_rec: usize, default = 100
-        pub date_format: Option<String>, default = None
-        pub datetime_format: Option<String>, default = None
-        pub timestamp_format: Option<String>, default = None
-        pub timestamp_tz_format: Option<String>, default = None
-        pub time_format: Option<String>, default = None
-        pub null_value: Option<String>, default = None
+        pub date_format: Option<String>, default = None
+        pub datetime_format: Option<String>, default = None
+        pub timestamp_format: Option<String>, default = None
+        pub timestamp_tz_format: Option<String>, default = None
+        pub time_format: Option<String>, default = None
+        pub null_value: Option<String>, default = None
     }
 }

@@ -1600,12 +1603,14 @@ impl CsvOptions {
     /// Set true to indicate that the first line is a header.
     /// - default to true
     pub fn with_has_header(mut self, has_header: bool) -> Self {
-        self.has_header = has_header;
+        self.has_header = Some(has_header);
         self
     }

-    /// True if the first line is a header.
-    pub fn has_header(&self) -> bool {
+    /// Returns true if the first line is a header. If the format options do
+    /// not specify whether there is a header, returns `None` (indicating that
+    /// the configuration should be consulted).
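+    ///
+    /// A minimal sketch of the intended fallback (illustrative only; `state`
+    /// stands in for a `SessionState`, mirroring the resolution done in
+    /// `CsvFormat::create_physical_plan` further below):
+    ///
+    /// ```ignore
+    /// let has_header = csv_options
+    ///     .has_header
+    ///     .unwrap_or(state.config_options().catalog.has_header);
+    /// ```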
+    pub fn has_header(&self) -> Option<bool> {
         self.has_header
     }

diff --git a/datafusion/common/src/file_options/csv_writer.rs b/datafusion/common/src/file_options/csv_writer.rs
index 5f1a62682f8d..2904ea0f8f03 100644
--- a/datafusion/common/src/file_options/csv_writer.rs
+++ b/datafusion/common/src/file_options/csv_writer.rs
@@ -50,7 +50,7 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions {

     fn try_from(value: &CsvOptions) -> Result<Self> {
         let mut builder = WriterBuilder::default()
-            .with_header(value.has_header)
+            .with_header(value.has_header.unwrap_or(false))
             .with_delimiter(value.delimiter);

         if let Some(v) = &value.date_format {
diff --git a/datafusion/core/src/catalog/listing_schema.rs b/datafusion/core/src/catalog/listing_schema.rs
index a5960b21dff5..29f3e4ad8181 100644
--- a/datafusion/core/src/catalog/listing_schema.rs
+++ b/datafusion/core/src/catalog/listing_schema.rs
@@ -27,7 +27,6 @@ use crate::datasource::provider::TableProviderFactory;
 use crate::datasource::TableProvider;
 use crate::execution::context::SessionState;

-use datafusion_common::parsers::CompressionTypeVariant;
 use datafusion_common::{Constraints, DFSchema, DataFusionError, TableReference};
 use datafusion_expr::CreateExternalTable;

@@ -58,7 +57,6 @@ pub struct ListingSchemaProvider {
     store: Arc<dyn ObjectStore>,
     tables: Arc<Mutex<HashMap<String, Arc<dyn TableProvider>>>>,
     format: String,
-    has_header: bool,
 }

 impl ListingSchemaProvider {
@@ -77,7 +75,6 @@ impl ListingSchemaProvider {
         factory: Arc<dyn TableProviderFactory>,
         store: Arc<dyn ObjectStore>,
         format: String,
-        has_header: bool,
     ) -> Self {
         Self {
             authority,
@@ -86,7 +83,6 @@ impl ListingSchemaProvider {
             store,
             tables: Arc::new(Mutex::new(HashMap::new())),
             format,
-            has_header,
         }
     }

@@ -139,12 +135,9 @@ impl ListingSchemaProvider {
                     name,
                     location: table_url,
                     file_type: self.format.clone(),
-                    has_header: self.has_header,
-                    delimiter: ',',
                     table_partition_cols: vec![],
                     if_not_exists: false,
                     definition: None,
-                    file_compression_type: CompressionTypeVariant::UNCOMPRESSED,
                     order_exprs: vec![],
                     unbounded: false,
                     options: Default::default(),
diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs
index 645f98cd3fb0..17bc7aafce85 100644
--- a/datafusion/core/src/datasource/file_format/csv.rs
+++ b/datafusion/core/src/datasource/file_format/csv.rs
@@ -32,8 +32,9 @@ use crate::datasource::physical_plan::{
 use crate::error::Result;
 use crate::execution::context::SessionState;
 use crate::physical_plan::insert::{DataSink, DataSinkExec};
-use crate::physical_plan::{DisplayAs, DisplayFormatType, Statistics};
-use crate::physical_plan::{ExecutionPlan, SendableRecordBatchStream};
+use crate::physical_plan::{
+    DisplayAs, DisplayFormatType, ExecutionPlan, SendableRecordBatchStream, Statistics,
+};

 use arrow::array::RecordBatch;
 use arrow::csv::WriterBuilder;
@@ -136,12 +137,13 @@ impl CsvFormat {
     /// Set true to indicate that the first line is a header.
     /// - default to true
     pub fn with_has_header(mut self, has_header: bool) -> Self {
-        self.options.has_header = has_header;
+        self.options.has_header = Some(has_header);
         self
     }

-    /// True if the first line is a header.
-    pub fn has_header(&self) -> bool {
+    /// Returns `Some(true)` if the first line is a header, `Some(false)` if
+    /// it is not, and `None` if it is not specified.
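+    ///
+    /// The flag is normally supplied through the `OPTIONS` clause; an
+    /// illustrative statement (table name and path are placeholders):
+    ///
+    /// ```sql
+    /// CREATE EXTERNAL TABLE t STORED AS CSV
+    /// LOCATION 'data.csv'
+    /// OPTIONS ('format.has_header' 'true');
+    /// ```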
+    pub fn has_header(&self) -> Option<bool> {
         self.options.has_header
     }

@@ -200,7 +202,7 @@ impl FileFormat for CsvFormat {
     async fn infer_schema(
         &self,
-        _state: &SessionState,
+        state: &SessionState,
         store: &Arc<dyn ObjectStore>,
         objects: &[ObjectMeta],
     ) -> Result<SchemaRef> {
@@ -211,7 +213,7 @@
         for object in objects {
             let stream = self.read_to_delimited_chunks(store, object).await;
             let (schema, records_read) = self
-                .infer_schema_from_stream(records_to_read, stream)
+                .infer_schema_from_stream(state, records_to_read, stream)
                 .await?;
             records_to_read -= records_read;
             schemas.push(schema);
@@ -236,13 +238,17 @@ impl FileFormat for CsvFormat {
     async fn create_physical_plan(
         &self,
-        _state: &SessionState,
+        state: &SessionState,
         conf: FileScanConfig,
         _filters: Option<&Arc<dyn PhysicalExpr>>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let exec = CsvExec::new(
             conf,
-            self.options.has_header,
+            // If the format options do not specify whether there is a header,
+            // we consult the configuration options.
+            self.options
+                .has_header
+                .unwrap_or(state.config_options().catalog.has_header),
             self.options.delimiter,
             self.options.quote,
             self.options.escape,
@@ -286,6 +292,7 @@ impl CsvFormat {
     /// number of lines that were read
     async fn infer_schema_from_stream(
         &self,
+        state: &SessionState,
         mut records_to_read: usize,
         stream: impl Stream<Item = Result<Bytes>>,
     ) -> Result<(Schema, usize)> {
@@ -298,7 +305,13 @@
         while let Some(chunk) = stream.next().await.transpose()? {
             let format = arrow::csv::reader::Format::default()
-                .with_header(self.options.has_header && first_chunk)
+                .with_header(
+                    first_chunk
+                        && self
+                            .options
+                            .has_header
+                            .unwrap_or(state.config_options().catalog.has_header),
+                )
                 .with_delimiter(self.options.delimiter);

             let (Schema { fields, .. }, records_read) =
@@ -538,6 +551,7 @@ mod tests {
     use datafusion_common::cast::as_string_array;
     use datafusion_common::stats::Precision;
     use datafusion_common::{internal_err, GetExt};
+    use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv};
     use datafusion_expr::{col, lit};

     use chrono::DateTime;
@@ -554,7 +568,8 @@ mod tests {
         let task_ctx = state.task_ctx();
         // skip column 9 that overflows the automatically discovered column type of i64 (u64 would work)
         let projection = Some(vec![0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12]);
-        let exec = get_exec(&state, "aggregate_test_100.csv", projection, None).await?;
+        let exec =
+            get_exec(&state, "aggregate_test_100.csv", projection, None, true).await?;
         let stream = exec.execute(0, task_ctx)?;

         let tt_batches: i32 = stream
@@ -582,7 +597,7 @@
         let task_ctx = session_ctx.task_ctx();
         let projection = Some(vec![0, 1, 2, 3]);
         let exec =
-            get_exec(&state, "aggregate_test_100.csv", projection, Some(1)).await?;
+            get_exec(&state, "aggregate_test_100.csv", projection, Some(1), true).await?;
         let batches = collect(exec, task_ctx).await?;
         assert_eq!(1, batches.len());
         assert_eq!(4, batches[0].num_columns());
@@ -597,7 +612,8 @@
         let state = session_ctx.state();
         let projection = None;
-        let exec = get_exec(&state, "aggregate_test_100.csv", projection, None).await?;
+        let exec =
+            get_exec(&state, "aggregate_test_100.csv", projection, None, true).await?;

         let x: Vec<String> = exec
             .schema()
@@ -633,7 +649,8 @@
         let state = session_ctx.state();
         let task_ctx = session_ctx.task_ctx();
         let projection = Some(vec![0]);
-        let exec = get_exec(&state, "aggregate_test_100.csv", projection, None).await?;
+        let exec =
+            get_exec(&state, "aggregate_test_100.csv", projection, None, true).await?;
         let batches = collect(exec, task_ctx).await.expect("Collect batches");
@@ -716,6 +733,11 @@ mod tests {
     async fn query_compress_data(
         file_compression_type: FileCompressionType,
     ) -> Result<()> {
+        let runtime = Arc::new(RuntimeEnv::new(RuntimeConfig::new()).unwrap());
+        let mut cfg = SessionConfig::new();
+        cfg.options_mut().catalog.has_header = true;
+        let session_state = SessionState::new_with_config_rt(cfg, runtime);
+
         let integration = LocalFileSystem::new_with_prefix(arrow_test_data()).unwrap();

         let path = Path::from("csv/aggregate_test_100.csv");
@@ -757,7 +779,7 @@
             .read_to_delimited_chunks_from_stream(compressed_stream.unwrap())
             .await;
         let (schema, records_read) = compressed_csv
-            .infer_schema_from_stream(records_to_read, decoded_stream)
+            .infer_schema_from_stream(&session_state, records_to_read, decoded_stream)
             .await?;

         assert_eq!(expected, schema);
@@ -803,9 +825,10 @@
         file_name: &str,
         projection: Option<Vec<usize>>,
         limit: Option<usize>,
+        has_header: bool,
     ) -> Result<Arc<dyn ExecutionPlan>> {
         let root = format!("{}/csv", crate::test_util::arrow_test_data());
-        let format = CsvFormat::default();
+        let format = CsvFormat::default().with_has_header(has_header);
         scan_format(state, &format, &root, file_name, projection, limit).await
     }
diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index e6f763f52426..cf70894806a3 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -1033,7 +1033,6 @@ impl ListingTable {
 #[cfg(test)]
 mod tests {
-
     use super::*;
     #[cfg(feature = "parquet")]
     use crate::datasource::{provider_as_source, MemTable};
@@ -1566,8 +1565,8 @@ mod tests {
         helper_test_insert_into_sql(
             "csv",
             FileCompressionType::UNCOMPRESSED,
-            "WITH HEADER ROW",
-            None,
+            "",
+            Some(HashMap::from([("has_header".into(), "true".into())])),
         )
         .await?;
         Ok(())
diff --git a/datafusion/core/src/datasource/listing_table_factory.rs b/datafusion/core/src/datasource/listing_table_factory.rs
index 1a0eb34d1234..987b9e12a424 100644
--- a/datafusion/core/src/datasource/listing_table_factory.rs
+++ b/datafusion/core/src/datasource/listing_table_factory.rs
@@ -66,11 +66,7 @@ impl TableProviderFactory for ListingTableFactory {
         let file_extension = get_extension(cmd.location.as_str());
         let file_format: Arc<dyn FileFormat> = match file_type {
             FileType::CSV => {
-                let mut csv_options = table_options.csv;
-                csv_options.has_header = cmd.has_header;
-                csv_options.delimiter = cmd.delimiter as u8;
-                csv_options.compression = cmd.file_compression_type;
-                Arc::new(CsvFormat::default().with_options(csv_options))
+                Arc::new(CsvFormat::default().with_options(table_options.csv))
             }
             #[cfg(feature = "parquet")]
             FileType::PARQUET => {
@@ -78,9 +74,7 @@
             }
             FileType::AVRO => Arc::new(AvroFormat),
             FileType::JSON => {
-                let mut json_options = table_options.json;
-                json_options.compression = cmd.file_compression_type;
-                Arc::new(JsonFormat::default().with_options(json_options))
+                Arc::new(JsonFormat::default().with_options(table_options.json))
             }
             FileType::ARROW => Arc::new(ArrowFormat),
         };
@@ -172,7 +166,6 @@ mod tests {
     use super::*;
     use crate::execution::context::SessionContext;

-    use datafusion_common::parsers::CompressionTypeVariant;
     use datafusion_common::{Constraints, DFSchema, TableReference};

     #[tokio::test]
@@ -191,16 +184,13 @@
             name,
             location: csv_file.path().to_str().unwrap().to_string(),
             file_type: "csv".to_string(),
-            has_header: true,
-            delimiter: ',',
             schema: Arc::new(DFSchema::empty()),
table_partition_cols: vec![], if_not_exists: false, - file_compression_type: CompressionTypeVariant::UNCOMPRESSED, definition: None, order_exprs: vec![], unbounded: false, - options: HashMap::new(), + options: HashMap::from([("format.has_header".into(), "true".into())]), constraints: Constraints::empty(), column_defaults: HashMap::new(), }; @@ -228,16 +218,14 @@ mod tests { let mut options = HashMap::new(); options.insert("format.schema_infer_max_rec".to_owned(), "1000".to_owned()); + options.insert("format.has_header".into(), "true".into()); let cmd = CreateExternalTable { name, location: csv_file.path().to_str().unwrap().to_string(), file_type: "csv".to_string(), - has_header: true, - delimiter: ',', schema: Arc::new(DFSchema::empty()), table_partition_cols: vec![], if_not_exists: false, - file_compression_type: CompressionTypeVariant::UNCOMPRESSED, definition: None, order_exprs: vec![], unbounded: false, diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs index 879461c2eb1e..cc7c837e471e 100644 --- a/datafusion/core/src/datasource/physical_plan/csv.rs +++ b/datafusion/core/src/datasource/physical_plan/csv.rs @@ -610,7 +610,8 @@ mod tests { async fn csv_exec_with_mixed_order_projection( file_compression_type: FileCompressionType, ) -> Result<()> { - let session_ctx = SessionContext::new(); + let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); + let session_ctx = SessionContext::new_with_config(cfg); let task_ctx = session_ctx.task_ctx(); let file_schema = aggr_test_schema(); let path = format!("{}/csv", arrow_test_data()); @@ -675,7 +676,8 @@ mod tests { async fn csv_exec_with_limit( file_compression_type: FileCompressionType, ) -> Result<()> { - let session_ctx = SessionContext::new(); + let cfg = SessionConfig::new().set_str("datafusion.catalog.has_header", "true"); + let session_ctx = SessionContext::new_with_config(cfg); let task_ctx = session_ctx.task_ctx(); let file_schema = aggr_test_schema(); let path = format!("{}/csv", arrow_test_data()); @@ -1017,7 +1019,9 @@ mod tests { // create partitioned input file and context let tmp_dir = TempDir::new()?; let ctx = SessionContext::new_with_config( - SessionConfig::new().with_target_partitions(8), + SessionConfig::new() + .with_target_partitions(8) + .set_str("datafusion.catalog.has_header", "false"), ); let schema = populate_csv_partitions(&tmp_dir, 8, ".csv")?; @@ -1045,7 +1049,9 @@ mod tests { .await?; // create a new context and verify that the results were saved to a partitioned csv file - let ctx = SessionContext::new(); + let ctx = SessionContext::new_with_config( + SessionConfig::new().set_str("datafusion.catalog.has_header", "false"), + ); let schema = Arc::new(Schema::new(vec![ Field::new("c1", DataType::UInt32, false), @@ -1074,7 +1080,7 @@ mod tests { panic!("Did not find part_0 in csv output files!") } // register each partition as well as the top level dir - let csv_read_option = CsvReadOptions::new().schema(&schema); + let csv_read_option = CsvReadOptions::new().schema(&schema).has_header(false); ctx.register_csv( "part0", &format!("{out_dir}/{part_0_name}"), diff --git a/datafusion/core/src/datasource/stream.rs b/datafusion/core/src/datasource/stream.rs index 0a64e7c516ef..bcce3c1b6422 100644 --- a/datafusion/core/src/datasource/stream.rs +++ b/datafusion/core/src/datasource/stream.rs @@ -25,12 +25,13 @@ use std::path::PathBuf; use std::str::FromStr; use std::sync::Arc; +use crate::datasource::provider::TableProviderFactory; 
+use crate::datasource::{create_ordering, TableProvider}; +use crate::execution::context::SessionState; + use arrow_array::{RecordBatch, RecordBatchReader, RecordBatchWriter}; use arrow_schema::SchemaRef; -use async_trait::async_trait; -use futures::StreamExt; - -use datafusion_common::{plan_err, Constraints, DataFusionError, Result}; +use datafusion_common::{config_err, plan_err, Constraints, DataFusionError, Result}; use datafusion_common_runtime::SpawnedTask; use datafusion_execution::{SendableRecordBatchStream, TaskContext}; use datafusion_expr::{CreateExternalTable, Expr, TableType}; @@ -40,9 +41,8 @@ use datafusion_physical_plan::stream::RecordBatchReceiverStreamBuilder; use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; -use crate::datasource::provider::TableProviderFactory; -use crate::datasource::{create_ordering, TableProvider}; -use crate::execution::context::SessionState; +use async_trait::async_trait; +use futures::StreamExt; /// A [`TableProviderFactory`] for [`StreamTable`] #[derive(Debug, Default)] @@ -58,12 +58,24 @@ impl TableProviderFactory for StreamTableFactory { let schema: SchemaRef = Arc::new(cmd.schema.as_ref().into()); let location = cmd.location.clone(); let encoding = cmd.file_type.parse()?; + let header = if let Ok(opt) = cmd + .options + .get("format.has_header") + .map(|has_header| bool::from_str(has_header)) + .transpose() + { + opt.unwrap_or(false) + } else { + return config_err!( + "Valid values for format.has_header option are 'true' or 'false'" + ); + }; let config = StreamConfig::new_file(schema, location.into()) .with_encoding(encoding) .with_order(cmd.order_exprs.clone()) - .with_header(cmd.has_header) .with_batch_size(state.config().batch_size()) + .with_header(header) .with_constraints(cmd.constraints.clone()); Ok(Arc::new(StreamTable(Arc::new(config)))) diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index f3af31f895f9..e69a249410b1 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -66,6 +66,9 @@ use crate::{functions, functions_aggregate}; use arrow::datatypes::{DataType, SchemaRef}; use arrow::record_batch::RecordBatch; use arrow_schema::Schema; +use async_trait::async_trait; +use chrono::{DateTime, Utc}; +use datafusion_common::tree_node::TreeNode; use datafusion_common::{ alias::AliasGenerator, config::{ConfigExtension, TableOptions}, @@ -84,10 +87,6 @@ use datafusion_sql::{ planner::{object_name_to_table_reference, ContextProvider, ParserOptions, SqlToRel}, ResolvedTableReference, }; - -use async_trait::async_trait; -use chrono::{DateTime, Utc}; -use datafusion_common::tree_node::TreeNode; use parking_lot::RwLock; use sqlparser::dialect::dialect_from_str; use url::Url; @@ -1560,7 +1559,6 @@ impl SessionState { let url = url.to_string(); let format = format.to_string(); - let has_header = config.options().catalog.has_header; let url = Url::parse(url.as_str()).expect("Invalid default catalog location!"); let authority = match url.host_str() { Some(host) => format!("{}://{}", url.scheme(), host), @@ -1578,14 +1576,8 @@ impl SessionState { Some(factory) => factory, _ => return, }; - let schema = ListingSchemaProvider::new( - authority, - path, - factory.clone(), - store, - format, - has_header, - ); + let schema = + ListingSchemaProvider::new(authority, path, factory.clone(), store, format); let _ = default_catalog 
.register_schema("default", Arc::new(schema))
             .expect("Failed to register default schema");
diff --git a/datafusion/core/tests/fifo.rs b/datafusion/core/tests/fifo.rs
index 9b132f18c7a5..a63240d03d94 100644
--- a/datafusion/core/tests/fifo.rs
+++ b/datafusion/core/tests/fifo.rs
@@ -384,8 +384,8 @@ mod unix_test {
                 a2 INT NOT NULL
             )
             STORED AS CSV
-            WITH HEADER ROW
-            LOCATION '{source_display_fifo_path}'"
+            LOCATION '{source_display_fifo_path}'
+            OPTIONS ('format.has_header' 'true')"
         ))
         .await?;

@@ -396,8 +396,8 @@
                 a2 INT NOT NULL
             )
             STORED AS CSV
-            WITH HEADER ROW
-            LOCATION '{sink_display_fifo_path}'"
+            LOCATION '{sink_display_fifo_path}'
+            OPTIONS ('format.has_header' 'true')"
         ))
         .await?;

diff --git a/datafusion/core/tests/path_partition.rs b/datafusion/core/tests/path_partition.rs
index dd8eb52f67c7..ffe0494dae99 100644
--- a/datafusion/core/tests/path_partition.rs
+++ b/datafusion/core/tests/path_partition.rs
@@ -38,12 +38,12 @@ use datafusion::{
 };
 use datafusion_common::stats::Precision;
 use datafusion_common::ScalarValue;
+use datafusion_execution::config::SessionConfig;

 use async_trait::async_trait;
 use bytes::Bytes;
 use chrono::{TimeZone, Utc};
-use futures::stream;
-use futures::stream::BoxStream;
+use futures::stream::{self, BoxStream};
 use object_store::{
     path::Path, GetOptions, GetResult, GetResultPayload, ListResult, MultipartId,
     ObjectMeta, ObjectStore, PutOptions, PutResult,
@@ -202,7 +202,9 @@ fn extract_as_utf(v: &ScalarValue) -> Option<String> {

 #[tokio::test]
 async fn csv_filter_with_file_col() -> Result<()> {
-    let ctx = SessionContext::new();
+    let ctx = SessionContext::new_with_config(
+        SessionConfig::new().set_str("datafusion.catalog.has_header", "true"),
+    );

     register_partitioned_aggregate_csv(
         &ctx,
@@ -238,7 +240,9 @@

 #[tokio::test]
 async fn csv_filter_with_file_nonstring_col() -> Result<()> {
-    let ctx = SessionContext::new();
+    let ctx = SessionContext::new_with_config(
+        SessionConfig::new().set_str("datafusion.catalog.has_header", "true"),
+    );

     register_partitioned_aggregate_csv(
         &ctx,
@@ -274,7 +278,9 @@

 #[tokio::test]
 async fn csv_projection_on_partition() -> Result<()> {
-    let ctx = SessionContext::new();
+    let ctx = SessionContext::new_with_config(
+        SessionConfig::new().set_str("datafusion.catalog.has_header", "true"),
+    );

     register_partitioned_aggregate_csv(
         &ctx,
@@ -310,7 +316,9 @@

 #[tokio::test]
 async fn csv_grouping_by_partition() -> Result<()> {
-    let ctx = SessionContext::new();
+    let ctx = SessionContext::new_with_config(
+        SessionConfig::new().set_str("datafusion.catalog.has_header", "true"),
+    );

     register_partitioned_aggregate_csv(
         &ctx,
diff --git a/datafusion/core/tests/sql/mod.rs b/datafusion/core/tests/sql/mod.rs
index 9b7828a777c8..995ce35c5bc2 100644
--- a/datafusion/core/tests/sql/mod.rs
+++ b/datafusion/core/tests/sql/mod.rs
@@ -85,8 +85,8 @@ async fn register_aggregate_csv_by_sql(ctx: &SessionContext) {
             c13 VARCHAR NOT NULL
         )
         STORED AS CSV
-        WITH HEADER ROW
        LOCATION '{testdata}/csv/aggregate_test_100.csv'
+        OPTIONS ('format.has_header' 'true')
        "
    ))
    .await
diff --git a/datafusion/expr/src/logical_plan/ddl.rs b/datafusion/expr/src/logical_plan/ddl.rs
index 8d72c9a8b036..4538ff52c052 100644
--- a/datafusion/expr/src/logical_plan/ddl.rs
+++ b/datafusion/expr/src/logical_plan/ddl.rs
@@ -25,7 +25,6 @@ use std::{

 use crate::{Expr, LogicalPlan, Volatility};

 use arrow::datatypes::DataType;
-use datafusion_common::parsers::CompressionTypeVariant;
 use datafusion_common::{Constraints, DFSchemaRef, SchemaReference, TableReference};
 use sqlparser::ast::Ident;

@@ -190,10 +189,6 @@ pub struct CreateExternalTable {
     pub location: String,
     /// The file type of physical file
     pub file_type: String,
-    /// Whether the CSV file contains a header
-    pub has_header: bool,
-    /// Delimiter for CSV
-    pub delimiter: char,
     /// Partition Columns
     pub table_partition_cols: Vec<String>,
     /// Option to not error if table already exists
@@ -202,8 +197,6 @@
     pub definition: Option<String>,
     /// Order expressions supplied by user
     pub order_exprs: Vec<Vec<Expr>>,
-    /// File compression type (GZIP, BZIP2, XZ, ZSTD)
-    pub file_compression_type: CompressionTypeVariant,
     /// Whether the table is an infinite stream
     pub unbounded: bool,
     /// Table(provider) specific options
@@ -221,12 +214,9 @@ impl Hash for CreateExternalTable {
         self.name.hash(state);
         self.location.hash(state);
         self.file_type.hash(state);
-        self.has_header.hash(state);
-        self.delimiter.hash(state);
         self.table_partition_cols.hash(state);
         self.if_not_exists.hash(state);
         self.definition.hash(state);
-        self.file_compression_type.hash(state);
         self.order_exprs.hash(state);
         self.unbounded.hash(state);
         self.options.len().hash(state); // HashMap is not hashable
diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto
index 311a0bf863ad..fd79345275ab 100644
--- a/datafusion/proto/proto/datafusion.proto
+++ b/datafusion/proto/proto/datafusion.proto
@@ -201,23 +201,19 @@ message Constraints{
 message CreateExternalTableNode {
   reserved 1; // was string name
-  TableReference name = 12;
+  TableReference name = 9;
   string location = 2;
   string file_type = 3;
-  bool has_header = 4;
-  DfSchema schema = 5;
-  repeated string table_partition_cols = 6;
-  bool if_not_exists = 7;
-  string delimiter = 8;
-  string definition = 9;
-  reserved 10; // was string file_compression_type
-  CompressionTypeVariant file_compression_type = 17;
-  repeated LogicalExprNodeCollection order_exprs = 13;
-  bool unbounded = 14;
-  map<string, string> options = 11;
-  Constraints constraints = 15;
-  map<string, LogicalExprNode> column_defaults = 16;
-}
+  DfSchema schema = 4;
+  repeated string table_partition_cols = 5;
+  bool if_not_exists = 6;
+  string definition = 7;
+  repeated LogicalExprNodeCollection order_exprs = 10;
+  bool unbounded = 11;
+  map<string, string> options = 8;
+  Constraints constraints = 12;
+  map<string, LogicalExprNode> column_defaults = 13;
+ }

 message PrepareNode {
     string name = 1;
@@ -1106,7 +1102,7 @@ message CsvWriterOptions {

 // Options controlling CSV format
 message CsvOptions {
-  bool has_header = 1; // Indicates if the CSV has a header row
+  bytes has_header = 1; // Indicates if the CSV has a header row
   bytes delimiter = 2; // Delimiter character as a byte
   bytes quote = 3; // Quote character as a byte
   bytes escape = 4; // Optional escape character as a byte
diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs
index a1a141735881..01d9a6e0dde6 100644
--- a/datafusion/proto/src/generated/pbjson.rs
+++ b/datafusion/proto/src/generated/pbjson.rs
@@ -4673,9 +4673,6 @@ impl serde::Serialize for CreateExternalTableNode {
         if !self.file_type.is_empty() {
             len += 1;
         }
-        if self.has_header {
-            len += 1;
-        }
         if self.schema.is_some() {
             len += 1;
         }
@@ -4685,15 +4682,9 @@
         if self.if_not_exists {
             len += 1;
         }
-        if !self.delimiter.is_empty() {
-            len += 1;
-        }
         if !self.definition.is_empty() {
             len += 1;
         }
-
if self.file_compression_type != 0 { - len += 1; - } if !self.order_exprs.is_empty() { len += 1; } @@ -4719,9 +4710,6 @@ impl serde::Serialize for CreateExternalTableNode { if !self.file_type.is_empty() { struct_ser.serialize_field("fileType", &self.file_type)?; } - if self.has_header { - struct_ser.serialize_field("hasHeader", &self.has_header)?; - } if let Some(v) = self.schema.as_ref() { struct_ser.serialize_field("schema", v)?; } @@ -4731,17 +4719,9 @@ impl serde::Serialize for CreateExternalTableNode { if self.if_not_exists { struct_ser.serialize_field("ifNotExists", &self.if_not_exists)?; } - if !self.delimiter.is_empty() { - struct_ser.serialize_field("delimiter", &self.delimiter)?; - } if !self.definition.is_empty() { struct_ser.serialize_field("definition", &self.definition)?; } - if self.file_compression_type != 0 { - let v = CompressionTypeVariant::try_from(self.file_compression_type) - .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", self.file_compression_type)))?; - struct_ser.serialize_field("fileCompressionType", &v)?; - } if !self.order_exprs.is_empty() { struct_ser.serialize_field("orderExprs", &self.order_exprs)?; } @@ -4771,17 +4751,12 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode { "location", "file_type", "fileType", - "has_header", - "hasHeader", "schema", "table_partition_cols", "tablePartitionCols", "if_not_exists", "ifNotExists", - "delimiter", "definition", - "file_compression_type", - "fileCompressionType", "order_exprs", "orderExprs", "unbounded", @@ -4796,13 +4771,10 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode { Name, Location, FileType, - HasHeader, Schema, TablePartitionCols, IfNotExists, - Delimiter, Definition, - FileCompressionType, OrderExprs, Unbounded, Options, @@ -4832,13 +4804,10 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode { "name" => Ok(GeneratedField::Name), "location" => Ok(GeneratedField::Location), "fileType" | "file_type" => Ok(GeneratedField::FileType), - "hasHeader" | "has_header" => Ok(GeneratedField::HasHeader), "schema" => Ok(GeneratedField::Schema), "tablePartitionCols" | "table_partition_cols" => Ok(GeneratedField::TablePartitionCols), "ifNotExists" | "if_not_exists" => Ok(GeneratedField::IfNotExists), - "delimiter" => Ok(GeneratedField::Delimiter), "definition" => Ok(GeneratedField::Definition), - "fileCompressionType" | "file_compression_type" => Ok(GeneratedField::FileCompressionType), "orderExprs" | "order_exprs" => Ok(GeneratedField::OrderExprs), "unbounded" => Ok(GeneratedField::Unbounded), "options" => Ok(GeneratedField::Options), @@ -4866,13 +4835,10 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode { let mut name__ = None; let mut location__ = None; let mut file_type__ = None; - let mut has_header__ = None; let mut schema__ = None; let mut table_partition_cols__ = None; let mut if_not_exists__ = None; - let mut delimiter__ = None; let mut definition__ = None; - let mut file_compression_type__ = None; let mut order_exprs__ = None; let mut unbounded__ = None; let mut options__ = None; @@ -4898,12 +4864,6 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode { } file_type__ = Some(map_.next_value()?); } - GeneratedField::HasHeader => { - if has_header__.is_some() { - return Err(serde::de::Error::duplicate_field("hasHeader")); - } - has_header__ = Some(map_.next_value()?); - } GeneratedField::Schema => { if schema__.is_some() { return Err(serde::de::Error::duplicate_field("schema")); @@ -4922,24 +4882,12 @@ impl<'de> 
serde::Deserialize<'de> for CreateExternalTableNode {
                    }
                    if_not_exists__ = Some(map_.next_value()?);
                }
-                GeneratedField::Delimiter => {
-                    if delimiter__.is_some() {
-                        return Err(serde::de::Error::duplicate_field("delimiter"));
-                    }
-                    delimiter__ = Some(map_.next_value()?);
-                }
                 GeneratedField::Definition => {
                     if definition__.is_some() {
                         return Err(serde::de::Error::duplicate_field("definition"));
                     }
                     definition__ = Some(map_.next_value()?);
                 }
-                GeneratedField::FileCompressionType => {
-                    if file_compression_type__.is_some() {
-                        return Err(serde::de::Error::duplicate_field("fileCompressionType"));
-                    }
-                    file_compression_type__ = Some(map_.next_value::<CompressionTypeVariant>()? as i32);
-                }
                 GeneratedField::OrderExprs => {
                     if order_exprs__.is_some() {
                         return Err(serde::de::Error::duplicate_field("orderExprs"));
                     }
@@ -4980,13 +4928,10 @@ impl<'de> serde::Deserialize<'de> for CreateExternalTableNode {
                     name: name__,
                     location: location__.unwrap_or_default(),
                     file_type: file_type__.unwrap_or_default(),
-                    has_header: has_header__.unwrap_or_default(),
                     schema: schema__,
                     table_partition_cols: table_partition_cols__.unwrap_or_default(),
                     if_not_exists: if_not_exists__.unwrap_or_default(),
-                    delimiter: delimiter__.unwrap_or_default(),
                     definition: definition__.unwrap_or_default(),
-                    file_compression_type: file_compression_type__.unwrap_or_default(),
                     order_exprs: order_exprs__.unwrap_or_default(),
                     unbounded: unbounded__.unwrap_or_default(),
                     options: options__.unwrap_or_default(),
@@ -5456,7 +5401,7 @@ impl serde::Serialize for CsvOptions {
     {
         use serde::ser::SerializeStruct;
         let mut len = 0;
-        if self.has_header {
+        if !self.has_header.is_empty() {
             len += 1;
         }
         if !self.delimiter.is_empty() {
@@ -5493,8 +5438,9 @@
             len += 1;
         }
         let mut struct_ser = serializer.serialize_struct("datafusion.CsvOptions", len)?;
-        if self.has_header {
-            struct_ser.serialize_field("hasHeader", &self.has_header)?;
+        if !self.has_header.is_empty() {
+            #[allow(clippy::needless_borrow)]
+            struct_ser.serialize_field("hasHeader", pbjson::private::base64::encode(&self.has_header).as_str())?;
         }
         if !self.delimiter.is_empty() {
             #[allow(clippy::needless_borrow)]
@@ -5651,7 +5597,9 @@ impl<'de> serde::Deserialize<'de> for CsvOptions {
                             if has_header__.is_some() {
                                 return Err(serde::de::Error::duplicate_field("hasHeader"));
                             }
-                            has_header__ = Some(map_.next_value()?);
+                            has_header__ =
+                                Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0)
+                            ;
                         }
                         GeneratedField::Delimiter => {
                             if delimiter__.is_some() {
diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs
index 706794e38070..64e72ba03878 100644
--- a/datafusion/proto/src/generated/prost.rs
+++ b/datafusion/proto/src/generated/prost.rs
@@ -320,38 +320,32 @@ pub struct Constraints {
 #[allow(clippy::derive_partial_eq_without_eq)]
 #[derive(Clone, PartialEq, ::prost::Message)]
 pub struct CreateExternalTableNode {
-    #[prost(message, optional, tag = "12")]
+    #[prost(message, optional, tag = "9")]
     pub name: ::core::option::Option<TableReference>,
     #[prost(string, tag = "2")]
     pub location: ::prost::alloc::string::String,
     #[prost(string, tag = "3")]
     pub file_type: ::prost::alloc::string::String,
-    #[prost(bool, tag = "4")]
-    pub has_header: bool,
-    #[prost(message, optional, tag = "5")]
+    #[prost(message, optional, tag = "4")]
     pub schema: ::core::option::Option<DfSchema>,
-    #[prost(string, repeated, tag = "6")]
+    #[prost(string, repeated, tag = "5")]
     pub table_partition_cols: ::prost::alloc::vec::Vec<::prost::alloc::string::String>,
-    #[prost(bool, tag = "7")]
+    #[prost(bool, tag = "6")]
#[prost(bool, tag = "6")] pub if_not_exists: bool, - #[prost(string, tag = "8")] - pub delimiter: ::prost::alloc::string::String, - #[prost(string, tag = "9")] + #[prost(string, tag = "7")] pub definition: ::prost::alloc::string::String, - #[prost(enumeration = "CompressionTypeVariant", tag = "17")] - pub file_compression_type: i32, - #[prost(message, repeated, tag = "13")] + #[prost(message, repeated, tag = "10")] pub order_exprs: ::prost::alloc::vec::Vec, - #[prost(bool, tag = "14")] + #[prost(bool, tag = "11")] pub unbounded: bool, - #[prost(map = "string, string", tag = "11")] + #[prost(map = "string, string", tag = "8")] pub options: ::std::collections::HashMap< ::prost::alloc::string::String, ::prost::alloc::string::String, >, - #[prost(message, optional, tag = "15")] + #[prost(message, optional, tag = "12")] pub constraints: ::core::option::Option, - #[prost(map = "string, message", tag = "16")] + #[prost(map = "string, message", tag = "13")] pub column_defaults: ::std::collections::HashMap< ::prost::alloc::string::String, LogicalExprNode, @@ -1684,8 +1678,8 @@ pub struct CsvWriterOptions { #[derive(Clone, PartialEq, ::prost::Message)] pub struct CsvOptions { /// Indicates if the CSV has a header row - #[prost(bool, tag = "1")] - pub has_header: bool, + #[prost(bytes = "vec", tag = "1")] + pub has_header: ::prost::alloc::vec::Vec, /// Delimiter character as a byte #[prost(bytes = "vec", tag = "2")] pub delimiter: ::prost::alloc::vec::Vec, diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index cccfb0c27307..a6352bcefc3e 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -539,37 +539,27 @@ impl AsLogicalPlan for LogicalPlanNode { column_defaults.insert(col_name.clone(), expr); } - let file_compression_type = protobuf::CompressionTypeVariant::try_from( - create_extern_table.file_compression_type, - ) - .map_err(|_| { - proto_error(format!( - "Unknown file compression type {}", - create_extern_table.file_compression_type - )) - })?; - - Ok(LogicalPlan::Ddl(DdlStatement::CreateExternalTable(CreateExternalTable { - schema: pb_schema.try_into()?, - name: from_table_reference(create_extern_table.name.as_ref(), "CreateExternalTable")?, - location: create_extern_table.location.clone(), - file_type: create_extern_table.file_type.clone(), - has_header: create_extern_table.has_header, - delimiter: create_extern_table.delimiter.chars().next().ok_or_else(|| { - DataFusionError::Internal(String::from("Protobuf deserialization error, unable to parse CSV delimiter")) - })?, - table_partition_cols: create_extern_table - .table_partition_cols - .clone(), - order_exprs, - if_not_exists: create_extern_table.if_not_exists, - file_compression_type: file_compression_type.into(), - definition, - unbounded: create_extern_table.unbounded, - options: create_extern_table.options.clone(), - constraints: constraints.into(), - column_defaults, - }))) + Ok(LogicalPlan::Ddl(DdlStatement::CreateExternalTable( + CreateExternalTable { + schema: pb_schema.try_into()?, + name: from_table_reference( + create_extern_table.name.as_ref(), + "CreateExternalTable", + )?, + location: create_extern_table.location.clone(), + file_type: create_extern_table.file_type.clone(), + table_partition_cols: create_extern_table + .table_partition_cols + .clone(), + order_exprs, + if_not_exists: create_extern_table.if_not_exists, + definition, + unbounded: create_extern_table.unbounded, + options: create_extern_table.options.clone(), + 
+                        column_defaults,
+                    },
+                )))
             }
             LogicalPlanType::CreateView(create_view) => {
                 let plan = create_view
@@ -1327,13 +1317,10 @@ impl AsLogicalPlan for LogicalPlanNode {
                 name,
                 location,
                 file_type,
-                has_header,
-                delimiter,
                 schema: df_schema,
                 table_partition_cols,
                 if_not_exists,
                 definition,
-                file_compression_type,
                 order_exprs,
                 unbounded,
                 options,
@@ -1359,23 +1346,17 @@
                         .insert(col_name.clone(), serialize_expr(expr, extension_codec)?);
                 }

-                let file_compression_type =
-                    protobuf::CompressionTypeVariant::from(file_compression_type);
-
                 Ok(protobuf::LogicalPlanNode {
                     logical_plan_type: Some(LogicalPlanType::CreateExternalTable(
                         protobuf::CreateExternalTableNode {
                             name: Some(name.clone().into()),
                             location: location.clone(),
                             file_type: file_type.clone(),
-                            has_header: *has_header,
                             schema: Some(df_schema.try_into()?),
                             table_partition_cols: table_partition_cols.clone(),
                             if_not_exists: *if_not_exists,
-                            delimiter: String::from(*delimiter),
                             order_exprs: converted_order_exprs,
                             definition: definition.clone().unwrap_or_default(),
-                            file_compression_type: file_compression_type.into(),
                             unbounded: *unbounded,
                             options: options.clone(),
                             constraints: Some(constraints.clone().into()),
diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs
index 4bd07fae497f..c907e991fb86 100644
--- a/datafusion/proto/src/physical_plan/from_proto.rs
+++ b/datafusion/proto/src/physical_plan/from_proto.rs
@@ -805,7 +805,7 @@ impl TryFrom<&protobuf::CsvOptions> for CsvOptions {

     fn try_from(proto_opts: &protobuf::CsvOptions) -> Result<Self> {
         Ok(CsvOptions {
-            has_header: proto_opts.has_header,
+            has_header: proto_opts.has_header.first().map(|h| *h != 0),
             delimiter: proto_opts.delimiter[0],
             quote: proto_opts.quote[0],
             escape: proto_opts.escape.first().copied(),
diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs
index 162a2f28e16b..c6b94a934f23 100644
--- a/datafusion/proto/src/physical_plan/to_proto.rs
+++ b/datafusion/proto/src/physical_plan/to_proto.rs
@@ -973,7 +973,7 @@ impl TryFrom<&CsvOptions> for protobuf::CsvOptions {

     fn try_from(opts: &CsvOptions) -> Result<Self> {
         let compression: protobuf::CompressionTypeVariant = opts.compression.into();
         Ok(protobuf::CsvOptions {
-            has_header: opts.has_header,
+            has_header: opts.has_header.map_or_else(Vec::new, |h| vec![h as u8]),
             delimiter: vec![opts.delimiter],
             quote: vec![opts.quote],
             escape: opts.escape.map_or_else(Vec::new, |e| vec![e]),
diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
index 819e20615685..e5e57c0bc893 100644
--- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
+++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs
@@ -15,12 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.
-use std::any::Any;
-use std::collections::HashMap;
-use std::fmt::{self, Debug, Formatter};
-use std::sync::Arc;
-use std::vec;
-
 use arrow::array::{ArrayRef, FixedSizeListArray};
 use arrow::datatypes::{
     DataType, Field, Fields, Int32Type, IntervalDayTimeType, IntervalMonthDayNanoType,
@@ -60,6 +54,11 @@ use datafusion_proto::logical_plan::to_proto::serialize_expr;
 use datafusion_proto::logical_plan::LogicalExtensionCodec;
 use datafusion_proto::logical_plan::{from_proto, DefaultLogicalExtensionCodec};
 use datafusion_proto::protobuf;
+use std::any::Any;
+use std::collections::HashMap;
+use std::fmt::{self, Debug, Formatter};
+use std::sync::Arc;
+use std::vec;

 use datafusion::execution::FunctionRegistry;
 use prost::Message;
@@ -236,10 +235,10 @@ async fn roundtrip_custom_listing_tables() -> Result<()> {
             primary key(c)
         )
         STORED AS CSV
-        WITH HEADER ROW
         WITH ORDER (a ASC, b ASC)
         WITH ORDER (c ASC)
-        LOCATION '../core/tests/data/window_2.csv';";
+        LOCATION '../core/tests/data/window_2.csv'
+        OPTIONS ('format.has_header' 'true')";

     let plan = ctx.state().create_logical_plan(query).await?;

@@ -266,10 +265,10 @@ async fn roundtrip_logical_plan_aggregation_with_pk() -> Result<()> {
             primary key(c)
         )
         STORED AS CSV
-        WITH HEADER ROW
         WITH ORDER (a ASC, b ASC)
         WITH ORDER (c ASC)
-        LOCATION '../core/tests/data/window_2.csv';",
+        LOCATION '../core/tests/data/window_2.csv'
+        OPTIONS ('format.has_header' 'true')",
     )
     .await?;

diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs
index 5a999ab21d30..f61c9cda6345 100644
--- a/datafusion/sql/src/parser.rs
+++ b/datafusion/sql/src/parser.rs
@@ -17,11 +17,9 @@

 //! [`DFParser`]: DataFusion SQL Parser based on [`sqlparser`]

-use std::collections::{HashMap, VecDeque};
+use std::collections::VecDeque;
 use std::fmt;
-use std::str::FromStr;

-use datafusion_common::parsers::CompressionTypeVariant;
 use sqlparser::{
     ast::{
         ColumnDef, ColumnOptionDef, ObjectName, OrderByExpr, Query,
@@ -103,8 +101,6 @@ pub struct CopyToStatement {
     pub target: String,
     /// Partition keys
     pub partitioned_by: Vec<String>,
-    /// Indicates whether there is a header row (e.g. CSV)
-    pub has_header: bool,
     /// File type (Parquet, NDJSON, CSV etc.)
     pub stored_as: Option<String>,
     /// Target specific options
@@ -130,12 +126,9 @@ impl fmt::Display for CopyToStatement {
             write!(f, " PARTITIONED BY ({})", partitioned_by.join(", "))?;
         }

-        if self.has_header {
-            write!(f, " WITH HEADER ROW")?;
-        }
-
         if !options.is_empty() {
-            let opts: Vec<_> = options.iter().map(|(k, v)| format!("{k} {v}")).collect();
+            let opts: Vec<_> =
+                options.iter().map(|(k, v)| format!("'{k}' {v}")).collect();
             write!(f, " OPTIONS ({})", opts.join(", "))?;
         }

@@ -172,9 +165,6 @@ pub(crate) type LexOrdering = Vec<OrderByExpr>;
 /// [ IF NOT EXISTS ]
 /// <TABLE_NAME>[ (<column_definition>) ]
 /// STORED AS <file_type>
-/// [ WITH HEADER ROW ]
-/// [ DELIMITER <char> ]
-/// [ COMPRESSION TYPE <GZIP | BZIP2 | XZ | ZSTD> ]
 /// [ PARTITIONED BY (<column_definition list> | <column list>) ]
 /// [ WITH ORDER (<ordered column list>)
 /// [ OPTIONS (<key_value_list>) ]
@@ -196,10 +186,6 @@ pub struct CreateExternalTable {
     pub columns: Vec<ColumnDef>,
     /// File type (Parquet, NDJSON, CSV, etc)
     pub file_type: String,
-    /// CSV Header row?
-    pub has_header: bool,
-    /// User defined delimiter for CSVs
-    pub delimiter: char,
     /// Path to file
     pub location: String,
     /// Partition Columns
@@ -208,12 +194,10 @@
     pub order_exprs: Vec<LexOrdering>,
     /// Option to not error if table already exists
     pub if_not_exists: bool,
-    /// File compression type (GZIP, BZIP2, XZ)
-    pub file_compression_type: CompressionTypeVariant,
     /// Infinite streams?
    pub unbounded: bool,
     /// Table(provider) specific options
-    pub options: HashMap<String, String>,
+    pub options: Vec<(String, Value)>,
     /// A table-level constraint
     pub constraints: Vec<TableConstraint>,
 }
@@ -401,7 +385,6 @@ impl<'a> DFParser<'a> {
             stored_as: Option<String>,
             target: Option<String>,
             partitioned_by: Option<Vec<String>>,
-            has_header: Option<bool>,
             options: Option<Vec<(String, Value)>>,
         }

@@ -428,8 +411,7 @@
                     Keyword::WITH => {
                         self.parser.expect_keyword(Keyword::HEADER)?;
                         self.parser.expect_keyword(Keyword::ROW)?;
-                        ensure_not_set(&builder.has_header, "WITH HEADER ROW")?;
-                        builder.has_header = Some(true);
+                        return parser_err!("WITH HEADER ROW clause is no longer in use. Please use the OPTIONS clause with 'format.has_header' set appropriately, e.g., OPTIONS ('format.has_header' 'true')");
                     }
                     Keyword::PARTITIONED => {
                         self.parser.expect_keyword(Keyword::BY)?;
@@ -466,7 +448,6 @@
             source,
             target,
             partitioned_by: builder.partitioned_by.unwrap_or(vec![]),
-            has_header: builder.has_header.unwrap_or(false),
             stored_as: builder.stored_as,
             options: builder.options.unwrap_or(vec![]),
         }))
@@ -699,12 +680,9 @@
         struct Builder {
             file_type: Option<String>,
             location: Option<String>,
-            has_header: Option<bool>,
-            delimiter: Option<char>,
-            file_compression_type: Option<CompressionTypeVariant>,
             table_partition_cols: Option<Vec<String>>,
             order_exprs: Vec<LexOrdering>,
-            options: Option<HashMap<String, String>>,
+            options: Option<Vec<(String, Value)>>,
         }
         let mut builder = Builder::default();

@@ -734,22 +712,15 @@
                     } else {
                         self.parser.expect_keyword(Keyword::HEADER)?;
                         self.parser.expect_keyword(Keyword::ROW)?;
-                        ensure_not_set(&builder.has_header, "WITH HEADER ROW")?;
-                        builder.has_header = Some(true);
+                        return parser_err!("WITH HEADER ROW clause is no longer in use. Please use the OPTIONS clause with 'format.has_header' set appropriately, e.g., OPTIONS ('format.has_header' 'true')");
                     }
                 }
                 Keyword::DELIMITER => {
-                    ensure_not_set(&builder.delimiter, "DELIMITER")?;
-                    builder.delimiter = Some(self.parse_delimiter()?);
+                    return parser_err!("DELIMITER clause is no longer in use. Please use the OPTIONS clause with 'format.delimiter' set appropriately, e.g., OPTIONS ('format.delimiter' ',')");
                 }
                 Keyword::COMPRESSION => {
                     self.parser.expect_keyword(Keyword::TYPE)?;
-                    ensure_not_set(
-                        &builder.file_compression_type,
-                        "COMPRESSION TYPE",
-                    )?;
-                    builder.file_compression_type =
-                        Some(self.parse_file_compression_type()?);
+                    return parser_err!("COMPRESSION TYPE clause is no longer in use. Please use the OPTIONS clause with 'format.compression' set appropriately, e.g., OPTIONS ('format.compression' 'gzip')");
                }
                 Keyword::PARTITIONED => {
                     self.parser.expect_keyword(Keyword::BY)?;
@@ -781,7 +752,7 @@
                 }
                 Keyword::OPTIONS => {
                     ensure_not_set(&builder.options, "OPTIONS")?;
-                    builder.options = Some(self.parse_string_options()?);
+                    builder.options = Some(self.parse_value_options()?);
                 }
                 _ => {
                     unreachable!()
@@ -815,17 +786,12 @@
             name: table_name.to_string(),
             columns,
             file_type: builder.file_type.unwrap(),
-            has_header: builder.has_header.unwrap_or(false),
-            delimiter: builder.delimiter.unwrap_or(','),
             location: builder.location.unwrap(),
             table_partition_cols: builder.table_partition_cols.unwrap_or(vec![]),
             order_exprs: builder.order_exprs,
             if_not_exists,
-            file_compression_type: builder
-                .file_compression_type
-                .unwrap_or(CompressionTypeVariant::UNCOMPRESSED),
             unbounded,
-            options: builder.options.unwrap_or(HashMap::new()),
+            options: builder.options.unwrap_or(Vec::new()),
             constraints,
         };
         Ok(Statement::CreateExternalTable(create))
@@ -840,45 +806,10 @@
         }
     }

-    /// Parses the set of valid file compression types
-    fn parse_file_compression_type(
-        &mut self,
-    ) -> Result<CompressionTypeVariant, ParserError> {
-        let token = self.parser.next_token();
-        match &token.token {
-            Token::Word(w) => CompressionTypeVariant::from_str(&w.value),
-            _ => self.expected("one of GZIP, BZIP2, XZ, ZSTD", token),
-        }
-    }
-
-    /// Parses (key value) style options where the values are literal strings.
-    fn parse_string_options(&mut self) -> Result<HashMap<String, String>, ParserError> {
-        let mut options = HashMap::new();
-        self.parser.expect_token(&Token::LParen)?;
-
-        loop {
-            let key = self.parser.parse_literal_string()?;
-            let value = self.parser.parse_literal_string()?;
-            options.insert(key, value);
-            let comma = self.parser.consume_token(&Token::Comma);
-            if self.parser.consume_token(&Token::RParen) {
-                // allow a trailing comma, even though it's not in standard
-                break;
-            } else if !comma {
-                return self.expected(
-                    "',' or ')' after option definition",
-                    self.parser.peek_token(),
-                );
-            }
-        }
-        Ok(options)
-    }
-
     /// Parses (key value) style options into a map of String --> [`Value`].
     ///
-    /// Unlike [`Self::parse_string_options`], this method supports
-    /// keywords as key names as well as multiple value types such as
-    /// Numbers as well as Strings.
+    /// This method supports keywords as key names as well as multiple
+    /// value types such as Numbers as well as Strings.
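+    ///
+    /// A sketch of the accepted shapes (keys and values here are
+    /// illustrative only):
+    ///
+    /// ```text
+    /// OPTIONS ('format.delimiter' '|')  -- quoted key, quoted string value
+    /// OPTIONS (ROW_GROUP_SIZE 55)       -- keyword key, bare number value
+    /// ```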
    fn parse_value_options(&mut self) -> Result<Vec<(String, Value)>, ParserError> {
         let mut options = vec![];
         self.parser.expect_token(&Token::LParen)?;
@@ -900,16 +831,6 @@
         }
         Ok(options)
     }
-
-    fn parse_delimiter(&mut self) -> Result<char, ParserError> {
-        let token = self.parser.parse_literal_string()?;
-        match token.len() {
-            1 => Ok(token.chars().next().unwrap()),
-            _ => Err(ParserError::TokenizerError(
-                "Delimiter must be a single char".to_string(),
-            )),
-        }
-    }
 }

 #[cfg(test)]
@@ -917,7 +838,6 @@ mod tests {
     use super::*;
     use sqlparser::ast::Expr::Identifier;
     use sqlparser::ast::{BinaryOperator, DataType, Expr, Ident};
-    use CompressionTypeVariant::UNCOMPRESSED;

     fn expect_parse_ok(sql: &str, expected: Statement) -> Result<(), ParserError> {
         let statements = DFParser::parse_sql(sql)?;
@@ -969,15 +889,12 @@
             name: "t".into(),
             columns: vec![make_column_def("c1", DataType::Int(display))],
             file_type: "CSV".to_string(),
-            has_header: false,
-            delimiter: ',',
             location: "foo.csv".into(),
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: false,
-            file_compression_type: UNCOMPRESSED,
             unbounded: false,
-            options: HashMap::new(),
+            options: vec![],
             constraints: vec![],
         });
         expect_parse_ok(sql, expected)?;
@@ -988,15 +905,12 @@
             name: "t".into(),
             columns: vec![make_column_def("c1", DataType::Int(None))],
             file_type: "CSV".to_string(),
-            has_header: false,
-            delimiter: ',',
             location: "foo.csv".into(),
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: false,
-            file_compression_type: UNCOMPRESSED,
             unbounded: false,
-            options: HashMap::new(),
+            options: vec![],
             constraints: vec![],
         });
         expect_parse_ok(sql, expected)?;
@@ -1008,35 +922,32 @@
             name: "t".into(),
             columns: vec![make_column_def("c1", DataType::Int(None))],
             file_type: "CSV".to_string(),
-            has_header: false,
-            delimiter: ',',
             location: "foo.csv".into(),
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: false,
-            file_compression_type: UNCOMPRESSED,
             unbounded: false,
-            options: HashMap::new(),
+            options: vec![],
             constraints: vec![],
         });
         expect_parse_ok(sql, expected)?;

         // positive case with delimiter
-        let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV DELIMITER '|' LOCATION 'foo.csv'";
+        let sql = "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS ('format.delimiter' '|')";
         let display = None;
         let expected = Statement::CreateExternalTable(CreateExternalTable {
             name: "t".into(),
             columns: vec![make_column_def("c1", DataType::Int(display))],
             file_type: "CSV".to_string(),
-            has_header: false,
-            delimiter: '|',
             location: "foo.csv".into(),
             table_partition_cols: vec![],
             order_exprs: vec![],
             if_not_exists: false,
             unbounded: false,
-            options: HashMap::new(),
+            options: vec![(
+                "format.delimiter".into(),
+                Value::SingleQuotedString("|".into()),
+            )],
             constraints: vec![],
         });
         expect_parse_ok(sql, expected)?;
@@ -1048,66 +959,41 @@
             name: "t".into(),
             columns: vec![make_column_def("c1", DataType::Int(display))],
             file_type: "CSV".to_string(),
-            has_header: false,
-            delimiter: ',',
             location: "foo.csv".into(),
             table_partition_cols: vec!["p1".to_string(), "p2".to_string()],
             order_exprs: vec![],
             if_not_exists: false,
-            file_compression_type: UNCOMPRESSED,
             unbounded: false,
-            options: HashMap::new(),
+            options: vec![],
             constraints: vec![],
         });
         expect_parse_ok(sql, expected)?;

-        // positive case: it is ok for case insensitive sql stmt with `WITH HEADER ROW` tokens
-        let sqls = vec![
-            "CREATE EXTERNAL TABLE t(c1 int) STORED 
AS CSV WITH HEADER ROW LOCATION 'foo.csv'", - "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV with header row LOCATION 'foo.csv'" - ]; - for sql in sqls { - let expected = Statement::CreateExternalTable(CreateExternalTable { - name: "t".into(), - columns: vec![make_column_def("c1", DataType::Int(display))], - file_type: "CSV".to_string(), - has_header: true, - delimiter: ',', - location: "foo.csv".into(), - table_partition_cols: vec![], - order_exprs: vec![], - if_not_exists: false, - file_compression_type: UNCOMPRESSED, - unbounded: false, - options: HashMap::new(), - constraints: vec![], - }); - expect_parse_ok(sql, expected)?; - } - // positive case: it is ok for sql stmt with `COMPRESSION TYPE GZIP` tokens let sqls = vec![ - ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE GZIP LOCATION 'foo.csv'", "GZIP"), - ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE BZIP2 LOCATION 'foo.csv'", "BZIP2"), - ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE XZ LOCATION 'foo.csv'", "XZ"), - ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE ZSTD LOCATION 'foo.csv'", "ZSTD"), - ]; - for (sql, file_compression_type) in sqls { + ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS + ('format.compression' 'GZIP')", "GZIP"), + ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS + ('format.compression' 'BZIP2')", "BZIP2"), + ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS + ('format.compression' 'XZ')", "XZ"), + ("CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv' OPTIONS + ('format.compression' 'ZSTD')", "ZSTD"), + ]; + for (sql, compression) in sqls { let expected = Statement::CreateExternalTable(CreateExternalTable { name: "t".into(), columns: vec![make_column_def("c1", DataType::Int(display))], file_type: "CSV".to_string(), - has_header: false, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: CompressionTypeVariant::from_str( - file_compression_type, - )?, unbounded: false, - options: HashMap::new(), + options: vec![( + "format.compression".into(), + Value::SingleQuotedString(compression.into()), + )], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1119,15 +1005,12 @@ mod tests { name: "t".into(), columns: vec![], file_type: "PARQUET".to_string(), - has_header: false, - delimiter: ',', location: "foo.parquet".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1138,15 +1021,12 @@ mod tests { name: "t".into(), columns: vec![], file_type: "PARQUET".to_string(), - has_header: false, - delimiter: ',', location: "foo.parquet".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1157,15 +1037,12 @@ mod tests { name: "t".into(), columns: vec![], file_type: "AVRO".to_string(), - has_header: false, - delimiter: ',', location: "foo.avro".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ 
-1177,15 +1054,12 @@ mod tests { name: "t".into(), columns: vec![], file_type: "PARQUET".to_string(), - has_header: false, - delimiter: ',', location: "foo.parquet".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: true, - file_compression_type: UNCOMPRESSED, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1200,15 +1074,12 @@ mod tests { make_column_def("p1", DataType::Int(None)), ], file_type: "CSV".to_string(), - has_header: false, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec!["p1".to_string()], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1230,15 +1101,12 @@ mod tests { name: "t".into(), columns: vec![], file_type: "X".to_string(), - has_header: false, - delimiter: ',', location: "blahblah".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, - options: HashMap::from([("k1".into(), "v1".into())]), + options: vec![("k1".into(), Value::SingleQuotedString("v1".into()))], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1250,18 +1118,15 @@ mod tests { name: "t".into(), columns: vec![], file_type: "X".to_string(), - has_header: false, - delimiter: ',', location: "blahblah".into(), table_partition_cols: vec![], order_exprs: vec![], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, - options: HashMap::from([ - ("k1".into(), "v1".into()), - ("k2".into(), "v2".into()), - ]), + options: vec![ + ("k1".into(), Value::SingleQuotedString("v1".into())), + ("k2".into(), Value::UnQuotedString("v2".into())), + ], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1292,8 +1157,6 @@ mod tests { name: "t".into(), columns: vec![make_column_def("c1", DataType::Int(None))], file_type: "CSV".to_string(), - has_header: false, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![vec![OrderByExpr { @@ -1305,9 +1168,8 @@ mod tests { nulls_first, }]], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1323,8 +1185,6 @@ mod tests { make_column_def("c2", DataType::Int(display)), ], file_type: "CSV".to_string(), - has_header: false, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![vec![ @@ -1346,9 +1206,8 @@ mod tests { }, ]], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1363,8 +1222,6 @@ mod tests { make_column_def("c2", DataType::Int(display)), ], file_type: "CSV".to_string(), - has_header: false, - delimiter: ',', location: "foo.csv".into(), table_partition_cols: vec![], order_exprs: vec![vec![OrderByExpr { @@ -1383,9 +1240,8 @@ mod tests { nulls_first: None, }]], if_not_exists: false, - file_compression_type: UNCOMPRESSED, unbounded: false, - options: HashMap::new(), + options: vec![], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1394,14 +1250,14 @@ mod tests { let sql = " CREATE UNBOUNDED EXTERNAL TABLE IF NOT EXISTS t (c1 int, c2 float) STORED AS PARQUET - DELIMITER '*' - WITH HEADER ROW WITH ORDER (c1 - c2 ASC) - 
COMPRESSION TYPE zstd PARTITIONED BY (c1) LOCATION 'foo.parquet' - OPTIONS (ROW_GROUP_SIZE '1024', 'TRUNCATE' 'NO') - "; + OPTIONS ('format.compression' 'zstd', + 'format.delimiter' '*', + 'ROW_GROUP_SIZE' '1024', + 'TRUNCATE' 'NO', + 'format.has_header' 'true')"; let expected = Statement::CreateExternalTable(CreateExternalTable { name: "t".into(), columns: vec![ @@ -1409,8 +1265,6 @@ mod tests { make_column_def("c2", DataType::Float(None)), ], file_type: "PARQUET".to_string(), - has_header: true, - delimiter: '*', location: "foo.parquet".into(), table_partition_cols: vec!["c1".into()], order_exprs: vec![vec![OrderByExpr { @@ -1429,12 +1283,26 @@ mod tests { nulls_first: None, }]], if_not_exists: true, - file_compression_type: CompressionTypeVariant::ZSTD, unbounded: true, - options: HashMap::from([ - ("ROW_GROUP_SIZE".into(), "1024".into()), - ("TRUNCATE".into(), "NO".into()), - ]), + options: vec![ + ( + "format.compression".into(), + Value::SingleQuotedString("zstd".into()), + ), + ( + "format.delimiter".into(), + Value::SingleQuotedString("*".into()), + ), + ( + "ROW_GROUP_SIZE".into(), + Value::SingleQuotedString("1024".into()), + ), + ("TRUNCATE".into(), Value::SingleQuotedString("NO".into())), + ( + "format.has_header".into(), + Value::SingleQuotedString("true".into()), + ), + ], constraints: vec![], }); expect_parse_ok(sql, expected)?; @@ -1452,7 +1320,6 @@ mod tests { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec![], - has_header: false, stored_as: Some("CSV".to_owned()), options: vec![], }); @@ -1488,7 +1355,6 @@ mod tests { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec![], - has_header: false, stored_as: Some("PARQUET".to_owned()), options: vec![], }); @@ -1519,14 +1385,17 @@ mod tests { panic!("Expected query, got {statement:?}"); }; - let sql = "COPY (SELECT 1) TO bar STORED AS CSV WITH HEADER ROW"; + let sql = + "COPY (SELECT 1) TO bar STORED AS CSV OPTIONS ('format.has_header' 'true')"; let expected = Statement::CopyTo(CopyToStatement { source: CopyToSource::Query(query), target: "bar".to_string(), partitioned_by: vec![], - has_header: true, stored_as: Some("CSV".to_owned()), - options: vec![], + options: vec![( + "format.has_header".into(), + Value::SingleQuotedString("true".into()), + )], }); assert_eq!(verified_stmt(sql), expected); Ok(()) @@ -1534,16 +1403,15 @@ mod tests { #[test] fn copy_to_options() -> Result<(), ParserError> { - let sql = "COPY foo TO bar STORED AS CSV OPTIONS (row_group_size 55)"; + let sql = "COPY foo TO bar STORED AS CSV OPTIONS ('row_group_size' '55')"; let expected = Statement::CopyTo(CopyToStatement { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec![], - has_header: false, stored_as: Some("CSV".to_owned()), options: vec![( "row_group_size".to_string(), - Value::Number("55".to_string(), false), + Value::SingleQuotedString("55".to_string()), )], }); assert_eq!(verified_stmt(sql), expected); @@ -1552,16 +1420,15 @@ mod tests { #[test] fn copy_to_partitioned_by() -> Result<(), ParserError> { - let sql = "COPY foo TO bar STORED AS CSV PARTITIONED BY (a) OPTIONS (row_group_size 55)"; + let sql = "COPY foo TO bar STORED AS CSV PARTITIONED BY (a) OPTIONS ('row_group_size' '55')"; let expected = Statement::CopyTo(CopyToStatement { source: object_name("foo"), target: "bar".to_string(), partitioned_by: vec!["a".to_string()], - has_header: false, stored_as: Some("CSV".to_owned()), options: vec![( "row_group_size".to_string(), - Value::Number("55".to_string(), false), + 
Value::SingleQuotedString("55".to_string()), )], }); assert_eq!(verified_stmt(sql), expected); diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index aee21497b114..13d2e05661a8 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -872,7 +872,6 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // If config does not belong to any namespace, assume it is // a format option and apply the format prefix for backwards // compatibility. - let renamed_key = format!("format.{}", key); options.insert(renamed_key.to_lowercase(), value_string.to_lowercase()); } else { @@ -968,12 +967,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { name, columns, file_type, - has_header, - delimiter, location, table_partition_cols, if_not_exists, - file_compression_type, order_exprs, unbounded, options, @@ -985,8 +981,52 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let inline_constraints = calc_inline_constraints_from_columns(&columns); all_constraints.extend(inline_constraints); + let mut options_map = HashMap::::new(); + for (key, value) in options { + if options_map.contains_key(&key) { + return plan_err!("Option {key} is specified multiple times"); + } + + let value_string = match value { + Value::SingleQuotedString(s) => s.to_string(), + Value::DollarQuotedString(s) => s.to_string(), + Value::UnQuotedString(s) => s.to_string(), + Value::Number(_, _) | Value::Boolean(_) => value.to_string(), + Value::DoubleQuotedString(_) + | Value::EscapedStringLiteral(_) + | Value::NationalStringLiteral(_) + | Value::SingleQuotedByteStringLiteral(_) + | Value::DoubleQuotedByteStringLiteral(_) + | Value::RawStringLiteral(_) + | Value::HexStringLiteral(_) + | Value::Null + | Value::Placeholder(_) => { + return plan_err!( + "Unsupported Value in CREATE EXTERNAL TABLE statement {}", + value + ); + } + }; + + if !(&key.contains('.')) { + // If a config does not belong to any namespace, we assume it is + // a format option and apply the format prefix for backwards + // compatibility. + let renamed_key = format!("format.{}", key.to_lowercase()); + options_map.insert(renamed_key, value_string.to_lowercase()); + } else { + options_map.insert(key.to_lowercase(), value_string.to_lowercase()); + } + } + + let compression = options_map + .get("format.compression") + .map(|c| CompressionTypeVariant::from_str(c)) + .transpose()?; if (file_type == "PARQUET" || file_type == "AVRO" || file_type == "ARROW") - && file_compression_type != CompressionTypeVariant::UNCOMPRESSED + && compression + .map(|c| c != CompressionTypeVariant::UNCOMPRESSED) + .unwrap_or(false) { plan_err!( "File compression type cannot be set for PARQUET, AVRO, or ARROW files." 
@@ -1017,15 +1057,12 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { name, location, file_type, - has_header, - delimiter, table_partition_cols, if_not_exists, definition, - file_compression_type, order_exprs: ordered_exprs, unbounded, - options, + options: options_map, constraints, column_defaults, }, diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 1a300d11b0b2..af4dac5f3f89 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -32,13 +32,13 @@ use datafusion_expr::{ AggregateUDF, ColumnarValue, ScalarUDF, ScalarUDFImpl, Signature, TableSource, Volatility, WindowUDF, }; +use datafusion_functions::{string, unicode}; use datafusion_sql::unparser::{expr_to_sql, plan_to_sql}; use datafusion_sql::{ parser::DFParser, planner::{ContextProvider, ParserOptions, PlannerContext, SqlToRel}, }; -use datafusion_functions::{string, unicode}; use rstest::rstest; use sqlparser::dialect::{Dialect, GenericDialect, HiveDialect, MySqlDialect}; use sqlparser::parser::Parser; @@ -1903,12 +1903,12 @@ fn create_external_table_csv_no_schema() { fn create_external_table_with_compression_type() { // positive case let sqls = vec![ - "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE GZIP LOCATION 'foo.csv.gz'", - "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV COMPRESSION TYPE BZIP2 LOCATION 'foo.csv.bz2'", - "CREATE EXTERNAL TABLE t(c1 int) STORED AS JSON COMPRESSION TYPE GZIP LOCATION 'foo.json.gz'", - "CREATE EXTERNAL TABLE t(c1 int) STORED AS JSON COMPRESSION TYPE BZIP2 LOCATION 'foo.json.bz2'", - "CREATE EXTERNAL TABLE t(c1 int) STORED AS NONSTANDARD COMPRESSION TYPE GZIP LOCATION 'foo.unk'", - ]; + "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv.gz' OPTIONS ('format.compression' 'gzip')", + "CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV LOCATION 'foo.csv.bz2' OPTIONS ('format.compression' 'bzip2')", + "CREATE EXTERNAL TABLE t(c1 int) STORED AS JSON LOCATION 'foo.json.gz' OPTIONS ('format.compression' 'gzip')", + "CREATE EXTERNAL TABLE t(c1 int) STORED AS JSON LOCATION 'foo.json.bz2' OPTIONS ('format.compression' 'bzip2')", + "CREATE EXTERNAL TABLE t(c1 int) STORED AS NONSTANDARD LOCATION 'foo.unk' OPTIONS ('format.compression' 'gzip')", + ]; for sql in sqls { let expected = "CreateExternalTable: Bare { table: \"t\" }"; quick_test(sql, expected); @@ -1916,12 +1916,12 @@ fn create_external_table_with_compression_type() { // negative case let sqls = vec![ - "CREATE EXTERNAL TABLE t STORED AS AVRO COMPRESSION TYPE GZIP LOCATION 'foo.avro'", - "CREATE EXTERNAL TABLE t STORED AS AVRO COMPRESSION TYPE BZIP2 LOCATION 'foo.avro'", - "CREATE EXTERNAL TABLE t STORED AS PARQUET COMPRESSION TYPE GZIP LOCATION 'foo.parquet'", - "CREATE EXTERNAL TABLE t STORED AS PARQUET COMPRESSION TYPE BZIP2 LOCATION 'foo.parquet'", - "CREATE EXTERNAL TABLE t STORED AS ARROW COMPRESSION TYPE GZIP LOCATION 'foo.arrow'", - "CREATE EXTERNAL TABLE t STORED AS ARROW COMPRESSION TYPE BZIP2 LOCATION 'foo.arrow'", + "CREATE EXTERNAL TABLE t STORED AS AVRO LOCATION 'foo.avro' OPTIONS ('format.compression' 'gzip')", + "CREATE EXTERNAL TABLE t STORED AS AVRO LOCATION 'foo.avro' OPTIONS ('format.compression' 'bzip2')", + "CREATE EXTERNAL TABLE t STORED AS PARQUET LOCATION 'foo.parquet' OPTIONS ('format.compression' 'gzip')", + "CREATE EXTERNAL TABLE t STORED AS PARQUET LOCATION 'foo.parquet' OPTIONS ('format.compression' 'bzip2')", + "CREATE EXTERNAL TABLE t STORED AS ARROW LOCATION 'foo.arrow' OPTIONS 
('format.compression' 'gzip')", + "CREATE EXTERNAL TABLE t STORED AS ARROW LOCATION 'foo.arrow' OPTIONS ('format.compression' 'bzip2')", ]; for sql in sqls { let err = logical_plan(sql).expect_err("query should have failed"); diff --git a/datafusion/sqllogictest/test_files/agg_func_substitute.slt b/datafusion/sqllogictest/test_files/agg_func_substitute.slt index c5cd78f1259b..342d45e7fb24 100644 --- a/datafusion/sqllogictest/test_files/agg_func_substitute.slt +++ b/datafusion/sqllogictest/test_files/agg_func_substitute.slt @@ -27,10 +27,10 @@ CREATE EXTERNAL TABLE multiple_ordered_table ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../../datafusion/core/tests/data/window_2.csv'; +LOCATION '../../datafusion/core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); query TT diff --git a/datafusion/sqllogictest/test_files/aggregate.slt b/datafusion/sqllogictest/test_files/aggregate.slt index 1e0d522492e7..40d66f9b52ce 100644 --- a/datafusion/sqllogictest/test_files/aggregate.slt +++ b/datafusion/sqllogictest/test_files/aggregate.slt @@ -35,8 +35,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok CREATE TABLE d_table (c1 decimal(10,3), c2 varchar) @@ -116,8 +116,8 @@ c2 INT NOT NULL, c3 INT NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../core/tests/data/aggregate_agg_multi_order.csv'; +LOCATION '../core/tests/data/aggregate_agg_multi_order.csv' +OPTIONS ('format.has_header' 'true'); # test array_agg with order by multiple columns query ? @@ -4221,11 +4221,11 @@ c2 INT NOT NULL, c3 INT NOT NULL ) STORED AS CSV -WITH HEADER ROW WITH ORDER (c1 ASC) WITH ORDER (c2 DESC) WITH ORDER (c3 ASC) -LOCATION '../core/tests/data/convert_first_last.csv'; +LOCATION '../core/tests/data/convert_first_last.csv' +OPTIONS ('format.has_header' 'true'); # test first to last, the result does not show difference, we need to check the conversion by `explain` query TT diff --git a/datafusion/sqllogictest/test_files/avro.slt b/datafusion/sqllogictest/test_files/avro.slt index 7b9fbe556fee..fced1924ced9 100644 --- a/datafusion/sqllogictest/test_files/avro.slt +++ b/datafusion/sqllogictest/test_files/avro.slt @@ -31,8 +31,7 @@ CREATE EXTERNAL TABLE alltypes_plain ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -WITH HEADER ROW -LOCATION '../../testing/data/avro/alltypes_plain.avro' +LOCATION '../../testing/data/avro/alltypes_plain.avro'; statement ok CREATE EXTERNAL TABLE alltypes_plain_snappy ( @@ -49,8 +48,7 @@ CREATE EXTERNAL TABLE alltypes_plain_snappy ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -WITH HEADER ROW -LOCATION '../../testing/data/avro/alltypes_plain.snappy.avro' +LOCATION '../../testing/data/avro/alltypes_plain.snappy.avro'; statement ok CREATE EXTERNAL TABLE alltypes_plain_bzip2 ( @@ -67,8 +65,7 @@ CREATE EXTERNAL TABLE alltypes_plain_bzip2 ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -WITH HEADER ROW -LOCATION '../../testing/data/avro/alltypes_plain.bzip2.avro' +LOCATION '../../testing/data/avro/alltypes_plain.bzip2.avro'; statement ok CREATE EXTERNAL TABLE alltypes_plain_xz ( @@ -85,8 +82,7 @@ CREATE EXTERNAL TABLE alltypes_plain_xz ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -WITH HEADER ROW -LOCATION '../../testing/data/avro/alltypes_plain.xz.avro' +LOCATION '../../testing/data/avro/alltypes_plain.xz.avro'; statement ok CREATE EXTERNAL TABLE 
alltypes_plain_zstandard ( @@ -103,34 +99,29 @@ CREATE EXTERNAL TABLE alltypes_plain_zstandard ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS AVRO -WITH HEADER ROW -LOCATION '../../testing/data/avro/alltypes_plain.zstandard.avro' +LOCATION '../../testing/data/avro/alltypes_plain.zstandard.avro'; statement ok CREATE EXTERNAL TABLE single_nan ( mycol FLOAT ) STORED AS AVRO -WITH HEADER ROW -LOCATION '../../testing/data/avro/single_nan.avro' +LOCATION '../../testing/data/avro/single_nan.avro'; statement ok CREATE EXTERNAL TABLE nested_records STORED AS AVRO -WITH HEADER ROW -LOCATION '../../testing/data/avro/nested_records.avro' +LOCATION '../../testing/data/avro/nested_records.avro'; statement ok CREATE EXTERNAL TABLE simple_enum STORED AS AVRO -WITH HEADER ROW -LOCATION '../../testing/data/avro/simple_enum.avro' +LOCATION '../../testing/data/avro/simple_enum.avro'; statement ok CREATE EXTERNAL TABLE simple_fixed STORED AS AVRO -WITH HEADER ROW -LOCATION '../../testing/data/avro/simple_fixed.avro' +LOCATION '../../testing/data/avro/simple_fixed.avro'; # test avro query query IT diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index 882a4e220758..00bcea7ec154 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -384,7 +384,7 @@ COPY source_table to 'test_files/scratch/copy/table_json_gz' STORED AS JSON OPT # validate folder of csv files statement ok -CREATE EXTERNAL TABLE validate_json_gz STORED AS json COMPRESSION TYPE gzip LOCATION 'test_files/scratch/copy/table_json_gz'; +CREATE EXTERNAL TABLE validate_json_gz STORED AS json LOCATION 'test_files/scratch/copy/table_json_gz' OPTIONS ('format.compression' 'gzip'); query IT select * from validate_json_gz; @@ -400,7 +400,7 @@ COPY source_table to 'test_files/scratch/copy/table_csv' STORED AS CSV OPTIONS # validate folder of csv files statement ok -CREATE EXTERNAL TABLE validate_csv STORED AS csv COMPRESSION TYPE gzip LOCATION 'test_files/scratch/copy/table_csv'; +CREATE EXTERNAL TABLE validate_csv STORED AS csv LOCATION 'test_files/scratch/copy/table_csv' OPTIONS ('format.compression' 'gzip'); query IT select * from validate_csv; @@ -416,7 +416,7 @@ COPY source_table to 'test_files/scratch/copy/table.csv'; # Validate single csv output statement ok -CREATE EXTERNAL TABLE validate_single_csv STORED AS csv WITH HEADER ROW LOCATION 'test_files/scratch/copy/table.csv'; +CREATE EXTERNAL TABLE validate_single_csv STORED AS csv LOCATION 'test_files/scratch/copy/table.csv' OPTIONS ('format.has_header' 'false'); query IT select * from validate_single_csv; diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt index 9f1fc523f559..fca177bb61f0 100644 --- a/datafusion/sqllogictest/test_files/create_external_table.slt +++ b/datafusion/sqllogictest/test_files/create_external_table.slt @@ -33,7 +33,7 @@ statement error DataFusion error: SQL error: ParserError\("Missing LOCATION clau CREATE EXTERNAL TABLE t STORED AS CSV # Option value is missing -statement error DataFusion error: SQL error: ParserError\("Expected literal string, found: \)"\) +statement error DataFusion error: SQL error: ParserError\("Expected string or numeric value, found: \)"\) CREATE EXTERNAL TABLE t STORED AS x OPTIONS ('k1' 'v1', k2 v2, k3) LOCATION 'blahblah' # Missing `(` in WITH ORDER clause @@ -52,14 +52,6 @@ CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER LOCATION 'abc' statement error 
DataFusion error: SQL error: ParserError\("Expected BY, found: LOCATION"\) CREATE EXTERNAL TABLE t STORED AS CSV PARTITIONED LOCATION 'abc' -# Missing `TYPE` in COMPRESSION clause -statement error DataFusion error: SQL error: ParserError\("Expected TYPE, found: LOCATION"\) -CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION LOCATION 'abc' - -# Invalid compression type -statement error DataFusion error: SQL error: ParserError\("Unsupported file compression type ZZZ"\) -CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE ZZZ LOCATION 'blahblah' - # Duplicate `STORED AS` clause statement error DataFusion error: SQL error: ParserError\("STORED AS specified more than once"\) CREATE EXTERNAL TABLE t STORED AS CSV STORED AS PARQUET LOCATION 'foo.parquet' @@ -68,18 +60,6 @@ CREATE EXTERNAL TABLE t STORED AS CSV STORED AS PARQUET LOCATION 'foo.parquet' statement error DataFusion error: SQL error: ParserError\("LOCATION specified more than once"\) CREATE EXTERNAL TABLE t STORED AS CSV LOCATION 'foo.csv' LOCATION 'bar.csv' -# Duplicate `WITH HEADER ROW` clause -statement error DataFusion error: SQL error: ParserError\("WITH HEADER ROW specified more than once"\) -CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER ROW WITH HEADER ROW LOCATION 'foo.csv' - -# Duplicate `DELIMITER` clause -statement error DataFusion error: SQL error: ParserError\("DELIMITER specified more than once"\) -CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV DELIMITER '-' DELIMITER '+' LOCATION 'foo.csv' - -# Duplicate `COMPRESSION TYPE` clause -statement error DataFusion error: SQL error: ParserError\("COMPRESSION TYPE specified more than once"\) -CREATE EXTERNAL TABLE t STORED AS CSV COMPRESSION TYPE BZIP2 COMPRESSION TYPE XZ COMPRESSION TYPE ZSTD COMPRESSION TYPE GZIP LOCATION 'foo.csv' - # Duplicate `PARTITIONED BY` clause statement error DataFusion error: SQL error: ParserError\("PARTITIONED BY specified more than once"\) create EXTERNAL TABLE t(c1 int, c2 int) STORED AS CSV PARTITIONED BY (c1) partitioned by (c2) LOCATION 'foo.csv' @@ -205,3 +185,23 @@ CREATE EXTERNAL TABLE test3(name string) PARTITIONED BY (month string, year string) STORED AS parquet LOCATION 'test_files/scratch/create_external_table/manual_partitioning/'; + +# Duplicate key assignment in OPTIONS clause +statement error DataFusion error: Error during planning: Option format.delimiter is specified multiple times +CREATE EXTERNAL TABLE t STORED AS CSV OPTIONS ( + 'format.delimiter' '*', + 'format.has_header' 'true', + 'format.delimiter' '|') +LOCATION 'foo.csv'; + +# If a config does not belong to any namespace, we assume it is a 'format' option and apply the 'format' prefix for backwards compatibility. 
+statement ok +CREATE EXTERNAL TABLE IF NOT EXISTS region ( + r_regionkey BIGINT, + r_name VARCHAR, + r_comment VARCHAR, + r_rev VARCHAR, +) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' +OPTIONS ( + 'format.delimiter' '|', + 'has_header' 'false'); \ No newline at end of file diff --git a/datafusion/sqllogictest/test_files/csv_files.slt b/datafusion/sqllogictest/test_files/csv_files.slt index be846028d545..50477e1dab45 100644 --- a/datafusion/sqllogictest/test_files/csv_files.slt +++ b/datafusion/sqllogictest/test_files/csv_files.slt @@ -21,19 +21,19 @@ CREATE EXTERNAL TABLE csv_with_quote ( c1 VARCHAR, c2 VARCHAR ) STORED AS CSV -WITH HEADER ROW -DELIMITER ',' -OPTIONS ('format.quote' '~') -LOCATION '../core/tests/data/quote.csv'; +LOCATION '../core/tests/data/quote.csv' +OPTIONS ('format.quote' '~', + 'format.delimiter' ',', + 'format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE csv_with_escape ( c1 VARCHAR, c2 VARCHAR ) STORED AS CSV -WITH HEADER ROW -DELIMITER ',' -OPTIONS ('format.escape' '\') +OPTIONS ('format.escape' '\', + 'format.delimiter' ',', + 'format.has_header' 'true') LOCATION '../core/tests/data/escape.csv'; query TT @@ -69,9 +69,9 @@ CREATE EXTERNAL TABLE csv_with_escape_2 ( c1 VARCHAR, c2 VARCHAR ) STORED AS CSV -WITH HEADER ROW -DELIMITER ',' -OPTIONS ('format.escape' '"') +OPTIONS ('format.escape' '"', + 'format.delimiter' ',', + 'format.has_header' 'true') LOCATION '../core/tests/data/escape.csv'; # TODO: Validate this with better data. @@ -136,8 +136,8 @@ CREATE EXTERNAL TABLE partitioned_table ( partition_col INT ) STORED AS CSV -WITH HEADER ROW -LOCATION 'test_files/scratch/csv_files/csv_partitions'; +LOCATION 'test_files/scratch/csv_files/csv_partitions' +OPTIONS ('format.has_header' 'false'); query ITII SELECT * FROM partitioned_table ORDER BY int_col; @@ -160,4 +160,4 @@ logical_plan physical_plan 01)SortPreservingMergeExec: [int_col@0 ASC NULLS LAST] 02)--SortExec: expr=[int_col@0 ASC NULLS LAST], preserve_partitioning=[true] -03)----CsvExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/csv_files/csv_partitions/1.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/csv_files/csv_partitions/2.csv]]}, projection=[int_col, string_col, bigint_col, partition_col], has_header=true +03)----CsvExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/csv_files/csv_partitions/1.csv], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/csv_files/csv_partitions/2.csv]]}, projection=[int_col, string_col, bigint_col, partition_col], has_header=false diff --git a/datafusion/sqllogictest/test_files/cte.slt b/datafusion/sqllogictest/test_files/cte.slt index 05491772999e..1ff108cf6c5f 100644 --- a/datafusion/sqllogictest/test_files/cte.slt +++ b/datafusion/sqllogictest/test_files/cte.slt @@ -124,11 +124,11 @@ physical_plan # setup statement ok -CREATE EXTERNAL TABLE balance STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/balance.csv' +CREATE EXTERNAL TABLE balance STORED as CSV LOCATION '../core/tests/data/recursive_cte/balance.csv' OPTIONS ('format.has_header' 'true'); # setup statement ok -CREATE EXTERNAL TABLE growth STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/growth.csv' +CREATE EXTERNAL TABLE growth STORED as CSV LOCATION '../core/tests/data/recursive_cte/growth.csv' OPTIONS ('format.has_header' 'true'); # setup statement ok @@ -404,7 +404,7 @@ FROM # setup statement ok -CREATE EXTERNAL TABLE prices STORED as 
CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/prices.csv' +CREATE EXTERNAL TABLE prices STORED as CSV LOCATION '../core/tests/data/recursive_cte/prices.csv' OPTIONS ('format.has_header' 'true'); # CTE within window function inside nested CTE works. This test demonstrates using a nested window function to recursively iterate over a column. query RRII @@ -595,11 +595,11 @@ ORDER BY # setup statement ok -CREATE EXTERNAL TABLE sales STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/sales.csv' +CREATE EXTERNAL TABLE sales STORED as CSV LOCATION '../core/tests/data/recursive_cte/sales.csv' OPTIONS ('format.has_header' 'true'); # setup statement ok -CREATE EXTERNAL TABLE salespersons STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/recursive_cte/salespersons.csv' +CREATE EXTERNAL TABLE salespersons STORED as CSV LOCATION '../core/tests/data/recursive_cte/salespersons.csv' OPTIONS ('format.has_header' 'true'); # group by works within recursive cte. This test case demonstrates rolling up a hierarchy of salespeople to their managers. diff --git a/datafusion/sqllogictest/test_files/ddl.slt b/datafusion/sqllogictest/test_files/ddl.slt index 682972b5572a..a35e688479e7 100644 --- a/datafusion/sqllogictest/test_files/ddl.slt +++ b/datafusion/sqllogictest/test_files/ddl.slt @@ -256,7 +256,7 @@ DROP VIEW non_existent_view ########## statement ok -CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv'; +CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); # create_table_as statement ok @@ -455,8 +455,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../../testing/data/csv/aggregate_test_100.csv'; +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TIIIIIIIIIRRT SELECT c1, c2, c3, c4, c5, c6, c7, c8, c9, 10, c11, c12, c13 FROM aggregate_test_100 LIMIT 1; @@ -535,7 +535,7 @@ DROP VIEW y; # create_pipe_delimited_csv_table() statement ok -CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV WITH HEADER ROW DELIMITER '|' LOCATION '../core/tests/data/aggregate_simple_pipe.csv'; +CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV LOCATION '../core/tests/data/aggregate_simple_pipe.csv' OPTIONS ('format.delimiter' '|', 'format.has_header' 'true'); query RRB @@ -581,14 +581,14 @@ statement ok CREATE TABLE IF NOT EXISTS table_without_values(field1 BIGINT, field2 BIGINT); statement ok -CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv' +CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); # Should not recreate the same EXTERNAL table statement error Execution error: Table 'aggregate_simple' already exists -CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv' +CREATE EXTERNAL TABLE aggregate_simple STORED AS CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); statement ok -CREATE EXTERNAL TABLE IF NOT EXISTS aggregate_simple STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv' +CREATE EXTERNAL TABLE IF NOT EXISTS aggregate_simple STORED AS CSV LOCATION 
'../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); # create bad custom table statement error DataFusion error: Execution error: Unable to find factory for DELTATABLE @@ -690,7 +690,7 @@ drop table foo; # create csv table with empty csv file statement ok -CREATE EXTERNAL TABLE empty STORED AS CSV WITH HEADER ROW LOCATION '../core/tests/data/empty.csv'; +CREATE EXTERNAL TABLE empty STORED AS CSV LOCATION '../core/tests/data/empty.csv' OPTIONS ('format.has_header' 'true'); query TTI select column_name, data_type, ordinal_position from information_schema.columns where table_name='empty';; @@ -742,8 +742,8 @@ DROP SCHEMA empty_schema; statement ok CREATE UNBOUNDED external table t(c1 integer, c2 integer, c3 integer) STORED as CSV -WITH HEADER ROW -LOCATION '../core/tests/data/empty.csv'; +LOCATION '../core/tests/data/empty.csv' +OPTIONS ('format.has_header' 'true'); # should see infinite_source=true in the explain query TT @@ -760,8 +760,8 @@ drop table t; statement ok CREATE external table t(c1 integer, c2 integer, c3 integer) STORED as CSV -WITH HEADER ROW -LOCATION '../core/tests/data/empty.csv'; +LOCATION '../core/tests/data/empty.csv' +OPTIONS ('format.has_header' 'true'); # expect to see no infinite_source in the explain query TT diff --git a/datafusion/sqllogictest/test_files/decimal.slt b/datafusion/sqllogictest/test_files/decimal.slt index 3f75e42d9304..8db28c32f13b 100644 --- a/datafusion/sqllogictest/test_files/decimal.slt +++ b/datafusion/sqllogictest/test_files/decimal.slt @@ -44,8 +44,8 @@ c4 BOOLEAN NOT NULL, c5 DECIMAL(12,7) NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../core/tests/data/decimal_data.csv'; +LOCATION '../core/tests/data/decimal_data.csv' +OPTIONS ('format.has_header' 'true'); query TT @@ -639,8 +639,8 @@ c4 BOOLEAN NOT NULL, c5 DECIMAL(52,7) NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../core/tests/data/decimal_data.csv'; +LOCATION '../core/tests/data/decimal_data.csv' +OPTIONS ('format.has_header' 'true'); query TT select arrow_typeof(c1), arrow_typeof(c5) from decimal256_simple limit 1; diff --git a/datafusion/sqllogictest/test_files/describe.slt b/datafusion/sqllogictest/test_files/describe.slt index f94a2e453884..a15c3a109cab 100644 --- a/datafusion/sqllogictest/test_files/describe.slt +++ b/datafusion/sqllogictest/test_files/describe.slt @@ -24,7 +24,7 @@ statement ok set datafusion.catalog.information_schema = true statement ok -CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv'; +CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); query TTT rowsort DESCRIBE aggregate_simple; @@ -44,7 +44,7 @@ statement ok set datafusion.catalog.information_schema = false statement ok -CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv'; +CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); query TTT rowsort DESCRIBE aggregate_simple; diff --git a/datafusion/sqllogictest/test_files/distinct_on.slt b/datafusion/sqllogictest/test_files/distinct_on.slt index 0bb931c88cbc..cdef2990fa3c 100644 --- a/datafusion/sqllogictest/test_files/distinct_on.slt +++ 
b/datafusion/sqllogictest/test_files/distinct_on.slt @@ -32,8 +32,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); # Basic example: distinct on the first column project the second one, and # order by the third diff --git a/datafusion/sqllogictest/test_files/errors.slt b/datafusion/sqllogictest/test_files/errors.slt index b5464e2a274c..b74b2fe60f52 100644 --- a/datafusion/sqllogictest/test_files/errors.slt +++ b/datafusion/sqllogictest/test_files/errors.slt @@ -34,8 +34,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); # csv_query_error statement error diff --git a/datafusion/sqllogictest/test_files/explain.slt b/datafusion/sqllogictest/test_files/explain.slt index 205306882b4a..3a4ac747ebd6 100644 --- a/datafusion/sqllogictest/test_files/explain.slt +++ b/datafusion/sqllogictest/test_files/explain.slt @@ -32,8 +32,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../../testing/data/csv/aggregate_test_100.csv'; +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TT explain SELECT c1 FROM aggregate_test_100 where c2 > 10 @@ -68,9 +68,9 @@ CREATE EXTERNAL TABLE aggregate_test_100_with_order ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW WITH ORDER (c1 ASC) -LOCATION '../core/tests/data/aggregate_test_100_order_by_c1_asc.csv'; +LOCATION '../core/tests/data/aggregate_test_100_order_by_c1_asc.csv' +OPTIONS ('format.has_header' 'true'); query TT explain SELECT c1 FROM aggregate_test_100_with_order order by c1 ASC limit 10 @@ -128,8 +128,8 @@ CREATE EXTERNAL TABLE simple_explain_test ( c INT ) STORED AS CSV -WITH HEADER ROW LOCATION '../core/tests/data/example.csv' +OPTIONS ('format.has_header' 'true'); query TT EXPLAIN SELECT a, b, c FROM simple_explain_test @@ -156,8 +156,8 @@ CREATE UNBOUNDED EXTERNAL TABLE sink_table ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../../testing/data/csv/aggregate_test_100.csv'; +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TT EXPLAIN INSERT INTO sink_table SELECT * FROM aggregate_test_100 ORDER by c1 diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 129a67208354..4b5f4d770a03 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -1934,8 +1934,8 @@ CREATE EXTERNAL TABLE aggregate_test_100_by_sql ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); diff --git a/datafusion/sqllogictest/test_files/functions.slt b/datafusion/sqllogictest/test_files/functions.slt index d03b33d0c8e5..4bb553767b45 100644 --- a/datafusion/sqllogictest/test_files/functions.slt +++ b/datafusion/sqllogictest/test_files/functions.slt @@ -549,8 +549,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); # sqrt_f32_vs_f64 diff --git a/datafusion/sqllogictest/test_files/group.slt b/datafusion/sqllogictest/test_files/group.slt index 
2a28efa73a62..a6b5f9b72a53 100644 --- a/datafusion/sqllogictest/test_files/group.slt +++ b/datafusion/sqllogictest/test_files/group.slt @@ -32,11 +32,11 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok -CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv'; +CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); # csv_query_group_by_int_min_max diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 5a605ea58b6c..43bbf6bed643 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -2042,9 +2042,9 @@ CREATE UNBOUNDED EXTERNAL TABLE annotated_data_infinite2 ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC, c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # Create a table with 2 ordered columns. # In the next step, we will expect to observe the removed sort execs. @@ -2057,10 +2057,10 @@ CREATE EXTERNAL TABLE multiple_ordered_table ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # Expected a sort exec for b DESC query TT @@ -3894,10 +3894,10 @@ CREATE EXTERNAL TABLE multiple_ordered_table_with_pk ( primary key(c) ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # We can use column b during selection # even if it is not among group by expressions @@ -3935,10 +3935,10 @@ CREATE EXTERNAL TABLE multiple_ordered_table_with_pk ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # We can use column b during selection # even if it is not among group by expressions @@ -4377,8 +4377,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TIIII SELECT c1, count(distinct c2), min(distinct c2), min(c3), max(c4) FROM aggregate_test_100 GROUP BY c1 ORDER BY c1; @@ -4454,10 +4454,10 @@ CREATE EXTERNAL TABLE unbounded_multiple_ordered_table_with_pk ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # Query below can be executed, since c is primary key. 
query III rowsort @@ -4538,8 +4538,8 @@ CREATE EXTERNAL TABLE timestamp_table ( c2 INT, ) STORED AS CSV -WITH HEADER ROW -LOCATION 'test_files/scratch/group_by/timestamp_table'; +LOCATION 'test_files/scratch/group_by/timestamp_table' +OPTIONS ('format.has_header' 'true'); # Group By using date_trunc query PI rowsort diff --git a/datafusion/sqllogictest/test_files/identifiers.slt b/datafusion/sqllogictest/test_files/identifiers.slt index f60d60b2bfe0..755d617e7a2a 100644 --- a/datafusion/sqllogictest/test_files/identifiers.slt +++ b/datafusion/sqllogictest/test_files/identifiers.slt @@ -22,8 +22,8 @@ CREATE EXTERNAL TABLE case_insensitive_test ( c INT ) STORED AS CSV -WITH HEADER ROW LOCATION '../core/tests/data/example.csv' +OPTIONS ('format.has_header' 'true'); # normalized column identifiers query II diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index f52147142cb7..de00cf9d0547 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -571,7 +571,8 @@ DROP VIEW test.xyz statement ok CREATE EXTERNAL TABLE abc STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv'; +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TTTT SHOW CREATE TABLE abc; diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt index 7c9bc4abe767..126e4120e9be 100644 --- a/datafusion/sqllogictest/test_files/insert.slt +++ b/datafusion/sqllogictest/test_files/insert.slt @@ -37,8 +37,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); # test_insert_into diff --git a/datafusion/sqllogictest/test_files/insert_to_external.slt b/datafusion/sqllogictest/test_files/insert_to_external.slt index 70eb2b75a75a..dfb333d16e23 100644 --- a/datafusion/sqllogictest/test_files/insert_to_external.slt +++ b/datafusion/sqllogictest/test_files/insert_to_external.slt @@ -37,8 +37,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok diff --git a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt index 60a14f78bdf5..8de8c478fbc4 100644 --- a/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt +++ b/datafusion/sqllogictest/test_files/join_disable_repartition_joins.slt @@ -34,9 +34,9 @@ CREATE EXTERNAL TABLE annotated_data ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC, c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); query TT EXPLAIN SELECT t2.a diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index e2c77552f990..0c45e3ffbf69 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -3162,9 +3162,9 @@ CREATE EXTERNAL TABLE annotated_data ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC NULLS FIRST, b ASC, c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS 
('format.has_header' 'true'); # sort merge join should propagate ordering equivalence of the left side # for inner join. Hence final requirement rn1 ASC is already satisfied at @@ -3379,10 +3379,10 @@ CREATE EXTERNAL TABLE multiple_ordered_table ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); query TT EXPLAIN SELECT LAST_VALUE(l.d ORDER BY l.a) AS amount_usd diff --git a/datafusion/sqllogictest/test_files/limit.slt b/datafusion/sqllogictest/test_files/limit.slt index faaa593d2c17..2c65b1da4474 100644 --- a/datafusion/sqllogictest/test_files/limit.slt +++ b/datafusion/sqllogictest/test_files/limit.slt @@ -36,8 +36,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); # async fn csv_query_limit query T @@ -529,9 +529,8 @@ CREATE UNBOUNDED EXTERNAL TABLE data ( "column1" INTEGER, "column2" VARCHAR, ) STORED AS CSV -WITH HEADER ROW WITH ORDER ("column1", "column2") -LOCATION 'test_files/scratch/limit/data.csv'; +LOCATION 'test_files/scratch/limit/data.csv' OPTIONS ('format.has_header' 'false'); query IT SELECT * from data LIMIT 3; diff --git a/datafusion/sqllogictest/test_files/math.slt b/datafusion/sqllogictest/test_files/math.slt index 3315ff454924..78efbb3f564b 100644 --- a/datafusion/sqllogictest/test_files/math.slt +++ b/datafusion/sqllogictest/test_files/math.slt @@ -20,7 +20,7 @@ ########## statement ok -CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv'; +CREATE external table aggregate_simple(c1 real, c2 double, c3 boolean) STORED as CSV LOCATION '../core/tests/data/aggregate_simple.csv' OPTIONS ('format.has_header' 'true'); # Round query R diff --git a/datafusion/sqllogictest/test_files/monotonic_projection_test.slt b/datafusion/sqllogictest/test_files/monotonic_projection_test.slt index 0c89fb3bcdaa..d41b78dcd3f2 100644 --- a/datafusion/sqllogictest/test_files/monotonic_projection_test.slt +++ b/datafusion/sqllogictest/test_files/monotonic_projection_test.slt @@ -25,10 +25,10 @@ CREATE EXTERNAL TABLE multiple_ordered_table ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC, b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); # test for substitute CAST scenario query TT diff --git a/datafusion/sqllogictest/test_files/order.slt b/datafusion/sqllogictest/test_files/order.slt index 0b43b6e31aaf..0f869fc0b419 100644 --- a/datafusion/sqllogictest/test_files/order.slt +++ b/datafusion/sqllogictest/test_files/order.slt @@ -36,8 +36,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); # test_sort_unprojected_col query I @@ -422,11 +422,11 @@ CREATE EXTERNAL TABLE multiple_ordered_table ( d INTEGER ) STORED AS CSV -WITH HEADER ROW WITH ORDER (a ASC) WITH ORDER (b ASC) WITH ORDER (c ASC) -LOCATION '../core/tests/data/window_2.csv'; +LOCATION '../core/tests/data/window_2.csv' +OPTIONS ('format.has_header' 'true'); query TT EXPLAIN SELECT (b+a+c) AS result @@ -511,10 +511,10 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR 
NOT NULL ) STORED AS CSV -WITH HEADER ROW WITH ORDER(c11) WITH ORDER(c12 DESC) LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TT EXPLAIN SELECT ATAN(c11) as atan_c11 @@ -627,7 +627,8 @@ CREATE EXTERNAL TABLE IF NOT EXISTS orders ( o_clerk VARCHAR, o_shippriority INTEGER, o_comment VARCHAR, -) STORED AS CSV WITH ORDER (o_orderkey ASC) DELIMITER ',' WITH HEADER ROW LOCATION '../core/tests/tpch-csv/orders.csv'; +) STORED AS CSV WITH ORDER (o_orderkey ASC) LOCATION '../core/tests/tpch-csv/orders.csv' +OPTIONS ('format.delimiter' ',', 'format.has_header' 'true'); query TT EXPLAIN SELECT o_orderkey, o_orderstatus FROM orders ORDER BY o_orderkey ASC diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt index aa1681de6201..e70f800bde74 100644 --- a/datafusion/sqllogictest/test_files/parquet.slt +++ b/datafusion/sqllogictest/test_files/parquet.slt @@ -66,7 +66,6 @@ CREATE EXTERNAL TABLE test_table ( date_col DATE ) STORED AS PARQUET -WITH HEADER ROW LOCATION 'test_files/scratch/parquet/test_table'; # Basic query: @@ -107,7 +106,6 @@ CREATE EXTERNAL TABLE test_table ( date_col DATE ) STORED AS PARQUET -WITH HEADER ROW WITH ORDER (string_col ASC NULLS LAST, int_col ASC NULLS LAST) LOCATION 'test_files/scratch/parquet/test_table'; @@ -189,8 +187,7 @@ CREATE EXTERNAL TABLE alltypes_plain ( timestamp_col TIMESTAMP NOT NULL, ) STORED AS PARQUET -WITH HEADER ROW -LOCATION '../../parquet-testing/data/alltypes_plain.parquet' +LOCATION '../../parquet-testing/data/alltypes_plain.parquet'; # Test a basic query with a CAST: query IT @@ -214,7 +211,6 @@ DROP TABLE alltypes_plain; statement ok CREATE EXTERNAL TABLE test_binary STORED AS PARQUET -WITH HEADER ROW LOCATION '../core/tests/data/test_binary.parquet'; # Check size of table: @@ -247,7 +243,6 @@ DROP TABLE test_binary; statement ok CREATE EXTERNAL TABLE timestamp_with_tz STORED AS PARQUET -WITH HEADER ROW LOCATION '../core/tests/data/timestamp_with_tz.parquet'; # Check size of table: @@ -288,7 +283,6 @@ STORED AS PARQUET; statement ok CREATE EXTERNAL TABLE listing_table STORED AS PARQUET -WITH HEADER ROW LOCATION 'test_files/scratch/parquet/test_table/*.parquet'; statement ok @@ -317,7 +311,6 @@ DROP TABLE timestamp_with_tz; statement ok CREATE EXTERNAL TABLE single_nan STORED AS PARQUET -WITH HEADER ROW LOCATION '../../parquet-testing/data/single_nan.parquet'; # Check table size: @@ -339,7 +332,6 @@ DROP TABLE single_nan; statement ok CREATE EXTERNAL TABLE list_columns STORED AS PARQUET -WITH HEADER ROW LOCATION '../../parquet-testing/data/list_columns.parquet'; query ?? 
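The sqllogictest migrations above and below all follow the same pattern; a before/after sketch under assumed names (table `t` and path 'foo.csv.gz' are illustrative, not taken from the patch):

-- Old clause-based syntax, removed by this change:
--   CREATE EXTERNAL TABLE t(c1 INT) STORED AS CSV
--   WITH HEADER ROW DELIMITER '|' COMPRESSION TYPE GZIP
--   LOCATION 'foo.csv.gz';
-- New OPTIONS-based equivalent:
CREATE EXTERNAL TABLE t(c1 INT) STORED AS CSV
LOCATION 'foo.csv.gz'
OPTIONS ('format.has_header' 'true', 'format.delimiter' '|', 'format.compression' 'gzip');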
diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt index f933c90acc73..d14b6ca81f67 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_null.slt @@ -66,8 +66,8 @@ CREATE EXTERNAL TABLE aggregate_test_100_by_sql ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt index b01ea73c8056..25b4924715ca 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_simple.slt @@ -67,8 +67,8 @@ CREATE EXTERNAL TABLE aggregate_test_100_by_sql ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt index 05343de32268..e02c19016790 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_union.slt @@ -64,8 +64,8 @@ CREATE EXTERNAL TABLE aggregate_test_100_by_sql ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query I rowsort SELECT * FROM ( diff --git a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt index cec51d472075..edad3747a203 100644 --- a/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt +++ b/datafusion/sqllogictest/test_files/pg_compat/pg_compat_window.slt @@ -64,8 +64,8 @@ CREATE EXTERNAL TABLE aggregate_test_100_by_sql ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query IIIIIIIIII SELECT diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index caf79abcfa4e..5483750c6a03 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -39,8 +39,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../../testing/data/csv/aggregate_test_100.csv' +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE alltypes_plain STORED AS PARQUET LOCATION '../../parquet-testing/data/alltypes_plain.parquet'; @@ -621,7 +621,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS lineitem ( l_shipinstruct VARCHAR, l_shipmode VARCHAR, l_comment VARCHAR, -) STORED AS CSV DELIMITER ',' WITH HEADER ROW LOCATION '../core/tests/tpch-csv/lineitem.csv'; +) STORED AS CSV LOCATION '../core/tests/tpch-csv/lineitem.csv' OPTIONS ('format.delimiter' ',', 'format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE IF NOT EXISTS part ( @@ -634,7 +634,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS part ( p_container VARCHAR, p_retailprice DECIMAL(15, 2), p_comment VARCHAR, -) STORED AS CSV DELIMITER ',' WITH HEADER ROW LOCATION '../core/tests/tpch-csv/part.csv'; +) 
STORED AS CSV LOCATION '../core/tests/tpch-csv/part.csv' OPTIONS ('format.delimiter' ',', 'format.has_header' 'true'); query TT EXPLAIN SELECT l_partkey FROM diff --git a/datafusion/sqllogictest/test_files/projection.slt b/datafusion/sqllogictest/test_files/projection.slt index b752f5644b7f..843ab71091f5 100644 --- a/datafusion/sqllogictest/test_files/projection.slt +++ b/datafusion/sqllogictest/test_files/projection.slt @@ -37,8 +37,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE aggregate_simple ( @@ -47,8 +47,8 @@ CREATE EXTERNAL TABLE aggregate_simple ( c3 BOOLEAN NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../core/tests/data/aggregate_simple.csv' +OPTIONS ('format.has_header' 'true'); statement ok CREATE TABLE memory_table(a INT NOT NULL, b INT NOT NULL, c INT NOT NULL) AS VALUES diff --git a/datafusion/sqllogictest/test_files/references.slt b/datafusion/sqllogictest/test_files/references.slt index fd77d57c06f1..4c3ac68aebd1 100644 --- a/datafusion/sqllogictest/test_files/references.slt +++ b/datafusion/sqllogictest/test_files/references.slt @@ -39,8 +39,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query I SELECT COUNT(*) FROM aggregate_test_100; diff --git a/datafusion/sqllogictest/test_files/repartition.slt b/datafusion/sqllogictest/test_files/repartition.slt index 3f9e6e61f1d0..a301982740c7 100644 --- a/datafusion/sqllogictest/test_files/repartition.slt +++ b/datafusion/sqllogictest/test_files/repartition.slt @@ -95,8 +95,8 @@ CREATE UNBOUNDED EXTERNAL TABLE sink_table ( c13 VARCHAR NOT NULL ) STORED AS CSV -WITH HEADER ROW -LOCATION '../../testing/data/csv/aggregate_test_100.csv'; +LOCATION '../../testing/data/csv/aggregate_test_100.csv' +OPTIONS ('format.has_header' 'true'); query TII SELECT c1, c2, c3 FROM sink_table WHERE c3 > 0 LIMIT 5; diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 4a620a695fbf..6b9cb521f5f8 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -158,13 +158,13 @@ DROP TABLE parquet_table_with_order; # create a single csv file statement ok COPY (VALUES (1), (2), (3), (4), (5)) TO 'test_files/scratch/repartition_scan/csv_table/1.csv' -STORED AS CSV WITH HEADER ROW; +STORED AS CSV OPTIONS ('format.has_header' 'true'); statement ok CREATE EXTERNAL TABLE csv_table(column1 int) STORED AS csv -WITH HEADER ROW -LOCATION 'test_files/scratch/repartition_scan/csv_table/'; +LOCATION 'test_files/scratch/repartition_scan/csv_table/' +OPTIONS ('format.has_header' 'true'); query I select * from csv_table ORDER BY column1; @@ -277,8 +277,7 @@ DROP TABLE arrow_table; statement ok CREATE EXTERNAL TABLE avro_table STORED AS AVRO -WITH HEADER ROW -LOCATION '../../testing/data/avro/simple_enum.avro' +LOCATION '../../testing/data/avro/simple_enum.avro'; # It would be great to see the file read as "4" groups with even sizes (offsets) eventually diff --git a/datafusion/sqllogictest/test_files/scalar.slt b/datafusion/sqllogictest/test_files/scalar.slt index c52881b7b0ba..04b0fa8acab0 100644 --- a/datafusion/sqllogictest/test_files/scalar.slt +++ 
@@ -1352,8 +1352,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 (
   c13 VARCHAR NOT NULL
 )
 STORED AS CSV
-WITH HEADER ROW
 LOCATION '../../testing/data/csv/aggregate_test_100.csv'
+OPTIONS ('format.has_header' 'true');

 # c8 = i32; c6 = i64
 query TTT
diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt
index 24163e37dec3..d73157570d8a 100644
--- a/datafusion/sqllogictest/test_files/select.slt
+++ b/datafusion/sqllogictest/test_files/select.slt
@@ -33,8 +33,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 (
   c13 VARCHAR NOT NULL
 )
 STORED AS CSV
-WITH HEADER ROW
 LOCATION '../../testing/data/csv/aggregate_test_100.csv'
+OPTIONS ('format.has_header' 'true');

 statement ok
 CREATE EXTERNAL TABLE aggregate_simple (
@@ -43,8 +43,8 @@ CREATE EXTERNAL TABLE aggregate_simple (
   c3 BOOLEAN NOT NULL
 )
 STORED AS CSV
-WITH HEADER ROW
 LOCATION '../core/tests/data/aggregate_simple.csv'
+OPTIONS ('format.has_header' 'true');

 ##########
 ## SELECT Tests
@@ -1067,9 +1067,9 @@ CREATE EXTERNAL TABLE annotated_data_finite2 (
   d INTEGER
 )
 STORED AS CSV
-WITH HEADER ROW
 WITH ORDER (a ASC, b ASC, c ASC)
-LOCATION '../core/tests/data/window_2.csv';
+LOCATION '../core/tests/data/window_2.csv'
+OPTIONS ('format.has_header' 'true');

 # test_source_projection
diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt
index 085f192a5d4e..4a9fb38e7db1 100644
--- a/datafusion/sqllogictest/test_files/subquery.slt
+++ b/datafusion/sqllogictest/test_files/subquery.slt
@@ -66,7 +66,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS customer (
   c_acctbal DECIMAL(15, 2),
   c_mktsegment VARCHAR,
   c_comment VARCHAR,
-) STORED AS CSV DELIMITER ',' WITH HEADER ROW LOCATION '../core/tests/tpch-csv/customer.csv';
+) STORED AS CSV LOCATION '../core/tests/tpch-csv/customer.csv' OPTIONS ('format.delimiter' ',', 'format.has_header' 'true');

 statement ok
 CREATE EXTERNAL TABLE IF NOT EXISTS orders (
@@ -79,7 +79,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS orders (
   o_clerk VARCHAR,
   o_shippriority INTEGER,
   o_comment VARCHAR,
-) STORED AS CSV DELIMITER ',' WITH HEADER ROW LOCATION '../core/tests/tpch-csv/orders.csv';
+) STORED AS CSV LOCATION '../core/tests/tpch-csv/orders.csv' OPTIONS ('format.delimiter' ',', 'format.has_header' 'true');

 statement ok
 CREATE EXTERNAL TABLE IF NOT EXISTS lineitem (
@@ -99,7 +99,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS lineitem (
   l_shipinstruct VARCHAR,
   l_shipmode VARCHAR,
   l_comment VARCHAR,
-) STORED AS CSV DELIMITER ',' WITH HEADER ROW LOCATION '../core/tests/tpch-csv/lineitem.csv';
+) STORED AS CSV LOCATION '../core/tests/tpch-csv/lineitem.csv' OPTIONS ('format.delimiter' ',', 'format.has_header' 'true');

 # in_subquery_to_join_with_correlated_outer_filter
 query ITI rowsort
diff --git a/datafusion/sqllogictest/test_files/topk.slt b/datafusion/sqllogictest/test_files/topk.slt
index 19412defcd40..616794f84918 100644
--- a/datafusion/sqllogictest/test_files/topk.slt
+++ b/datafusion/sqllogictest/test_files/topk.slt
@@ -69,8 +69,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 (
   c13 VARCHAR NOT NULL
 )
 STORED AS CSV
-WITH HEADER ROW
 LOCATION '../../testing/data/csv/aggregate_test_100.csv'
+OPTIONS ('format.has_header' 'true');

 query TT
 explain select * from aggregate_test_100 ORDER BY c13 desc limit 5;
diff --git a/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part b/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part
index 2f5e2d5a7616..111d24773055 100644
--- a/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part
+++ b/datafusion/sqllogictest/test_files/tpch/create_tables.slt.part
@@ -31,7 +31,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS supplier (
   s_acctbal DECIMAL(15, 2),
   s_comment VARCHAR,
   s_rev VARCHAR,
-) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/supplier.tbl';
+) STORED AS CSV LOCATION 'test_files/tpch/data/supplier.tbl' OPTIONS ('format.delimiter' '|');

 statement ok
 CREATE EXTERNAL TABLE IF NOT EXISTS part (
@@ -45,7 +45,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS part (
   p_retailprice DECIMAL(15, 2),
   p_comment VARCHAR,
   p_rev VARCHAR,
-) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/part.tbl';
+) STORED AS CSV LOCATION 'test_files/tpch/data/part.tbl' OPTIONS ('format.delimiter' '|');

 statement ok
@@ -56,7 +56,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS partsupp (
   ps_supplycost DECIMAL(15, 2),
   ps_comment VARCHAR,
   ps_rev VARCHAR,
-) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/partsupp.tbl';
+) STORED AS CSV LOCATION 'test_files/tpch/data/partsupp.tbl' OPTIONS ('format.delimiter' '|');

 statement ok
 CREATE EXTERNAL TABLE IF NOT EXISTS customer (
@@ -69,7 +69,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS customer (
   c_mktsegment VARCHAR,
   c_comment VARCHAR,
   c_rev VARCHAR,
-) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/customer.tbl';
+) STORED AS CSV LOCATION 'test_files/tpch/data/customer.tbl' OPTIONS ('format.delimiter' '|');

 statement ok
 CREATE EXTERNAL TABLE IF NOT EXISTS orders (
@@ -83,7 +83,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS orders (
   o_shippriority INTEGER,
   o_comment VARCHAR,
   o_rev VARCHAR,
-) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/orders.tbl';
+) STORED AS CSV LOCATION 'test_files/tpch/data/orders.tbl' OPTIONS ('format.delimiter' '|');

 statement ok
 CREATE EXTERNAL TABLE IF NOT EXISTS lineitem (
@@ -104,7 +104,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS lineitem (
   l_shipmode VARCHAR,
   l_comment VARCHAR,
   l_rev VARCHAR,
-) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/lineitem.tbl';
+) STORED AS CSV LOCATION 'test_files/tpch/data/lineitem.tbl' OPTIONS ('format.delimiter' '|');

 statement ok
 CREATE EXTERNAL TABLE IF NOT EXISTS nation (
@@ -113,7 +113,7 @@ CREATE EXTERNAL TABLE IF NOT EXISTS nation (
   n_regionkey BIGINT,
   n_comment VARCHAR,
   n_rev VARCHAR,
-) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/nation.tbl';
+) STORED AS CSV LOCATION 'test_files/tpch/data/nation.tbl' OPTIONS ('format.delimiter' '|');

 statement ok
 CREATE EXTERNAL TABLE IF NOT EXISTS region (
@@ -121,4 +121,4 @@ CREATE EXTERNAL TABLE IF NOT EXISTS region (
   r_name VARCHAR,
   r_comment VARCHAR,
   r_rev VARCHAR,
-) STORED AS CSV DELIMITER '|' LOCATION 'test_files/tpch/data/region.tbl';
+) STORED AS CSV LOCATION 'test_files/tpch/data/region.tbl' OPTIONS ('format.delimiter' '|');
\ No newline at end of file
diff --git a/datafusion/sqllogictest/test_files/union.slt b/datafusion/sqllogictest/test_files/union.slt
index c168890b7b00..36f024961875 100644
--- a/datafusion/sqllogictest/test_files/union.slt
+++ b/datafusion/sqllogictest/test_files/union.slt
@@ -105,8 +105,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 (
   c13 VARCHAR NOT NULL
 )
 STORED AS CSV
-WITH HEADER ROW
 LOCATION '../../testing/data/csv/aggregate_test_100.csv'
+OPTIONS ('format.has_header' 'true');

 query I
 select COUNT(*) from (
@@ -475,9 +475,9 @@ CREATE EXTERNAL TABLE t1 (
   c13 VARCHAR NOT NULL
 )
 STORED AS CSV
-WITH HEADER ROW
 WITH ORDER (c1 ASC)
-LOCATION '../../testing/data/csv/aggregate_test_100.csv';
+LOCATION '../../testing/data/csv/aggregate_test_100.csv'
+OPTIONS ('format.has_header' 'true');

 statement ok
 CREATE EXTERNAL TABLE t2 (
@@ -496,9 +496,9 @@ CREATE EXTERNAL TABLE t2 (
   c13 VARCHAR NOT NULL
 )
 STORED AS CSV
-WITH HEADER ROW
 WITH ORDER (c1a ASC)
-LOCATION '../../testing/data/csv/aggregate_test_100.csv';
+LOCATION '../../testing/data/csv/aggregate_test_100.csv'
+OPTIONS ('format.has_header' 'true');

 query TT
 explain
diff --git a/datafusion/sqllogictest/test_files/wildcard.slt b/datafusion/sqllogictest/test_files/wildcard.slt
index f83e84804a37..9285bdbf2306 100644
--- a/datafusion/sqllogictest/test_files/wildcard.slt
+++ b/datafusion/sqllogictest/test_files/wildcard.slt
@@ -40,8 +40,8 @@ CREATE EXTERNAL TABLE aggregate_simple (
   c3 BOOLEAN NOT NULL,
 )
 STORED AS CSV
-WITH HEADER ROW
 LOCATION '../core/tests/data/aggregate_simple.csv'
+OPTIONS ('format.has_header' 'true');

 ##########
diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt
index af09e644c9bf..be1517aa75c1 100644
--- a/datafusion/sqllogictest/test_files/window.slt
+++ b/datafusion/sqllogictest/test_files/window.slt
@@ -32,8 +32,8 @@ CREATE EXTERNAL TABLE aggregate_test_100 (
   c13 VARCHAR NOT NULL
 )
 STORED AS CSV
-WITH HEADER ROW
 LOCATION '../../testing/data/csv/aggregate_test_100.csv'
+OPTIONS ('format.has_header' 'true');

 statement ok
 CREATE EXTERNAL TABLE null_cases(
@@ -42,8 +42,8 @@ CREATE EXTERNAL TABLE null_cases(
   c3 BIGINT NULL
 )
 STORED AS CSV
-WITH HEADER ROW
-LOCATION '../core/tests/data/null_cases.csv';
+LOCATION '../core/tests/data/null_cases.csv'
+OPTIONS ('format.has_header' 'true');

 ### This is the same table as
 ### execute_with_partition with 4 partitions
@@ -2488,10 +2488,9 @@ CREATE EXTERNAL TABLE annotated_data_finite (
   desc_col INTEGER,
 )
 STORED AS CSV
-WITH HEADER ROW
 WITH ORDER (ts ASC)
 LOCATION '../core/tests/data/window_1.csv'
-;
+OPTIONS ('format.has_header' 'true');

 # 100 rows. Columns in the table are ts, inc_col, desc_col.
 # Source is CsvExec which is ordered by ts column.
@@ -2503,9 +2502,9 @@ CREATE UNBOUNDED EXTERNAL TABLE annotated_data_infinite (
   desc_col INTEGER,
 )
 STORED AS CSV
-WITH HEADER ROW
 WITH ORDER (ts ASC)
-LOCATION '../core/tests/data/window_1.csv';
+LOCATION '../core/tests/data/window_1.csv'
+OPTIONS ('format.has_header' 'true');

 # test_source_sorted_aggregate
@@ -2910,9 +2909,9 @@ CREATE EXTERNAL TABLE annotated_data_finite2 (
   d INTEGER
 )
 STORED AS CSV
-WITH HEADER ROW
 WITH ORDER (a ASC, b ASC, c ASC)
-LOCATION '../core/tests/data/window_2.csv';
+LOCATION '../core/tests/data/window_2.csv'
+OPTIONS ('format.has_header' 'true');

 # Columns in the table are a,b,c,d. Source is CsvExec which is ordered by
 # a,b,c column. Column a has cardinality 2, column b has cardinality 4.
@@ -2926,9 +2925,9 @@ CREATE UNBOUNDED EXTERNAL TABLE annotated_data_infinite2 (
   d INTEGER
 )
 STORED AS CSV
-WITH HEADER ROW
 WITH ORDER (a ASC, b ASC, c ASC)
-LOCATION '../core/tests/data/window_2.csv';
+LOCATION '../core/tests/data/window_2.csv'
+OPTIONS ('format.has_header' 'true');

 # test_infinite_source_partition_by
@@ -3402,10 +3401,10 @@ CREATE EXTERNAL TABLE multiple_ordered_table (
   d INTEGER
 )
 STORED AS CSV
-WITH HEADER ROW
 WITH ORDER (a ASC, b ASC)
 WITH ORDER (c ASC)
-LOCATION '../core/tests/data/window_2.csv';
+LOCATION '../core/tests/data/window_2.csv'
+OPTIONS ('format.has_header' 'true');

 # Since column b is constant after filter b=0,
 # There should be no SortExec(b ASC) in the plan
@@ -3455,10 +3454,10 @@ CREATE UNBOUNDED EXTERNAL TABLE multiple_ordered_table_inf (
   d INTEGER
 )
 STORED AS CSV
-WITH HEADER ROW
 WITH ORDER (a ASC, b ASC)
 WITH ORDER (c ASC)
-LOCATION '../core/tests/data/window_2.csv';
+LOCATION '../core/tests/data/window_2.csv'
+OPTIONS ('format.has_header' 'true');

 # All of the window execs in the physical plan should work in the
 # sorted mode.
diff --git a/docs/source/user-guide/cli/datasources.md b/docs/source/user-guide/cli/datasources.md
index c2c00b633479..2b11645c471a 100644
--- a/docs/source/user-guide/cli/datasources.md
+++ b/docs/source/user-guide/cli/datasources.md
@@ -166,8 +166,8 @@ Register a single file csv datasource with a header row.
 ```sql
 CREATE EXTERNAL TABLE test
 STORED AS CSV
-WITH HEADER ROW
-LOCATION '/path/to/aggregate_test_100.csv';
+LOCATION '/path/to/aggregate_test_100.csv'
+OPTIONS ('has_header' 'true');
 ```

 Register a single file csv datasource with explicitly defined schema.
diff --git a/docs/source/user-guide/sql/ddl.md b/docs/source/user-guide/sql/ddl.md
index 3d8b632f6ec2..e16b9681eb80 100644
--- a/docs/source/user-guide/sql/ddl.md
+++ b/docs/source/user-guide/sql/ddl.md
@@ -60,9 +60,6 @@ CREATE [UNBOUNDED] EXTERNAL TABLE
 [ IF NOT EXISTS ]
 <TABLE_NAME>[ (<column_definition>) ]
 STORED AS <file_type>
-[ WITH HEADER ROW ]
-[ DELIMITER <char> ]
-[ COMPRESSION TYPE <GZIP | BZIP2 | XZ | ZSTD> ]
 [ PARTITIONED BY (<column list>) ]
 [ WITH ORDER (<ordered column list>) ]
 [ OPTIONS (<key_value_list>) ]
@@ -100,8 +97,8 @@ scanning a subset of the file.
 ```sql
 CREATE EXTERNAL TABLE test
 STORED AS CSV
-WITH HEADER ROW
-LOCATION '/path/to/aggregate_simple.csv';
+LOCATION '/path/to/aggregate_simple.csv'
+OPTIONS ('has_header' 'true');
 ```

 It is also possible to use compressed files, such as `.csv.gz`:
@@ -110,8 +107,8 @@
 CREATE EXTERNAL TABLE test
 STORED AS CSV
 COMPRESSION TYPE GZIP
-WITH HEADER ROW
-LOCATION '/path/to/aggregate_simple.csv.gz';
+LOCATION '/path/to/aggregate_simple.csv.gz'
+OPTIONS ('has_header' 'true');
 ```

 It is also possible to specify the schema manually.
@@ -133,8 +130,8 @@ CREATE EXTERNAL TABLE test (
   c13 VARCHAR NOT NULL
 )
 STORED AS CSV
-WITH HEADER ROW
-LOCATION '/path/to/aggregate_test_100.csv';
+LOCATION '/path/to/aggregate_test_100.csv'
+OPTIONS ('has_header' 'true');
 ```

 It is also possible to specify a directory that contains a partitioned
@@ -143,8 +140,8 @@ table (multiple files with the same schema)
 ```sql
 CREATE EXTERNAL TABLE test
 STORED AS CSV
-WITH HEADER ROW
-LOCATION '/path/to/directory/of/files';
+LOCATION '/path/to/directory/of/files'
+OPTIONS ('has_header' 'true');
 ```

 With `CREATE UNBOUNDED EXTERNAL TABLE` SQL statement.
 We can create unbounded data sources such as following:
@@ -181,9 +178,9 @@ CREATE EXTERNAL TABLE test (
   c13 VARCHAR NOT NULL
 )
 STORED AS CSV
-WITH HEADER ROW
 WITH ORDER (c2 ASC, c5 + c8 DESC NULL FIRST)
-LOCATION '/path/to/aggregate_test_100.csv';
+LOCATION '/path/to/aggregate_test_100.csv'
+OPTIONS ('has_header' 'true');
 ```

 Where `WITH ORDER` clause specifies the sort order:
diff --git a/docs/source/user-guide/sql/write_options.md b/docs/source/user-guide/sql/write_options.md
index 7631525f13e5..3c4790dd0255 100644
--- a/docs/source/user-guide/sql/write_options.md
+++ b/docs/source/user-guide/sql/write_options.md
@@ -38,11 +38,11 @@ CREATE EXTERNAL TABLE my_table(a bigint, b bigint)
 STORED AS csv
 COMPRESSION TYPE gzip
-WITH HEADER ROW DELIMITER ';'
 LOCATION '/test/location/my_csv_table/'
 OPTIONS(
-NULL_VALUE 'NAN'
+NULL_VALUE 'NAN',
+'has_header' 'true'
 )
 ```
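
For anyone trying the migration by hand, here is a minimal sketch of the old clause-based syntax rewritten as key/value `OPTIONS`. It reuses the table and path from the `write_options.md` example above, and it assumes the `format.`-prefixed keys that the sqllogictest changes in this patch use; combining the header flag and the delimiter in one statement is an illustration, not a statement taken from the patched docs.

```sql
-- Old clause-based form removed by this patch:
--   CREATE EXTERNAL TABLE my_table(a bigint, b bigint)
--   STORED AS csv
--   WITH HEADER ROW DELIMITER ';'
--   LOCATION '/test/location/my_csv_table/';

-- Equivalent form with the settings expressed as OPTIONS key/value pairs,
-- following the pattern used throughout the .slt changes above.
CREATE EXTERNAL TABLE my_table(a bigint, b bigint)
STORED AS csv
LOCATION '/test/location/my_csv_table/'
OPTIONS(
  'format.has_header' 'true',
  'format.delimiter' ';'
);
```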