Skip to content

Commit 8aab437

Browse files
authored
Merge pull request #9160 from BohuTANG/dev-format-delimiter
fix(format): support ASCII control code hex as format field delimiter
2 parents 466fa2b + 98f902e commit 8aab437

File tree

3 files changed

+40
-13
lines changed

3 files changed

+40
-13
lines changed

docs/doc/11-integrations/00-api/03-streaming-load.md

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,14 @@ curl -H "<parameter>:<value>" [-H "<parameter>:<value>"...] -F "upload=@<file_l
2020

2121
The request usually includes many occurrences of the argument `-H` and each is followed by one of the following parameters to tell Databend how to handle the file you're loading data from. Please note that `insert_sql` is required, and other parameters are optional.
2222

23-
| Parameter | Values | Supported Formats | Examples |
24-
|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|------------------------------------------------|
25-
| insert_sql | [INSERT_statement] + format [file_format] | All | -H "insert_sql: insert into ontime format CSV" |
26-
| format_skip_header | Tells Databend how many lines at the beginning of the file to skip for header.<br /> 0 (default): No lines to skip;<br /> 1: Skip the first line;<br /> N: Skip the first N lines. | CSV / TSV / NDJSON | -H "format_skip_header: 1" |
27-
| format_compression | Tells Databend the compression format of the file.<br /> NONE (default): Do NOT decompress the file;<br /> AUTO: Automatically decompress the file by suffix;<br /> You can also use one of these values to explicitly specify the compression format: GZIP \| BZ2 \| BROTLI \| ZSTD \| DEFALTE \| RAW_DEFLATE. | CSV / TSV / NDJSON | -H "format_compression:auto" |
28-
| format_field_delimiter | Tells Databend the characters used in the file to separate fields.<br /> Default for CSV files: `,`.<br /> Default for TSV files: `\t`. | CSV / TSV | -H "format_field_delimiter:," |
29-
| format_record_delimiter | Tells Databend the new line characters used in the file to separate records.<br /> Default: `\n`. | CSV / TSV | -H "format_recorder_delimiter:\n" |
30-
| format_quote | Tells Databend the quote characters for strings in CSV file.<br /> Default: ""(Double quotes). | CSV | |
23+
| Parameter | Values | Supported Formats | Examples |
24+
|-------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
25+
| insert_sql | [INSERT_statement] + format [file_format] | All | -H "insert_sql: insert into ontime format CSV" |
26+
| format_skip_header | Tells Databend how many lines at the beginning of the file to skip for header.<br /> 0 (default): No lines to skip;<br /> 1: Skip the first line;<br /> N: Skip the first N lines. | CSV / TSV / NDJSON | -H "format_skip_header: 1" |
27+
| format_compression | Tells Databend the compression format of the file.<br /> NONE (default): Do NOT decompress the file;<br /> AUTO: Automatically decompress the file by suffix;<br /> You can also use one of these values to explicitly specify the compression format: GZIP \| BZ2 \| BROTLI \| ZSTD \| DEFLATE \| RAW_DEFLATE. | CSV / TSV / NDJSON | -H "format_compression:auto" |
28+
| format_field_delimiter | Tells Databend the characters used in the file to separate fields.<br /> Default for CSV files: `,`.<br /> Default for TSV files: `\t`.<br /> Hive output files use the [SOH control character (\x01)](https://en.wikipedia.org/wiki/C0_and_C1_control_codes#SOH) as the field delimiter. | CSV / TSV | -H "format_field_delimiter:," |
29+
| format_record_delimiter | Tells Databend the new line characters used in the file to separate records.<br /> Default: `\n`. | CSV / TSV | -H "format_record_delimiter:\n" |
30+
| format_quote | Tells Databend the quote characters for strings in CSV file.<br /> Default: `"` (double quote). | CSV | |
3131

3232
## Alternatives to Streaming Load API
3333

src/query/formats/src/format_option_checker.rs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -138,14 +138,14 @@ impl FormatOptionChecker for TSVFormatOptionChecker {
138138
"TSV".to_string()
139139
}
140140

141-
fn check_quote(&self, quote: &mut String) -> Result<()> {
142-
check_quote(quote, "\'")
143-
}
144-
145141
fn check_escape(&self, escape: &mut String) -> Result<()> {
146142
check_escape(escape, "\\")
147143
}
148144

145+
fn check_quote(&self, quote: &mut String) -> Result<()> {
146+
check_quote(quote, "\'")
147+
}
148+
149149
fn check_record_delimiter(&self, record_delimiter: &mut String) -> Result<()> {
150150
check_record_delimiter(record_delimiter)
151151
}
@@ -222,7 +222,7 @@ pub fn check_quote(option: &mut String, default: &str) -> Result<()> {
222222
pub fn check_field_delimiter(option: &mut String, default: &str) -> Result<()> {
223223
if option.is_empty() {
224224
*option = default.to_string()
225-
} else if option.len() > 1 {
225+
} else if option.as_bytes().len() > 1 {
226226
return Err(ErrorCode::InvalidArgument(
227227
"field_delimiter can only contain one char",
228228
));

src/query/formats/tests/it/output_format_tcsv.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,3 +108,30 @@ fn test_data_block_nullable() -> Result<()> {
108108
fn test_data_block_not_nullable() -> Result<()> {
109109
test_data_block(false)
110110
}
111+
112+
#[test]
113+
fn test_field_delimiter_with_ascii_control_code() -> Result<()> {
114+
let block = get_simple_block(false)?;
115+
let schema = block.schema().clone();
116+
117+
let settings = Settings::default_settings("default")?;
118+
settings.set_settings(
119+
"format_record_delimiter".to_string(),
120+
"\r\n".to_string(),
121+
false,
122+
)?;
123+
settings.set_settings(
124+
"format_field_delimiter".to_string(),
125+
"\x01".to_string(),
126+
false,
127+
)?;
128+
129+
let mut formatter = get_output_format_clickhouse_with_setting("csv", schema, &settings)?;
130+
let buffer = formatter.serialize_block(&block)?;
131+
132+
let csv_block = String::from_utf8(buffer)?;
133+
let expect = "1\x01\"a\"\x01true\x011.1\x01\"1970-01-02\"\r\n2\x01\"b\"\"\"\x01true\x012.2\x01\"1970-01-03\"\r\n3\x01\"c'\"\x01false\x01NaN\x01\"1970-01-04\"\r\n";
134+
assert_eq!(&csv_block, expect);
135+
136+
Ok(())
137+
}

0 commit comments

Comments
 (0)