Merge pull request #9160 from BohuTANG/dev-format-delimiter

BohuTANG · web-flow · commit 8aab437825c9 · 2022-12-09T09:10:18.000+08:00
fix(format): support ASCII control code hex as format field delimiter
diff --git a/docs/doc/11-integrations/00-api/03-streaming-load.md b/docs/doc/11-integrations/00-api/03-streaming-load.md
@@ -20,14 +20,14 @@ curl -H "<parameter>:<value>"  [-H "<parameter>:<value>"...] -F "upload=@<file_l
 
 The request usually includes many occurrences of the argument `-H` and each is followed by one of the following parameters to tell Databend how to handle the file you're loading data from. Please note that `insert_sql` is required, and other parameters are optional.
 
-| Parameter               | Values                                                                                                                                                                                                                                                                                                     | Supported Formats         | Examples                                       |
-|-------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|------------------------------------------------|
-| insert_sql              | [INSERT_statement] + format [file_format]                                                                                                                                                                                                                                                                  | All                       | -H "insert_sql: insert into ontime format CSV" |
-| format_skip_header      | Tells Databend how many lines at the beginning of the file to skip for header.<br /> 0 (default): No lines to skip;<br /> 1: Skip the first line;<br /> N: Skip the first N lines.                                                                                                                               | CSV / TSV / NDJSON | -H "format_skip_header: 1"                     |
-| format_compression      | Tells Databend the compression format of the file.<br /> NONE (default): Do NOT decompress the file;<br /> AUTO: Automatically decompress the file by suffix;<br />  You can also use one of these values to explicitly specify the compression format: GZIP \| BZ2 \| BROTLI \| ZSTD \|  DEFALTE \| RAW_DEFLATE. | CSV / TSV / NDJSON | -H "format_compression:auto"                   |
-| format_field_delimiter  | Tells Databend the characters used in the file to separate fields.<br /> Default for CSV files: `,`.<br /> Default for TSV files: `\t`.                                                                                                                                                                        | CSV / TSV                 | -H "format_field_delimiter:,"                  |
-| format_record_delimiter | Tells Databend the new line characters used in the file to separate records.<br />  Default: `\n`.                                                                                                                                                                                                           | CSV / TSV                 | -H "format_recorder_delimiter:\n"              |
-| format_quote           | Tells Databend the quote characters for strings in CSV file.<br /> Default: ""(Double quotes).                                                                                                                                                                                                                             | CSV                       |                                                |
+| Parameter               | Values                                                                                                                                                                                                                                                                                    | Supported Formats         | Examples                                                                                                                              |
+|-------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------|---------------------------------------------------------------------------------------------------------------------------------------|
+| insert_sql              | [INSERT_statement] + format [file_format]                                                                                                                                                                                                                                                 | All                       | -H "insert_sql: insert into ontime format CSV"                                                                                        |
+| format_skip_header      | Tells Databend how many lines at the beginning of the file to skip for header.<br /> 0 (default): No lines to skip;<br /> 1: Skip the first line;<br /> N: Skip the first N lines.                                                                                                        | CSV / TSV / NDJSON | -H "format_skip_header: 1"                                                                                                            |
+| format_compression      | Tells Databend the compression format of the file.<br /> NONE (default): Do NOT decompress the file;<br /> AUTO: Automatically decompress the file by suffix;<br />  You can also use one of these values to explicitly specify the compression format: GZIP \| BZ2 \| BROTLI \| ZSTD \| DEFLATE \| RAW_DEFLATE.            | CSV / TSV / NDJSON | -H "format_compression:auto"                                                                                                          |
+| format_field_delimiter  | Tells Databend the characters used in the file to separate fields.<br /> Default for CSV files: `,`.<br /> Default for TSV files: `\t`.<br /> Hive output files use the [SOH control character (\x01)](https://en.wikipedia.org/wiki/C0_and_C1_control_codes#SOH) as the field delimiter.   | CSV / TSV                 | -H "format_field_delimiter:,"                                                                                                         |
+| format_record_delimiter | Tells Databend the new line characters used in the file to separate records.<br />  Default: `\n`.                                                                                                                                                                                        | CSV / TSV                 | -H "format_record_delimiter:\n"                                                                                                       |
+| format_quote            | Tells Databend the quote character used for strings in CSV files.<br /> Default: `"` (double quote).                                                                                                                                                                                      | CSV                       |                                                                                                                                       |
 
 ## Alternatives to Streaming Load API
 
diff --git a/src/query/formats/src/format_option_checker.rs b/src/query/formats/src/format_option_checker.rs
@@ -138,14 +138,14 @@ impl FormatOptionChecker for TSVFormatOptionChecker {
         "TSV".to_string()
     }
 
-    fn check_quote(&self, quote: &mut String) -> Result<()> {
-        check_quote(quote, "\'")
-    }
-
     fn check_escape(&self, escape: &mut String) -> Result<()> {
         check_escape(escape, "\\")
     }
 
+    fn check_quote(&self, quote: &mut String) -> Result<()> {
+        check_quote(quote, "\'")
+    }
+
     fn check_record_delimiter(&self, record_delimiter: &mut String) -> Result<()> {
         check_record_delimiter(record_delimiter)
     }
@@ -222,7 +222,7 @@ pub fn check_quote(option: &mut String, default: &str) -> Result<()> {
 pub fn check_field_delimiter(option: &mut String, default: &str) -> Result<()> {
     if option.is_empty() {
         *option = default.to_string()
-    } else if option.len() > 1 {
+    } else if option.as_bytes().len() > 1 {
         return Err(ErrorCode::InvalidArgument(
             "field_delimiter can only contain one char",
         ));
diff --git a/src/query/formats/tests/it/output_format_tcsv.rs b/src/query/formats/tests/it/output_format_tcsv.rs
@@ -108,3 +108,30 @@ fn test_data_block_nullable() -> Result<()> {
 fn test_data_block_not_nullable() -> Result<()> {
     test_data_block(false)
 }
+
+#[test]
+fn test_field_delimiter_with_ascii_control_code() -> Result<()> {
+    let block = get_simple_block(false)?;
+    let schema = block.schema().clone();
+
+    let settings = Settings::default_settings("default")?;
+    settings.set_settings(
+        "format_record_delimiter".to_string(),
+        "\r\n".to_string(),
+        false,
+    )?;
+    settings.set_settings(
+        "format_field_delimiter".to_string(),
+        "\x01".to_string(),
+        false,
+    )?;
+
+    let mut formatter = get_output_format_clickhouse_with_setting("csv", schema, &settings)?;
+    let buffer = formatter.serialize_block(&block)?;
+
+    let csv_block = String::from_utf8(buffer)?;
+    let expect = "1\x01\"a\"\x01true\x011.1\x01\"1970-01-02\"\r\n2\x01\"b\"\"\"\x01true\x012.2\x01\"1970-01-03\"\r\n3\x01\"c'\"\x01false\x01NaN\x01\"1970-01-04\"\r\n";
+    assert_eq!(&csv_block, expect);
+
+    Ok(())
+}