Skip to content

Commit 95160e4

Browse files
committed
Merge remote-tracking branch 'xuanwo/unwind-safe-operator' into unwind-safe-operator
2 parents 31966ea + 10ee984 commit 95160e4

File tree

36 files changed

+757
-802
lines changed

36 files changed

+757
-802
lines changed

Cargo.lock

Lines changed: 10 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/doc/00-overview/index.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ Databend uses the latest techniques in vectorized query processing to allow you
2525

2626
- __Easy to Use__
2727

28-
Databend has no indexes to build, no manual tuning required, no manual figuring out partitions or shard data, it’s all done for you as data is loaded into the table.
28+
No manual tuning is required. You don't need to manually index, partition, or shard the data. Once you load your data into a table, Databend will handle everything else.
2929

3030
## Design Overview
3131

src/common/io/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ bincode = { version = "2.0.0-rc.1", features = ["serde", "std"] }
2121
bytes = "1.2.1"
2222
chrono = { workspace = true }
2323
chrono-tz = { workspace = true }
24+
enquote = "1.1.0"
2425
lexical-core = "0.8.5"
2526
micromarshal = "0.2.1"
2627
ordered-float = "3.1.0"

src/common/io/src/utils.rs

Lines changed: 6 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
use std::cmp;
1616

17+
use common_exception::ErrorCode;
1718
use common_exception::Result;
1819

1920
pub fn convert_byte_size(num: f64) -> String {
@@ -80,63 +81,11 @@ pub fn deserialize_from_slice<T: serde::de::DeserializeOwned>(slice: &mut &[u8])
8081
Ok(value)
8182
}
8283

83-
pub fn is_control_ascii(c: u8) -> bool {
84-
c <= 31
85-
}
86-
87-
pub fn parse_escape_string(bs: &[u8]) -> String {
88-
let bs = parse_escape_bytes(bs);
89-
90-
let mut cs = Vec::with_capacity(bs.len());
91-
for b in bs {
92-
cs.push(b as char);
93-
}
94-
cs.iter().collect()
95-
}
96-
97-
pub fn parse_escape_bytes(bs: &[u8]) -> Vec<u8> {
98-
let mut vs = Vec::with_capacity(bs.len());
99-
let mut i = 0;
100-
while i < bs.len() {
101-
if bs[i] == b'\\' {
102-
if i + 1 < bs.len() {
103-
let c = parse_escape_byte(bs[i + 1]);
104-
if c != b'\\'
105-
&& c != b'\''
106-
&& c != b'"'
107-
&& c != b'`'
108-
&& c != b'/'
109-
&& !is_control_ascii(c)
110-
{
111-
vs.push(b'\\');
112-
}
113-
114-
vs.push(c);
115-
i += 2;
116-
} else {
117-
// end with \
118-
vs.push(b'\\');
119-
break;
120-
}
121-
} else {
122-
vs.push(bs[i]);
123-
i += 1;
124-
}
125-
}
126-
127-
vs
128-
}
129-
130-
// https://doc.rust-lang.org/reference/tokens.html
131-
pub fn parse_escape_byte(b: u8) -> u8 {
132-
match b {
133-
b'e' => b'\x1B',
134-
b'n' => b'\n',
135-
b'r' => b'\r',
136-
b't' => b'\t',
137-
b'0' => b'\0',
138-
_ => b,
139-
}
84+
/// Returns string after processing escapes.
85+
/// This is used to unescape settings strings, e.g. unescaping format_field_delimiter from `\\x01` to `\x01`.
86+
pub fn unescape_string(escape_str: &str) -> Result<String> {
87+
enquote::unescape(escape_str, None)
88+
.map_err(|e| ErrorCode::Internal(format!("unescape:{} error:{:?}", escape_str, e)))
14089
}
14190

14291
/// Mask a string by "******", but keep `unmask_len` of suffix.

src/common/io/tests/it/utils.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,15 +33,22 @@ fn convert_test() {
3333
}
3434

3535
#[test]
36-
fn parse_escape() {
36+
fn test_unescape_string() {
3737
let cases = vec![
3838
vec!["a", "a"],
3939
vec!["abc", "abc"],
40+
vec!["\\x01", "\x01"],
41+
vec!["\x01", "\x01"],
4042
vec!["\t\nabc", "\t\nabc"],
43+
vec!["\"\t\nabc\"", "\"\t\nabc\""],
44+
vec!["\"\\t\nabc\"", "\"\t\nabc\""],
45+
vec!["'\\t\nabc'", "'\t\nabc'"],
4146
vec!["\\t\\nabc", "\t\nabc"],
47+
vec!["\\\\", r"\"],
48+
vec!["\\\\", "\\"],
4249
];
4350

4451
for c in cases {
45-
assert_eq!(parse_escape_bytes(c[0].as_bytes()), c[1].as_bytes());
52+
assert_eq!(unescape_string(c[0]).unwrap(), c[1]);
4653
}
4754
}

src/query/formats/src/format_option_checker.rs

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -230,17 +230,25 @@ pub fn check_field_delimiter(option: &mut String, default: &str) -> Result<()> {
230230
Ok(())
231231
}
232232

233+
/// The record delimiter must be `\r\n` or a single byte (u8); an empty option defaults to `\n`.
233234
pub fn check_record_delimiter(option: &mut String) -> Result<()> {
234-
if option.is_empty() {
235-
*option = "\n".to_string()
236-
} else {
237-
let o = option.as_str();
238-
if o != "\n" && o != "\r\n" {
235+
match option.len() {
236+
0 => *option = "\n".to_string(),
237+
1 => {}
238+
2 => {
239+
if option != "\r\n" {
240+
return Err(ErrorCode::InvalidArgument(
241+
"record_delimiter with two chars can only be '\\r\\n'",
242+
));
243+
};
244+
}
245+
_ => {
239246
return Err(ErrorCode::InvalidArgument(
240-
"record_delimiter can only be '\\n' or '\\r\\n'",
247+
"record_delimiter can not more than two chars, please use one char or '\\r\\n'",
241248
));
242-
};
249+
}
243250
}
251+
244252
Ok(())
245253
}
246254

src/query/formats/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,5 +28,6 @@ pub use field_decoder::*;
2828
pub use file_format_type::parse_timezone;
2929
pub use file_format_type::FileFormatOptionsExt;
3030
pub use file_format_type::FileFormatTypeExt;
31+
pub use format_option_checker::check_record_delimiter;
3132

3233
use crate::common_settings::CommonSettings;
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// Copyright 2022 Datafuse Labs.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
use common_formats::check_record_delimiter;
16+
17+
/// This test code is written by OpenAI's GPT-3.
18+
#[test]
19+
fn test_check_record_delimiter() {
20+
let mut option = "".to_string();
21+
assert!(check_record_delimiter(&mut option).is_ok());
22+
assert_eq!(option, "\n");
23+
24+
let mut option = "|".to_string();
25+
assert!(check_record_delimiter(&mut option).is_ok());
26+
assert_eq!(option, "|");
27+
28+
let mut option = "\r\n".to_string();
29+
assert!(check_record_delimiter(&mut option).is_ok());
30+
assert_eq!(option, "\r\n");
31+
32+
let mut option = "foo".to_string();
33+
assert!(check_record_delimiter(&mut option).is_err());
34+
35+
let mut option = "|\r".to_string();
36+
assert!(check_record_delimiter(&mut option).is_err());
37+
}

src/query/formats/tests/it/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use common_meta_types::StageFileFormatType;
2121
use common_settings::Settings;
2222

2323
mod field_encoder;
24+
mod format_option_checker;
2425
mod output_format_json_each_row;
2526
mod output_format_tcsv;
2627
mod output_format_utils;

src/query/pipeline/sources/src/processors/sources/input_formats/impls/input_format_tsv.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ pub struct InputFormatTSV {}
3737
impl InputFormatTSV {
3838
#[allow(clippy::too_many_arguments)]
3939
fn read_row(
40+
field_delimiter: u8,
4041
field_decoder: &FieldDecoderTSV,
4142
buf: &[u8],
4243
deserializers: &mut Vec<common_datavalues::TypeDeserializerImpl>,
@@ -53,7 +54,7 @@ impl InputFormatTSV {
5354
let mut err_msg = None;
5455
let buf_len = buf.len();
5556
while pos <= buf_len {
56-
if pos == buf_len || buf[pos] == b'\t' {
57+
if pos == buf_len || buf[pos] == field_delimiter {
5758
let col_data = &buf[field_start..pos];
5859
if col_data.is_empty() {
5960
deserializers[column_index].de_default();
@@ -156,6 +157,7 @@ impl InputFormatTextBase for InputFormatTSV {
156157
for (i, end) in batch.row_ends.iter().enumerate() {
157158
let buf = &batch.data[start..*end]; // include \n
158159
Self::read_row(
160+
builder.ctx.field_delimiter,
159161
field_decoder,
160162
buf,
161163
columns,

0 commit comments

Comments
 (0)