Skip to content

Commit 6334a0e

Browse files
authored
Merge pull request #9175 from BohuTANG/dev-unescape-string
feat: change the unescape to enquote crate and support \\x01 -> \x01
2 parents 1292882 + b567778 commit 6334a0e

File tree

10 files changed

+79
-92
lines changed

10 files changed

+79
-92
lines changed

Cargo.lock

Lines changed: 10 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/common/io/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ bincode = { version = "2.0.0-rc.1", features = ["serde", "std"] }
2121
bytes = "1.2.1"
2222
chrono = { workspace = true }
2323
chrono-tz = { workspace = true }
24+
enquote = "1.1.0"
2425
lexical-core = "0.8.5"
2526
micromarshal = "0.2.1"
2627
ordered-float = "3.1.0"

src/common/io/src/utils.rs

Lines changed: 6 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
use std::cmp;
1616

17+
use common_exception::ErrorCode;
1718
use common_exception::Result;
1819

1920
pub fn convert_byte_size(num: f64) -> String {
@@ -80,63 +81,11 @@ pub fn deserialize_from_slice<T: serde::de::DeserializeOwned>(slice: &mut &[u8])
8081
Ok(value)
8182
}
8283

83-
pub fn is_control_ascii(c: u8) -> bool {
84-
c <= 31
85-
}
86-
87-
pub fn parse_escape_string(bs: &[u8]) -> String {
88-
let bs = parse_escape_bytes(bs);
89-
90-
let mut cs = Vec::with_capacity(bs.len());
91-
for b in bs {
92-
cs.push(b as char);
93-
}
94-
cs.iter().collect()
95-
}
96-
97-
pub fn parse_escape_bytes(bs: &[u8]) -> Vec<u8> {
98-
let mut vs = Vec::with_capacity(bs.len());
99-
let mut i = 0;
100-
while i < bs.len() {
101-
if bs[i] == b'\\' {
102-
if i + 1 < bs.len() {
103-
let c = parse_escape_byte(bs[i + 1]);
104-
if c != b'\\'
105-
&& c != b'\''
106-
&& c != b'"'
107-
&& c != b'`'
108-
&& c != b'/'
109-
&& !is_control_ascii(c)
110-
{
111-
vs.push(b'\\');
112-
}
113-
114-
vs.push(c);
115-
i += 2;
116-
} else {
117-
// end with \
118-
vs.push(b'\\');
119-
break;
120-
}
121-
} else {
122-
vs.push(bs[i]);
123-
i += 1;
124-
}
125-
}
126-
127-
vs
128-
}
129-
130-
// https://doc.rust-lang.org/reference/tokens.html
131-
pub fn parse_escape_byte(b: u8) -> u8 {
132-
match b {
133-
b'e' => b'\x1B',
134-
b'n' => b'\n',
135-
b'r' => b'\r',
136-
b't' => b'\t',
137-
b'0' => b'\0',
138-
_ => b,
139-
}
84+
/// Returns string after processing escapes.
85+
/// This is used to unescape settings strings, e.g. unescaping format_field_delimiter from `\\x01` to `\x01`.
86+
pub fn unescape_string(escape_str: &str) -> Result<String> {
87+
enquote::unescape(escape_str, None)
88+
.map_err(|e| ErrorCode::Internal(format!("unescape:{} error:{:?}", escape_str, e)))
14089
}
14190

14291
/// Mask a string by "******", but keep `unmask_len` of suffix.

src/common/io/tests/it/utils.rs

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,15 +33,22 @@ fn convert_test() {
3333
}
3434

3535
#[test]
36-
fn parse_escape() {
36+
fn test_unescape_string() {
3737
let cases = vec![
3838
vec!["a", "a"],
3939
vec!["abc", "abc"],
40+
vec!["\\x01", "\x01"],
41+
vec!["\x01", "\x01"],
4042
vec!["\t\nabc", "\t\nabc"],
43+
vec!["\"\t\nabc\"", "\"\t\nabc\""],
44+
vec!["\"\\t\nabc\"", "\"\t\nabc\""],
45+
vec!["'\\t\nabc'", "'\t\nabc'"],
4146
vec!["\\t\\nabc", "\t\nabc"],
47+
vec!["\\\\", r"\"],
48+
vec!["\\\\", "\\"],
4249
];
4350

4451
for c in cases {
45-
assert_eq!(parse_escape_bytes(c[0].as_bytes()), c[1].as_bytes());
52+
assert_eq!(unescape_string(c[0]).unwrap(), c[1]);
4653
}
4754
}

src/query/service/src/servers/http/v1/load.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use common_base::base::ProgressValues;
2222
use common_base::base::TrySpawn;
2323
use common_exception::ErrorCode;
2424
use common_exception::Result;
25-
use common_io::prelude::parse_escape_string;
25+
use common_io::prelude::unescape_string;
2626
use common_pipeline_sources::processors::sources::input_formats::InputContext;
2727
use common_pipeline_sources::processors::sources::input_formats::StreamingReadBatch;
2828
use futures::StreamExt;
@@ -103,9 +103,11 @@ pub async fn streaming_load(
103103
for (key, value) in req.headers().iter() {
104104
if settings.has_setting(key.as_str()) {
105105
let value = value.to_str().map_err(InternalServerError)?;
106-
let value = parse_escape_string(remove_quote(value.as_bytes()));
106+
let unquote =
107+
std::str::from_utf8(remove_quote(value.as_bytes())).map_err(InternalServerError)?;
108+
let value = unescape_string(unquote).map_err(InternalServerError)?;
107109
settings
108-
.set_settings(key.to_string(), value, false)
110+
.set_settings(key.to_string(), value.to_string(), false)
109111
.map_err(InternalServerError)?
110112
}
111113
}

src/query/sql/src/planner/binder/copy.rs

Lines changed: 17 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ use common_catalog::table_context::TableContext;
3232
use common_config::GlobalConfig;
3333
use common_exception::ErrorCode;
3434
use common_exception::Result;
35-
use common_io::prelude::parse_escape_string;
35+
use common_io::prelude::unescape_string;
3636
use common_meta_types::FileFormatOptions;
3737
use common_meta_types::StageFileFormatType;
3838
use common_meta_types::UserStageInfo;
@@ -611,54 +611,44 @@ pub fn parse_copy_file_format_options(
611611
.parse::<u64>()?;
612612

613613
// Field delimiter.
614-
let field_delimiter = parse_escape_string(
614+
let field_delimiter = unescape_string(
615615
file_format_options
616616
.get("field_delimiter")
617-
.unwrap_or(&"".to_string())
618-
.as_bytes(),
619-
);
617+
.unwrap_or(&"".to_string()),
618+
)?;
620619

621620
// Record delimiter.
622-
let record_delimiter = parse_escape_string(
621+
let record_delimiter = unescape_string(
623622
file_format_options
624623
.get("record_delimiter")
625-
.unwrap_or(&"".to_string())
626-
.as_bytes(),
627-
);
624+
.unwrap_or(&"".to_string()),
625+
)?;
628626

629627
// NaN display.
630-
let nan_display = parse_escape_string(
628+
let nan_display = unescape_string(
631629
file_format_options
632630
.get("nan_display")
633-
.unwrap_or(&"".to_string())
634-
.as_bytes(),
635-
);
631+
.unwrap_or(&"".to_string()),
632+
)?;
636633

637634
// Escape
638-
let escape = parse_escape_string(
639-
file_format_options
640-
.get("escape")
641-
.unwrap_or(&"".to_string())
642-
.as_bytes(),
643-
);
635+
let escape = unescape_string(file_format_options.get("escape").unwrap_or(&"".to_string()))?;
644636

645637
// Compression.
646-
let compression = parse_escape_string(
638+
let compression = unescape_string(
647639
file_format_options
648640
.get("compression")
649-
.unwrap_or(&"none".to_string())
650-
.as_bytes(),
651-
)
641+
.unwrap_or(&"none".to_string()),
642+
)?
652643
.parse()
653644
.map_err(ErrorCode::UnknownCompressionType)?;
654645

655646
// Row tag in xml.
656-
let row_tag = parse_escape_string(
647+
let row_tag = unescape_string(
657648
file_format_options
658649
.get("row_tag")
659-
.unwrap_or(&"".to_string())
660-
.as_bytes(),
661-
);
650+
.unwrap_or(&"".to_string()),
651+
)?;
662652

663653
Ok(FileFormatOptions {
664654
format: file_format,

tests/suites/1_stateful/05_formats/05_02_csv/05_02_01_csv_escape.sh

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@
33
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
44
. "$CURDIR"/../../../../shell_env.sh
55

6+
7+
# Should be <root>/tests/data/
8+
DATADIR=$(realpath $CURDIR/../../../data/)
9+
610
echo "drop table if exists test_csv" | $MYSQL_CLIENT_CONNECT
711
echo "drop table if exists test_csv2" | $MYSQL_CLIENT_CONNECT
812

@@ -41,16 +45,16 @@ curl -sH "insert_sql:insert into test_csv format CSV" -F "upload=@/tmp/escape_no
4145
echo "select * from test_csv" | $MYSQL_CLIENT_CONNECT
4246
echo "truncate table test_csv" | $MYSQL_CLIENT_CONNECT
4347

44-
curl -sH "insert_sql:insert into test_csv format CSV" -H "format_escape:'\\'" -F "upload=@/tmp/escape_slash.csv" -u root: -XPUT "http://localhost:${QUERY_HTTP_HANDLER_PORT}/v1/streaming_load" | grep -c "SUCCESS"
48+
curl -sH "insert_sql:insert into test_csv format CSV" -H "format_escape:'\\\'" -F "upload=@/tmp/escape_slash.csv" -u root: -XPUT "http://localhost:${QUERY_HTTP_HANDLER_PORT}/v1/streaming_load" | grep -c "SUCCESS"
4549
echo "select * from test_csv" | $MYSQL_CLIENT_CONNECT
4650
echo "truncate table test_csv" | $MYSQL_CLIENT_CONNECT
4751

4852
aws --endpoint-url http://127.0.0.1:9900/ s3 cp /tmp/escape_slash2.csv s3://testbucket/admin/data/csv/escape_slash2.csv > /dev/null 2>&1
49-
echo "copy into test_csv from 's3://testbucket/admin/data/csv/escape_slash2.csv' connection=(aws_key_id='minioadmin' aws_secret_key='minioadmin' endpoint_url='http://127.0.0.1:9900/') FILE_FORMAT = (type = 'CSV' escape='\\\')" | $MYSQL_CLIENT_CONNECT
53+
echo "copy into test_csv from 'fs:///tmp/escape_slash2.csv' FILE_FORMAT = (type = 'CSV' escape='\\\\\\\\')" | $MYSQL_CLIENT_CONNECT
5054
echo "select * from test_csv" | $MYSQL_CLIENT_CONNECT
5155

5256
aws --endpoint-url http://127.0.0.1:9900/ s3 cp /tmp/escape_slash3.csv s3://testbucket/admin/data/csv/escape_slash3.csv > /dev/null 2>&1
53-
echo "copy into test_csv2 from 's3://testbucket/admin/data/csv/escape_slash3.csv' connection=(aws_key_id='minioadmin' aws_secret_key='minioadmin' endpoint_url='http://127.0.0.1:9900/') FILE_FORMAT = (type = 'CSV' escape='\\\' skip_header=1)" | $MYSQL_CLIENT_CONNECT
57+
echo "copy into test_csv2 from 'fs:///tmp/escape_slash3.csv' FILE_FORMAT = (type = 'CSV' escape='\\\\\\\\' skip_header=1)" | $MYSQL_CLIENT_CONNECT
5458
echo "select * from test_csv2" | $MYSQL_CLIENT_CONNECT
5559

5660
echo "drop table if exists test_csv" | $MYSQL_CLIENT_CONNECT
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
1
2+
4
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
#!/usr/bin/env bash
2+
3+
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
4+
. "$CURDIR"/../../../../shell_env.sh
5+
6+
echo "drop table if exists test_x01_csv" | $MYSQL_CLIENT_CONNECT
7+
8+
echo "CREATE TABLE test_x01_csv
9+
(
10+
a VARCHAR,
11+
b Int,
12+
c VARCHAR
13+
);" | $MYSQL_CLIENT_CONNECT
14+
15+
curl -sH "insert_sql:insert into test_x01_csv format CSV" -H "format_field_delimiter:\x01" -H "format_skip_header:1" -H "format_record_delimiter:\r\n" -F "upload=@${CURDIR}/x01_field_delimiter.csv" \
16+
-u root: -XPUT "http://localhost:${QUERY_HTTP_HANDLER_PORT}/v1/streaming_load" | grep -c "SUCCESS"
17+
echo "select count() from test_x01_csv" | $MYSQL_CLIENT_CONNECT
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
NameAgeOccupation
2+
Donald John Trump80president
3+
马斯克43特斯拉
4+
测试39TT
5+
测试39test

0 commit comments

Comments
 (0)