Skip to content

Commit 8202fed

Browse files
authored
Merge pull request #7539 from sundy-li/fv2-string
feat(query): add char/ord/soundex to function-v2
2 parents fa49f1c + b2fb291 commit 8202fed

File tree

10 files changed

+515
-83
lines changed

10 files changed

+515
-83
lines changed

src/query/expression/src/function.rs

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@
1313
// limitations under the License.
1414

1515
use std::collections::HashMap;
16+
use std::ops::BitAnd;
1617
use std::sync::Arc;
1718

1819
use chrono_tz::Tz;
20+
use common_arrow::arrow::bitmap::MutableBitmap;
1921
use serde::Deserialize;
2022
use serde::Serialize;
2123

@@ -24,8 +26,11 @@ use crate::property::FunctionProperty;
2426
use crate::types::nullable::NullableColumn;
2527
use crate::types::nullable::NullableDomain;
2628
use crate::types::*;
29+
use crate::util::constant_bitmap;
2730
use crate::values::Value;
2831
use crate::values::ValueRef;
32+
use crate::Column;
33+
use crate::Scalar;
2934

3035
#[derive(Debug, Clone)]
3136
pub struct FunctionSignature {
@@ -1042,3 +1047,53 @@ pub fn passthrough_nullable_3_arg<I1: ArgType, I2: ArgType, I3: ArgType, O: ArgT
10421047
}
10431048
}
10441049
}
1050+
1051+
pub fn wrap_nullable<F>(
1052+
f: F,
1053+
) -> impl Fn(&[ValueRef<AnyType>], &GenericMap) -> Result<Value<AnyType>, String> + Copy
1054+
where F: Fn(&[ValueRef<AnyType>], &GenericMap) -> Result<Value<AnyType>, String> + Copy {
1055+
move |args, generics| {
1056+
type T = NullableType<AnyType>;
1057+
type Result = AnyType;
1058+
1059+
let mut bitmap: Option<MutableBitmap> = None;
1060+
let mut nonull_args: Vec<ValueRef<Result>> = Vec::with_capacity(args.len());
1061+
1062+
let mut len = 1;
1063+
for arg in args {
1064+
let arg = arg.try_downcast::<T>().unwrap();
1065+
match arg {
1066+
ValueRef::Scalar(None) => return Ok(Value::Scalar(Scalar::Null)),
1067+
ValueRef::Scalar(Some(s)) => {
1068+
nonull_args.push(ValueRef::Scalar(s.clone()));
1069+
}
1070+
ValueRef::Column(v) => {
1071+
len = v.len();
1072+
nonull_args.push(ValueRef::Column(v.column.clone()));
1073+
bitmap = match bitmap {
1074+
Some(m) => Some(m.bitand(&v.validity)),
1075+
None => Some(v.validity.clone().make_mut()),
1076+
};
1077+
}
1078+
}
1079+
}
1080+
let nonull_results = f(&nonull_args, generics)?;
1081+
let bitmap = bitmap.unwrap_or_else(|| constant_bitmap(true, len));
1082+
match nonull_results {
1083+
Value::Scalar(s) => {
1084+
if bitmap.get(0) {
1085+
Ok(Value::Scalar(Result::upcast_scalar(s)))
1086+
} else {
1087+
Ok(Value::Scalar(Scalar::Null))
1088+
}
1089+
}
1090+
Value::Column(column) => {
1091+
let result = Column::Nullable(Box::new(NullableColumn {
1092+
column,
1093+
validity: bitmap.into(),
1094+
}));
1095+
Ok(Value::Column(Result::upcast_column(result)))
1096+
}
1097+
}
1098+
}
1099+
}

src/query/expression/src/values.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -212,7 +212,7 @@ impl<T: ArgType> Value<T> {
212212
}
213213

214214
impl<'a> ValueRef<'a, AnyType> {
215-
pub fn try_downcast<T: ArgType>(&self) -> Option<ValueRef<'_, T>> {
215+
pub fn try_downcast<T: ValueType>(&self) -> Option<ValueRef<'_, T>> {
216216
Some(match self {
217217
ValueRef::Scalar(scalar) => ValueRef::Scalar(T::try_downcast_scalar(scalar)?),
218218
ValueRef::Column(col) => ValueRef::Column(T::try_downcast_column(col)?),

src/query/functions-v2/src/scalars/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ mod boolean;
2222
mod control;
2323
mod datetime;
2424
mod math;
25+
mod soundex;
2526
mod string;
2627
mod string_multi_args;
2728

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// Copyright 2021 Datafuse Labs.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
/// Namespace for the character-classification helpers used by the `soundex`
/// string function.
pub(crate) struct Soundex;

impl Soundex {
    /// Returns the soundex digit (as an ASCII byte) for `i`, ignoring ASCII
    /// case.
    ///
    /// Characters that belong to no consonant group map to `b'0'`, so the
    /// result is always `Some`.
    #[inline(always)]
    pub fn number_map(i: char) -> Option<u8> {
        let digit = match i.to_ascii_lowercase() {
            'b' | 'f' | 'p' | 'v' => b'1',
            'c' | 'g' | 'j' | 'k' | 'q' | 's' | 'x' | 'z' => b'2',
            'd' | 't' => b'3',
            'l' => b'4',
            'm' | 'n' => b'5',
            'r' => b'6',
            _ => b'0',
        };
        Some(digit)
    }

    /// Returns `true` if `c` is one of the letters a, e, i, o, u, y, h, w
    /// (in either ASCII case).
    #[inline(always)]
    pub fn is_drop(c: char) -> bool {
        const DROPPED: [char; 8] = ['a', 'e', 'i', 'o', 'u', 'y', 'h', 'w'];
        DROPPED.contains(&c.to_ascii_lowercase())
    }

    /// Returns `true` if `c` is an ASCII letter or has a code point of
    /// `0xC0` or above.
    // https://github.com/mysql/mysql-server/blob/3290a66c89eb1625a7058e0ef732432b6952b435/sql/item_strfunc.cc#L1919
    #[inline(always)]
    pub fn is_uni_alphabetic(c: char) -> bool {
        c.is_ascii_alphabetic() || u32::from(c) >= 0xC0
    }
}

src/query/functions-v2/src/scalars/string.rs

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ use std::io::Write;
1717

1818
use bstr::ByteSlice;
1919
use common_expression::types::number::NumberDomain;
20+
use common_expression::types::number::UInt64Type;
2021
use common_expression::types::string::StringColumn;
2122
use common_expression::types::string::StringColumnBuilder;
2223
use common_expression::types::GenericMap;
@@ -29,6 +30,8 @@ use common_expression::Value;
2930
use common_expression::ValueRef;
3031
use itertools::izip;
3132

33+
use super::soundex::Soundex;
34+
3235
pub fn register(registry: &mut FunctionRegistry) {
3336
registry.register_passthrough_nullable_1_arg::<StringType, StringType, _, _>(
3437
"upper",
@@ -534,6 +537,74 @@ pub fn register(registry: &mut FunctionRegistry) {
534537
},
535538
),
536539
);
540+
541+
registry.register_1_arg::<StringType, UInt64Type, _, _>(
    "ord",
    FunctionProperty::default(),
    |_| None,
    // Returns the code of the leftmost character of the input:
    // - empty input            -> 0
    // - ASCII first byte       -> that byte's value
    // - multi-byte first char  -> the bytes of its UTF-8 encoding read as
    //                             one big-endian integer
    // - invalid leading bytes  -> 0
    |str: &[u8]| {
        let first = match str.first() {
            Some(byte) => *byte,
            None => return 0_u64,
        };
        if first.is_ascii() {
            return u64::from(first);
        }

        // A UTF-8 encoded character is at most 4 bytes long, and a lone
        // non-ASCII byte is never valid UTF-8, so only prefixes of 2..=4
        // bytes can form the first character. Bounding the scan avoids
        // re-validating every prefix of a long input whose leading bytes are
        // not valid UTF-8.
        let longest = str.len().min(4);
        (2..=longest)
            .map(|end| &str[..end])
            .find(|prefix| std::str::from_utf8(prefix).is_ok())
            .map_or(0_u64, |ch| {
                ch.iter()
                    .fold(0_u64, |acc, byte| (acc << 8) | u64::from(*byte))
            })
    },
);
565+
566+
registry.register_passthrough_nullable_1_arg::<StringType, StringType, _, _>(
    "soundex",
    FunctionProperty::default(),
    |_| None,
    vectorize_string_to_string(
        |col| usize::max(col.data.len(), 4 * col.len()),
        |val, writer| {
            // Soundex code of the most recently emitted character; `None`
            // until the leading letter has been written.
            let mut prev_code = None;
            let mut emitted = 0;

            for ch in String::from_utf8_lossy(val).chars() {
                let code = Soundex::number_map(ch);
                match prev_code {
                    None => {
                        // Still looking for the leading letter.
                        if !Soundex::is_uni_alphabetic(ch) {
                            continue;
                        }
                        writer.put_char(ch.to_ascii_uppercase());
                    }
                    Some(_) => {
                        // Keep only ASCII letters that are not dropped and
                        // whose code differs from the previous one.
                        let keep = ch.is_ascii_alphabetic()
                            && !Soundex::is_drop(ch)
                            && code.is_some()
                            && code != prev_code;
                        if !keep {
                            continue;
                        }
                        writer.put_char(code.unwrap() as char);
                    }
                }
                prev_code = code;
                emitted += 1;
            }

            // Pad with '0' up to four characters.
            for _ in emitted..4 {
                writer.put_char('0');
            }

            writer.commit_row();
            Ok(())
        },
    ),
);
537608
}
538609

539610
// Vectorize a string-to-string function with a custom `estimate_bytes`.

0 commit comments

Comments
 (0)