-
Notifications
You must be signed in to change notification settings - Fork 1.5k
implement utf8_view for replace #12004
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
60ff4f6
7a3569b
b08f96e
14a124b
ab85d92
8ce4028
b948b0f
d26f21a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,16 +18,16 @@ | |
use std::any::Any; | ||
use std::sync::Arc; | ||
|
||
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait}; | ||
use arrow::array::{ArrayRef, GenericStringArray, OffsetSizeTrait, StringViewArray}; | ||
use arrow::datatypes::DataType; | ||
|
||
use datafusion_common::cast::as_generic_string_array; | ||
use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; | ||
use datafusion_common::{exec_err, Result}; | ||
use datafusion_expr::TypeSignature::*; | ||
use datafusion_expr::{ColumnarValue, Volatility}; | ||
use datafusion_expr::{ScalarUDFImpl, Signature}; | ||
|
||
use crate::utils::{make_scalar_function, utf8_to_str_type}; | ||
use crate::utils::make_scalar_function; | ||
|
||
#[derive(Debug)] | ||
pub struct ReplaceFunc { | ||
|
@@ -66,20 +66,49 @@ impl ScalarUDFImpl for ReplaceFunc { | |
} | ||
|
||
fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> { | ||
utf8_to_str_type(&arg_types[0], "replace") | ||
match arg_types[0].clone() { | ||
DataType::Utf8 => return Ok(DataType::Utf8), | ||
DataType::LargeUtf8 => return Ok(DataType::LargeUtf8), | ||
DataType::Utf8View => { | ||
println!("utf8view"); | ||
return Ok(DataType::Utf8View); | ||
} | ||
other => { | ||
exec_err!("Unsupported data type {other:?} for function replace") | ||
} | ||
} | ||
} | ||
|
||
fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> { | ||
match args[0].data_type() { | ||
DataType::Utf8 => make_scalar_function(replace::<i32>, vec![])(args), | ||
DataType::LargeUtf8 => make_scalar_function(replace::<i64>, vec![])(args), | ||
DataType::Utf8View => make_scalar_function(replace_view, vec![])(args), | ||
other => { | ||
exec_err!("Unsupported data type {other:?} for function replace") | ||
} | ||
} | ||
} | ||
} | ||
|
||
fn replace_view(args: &[ArrayRef]) -> Result<ArrayRef> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ❤️ |
||
let string_array = as_string_view_array(&args[0])?; | ||
let from_array = as_string_view_array(&args[1])?; | ||
let to_array = as_string_view_array(&args[2])?; | ||
|
||
let result = string_array | ||
.iter() | ||
.zip(from_array.iter()) | ||
.zip(to_array.iter()) | ||
.map(|((string, from), to)| match (string, from, to) { | ||
(Some(string), Some(from), Some(to)) => Some(string.replace(from, to)), | ||
_ => None, | ||
}) | ||
.collect::<StringViewArray>(); | ||
|
||
Ok(Arc::new(result) as ArrayRef) | ||
|
||
} | ||
/// Replaces all occurrences in string of substring from with substring to. | ||
/// replace('abcdefabcdef', 'cd', 'XX') = 'abXXefabXXef' | ||
fn replace<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { | ||
|
@@ -100,4 +129,59 @@ fn replace<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> { | |
Ok(Arc::new(result) as ArrayRef) | ||
} | ||
|
||
mod test {} | ||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
use crate::utils::test::test_function; | ||
use arrow::array::Array; | ||
use arrow::array::LargeStringArray; | ||
use arrow::array::StringArray; | ||
use arrow::array::StringViewArray; | ||
use arrow::datatypes::DataType::{Utf8, LargeUtf8, Utf8View}; | ||
use datafusion_common::ScalarValue; | ||
|
||
#[test] | ||
fn test_functions() -> Result<()> { | ||
test_function!( | ||
ReplaceFunc::new(), | ||
&[ | ||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("aabbdqcbb")))), | ||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("bb")))), | ||
ColumnarValue::Scalar(ScalarValue::Utf8(Some(String::from("ccc")))), | ||
], | ||
Ok(Some("aacccdqcccc")), | ||
&str, | ||
Utf8, | ||
StringArray | ||
); | ||
|
||
test_function!( | ||
ReplaceFunc::new(), | ||
&[ | ||
ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(String::from("aabbb")))), | ||
ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(String::from("bbb")))), | ||
ColumnarValue::Scalar(ScalarValue::LargeUtf8(Some(String::from("cc")))), | ||
], | ||
Ok(Some("aacc")), | ||
&str, | ||
LargeUtf8, | ||
LargeStringArray | ||
); | ||
|
||
test_function!( | ||
ReplaceFunc::new(), | ||
&[ | ||
ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("aabbbcw")))), | ||
ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("bb")))), | ||
ColumnarValue::Scalar(ScalarValue::Utf8View(Some(String::from("cc")))), | ||
], | ||
Ok(Some("aaccbcw")), | ||
&str, | ||
Utf8View, | ||
StringViewArray | ||
); | ||
|
||
Ok(()) | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -718,9 +718,8 @@ EXPLAIN SELECT | |
FROM test; | ||
---- | ||
logical_plan | ||
01)Projection: replace(__common_expr_1, Utf8("foo"), Utf8("bar")) AS c1, replace(__common_expr_1, CAST(test.column2_utf8view AS Utf8), Utf8("bar")) AS c2 | ||
02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view | ||
03)----TableScan: test projection=[column1_utf8view, column2_utf8view] | ||
01)Projection: replace(test.column1_utf8view, Utf8("foo"), Utf8("bar")) AS c1, replace(test.column1_utf8view, test.column2_utf8view, Utf8("bar")) AS c2 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think adding a test that shows this query running (aka run SELECT
REPLACE(column1_utf8view, 'foo', 'bar') as c1,
REPLACE(column1_utf8view, column2_utf8view, 'bar') as c2
FROM test; in addition to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @alamb, after updating the replace.rs to process Utf8View, my sql logic test still uses the CAST in its logical plan and this test is failing. Do you have any idea to debug this issue? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @alamb, I already found the issue. The reason is that I didn't update the function's signature to Utf8View. |
||
02)--TableScan: test projection=[column1_utf8view, column2_utf8view] | ||
|
||
## Ensure no casts for REVERSE | ||
## TODO file ticket | ||
|
Uh oh!
There was an error while loading. Please reload this page.