Skip to content

Commit 8e67459

Browse files
authored
feat(query): inverted index support json type (#15267)
1 parent 88331f0 commit 8e67459

File tree

10 files changed

+243
-32
lines changed

10 files changed

+243
-32
lines changed

Cargo.lock

Lines changed: 13 additions & 10 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ members = [
108108

109109
[workspace.dependencies]
110110
# databend maintains
111-
jsonb = { git = "https://github.com/datafuselabs/jsonb", rev = "a7325f4" }
111+
jsonb = { git = "https://github.com/datafuselabs/jsonb", rev = "3fe3acd" }
112112

113113
opendal = { version = "0.45.1", features = [
114114
"layers-minitrace",

src/query/ee/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ log = { workspace = true }
6060
tempfile = "3.4.0"
6161

6262
[dev-dependencies]
63+
jsonb = { workspace = true }
6364

6465
[build-dependencies]
6566
databend-common-building = { path = "../../common/building" }

src/query/ee/tests/it/inverted_index/pruning.rs

Lines changed: 138 additions & 1 deletion
Large diffs are not rendered by default.

src/query/functions/src/scalars/geometry.rs

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -496,7 +496,6 @@ pub fn register(registry: &mut FunctionRegistry) {
496496
builder.len(),
497497
ErrorCode::GeometryError(e.to_string()).to_string(),
498498
);
499-
return;
500499
}
501500
};
502501
builder.commit_row();
@@ -531,7 +530,6 @@ pub fn register(registry: &mut FunctionRegistry) {
531530
builder.len(),
532531
ErrorCode::GeometryError(e.to_string()).to_string(),
533532
);
534-
return;
535533
}
536534
};
537535
builder.commit_row();

src/query/sql/src/planner/binder/ddl/index.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -384,9 +384,11 @@ impl Binder {
384384
for column in columns {
385385
match table_schema.field_with_name(&column.name) {
386386
Ok(field) => {
387-
if field.data_type.remove_nullable() != TableDataType::String {
387+
if field.data_type.remove_nullable() != TableDataType::String
388+
&& field.data_type.remove_nullable() != TableDataType::Variant
389+
{
388390
return Err(ErrorCode::UnsupportedIndex(format!(
389-
"Inverted index currently only support String type, but the type of column {} is {}",
391+
"Inverted index currently only support String and variant type, but the type of column {} is {}",
390392
column, field.data_type
391393
)));
392394
}

src/query/sql/src/planner/semantic/type_check.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2234,14 +2234,20 @@ impl<'a> TypeChecker<'a> {
22342234
continue;
22352235
}
22362236
let field_names: Vec<&str> = field_str.split(':').collect();
2237+
// If the field is a JSON type, the query must specify the key path within the object,
2238+
// for example:
2239+
// the field `info` has the value: `{"tags":{"id":10,"env":"prod","name":"test"}}`
2240+
// a query can be written like this: `info.tags.env:prod`
2241+
let field_name = field_names[0].trim();
2242+
let sub_field_names: Vec<&str> = field_name.split('.').collect();
22372243
let column_expr = Expr::ColumnRef {
22382244
span: query_scalar.span(),
22392245
column: ColumnRef {
22402246
database: None,
22412247
table: None,
22422248
column: ColumnID::Name(Identifier::from_name(
22432249
query_scalar.span(),
2244-
field_names[0].trim(),
2250+
sub_field_names[0].trim(),
22452251
)),
22462252
},
22472253
};

src/query/storages/fuse/src/io/write/inverted_index_writer.rs

Lines changed: 42 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,11 @@ use databend_common_io::constants::DEFAULT_BLOCK_BUFFER_SIZE;
4949
use opendal::Operator;
5050
use serde::Deserialize;
5151
use serde::Serialize;
52+
use serde_json::Map;
5253
use tantivy::schema::Document;
5354
use tantivy::schema::Field;
5455
use tantivy::schema::IndexRecordOption;
56+
use tantivy::schema::JsonObjectOptions;
5557
use tantivy::schema::Schema;
5658
use tantivy::schema::TextFieldIndexing;
5759
use tantivy::schema::TextOptions;
@@ -63,6 +65,7 @@ use tantivy::tokenizer::StopWordFilter;
6365
use tantivy::tokenizer::TextAnalyzer;
6466
use tantivy::tokenizer::TokenizerManager;
6567
use tantivy::Directory;
68+
// use tantivy::schema::document::OwnedValue;
6669
use tantivy::Index;
6770
use tantivy::IndexBuilder;
6871
use tantivy::IndexSettings;
@@ -163,18 +166,26 @@ impl InvertedIndexWriter {
163166
let text_field_indexing = TextFieldIndexing::default()
164167
.set_tokenizer(&tokenizer_name)
165168
.set_index_option(index_record);
166-
let text_options = TextOptions::default().set_indexing_options(text_field_indexing);
169+
let text_options = TextOptions::default().set_indexing_options(text_field_indexing.clone());
170+
let json_options = JsonObjectOptions::default().set_indexing_options(text_field_indexing);
167171

168172
let mut schema_builder = Schema::builder();
169173
let mut index_fields = Vec::with_capacity(schema.fields.len());
170174
for field in &schema.fields {
171-
if field.data_type().remove_nullable() != DataType::String {
172-
return Err(ErrorCode::IllegalDataType(format!(
173-
"inverted index only support String type, but got {}",
174-
field.data_type()
175-
)));
176-
}
177-
let index_field = schema_builder.add_text_field(field.name(), text_options.clone());
175+
let index_field = match field.data_type().remove_nullable() {
176+
DataType::String => {
177+
schema_builder.add_text_field(field.name(), text_options.clone())
178+
}
179+
DataType::Variant => {
180+
schema_builder.add_json_field(field.name(), json_options.clone())
181+
}
182+
_ => {
183+
return Err(ErrorCode::IllegalDataType(format!(
184+
"inverted index only support String and Variant type, but got {}",
185+
field.data_type()
186+
)));
187+
}
188+
};
178189
index_fields.push(index_field);
179190
}
180191
let index_schema = schema_builder.build();
@@ -221,15 +232,33 @@ impl InvertedIndexWriter {
221232
}
222233
}
223234

235+
let mut types = Vec::with_capacity(self.schema.num_fields());
236+
for field in self.schema.fields() {
237+
let ty = field.data_type().remove_nullable();
238+
types.push(ty);
239+
}
224240
for i in 0..block.num_rows() {
225241
let mut doc = Document::new();
226-
for j in 0..block.num_columns() {
242+
for (j, typ) in types.iter().enumerate() {
227243
let field = Field::from_field_id(j as u32);
228244
let column = block.get_by_offset(j);
229-
if let ScalarRef::String(text) = unsafe { column.value.index_unchecked(i) } {
230-
doc.add_text(field, text);
231-
} else {
232-
doc.add_text(field, "");
245+
match unsafe { column.value.index_unchecked(i) } {
246+
ScalarRef::String(text) => doc.add_text(field, text),
247+
ScalarRef::Variant(jsonb_val) => {
248+
// Only JSON objects are indexed; any other JSON value is added as an empty object.
249+
if let Ok(Some(obj_val)) = jsonb::to_serde_json_object(jsonb_val) {
250+
doc.add_json_object(field, obj_val);
251+
} else {
252+
doc.add_json_object(field, Map::new());
253+
}
254+
}
255+
_ => {
256+
if typ == &DataType::Variant {
257+
doc.add_json_object(field, Map::new());
258+
} else {
259+
doc.add_text(field, "");
260+
}
261+
}
233262
}
234263
}
235264
self.operations.push(UserOperation::Add(doc));

tests/sqllogictests/suites/ee/04_ee_inverted_index/04_0000_inverted_index_base.test

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -427,6 +427,41 @@ onlyif mysql
427427
statement error 1105
428428
SELECT id, score(), title FROM books WHERE query('title:(设计 实现)^5 description:(学习 +神经网络)^1.1') ORDER BY score() DESC
429429

430+
statement ok
431+
CREATE TABLE t1 (id int, body json)
432+
433+
statement ok
434+
CREATE INVERTED INDEX IF NOT EXISTS idx ON t1(body) tokenizer = 'chinese'
435+
436+
statement ok
437+
INSERT INTO t1 VALUES
438+
(1, '{"title":"The Psychology of Persuasion","metadata":{"author":"Oliver","publishedDate":"2021-06-15","tags":["psychology","persuasion","behavior"]}}'),
439+
(2, '{"title":"Sustainable Energy Solutions","metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]}}'),
440+
(3, '{"title":"The Future of Autonomous Vehicles","metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]}}'),
441+
(4, '{"title":"The Role of AI in Customer Service","metadata":{"author":"Rachel","publishedDate":"2021-09-20","tags":["AI","customer service","automation"]}}'),
442+
(5, '{"title":"Internet of Things Applications","metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"]}}'),
443+
(6, '{"title":"人工智能与机器学习","metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]}}'),
444+
(7, '{"title":"区块链在金融行业的应用","metadata":{"author":"李四","publishedDate":"2023-09-18","tags":["区块链","金融行业","金融科技"]}}'),
445+
(8, '{"title":"物联网与智能家居","metadata":{"author":"王五","publishedDate":"2023-08-15","tags":["物联网","智能家居","生活"]}}'),
446+
(9, '{"title":"量子计算的未来","metadata":{"author":"赵六","publishedDate":"2023-07-20","tags":["量子计算","未来科技","物理学"]}}'),
447+
(10, '{"title":"网络安全与隐私保护","metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]}}')
448+
449+
query IFT
450+
SELECT id, score(), body FROM t1 WHERE query('body.title:energy')
451+
----
452+
2 3.2352333 {"metadata":{"author":"Pamela","publishedDate":"2023-12-01","tags":["sustainable energy","solutions","environment"]},"title":"Sustainable Energy Solutions"}
453+
454+
query IFT
455+
SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags:technology')
456+
----
457+
3 2.4057739 {"metadata":{"author":"Quincy","publishedDate":"2022-05-05","tags":["autonomous vehicles","future","technology"]},"title":"The Future of Autonomous Vehicles"}
458+
5 2.4057739 {"metadata":{"author":"Samuel","publishedDate":"2023-12-15","tags":["IoT","applications","technology"]},"title":"Internet of Things Applications"}
459+
460+
query IFT
461+
SELECT id, score(), body FROM t1 WHERE query('body.metadata.tags:技术')
462+
----
463+
6 2.4057739 {"metadata":{"author":"张三","publishedDate":"2023-10-23","tags":["人工智能","机器学习","技术"]},"title":"人工智能与机器学习"}
464+
10 2.4057739 {"metadata":{"author":"刘七","publishedDate":"2023-06-25","tags":["网络安全","隐私保护","信息技术"]},"title":"网络安全与隐私保护"}
430465

431466
statement ok
432467
use default

tests/suites/1_stateful/01_streaming_load/01_0003_streaming_variant_load.result

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
8 18446744073709551615
99
9 12.34
1010
10 -56.78
11-
11 140000
11+
11 140000.0
1212
12 0.0099
1313
13 "abcd"
1414
14 "test"
@@ -28,7 +28,7 @@
2828
28 18446744073709551615
2929
29 12.34
3030
30 -56.78
31-
31 140000
31+
31 140000.0
3232
32 0.0099
3333
33 "abcd"
3434
34 "test"

0 commit comments

Comments
 (0)