Skip to content

Commit 577d039

Browse files
committed
feat(query): support jsonb format
1 parent ee772bd commit 577d039

File tree

24 files changed

+1761
-89
lines changed

24 files changed

+1761
-89
lines changed

Cargo.lock

Lines changed: 34 additions & 31 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

docs/doc/30-reference/10-data-types/60-data-type-nullable-types.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Using `Nullable` will almost always have a negative impact on performance. If th
3333
Check whether the value is `NULL` or `NOT NULL`.
3434

3535
[IS NULL](/doc/reference/functions/conditional-functions/isnull)
36+
3637
[IS NOT NULL](/doc/reference/functions/conditional-functions/isnotnull)
3738

3839
### Example

src/common/jsonb/Cargo.toml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[package]
2+
name = "common-jsonb"
3+
version = "0.1.0"
4+
authors = ["Databend Authors <opensource@datafuselabs.com>"]
5+
license = "Apache-2.0"
6+
publish = false
7+
edition = "2021"
8+
9+
[dependencies]
10+
byteorder = "1.4.3"
11+
decimal-rs = "0.1.39"

src/common/jsonb/src/constants.rs

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
// Copyright 2022 Datafuse Labs.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
// JSONB container `Header` constants.
// The three highest bits of a header select the container type,
// the remaining 29 bits hold the length field.
pub(crate) const ARRAY_CONTAINER_TAG: u32 = 0x8000_0000;
pub(crate) const OBJECT_CONTAINER_TAG: u32 = 0x4000_0000;
pub(crate) const SCALAR_CONTAINER_TAG: u32 = 0x2000_0000;

pub(crate) const CONTAINER_HEADER_TYPE_MASK: u32 = 0xE000_0000;
pub(crate) const CONTAINER_HEADER_LEN_MASK: u32 = 0x1FFF_FFFF;

// JSONB `JEntry` constants.
// Type tags live in the bits selected by `JENTRY_TYPE_MASK`.
pub(crate) const NULL_TAG: u32 = 0x0000_0000;
pub(crate) const STRING_TAG: u32 = 0x1000_0000;
pub(crate) const NUMBER_TAG: u32 = 0x2000_0000;
pub(crate) const FALSE_TAG: u32 = 0x3000_0000;
pub(crate) const TRUE_TAG: u32 = 0x4000_0000;
pub(crate) const CONTAINER_TAG: u32 = 0x5000_0000;

// @todo support offset mode
#[allow(dead_code)]
pub(crate) const JENTRY_IS_OFF_FLAG: u32 = 0x8000_0000;
pub(crate) const JENTRY_TYPE_MASK: u32 = 0x7000_0000;
pub(crate) const JENTRY_OFF_LEN_MASK: u32 = 0x0FFF_FFFF;

// JSON text constants
pub(crate) const NULL_LEN: usize = 4;
pub(crate) const TRUE_LEN: usize = 4;
pub(crate) const FALSE_LEN: usize = 5;
pub(crate) const UNICODE_LEN: usize = 4;

// JSON text escape character constants
pub(crate) const BS: char = '\\'; // U+005C Backslash
pub(crate) const QU: char = '"'; // U+0022 Double quotation mark
pub(crate) const SD: char = '/'; // U+002F Slash or divide
pub(crate) const BB: char = '\x08'; // U+0008 Backspace
pub(crate) const FF: char = '\x0C'; // U+000C Formfeed Page Break
pub(crate) const NN: char = '\n'; // U+000A Newline
pub(crate) const RR: char = '\r'; // U+000D Carriage Return
pub(crate) const TT: char = '\t'; // U+0009 Horizontal Tab

src/common/jsonb/src/de.rs

Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
// Copyright 2022 Datafuse Labs.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
use std::borrow::Cow;
16+
use std::collections::VecDeque;
17+
18+
use byteorder::BigEndian;
19+
use byteorder::ReadBytesExt;
20+
use decimal_rs::Decimal;
21+
22+
use super::constants::*;
23+
use super::error::*;
24+
use super::jentry::JEntry;
25+
use super::parser::parse_value;
26+
use super::value::Object;
27+
use super::value::Value;
28+
29+
/// The binary `JSONB` contains three parts, `Header`, `JEntry` and `RawData`.
30+
/// This structure can be nested. Each group of structures starts with a `Header`.
31+
/// The upper-level `Value` will store the `Header` length or offset of
32+
/// the lower-level `Value`.
33+
34+
/// `Header` stores the type of the `Value`, which is one of `Array`, `Object` and `Scalar`.
35+
/// `Scalar` has only one `Value`, and a corresponding `JEntry`.
36+
/// `Array` and `Object` are nested types; they have multiple lower-level `Values`.
37+
/// So the `Header` also stores the number of lower-level `Values`.
38+
39+
/// `JEntry` stores the types of `Scalar Value`, including `Null`, `True`, `False`,
40+
/// `Number`, `String` and `Container`. They have three different decode methods.
41+
/// 1. `Null`, `True` and `False` can be obtained by `JEntry`, no extra work required.
42+
/// 2. `Number` and `String` have related `RawData`; `JEntry` stores the length
43+
/// or offset of this data, the `Value` can be read out and then decoded.
44+
/// 3. `Container` is actually a nested `Array` or `Object` with the same structure,
45+
/// `JEntry` stores the length or offset of the lower-level `Header`,
46+
/// from where the same decode process can begin.
47+
48+
/// `RawData` is the encoded `Value`.
49+
/// `Number` is a variable-length `Decimal` that stores both integer and float values.
50+
/// `String` is the original string, can be borrowed directly without extra decode.
51+
/// `Array` and `Object` are lower-level encoded `JSONB` values.
52+
/// The upper-level doesn't care about the specific content.
53+
/// Decode can be executed recursively.
54+
55+
/// Decode `JSONB` Value from binary bytes.
56+
pub fn from_slice(buf: &[u8]) -> Result<Value<'_>, Error> {
57+
let mut decoder = Decoder::new(buf);
58+
match decoder.decode() {
59+
Ok(value) => Ok(value),
60+
// for compatible with the first version of `JSON` text, parse it again
61+
Err(_) => parse_value(buf),
62+
}
63+
}
64+
65+
#[repr(transparent)]
66+
pub struct Decoder<'a> {
67+
buf: &'a [u8],
68+
}
69+
70+
impl<'a> Decoder<'a> {
71+
pub fn new(buf: &'a [u8]) -> Decoder<'a> {
72+
Self { buf }
73+
}
74+
75+
pub fn decode(&mut self) -> Result<Value<'a>, Error> {
76+
// Valid `JSONB` Value has at least one `Header`
77+
if self.buf.len() < 4 {
78+
return Err(Error::InvalidJsonb);
79+
}
80+
let value = self.decode_jsonb()?;
81+
Ok(value)
82+
}
83+
84+
// Read value type from the `Header`
85+
// `Scalar` has one `JEntry`
86+
// `Array` and `Object` store the numbers of elements
87+
fn decode_jsonb(&mut self) -> Result<Value<'a>, Error> {
88+
let container_header = self.buf.read_u32::<BigEndian>()?;
89+
90+
match container_header & CONTAINER_HEADER_TYPE_MASK {
91+
SCALAR_CONTAINER_TAG => {
92+
let encoded = self.buf.read_u32::<BigEndian>()?;
93+
let jentry = JEntry::decode_jentry(encoded);
94+
self.decode_scalar(jentry)
95+
}
96+
ARRAY_CONTAINER_TAG => self.decode_array(container_header),
97+
OBJECT_CONTAINER_TAG => self.decode_object(container_header),
98+
_ => Err(Error::InvalidJsonbHeader),
99+
}
100+
}
101+
102+
// Decode `Value` based on the `JEntry`
103+
// `Null` and `Boolean` don't need to read extra data
104+
// `Number` and `String` `JEntry` stores the length or offset of the data,
105+
// read them and decode to the `Value`
106+
// `Array` and `Object` need to read nested data from the lower-level `Header`
107+
fn decode_scalar(&mut self, jentry: JEntry) -> Result<Value<'a>, Error> {
108+
match jentry.type_code {
109+
NULL_TAG => Ok(Value::Null),
110+
TRUE_TAG => Ok(Value::Bool(true)),
111+
FALSE_TAG => Ok(Value::Bool(false)),
112+
STRING_TAG => {
113+
let offset = jentry.length as usize;
114+
let s = std::str::from_utf8(&self.buf[..offset]).unwrap();
115+
self.buf = &self.buf[offset..];
116+
Ok(Value::String(Cow::Borrowed(s)))
117+
}
118+
NUMBER_TAG => {
119+
let offset = jentry.length as usize;
120+
let d = Decimal::decode(&self.buf[..offset]);
121+
self.buf = &self.buf[offset..];
122+
Ok(Value::Number(d))
123+
}
124+
CONTAINER_TAG => self.decode_jsonb(),
125+
_ => Err(Error::InvalidJsonbJEntry),
126+
}
127+
}
128+
129+
// Decode the numbers of values from the `Header`,
130+
// then read all `JEntries`, finally decode the `Value` by `JEntry`
131+
fn decode_array(&mut self, container_header: u32) -> Result<Value<'a>, Error> {
132+
let length = (container_header & CONTAINER_HEADER_LEN_MASK) as usize;
133+
let jentries = self.decode_jentries(length)?;
134+
let mut values: Vec<Value> = Vec::with_capacity(length);
135+
// decode all values
136+
for jentry in jentries.into_iter() {
137+
let value = self.decode_scalar(jentry)?;
138+
values.push(value);
139+
}
140+
141+
let value = Value::Array(values);
142+
Ok(value)
143+
}
144+
145+
// The basic process is the same as that of `Array`
146+
// but first decode the keys and then decode the values
147+
fn decode_object(&mut self, container_header: u32) -> Result<Value<'a>, Error> {
148+
let length = (container_header & CONTAINER_HEADER_LEN_MASK) as usize;
149+
let mut jentries = self.decode_jentries(length * 2)?;
150+
151+
let mut keys: VecDeque<Value> = VecDeque::with_capacity(length);
152+
// decode all keys first
153+
for _ in 0..length {
154+
let jentry = jentries.pop_front().unwrap();
155+
let key = self.decode_scalar(jentry)?;
156+
keys.push_back(key);
157+
}
158+
159+
let mut obj = Object::new();
160+
// decode all values
161+
for _ in 0..length {
162+
let key = keys.pop_front().unwrap();
163+
let k = key.as_str().unwrap();
164+
let jentry = jentries.pop_front().unwrap();
165+
let value = self.decode_scalar(jentry)?;
166+
obj.insert(k.to_string(), value);
167+
}
168+
169+
let value = Value::Object(obj);
170+
Ok(value)
171+
}
172+
173+
// Decode `JEntries` for `Array` and `Object`
174+
fn decode_jentries(&mut self, length: usize) -> Result<VecDeque<JEntry>, Error> {
175+
let mut jentries: VecDeque<JEntry> = VecDeque::with_capacity(length);
176+
for _ in 0..length {
177+
let encoded = self.buf.read_u32::<BigEndian>()?;
178+
let jentry = JEntry::decode_jentry(encoded);
179+
jentries.push_back(jentry);
180+
}
181+
Ok(jentries)
182+
}
183+
}

0 commit comments

Comments
 (0)