Skip to content

Commit d4c0a32

Browse files
Samyak2alamb
andauthored
[Variant] Add variant_get compute kernel (#7919)
# Which issue does this PR close? - Closes #7893 # What changes are included in this PR? In parquet-variant: - Add a new function `Variant::get_path`: this traverses the path to create a new Variant (does not cast any of it). - Add a new module `parquet_variant::path`: adds structs/enums to define a path to access a variant value deeply. In parquet-variant-compute: - Add a new compute kernel `variant_get`: does the path traversal over a `VariantArray`. In the future, this would also cast the values to a specified type. - Includes some basic unit tests. Not comprehensive. - Includes a simple micro-benchmark for reference. Current limitations: - It can only return another VariantArray. Casts are not implemented yet. - Only top-level object/list access is supported. It panics on finding a nested object/list. Needs #7914 to fix this. - Perf is a TODO. # Are these changes tested? Some basic unit tests are added. # Are there any user-facing changes? Yes --------- Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 7af62d5 commit d4c0a32

File tree

8 files changed

+340
-5
lines changed

8 files changed

+340
-5
lines changed

parquet-variant-compute/Cargo.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,3 +41,9 @@ name = "parquet_variant_compute"
4141
bench = false
4242

4343
[dev-dependencies]
44+
criterion = { version = "0.6", default-features = false }
45+
rand = { version = "0.9.1" }
46+
47+
[[bench]]
48+
name = "variant_get"
49+
harness = false
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
use std::sync::Arc;
18+
19+
use arrow::array::ArrayRef;
20+
use criterion::{criterion_group, criterion_main, Criterion};
21+
use parquet_variant::{Variant, VariantBuilder};
22+
use parquet_variant_compute::{
23+
variant_get::{variant_get, GetOptions},
24+
VariantArray, VariantArrayBuilder,
25+
};
26+
use rand::{rngs::StdRng, Rng, SeedableRng};
27+
28+
fn create_primitive_variant(size: usize) -> VariantArray {
29+
let mut rng = StdRng::seed_from_u64(42);
30+
31+
let mut variant_builder = VariantArrayBuilder::new(1);
32+
33+
for _ in 0..size {
34+
let mut builder = VariantBuilder::new();
35+
builder.append_value(rng.random::<i64>());
36+
let (metadata, value) = builder.finish();
37+
variant_builder.append_variant(Variant::try_new(&metadata, &value).unwrap());
38+
}
39+
40+
variant_builder.build()
41+
}
42+
43+
pub fn variant_get_bench(c: &mut Criterion) {
44+
let variant_array = create_primitive_variant(8192);
45+
let input: ArrayRef = Arc::new(variant_array);
46+
47+
let options = GetOptions {
48+
path: vec![].into(),
49+
as_type: None,
50+
cast_options: Default::default(),
51+
};
52+
53+
c.bench_function("variant_get_primitive", |b| {
54+
b.iter(|| variant_get(&input.clone(), options.clone()))
55+
});
56+
}
57+
58+
criterion_group!(benches, variant_get_bench);
59+
criterion_main!(benches);

parquet-variant-compute/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ mod from_json;
1919
mod to_json;
2020
mod variant_array;
2121
mod variant_array_builder;
22+
pub mod variant_get;
2223

2324
pub use variant_array::VariantArray;
2425
pub use variant_array_builder::VariantArrayBuilder;
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
use std::sync::Arc;
18+
19+
use arrow::{
20+
array::{Array, ArrayRef},
21+
compute::CastOptions,
22+
error::Result,
23+
};
24+
use arrow_schema::{ArrowError, Field};
25+
use parquet_variant::path::VariantPath;
26+
27+
use crate::{VariantArray, VariantArrayBuilder};
28+
29+
/// Returns an array with the specified path extracted from the variant values.
30+
///
31+
/// The return array type depends on the `as_type` field of the options parameter
32+
/// 1. `as_type: None`: a VariantArray is returned. The values in this new VariantArray will point
33+
/// to the specified path.
34+
/// 2. `as_type: Some(<specific field>)`: an array of the specified type is returned.
35+
pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
36+
let variant_array: &VariantArray = input.as_any().downcast_ref().ok_or_else(|| {
37+
ArrowError::InvalidArgumentError(
38+
"expected a VariantArray as the input for variant_get".to_owned(),
39+
)
40+
})?;
41+
42+
if let Some(as_type) = options.as_type {
43+
return Err(ArrowError::NotYetImplemented(format!(
44+
"getting a {} from a VariantArray is not implemented yet",
45+
as_type
46+
)));
47+
}
48+
49+
let mut builder = VariantArrayBuilder::new(variant_array.len());
50+
for i in 0..variant_array.len() {
51+
let new_variant = variant_array.value(i);
52+
// TODO: perf?
53+
let new_variant = new_variant.get_path(&options.path);
54+
match new_variant {
55+
// TODO: we're decoding the value and doing a copy into a variant value again. This
56+
// copy can be much smarter.
57+
Some(new_variant) => builder.append_variant(new_variant),
58+
None => builder.append_null(),
59+
}
60+
}
61+
62+
Ok(Arc::new(builder.build()))
63+
}
64+
65+
/// Controls the action of the variant_get kernel.
66+
#[derive(Debug, Clone)]
67+
pub struct GetOptions<'a> {
68+
/// What path to extract
69+
pub path: VariantPath<'a>,
70+
/// if `as_type` is None, the returned array will itself be a VariantArray.
71+
///
72+
/// if `as_type` is `Some(type)` the field is returned as the specified type.
73+
pub as_type: Option<Field>,
74+
/// Controls the casting behavior (e.g. error vs substituting null on cast error).
75+
pub cast_options: CastOptions<'a>,
76+
}
77+
78+
impl<'a> GetOptions<'a> {
79+
/// Construct options to get the specified path as a variant.
80+
pub fn new_with_path(path: VariantPath<'a>) -> Self {
81+
Self {
82+
path,
83+
as_type: None,
84+
cast_options: Default::default(),
85+
}
86+
}
87+
}
88+
89+
#[cfg(test)]
90+
mod test {
91+
use std::sync::Arc;
92+
93+
use arrow::array::{Array, ArrayRef, StringArray};
94+
use parquet_variant::path::{VariantPath, VariantPathElement};
95+
96+
use crate::batch_json_string_to_variant;
97+
use crate::VariantArray;
98+
99+
use super::{variant_get, GetOptions};
100+
101+
fn single_variant_get_test(input_json: &str, path: VariantPath, expected_json: &str) {
102+
// Create input array from JSON string
103+
let input_array_ref: ArrayRef = Arc::new(StringArray::from(vec![Some(input_json)]));
104+
let input_variant_array_ref: ArrayRef =
105+
Arc::new(batch_json_string_to_variant(&input_array_ref).unwrap());
106+
107+
let result =
108+
variant_get(&input_variant_array_ref, GetOptions::new_with_path(path)).unwrap();
109+
110+
// Create expected array from JSON string
111+
let expected_array_ref: ArrayRef = Arc::new(StringArray::from(vec![Some(expected_json)]));
112+
let expected_variant_array = batch_json_string_to_variant(&expected_array_ref).unwrap();
113+
114+
let result_array: &VariantArray = result.as_any().downcast_ref().unwrap();
115+
assert_eq!(
116+
result_array.len(),
117+
1,
118+
"Expected result array to have length 1"
119+
);
120+
assert!(
121+
result_array.nulls().is_none(),
122+
"Expected no nulls in result array"
123+
);
124+
let result_variant = result_array.value(0);
125+
let expected_variant = expected_variant_array.value(0);
126+
assert_eq!(
127+
result_variant, expected_variant,
128+
"Result variant does not match expected variant"
129+
);
130+
}
131+
132+
#[test]
133+
fn get_primitive_variant_field() {
134+
single_variant_get_test(
135+
r#"{"some_field": 1234}"#,
136+
vec![VariantPathElement::field("some_field".into())].into(),
137+
"1234",
138+
);
139+
}
140+
141+
#[test]
142+
fn get_primitive_variant_list_index() {
143+
single_variant_get_test(
144+
"[1234, 5678]",
145+
vec![VariantPathElement::index(0)].into(),
146+
"1234",
147+
);
148+
}
149+
150+
#[test]
151+
fn get_primitive_variant_inside_object_of_object() {
152+
single_variant_get_test(
153+
r#"{"top_level_field": {"inner_field": 1234}}"#,
154+
vec![
155+
VariantPathElement::field("top_level_field".into()),
156+
VariantPathElement::field("inner_field".into()),
157+
]
158+
.into(),
159+
"1234",
160+
);
161+
}
162+
163+
#[test]
164+
fn get_primitive_variant_inside_list_of_object() {
165+
single_variant_get_test(
166+
r#"[{"some_field": 1234}]"#,
167+
vec![
168+
VariantPathElement::index(0),
169+
VariantPathElement::field("some_field".into()),
170+
]
171+
.into(),
172+
"1234",
173+
);
174+
}
175+
176+
#[test]
177+
fn get_primitive_variant_inside_object_of_list() {
178+
single_variant_get_test(
179+
r#"{"some_field": [1234]}"#,
180+
vec![
181+
VariantPathElement::field("some_field".into()),
182+
VariantPathElement::index(0),
183+
]
184+
.into(),
185+
"1234",
186+
);
187+
}
188+
189+
#[test]
190+
fn get_complex_variant() {
191+
single_variant_get_test(
192+
r#"{"top_level_field": {"inner_field": 1234}}"#,
193+
vec![VariantPathElement::field("top_level_field".into())].into(),
194+
r#"{"inner_field": 1234}"#,
195+
);
196+
}
197+
}

parquet-variant/src/builder.rs

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -376,8 +376,6 @@ impl MetadataBuilder {
376376
fn upsert_field_name(&mut self, field_name: &str) -> u32 {
377377
let (id, new_entry) = self.field_names.insert_full(field_name.to_string());
378378

379-
dbg!(new_entry);
380-
381379
if new_entry {
382380
let n = self.num_field_names();
383381

@@ -1070,7 +1068,6 @@ impl<'a> ObjectBuilder<'a> {
10701068
let metadata_builder = self.parent_state.metadata_builder();
10711069

10721070
let field_id = metadata_builder.upsert_field_name(key);
1073-
dbg!(field_id);
10741071
let field_start = self.buffer.offset();
10751072

10761073
if self.fields.insert(field_id, field_start).is_some() && self.validate_unique_fields {
@@ -2487,8 +2484,6 @@ mod tests {
24872484

24882485
let mut builder = VariantBuilder::new().with_metadata(VariantMetadata::new(&m1));
24892486

2490-
dbg!("building");
2491-
24922487
builder.append_value(variant.clone());
24932488

24942489
let (metadata, value) = builder.finish();

parquet-variant/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
3030
mod builder;
3131
mod decoder;
32+
pub mod path;
3233
mod utils;
3334
mod variant;
3435

parquet-variant/src/path.rs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
use std::{borrow::Cow, ops::Deref};
18+
19+
/// Represents a qualified path to a potential subfield or index of a variant value.
20+
#[derive(Debug, Clone)]
21+
pub struct VariantPath<'a>(Vec<VariantPathElement<'a>>);
22+
23+
impl<'a> VariantPath<'a> {
24+
pub fn new(path: Vec<VariantPathElement<'a>>) -> Self {
25+
Self(path)
26+
}
27+
28+
pub fn path(&self) -> &Vec<VariantPathElement> {
29+
&self.0
30+
}
31+
}
32+
33+
impl<'a> From<Vec<VariantPathElement<'a>>> for VariantPath<'a> {
34+
fn from(value: Vec<VariantPathElement<'a>>) -> Self {
35+
Self::new(value)
36+
}
37+
}
38+
39+
impl<'a> Deref for VariantPath<'a> {
40+
type Target = [VariantPathElement<'a>];
41+
42+
fn deref(&self) -> &Self::Target {
43+
&self.0
44+
}
45+
}
46+
47+
/// Element of a path
48+
#[derive(Debug, Clone)]
49+
pub enum VariantPathElement<'a> {
50+
/// Access field with name `name`
51+
Field { name: Cow<'a, str> },
52+
/// Access the list element at `index`
53+
Index { index: usize },
54+
}
55+
56+
impl<'a> VariantPathElement<'a> {
57+
pub fn field(name: Cow<'a, str>) -> VariantPathElement<'a> {
58+
VariantPathElement::Field { name }
59+
}
60+
61+
pub fn index(index: usize) -> VariantPathElement<'a> {
62+
VariantPathElement::Index { index }
63+
}
64+
}

0 commit comments

Comments
 (0)