Skip to content

Commit 184e1bf

Browse files
committed
Add variant_get compute kernel
In parquet-variant: - Add a new function `Variant::get_path`: this traverses the path to create a new Variant (does not cast any of it). - Add a new module `parquet_variant::path`: adds structs/enums to define a path to access a variant value deeply. In parquet-variant-compute: - Add a new compute kernel `variant_get`: does the path traversal over a `VariantArray`. In the future, this would also cast the values to a specified type. - Includes some basic unit tests. Not comprehensive. - Includes a simple micro-benchmark for reference. Current limitations: - It can only return another VariantArray. Casts are not implemented yet. - Only top-level object/list access is supported. It panics on finding a nested object/list. Needs #7914 to fix this. - Perf is a TODO.
1 parent daf31be commit 184e1bf

File tree

7 files changed

+286
-0
lines changed

7 files changed

+286
-0
lines changed

parquet-variant-compute/Cargo.toml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,9 @@ name = "parquet_variant_compute"
4242
bench = false
4343

4444
[dev-dependencies]
45+
criterion = { version = "0.6", default-features = false }
46+
rand = { version = "0.9.1" }
47+
48+
[[bench]]
49+
name = "variant_get"
50+
harness = false
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
use std::sync::Arc;
2+
3+
use arrow::array::ArrayRef;
4+
use criterion::{criterion_group, criterion_main, Criterion};
5+
use parquet_variant::{Variant, VariantBuilder};
6+
use parquet_variant_compute::{
7+
variant_get::{variant_get, GetOptions},
8+
VariantArray, VariantArrayBuilder,
9+
};
10+
use rand::{rngs::StdRng, Rng, SeedableRng};
11+
12+
fn create_primitive_variant(size: usize) -> VariantArray {
13+
let mut rng = StdRng::seed_from_u64(42);
14+
15+
let mut variant_builder = VariantArrayBuilder::new(1);
16+
17+
for _ in 0..size {
18+
let mut builder = VariantBuilder::new();
19+
builder.append_value(rng.random::<i64>());
20+
let (metadata, value) = builder.finish();
21+
variant_builder.append_variant(Variant::try_new(&metadata, &value).unwrap());
22+
}
23+
24+
variant_builder.build()
25+
}
26+
27+
pub fn variant_get_bench(c: &mut Criterion) {
28+
let variant_array = create_primitive_variant(8192);
29+
let input: ArrayRef = Arc::new(variant_array);
30+
31+
let options = GetOptions {
32+
path: vec![].into(),
33+
as_type: None,
34+
cast_options: Default::default(),
35+
};
36+
37+
c.bench_function("variant_get_primitive", |b| {
38+
b.iter(|| variant_get(&input.clone(), options.clone()))
39+
});
40+
}
41+
42+
criterion_group!(benches, variant_get_bench);
43+
criterion_main!(benches);

parquet-variant-compute/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ mod from_json;
1919
mod to_json;
2020
mod variant_array;
2121
mod variant_array_builder;
22+
pub mod variant_get;
2223

2324
pub use variant_array::VariantArray;
2425
pub use variant_array_builder::VariantArrayBuilder;
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
use std::sync::Arc;
2+
3+
use arrow::{
4+
array::{Array, ArrayRef},
5+
compute::CastOptions,
6+
error::Result,
7+
};
8+
use arrow_schema::{ArrowError, Field};
9+
use parquet_variant::path::VariantPath;
10+
11+
use crate::{VariantArray, VariantArrayBuilder};
12+
13+
/// Returns an array with the specified path extracted from the variant values.
14+
///
15+
/// The return array type depends on the `as_type` field of the options parameter
16+
/// 1. `as_type: None`: a VariantArray is returned. The values in this new VariantArray will point
17+
/// to the specified path.
18+
/// 2. `as_type: Some(<specific field>)`: an array of the specified type is returned.
19+
pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
20+
let variant_array: &VariantArray = input.as_any().downcast_ref().ok_or_else(|| {
21+
ArrowError::InvalidArgumentError(
22+
"expected a VariantArray as the input for variant_get".to_owned(),
23+
)
24+
})?;
25+
26+
if let Some(as_type) = options.as_type {
27+
return Err(ArrowError::NotYetImplemented(format!(
28+
"getting a {} from a VariantArray is not implemented yet",
29+
as_type
30+
)));
31+
}
32+
33+
let mut builder = VariantArrayBuilder::new(variant_array.len());
34+
for i in 0..variant_array.len() {
35+
let new_variant = variant_array.value(i);
36+
// TODO: perf?
37+
let new_variant = new_variant.get_path(&options.path);
38+
if let Some(new_variant) = new_variant {
39+
// TODO: we're decoding the value and doing a copy into a variant value again. This
40+
// copy can be much smarter.
41+
builder.append_variant(new_variant);
42+
} else {
43+
builder.append_null();
44+
}
45+
}
46+
47+
Ok(Arc::new(builder.build()))
48+
}
49+
50+
/// Controls the action of the variant_get kernel.
51+
#[derive(Debug, Clone)]
52+
pub struct GetOptions<'a> {
53+
/// What path to extract
54+
pub path: VariantPath,
55+
/// if `as_type` is None, the returned array will itself be a VariantArray.
56+
///
57+
/// if `as_type` is `Some(type)` the field is returned as the specified type if possible. To specify returning
58+
/// a Variant, pass a Field with variant type in the metadata.
59+
pub as_type: Option<Field>,
60+
/// Controls the casting behavior (e.g. error vs substituting null on cast error).
61+
pub cast_options: CastOptions<'a>,
62+
}
63+
64+
impl<'a> GetOptions<'a> {
65+
/// Construct options to get the specified path as a variant.
66+
pub fn new_with_path(path: VariantPath) -> Self {
67+
Self {
68+
path,
69+
as_type: None,
70+
cast_options: Default::default(),
71+
}
72+
}
73+
}
74+
75+
#[cfg(test)]
76+
mod test {
77+
use std::sync::Arc;
78+
79+
use arrow::array::{Array, ArrayRef};
80+
use parquet_variant::{path::VariantPathElement, VariantBuilder};
81+
82+
use crate::{VariantArray, VariantArrayBuilder};
83+
84+
use super::{variant_get, GetOptions};
85+
86+
#[test]
87+
fn get_primitive_variant() {
88+
let mut builder = VariantBuilder::new();
89+
builder.add_field_name("some_field");
90+
let mut object = builder.new_object();
91+
object.insert("some_field", 1234i64);
92+
object.finish().unwrap();
93+
let (metadata, value) = builder.finish();
94+
95+
let mut builder = VariantArrayBuilder::new(1);
96+
builder.append_variant_buffers(&metadata, &value);
97+
98+
let variant_array = builder.build();
99+
100+
let input = Arc::new(variant_array) as ArrayRef;
101+
102+
let result = variant_get(
103+
&input,
104+
GetOptions::new_with_path(
105+
vec![VariantPathElement::field("some_field".to_owned())].into(),
106+
),
107+
)
108+
.unwrap();
109+
110+
let result: &VariantArray = result.as_any().downcast_ref().unwrap();
111+
assert!(result.nulls().is_none());
112+
let result = result.value(0);
113+
assert_eq!(result.as_int64().unwrap(), 1234);
114+
}
115+
116+
#[test]
117+
#[should_panic(
118+
expected = "Nested values are handled specially by ObjectBuilder and ListBuilder"
119+
)]
120+
fn get_complex_variant() {
121+
let mut builder = VariantBuilder::new();
122+
builder.add_field_name("top_level_field");
123+
builder.add_field_name("inner_field");
124+
125+
let mut object = builder.new_object();
126+
let mut inner_object = object.new_object("top_level_field");
127+
inner_object.insert("inner_field", 1234i64);
128+
inner_object.finish().unwrap();
129+
object.finish().unwrap();
130+
let (metadata, value) = builder.finish();
131+
132+
let mut builder = VariantArrayBuilder::new(1);
133+
builder.append_variant_buffers(&metadata, &value);
134+
135+
let variant_array = builder.build();
136+
137+
let input = Arc::new(variant_array) as ArrayRef;
138+
139+
let result = variant_get(
140+
&input,
141+
GetOptions::new_with_path(
142+
vec![VariantPathElement::field("top_level_field".to_owned())].into(),
143+
),
144+
)
145+
.unwrap();
146+
147+
// uncomment once implemented
148+
todo!("{:?}", result);
149+
// let result: &VariantArray = result.as_any().downcast_ref().unwrap();
150+
// assert!(result.nulls().is_none());
151+
// let result = result.value(0);
152+
// let result = result.as_object().unwrap();
153+
// let binding = result.get("top_level_field").unwrap();
154+
// let result = binding.as_object().unwrap();
155+
// let result = result.get("inner_field").unwrap();
156+
// assert_eq!(result.as_int64().unwrap(), 1234);
157+
}
158+
}

parquet-variant/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ mod builder;
3131
mod decoder;
3232
mod utils;
3333
mod variant;
34+
pub mod path;
3435

3536
pub use builder::*;
3637
pub use variant::*;

parquet-variant/src/path.rs

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
use std::ops::Deref;
2+
3+
/// Represents a qualified path to a potential subfield or index of a variant value.
4+
#[derive(Debug, Clone)]
5+
pub struct VariantPath(Vec<VariantPathElement>);
6+
7+
impl VariantPath {
8+
pub fn new(path: Vec<VariantPathElement>) -> Self {
9+
Self(path)
10+
}
11+
12+
pub fn path(&self) -> &Vec<VariantPathElement> {
13+
&self.0
14+
}
15+
}
16+
17+
impl From<Vec<VariantPathElement>> for VariantPath {
18+
fn from(value: Vec<VariantPathElement>) -> Self {
19+
Self::new(value)
20+
}
21+
}
22+
23+
impl Deref for VariantPath {
24+
type Target = Vec<VariantPathElement>;
25+
26+
fn deref(&self) -> &Self::Target {
27+
&self.0
28+
}
29+
}
30+
31+
/// Element of a path
32+
#[derive(Debug, Clone)]
33+
pub enum VariantPathElement {
34+
/// Access field with name `name`
35+
Field { name: String },
36+
/// Access the list element at `index`
37+
Index { index: usize },
38+
}
39+
40+
impl VariantPathElement {
41+
pub fn field(name: String) -> VariantPathElement {
42+
VariantPathElement::Field { name }
43+
}
44+
45+
pub fn index(index: usize) -> VariantPathElement {
46+
VariantPathElement::Index { index }
47+
}
48+
}

parquet-variant/src/variant.rs

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ pub use self::object::VariantObject;
2222
use crate::decoder::{
2323
self, get_basic_type, get_primitive_type, VariantBasicType, VariantPrimitiveType,
2424
};
25+
use crate::path::{VariantPath, VariantPathElement};
2526
use crate::utils::{first_byte_from_slice, slice_from_slice};
2627
use std::ops::Deref;
2728

@@ -1063,6 +1064,34 @@ impl<'m, 'v> Variant<'m, 'v> {
10631064
_ => None,
10641065
}
10651066
}
1067+
1068+
/// Return a new Variant with the path followed.
1069+
///
1070+
/// If the path is not found, `None` is returned.
1071+
pub fn get_path(&self, path: &VariantPath) -> Option<Variant> {
1072+
let mut output = self.clone();
1073+
for element in path.deref() {
1074+
match element {
1075+
VariantPathElement::Field { name } => {
1076+
let Variant::Object(variant_object) = output else {
1077+
return None;
1078+
};
1079+
let field = variant_object.get(name);
1080+
let field = field?;
1081+
output = field;
1082+
}
1083+
VariantPathElement::Index { index } => {
1084+
let Variant::List(variant_list) = output else {
1085+
return None;
1086+
};
1087+
let index = variant_list.get(*index);
1088+
let index = index?;
1089+
output = index;
1090+
}
1091+
}
1092+
}
1093+
Some(output)
1094+
}
10661095
}
10671096

10681097
impl From<()> for Variant<'_, '_> {

0 commit comments

Comments
 (0)