Skip to content

Commit 92e5c12

Browse files
committed
Avoid a copy in VariantArrayBuilder when creating JSON
1 parent 55fbf5c commit 92e5c12

File tree

5 files changed

+280
-32
lines changed

5 files changed

+280
-32
lines changed

parquet-variant-compute/src/from_json.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
use crate::{VariantArray, VariantArrayBuilder};
2222
use arrow::array::{Array, ArrayRef, StringArray};
2323
use arrow_schema::ArrowError;
24-
use parquet_variant::VariantBuilder;
2524
use parquet_variant_json::json_to_variant;
2625

2726
/// Parse a batch of JSON strings into a batch of Variants represented as
@@ -41,10 +40,10 @@ pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<VariantArray, Ar
4140
// The subfields are expected to be non-nullable according to the parquet variant spec.
4241
variant_array_builder.append_null();
4342
} else {
44-
let mut vb = VariantBuilder::new();
43+
let mut vb = variant_array_builder.variant_builder();
44+
// parse JSON directly to the variant builder
4545
json_to_variant(input_string_array.value(i), &mut vb)?;
46-
let (metadata, value) = vb.finish();
47-
variant_array_builder.append_variant_buffers(&metadata, &value);
46+
vb.finish()
4847
}
4948
}
5049
Ok(variant_array_builder.build())

parquet-variant-compute/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ mod variant_array_builder;
2222
pub mod variant_get;
2323

2424
pub use variant_array::VariantArray;
25-
pub use variant_array_builder::VariantArrayBuilder;
25+
pub use variant_array_builder::{VariantArrayBuilder, VariantArrayVariantBuilder};
2626

2727
pub use from_json::batch_json_string_to_variant;
2828
pub use to_json::batch_variant_to_json_string;

parquet-variant-compute/src/variant_array_builder.rs

Lines changed: 256 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
use crate::VariantArray;
2121
use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray};
2222
use arrow_schema::{DataType, Field, Fields};
23-
use parquet_variant::{Variant, VariantBuilder};
23+
use parquet_variant::{ListBuilder, ObjectBuilder, Variant, VariantBuilder, VariantBuilderExt};
2424
use std::sync::Arc;
2525

2626
/// A builder for [`VariantArray`]
@@ -37,13 +37,13 @@ use std::sync::Arc;
3737
/// ## Example:
3838
/// ```
3939
/// # use arrow::array::Array;
40-
/// # use parquet_variant::{Variant, VariantBuilder};
40+
/// # use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt};
4141
/// # use parquet_variant_compute::VariantArrayBuilder;
4242
/// // Create a new VariantArrayBuilder with a capacity of 100 rows
4343
/// let mut builder = VariantArrayBuilder::new(100);
4444
/// // append variant values
4545
/// builder.append_variant(Variant::from(42));
46-
/// // append a null row
46+
/// // append a null row (note not a Variant::Null)
4747
/// builder.append_null();
4848
/// // append a pre-constructed metadata and value buffers
4949
/// let (metadata, value) = {
@@ -55,9 +55,14 @@ use std::sync::Arc;
5555
/// };
5656
/// builder.append_variant_buffers(&metadata, &value);
5757
///
58+
/// // Use `variant_builder` method to write values directly to the output array
59+
/// let mut vb = builder.variant_builder();
60+
/// vb.append_value("Hello, World!");
61+
/// vb.finish(); // Note: call finish to write the variant to the buffers
62+
///
5863
/// // create the final VariantArray
5964
/// let variant_array = builder.build();
60-
/// assert_eq!(variant_array.len(), 3);
65+
/// assert_eq!(variant_array.len(), 4);
6166
/// // // Access the values
6267
/// // row 1 is not null and is an integer
6368
/// assert!(!variant_array.is_null(0));
@@ -67,6 +72,9 @@ use std::sync::Arc;
6772
/// // row 2 is not null and is an object
6873
/// assert!(!variant_array.is_null(2));
6974
/// assert!(variant_array.value(2).as_object().is_some());
75+
/// // row 3 is a string
76+
/// assert!(!variant_array.is_null(3));
77+
/// assert_eq!(variant_array.value(3), Variant::from("Hello, World!"));
7078
/// ```
7179
#[derive(Debug)]
7280
pub struct VariantArrayBuilder {
@@ -147,11 +155,9 @@ impl VariantArrayBuilder {
147155

148156
/// Append the [`Variant`] to the builder as the next row
149157
pub fn append_variant(&mut self, variant: Variant) {
150-
// TODO make this more efficient by avoiding the intermediate buffers
151-
let mut variant_builder = VariantBuilder::new();
152-
variant_builder.append_value(variant);
153-
let (metadata, value) = variant_builder.finish();
154-
self.append_variant_buffers(&metadata, &value);
158+
let mut direct_builder = self.variant_builder();
159+
direct_builder.variant_builder.append_value(variant);
160+
direct_builder.finish()
155161
}
156162

157163
/// Append a metadata and values buffer to the builder
@@ -168,7 +174,169 @@ impl VariantArrayBuilder {
168174
self.value_buffer.extend_from_slice(value);
169175
}
170176

171-
// TODO: Return a Variant builder that will write to the underlying buffers (TODO)
177+
/// Return a `VariantArrayVariantBuilder` that writes directly to the
178+
/// buffers of this builder.
179+
///
180+
/// You must call [`VariantArrayVariantBuilder::finish`] to complete the builder
181+
///
182+
/// # Example
183+
/// ```
184+
/// # use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt};
185+
/// # use parquet_variant_compute::{VariantArray, VariantArrayBuilder};
186+
/// let mut array_builder = VariantArrayBuilder::new(10);
187+
///
188+
/// // First row has a string
189+
/// let mut variant_builder = array_builder.variant_builder();
190+
/// variant_builder.append_value("Hello, World!");
191+
/// // must call finish to write the variant to the buffers
192+
/// variant_builder.finish();
193+
///
194+
/// // Second row is an object
195+
/// let mut variant_builder = array_builder.variant_builder();
196+
/// let mut obj = variant_builder.new_object();
197+
/// obj.insert("my_field", 42i64);
198+
/// obj.finish().unwrap();
199+
/// variant_builder.finish();
200+
///
201+
/// // finalize the array
202+
/// let variant_array: VariantArray = array_builder.build();
203+
///
204+
/// // verify what we wrote is still there
205+
/// assert_eq!(variant_array.value(0), Variant::from("Hello, World!"));
206+
/// assert!(variant_array.value(1).as_object().is_some());
207+
/// ```
208+
pub fn variant_builder(&mut self) -> VariantArrayVariantBuilder {
209+
// append directly into the metadata and value buffers
210+
let metadata_buffer = std::mem::take(&mut self.metadata_buffer);
211+
let value_buffer = std::mem::take(&mut self.value_buffer);
212+
VariantArrayVariantBuilder::new(self, metadata_buffer, value_buffer)
213+
}
214+
}
215+
216+
/// A `VariantBuilder` that writes directly to the buffers of a `VariantArrayBuilder`.
217+
///
218+
/// Note this struct implements [`VariantBuilderExt`], so it can be used
219+
/// as a drop-in replacement for [`VariantBuilder`] in most cases.
220+
///
221+
/// If [`Self::finish`] is not called, any changes will be rolled back
222+
///
223+
/// See [`VariantArrayBuilder::variant_builder`] for an example
224+
pub struct VariantArrayVariantBuilder<'a> {
225+
/// was finish called?
226+
finished: bool,
227+
/// starting offset in the variant_builder's `metadata` buffer
228+
metadata_offset: usize,
229+
/// starting offset in the variant_builder's `value` buffer
230+
value_offset: usize,
231+
/// Parent array builder that this variant builder writes to. Buffers
232+
/// have been moved into the variant builder, and must be returned on
233+
/// drop
234+
array_builder: &'a mut VariantArrayBuilder,
235+
/// Builder for the in progress variant value, temporarily owns the buffers
236+
/// from `array_builder`
237+
variant_builder: VariantBuilder,
238+
}
239+
240+
impl<'a> VariantBuilderExt for VariantArrayVariantBuilder<'a> {
241+
fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>) {
242+
self.variant_builder.append_value(value);
243+
}
244+
245+
fn new_list(&mut self) -> ListBuilder {
246+
self.variant_builder.new_list()
247+
}
248+
249+
fn new_object(&mut self) -> ObjectBuilder {
250+
self.variant_builder.new_object()
251+
}
252+
}
253+
254+
impl<'a> VariantArrayVariantBuilder<'a> {
255+
/// Constructs a new VariantArrayVariantBuilder
256+
///
257+
/// Note this is not public as this is a structure that is logically
258+
/// part of the [`VariantArrayBuilder`] and relies on its internal structure
259+
fn new(
260+
array_builder: &'a mut VariantArrayBuilder,
261+
metadata_buffer: Vec<u8>,
262+
value_buffer: Vec<u8>,
263+
) -> Self {
264+
let metadata_offset = metadata_buffer.len();
265+
let value_offset = value_buffer.len();
266+
VariantArrayVariantBuilder {
267+
finished: false,
268+
metadata_offset,
269+
value_offset,
270+
variant_builder: VariantBuilder::new_with_buffers(metadata_buffer, value_buffer),
271+
array_builder,
272+
}
273+
}
274+
275+
/// Return a reference to the underlying `VariantBuilder`
276+
pub fn inner(&self) -> &VariantBuilder {
277+
&self.variant_builder
278+
}
279+
280+
/// Return a mutable reference to the underlying `VariantBuilder`
281+
pub fn inner_mut(&mut self) -> &mut VariantBuilder {
282+
&mut self.variant_builder
283+
}
284+
285+
/// Called to finish the in progress variant and write it to the underlying
286+
/// buffers
287+
///
288+
/// Note if you do not call finish, on drop any changes made to the
289+
/// underlying buffers will be rolled back.
290+
pub fn finish(mut self) {
291+
self.finished = true;
292+
// Note: buffers are returned and replaced in the drop impl
293+
}
294+
}
295+
296+
impl<'a> Drop for VariantArrayVariantBuilder<'a> {
297+
/// If the builder was not finished, roll back any changes made to the
298+
/// underlying buffers (by truncating them)
299+
fn drop(&mut self) {
300+
let metadata_offset = self.metadata_offset;
301+
let value_offset = self.value_offset;
302+
303+
// get the buffers back from the variant builder
304+
let (mut metadata_buffer, mut value_buffer) =
305+
std::mem::take(&mut self.variant_builder).finish();
306+
307+
// Sanity Check: if the buffers got smaller, something went wrong (previous data was lost)
308+
let metadata_len = metadata_buffer
309+
.len()
310+
.checked_sub(metadata_offset)
311+
.expect("metadata length decreased unexpectedly");
312+
let value_len = value_buffer
313+
.len()
314+
.checked_sub(value_offset)
315+
.expect("value length decreased unexpectedly");
316+
317+
if self.finished {
318+
// if the object was finished, commit the changes by putting the
319+
// offsets and lengths into the parent array builder.
320+
self.array_builder
321+
.metadata_locations
322+
.push((metadata_offset, metadata_len));
323+
self.array_builder
324+
.value_locations
325+
.push((value_offset, value_len));
326+
self.array_builder.nulls.append_non_null();
327+
} else {
328+
// if the object was not finished, truncate the buffers to the
329+
// original offsets to roll back any changes. Note this is fast
330+
// because truncate doesn't free any memory: it just has to drop
331+
// elements (and u8 doesn't have a destructor)
332+
metadata_buffer.truncate(metadata_offset);
333+
value_buffer.truncate(value_offset);
334+
}
335+
336+
// put the buffers back into the array builder
337+
self.array_builder.metadata_buffer = metadata_buffer;
338+
self.array_builder.value_buffer = value_buffer;
339+
}
172340
}
173341

174342
fn binary_view_array_from_buffers(
@@ -220,4 +388,82 @@ mod test {
220388
);
221389
}
222390
}
391+
392+
/// Test using sub builders to append variants
393+
#[test]
394+
fn test_variant_array_builder_variant_builder() {
395+
let mut builder = VariantArrayBuilder::new(10);
396+
builder.append_null(); // should not panic
397+
builder.append_variant(Variant::from(42i32));
398+
399+
// let's make a sub-object in the next row
400+
let mut sub_builder = builder.variant_builder();
401+
let mut obj = sub_builder.new_object();
402+
obj.insert("foo", "bar");
403+
obj.finish().unwrap();
404+
sub_builder.finish(); // must call finish to write the variant to the buffers
405+
406+
// append a new list
407+
let mut sub_builder = builder.variant_builder();
408+
let mut list_builder = sub_builder.new_list();
409+
list_builder.append_value(Variant::from(1i32));
410+
list_builder.append_value(Variant::from(2i32));
411+
list_builder.finish();
412+
sub_builder.finish();
413+
let variant_array = builder.build();
414+
415+
assert_eq!(variant_array.len(), 4);
416+
assert!(variant_array.is_null(0));
417+
assert!(!variant_array.is_null(1));
418+
assert_eq!(variant_array.value(1), Variant::from(42i32));
419+
assert!(!variant_array.is_null(2));
420+
let variant = variant_array.value(2);
421+
let variant = variant.as_object().expect("variant to be an object");
422+
assert_eq!(variant.get("foo").unwrap(), Variant::from("bar"));
423+
assert!(!variant_array.is_null(3));
424+
let variant = variant_array.value(3);
425+
let list = variant.as_list().expect("variant to be a list");
426+
assert_eq!(list.len(), 2);
427+
}
428+
429+
/// Test using non-finished sub builders to append variants
430+
#[test]
431+
fn test_variant_array_builder_variant_builder_reset() {
432+
let mut builder = VariantArrayBuilder::new(10);
433+
434+
// make a sub-object in the first row
435+
let mut sub_builder = builder.variant_builder();
436+
let mut obj = sub_builder.new_object();
437+
obj.insert("foo", 1i32);
438+
obj.finish().unwrap();
439+
sub_builder.finish(); // must call finish to write the variant to the buffers
440+
441+
// start appending an object but don't finish
442+
let mut sub_builder = builder.variant_builder();
443+
let mut obj = sub_builder.new_object();
444+
obj.insert("bar", 2i32);
445+
obj.finish().unwrap();
446+
drop(sub_builder); // drop the sub builder without finishing it
447+
448+
// make a third sub-object (this should reset the previous unfinished object)
449+
let mut sub_builder = builder.variant_builder();
450+
let mut obj = sub_builder.new_object();
451+
obj.insert("baz", 3i32);
452+
obj.finish().unwrap();
453+
sub_builder.finish(); // must call finish to write the variant to the buffers
454+
455+
let variant_array = builder.build();
456+
457+
// only the two finished objects should be present
458+
assert_eq!(variant_array.len(), 2);
459+
assert!(!variant_array.is_null(0));
460+
let variant = variant_array.value(0);
461+
let variant = variant.as_object().expect("variant to be an object");
462+
assert_eq!(variant.get("foo").unwrap(), Variant::from(1i32));
463+
464+
assert!(!variant_array.is_null(1));
465+
let variant = variant_array.value(1);
466+
let variant = variant.as_object().expect("variant to be an object");
467+
assert_eq!(variant.get("baz").unwrap(), Variant::from(3i32));
468+
}
223469
}

0 commit comments

Comments
 (0)