Skip to content

Commit e10d41d

Browse files
committed
Avoid a copy in VariantArrayBuilder when creating JSON
1 parent daf31be commit e10d41d

File tree

4 files changed

+169
-23
lines changed

4 files changed

+169
-23
lines changed

parquet-variant-compute/src/from_json.rs

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
use crate::{VariantArray, VariantArrayBuilder};
2222
use arrow::array::{Array, ArrayRef, StringArray};
2323
use arrow_schema::ArrowError;
24-
use parquet_variant::VariantBuilder;
2524
use parquet_variant_json::json_to_variant;
2625

2726
/// Parse a batch of JSON strings into a batch of Variants represented as
@@ -41,10 +40,10 @@ pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<VariantArray, Ar
4140
// The subfields are expected to be non-nullable according to the parquet variant spec.
4241
variant_array_builder.append_null();
4342
} else {
44-
let mut vb = VariantBuilder::new();
43+
let mut vb = variant_array_builder.variant_builder();
44+
// parse JSON directly to the variant builder
4545
json_to_variant(input_string_array.value(i), &mut vb)?;
46-
let (metadata, value) = vb.finish();
47-
variant_array_builder.append_variant_buffers(&metadata, &value);
46+
vb.finish()
4847
}
4948
}
5049
Ok(variant_array_builder.build())

parquet-variant-compute/src/variant_array_builder.rs

Lines changed: 154 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
use crate::VariantArray;
2121
use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray};
2222
use arrow_schema::{DataType, Field, Fields};
23-
use parquet_variant::{Variant, VariantBuilder};
23+
use parquet_variant::{ListBuilder, ObjectBuilder, Variant, VariantBuilder, VariantBuilderExt};
2424
use std::sync::Arc;
2525

2626
/// A builder for [`VariantArray`]
@@ -147,11 +147,9 @@ impl VariantArrayBuilder {
147147

148148
/// Append the [`Variant`] to the builder as the next row
149149
pub fn append_variant(&mut self, variant: Variant) {
150-
// TODO make this more efficient by avoiding the intermediate buffers
151-
let mut variant_builder = VariantBuilder::new();
152-
variant_builder.append_value(variant);
153-
let (metadata, value) = variant_builder.finish();
154-
self.append_variant_buffers(&metadata, &value);
150+
let mut direct_builder = self.variant_builder();
151+
direct_builder.variant_builder.append_value(variant);
152+
direct_builder.finish()
155153
}
156154

157155
/// Append a metadata and values buffer to the builder
@@ -168,7 +166,156 @@ impl VariantArrayBuilder {
168166
self.value_buffer.extend_from_slice(value);
169167
}
170168

171-
// TODO: Return a Variant builder that will write to the underlying buffers (TODO)
169+
/// Return a `VariantArrayVariantBuilder` that writes directly to the
170+
/// buffers of this builder.
171+
///
172+
/// You must call [`VariantArrayVariantBuilder::finish`] to complete the builder
173+
///
174+
/// # Example
175+
/// ```
176+
/// # use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt};
177+
/// # use parquet_variant_compute::{VariantArray, VariantArrayBuilder};
178+
/// let mut array_builder = VariantArrayBuilder::new(10);
179+
///
180+
/// // First row has a string
181+
/// let mut variant_builder = array_builder.variant_builder();
182+
/// variant_builder.append_value("Hello, World!");
183+
/// // must call finish to write the variant to the buffers
184+
/// variant_builder.finish();
185+
///
186+
/// // Second row is an object
187+
/// let mut variant_builder = array_builder.variant_builder();
188+
/// let mut obj = variant_builder.new_object();
189+
/// obj.insert("my_field", 42i64);
190+
/// obj.finish().unwrap();
191+
/// variant_builder.finish();
192+
///
193+
/// // finalize the array
194+
/// let variant_array: VariantArray = array_builder.build();
195+
///
196+
/// // verify what we wrote is still there
197+
/// assert_eq!(variant_array.value(0), Variant::from("Hello, World!"));
198+
/// assert!(variant_array.value(1).as_object().is_some());
199+
/// ```
200+
pub fn variant_builder(&mut self) -> VariantArrayVariantBuilder {
201+
// append directly into the metadata and value buffers
202+
let metadata_buffer = std::mem::take(&mut self.metadata_buffer);
203+
let value_buffer = std::mem::take(&mut self.value_buffer);
204+
let metadata_offset = metadata_buffer.len();
205+
let value_offset = value_buffer.len();
206+
VariantArrayVariantBuilder {
207+
finished: false,
208+
metadata_offset,
209+
value_offset,
210+
variant_builder: VariantBuilder::new_with_buffers(metadata_buffer, value_buffer),
211+
array_builder: self,
212+
}
213+
}
214+
}
215+
216+
/// A `VariantBuilder` that writes directly to the buffers of a `VariantArrayBuilder`.
217+
///
218+
/// Note this struct implements [`VariantBuilderExt`], so it can be used
219+
/// as a drop-in replacement for [`VariantBuilder`] in most cases.
220+
///
221+
/// If [`Self::finish`] is not called, any changes will be rolled back
222+
///
223+
/// See [`VariantArrayBuilder::variant_builder`] for an example
224+
pub struct VariantArrayVariantBuilder<'a> {
225+
/// was finish called?
226+
finished: bool,
227+
/// starting metadata offset in the underlying buffers
228+
metadata_offset: usize,
229+
/// starting value offset
230+
value_offset: usize,
231+
/// Parent array builder that this variant builder writes to. Buffers
232+
/// have been moved into the variant builder, and must be returned on
233+
/// drop
234+
array_builder: &'a mut VariantArrayBuilder,
235+
/// Builder for the variant, temporarily owns the buffers from `array_builder`
236+
variant_builder: VariantBuilder,
237+
}
238+
239+
impl<'a> VariantBuilderExt for VariantArrayVariantBuilder<'a> {
240+
fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>) {
241+
self.variant_builder.append_value(value);
242+
}
243+
244+
fn new_list(&mut self) -> ListBuilder {
245+
self.variant_builder.new_list()
246+
}
247+
248+
fn new_object(&mut self) -> ObjectBuilder {
249+
self.variant_builder.new_object()
250+
}
251+
}
252+
253+
impl VariantArrayVariantBuilder<'_> {
254+
/// Return a reference to the underlying `VariantBuilder`
255+
pub fn inner(&self) -> &VariantBuilder {
256+
&self.variant_builder
257+
}
258+
259+
/// Return a mutable reference to the underlying `VariantBuilder`
260+
pub fn inner_mut(&mut self) -> &mut VariantBuilder {
261+
&mut self.variant_builder
262+
}
263+
264+
/// Called to finish the in progress variant and write it to the underlying
265+
/// buffers
266+
///
267+
/// Note if you do not call finish, on drop any changes made to the
268+
/// underlying buffers will be rolled back.
269+
pub fn finish(mut self) {
270+
self.finished = true;
271+
// Note: buffers are returned and replaced in the drop impl
272+
}
273+
}
274+
275+
impl<'a> Drop for VariantArrayVariantBuilder<'a> {
276+
/// If the builder was not finished, roll back any changes made to the
277+
/// underlying buffers (by truncating them)
278+
fn drop(&mut self) {
279+
let metadata_offset = self.metadata_offset;
280+
let value_offset = self.value_offset;
281+
282+
// get the buffers back from the variant builder
283+
let (mut metadata_buffer, mut value_buffer) =
284+
std::mem::take(&mut self.variant_builder).finish();
285+
286+
// Sanity Check: if the buffers got smaller, something went wrong (previous data was lost)
287+
let metadata_len = metadata_buffer
288+
.len()
289+
.checked_sub(metadata_offset)
290+
.expect("metadata length decreased unexpectedly");
291+
let value_len = value_buffer
292+
.len()
293+
.checked_sub(value_offset)
294+
.expect("value length decreased unexpectedly");
295+
296+
if self.finished {
297+
// if the object was finished, commit the changes by putting the
298+
// offsets and lengths into the parent array builder.
299+
self.array_builder
300+
.metadata_locations
301+
.push((metadata_offset, metadata_len));
302+
self.array_builder
303+
.value_locations
304+
.push((value_offset, value_len));
305+
self.array_builder.nulls.append_non_null();
306+
} else {
307+
// if the object was not finished, truncate the buffers to the
308+
// original offsets to roll back any changes. Note this is fast
309+
// because truncate doesn't free any memory: it just has to drop
310+
// elements (but u8 doesn't have a destructor)
311+
metadata_buffer.truncate(metadata_offset);
312+
value_buffer.truncate(value_offset);
313+
}
314+
315+
// put the buffers back into the array builder
316+
self.array_builder.metadata_buffer = metadata_buffer;
317+
self.array_builder.value_buffer = value_buffer;
318+
}
172319
}
173320

174321
fn binary_view_array_from_buffers(

parquet-variant-json/src/from_json.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
//! Module for parsing JSON strings as Variant
1919
2020
use arrow_schema::ArrowError;
21-
use parquet_variant::{ListBuilder, ObjectBuilder, Variant, VariantBuilder, VariantBuilderExt};
21+
use parquet_variant::{ListBuilder, ObjectBuilder, Variant, VariantBuilderExt};
2222
use serde_json::{Number, Value};
2323

2424
/// Converts a JSON string to Variant using [`VariantBuilder`]. The resulting `value` and `metadata`
@@ -62,15 +62,15 @@ use serde_json::{Number, Value};
6262
/// assert_eq!(json_result, serde_json::to_string(&json_value)?);
6363
/// # Ok::<(), Box<dyn std::error::Error>>(())
6464
/// ```
65-
pub fn json_to_variant(json: &str, builder: &mut VariantBuilder) -> Result<(), ArrowError> {
65+
pub fn json_to_variant(json: &str, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> {
6666
let json: Value = serde_json::from_str(json)
6767
.map_err(|e| ArrowError::InvalidArgumentError(format!("JSON format error: {e}")))?;
6868

6969
build_json(&json, builder)?;
7070
Ok(())
7171
}
7272

73-
fn build_json(json: &Value, builder: &mut VariantBuilder) -> Result<(), ArrowError> {
73+
fn build_json(json: &Value, builder: &mut impl VariantBuilderExt) -> Result<(), ArrowError> {
7474
append_json(json, builder)?;
7575
Ok(())
7676
}
@@ -101,7 +101,7 @@ fn variant_from_number<'m, 'v>(n: &Number) -> Result<Variant<'m, 'v>, ArrowError
101101

102102
fn append_json<'m, 'v>(
103103
json: &'v Value,
104-
builder: &mut impl VariantBuilderExt<'m, 'v>,
104+
builder: &mut impl VariantBuilderExt,
105105
) -> Result<(), ArrowError> {
106106
match json {
107107
Value::Null => builder.append_value(Variant::Null),
@@ -137,8 +137,8 @@ struct ObjectFieldBuilder<'o, 'v, 's> {
137137
builder: &'o mut ObjectBuilder<'v>,
138138
}
139139

140-
impl<'m, 'v> VariantBuilderExt<'m, 'v> for ObjectFieldBuilder<'_, '_, '_> {
141-
fn append_value(&mut self, value: impl Into<Variant<'m, 'v>>) {
140+
impl VariantBuilderExt for ObjectFieldBuilder<'_, '_, '_> {
141+
fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>) {
142142
self.builder.insert(self.key, value);
143143
}
144144

parquet-variant/src/builder.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1047,16 +1047,16 @@ impl Drop for ObjectBuilder<'_> {
10471047
///
10481048
/// Allows users to append values to a [`VariantBuilder`], [`ListBuilder`] or
10491049
/// [`ObjectBuilder`]. using the same interface.
1050-
pub trait VariantBuilderExt<'m, 'v> {
1051-
fn append_value(&mut self, value: impl Into<Variant<'m, 'v>>);
1050+
pub trait VariantBuilderExt {
1051+
fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>);
10521052

10531053
fn new_list(&mut self) -> ListBuilder;
10541054

10551055
fn new_object(&mut self) -> ObjectBuilder;
10561056
}
10571057

1058-
impl<'m, 'v> VariantBuilderExt<'m, 'v> for ListBuilder<'_> {
1059-
fn append_value(&mut self, value: impl Into<Variant<'m, 'v>>) {
1058+
impl VariantBuilderExt for ListBuilder<'_> {
1059+
fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>) {
10601060
self.append_value(value);
10611061
}
10621062

@@ -1069,8 +1069,8 @@ impl<'m, 'v> VariantBuilderExt<'m, 'v> for ListBuilder<'_> {
10691069
}
10701070
}
10711071

1072-
impl<'m, 'v> VariantBuilderExt<'m, 'v> for VariantBuilder {
1073-
fn append_value(&mut self, value: impl Into<Variant<'m, 'v>>) {
1072+
impl VariantBuilderExt for VariantBuilder {
1073+
fn append_value<'m, 'v>(&mut self, value: impl Into<Variant<'m, 'v>>) {
10741074
self.append_value(value);
10751075
}
10761076

0 commit comments

Comments
 (0)