Skip to content

Commit 8166cb6

Browse files
committed
Avoid a copy in VariantArrayBuilder when creating JSON
1 parent daf31be commit 8166cb6

File tree

2 files changed

+153
-12
lines changed

2 files changed

+153
-12
lines changed

parquet-variant-compute/src/from_json.rs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,6 @@
2121
use crate::{VariantArray, VariantArrayBuilder};
2222
use arrow::array::{Array, ArrayRef, StringArray};
2323
use arrow_schema::ArrowError;
24-
use parquet_variant::VariantBuilder;
2524
use parquet_variant_json::json_to_variant;
2625

2726
/// Parse a batch of JSON strings into a batch of Variants represented as
@@ -41,10 +40,9 @@ pub fn batch_json_string_to_variant(input: &ArrayRef) -> Result<VariantArray, Ar
4140
// The subfields are expected to be non-nullable according to the parquet variant spec.
4241
variant_array_builder.append_null();
4342
} else {
44-
let mut vb = VariantBuilder::new();
45-
json_to_variant(input_string_array.value(i), &mut vb)?;
46-
let (metadata, value) = vb.finish();
47-
variant_array_builder.append_variant_buffers(&metadata, &value);
43+
let mut vb = variant_array_builder.variant_builder();
44+
json_to_variant(input_string_array.value(i), vb.inner_mut())?;
45+
vb.finish()
4846
}
4947
}
5048
Ok(variant_array_builder.build())

parquet-variant-compute/src/variant_array_builder.rs

Lines changed: 150 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
use crate::VariantArray;
2121
use arrow::array::{ArrayRef, BinaryViewArray, BinaryViewBuilder, NullBufferBuilder, StructArray};
2222
use arrow_schema::{DataType, Field, Fields};
23-
use parquet_variant::{Variant, VariantBuilder};
23+
use parquet_variant::{ListBuilder, ObjectBuilder, Variant, VariantBuilder, VariantBuilderExt};
2424
use std::sync::Arc;
2525

2626
/// A builder for [`VariantArray`]
@@ -147,11 +147,9 @@ impl VariantArrayBuilder {
147147

148148
/// Append the [`Variant`] to the builder as the next row
149149
pub fn append_variant(&mut self, variant: Variant) {
150-
// TODO make this more efficient by avoiding the intermediate buffers
151-
let mut variant_builder = VariantBuilder::new();
152-
variant_builder.append_value(variant);
153-
let (metadata, value) = variant_builder.finish();
154-
self.append_variant_buffers(&metadata, &value);
150+
let mut direct_builder = self.variant_builder();
151+
direct_builder.variant_builder.append_value(variant);
152+
direct_builder.finish()
155153
}
156154

157155
/// Append a metadata and values buffer to the builder
@@ -168,7 +166,152 @@ impl VariantArrayBuilder {
168166
self.value_buffer.extend_from_slice(value);
169167
}
170168

171-
// TODO: Return a Variant builder that will write to the underlying buffers (TODO)
169+
/// Return a `VariantArrayVariantBuilder` that writes directly to the
170+
/// buffers of this builder.
171+
///
172+
/// You must call [`VariantArrayVariantBuilder::finish`] to complete the builder
173+
///
174+
/// # Example
175+
/// ```
176+
/// # use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt};
177+
/// # use parquet_variant_compute::{VariantArray, VariantArrayBuilder};
178+
/// let mut array_builder = VariantArrayBuilder::new(10);
179+
///
180+
/// // First row has a string
181+
/// let mut variant_builder = array_builder.variant_builder();
182+
/// variant_builder.append_value("Hello, World!");
183+
/// // must call finish to write the variant to the buffers
184+
/// variant_builder.finish();
185+
///
186+
/// // Second row is an object
187+
/// let mut variant_builder = array_builder.variant_builder();
188+
/// let mut obj = variant_builder.new_object();
189+
/// obj.insert("my_field", 42i64);
190+
/// obj.finish().unwrap();
191+
/// variant_builder.finish();
192+
///
193+
/// // finalize the array
194+
/// let variant_array: VariantArray = array_builder.build();
195+
///
196+
/// // verify what we wrote is still there
197+
/// assert_eq!(variant_array.value(0), Variant::from("Hello, World!"));
198+
/// assert!(variant_array.value(1).as_object().is_some());
199+
/// ```
200+
pub fn variant_builder(&mut self) -> VariantArrayVariantBuilder {
201+
// append directly into the metadata and value buffers
202+
let metadata_buffer = std::mem::take(&mut self.metadata_buffer);
203+
let value_buffer = std::mem::take(&mut self.value_buffer);
204+
let metadata_offset = metadata_buffer.len();
205+
let value_offset = value_buffer.len();
206+
VariantArrayVariantBuilder {
207+
finished: false,
208+
metadata_offset,
209+
value_offset,
210+
variant_builder: VariantBuilder::new_with_buffers(metadata_buffer, value_buffer),
211+
array_builder: self,
212+
}
213+
}
214+
}
215+
216+
/// A `VariantBuilder` that writes directly to the buffers of a `VariantArrayBuilder`.
217+
///
218+
/// Note this struct implements [`VariantBuilderExt`], so it can be used
219+
/// as a drop-in replacement for [`VariantBuilder`] in most cases.
220+
///
221+
/// See [`VariantArrayBuilder::variant_builder`] for an example
222+
pub struct VariantArrayVariantBuilder<'a> {
223+
/// was finish called?
224+
finished: bool,
225+
/// starting metadata offset in the underlying buffers
226+
metadata_offset: usize,
227+
/// starting value offset
228+
value_offset: usize,
229+
array_builder: &'a mut VariantArrayBuilder,
230+
variant_builder: VariantBuilder,
231+
}
232+
233+
impl<'a, 'm, 'v> VariantBuilderExt<'m, 'v> for VariantArrayVariantBuilder<'a> {
234+
fn append_value(&mut self, value: impl Into<Variant<'m, 'v>>) {
235+
self.variant_builder.append_value(value);
236+
}
237+
238+
fn new_list(&mut self) -> ListBuilder {
239+
self.variant_builder.new_list()
240+
}
241+
242+
fn new_object(&mut self) -> ObjectBuilder {
243+
self.variant_builder.new_object()
244+
}
245+
}
246+
247+
impl VariantArrayVariantBuilder<'_> {
248+
/// Return a reference to the underlying `VariantBuilder`
249+
pub fn inner(&self) -> &VariantBuilder {
250+
&self.variant_builder
251+
}
252+
253+
/// Return a mutable reference to the underlying `VariantBuilder`
254+
pub fn inner_mut(&mut self) -> &mut VariantBuilder {
255+
&mut self.variant_builder
256+
}
257+
258+
/// Called to finalize the variant and write it to the underlying buffers
259+
///
260+
/// Note if you do not call finish, the struct will be reset and the buffers
261+
/// will not be updated.
262+
///
263+
pub fn finish(mut self) {
264+
let metadata_offset = self.metadata_offset;
265+
let value_offset = self.value_offset;
266+
267+
// get the buffers back
268+
let (metadata_buffer, value_buffer) = std::mem::take(&mut self.variant_builder).finish();
269+
let metadata_len = metadata_buffer
270+
.len()
271+
.checked_sub(metadata_offset)
272+
.expect("metadata length decreased unexpectedly");
273+
let value_len = value_buffer
274+
.len()
275+
.checked_sub(value_offset)
276+
.expect("value length decreased unexpectedly");
277+
278+
// put the buffers back
279+
self.array_builder.metadata_buffer = metadata_buffer;
280+
self.array_builder.value_buffer = value_buffer;
281+
282+
// Append offsets and lengths for new nulls into the array builder
283+
self.array_builder
284+
.metadata_locations
285+
.push((metadata_offset, metadata_len));
286+
self.array_builder
287+
.value_locations
288+
.push((value_offset, value_len));
289+
self.array_builder.nulls.append_non_null();
290+
self.finished = true;
291+
}
292+
}
293+
294+
impl<'a> Drop for VariantArrayVariantBuilder<'a> {
295+
fn drop(&mut self) {
296+
if self.finished {
297+
// if the object was finished, we do not need to do anything
298+
return;
299+
}
300+
// if the object was not finished, we need to reset any partial state put the buffers back
301+
println!("VariantArrayVariantBuilder::drop");
302+
let variant_builder = std::mem::take(&mut self.variant_builder);
303+
let (mut metadata_buffer, mut value_buffer) = variant_builder.finish();
304+
assert!(
305+
metadata_buffer.len() >= self.metadata_offset,
306+
"metadata got smaller"
307+
);
308+
assert!(value_buffer.len() >= self.value_offset, "value got smaller");
309+
metadata_buffer.truncate(self.metadata_offset);
310+
value_buffer.truncate(self.value_offset);
311+
// put the buffers back
312+
self.array_builder.metadata_buffer = metadata_buffer;
313+
self.array_builder.value_buffer = value_buffer;
314+
}
172315
}
173316

174317
fn binary_view_array_from_buffers(

0 commit comments

Comments
 (0)