Skip to content

Commit daf31be

Browse files
authored
[Variant] Add VariantBuilder::new_with_buffers to write to existing buffers (#7912)
# Which issue does this PR close? - closes #7805 - part of #6736 - part of #7911 # Rationale for this change I would like to be able to write Variants directly into the target buffer when writing multiple variants. However, the current VariantBuilder allocates a new buffer for each variant. # What changes are included in this PR? 1. Add `VariantBuilder::new_with_buffers` and docs and tests. You can see how this API can be used to write directly into a buffer in VariantArrayBuilder in this PR: - #7911 # Are these changes tested? Yes, new tests. # Are there any user-facing changes? New API.
1 parent fe77f2f commit daf31be

File tree

1 file changed

+184
-15
lines changed

1 file changed

+184
-15
lines changed

parquet-variant/src/builder.rs

Lines changed: 184 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,35 @@ fn write_offset(buf: &mut Vec<u8>, value: usize, nbytes: u8) {
6161
buf.extend_from_slice(&bytes[..nbytes as usize]);
6262
}
6363

64-
#[derive(Default)]
64+
/// Wrapper around a `Vec<u8>` that provides methods for appending
65+
/// primitive values, variant types, and metadata.
66+
///
67+
/// This is used internally by the builders to construct the
68+
/// `value` field for [`Variant`] values.
69+
///
70+
/// You can reuse an existing `Vec<u8>` by using the `from` impl
71+
#[derive(Debug, Default)]
6572
struct ValueBuffer(Vec<u8>);
6673

74+
impl ValueBuffer {
75+
/// Construct a ValueBuffer that will write to a new underlying `Vec`
76+
fn new() -> Self {
77+
Default::default()
78+
}
79+
}
80+
81+
impl From<Vec<u8>> for ValueBuffer {
82+
fn from(value: Vec<u8>) -> Self {
83+
Self(value)
84+
}
85+
}
86+
87+
impl From<ValueBuffer> for Vec<u8> {
88+
fn from(value_buffer: ValueBuffer) -> Self {
89+
value_buffer.0
90+
}
91+
}
92+
6793
impl ValueBuffer {
6894
fn append_u8(&mut self, term: u8) {
6995
self.0.push(term);
@@ -82,7 +108,7 @@ impl ValueBuffer {
82108
}
83109

84110
fn into_inner(self) -> Vec<u8> {
85-
self.0
111+
self.into()
86112
}
87113

88114
fn inner_mut(&mut self) -> &mut Vec<u8> {
@@ -252,13 +278,31 @@ impl ValueBuffer {
252278
}
253279
}
254280

255-
#[derive(Default)]
281+
/// Builder for constructing metadata for [`Variant`] values.
282+
///
283+
/// This is used internally by the [`VariantBuilder`] to construct the metadata
284+
///
285+
/// You can use an existing `Vec<u8>` as the metadata buffer by using the `from` impl.
286+
#[derive(Default, Debug)]
256287
struct MetadataBuilder {
257288
// Field names -- field_ids are assigned in insert order
258289
field_names: IndexSet<String>,
259290

260291
// flag that checks if field names by insertion order are also lexicographically sorted
261292
is_sorted: bool,
293+
294+
/// Output buffer. Metadata is written to the end of this buffer
295+
metadata_buffer: Vec<u8>,
296+
}
297+
298+
/// Create a new MetadataBuilder that will write to the specified metadata buffer
299+
impl From<Vec<u8>> for MetadataBuilder {
300+
fn from(metadata_buffer: Vec<u8>) -> Self {
301+
Self {
302+
metadata_buffer,
303+
..Default::default()
304+
}
305+
}
262306
}
263307

264308
impl MetadataBuilder {
@@ -307,6 +351,12 @@ impl MetadataBuilder {
307351
// Calculate metadata size
308352
let total_dict_size: usize = self.metadata_size();
309353

354+
let Self {
355+
field_names,
356+
is_sorted,
357+
mut metadata_buffer,
358+
} = self;
359+
310360
// Determine appropriate offset size based on the larger of dict size or total string size
311361
let max_offset = std::cmp::max(total_dict_size, nkeys);
312362
let offset_size = int_size(max_offset);
@@ -315,29 +365,29 @@ impl MetadataBuilder {
315365
let string_start = offset_start + (nkeys + 1) * offset_size as usize;
316366
let metadata_size = string_start + total_dict_size;
317367

318-
let mut metadata = Vec::with_capacity(metadata_size);
368+
metadata_buffer.reserve(metadata_size);
319369

320370
// Write header: version=1, sorted-field-names flag in bit 4, and (offset_size - 1) in the top two bits
321-
metadata.push(0x01 | (self.is_sorted as u8) << 4 | ((offset_size - 1) << 6));
371+
metadata_buffer.push(0x01 | (is_sorted as u8) << 4 | ((offset_size - 1) << 6));
322372

323373
// Write dictionary size
324-
write_offset(&mut metadata, nkeys, offset_size);
374+
write_offset(&mut metadata_buffer, nkeys, offset_size);
325375

326376
// Write offsets
327377
let mut cur_offset = 0;
328-
for key in self.field_names.iter() {
329-
write_offset(&mut metadata, cur_offset, offset_size);
378+
for key in field_names.iter() {
379+
write_offset(&mut metadata_buffer, cur_offset, offset_size);
330380
cur_offset += key.len();
331381
}
332382
// Write final offset
333-
write_offset(&mut metadata, cur_offset, offset_size);
383+
write_offset(&mut metadata_buffer, cur_offset, offset_size);
334384

335385
// Write string data
336-
for key in self.field_names {
337-
metadata.extend_from_slice(key.as_bytes());
386+
for key in field_names {
387+
metadata_buffer.extend_from_slice(key.as_bytes());
338388
}
339389

340-
metadata
390+
metadata_buffer
341391
}
342392
}
343393

@@ -570,6 +620,41 @@ impl ParentState<'_> {
570620
/// );
571621
///
572622
/// ```
623+
/// # Example: Reusing Buffers
624+
///
625+
/// You can use the [`VariantBuilder`] to write into existing buffers (for
626+
/// example to write multiple variants back to back in the same buffer)
627+
///
628+
/// ```
629+
/// // we will write two variants back to back
630+
/// use parquet_variant::{Variant, VariantBuilder};
631+
/// // Append 12345
632+
/// let mut builder = VariantBuilder::new();
633+
/// builder.append_value(12345);
634+
/// let (metadata, value) = builder.finish();
635+
/// // remember where the first variant ends
636+
/// let (first_meta_offset, first_meta_len) = (0, metadata.len());
637+
/// let (first_value_offset, first_value_len) = (0, value.len());
638+
///
639+
/// // now, append a second variant to the same buffers
640+
/// let mut builder = VariantBuilder::new_with_buffers(metadata, value);
641+
/// builder.append_value("Foo");
642+
/// let (metadata, value) = builder.finish();
643+
///
644+
/// // The variants can be referenced in their appropriate location
645+
/// let variant1 = Variant::new(
646+
/// &metadata[first_meta_offset..first_meta_len],
647+
/// &value[first_value_offset..first_value_len]
648+
/// );
649+
/// assert_eq!(variant1, Variant::Int32(12345));
650+
///
651+
/// let variant2 = Variant::new(
652+
/// &metadata[first_meta_len..],
653+
/// &value[first_value_len..]
654+
/// );
655+
/// assert_eq!(variant2, Variant::from("Foo"));
656+
/// ```
657+
///
573658
/// # Example: Unique Field Validation
574659
///
575660
/// This example shows how enabling unique field validation will cause an error
@@ -626,23 +711,33 @@ impl ParentState<'_> {
626711
/// let (metadata, value) = builder.finish();
627712
/// let variant = Variant::try_new(&metadata, &value).unwrap();
628713
/// ```
629-
///
630-
#[derive(Default)]
714+
#[derive(Default, Debug)]
631715
pub struct VariantBuilder {
632716
buffer: ValueBuffer,
633717
metadata_builder: MetadataBuilder,
634718
validate_unique_fields: bool,
635719
}
636720

637721
impl VariantBuilder {
722+
/// Create a new VariantBuilder with new underlying metadata and value buffers
638723
pub fn new() -> Self {
639724
Self {
640-
buffer: ValueBuffer::default(),
725+
buffer: ValueBuffer::new(),
641726
metadata_builder: MetadataBuilder::default(),
642727
validate_unique_fields: false,
643728
}
644729
}
645730

731+
/// Create a new VariantBuilder that will write the metadata and values to
732+
/// the specified buffers.
733+
pub fn new_with_buffers(metadata_buffer: Vec<u8>, value_buffer: Vec<u8>) -> Self {
734+
Self {
735+
buffer: ValueBuffer::from(value_buffer),
736+
metadata_builder: MetadataBuilder::from(metadata_buffer),
737+
validate_unique_fields: false,
738+
}
739+
}
740+
646741
/// Enables validation of unique field keys in nested objects.
647742
///
648743
/// This setting is propagated to all [`ObjectBuilder`]s created through this [`VariantBuilder`]
@@ -1916,6 +2011,80 @@ mod tests {
19162011
assert_eq!(metadata.num_field_names(), 3);
19172012
}
19182013

2014+
/// Test reusing buffers with nested objects
2015+
#[test]
2016+
fn test_with_existing_buffers_nested() {
2017+
let mut builder = VariantBuilder::new();
2018+
append_test_list(&mut builder);
2019+
let (m1, v1) = builder.finish();
2020+
let variant1 = Variant::new(&m1, &v1);
2021+
2022+
let mut builder = VariantBuilder::new();
2023+
append_test_object(&mut builder);
2024+
let (m2, v2) = builder.finish();
2025+
let variant2 = Variant::new(&m2, &v2);
2026+
2027+
let mut builder = VariantBuilder::new();
2028+
builder.append_value("This is a string");
2029+
let (m3, v3) = builder.finish();
2030+
let variant3 = Variant::new(&m3, &v3);
2031+
2032+
// Now, append those three variants to a new buffer that is reused
2033+
let mut builder = VariantBuilder::new();
2034+
append_test_list(&mut builder);
2035+
let (metadata, value) = builder.finish();
2036+
let (meta1_offset, meta1_end) = (0, metadata.len());
2037+
let (value1_offset, value1_end) = (0, value.len());
2038+
2039+
// reuse same buffer
2040+
let mut builder = VariantBuilder::new_with_buffers(metadata, value);
2041+
append_test_object(&mut builder);
2042+
let (metadata, value) = builder.finish();
2043+
let (meta2_offset, meta2_end) = (meta1_end, metadata.len());
2044+
let (value2_offset, value2_end) = (value1_end, value.len());
2045+
2046+
// Append a string
2047+
let mut builder = VariantBuilder::new_with_buffers(metadata, value);
2048+
builder.append_value("This is a string");
2049+
let (metadata, value) = builder.finish();
2050+
let (meta3_offset, meta3_end) = (meta2_end, metadata.len());
2051+
let (value3_offset, value3_end) = (value2_end, value.len());
2052+
2053+
// verify we can read the variants back correctly
2054+
let roundtrip1 = Variant::new(
2055+
&metadata[meta1_offset..meta1_end],
2056+
&value[value1_offset..value1_end],
2057+
);
2058+
assert_eq!(roundtrip1, variant1,);
2059+
2060+
let roundtrip2 = Variant::new(
2061+
&metadata[meta2_offset..meta2_end],
2062+
&value[value2_offset..value2_end],
2063+
);
2064+
assert_eq!(roundtrip2, variant2,);
2065+
2066+
let roundtrip3 = Variant::new(
2067+
&metadata[meta3_offset..meta3_end],
2068+
&value[value3_offset..value3_end],
2069+
);
2070+
assert_eq!(roundtrip3, variant3);
2071+
}
2072+
2073+
/// append a simple List variant
2074+
fn append_test_list(builder: &mut VariantBuilder) {
2075+
let mut list = builder.new_list();
2076+
list.append_value(1234);
2077+
list.append_value("a string value");
2078+
list.finish();
2079+
}
2080+
2081+
/// append an object variant
2082+
fn append_test_object(builder: &mut VariantBuilder) {
2083+
let mut obj = builder.new_object();
2084+
obj.insert("a", true);
2085+
obj.finish().unwrap();
2086+
}
2087+
19192088
#[test]
19202089
fn test_variant_builder_to_list_builder_no_finish() {
19212090
// Create a list builder but never finish it

0 commit comments

Comments
 (0)