Skip to content

Commit aef3bdd

Browse files
[Variant] Support creating sorted dictionaries (#7833)
~_note: this PR is based off of https://github.com/apache/arrow-rs/pull/7808_~ # Which issue does this PR close? - Closes #7698. # Rationale for this change The Parquet variant supports creating objects with sorted dictionaries, where field names are sorted in lexicographical order. Sorting the dictionary and reassigning field IDs afterward can be computationally expensive. This PR offers an alternative: allowing users to specify the field names upfront, so that the correct field IDs can be assigned directly. (The correct field ids being correlated to the lexicographical sort order). This PR introduces two new public methods to `VariantBuilder`: - `with_field_names`, a builder method that takes in a `self`, initializes the dictionary, and returns the modified builder - `add_field_name`, a method to add individual field names to the dictionary manually
1 parent 57f96f2 commit aef3bdd

File tree

2 files changed

+323
-5
lines changed

2 files changed

+323
-5
lines changed

parquet-variant/src/builder.rs

Lines changed: 322 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -237,18 +237,41 @@ impl ValueBuffer {
237237
struct MetadataBuilder {
238238
// Field names -- field_ids are assigned in insert order
239239
field_names: IndexSet<String>,
240+
241+
// Tracks whether the field names, in insertion order, are also in strictly ascending lexicographical order
242+
is_sorted: bool,
240243
}
241244

242245
impl MetadataBuilder {
243246
/// Upsert field name to dictionary, return its ID
244247
fn upsert_field_name(&mut self, field_name: &str) -> u32 {
245-
let (id, _) = self.field_names.insert_full(field_name.to_string());
248+
let (id, new_entry) = self.field_names.insert_full(field_name.to_string());
249+
250+
if new_entry {
251+
let n = self.num_field_names();
252+
253+
// Dictionary sort order tracking:
254+
// - An empty dictionary is unsorted (ambiguous in spec but required by interop tests)
255+
// - A single-entry dictionary is trivially sorted
256+
// - Otherwise, an already-sorted dictionary becomes unsorted if the new entry breaks order
257+
self.is_sorted =
258+
n == 1 || self.is_sorted && (self.field_names[n - 2] < self.field_names[n - 1]);
259+
}
246260

247261
id as u32
248262
}
249263

264+
/// Returns the number of field names stored in the metadata builder.
265+
/// Note: this method should be the only place to call `self.field_names.len()`
266+
///
267+
/// # Panics
268+
///
269+
/// If the number of field names exceeds the maximum allowed value for `u32`.
250270
fn num_field_names(&self) -> usize {
251-
self.field_names.len()
271+
let n = self.field_names.len();
272+
assert!(n <= u32::MAX as usize);
273+
274+
n
252275
}
253276

254277
fn field_name(&self, i: usize) -> &str {
@@ -275,8 +298,8 @@ impl MetadataBuilder {
275298

276299
let mut metadata = Vec::with_capacity(metadata_size);
277300

278-
// Write header: version=1, not sorted, with calculated offset_size
279-
metadata.push(0x01 | ((offset_size - 1) << 6));
301+
// Write header: version=1, sorted flag (bit 4) taken from `is_sorted`, with calculated offset_size
302+
metadata.push(0x01 | (self.is_sorted as u8) << 4 | ((offset_size - 1) << 6));
280303

281304
// Write dictionary size
282305
write_offset(&mut metadata, nkeys, offset_size);
@@ -299,6 +322,23 @@ impl MetadataBuilder {
299322
}
300323
}
301324

325+
impl<S: AsRef<str>> FromIterator<S> for MetadataBuilder {
326+
fn from_iter<T: IntoIterator<Item = S>>(iter: T) -> Self {
327+
let mut this = Self::default();
328+
this.extend(iter);
329+
330+
this
331+
}
332+
}
333+
334+
impl<S: AsRef<str>> Extend<S> for MetadataBuilder {
335+
fn extend<T: IntoIterator<Item = S>>(&mut self, iter: T) {
336+
for field_name in iter {
337+
self.upsert_field_name(field_name.as_ref());
338+
}
339+
}
340+
}
341+
302342
/// Top level builder for [`Variant`] values
303343
///
304344
/// # Example: create a Primitive Int8
@@ -454,6 +494,46 @@ impl MetadataBuilder {
454494
/// let result = obj.finish(); // returns Err
455495
/// assert!(result.is_err());
456496
/// ```
497+
///
498+
/// # Example: Sorted dictionaries
499+
///
500+
/// This example shows how to create a [`VariantBuilder`] with a pre-sorted field dictionary
501+
/// to improve field access performance when reading [`Variant`] objects.
502+
///
503+
/// You can use [`VariantBuilder::with_field_names`] to add multiple field names at once:
504+
/// ```
505+
/// use parquet_variant::{Variant, VariantBuilder};
506+
/// let mut builder = VariantBuilder::new()
507+
/// .with_field_names(["age", "name", "score"].into_iter());
508+
///
509+
/// let mut obj = builder.new_object();
510+
/// obj.insert("name", "Alice");
511+
/// obj.insert("age", 30);
512+
/// obj.insert("score", 95.5);
513+
/// obj.finish().unwrap();
514+
///
515+
/// let (metadata, value) = builder.finish();
516+
/// let variant = Variant::try_new(&metadata, &value).unwrap();
517+
/// ```
518+
///
519+
/// Alternatively, you can use [`VariantBuilder::add_field_name`] to add field names one by one:
520+
/// ```
521+
/// use parquet_variant::{Variant, VariantBuilder};
522+
/// let mut builder = VariantBuilder::new();
523+
/// builder.add_field_name("age"); // field id = 0
524+
/// builder.add_field_name("name"); // field id = 1
525+
/// builder.add_field_name("score"); // field id = 2
526+
///
527+
/// let mut obj = builder.new_object();
528+
/// obj.insert("name", "Bob"); // reuses field id = 1
529+
/// obj.insert("age", 25);
530+
/// obj.insert("score", 88.0);
531+
/// obj.finish().unwrap();
532+
///
533+
/// let (metadata, value) = builder.finish();
534+
/// let variant = Variant::try_new(&metadata, &value).unwrap();
535+
/// ```
536+
///
457537
#[derive(Default)]
458538
pub struct VariantBuilder {
459539
buffer: ValueBuffer,
@@ -480,6 +560,25 @@ impl VariantBuilder {
480560
self
481561
}
482562

563+
/// This method pre-populates the field name dictionary in the Variant metadata with
564+
/// the specified field names, in order.
565+
///
566+
/// You can use this to pre-populate a [`VariantBuilder`] with a sorted dictionary if you
567+
/// know the field names beforehand. Sorted dictionaries can accelerate field access when
568+
/// reading [`Variant`]s.
569+
pub fn with_field_names<'a>(mut self, field_names: impl Iterator<Item = &'a str>) -> Self {
570+
self.metadata_builder.extend(field_names);
571+
572+
self
573+
}
574+
575+
/// Adds a single field name to the field name dictionary in the Variant metadata.
576+
///
577+
/// This method does the same thing as [`VariantBuilder::with_field_names`] but adds one field name at a time.
578+
pub fn add_field_name(&mut self, field_name: &str) {
579+
self.metadata_builder.upsert_field_name(field_name);
580+
}
581+
483582
/// Create an [`ListBuilder`] for creating [`Variant::List`] values.
484583
///
485584
/// See the examples on [`VariantBuilder`] for usage.
@@ -822,6 +921,8 @@ impl<'m, 'v> VariantBuilderExt<'m, 'v> for VariantBuilder {
822921

823922
#[cfg(test)]
824923
mod tests {
924+
use crate::VariantMetadata;
925+
825926
use super::*;
826927

827928
#[test]
@@ -1528,4 +1629,221 @@ mod tests {
15281629
let valid_result = valid_obj.finish();
15291630
assert!(valid_result.is_ok());
15301631
}
1632+
1633+
#[test]
1634+
fn test_sorted_dictionary() {
1635+
// check that the variant metadata builders are equivalent regardless of how they were constructed
1636+
let mut variant1 = VariantBuilder::new().with_field_names(["b", "c", "d"].into_iter());
1637+
1638+
let mut variant2 = {
1639+
let mut builder = VariantBuilder::new();
1640+
1641+
builder.add_field_name("b");
1642+
builder.add_field_name("c");
1643+
builder.add_field_name("d");
1644+
1645+
builder
1646+
};
1647+
1648+
assert_eq!(
1649+
variant1.metadata_builder.field_names,
1650+
variant2.metadata_builder.field_names
1651+
);
1652+
1653+
// check metadata builders say it's sorted
1654+
assert!(variant1.metadata_builder.is_sorted);
1655+
assert!(variant2.metadata_builder.is_sorted);
1656+
1657+
{
1658+
// test the bad case and break the sort order
1659+
variant2.add_field_name("a");
1660+
assert!(!variant2.metadata_builder.is_sorted);
1661+
1662+
// per the spec, make sure the variant will fail to build if only metadata is provided
1663+
let (m, v) = variant2.finish();
1664+
let res = Variant::try_new(&m, &v);
1665+
assert!(res.is_err());
1666+
1667+
// since it is not sorted, make sure the metadata says so
1668+
let header = VariantMetadata::try_new(&m).unwrap();
1669+
assert!(!header.is_sorted());
1670+
}
1671+
1672+
// write out variant1 and make sure the sorted flag is properly encoded
1673+
variant1.append_value(false);
1674+
1675+
let (m, v) = variant1.finish();
1676+
let res = Variant::try_new(&m, &v);
1677+
assert!(res.is_ok());
1678+
1679+
let header = VariantMetadata::try_new(&m).unwrap();
1680+
assert!(header.is_sorted());
1681+
}
1682+
1683+
#[test]
1684+
fn test_object_sorted_dictionary() {
1685+
// predefine the list of field names
1686+
let mut variant1 = VariantBuilder::new().with_field_names(["a", "b", "c"].into_iter());
1687+
let mut obj = variant1.new_object();
1688+
1689+
obj.insert("c", true);
1690+
obj.insert("a", false);
1691+
obj.insert("b", ());
1692+
1693+
// verify the field ids are assigned correctly
1694+
let field_ids_by_insert_order = obj.fields.iter().map(|(&id, _)| id).collect::<Vec<_>>();
1695+
assert_eq!(field_ids_by_insert_order, vec![2, 0, 1]);
1696+
1697+
// add a field name that wasn't pre-defined but doesn't break the sort order
1698+
obj.insert("d", 2);
1699+
obj.finish().unwrap();
1700+
1701+
let (metadata, value) = variant1.finish();
1702+
let variant = Variant::try_new(&metadata, &value).unwrap();
1703+
1704+
let metadata = VariantMetadata::try_new(&metadata).unwrap();
1705+
assert!(metadata.is_sorted());
1706+
1707+
// verify object is sorted by field name order
1708+
let object = variant.as_object().unwrap();
1709+
let field_names = object
1710+
.iter()
1711+
.map(|(field_name, _)| field_name)
1712+
.collect::<Vec<_>>();
1713+
1714+
assert_eq!(field_names, vec!["a", "b", "c", "d"]);
1715+
}
1716+
1717+
#[test]
1718+
fn test_object_not_sorted_dictionary() {
1719+
// predefine the list of field names
1720+
let mut variant1 = VariantBuilder::new().with_field_names(["b", "c", "d"].into_iter());
1721+
let mut obj = variant1.new_object();
1722+
1723+
obj.insert("c", true);
1724+
obj.insert("d", false);
1725+
obj.insert("b", ());
1726+
1727+
// verify the field ids are assigned correctly
1728+
let field_ids_by_insert_order = obj.fields.iter().map(|(&id, _)| id).collect::<Vec<_>>();
1729+
assert_eq!(field_ids_by_insert_order, vec![1, 2, 0]);
1730+
1731+
// add a field name that wasn't pre-defined but breaks the sort order
1732+
obj.insert("a", 2);
1733+
obj.finish().unwrap();
1734+
1735+
let (metadata, value) = variant1.finish();
1736+
let variant = Variant::try_new(&metadata, &value).unwrap();
1737+
1738+
let metadata = VariantMetadata::try_new(&metadata).unwrap();
1739+
assert!(!metadata.is_sorted());
1740+
1741+
// verify object field names are sorted by field name order
1742+
let object = variant.as_object().unwrap();
1743+
let field_names = object
1744+
.iter()
1745+
.map(|(field_name, _)| field_name)
1746+
.collect::<Vec<_>>();
1747+
1748+
assert_eq!(field_names, vec!["a", "b", "c", "d"]);
1749+
}
1750+
1751+
#[test]
1752+
fn test_building_sorted_dictionary() {
1753+
let mut builder = VariantBuilder::new();
1754+
assert!(!builder.metadata_builder.is_sorted);
1755+
assert_eq!(builder.metadata_builder.num_field_names(), 0);
1756+
1757+
builder.add_field_name("a");
1758+
1759+
assert!(builder.metadata_builder.is_sorted);
1760+
assert_eq!(builder.metadata_builder.num_field_names(), 1);
1761+
1762+
let builder = builder.with_field_names(["b", "c", "d"].into_iter());
1763+
1764+
assert!(builder.metadata_builder.is_sorted);
1765+
assert_eq!(builder.metadata_builder.num_field_names(), 4);
1766+
1767+
let builder = builder.with_field_names(["z", "y"].into_iter());
1768+
assert!(!builder.metadata_builder.is_sorted);
1769+
assert_eq!(builder.metadata_builder.num_field_names(), 6);
1770+
}
1771+
1772+
#[test]
1773+
fn test_metadata_builder_from_iter() {
1774+
let metadata = MetadataBuilder::from_iter(vec!["apple", "banana", "cherry"]);
1775+
assert_eq!(metadata.num_field_names(), 3);
1776+
assert_eq!(metadata.field_name(0), "apple");
1777+
assert_eq!(metadata.field_name(1), "banana");
1778+
assert_eq!(metadata.field_name(2), "cherry");
1779+
assert!(metadata.is_sorted);
1780+
1781+
let metadata = MetadataBuilder::from_iter(["zebra", "apple", "banana"]);
1782+
assert_eq!(metadata.num_field_names(), 3);
1783+
assert_eq!(metadata.field_name(0), "zebra");
1784+
assert_eq!(metadata.field_name(1), "apple");
1785+
assert_eq!(metadata.field_name(2), "banana");
1786+
assert!(!metadata.is_sorted);
1787+
1788+
let metadata = MetadataBuilder::from_iter(Vec::<&str>::new());
1789+
assert_eq!(metadata.num_field_names(), 0);
1790+
assert!(!metadata.is_sorted);
1791+
}
1792+
1793+
#[test]
1794+
fn test_metadata_builder_extend() {
1795+
let mut metadata = MetadataBuilder::default();
1796+
assert_eq!(metadata.num_field_names(), 0);
1797+
assert!(!metadata.is_sorted);
1798+
1799+
metadata.extend(["apple", "cherry"]);
1800+
assert_eq!(metadata.num_field_names(), 2);
1801+
assert_eq!(metadata.field_name(0), "apple");
1802+
assert_eq!(metadata.field_name(1), "cherry");
1803+
assert!(metadata.is_sorted);
1804+
1805+
// extend with more field names that maintain sort order
1806+
metadata.extend(vec!["dinosaur", "monkey"]);
1807+
assert_eq!(metadata.num_field_names(), 4);
1808+
assert_eq!(metadata.field_name(2), "dinosaur");
1809+
assert_eq!(metadata.field_name(3), "monkey");
1810+
assert!(metadata.is_sorted);
1811+
1812+
// test extending with duplicate field names
1813+
let initial_count = metadata.num_field_names();
1814+
metadata.extend(["apple", "monkey"]);
1815+
assert_eq!(metadata.num_field_names(), initial_count); // No new fields added
1816+
}
1817+
1818+
#[test]
1819+
fn test_metadata_builder_extend_sort_order() {
1820+
let mut metadata = MetadataBuilder::default();
1821+
1822+
metadata.extend(["middle"]);
1823+
assert!(metadata.is_sorted);
1824+
1825+
metadata.extend(["zebra"]);
1826+
assert!(metadata.is_sorted);
1827+
1828+
// add field that breaks sort order
1829+
metadata.extend(["apple"]);
1830+
assert!(!metadata.is_sorted);
1831+
}
1832+
1833+
#[test]
1834+
fn test_metadata_builder_from_iter_with_string_types() {
1835+
// &str
1836+
let metadata = MetadataBuilder::from_iter(["a", "b", "c"]);
1837+
assert_eq!(metadata.num_field_names(), 3);
1838+
1839+
// String
1840+
let metadata =
1841+
MetadataBuilder::from_iter(vec!["a".to_string(), "b".to_string(), "c".to_string()]);
1842+
assert_eq!(metadata.num_field_names(), 3);
1843+
1844+
// Box<str> (anything that implements AsRef<str>)
1845+
let field_names: Vec<Box<str>> = vec!["a".into(), "b".into(), "c".into()];
1846+
let metadata = MetadataBuilder::from_iter(field_names);
1847+
assert_eq!(metadata.num_field_names(), 3);
1848+
}
15311849
}

parquet-variant/tests/test_json_to_variant.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -542,7 +542,7 @@ fn test_json_to_variant_unicode() -> Result<(), ArrowError> {
542542
);
543543
assert_eq!(
544544
metadata,
545-
&[1u8, 2u8, 0u8, 1u8, 4u8, 97u8, 0xe7u8, 0x88u8, 0xb1u8]
545+
&[0b10001u8, 2u8, 0u8, 1u8, 4u8, 97u8, 0xe7u8, 0x88u8, 0xb1u8]
546546
);
547547
JsonToVariantTest {
548548
json,

0 commit comments

Comments
 (0)