@@ -237,18 +237,41 @@ impl ValueBuffer {
237
237
struct MetadataBuilder {
238
238
// Field names -- field_ids are assigned in insert order
239
239
field_names : IndexSet < String > ,
240
+
241
+ // flag that checks if field names by insertion order are also lexicographically sorted
242
+ is_sorted : bool ,
240
243
}
241
244
242
245
impl MetadataBuilder {
243
246
/// Upsert field name to dictionary, return its ID
244
247
fn upsert_field_name ( & mut self , field_name : & str ) -> u32 {
245
- let ( id, _) = self . field_names . insert_full ( field_name. to_string ( ) ) ;
248
+ let ( id, new_entry) = self . field_names . insert_full ( field_name. to_string ( ) ) ;
249
+
250
+ if new_entry {
251
+ let n = self . num_field_names ( ) ;
252
+
253
+ // Dictionary sort order tracking:
254
+ // - An empty dictionary is unsorted (ambiguous in spec but required by interop tests)
255
+ // - A single-entry dictionary is trivially sorted
256
+ // - Otherwise, an already-sorted dictionary becomes unsorted if the new entry breaks order
257
+ self . is_sorted =
258
+ n == 1 || self . is_sorted && ( self . field_names [ n - 2 ] < self . field_names [ n - 1 ] ) ;
259
+ }
246
260
247
261
id as u32
248
262
}
249
263
264
+ /// Returns the number of field names stored in the metadata builder.
265
+ /// Note: this method should be the only place to call `self.field_names.len()`
266
+ ///
267
+ /// # Panics
268
+ ///
269
+ /// If the number of field names exceeds the maximum allowed value for `u32`.
250
270
fn num_field_names ( & self ) -> usize {
251
- self . field_names . len ( )
271
+ let n = self . field_names . len ( ) ;
272
+ assert ! ( n <= u32 :: MAX as usize ) ;
273
+
274
+ n
252
275
}
253
276
254
277
fn field_name ( & self , i : usize ) -> & str {
@@ -275,8 +298,8 @@ impl MetadataBuilder {
275
298
276
299
let mut metadata = Vec :: with_capacity ( metadata_size) ;
277
300
278
- // Write header: version=1, not sorted, with calculated offset_size
279
- metadata. push ( 0x01 | ( ( offset_size - 1 ) << 6 ) ) ;
301
+ // Write header: version=1, sorted_strings flag taken from `is_sorted`, with calculated offset_size
302
+ metadata. push ( 0x01 | ( self . is_sorted as u8 ) << 4 | ( ( offset_size - 1 ) << 6 ) ) ;
280
303
281
304
// Write dictionary size
282
305
write_offset ( & mut metadata, nkeys, offset_size) ;
@@ -299,6 +322,23 @@ impl MetadataBuilder {
299
322
}
300
323
}
301
324
325
+ impl < S : AsRef < str > > FromIterator < S > for MetadataBuilder {
326
+ fn from_iter < T : IntoIterator < Item = S > > ( iter : T ) -> Self {
327
+ let mut this = Self :: default ( ) ;
328
+ this. extend ( iter) ;
329
+
330
+ this
331
+ }
332
+ }
333
+
334
+ impl < S : AsRef < str > > Extend < S > for MetadataBuilder {
335
+ fn extend < T : IntoIterator < Item = S > > ( & mut self , iter : T ) {
336
+ for field_name in iter {
337
+ self . upsert_field_name ( field_name. as_ref ( ) ) ;
338
+ }
339
+ }
340
+ }
341
+
302
342
/// Top level builder for [`Variant`] values
303
343
///
304
344
/// # Example: create a Primitive Int8
@@ -454,6 +494,46 @@ impl MetadataBuilder {
454
494
/// let result = obj.finish(); // returns Err
455
495
/// assert!(result.is_err());
456
496
/// ```
497
+ ///
498
+ /// # Example: Sorted dictionaries
499
+ ///
500
+ /// This example shows how to create a [`VariantBuilder`] with a pre-sorted field dictionary
501
+ /// to improve field access performance when reading [`Variant`] objects.
502
+ ///
503
+ /// You can use [`VariantBuilder::with_field_names`] to add multiple field names at once:
504
+ /// ```
505
+ /// use parquet_variant::{Variant, VariantBuilder};
506
+ /// let mut builder = VariantBuilder::new()
507
+ /// .with_field_names(["age", "name", "score"].into_iter());
508
+ ///
509
+ /// let mut obj = builder.new_object();
510
+ /// obj.insert("name", "Alice");
511
+ /// obj.insert("age", 30);
512
+ /// obj.insert("score", 95.5);
513
+ /// obj.finish().unwrap();
514
+ ///
515
+ /// let (metadata, value) = builder.finish();
516
+ /// let variant = Variant::try_new(&metadata, &value).unwrap();
517
+ /// ```
518
+ ///
519
+ /// Alternatively, you can use [`VariantBuilder::add_field_name`] to add field names one by one:
520
+ /// ```
521
+ /// use parquet_variant::{Variant, VariantBuilder};
522
+ /// let mut builder = VariantBuilder::new();
523
+ /// builder.add_field_name("age"); // field id = 0
524
+ /// builder.add_field_name("name"); // field id = 1
525
+ /// builder.add_field_name("score"); // field id = 2
526
+ ///
527
+ /// let mut obj = builder.new_object();
528
+ /// obj.insert("name", "Bob"); // reuses field id = 1
529
+ /// obj.insert("age", 25);
530
+ /// obj.insert("score", 88.0);
531
+ /// obj.finish().unwrap();
532
+ ///
533
+ /// let (metadata, value) = builder.finish();
534
+ /// let variant = Variant::try_new(&metadata, &value).unwrap();
535
+ /// ```
536
+ ///
457
537
#[ derive( Default ) ]
458
538
pub struct VariantBuilder {
459
539
buffer : ValueBuffer ,
@@ -480,6 +560,25 @@ impl VariantBuilder {
480
560
self
481
561
}
482
562
563
+ /// This method pre-populates the field name directory in the Variant metadata with
564
+ /// the specific field names, in order.
565
+ ///
566
+ /// You can use this to pre-populate a [`VariantBuilder`] with a sorted dictionary if you
567
+ /// know the field names beforehand. Sorted dictionaries can accelerate field access when
568
+ /// reading [`Variant`]s.
569
+ pub fn with_field_names < ' a > ( mut self , field_names : impl Iterator < Item = & ' a str > ) -> Self {
570
+ self . metadata_builder . extend ( field_names) ;
571
+
572
+ self
573
+ }
574
+
575
+ /// Adds a single field name to the field name directory in the Variant metadata.
576
+ ///
577
+ /// This method does the same thing as [`VariantBuilder::with_field_names`] but adds one field name at a time.
578
+ pub fn add_field_name ( & mut self , field_name : & str ) {
579
+ self . metadata_builder . upsert_field_name ( field_name) ;
580
+ }
581
+
483
582
/// Create an [`ListBuilder`] for creating [`Variant::List`] values.
484
583
///
485
584
/// See the examples on [`VariantBuilder`] for usage.
@@ -822,6 +921,8 @@ impl<'m, 'v> VariantBuilderExt<'m, 'v> for VariantBuilder {
822
921
823
922
#[ cfg( test) ]
824
923
mod tests {
924
+ use crate :: VariantMetadata ;
925
+
825
926
use super :: * ;
826
927
827
928
#[ test]
@@ -1528,4 +1629,221 @@ mod tests {
1528
1629
let valid_result = valid_obj. finish ( ) ;
1529
1630
assert ! ( valid_result. is_ok( ) ) ;
1530
1631
}
1632
+
1633
#[test]
fn test_sorted_dictionary() {
    // the two ways of pre-populating the dictionary must produce equivalent metadata builders
    let mut variant1 = VariantBuilder::new().with_field_names(["b", "c", "d"].into_iter());

    let mut variant2 = {
        let mut builder = VariantBuilder::new();

        builder.add_field_name("b");
        builder.add_field_name("c");
        builder.add_field_name("d");

        builder
    };

    assert_eq!(
        variant1.metadata_builder.field_names,
        variant2.metadata_builder.field_names
    );

    // both metadata builders must report a sorted dictionary
    assert!(variant1.metadata_builder.is_sorted);
    assert!(variant2.metadata_builder.is_sorted);

    {
        // test the bad case and break the sort order
        variant2.add_field_name("a");
        assert!(!variant2.metadata_builder.is_sorted);

        // no value was appended, so building a variant from metadata alone must fail
        let (m, v) = variant2.finish();
        let res = Variant::try_new(&m, &v);
        assert!(res.is_err());

        // since it is not sorted, make sure the metadata says so
        let header = VariantMetadata::try_new(&m).unwrap();
        assert!(!header.is_sorted());
    }

    // write out variant1 and make sure the sorted flag is properly encoded
    variant1.append_value(false);

    let (m, v) = variant1.finish();
    let res = Variant::try_new(&m, &v);
    assert!(res.is_ok());

    let header = VariantMetadata::try_new(&m).unwrap();
    assert!(header.is_sorted());
}
1682
+
1683
#[test]
fn test_object_sorted_dictionary() {
    // predefine the list of field names
    let mut variant1 = VariantBuilder::new().with_field_names(["a", "b", "c"].into_iter());
    let mut obj = variant1.new_object();

    obj.insert("c", true);
    obj.insert("a", false);
    obj.insert("b", ());

    // verify the pre-defined field ids are reused, in object insertion order
    let field_ids_by_insert_order = obj.fields.iter().map(|(&id, _)| id).collect::<Vec<_>>();
    assert_eq!(field_ids_by_insert_order, vec![2, 0, 1]);

    // add a field name that wasn't pre-defined but doesn't break the sort order
    obj.insert("d", 2);
    obj.finish().unwrap();

    let (metadata, value) = variant1.finish();
    let variant = Variant::try_new(&metadata, &value).unwrap();

    let metadata = VariantMetadata::try_new(&metadata).unwrap();
    assert!(metadata.is_sorted());

    // verify object is sorted by field name order
    let object = variant.as_object().unwrap();
    let field_names = object
        .iter()
        .map(|(field_name, _)| field_name)
        .collect::<Vec<_>>();

    assert_eq!(field_names, vec!["a", "b", "c", "d"]);
}
1716
+
1717
#[test]
fn test_object_not_sorted_dictionary() {
    // predefine the list of field names
    let mut variant1 = VariantBuilder::new().with_field_names(["b", "c", "d"].into_iter());
    let mut obj = variant1.new_object();

    obj.insert("c", true);
    obj.insert("d", false);
    obj.insert("b", ());

    // verify the pre-defined field ids are reused, in object insertion order
    let field_ids_by_insert_order = obj.fields.iter().map(|(&id, _)| id).collect::<Vec<_>>();
    assert_eq!(field_ids_by_insert_order, vec![1, 2, 0]);

    // add a field name that wasn't pre-defined but breaks the sort order
    obj.insert("a", 2);
    obj.finish().unwrap();

    let (metadata, value) = variant1.finish();
    let variant = Variant::try_new(&metadata, &value).unwrap();

    let metadata = VariantMetadata::try_new(&metadata).unwrap();
    assert!(!metadata.is_sorted());

    // the object's fields are still ordered by name even though the dictionary is not
    let object = variant.as_object().unwrap();
    let field_names = object
        .iter()
        .map(|(field_name, _)| field_name)
        .collect::<Vec<_>>();

    assert_eq!(field_names, vec!["a", "b", "c", "d"]);
}
1750
+
1751
#[test]
fn test_building_sorted_dictionary() {
    // an empty dictionary is reported as unsorted
    let mut builder = VariantBuilder::new();
    assert!(!builder.metadata_builder.is_sorted);
    assert_eq!(builder.metadata_builder.num_field_names(), 0);

    // a single entry is trivially sorted
    builder.add_field_name("a");

    assert!(builder.metadata_builder.is_sorted);
    assert_eq!(builder.metadata_builder.num_field_names(), 1);

    // appending names in increasing order keeps the dictionary sorted
    let builder = builder.with_field_names(["b", "c", "d"].into_iter());

    assert!(builder.metadata_builder.is_sorted);
    assert_eq!(builder.metadata_builder.num_field_names(), 4);

    // "y" after "z" breaks the order
    let builder = builder.with_field_names(["z", "y"].into_iter());
    assert!(!builder.metadata_builder.is_sorted);
    assert_eq!(builder.metadata_builder.num_field_names(), 6);
}
1771
+
1772
#[test]
fn test_metadata_builder_from_iter() {
    // names in increasing order: sorted
    let metadata = MetadataBuilder::from_iter(vec!["apple", "banana", "cherry"]);
    assert_eq!(metadata.num_field_names(), 3);
    assert_eq!(metadata.field_name(0), "apple");
    assert_eq!(metadata.field_name(1), "banana");
    assert_eq!(metadata.field_name(2), "cherry");
    assert!(metadata.is_sorted);

    // insertion order is preserved even when it is not sorted
    let metadata = MetadataBuilder::from_iter(["zebra", "apple", "banana"]);
    assert_eq!(metadata.num_field_names(), 3);
    assert_eq!(metadata.field_name(0), "zebra");
    assert_eq!(metadata.field_name(1), "apple");
    assert_eq!(metadata.field_name(2), "banana");
    assert!(!metadata.is_sorted);

    // an empty input yields an empty, unsorted dictionary
    let metadata = MetadataBuilder::from_iter(Vec::<&str>::new());
    assert_eq!(metadata.num_field_names(), 0);
    assert!(!metadata.is_sorted);
}
1792
+
1793
#[test]
fn test_metadata_builder_extend() {
    let mut metadata = MetadataBuilder::default();
    assert_eq!(metadata.num_field_names(), 0);
    assert!(!metadata.is_sorted);

    metadata.extend(["apple", "cherry"]);
    assert_eq!(metadata.num_field_names(), 2);
    assert_eq!(metadata.field_name(0), "apple");
    assert_eq!(metadata.field_name(1), "cherry");
    assert!(metadata.is_sorted);

    // extend with more field names that maintain sort order
    metadata.extend(vec!["dinosaur", "monkey"]);
    assert_eq!(metadata.num_field_names(), 4);
    assert_eq!(metadata.field_name(2), "dinosaur");
    assert_eq!(metadata.field_name(3), "monkey");
    assert!(metadata.is_sorted);

    // test extending with duplicate field names
    let initial_count = metadata.num_field_names();
    metadata.extend(["apple", "monkey"]);
    assert_eq!(metadata.num_field_names(), initial_count); // No new fields added
    assert!(metadata.is_sorted); // duplicates must not disturb the sort flag
}
1817
+
1818
#[test]
fn test_metadata_builder_extend_sort_order() {
    let mut metadata = MetadataBuilder::default();

    // a single entry is trivially sorted
    metadata.extend(["middle"]);
    assert!(metadata.is_sorted);

    // "zebra" sorts after "middle", so the order is kept
    metadata.extend(["zebra"]);
    assert!(metadata.is_sorted);

    // add field that breaks sort order
    metadata.extend(["apple"]);
    assert!(!metadata.is_sorted);
}
1832
+
1833
#[test]
fn test_metadata_builder_from_iter_with_string_types() {
    // &str
    let metadata = MetadataBuilder::from_iter(["a", "b", "c"]);
    assert_eq!(metadata.num_field_names(), 3);

    // String
    let metadata =
        MetadataBuilder::from_iter(vec!["a".to_string(), "b".to_string(), "c".to_string()]);
    assert_eq!(metadata.num_field_names(), 3);

    // Box<str> (any other type that implements AsRef<str>)
    let field_names: Vec<Box<str>> = vec!["a".into(), "b".into(), "c".into()];
    let metadata = MetadataBuilder::from_iter(field_names);
    assert_eq!(metadata.num_field_names(), 3);
}
1531
1849
}
0 commit comments