@@ -11,11 +11,12 @@ use core::{
11
11
hash:: { Hash , Hasher } ,
12
12
iter,
13
13
mem:: { size_of, ManuallyDrop } ,
14
- ops:: Deref ,
14
+ ops:: { Deref , DerefMut } ,
15
15
ptr:: NonNull ,
16
- slice:: from_raw_parts,
16
+ slice:: { from_raw_parts, from_raw_parts_mut } ,
17
17
} ;
18
18
use either:: Either ;
19
+ use itertools:: Itertools ;
19
20
20
21
/// Constructs a `ColList` like so `col_list![0, 2]`.
21
22
///
@@ -31,7 +32,12 @@ macro_rules! col_list {
31
32
/// but packed into a `u64` in a way that takes advantage of the fact that
32
33
/// in almost all cases, we won't store a `ColId` larger than 62.
33
34
/// In the rare case that we store larger ids, we fall back to a thin vec approach.
34
- /// We also fall back to a thin vec if the ids stored are not in sorted order, from low to high.
35
+ ///
36
+ /// We also fall back to a thin vec if the ids stored are not in sorted order, from low to high,
37
+ /// or if the list contains duplicates.
38
+ ///
39
+ /// If you want a set of columns, use [`ColSet`] instead. It is more likely to be compressed,
40
+ /// and so is a better choice if you don't require ordering information.
35
41
#[ repr( C ) ]
36
42
pub union ColList {
37
43
/// Used to determine whether the list is stored inline or not.
@@ -194,6 +200,22 @@ impl ColList {
194
200
self . push_inner ( col, self . last ( ) . map_or ( true , |l| l < col) ) ;
195
201
}
196
202
203
+ /// Sort and deduplicate the list.
204
+ /// If the list is already sorted and deduplicated, does nothing.
205
+ /// This will typically result in an inline list unless there are large `ColId`s in play.
206
+ fn sort_dedup ( & mut self ) {
207
+ if let Err ( heap) = self . as_inline_mut ( ) {
208
+ heap. sort ( ) ;
209
+
210
+ // Don't reallocate if the list is already sorted and deduplicated.
211
+ let is_deduped = is_sorted_and_deduped ( heap) ;
212
+ let wants_inline = heap. last ( ) . unwrap_or ( & ColId ( 0 ) ) . 0 < Self :: FIRST_HEAP_COL_U16 ;
213
+ if !is_deduped || wants_inline {
214
+ * self = Self :: from_iter ( heap. iter ( ) . copied ( ) . dedup ( ) ) ;
215
+ }
216
+ }
217
+ }
218
+
197
219
/// Push `col` onto the list.
198
220
///
199
221
/// If `col >= 63` or `!preserves_set_order`,
@@ -311,6 +333,58 @@ impl fmt::Debug for ColList {
311
333
}
312
334
}
313
335
336
+ impl From < ColSet > for ColList {
337
+ fn from ( value : ColSet ) -> Self {
338
+ value. 0
339
+ }
340
+ }
341
+
342
+ /// A compressed set of columns. Like a `ColList`, but guaranteed to be sorted and to contain no duplicate entries.
343
+ /// Dereferences to a `ColList` for convenience.
344
+ #[ derive( Clone , PartialEq , Eq , PartialOrd , Ord , Hash ) ]
345
+ pub struct ColSet ( ColList ) ;
346
+
347
+ impl ColSet {
348
+ /// Check if a `ColSet` contains a given column.
349
+ pub fn contains ( & self , needle : ColId ) -> bool {
350
+ match self . as_inline ( ) {
351
+ Ok ( inline) => inline. contains ( needle) ,
352
+ // We can use binary search because the vector is guaranteed to be sorted.
353
+ Err ( heap) => heap. binary_search ( & needle) . is_ok ( ) ,
354
+ }
355
+ }
356
+
357
+ // Don't implement `insert` because repeated insertions will be O(n^2) if we want to keep the set sorted on the heap.
358
+ // Use iterator methods to create a new `ColSet` instead.
359
+ }
360
+
361
+ impl < C : Into < ColId > > FromIterator < C > for ColSet {
362
+ fn from_iter < T : IntoIterator < Item = C > > ( iter : T ) -> Self {
363
+ Self :: from ( iter. into_iter ( ) . collect :: < ColList > ( ) )
364
+ }
365
+ }
366
+
367
+ impl From < ColList > for ColSet {
368
+ fn from ( mut list : ColList ) -> Self {
369
+ list. sort_dedup ( ) ;
370
+ Self ( list)
371
+ }
372
+ }
373
+
374
+ impl Deref for ColSet {
375
+ type Target = ColList ;
376
+
377
+ fn deref ( & self ) -> & Self :: Target {
378
+ & self . 0
379
+ }
380
+ }
381
+
382
+ impl fmt:: Debug for ColSet {
383
+ fn fmt ( & self , f : & mut fmt:: Formatter < ' _ > ) -> fmt:: Result {
384
+ f. debug_set ( ) . entries ( self . iter ( ) ) . finish ( )
385
+ }
386
+ }
387
+
314
388
/// The inline version of a [`ColList`].
315
389
#[ derive( Clone , Copy , PartialEq ) ]
316
390
struct ColListInline ( u64 ) ;
@@ -508,6 +582,20 @@ impl Deref for ColListVec {
508
582
}
509
583
}
510
584
585
+ impl DerefMut for ColListVec {
586
+ fn deref_mut ( & mut self ) -> & mut Self :: Target {
587
+ let len = self . len ( ) as usize ;
588
+ let ptr = self . 0 . as_ptr ( ) ;
589
+ // SAFETY: `ptr + 2` is always in bounds of the allocation and `ptr <= isize::MAX`.
590
+ let ptr = unsafe { ptr. add ( 2 ) } . cast :: < ColId > ( ) ;
591
+ // SAFETY:
592
+ // - `ptr` is valid for reads and writes for `len * size_of::<ColId>` and it is properly aligned.
593
+ // - `len` elements are initialized.
594
+ // - `len * size_of::<ColId> <= isize::MAX` holds.
595
+ unsafe { from_raw_parts_mut ( ptr, len) }
596
+ }
597
+ }
598
+
511
599
impl Drop for ColListVec {
512
600
fn drop ( & mut self ) {
513
601
let capacity = self . capacity ( ) ;
@@ -529,6 +617,18 @@ impl Clone for ColListVec {
529
617
}
530
618
}
531
619
620
+ /// Check if a buffer is sorted and deduplicated.
621
+ fn is_sorted_and_deduped ( data : & [ ColId ] ) -> bool {
622
+ match data {
623
+ [ ] => true ,
624
+ [ mut prev, rest @ ..] => !rest. iter ( ) . any ( |elem| {
625
+ let bad = prev >= * elem;
626
+ prev = * elem;
627
+ bad
628
+ } ) ,
629
+ }
630
+ }
631
+
532
632
#[ cfg( test) ]
533
633
mod tests {
534
634
use super :: * ;
@@ -616,5 +716,41 @@ mod tests {
616
716
_ => prop_assert_eq!( list. as_singleton( ) , None ) ,
617
717
}
618
718
}
719
+
720
+ #[ test]
721
+ fn test_set_inlines( mut cols in vec( ( 0 ..ColList :: FIRST_HEAP_COL_U16 ) . prop_map_into( ) , 1 ..100 ) ) {
722
+ prop_assume!( !is_sorted_and_deduped( & cols[ ..] ) ) ;
723
+
724
+ let list = ColList :: from_iter( cols. iter( ) . copied( ) ) ;
725
+ prop_assert!( !list. is_inline( ) ) ;
726
+ let set = ColSet :: from( list) ;
727
+ prop_assert!( set. is_inline( ) ) ;
728
+
729
+ for col in cols. iter( ) {
730
+ prop_assert!( set. contains( * col) ) ;
731
+ }
732
+
733
+ cols. sort( ) ;
734
+ cols. dedup( ) ;
735
+ prop_assert_eq!( set. iter( ) . collect:: <Vec <_>>( ) , cols) ;
736
+ }
737
+
738
+ #[ test]
739
+ fn test_set_heap( mut cols in vec( ( ColList :: FIRST_HEAP_COL_U16 ..) . prop_map_into( ) , 1 ..100 ) ) {
740
+ prop_assume!( !is_sorted_and_deduped( & cols[ ..] ) ) ;
741
+
742
+ let list = ColList :: from_iter( cols. iter( ) . copied( ) ) ;
743
+ prop_assert!( !list. is_inline( ) ) ;
744
+ let set = ColSet :: from( list) ;
745
+ prop_assert!( !set. is_inline( ) ) ;
746
+
747
+ for col in cols. iter( ) {
748
+ prop_assert!( set. contains( * col) ) ;
749
+ }
750
+
751
+ cols. sort( ) ;
752
+ cols. dedup( ) ;
753
+ prop_assert_eq!( set. iter( ) . collect:: <Vec <_>>( ) , cols) ;
754
+ }
619
755
}
620
756
}
0 commit comments