Skip to content

Commit 11283c3

Browse files
kazimuth and Centril
authored
Add spacetimedb_primitives::col_list::ColSet (#1691)
Signed-off-by: james gilles <jameshgilles@gmail.com> Co-authored-by: Mazdak Farrokhzad <twingoow@gmail.com>
1 parent 3530498 commit 11283c3

File tree

4 files changed

+144
-5
lines changed

4 files changed

+144
-5
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/bindings/tests/snapshots/deps__spacetimedb_bindings_dependencies.snap

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,8 @@ spacetimedb
3939
│ ├── spacetimedb_primitives
4040
│ │ ├── bitflags
4141
│ │ ├── either
42+
│ │ ├── itertools
43+
│ │ │ └── either
4244
│ │ └── nohash_hasher
4345
│ └── syn
4446
│ ├── proc_macro2 (*)
@@ -56,8 +58,7 @@ spacetimedb
5658
│ │ ├── quote (*)
5759
│ │ └── syn (*)
5860
│ ├── hex
59-
│ ├── itertools
60-
│ │ └── either
61+
│ ├── itertools (*)
6162
│ ├── spacetimedb_bindings_macro (*)
6263
│ ├── spacetimedb_data_structures
6364
│ │ ├── hashbrown

crates/primitives/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ description = "Primitives such as TableId and ColumnIndexAttribute"
99
bitflags.workspace = true
1010
either.workspace = true
1111
nohash-hasher.workspace = true
12+
itertools.workspace = true
1213

1314
[dev-dependencies]
1415
proptest.workspace = true

crates/primitives/src/col_list.rs

Lines changed: 139 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,12 @@ use core::{
1111
hash::{Hash, Hasher},
1212
iter,
1313
mem::{size_of, ManuallyDrop},
14-
ops::Deref,
14+
ops::{Deref, DerefMut},
1515
ptr::NonNull,
16-
slice::from_raw_parts,
16+
slice::{from_raw_parts, from_raw_parts_mut},
1717
};
1818
use either::Either;
19+
use itertools::Itertools;
1920

2021
/// Constructs a `ColList` like so `col_list![0, 2]`.
2122
///
@@ -31,7 +32,12 @@ macro_rules! col_list {
3132
/// but packed into a `u64` in a way that takes advantage of the fact that
3233
/// in almost all cases, we won't store a `ColId` larger than 62.
3334
/// In the rare case that we store larger ids, we fall back to a thin vec approach.
34-
/// We also fall back to a thin vec if the ids stored are not in sorted order, from low to high.
35+
///
36+
/// We also fall back to a thin vec if the ids stored are not in sorted order, from low to high,
37+
/// or if the list contains duplicates.
38+
///
39+
/// If you want a set of columns, use [`ColSet`] instead. It is more likely to be compressed,
40+
/// and so is a better choice if you don't require ordering information.
3541
#[repr(C)]
3642
pub union ColList {
3743
/// Used to determine whether the list is stored inline or not.
@@ -194,6 +200,22 @@ impl ColList {
194200
self.push_inner(col, self.last().map_or(true, |l| l < col));
195201
}
196202

203+
/// Sort and deduplicate the list.
204+
/// If the list is already sorted and deduplicated, does nothing.
205+
/// This will typically result in an inline list unless there are large `ColId`s in play.
206+
fn sort_dedup(&mut self) {
207+
if let Err(heap) = self.as_inline_mut() {
208+
heap.sort();
209+
210+
// Don't reallocate if the list is already sorted and deduplicated.
211+
let is_deduped = is_sorted_and_deduped(heap);
212+
let wants_inline = heap.last().unwrap_or(&ColId(0)).0 < Self::FIRST_HEAP_COL_U16;
213+
if !is_deduped || wants_inline {
214+
*self = Self::from_iter(heap.iter().copied().dedup());
215+
}
216+
}
217+
}
218+
197219
/// Push `col` onto the list.
198220
///
199221
/// If `col >= 63` or `!preserves_set_order`,
@@ -311,6 +333,58 @@ impl fmt::Debug for ColList {
311333
}
312334
}
313335

336+
impl From<ColSet> for ColList {
337+
fn from(value: ColSet) -> Self {
338+
value.0
339+
}
340+
}
341+
342+
/// A compressed set of columns. Like a `ColList`, but guaranteed to be sorted and to contain no duplicate entries.
343+
/// Dereferences to a `ColList` for convenience.
344+
#[derive(Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
345+
pub struct ColSet(ColList);
346+
347+
impl ColSet {
348+
/// Check if a `ColSet` contains a given column.
349+
pub fn contains(&self, needle: ColId) -> bool {
350+
match self.as_inline() {
351+
Ok(inline) => inline.contains(needle),
352+
// We can use binary search because the vector is guaranteed to be sorted.
353+
Err(heap) => heap.binary_search(&needle).is_ok(),
354+
}
355+
}
356+
357+
// Don't implement `insert` because repeated insertions will be O(n^2) if we want to keep the set sorted on the heap.
358+
// Use iterator methods to create a new `ColSet` instead.
359+
}
360+
361+
impl<C: Into<ColId>> FromIterator<C> for ColSet {
362+
fn from_iter<T: IntoIterator<Item = C>>(iter: T) -> Self {
363+
Self::from(iter.into_iter().collect::<ColList>())
364+
}
365+
}
366+
367+
impl From<ColList> for ColSet {
368+
fn from(mut list: ColList) -> Self {
369+
list.sort_dedup();
370+
Self(list)
371+
}
372+
}
373+
374+
impl Deref for ColSet {
375+
type Target = ColList;
376+
377+
fn deref(&self) -> &Self::Target {
378+
&self.0
379+
}
380+
}
381+
382+
impl fmt::Debug for ColSet {
383+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
384+
f.debug_set().entries(self.iter()).finish()
385+
}
386+
}
387+
314388
/// The inline version of a [`ColList`].
///
/// A newtype over a single `u64`; it is `Copy`, and equality compares that raw value.
315389
#[derive(Clone, Copy, PartialEq)]
316390
struct ColListInline(u64);
@@ -508,6 +582,20 @@ impl Deref for ColListVec {
508582
}
509583
}
510584

585+
impl DerefMut for ColListVec {
586+
fn deref_mut(&mut self) -> &mut Self::Target {
587+
let len = self.len() as usize;
588+
let ptr = self.0.as_ptr();
589+
// SAFETY: `ptr + 2` is always in bounds of the allocation and `ptr <= isize::MAX`.
590+
let ptr = unsafe { ptr.add(2) }.cast::<ColId>();
591+
// SAFETY:
592+
// - `ptr` is valid for reads and writes for `len * size_of::<ColId>` and it is properly aligned.
593+
// - `len` elements are initialized.
594+
// - `len * size_of::<ColId> <= isize::MAX` holds.
595+
unsafe { from_raw_parts_mut(ptr, len) }
596+
}
597+
}
598+
511599
impl Drop for ColListVec {
512600
fn drop(&mut self) {
513601
let capacity = self.capacity();
@@ -529,6 +617,18 @@ impl Clone for ColListVec {
529617
}
530618
}
531619

620+
/// Check if a buffer is sorted and deduplicated.
621+
fn is_sorted_and_deduped(data: &[ColId]) -> bool {
622+
match data {
623+
[] => true,
624+
[mut prev, rest @ ..] => !rest.iter().any(|elem| {
625+
let bad = prev >= *elem;
626+
prev = *elem;
627+
bad
628+
}),
629+
}
630+
}
631+
532632
#[cfg(test)]
533633
mod tests {
534634
use super::*;
@@ -616,5 +716,41 @@ mod tests {
616716
_ => prop_assert_eq!(list.as_singleton(), None),
617717
}
618718
}
719+
720+
#[test]
721+
fn test_set_inlines(mut cols in vec((0..ColList::FIRST_HEAP_COL_U16).prop_map_into(), 1..100)) {
722+
prop_assume!(!is_sorted_and_deduped(&cols[..]));
723+
724+
let list = ColList::from_iter(cols.iter().copied());
725+
prop_assert!(!list.is_inline());
726+
let set = ColSet::from(list);
727+
prop_assert!(set.is_inline());
728+
729+
for col in cols.iter() {
730+
prop_assert!(set.contains(*col));
731+
}
732+
733+
cols.sort();
734+
cols.dedup();
735+
prop_assert_eq!(set.iter().collect::<Vec<_>>(), cols);
736+
}
737+
738+
#[test]
739+
fn test_set_heap(mut cols in vec((ColList::FIRST_HEAP_COL_U16..).prop_map_into(), 1..100)) {
740+
prop_assume!(!is_sorted_and_deduped(&cols[..]));
741+
742+
let list = ColList::from_iter(cols.iter().copied());
743+
prop_assert!(!list.is_inline());
744+
let set = ColSet::from(list);
745+
prop_assert!(!set.is_inline());
746+
747+
for col in cols.iter() {
748+
prop_assert!(set.contains(*col));
749+
}
750+
751+
cols.sort();
752+
cols.dedup();
753+
prop_assert_eq!(set.iter().collect::<Vec<_>>(), cols);
754+
}
619755
}
620756
}

0 commit comments

Comments
 (0)