Skip to content

Commit 50567d6

Browse files
Add module with fast merge subroutines
1 parent 883f028 commit 50567d6

File tree

2 files changed

+96
-0
lines changed

2 files changed

+96
-0
lines changed

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use std::rc::Rc;
1919

2020
mod join;
2121
mod map;
22+
mod merge;
2223
mod test;
2324
mod treefrog;
2425
pub use crate::join::JoinInput;

src/merge.rs

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
//! Subroutines for merging sorted lists efficiently.
2+
3+
use std::cmp::Ordering;
4+
5+
/// Merges two sorted lists into a single sorted list, ignoring duplicates.
6+
pub fn merge_unique<T: Ord>(mut a: Vec<T>, mut b: Vec<T>) -> Vec<T> {
7+
// If one of the lists is zero-length, we don't need to do any work.
8+
if a.is_empty() {
9+
return b;
10+
}
11+
if b.is_empty() {
12+
return a;
13+
}
14+
15+
// Fast path for when all the new elements are after the existing ones.
16+
//
17+
// Cannot panic because we check for empty inputs above.
18+
if *a.last().unwrap() < b[0] {
19+
a.append(&mut b);
20+
return a;
21+
}
22+
if *b.last().unwrap() < a[0] {
23+
b.append(&mut a);
24+
return b;
25+
}
26+
27+
// Ensure that `out` always has sufficient capacity.
28+
//
29+
// SAFETY: The calls to `push_unchecked` below are safe because of this.
30+
let mut out = Vec::with_capacity(a.len() + b.len());
31+
32+
let mut a = a.into_iter();
33+
let mut b = b.into_iter();
34+
35+
// While both inputs have elements remaining, copy the lesser element to the output vector.
36+
while a.len() != 0 && b.len() != 0 {
37+
// SAFETY: The following calls to `get_unchecked` and `next_unchecked` are safe because we
38+
// ensure that `a.len() > 0` and `b.len() > 0` inside the loop.
39+
//
40+
// I was hoping to avoid using "unchecked" operations, but it seems the bounds checks
41+
// don't get optimized away. Using `ExactSizeIterator::is_empty` instead of checking `len`
42+
// seemed to help, but that method is unstable.
43+
44+
let a_elem = unsafe { a.as_slice().get_unchecked(0) };
45+
let b_elem = unsafe { b.as_slice().get_unchecked(0) };
46+
match a_elem.cmp(b_elem) {
47+
Ordering::Less => unsafe { push_unchecked(&mut out, next_unchecked(&mut a)) },
48+
Ordering::Greater => unsafe { push_unchecked(&mut out, next_unchecked(&mut b)) },
49+
Ordering::Equal => unsafe {
50+
push_unchecked(&mut out, next_unchecked(&mut a));
51+
std::mem::drop(next_unchecked(&mut b));
52+
},
53+
}
54+
}
55+
56+
// Once either `a` or `b` runs out of elements, copy all remaining elements in the other one
57+
// directly to the back of the output list.
58+
//
59+
// This branch is free because we have to check `a.is_empty()` above anyways.
60+
//
61+
// Calling `push_unchecked` in a loop was slightly faster than `out.extend(...)`
62+
// despite the fact that `std::vec::IntoIter` implements `TrustedLen`.
63+
if a.len() != 0 {
64+
for elem in a {
65+
unsafe {
66+
push_unchecked(&mut out, elem);
67+
}
68+
}
69+
} else {
70+
for elem in b {
71+
unsafe {
72+
push_unchecked(&mut out, elem);
73+
}
74+
}
75+
}
76+
77+
out
78+
}
79+
80+
/// Appends `value` to the end of `vec` without checking for, or allocating, spare capacity.
///
/// # Safety
///
/// The caller must guarantee that `vec.len() < vec.capacity()`. Calling this function on a
/// vector with no spare capacity writes past the end of its allocation, which is undefined
/// behavior.
unsafe fn push_unchecked<T>(vec: &mut Vec<T>, value: T) {
    let len = vec.len();

    // Catch contract violations in debug builds; this check is compiled out in release builds.
    debug_assert!(
        len < vec.capacity(),
        "push_unchecked called on a vector with no spare capacity"
    );

    // SAFETY: the caller guarantees `len < capacity`, so the slot at offset `len` lies inside
    // the vector's allocation and is not yet part of the vector. Writing `value` there and then
    // growing the length by one makes exactly that initialized slot the new last element.
    unsafe {
        std::ptr::write(vec.as_mut_ptr().add(len), value);
        vec.set_len(len + 1);
    }
}
88+
89+
/// Takes the next element out of `iter`, treating exhaustion as impossible.
///
/// Behaves like `iter.next().unwrap()`, except that the empty case is declared unreachable to
/// the compiler instead of panicking.
///
/// # Safety
///
/// `iter` must have at least one element remaining. Calling this function on an exhausted
/// iterator is undefined behavior.
unsafe fn next_unchecked<T>(iter: &mut std::vec::IntoIter<T>) -> T {
    // Catch contract violations in debug builds; this check is compiled out in release builds.
    debug_assert!(
        iter.len() != 0,
        "next_unchecked called on an exhausted iterator"
    );

    match iter.next() {
        Some(item) => item,
        // SAFETY: the caller guarantees the iterator is non-empty, so `next` cannot have
        // returned `None`.
        None => unsafe { std::hint::unreachable_unchecked() },
    }
}

0 commit comments

Comments
 (0)