Skip to content

Commit f2560fe

Browse files
authored
Add in memory mmap (#6)
* wip * Add test + benchmark * wip in memory * Add a in memory + persist to disk setup * Refactor loading * Using UnsafeCell * update + fmt * Use mmap-bitvec master
1 parent 4cce4e4 commit f2560fe

File tree

6 files changed

+293
-101
lines changed

6 files changed

+293
-101
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
**/*.rs.bk
33
Cargo.lock
44
.DS_Store
5-
.idea/
5+
.idea/
6+
old/

Cargo.toml

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,14 @@ edition = "2018"
66

77
[dependencies]
88
bincode = "1"
9-
mmap-bitvec = { git="ssh://git@github.com/onecodex/mmap-bitvec.git", tag="v0.3.1" }
9+
mmap-bitvec = { git="ssh://git@github.com/onecodex/mmap-bitvec.git" }
1010
murmurhash3 = "0.0.5"
11-
serde = "1.0.15"
12-
serde_derive = "1.0.15"
11+
serde = { version = "1.0", features = ["derive"] }
1312

14-
[features]
15-
prefetching = []
13+
[dev-dependencies]
14+
criterion = "0.3"
15+
tempfile = "3.3.0"
16+
17+
[[bench]]
18+
name = "benchmark"
19+
harness = false

benches/benchmark.rs

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
use bfield::BField;
2+
use criterion::{black_box, criterion_group, criterion_main, Criterion};
3+
4+
fn build_bfield(n_secondaries: u8) -> BField<String> {
5+
let tmp_dir = tempfile::tempdir().unwrap();
6+
BField::create(
7+
tmp_dir.path(),
8+
"bfield",
9+
1_000_000,
10+
10,
11+
39,
12+
4,
13+
0.1,
14+
0.025,
15+
n_secondaries,
16+
String::new(),
17+
)
18+
.expect("to build")
19+
}
20+
21+
/// Benchmarks repeated insertion of one fixed key/value pair into the first
/// member (pass 0) of a four-member b-field.
fn bench_insertion(c: &mut Criterion) {
    // `insert` takes `&self`, so the binding does not need to be mutable.
    let bfield = build_bfield(4);
    // Build the key once so the measured closure times the insertion itself
    // rather than a heap allocation per iteration.
    let key = 1_u32.to_be_bytes();
    c.bench_function("bfield insertion", |b| {
        b.iter(|| bfield.insert(&key, 1_u32, 0))
    });
}
27+
28+
fn bench_querying(c: &mut Criterion) {
29+
let mut bfield = build_bfield(4);
30+
31+
// Identity database
32+
let max_value: u32 = 10_000;
33+
for p in 0..4 {
34+
for i in 0..max_value {
35+
bfield.insert(&i.to_be_bytes().to_vec(), i, p as usize);
36+
}
37+
}
38+
39+
c.bench_function("bfield querying", |b| {
40+
b.iter(|| black_box(bfield.get(black_box(&10_000_i32.to_be_bytes().to_vec()))))
41+
});
42+
}
43+
44+
criterion_group!(benches, bench_insertion, bench_querying);
45+
criterion_main!(benches);

src/bfield.rs

Lines changed: 143 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use std::io;
2-
use std::path::Path;
2+
use std::path::{Path, PathBuf};
33

44
use mmap_bitvec::combinatorial::rank;
55
use serde::de::DeserializeOwned;
@@ -12,37 +12,42 @@ pub struct BField<T> {
1212
read_only: bool,
1313
}
1414

15-
impl<'a, T: Clone + DeserializeOwned + Serialize> BField<T> {
15+
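// SAFETY: these impls rely on the underlying mmap being Send + Sync.
// NOTE(review): they place no bound on `T`, and the insert methods take `&self`,
// so confirm that sharing a BField between threads that write is actually sound.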
16+
unsafe impl<T> Send for BField<T> {}
17+
unsafe impl<T> Sync for BField<T> {}
18+
19+
impl<T: Clone + DeserializeOwned + Serialize> BField<T> {
1620
#[allow(clippy::too_many_arguments)]
1721
pub fn create<P>(
18-
filename: P,
22+
directory: P,
23+
filename: &str,
1924
size: usize,
2025
n_hashes: u8, // k
2126
marker_width: u8, // nu
2227
n_marker_bits: u8, // kappa
2328
secondary_scaledown: f64, // beta
2429
max_scaledown: f64,
2530
n_secondaries: u8,
31+
in_memory: bool,
2632
other_params: T,
2733
) -> Result<Self, io::Error>
2834
where
2935
P: AsRef<Path>,
3036
{
37+
debug_assert!(!filename.is_empty());
3138
let mut cur_size = size;
3239
let mut members = Vec::new();
40+
3341
for n in 0..n_secondaries {
34-
// panics if filename == ''
35-
let file = filename.as_ref().with_file_name(Path::with_extension(
36-
Path::file_stem(filename.as_ref()).unwrap().as_ref(),
37-
format!("{}.bfd", n),
38-
));
42+
let file = directory.as_ref().join(format!("{}.{}.bfd", filename, n));
3943
let params = if n == 0 {
4044
Some(other_params.clone())
4145
} else {
4246
None
4347
};
4448
let member = BFieldMember::create(
4549
file,
50+
in_memory,
4651
cur_size,
4752
n_hashes,
4853
marker_width,
@@ -66,33 +71,58 @@ impl<'a, T: Clone + DeserializeOwned + Serialize> BField<T> {
6671
})
6772
}
6873

69-
pub fn from_file<P>(filename: P, read_only: bool) -> Result<Self, io::Error>
70-
where
71-
P: AsRef<Path>,
72-
{
74+
pub fn load<P: AsRef<Path>>(main_db_path: P, read_only: bool) -> Result<Self, io::Error> {
7375
let mut members = Vec::new();
7476
let mut n = 0;
77+
78+
let main_db_filename = match main_db_path.as_ref().file_name() {
79+
Some(p) => p.to_string_lossy(),
80+
None => {
81+
return Err(io::Error::new(
82+
io::ErrorKind::NotFound,
83+
format!("Couldn't get filename from {:?}", main_db_path.as_ref()),
84+
));
85+
}
86+
};
87+
assert!(main_db_path.as_ref().parent().is_some());
88+
assert!(main_db_filename.ends_with("0.bfd"));
89+
7590
loop {
76-
let member_filename = filename.as_ref().with_file_name(Path::with_extension(
77-
Path::file_stem(filename.as_ref()).unwrap().as_ref(),
78-
format!("{}.bfd", n),
79-
));
80-
if !member_filename.exists() {
91+
let member_filename =
92+
PathBuf::from(&main_db_filename.replace("0.bfd", &format!("{n}.bfd")));
93+
let member_path = main_db_path
94+
.as_ref()
95+
.parent()
96+
.unwrap()
97+
.join(member_filename);
98+
if !member_path.exists() {
8199
break;
82100
}
83-
let member = BFieldMember::open(&member_filename, read_only)?;
101+
let member = BFieldMember::open(&member_path, read_only)?;
84102
members.push(member);
85103
n += 1;
86104
}
105+
87106
if members.is_empty() {
88107
return Err(io::Error::new(
89108
io::ErrorKind::NotFound,
90-
format!("No Bfield found at {:?}", filename.as_ref()),
109+
format!("No Bfield found at {:?}", main_db_path.as_ref()),
91110
));
92111
}
93112
Ok(BField { members, read_only })
94113
}
95114

115+
pub fn persist_to_disk(self) -> Result<Self, io::Error> {
116+
let mut members = Vec::with_capacity(self.members.len());
117+
for m in self.members {
118+
members.push(m.persist_to_disk()?);
119+
}
120+
Ok(Self {
121+
members,
122+
read_only: self.read_only,
123+
})
124+
}
125+
96126
pub fn build_params(&self) -> (u8, u8, u8, Vec<usize>) {
97127
let (_, n_hashes, marker_width, n_marker_bits) = self.members[0].info();
98128
let sizes = self.members.iter().map(|i| i.info().0).collect();
@@ -117,36 +147,36 @@ impl<'a, T: Clone + DeserializeOwned + Serialize> BField<T> {
117147
/// of the b-field by making them indeterminate (which will make them fall
118148
/// back to the secondaries where they don't exist and thus it'll appear
119149
/// as if they were never inserted to begin with)
120-
pub fn force_insert(&mut self, key: &[u8], value: BFieldVal) {
150+
pub fn force_insert(&self, key: &[u8], value: BFieldVal) {
121151
debug_assert!(!self.read_only, "Can't insert into read_only bfields");
122-
for secondary in self.members.iter_mut() {
123-
if secondary.mask_or_insert(&key, value) {
152+
for secondary in &self.members {
153+
if secondary.mask_or_insert(key, value) {
124154
break;
125155
}
126156
}
127157
}
128158

129-
pub fn insert(&mut self, key: &[u8], value: BFieldVal, pass: usize) -> bool {
159+
pub fn insert(&self, key: &[u8], value: BFieldVal, pass: usize) -> bool {
130160
debug_assert!(!self.read_only, "Can't insert into read_only bfields");
131161
debug_assert!(
132162
pass < self.members.len(),
133163
"Can't have more passes than bfield members"
134164
);
135165
if pass > 0 {
136166
for secondary in self.members[..pass].iter() {
137-
match secondary.get(&key) {
167+
match secondary.get(key) {
138168
BFieldLookup::Indeterminate => continue,
139169
_ => return false,
140170
}
141171
}
142172
}
143-
self.members[pass].insert(&key, value);
173+
self.members[pass].insert(key, value);
144174
true
145175
}
146176

147177
pub fn get(&self, key: &[u8]) -> Option<BFieldVal> {
148178
for secondary in self.members.iter() {
149-
match secondary.get(&key) {
179+
match secondary.get(key) {
150180
BFieldLookup::Indeterminate => continue,
151181
BFieldLookup::Some(value) => return Some(value),
152182
BFieldLookup::None => return None,
@@ -161,3 +191,90 @@ impl<'a, T: Clone + DeserializeOwned + Serialize> BField<T> {
161191
self.members.iter().map(|m| m.info()).collect()
162192
}
163193
}
194+
195+
#[cfg(test)]
196+
mod tests {
197+
use super::*;
198+
199+
/// Builds a file-backed b-field, fills it with an identity mapping, and checks
/// that every key reads back both from the live instance and after reloading
/// the files from disk.
#[test]
fn can_build_and_query_file_bfield() {
    let tmp_dir = tempfile::tempdir().unwrap();
    let n_secondaries = 4;
    let max_value: u32 = 10_000;

    // Every key in `0..max_value` must map back to itself.
    let assert_identity = |bf: &BField<String>| {
        for i in 0..max_value {
            let val = bf.get(&i.to_be_bytes()).unwrap();
            assert_eq!(i, val);
        }
    };

    let bfield = BField::create(
        tmp_dir.path(),
        "bfield",
        1_000_000,
        10,
        39,
        4,
        0.1,
        0.025,
        n_secondaries,
        false,
        String::new(),
    )
    .expect("to build");

    // Identity database
    for pass in 0..usize::from(n_secondaries) {
        for i in 0..max_value {
            bfield.insert(&i.to_be_bytes(), i, pass);
        }
    }
    assert_identity(&bfield);
    drop(bfield);

    // and we can load them
    let main_member = tmp_dir.path().join("bfield.0.bfd");
    let reloaded = BField::<String>::load(&main_member, true).unwrap();
    assert_identity(&reloaded);
}
239+
240+
/// Builds an in-memory b-field, fills it with an identity mapping, persists it
/// to disk, and checks that the persisted instance has its member files on
/// disk and still answers every query.
#[test]
fn can_build_and_query_in_memory_bfield() {
    let tmp_dir = tempfile::tempdir().unwrap();
    let n_secondaries = 4;
    let bfield = BField::create(
        tmp_dir.path(),
        "bfield",
        1_000_000,
        10,
        39,
        4,
        0.1,
        0.025,
        n_secondaries,
        true,
        String::new(),
    )
    .expect("to build");

    // Identity database
    let max_value: u32 = 10_000;
    for p in 0..n_secondaries {
        for i in 0..max_value {
            bfield.insert(&i.to_be_bytes(), i, usize::from(p));
        }
    }

    for i in 0..max_value {
        let val = bfield.get(&i.to_be_bytes()).unwrap();
        assert_eq!(i, val);
    }

    // `persist_to_disk` consumes the b-field and hands back the persisted one,
    // so rebind it; using the old binding afterwards is a use-after-move.
    let bfield = bfield.persist_to_disk().unwrap();
    for m in &bfield.members {
        assert!(m.filename.exists());
    }
    for i in 0..max_value {
        let val = bfield.get(&i.to_be_bytes()).unwrap();
        assert_eq!(i, val);
    }
}
280+
}

0 commit comments

Comments
 (0)