Skip to content

Commit 09e18f6

Browse files
authored
Preparation for open sourcing (#15)
General preparation for open sourcing
1 parent 2bbeecd commit 09e18f6

File tree

8 files changed

+170
-336
lines changed

8 files changed

+170
-336
lines changed

LICENSE.txt renamed to LICENSE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
The MIT License (MIT)
22

3-
Copyright (c) Reference Genomics, Inc.
3+
Copyright (c) 2022 One Codex, Inc.
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

README.md

Lines changed: 35 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,44 @@
22

33
![ci](https://github.com/onecodex/mmap-bitvec/workflows/ci/badge.svg)
44

5-
mmap-bitvec is a library for working with mmap-backed bit-vectors and some simple
6-
data structures derived from bit-vectors.
5+
`mmap-bitvec` is a library for working with bit-vectors backed by memory-mapped files. Included is a simple Bloom filter built on top of such bit-vectors.
76

8-
## Benchmarks
9-
10-
To run benchmarks you need to download a bfield.mmap file, I used `s3://refgenomics-datafiles/dbs/mg_targeted_loci_20160517/bfield.mmap` in
11-
the root of the repo and then run `cargo +nightly bench`.
12-
13-
## Example
7+
## Examples
148

9+
Using a memory-mapped bit-vector:
1510
```rust
16-
let mut b = BitVec::from_memory(128).unwrap();
11+
// Build an in-memory bit-vector with a capacity of 128 bits.
12+
use mmap_bitvec::{MmapBitVec, BitVector};
13+
14+
let mut bitvec = MmapBitVec::from_memory(128).unwrap();
15+
bitvec.set(2, true);
16+
assert!(bitvec.get(2));
17+
assert_eq!(bitvec.get_range(0..8), 0b00100000);
18+
19+
// Write the bit-vector to disk, passing `None` for the optional magic-bytes argument
20+
let dir = tempfile::tempdir().unwrap();
21+
bitvec.save_to_disk(dir.path().join("test"), None, &[])
22+
.unwrap();
23+
let f = MmapBitVec::open(dir.path().join("test"), None, false).unwrap();
24+
assert_eq!(f.get(2), true);
25+
```
1726

18-
b.set(2, true);
19-
assert!(b.get(2));
20-
assert_eq!(b.get_range(0..8), 0b00100000);
27+
Using the Bloom filter:
28+
```rust,no_run
29+
use mmap_bitvec::{BloomFilter};
30+
// Create a Bloom filter with a capacity of 100 bits that uses 2 hash functions on each insert.
31+
let mut filter = BloomFilter::new(Some("./test.bloom"), 100, 2).unwrap();
32+
let (a, b) = (1, 2);
33+
assert!(!filter.contains(a));
34+
assert!(!filter.contains(b));
35+
36+
filter.insert(b);
37+
assert!(!filter.contains(a));
38+
assert!(filter.contains(b));
2139
```
2240

41+
## License
42+
43+
This project is licensed under the [MIT license].
44+
45+
[MIT license]: https://github.com/onecodex/mmap-bitvec/blob/master/LICENSE

benches/benchmark.rs

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7,16 +7,15 @@ use std::mem::transmute;
77
use std::ops::Range;
88

99
use memmap2::{MmapMut, MmapOptions};
10-
use mmap_bitvec::{combinatorial::rank, BitVector, MmapBitVec};
10+
use mmap_bitvec::{BitVector, MmapBitVec};
1111

1212
use criterion::{criterion_group, criterion_main, Criterion};
1313

1414
type BitVecSlice = u64;
15+
1516
const BIT_VEC_SLICE_SIZE: u8 = 64;
16-
const FILENAME: &str = "./bfield.mmap";
17+
const FILENAME: &str = "./data/bfield.mmap";
1718

18-
// we could use an RNG, but I want to make sure everything is
19-
// as comparable as possible
2019
fn next_random(n: usize) -> usize {
2120
// https://en.wikipedia.org/wiki/Xorshift
2221
let mut x = n as u32;
@@ -40,8 +39,6 @@ fn get_range_simplified(mmap: &MmapMut, size: usize, l: usize) -> BitVecSlice {
4039
v >>= 7 - ((l + 63) & 7);
4140

4241
if l < size - BIT_VEC_SLICE_SIZE as usize {
43-
// really nasty/unsafe, but we're just reading a u64/u128 out instead of doing it
44-
// byte-wise --- also does not work with legacy mode!!!
4542
unsafe {
4643
let lg_ptr: *const BitVecSlice = transmute(ptr.offset(byte_idx_st as isize));
4744
v |= (*lg_ptr).to_be() << (l & 7) >> (BIT_VEC_SLICE_SIZE - new_size);
@@ -82,8 +79,6 @@ fn get_range(mmap: &MmapMut, size: usize, r: Range<usize>) -> BitVecSlice {
8279
v >>= 7 - ((r.end - 1) & 7);
8380

8481
if r.start < size - BIT_VEC_SLICE_SIZE as usize {
85-
// really nasty/unsafe, but we're just reading a u64/u128 out instead of doing it
86-
// byte-wise --- also does not work with legacy mode!!!
8782
unsafe {
8883
let lg_ptr: *const BitVecSlice = transmute(ptr.offset(byte_idx_st as isize));
8984
v |= (*lg_ptr).to_be() << (r.start & 7) >> (BIT_VEC_SLICE_SIZE - new_size);
@@ -114,7 +109,7 @@ fn bench_get_range_simplified() {
114109

115110
let mut r = 0;
116111
let mut i = 1;
117-
for _ in 0..100000 {
112+
for _ in 0..100_000 {
118113
let l = i % (size - 64);
119114
r += get_range_simplified(&mmap, size, l).count_ones();
120115
i = next_random(i);
@@ -132,7 +127,7 @@ fn bench_get_range() {
132127

133128
let mut r = 0;
134129
let mut i = 1;
135-
for _ in 0..100000 {
130+
for _ in 0..100_000 {
136131
let l = i % (size - 64);
137132
r += get_range(&mmap, size, l..l + 64).count_ones();
138133
i = next_random(i);
@@ -143,15 +138,15 @@ fn bench_get_range_actual() {
143138
let bitvec = MmapBitVec::open_no_header(FILENAME, 0).unwrap();
144139
let mut r = 0;
145140
let mut i = 1;
146-
for _ in 0..100000 {
141+
for _ in 0..100_000 {
147142
let l = i % (bitvec.size() - 64);
148143
r += bitvec.get_range(l..l + 64).count_ones();
149144
i = next_random(i);
150145
}
151146
}
152147

153148
fn bench_save_to_disk(bv: &MmapBitVec) {
154-
bv.save_to_disk("hello.tmp", [0, 1], &[]).unwrap();
149+
bv.save_to_disk("hello.tmp", Some([0, 1]), &[]).unwrap();
155150
}
156151

157152
fn criterion_benchmark(c: &mut Criterion) {
@@ -161,12 +156,14 @@ fn criterion_benchmark(c: &mut Criterion) {
161156
});
162157
c.bench_function("get_range", |b| b.iter(|| bench_get_range()));
163158
c.bench_function("save_to_disk", |b| {
164-
let mut bitvec = MmapBitVec::from_memory(1_000_000_000).unwrap();
165-
for i in 0..1_000_000_000 {
159+
let mut bitvec = MmapBitVec::from_memory(100_000_000).unwrap();
160+
for i in 0..100_000_000 {
166161
bitvec.set(i, true);
167162
}
168163
b.iter(|| bench_save_to_disk(&bitvec))
169164
});
165+
// clean up temp file
166+
std::fs::remove_file("./hello.tmp").unwrap();
170167
}
171168

172169
criterion_group!(benches, criterion_benchmark);

src/bitvec.rs

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
use std::ops::Range;
22

3-
/// A nasic bitvector trait that we implement for mmap
3+
/// A basic bitvector trait that we implement for mmap
44
pub trait BitVector {
55
/// Get the value at bit `i`
66
fn get(&self, i: usize) -> bool;
@@ -14,7 +14,7 @@ pub trait BitVector {
1414
r.fold(0, |a, x| a + if self.get(x) { 1 } else { 0 })
1515
}
1616

17-
/// Returns the position of the n-th bit set
17+
/// Returns the position of the n-th set bit
1818
fn select(&self, n: usize, start: usize) -> Option<usize> {
1919
let mut bits_left = n;
2020

@@ -30,7 +30,10 @@ pub trait BitVector {
3030
None
3131
}
3232

33-
/// Return all the bits in the given range as a u128
33+
/// Return all the bits in the given range as a `u128`. The input range `r` must span at most 128 bits,
34+
/// as the result is bitpacked into a `u128`.
35+
///
36+
/// For example, an input range of `0..8` will set the first 8 bits of the returned `u128` to the results of `self.get(0)`, `self.get(1)`, ..., `self.get(7)`.
3437
fn get_range(&self, r: Range<usize>) -> u128 {
3538
if r.end - r.start > 128 {
3639
panic!("Range too large (>128)")
@@ -49,7 +52,7 @@ pub trait BitVector {
4952
bvs
5053
}
5154

52-
/// Sets all the bits in the given range from the given u128
55+
/// Sets all the bits in the given range from the given `u128`
5356
fn set_range(&mut self, r: Range<usize>, x: u128) {
5457
let mut cur = x;
5558
for i in r.rev() {

src/bloom.rs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@ use murmurhash3::murmurhash3_x64_128;
77
use crate::bitvec::BitVector;
88
use crate::mmap_bitvec::MmapBitVec;
99

10-
/// Newtype for murmur hashing
11-
/// we don't want to use murmurhash3::Murmur3Hasher b/c it makes copies of the
12-
/// bytes to be hashed with every single `hash` call
10+
/// Newtype for murmur hashing.
11+
/// We don't use murmurhash3::Murmur3Hasher because it makes copies of the
12+
/// bytes to be hashed on every `hash` call
1313
#[derive(Default)]
1414
pub struct MurmurHasher(u64, u64);
1515

@@ -30,7 +30,7 @@ impl Hasher for MurmurHasher {
3030
let hash = murmurhash3_x64_128(bytes, self.0);
3131
*self = MurmurHasher(hash.0, hash.1);
3232
}
33-
// have to provide this to fulfill the trait requirements
33+
3434
fn finish(&self) -> u64 {
3535
self.0
3636
}
@@ -43,9 +43,9 @@ pub struct BloomFilter {
4343
}
4444

4545
impl BloomFilter {
46-
/// Creates a new Bloom filter (or opens an existing one, if the file
47-
/// already exists) of a given size (bits) and with a given number of
48-
/// hash functions for each insert (n_hashes). If a filename is not
46+
/// Creates a new `BloomFilter` (or opens an existing one, if the file
47+
/// already exists) of a given size (in bits) and with a given number of
48+
/// hash functions for each insert (`n_hashes`). If a filename is not
4949
/// passed, the Bloom filter will be created in memory.
5050
pub fn new<P>(filename: Option<P>, bits: usize, n_hashes: u8) -> Result<Self, io::Error>
5151
where
@@ -57,7 +57,7 @@ impl BloomFilter {
5757
if Path::exists(filename.as_ref()) {
5858
MmapBitVec::open(&filename, Some(b"!!"), false)?
5959
} else {
60-
MmapBitVec::create(&filename, bits, *b"!!", &header)?
60+
MmapBitVec::create(&filename, bits, Some(*b"!!"), &header)?
6161
}
6262
}
6363
None => MmapBitVec::from_memory(bits)?,
@@ -68,7 +68,7 @@ impl BloomFilter {
6868
})
6969
}
7070

71-
/// Insert an item into the bloom filter.
71+
/// Insert an item into the Bloom filter.
7272
pub fn insert<H>(&mut self, item: H)
7373
where
7474
H: Hash,
@@ -82,7 +82,7 @@ impl BloomFilter {
8282
}
8383
}
8484

85-
/// Check if an item is in the bloom filter already.
85+
/// Check if an item is in the Bloom filter already.
8686
pub fn contains<H>(&self, item: H) -> bool
8787
where
8888
H: Hash,

0 commit comments

Comments
 (0)