Skip to content

0.7.0 #34

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 63 commits into from
May 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
63 commits
Select commit Hold shift + click to select a range
eee5552
refactor: change segment IDs to integer
marvin-j97 May 1, 2024
4117810
breaking: change segment ids to u64
marvin-j97 May 2, 2024
ddbcf2c
Merge branch 'main' into refactor/tree-id
marvin-j97 May 2, 2024
d3f6444
rewrite segment meta to not use JSON
marvin-j97 May 4, 2024
c0b2a8a
refactor
marvin-j97 May 4, 2024
2ea3c20
add test
marvin-j97 May 4, 2024
901c96f
refactor: block index writing
marvin-j97 May 4, 2024
19749f5
update compression type on-disk repr
marvin-j97 May 4, 2024
400d7ee
refactor
marvin-j97 May 4, 2024
69ab131
refactor
marvin-j97 May 5, 2024
542c1e1
reverted block index writing refactor
marvin-j97 May 5, 2024
217eb36
test: add value raw deserialization
marvin-j97 May 5, 2024
492b19d
add benchmark for BlockHandleBlock
marvin-j97 May 5, 2024
fc28866
update benchmark
marvin-j97 May 5, 2024
925166f
allow setting cache policy on segment readers
marvin-j97 May 5, 2024
b579525
use cache policy when loading index blocks
marvin-j97 May 5, 2024
97aa126
cleanup imports
marvin-j97 May 6, 2024
eb49099
fix: tiered compaction ordering
marvin-j97 May 6, 2024
c5fa555
refactor
marvin-j97 May 6, 2024
9fae07d
closes #25
marvin-j97 May 6, 2024
2f39c18
change config format to json
marvin-j97 May 6, 2024
9165f4f
change config filename
marvin-j97 May 6, 2024
0c5e334
hide config member
marvin-j97 May 6, 2024
5507c88
fix compilation
marvin-j97 May 6, 2024
3a620d8
change visibility
marvin-j97 May 6, 2024
717171e
remove unneeded import
marvin-j97 May 6, 2024
3eaa7ce
test: version ser-de
marvin-j97 May 6, 2024
c319346
test: more ser-de tests
marvin-j97 May 6, 2024
9923f73
update docs
marvin-j97 May 6, 2024
4d736d2
update gitignore
marvin-j97 May 6, 2024
b1c2685
lint
marvin-j97 May 6, 2024
87ce206
move fs_extra to dev deps
marvin-j97 May 6, 2024
eabd585
perf: remove heap allocation in read path
marvin-j97 May 6, 2024
0d9078f
refactor: simplified Levels::get_all_segments_flattened
marvin-j97 May 6, 2024
7bee28e
fix: levels iter
marvin-j97 May 6, 2024
7d34732
clippy fix
marvin-j97 May 6, 2024
e7f778b
add comment
marvin-j97 May 6, 2024
99a63b3
add comment
marvin-j97 May 6, 2024
2408ebd
clippy fix
marvin-j97 May 6, 2024
4ba8fbb
add tli benchmark
marvin-j97 May 7, 2024
9c65a71
stash
marvin-j97 May 8, 2024
671f33f
pass all segment iter tests
marvin-j97 May 9, 2024
8042104
pass all tests again
marvin-j97 May 9, 2024
33262b0
clippy fix
marvin-j97 May 9, 2024
90aeb5d
perf: set upper bound for ranges
marvin-j97 May 9, 2024
a1889aa
refactor: remove unused functions
marvin-j97 May 9, 2024
441d94a
fix: segment point reads
marvin-j97 May 9, 2024
f988bce
refactor
marvin-j97 May 9, 2024
5c560b5
add props to segment meta & persisted config
marvin-j97 May 9, 2024
60e1ec7
refactor
marvin-j97 May 9, 2024
7cdc4a6
fix: build
marvin-j97 May 9, 2024
979451b
update data format
marvin-j97 May 9, 2024
3c4d33f
add comment
marvin-j97 May 9, 2024
26685b5
update comment
marvin-j97 May 9, 2024
fc4f0ce
Merge pull request #35 from fjall-rs/block-index
marvin-j97 May 9, 2024
439708b
update gitignore
marvin-j97 May 9, 2024
44c6979
refactor
marvin-j97 May 9, 2024
7ad34f3
refactor
marvin-j97 May 9, 2024
5764347
clippy
marvin-j97 May 9, 2024
7e30f6b
move memtable mvcc test
marvin-j97 May 9, 2024
cd37ab0
test: segment iter through multiple blocks of the same key
marvin-j97 May 9, 2024
0e5ef12
fix: test
marvin-j97 May 9, 2024
492f8f4
update test names
marvin-j97 May 9, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,14 +41,14 @@ jobs:
workspaces: >
. -> target
examples/kv -> target
- name: Build
run: cargo build -v
- name: Install cargo-all-features
run: cargo install cargo-all-features
- name: Format
run: cargo fmt --all -- --check
- name: Clippy
run: cargo clippy
- name: Run tests
run: cargo test -v -- --nocapture
run: cargo test-all-features -v -- --nocapture
env:
RUST_LOG: debug
- name: Build & test LSM examples
Expand Down
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,6 @@ Cargo.lock
.lsm.data
.data
/old_*
.test
.test*
segment_history.jsonl
.block_index_test
10 changes: 4 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,28 +19,26 @@ path = "src/lib.rs"
[features]
default = []
bloom = ["dep:seahash"]
segment_history = []
segment_history = ["dep:serde", "dep:serde_json"]

[dependencies]
byteorder = "1.5.0"
chrono = "0.4.38"
crc32fast = "1.4.0"
crossbeam-skiplist = "0.1.3"
double-ended-peekable = "0.1.0"
fs_extra = "1.3.0"
guardian = "1.1.0"
log = "0.4.21"
lz4_flex = "0.11.3"
path-absolutize = "3.1.1"
quick_cache = { version = "0.5.1", default-features = false, features = [] }
rand = "0.8.5"
seahash = { version = "4.1.0", optional = true }
serde = { version = "1.0.200", features = ["derive", "rc"] }
serde_json = "1.0.116"
serde = { version = "1.0.200", features = ["derive", "rc"], optional = true }
serde_json = { version = "1.0.116", optional = true }
tempfile = "3.10.1"

[dev-dependencies]
criterion = { version = "0.5.1", features = ["html_reports"] }
fs_extra = "1.3.0"
nanoid = "0.4.0"
test-log = "0.2.16"

Expand Down
118 changes: 113 additions & 5 deletions benches/lsmt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,28 @@ use lsm_tree::{
use nanoid::nanoid;
use std::{io::Write, sync::Arc};

/// Benchmarks a full iteration over a tree's level manifest.
///
/// For every segment count in the list, a fresh tree is opened inside a
/// temporary directory and the active memtable is flushed after each single
/// insert. The measured closure counts the entries yielded by `levels.iter()`
/// and asserts that the count equals the number of flushes performed.
fn iterate_level_manifest(c: &mut Criterion) {
    let mut group = c.benchmark_group("Iterate level manifest");

    for segment_count in [0, 1, 5, 10, 20, 50, 100, 250, 500, 1_000] {
        let folder = tempfile::tempdir().unwrap();
        let tree = Config::new(folder).block_size(1_024).open().unwrap();

        // One flush per inserted item, so every insert lands in its own flush.
        for idx in 0..segment_count {
            tree.insert("a", "b", idx as u64);
            tree.flush_active_memtable().unwrap();
        }

        let bench_id = format!("iterate {segment_count} segments");

        group.bench_function(bench_id, |b| {
            // The read guard is taken once, outside the measured closure,
            // so only the iteration itself is timed.
            let levels = tree.levels.read().unwrap();

            b.iter(|| assert_eq!(levels.iter().count(), segment_count));
        });
    }
}

fn memtable_get_upper_bound(c: &mut Criterion) {
let memtable = MemTable::default();

Expand All @@ -24,6 +46,66 @@ fn memtable_get_upper_bound(c: &mut Criterion) {
});
}

/// Benchmarks lookups in the top-level index (`TopLevelIndex`) of the block index.
///
/// For each item count, an index is built from handles whose start key is the
/// big-endian encoding of their position and whose offset is that same
/// position. Two operations are measured against it:
///
/// - `get_next_block_handle`, starting from the handle found for the probe key
/// - `get_lowest_block_containing_item`, for the probe key itself
///
/// The probe sits at 60% of the key range (`item_count / 10 * 6`).
fn tli_find_item(c: &mut Criterion) {
    use lsm_tree::segment::block_index::{
        block_handle::KeyedBlockHandle, top_level::TopLevelIndex,
    };

    let mut group = c.benchmark_group("TLI find item");

    for item_count in [10u64, 100, 1_000, 10_000, 100_000, 1_000_000] {
        let handles: Vec<_> = (0..item_count)
            .map(|x| KeyedBlockHandle {
                start_key: x.to_be_bytes().into(),
                offset: x,
                size: 0,
            })
            .collect();

        let index = TopLevelIndex::from_boxed_slice(handles.into());

        // Position of the handle both benchmarks look up.
        let probe = item_count / 10 * 6;

        group.bench_function(
            format!("TLI get_next_block_handle ({item_count} items)"),
            |b| {
                let key = probe.to_be_bytes();
                // The handle following the probe starts at `probe + 1`.
                let expected: Arc<[u8]> = (probe + 1).to_be_bytes().into();

                // Resolved once up front; only the "next handle" step is measured.
                let block = index.get_lowest_block_containing_item(&key).unwrap();

                b.iter(|| {
                    let next = index.get_next_block_handle(block.offset).unwrap();
                    assert_eq!(expected, next.start_key);
                })
            },
        );

        group.bench_function(
            format!("TLI get_block_containing_item ({item_count} items)"),
            |b| {
                let key = probe.to_be_bytes();

                b.iter(|| {
                    let found = index.get_lowest_block_containing_item(&key).unwrap();
                    assert_eq!(key, &*found.start_key);
                })
            },
        );
    }
}

fn value_block_size(c: &mut Criterion) {
let mut group = c.benchmark_group("ValueBlock::size");

Expand All @@ -49,6 +131,30 @@ fn value_block_size(c: &mut Criterion) {
}
}

/// Benchmarks `IndexBlock::get_lowest_block_containing_item` for several block sizes.
///
/// Each block holds `item_count` handles whose start keys are the big-endian
/// encodings of `0..item_count`; offset and size are fixed dummy values.
/// The key looked up is the encoding of `0`, i.e. the start key of the first
/// handle in the block.
///
/// NOTE(review): the group label refers to `BlockHandleBlock`, while the type
/// exercised here is `IndexBlock`.
fn value_block_size_find(c: &mut Criterion) {
    use lsm_tree::segment::block_index::{block_handle::KeyedBlockHandle, IndexBlock};

    let mut group = c.benchmark_group("Find item in BlockHandleBlock");

    // NOTE: More than 1000 items in a single block is unlikely
    for item_count in [10, 100, 500, 1_000] {
        group.bench_function(format!("{item_count} items"), |b| {
            let block = IndexBlock {
                items: (0u64..item_count)
                    .map(|x| KeyedBlockHandle {
                        start_key: x.to_be_bytes().into(),
                        offset: 56,
                        size: 635,
                    })
                    .collect(),
                crc: 0,
            };

            let needle = 0u64.to_be_bytes();

            b.iter(|| block.get_lowest_block_containing_item(&needle))
        });
    }
}

fn load_block_from_disk(c: &mut Criterion) {
let mut group = c.benchmark_group("Load block from disk");

Expand Down Expand Up @@ -102,7 +208,7 @@ fn load_block_from_disk(c: &mut Criterion) {
}
}

fn file_descriptor(c: &mut Criterion) {
fn file_descriptor_table(c: &mut Criterion) {
use std::fs::File;

let file = tempfile::NamedTempFile::new().unwrap();
Expand All @@ -115,9 +221,9 @@ fn file_descriptor(c: &mut Criterion) {
});
});

let id: Arc<str> = Arc::from("file");
let id = (0, 523).into();
let descriptor_table = lsm_tree::descriptor_table::FileDescriptorTable::new(1, 1);
descriptor_table.insert(file.path(), id.clone());
descriptor_table.insert(file.path(), id);

group.bench_function("descriptor table", |b: &mut criterion::Bencher<'_>| {
b.iter(|| {
Expand Down Expand Up @@ -250,13 +356,15 @@ fn tree_get_pairs(c: &mut Criterion) {

criterion_group!(
benches,
tli_find_item,
memtable_get_upper_bound,
value_block_size_find,
value_block_size,
load_block_from_disk,
file_descriptor,
file_descriptor_table,
bloom_filter_construction,
bloom_filter_contains,
tree_get_pairs,
// first_kv_disjoint
iterate_level_manifest,
);
criterion_main!(benches);
2 changes: 1 addition & 1 deletion src/bit_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ fn set_bit(byte: u8, idx: usize, value: bool) -> u8 {
}

/// Fixed-size bit array
#[derive(Debug)]
#[derive(Debug, Eq, PartialEq)]
pub struct BitArray(Box<[u8]>);

impl BitArray {
Expand Down
69 changes: 38 additions & 31 deletions src/block_cache.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
use crate::segment::block_index::block_handle::BlockHandle;
use crate::segment::{block::ValueBlock, block_index::BlockHandleBlock};
use crate::{
either::{
Either,
Either::{Left, Right},
},
value::UserKey,
use crate::either::{
Either,
Either::{Left, Right},
};
use crate::segment::block_index::block_handle::KeyedBlockHandle;
use crate::segment::id::GlobalSegmentId;
use crate::segment::{block::ValueBlock, block_index::IndexBlock};
use quick_cache::Weighter;
use quick_cache::{sync::Cache, Equivalent};
use std::sync::Arc;
Expand All @@ -17,30 +15,30 @@ enum BlockTag {
Index = 1,
}

type Item = Either<Arc<ValueBlock>, Arc<BlockHandleBlock>>;
type Item = Either<Arc<ValueBlock>, Arc<IndexBlock>>;

// (Type (disk or index), Segment ID, Block key)
// (Type (disk or index), Segment ID, Block offset)
#[derive(Eq, std::hash::Hash, PartialEq)]
struct CacheKey((BlockTag, Arc<str>, UserKey));
struct CacheKey((BlockTag, GlobalSegmentId, u64));

impl From<(BlockTag, Arc<str>, UserKey)> for CacheKey {
fn from(value: (BlockTag, Arc<str>, UserKey)) -> Self {
impl From<(BlockTag, GlobalSegmentId, u64)> for CacheKey {
fn from(value: (BlockTag, GlobalSegmentId, u64)) -> Self {
Self(value)
}
}

impl std::ops::Deref for CacheKey {
type Target = (BlockTag, Arc<str>, UserKey);
type Target = (BlockTag, GlobalSegmentId, u64);

fn deref(&self) -> &Self::Target {
&self.0
}
}

impl Equivalent<CacheKey> for (BlockTag, &str, &UserKey) {
impl Equivalent<CacheKey> for (BlockTag, GlobalSegmentId, &u64) {
fn equivalent(&self, key: &CacheKey) -> bool {
let inner = &**key;
self.0 == inner.0 && self.1 == &*inner.1 && self.2 == &inner.2
self.0 == inner.0 && self.1 == inner.1 && self.2 == &inner.2
}
}

Expand All @@ -56,7 +54,7 @@ impl Weighter<CacheKey, Item> for BlockWeighter {
Either::Right(block) => block
.items
.iter()
.map(|x| x.start_key.len() + std::mem::size_of::<BlockHandle>())
.map(|x| x.start_key.len() + std::mem::size_of::<KeyedBlockHandle>())
.sum::<usize>() as u32,
}
}
Expand Down Expand Up @@ -120,42 +118,51 @@ impl BlockCache {
}

#[doc(hidden)]
pub fn insert_disk_block(&self, segment_id: Arc<str>, key: UserKey, value: Arc<ValueBlock>) {
pub fn insert_disk_block(
&self,
segment_id: GlobalSegmentId,
offset: u64,
value: Arc<ValueBlock>,
) {
if self.capacity > 0 {
self.data
.insert((BlockTag::Data, segment_id, key).into(), Left(value));
.insert((BlockTag::Data, segment_id, offset).into(), Left(value));
}
}

#[doc(hidden)]
pub fn insert_block_handle_block(
pub fn insert_index_block(
&self,
segment_id: Arc<str>,
key: UserKey,
value: Arc<BlockHandleBlock>,
segment_id: GlobalSegmentId,
offset: u64,
value: Arc<IndexBlock>,
) {
if self.capacity > 0 {
self.data
.insert((BlockTag::Index, segment_id, key).into(), Right(value));
.insert((BlockTag::Index, segment_id, offset).into(), Right(value));
}
}

#[doc(hidden)]
#[must_use]
pub fn get_disk_block(&self, segment_id: &str, key: &UserKey) -> Option<Arc<ValueBlock>> {
let key = (BlockTag::Data, segment_id, key);
pub fn get_disk_block(
&self,
segment_id: GlobalSegmentId,
offset: u64,
) -> Option<Arc<ValueBlock>> {
let key = (BlockTag::Data, segment_id, &offset);
let item = self.data.get(&key)?;
Some(item.left().clone())
}

#[doc(hidden)]
#[must_use]
pub fn get_block_handle_block(
pub fn get_index_block(
&self,
segment_id: &str,
key: &UserKey,
) -> Option<Arc<BlockHandleBlock>> {
let key = (BlockTag::Index, segment_id, key);
segment_id: GlobalSegmentId,
offset: u64,
) -> Option<Arc<IndexBlock>> {
let key = (BlockTag::Index, segment_id, &offset);
let item = self.data.get(&key)?;
Some(item.right().clone())
}
Expand Down
Loading
Loading